Example #1
def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./', 
                               dataset='ALL', num_cand_dec_per_act=100, 
                               num_dec_per_act=50, max_idx_cmpd=10000):
    print("Processing: ", f)
    dec_results = [f]
    dec_results.append(dataset)
    # Read data
    data = decoy_utils.read_paired_file(file_loc+f)
    # Record the number of unique actives, then drop actives that fail to parse
    # or are too small (min_active_size is a module-level constant in the source project)
    dec_results.append(len(set([d[0] for d in data])))
    tmp = [Chem.MolFromSmiles(d[0]) for d in data]
    data = [d for idx, d in enumerate(data)
            if tmp[idx] is not None and tmp[idx].GetNumHeavyAtoms() > min_active_size]
    data = pd.DataFrame(data, columns=['act', 'dec'])
    
    # Property-similarity scores between each active and its candidate decoy,
    # averaged over the score matrix's first axis into one value per pair
    pss = get_pss_from_smiles(data['act'].values, data['dec'].values)
    data['pss'] = pss.mean(0)
    data['score'] = data['pss']
    # Keep the five highest-scoring candidate decoys per active
    result = []
    for key, tmp_df in data.groupby('act'):
        tmp_df = tmp_df.sort_values('score', ascending=False)
        tmp_df = tmp_df.reset_index(drop=True)
        for i in range(min([5, tmp_df.shape[0]])):
            result.append([key, tmp_df['dec'].values[i]])
    result = pd.DataFrame(result, columns=['act', 'dec'])
    result.to_csv('tmp.smi', index=False, header=None, sep=' ')
    decoy_smis_gen = list(set(result['dec']))
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
    active_smis_gen = list(set(result['act']))
    active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]
    dataset = 'dude'  # the DUD-E property descriptors are used below regardless of the dataset argument
    print('Calc props for chosen decoys')
    actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
    decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(list(decoy_utils.calc_xval_performance(
        actives_feat, decoys_feat, n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])
    
    print('Save decoy mols')
    print(dec_results)
    return dec_results
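The snippet above relies on module-level imports and a project helper that are not shown; a minimal sketch of the assumed setup (the constant value and the usage call are assumptions, not taken from the source project):

# Assumed module-level setup for Example #1 (a sketch, not the project's actual header)
import numpy as np
import pandas as pd
from rdkit import Chem
import decoy_utils

min_active_size = 10   # assumed heavy-atom cutoff; Example #2 hard-codes 10

# get_pss_from_smiles(actives, decoys) is a project helper that returns a matrix of
# property-similarity scores, averaged over its first axis before use.

# Hypothetical usage (file and target names are placeholders):
# res = select_and_evaluate_decoys('generated_decoys.smi', target='DRD2',
#                                  file_loc='./generated/', output_loc='./results/')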
Example #2
File: 02_eval.py Project: SY575/Decoys
def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./'):
    print("Processing: ", f)
    dec_results = [f]
    # Read data
    data = decoy_utils.read_paired_file(file_loc + f)
    # Record the number of unique actives, then drop actives that fail to parse
    # or have 10 or fewer heavy atoms
    dec_results.append(len(set([d[0] for d in data])))
    tmp = [Chem.MolFromSmiles(d[0]) for d in data]
    data = [d for idx, d in enumerate(data)
            if tmp[idx] is not None and tmp[idx].GetNumHeavyAtoms() > 10]
    data = pd.DataFrame(data, columns=['act', 'dec'])
    if target == 'SA':
        # Synthetic accessibility: rescale so that higher 'style' is better
        data['style'] = data['dec'].apply(lambda x: get_sa(x))
        data['style'] = (5 - data['style']) / 3
    else:
        # Toxicity: merge precomputed predictions and invert them so that
        # higher 'style' means lower predicted toxicity
        style = pd.read_csv('./eval/results/predict_TOX.csv')
        style = style.rename(columns={
            'smiles': 'dec',
            'pred_0': 'style'
        })[['dec', 'style']]
        data = data.merge(style, on='dec', how='inner')
        data['style'] = 1 - data['style']
    pss = get_pss_from_smiles(data['act'].values, data['dec'].values)
    data['pss'] = pss.mean(0)
    data['score'] = data['pss'] + data['style']
    # Keep the single highest-scoring decoy per active
    result = []
    for key, tmp_df in data.groupby('act'):
        tmp_df = tmp_df.sort_values('score', ascending=False)
        tmp_df = tmp_df.reset_index(drop=True)
        for i in range(min([1, tmp_df.shape[0]])):
            result.append([key, tmp_df['dec'].values[i]])
    result = pd.DataFrame(result, columns=['act', 'dec'])
    output_name = output_loc + f'/{target}_results.smi'
    result = result.drop_duplicates().reset_index(drop=True)
    result.to_csv(output_name, index=False, header=None, sep=' ')
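The SA branch above calls get_sa, which is not shown in the snippet. A minimal sketch of one plausible implementation using RDKit's contributed SA_Score module (the fallback value and the scaling interpretation are assumptions):

# Sketch of a possible get_sa helper (assumption: RDKit Contrib SA_Score)
import os
import sys
from rdkit import Chem
from rdkit.Chem import RDConfig

sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer  # ships in RDKit's Contrib directory

def get_sa(smiles):
    # Synthetic accessibility score, roughly 1 (easy) to 10 (hard); the caller
    # rescales it with (5 - sa) / 3 so that larger 'style' values are better.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 10.0  # assumption: treat unparsable SMILES as maximally hard
    return sascorer.calculateScore(mol)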
Example #3
def select_and_evaluate_decoys(f,
                               target,
                               file_loc='./',
                               output_loc='./',
                               T_simi=0.15,
                               N=5):
    print("Processing: ", f)
    # Read data
    data = decoy_utils.read_paired_file(file_loc + f)
    data = pd.DataFrame(data, columns=['act', 'dec'])
    # (optional debugging subsample: .sample(frac=0.01).reset_index(drop=True))
    mol_acts = [Chem.MolFromSmiles(smi) for smi in data['act'].values]
    mol_decs = [Chem.MolFromSmiles(smi) for smi in data['dec'].values]
    fp_acts = [get_fp(mol) for mol in mol_acts]
    fp_decs = [get_fp(mol) for mol in mol_decs]
    simi = [get_scaffold_simi(fp_acts[i], fp_decs[i]) for i in range(len(fp_acts))]
    # Keep only active/decoy pairs whose scaffold similarity is below T_simi
    idxs = np.where(np.array(simi) < T_simi)
    mol_acts = np.array(mol_acts)[idxs]
    mol_decs = np.array(mol_decs)[idxs]

    pss = get_pss_from_smiles(mol_acts, mol_decs)
    data = pd.DataFrame(data.values[idxs], columns=['act', 'dec'])
    data['pss'] = pss.mean(0)
    data['score'] = data['pss']
    # Keep the N highest-scoring decoys per active
    result = []
    for key, tmp_df in data.groupby('act'):
        tmp_df = tmp_df.sort_values('score', ascending=False)
        tmp_df = tmp_df.reset_index(drop=True)
        for i in range(min([N, tmp_df.shape[0]])):
            result.append([key, tmp_df['dec'].values[i]])
    result = pd.DataFrame(result, columns=['act', 'dec'])
    result = result.drop_duplicates().reset_index(drop=True)
    result.to_csv(f'{target}_decoys.smi', index=False, header=None, sep=' ')
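get_fp and get_scaffold_simi are project helpers that do not appear in the snippet. A minimal sketch of one plausible implementation, assuming Murcko-scaffold Morgan fingerprints compared by Tanimoto similarity:

# Sketch of possible get_fp / get_scaffold_simi helpers (assumptions, not the
# project's actual implementation)
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold

def get_fp(mol):
    # Morgan fingerprint of the molecule's Murcko scaffold
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    return AllChem.GetMorganFingerprintAsBitVect(scaffold, 2, nBits=2048)

def get_scaffold_simi(fp_a, fp_b):
    # Tanimoto similarity between two scaffold fingerprints; pairs below
    # T_simi (0.15 by default) are kept as sufficiently dissimilar
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)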
Example #4
File: eval.py Project: SY575/Decoys
pred = pd.read_csv('./eval/zinc_all_alerts_pred.csv',
                   usecols=['smiles', 'pred']).rename(columns={
                       'smiles': 'content',
                       'pred': 'content_TOX'
                   })[['content', 'content_TOX']]
df = df.merge(pred, on='content', how='left')

df = df.drop_duplicates(['content', 'gene'])
df['tmp'] = df[['content', 'gene']].values.tolist()
print(df.shape)

# Property-similarity score for each (content, gene) pair
pss = get_pss_from_smiles(df['content'].values, df['gene'].values)
df['PSS'] = pss.mean(0)

# Fingerprint features for both sides of each pair
df['content_fp'] = df['content'].apply(lambda x: get_mol_features(x))
df['gene_fp'] = df['gene'].apply(lambda x: get_mol_features(x))

# Pairwise similarity: sum of the elementwise product of the two fingerprints
df['similarity'] = df[['content_fp', 'gene_fp']].values.tolist()
df['similarity'] = df['similarity'].apply(
    lambda x: np.sum(np.array(x[0]) * np.array(x[1])))
# higher is better
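get_mol_features is not shown either; since the similarity above is the sum of an elementwise product, it presumably returns a fixed-length bit fingerprint as an array, so the product counts shared bits. A minimal sketch under that assumption (the function name comes from the snippet, everything else is guessed):

# Sketch of a possible get_mol_features helper (assumption: Morgan bit fingerprint
# returned as a numpy array so the elementwise product counts shared on-bits)
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def get_mol_features(smiles, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=np.int8)   # assumption: zero vector for bad SMILES
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    return np.asarray(list(fp), dtype=np.int8)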