Пример #1
0
def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./',
                               dataset='ALL', num_cand_dec_per_act=100,
                               num_dec_per_act=50, max_idx_cmpd=10000):
    """Select the top-scoring decoys per active and evaluate the selection.

    Reads active/decoy SMILES pairs from ``file_loc + f``, drops pairs whose
    active fails RDKit parsing or has too few heavy atoms, ranks candidates
    per active by mean property-similarity score (PSS), keeps the 5 best per
    active, writes them to ``tmp.smi`` and computes DUD-E style metrics.

    Returns a flat list: filename, dataset tag, active count, then the
    cross-validation, DOE, LADS and Doppelganger metric values.

    NOTE(review): relies on the module-level global ``min_active_size``;
    ``target`` and the remaining keyword parameters are accepted but unused.
    """
    print("Processing: ", f)
    dec_results = [f, dataset]

    # Load active/decoy SMILES pairs.
    pairs = decoy_utils.read_paired_file(file_loc + f)
    # Record the number of distinct actives before filtering.
    dec_results.append(len({p[0] for p in pairs}))

    # Keep only pairs whose active parses and exceeds the minimum size.
    mols = [Chem.MolFromSmiles(p[0]) for p in pairs]
    kept = [p for p, m in zip(pairs, mols)
            if m is not None and m.GetNumHeavyAtoms() > min_active_size]
    frame = pd.DataFrame(kept, columns=['act', 'dec'])

    # Score each candidate decoy by its mean property-similarity to its active.
    pss = get_pss_from_smiles(frame['act'].values, frame['dec'].values)
    frame['pss'] = pss.mean(0)
    frame['score'] = frame['pss']

    # Keep up to 5 highest-scoring candidate decoys per active.
    picks = []
    for act_smi, grp in frame.groupby('act'):
        top = grp.sort_values('score', ascending=False)['dec'].head(5)
        picks.extend([act_smi, dec_smi] for dec_smi in top)
    result = pd.DataFrame(picks, columns=['act', 'dec'])
    result.to_csv('tmp.smi', index=False, header=None, sep=' ')

    # Build RDKit mols for the unique chosen decoys and their actives.
    decoy_smis_gen = list(set(result['dec']))
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
    active_smis_gen = list(set(result['act']))
    active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]

    # Evaluation always uses the DUD-E property set regardless of `dataset`.
    dataset = 'dude'
    print('Calc props for chosen decoys')
    actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
    decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(list(decoy_utils.calc_xval_performance(
        actives_feat, decoys_feat, n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])

    print('Save decoy mols')
    print(dec_results)
    return dec_results
Пример #2
0
def select_and_evaluate_decoys(f,
                               target,
                               file_loc='./',
                               output_loc='./',
                               dataset='ALL',
                               num_cand_dec_per_act=100,
                               num_dec_per_act=50,
                               max_idx_cmpd=10000):
    """Evaluate the active/decoy pairs in a file with DUD-E style metrics.

    No decoy selection is performed beyond filtering out pairs whose active
    fails RDKit parsing or is too small; every remaining decoy is evaluated
    against the unique actives.  Returns a flat list of metric values.

    NOTE(review): relies on the module-level global ``min_active_size``;
    ``target`` and the remaining keyword parameters are accepted but unused.
    """
    print("Processing: ", f)
    dec_results = [f, dataset]

    # Load active/decoy SMILES pairs.
    pairs = decoy_utils.read_paired_file(file_loc + f)
    # Record the number of distinct actives before filtering.
    dec_results.append(len({p[0] for p in pairs}))

    # Keep only pairs whose active parses and exceeds the minimum size.
    mols = [Chem.MolFromSmiles(p[0]) for p in pairs]
    kept = [p for p, m in zip(pairs, mols)
            if m is not None and m.GetNumHeavyAtoms() > min_active_size]
    result = pd.DataFrame(kept, columns=['act', 'dec'])

    # Build RDKit mols for the unique decoys and actives.
    decoy_smis_gen = list(set(result['dec']))
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
    active_smis_gen = list(set(result['act']))
    active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]

    # Evaluation always uses the DUD-E property set regardless of `dataset`.
    dataset = 'dude'
    print('Calc props for chosen decoys')
    actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
    decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(
        list(
            decoy_utils.calc_xval_performance(actives_feat,
                                              decoys_feat,
                                              n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])

    print('Save decoy mols')
    print(dec_results)
    return dec_results
Пример #3
0
def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./'):
    """Pick the single best decoy per active, balancing PSS and a style term.

    For ``target == 'SA'`` the style term is derived from a synthetic
    accessibility score via ``get_sa``; otherwise precomputed toxicity
    predictions are read from ``./eval/results/predict_TOX.csv`` and inverted
    so that lower predicted values score higher.  The deduplicated selection
    is written to ``output_loc/<target>_results.smi``.  Returns None.
    """
    print("Processing: ", f)
    dec_results = [f]
    # Load active/decoy SMILES pairs.
    pairs = decoy_utils.read_paired_file(file_loc + f)
    # Record the number of distinct actives before filtering.
    dec_results.append(len({p[0] for p in pairs}))

    # Keep only pairs whose active parses and has more than 10 heavy atoms.
    mols = [Chem.MolFromSmiles(p[0]) for p in pairs]
    kept = [p for p, m in zip(pairs, mols)
            if m is not None and m.GetNumHeavyAtoms() > 10]
    data = pd.DataFrame(kept, columns=['act', 'dec'])

    if target == 'SA':
        # Rescale the SA score into the style bonus used for ranking.
        data['style'] = (5 - data['dec'].apply(get_sa)) / 3
    else:
        # Join precomputed toxicity predictions on the decoy SMILES and
        # invert so that low predicted toxicity ranks higher.
        tox = pd.read_csv('./eval/results/predict_TOX.csv')
        tox = tox.rename(columns={
            'smiles': 'dec',
            'pred_0': 'style'
        })[['dec', 'style']]
        data = data.merge(tox, on='dec', how='inner')
        data['style'] = 1 - data['style']

    # Combined score: property similarity plus the style term.
    pss = get_pss_from_smiles(data['act'].values, data['dec'].values)
    data['pss'] = pss.mean(0)
    data['score'] = data['pss'] + data['style']

    # Keep only the single highest-scoring decoy for each active.
    picks = []
    for act_smi, grp in data.groupby('act'):
        best = grp.sort_values('score', ascending=False)['dec'].head(1)
        picks.extend([act_smi, dec_smi] for dec_smi in best)
    result = pd.DataFrame(picks, columns=['act', 'dec'])
    result = result.drop_duplicates().reset_index(drop=True)

    output_name = output_loc + \
        f'/{target}_results.smi'
    result.to_csv(output_name, index=False, header=None, sep=' ')
Пример #4
0
def select_and_evaluate_decoys(f,
                               target,
                               file_loc='./',
                               output_loc='./',
                               T_simi=0.15,
                               N=5):
    """Select up to N PSS-ranked decoys per active after a scaffold filter.

    Pairs whose active/decoy scaffold fingerprint similarity is at or above
    ``T_simi`` are discarded; the survivors are ranked per active by mean
    property-similarity score and the top ``N`` are written (deduplicated)
    to ``<target>_decoys.smi``.  Returns None.

    NOTE(review): ``output_loc`` is accepted but the output path is built
    from ``target`` only.
    """
    print("Processing: ", f)
    # Load active/decoy SMILES pairs.
    pairs = decoy_utils.read_paired_file(file_loc + f)
    data = pd.DataFrame(pairs,
                        columns=['act', 'dec'
                                 ])  #.sample(frac=0.01).reset_index(drop=True)

    # Parse both sides and compute pairwise scaffold-fingerprint similarity.
    mol_acts = [Chem.MolFromSmiles(smi) for smi in data['act'].values]
    mol_decs = [Chem.MolFromSmiles(smi) for smi in data['dec'].values]
    simi = [get_scaffold_simi(get_fp(a), get_fp(d))
            for a, d in zip(mol_acts, mol_decs)]

    # Keep only pairs below the similarity threshold.
    idxs = np.where(np.array(simi) < T_simi)
    mol_acts = np.array(mol_acts)[idxs]
    mol_decs = np.array(mol_decs)[idxs]

    # Score survivors by mean property-similarity to their active.
    pss = get_pss_from_smiles(mol_acts, mol_decs)
    data = pd.DataFrame(data.values[idxs], columns=['act', 'dec'])
    data['pss'] = pss.mean(0)
    data['score'] = data['pss']

    # Keep up to N highest-scoring decoys per active.
    picks = []
    for act_smi, grp in data.groupby('act'):
        top = grp.sort_values('score', ascending=False)['dec'].head(N)
        picks.extend([act_smi, dec_smi] for dec_smi in top)
    result = pd.DataFrame(picks, columns=['act', 'dec'])
    result = result.drop_duplicates().reset_index(drop=True)
    result.to_csv(f'{target}_decoys.smi', index=False, header=None, sep=' ')
Пример #5
0
def select_and_evaluate_decoys(f, target, idx, file_loc='./', output_loc='./', 
                               dataset='ALL', num_cand_dec_per_act=100, 
                               num_dec_per_act=50, max_idx_cmpd=10000):
    """Select property-matched decoys per active and evaluate the selection.

    Reads active/decoy SMILES pairs from ``file_loc + f``, pre-filters
    candidates by LADS score, ranks the remaining candidates per active by
    scaled property difference plus LADS score under gradually relaxed
    acceptance thresholds, greedily picks up to ``num_dec_per_act`` unique
    decoys per active, writes the selection to ``output_loc`` and returns a
    flat list of evaluation metrics (plus a final "GM" mean).

    NOTE(review): relies on the module-level global ``min_active_size``.
    """
    print("Processing: ", f)
    dec_results = [f]
    dec_results.append(dataset)
    # Read data
    data = decoy_utils.read_paired_file(file_loc+f)
# =============================================================================
#         
# =============================================================================
    # Attach an RDKit mol for each decoy SMILES: [act_smi, dec_smi, dec_mol].
    data = [d+[Chem.MolFromSmiles(d[1])] for d in data]
    # Pre-filter: drop candidate decoys whose LADS score vs the unique actives
    # is 0.5 or more.
    lads_scores = decoy_utils.lads_score_v2(
        [Chem.MolFromSmiles(smi) for smi in list(set([d[0] for d in data]))], 
        [d[2] for d in data])
    # NOTE(review): the comprehension's `idx` shadows the `idx` parameter.
    data = [d for idx, d in enumerate(data) if lads_scores[idx]<0.5]
# =============================================================================
#     data = [d for d in data if AllChem.EmbedMolecule(
#         Chem.AddHs(d[2]), randomSeed=42) != -1]
# =============================================================================
    # Drop the temporary mol column again, keeping [act_smi, dec_smi].
    data = [d[:2] for d in data]
# =============================================================================
#         
# =============================================================================
    # Filter dupes and actives that are too small
    dec_results.append(len(set([d[0] for d in data])))
    seen = set()
    tmp = [Chem.MolFromSmiles(d[0]) for d in data]
    data = [d for idx, d in enumerate(data) if tmp[idx] is not None \
            and tmp[idx].GetNumHeavyAtoms()>min_active_size]
    # Order-preserving de-duplication of the (act, dec) pairs.
    unique_data = [x for x in data if not (tuple(x) in seen or seen.add(tuple(x)))]

    in_smis = [d[0] for d in data]
    in_mols = [Chem.MolFromSmiles(smi) for smi in in_smis]
    set_in_smis = list(set(in_smis))
    set_in_mols = [Chem.MolFromSmiles(smi) for smi in set_in_smis]
    gen_smis = [d[1] for d in data]
    gen_mols = [Chem.MolFromSmiles(smi) for smi in gen_smis]
    dec_results.extend([len(set(in_smis)), len(data), len(unique_data)])

    print('Calculate properties of in_smis and gen_mols')
    # Order-preserving de-duplication of the active SMILES.
    used = set([])
    in_smis_set = [x for x in in_smis if x not in used and (used.add(x) or True)]
    in_mols_set = [Chem.MolFromSmiles(smi) for smi in in_smis_set]
    # Compute the dataset-specific property vectors for actives and decoys.
    if dataset == "dude_ext":
        in_props_temp = decoy_utils.calc_dataset_props_dude_extended(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dude_extended(gen_mols, verbose=True)
    elif dataset == "dekois":
        in_props_temp = decoy_utils.calc_dataset_props_dekois(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dekois(gen_mols, verbose=True)
    elif dataset == "MUV":
        in_props_temp = decoy_utils.calc_dataset_props_muv(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_muv(gen_mols, verbose=True)
    elif dataset == "ALL":
        in_props_temp = decoy_utils.calc_dataset_props_all(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_all(gen_mols, verbose=True)
    elif dataset == "dude":
        in_props_temp = decoy_utils.calc_dataset_props_dude(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dude(gen_mols, verbose=True)
    else:
        print("Incorrect dataset")
        exit()
    # Expand the per-unique-active property rows back to one row per pair.
    in_mols_temp = list(in_smis_set) # copy
    in_props = []
    for i, smi in enumerate(in_smis):
        in_props.append(in_props_temp[in_mols_temp.index(smi)])

    # Same expansion for the "basic" property vectors.
    in_basic_temp = decoy_utils.calc_dataset_props_basic(in_mols_set, verbose=True)
    in_mols_temp = list(in_smis_set) # copy
    in_basic = []
    for i, smi in enumerate(in_smis):
        in_basic.append(in_basic_temp[in_mols_temp.index(smi)])

    gen_basic_props = decoy_utils.calc_dataset_props_basic(gen_mols, verbose=True)

    print('Scale properties based on in_mols props')
    # Min/max scaling parameters derived from the actives' property ranges.
    active_props_scaled_all = []
    decoy_props_scaled_all = []

    active_min_all = []
    active_max_all = []
    active_scale_all = []

    active_props = in_props_temp
    print('Exclude errors from min/max calc')
    act_prop = np.array(active_props)

    active_maxes = np.amax(act_prop, axis=0)
    active_mins = np.amin(act_prop, axis=0)

    active_max_all.append(active_maxes)
    active_min_all.append(active_mins)

    # Per-property scale = range, falling back to the min when max == min;
    # any remaining zero scale is forced to 1 to avoid division by zero.
    scale = []
    for (a_max, a_min) in zip(active_maxes,active_mins):
        if a_max != a_min:
            scale.append(a_max - a_min)
        else:
            scale.append(a_min)
    scale = np.array(scale)
    scale[scale == 0.0] = 1.0
    active_scale_all.append(scale)
    active_props_scaled = (active_props - active_mins) / scale
    active_props_scaled_all.append(active_props_scaled)

    # Calc SA scores
    # Synthetic-accessibility scores, expanded back to one value per pair.
    in_sa_temp = [sascorer.calculateScore(mol) for mol in set_in_mols]
    in_smis_temp = list(set(in_smis))
    in_sa = []
    for i, smi in enumerate(in_smis):
        in_sa.append(in_sa_temp[in_smis_temp.index(smi)])
    gen_sa_props = [sascorer.calculateScore(mol) for mol in gen_mols]

    print('Calc Morgan fingerprints')
    in_fps = []
    for i, mol in enumerate(in_mols):
        in_fps.append(
            AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024))
    gen_fps = []
    for i, mol in enumerate(gen_mols):
        gen_fps.append(
            AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024))

    print('Calc DG scores')
    dg_scores, dg_ids = decoy_utils.dg_score_rev(set_in_mols, gen_mols)

    print('Calc LADS scores')
    lads_scores = decoy_utils.lads_score_v2(set_in_mols, gen_mols)
    
    print('Construct dictionary of results')
    # One entry per active SMILES; each candidate row is:
    # [0]=act_smi, [1]=dec_smi, [2]=act_props, [3]=dec_props, [4]=prop_diff,
    # [5]=fp similarity, [6]=basic-prop diff, [7]=|SA diff|, [8]=DG score,
    # [9]=LADS score, [10]=dec_mol.
    results_dict = {}
    for i in range(len(in_smis)):
        # Get scaling
        # Broadcasting against the single-element *_all lists scales both
        # property vectors into the actives' min/max range.
        in_props_scaled = (in_props[i] - active_min_all) / active_scale_all
        gen_props_scaled = (np.array(gen_props[i]) - active_min_all) / active_scale_all
        prop_diff = np.linalg.norm(np.array(in_props_scaled)-np.array(gen_props_scaled))

        # Get basic props diff
        basic_diff = np.sum(abs(np.array(in_basic[i])-np.array(gen_basic_props[i])))

        if in_smis[i] in results_dict:
            sim = DataStructs.TanimotoSimilarity(in_fps[i], gen_fps[i])
            results_dict[in_smis[i]].append(
                [in_smis[i], gen_smis[i], in_props[i], gen_props[i], prop_diff, 
                 sim, basic_diff, abs(gen_sa_props[i]-in_sa[i]), 
                 dg_scores[i], lads_scores[i], gen_mols[i]])
        else:
            sim = DataStructs.TanimotoSimilarity(in_fps[i], gen_fps[i])
            results_dict[in_smis[i]] = [
                [in_smis[i], gen_smis[i], in_props[i], gen_props[i], prop_diff, 
                 sim, basic_diff, abs(gen_sa_props[i]-in_sa[i]), 
                 dg_scores[i], lads_scores[i], gen_mols[i]] ]

    print('Get decoy matches')
    results = []
    results_success_only = []
    sorted_mols_success = []
    for key in results_dict:
        # Set initial criteria - Note most of these are relatively weak
        prop_max_diff = 5
        max_basic_diff = 3
        max_sa_diff = 1.51
        max_dg_score = 0.35
        max_lads_score = 0.2# 5# 0.1
        # Relax the thresholds until enough candidates pass (or the DG
        # ceiling reaches 1), then keep the passing candidates sorted by
        # prop_diff + LADS (ascending, i.e. best first).
        while True:
            count_success = sum([i[4]<prop_max_diff \
                                 and i[6]<max_basic_diff and i[7]<max_sa_diff \
                                 and i[8]<max_dg_score and i[9]<max_lads_score \
                                     for i in results_dict[key][0:max_idx_cmpd]])
            # Adjust criteria if not enough successes
            if count_success < num_cand_dec_per_act and max_dg_score<1:
                #print("Widening search", count_success)
                prop_max_diff *= 1.1
                max_basic_diff += 1
                max_sa_diff *= 1.1
                max_dg_score *= 1.1
                max_lads_score *= 1.1
            else:
                #print("Reached threshold", count_success)
                # Sort by sum of LADS and property difference (smaller better)
                sorted_mols_success.append(
                    [(i[0], i[1], i[4], i[9], i[4]+i[9], i[10]) \
                     for i in sorted(results_dict[key][0:max_idx_cmpd], 
                                     key=lambda i: i[4]+i[9], reverse=False)   
                    if i[4]<prop_max_diff \
                        and i[6]<max_basic_diff and i[7]<max_sa_diff \
                            and i[8]<max_dg_score and i[9]<max_lads_score])
                #assert count_success == len(sorted_mols_success[-1])
                break

    print('Choose decoys')
# =============================================================================
#     active_smis_gen = []
# =============================================================================
    decoy_smis_gen = set()

    embed_fails = 0
    dupes_wanted = 0
    for act_res in sorted_mols_success:
        count = 0
        # Greedy selection based on sum of LADS score and property difference (smaller better)
        for ent in act_res:
            # Check can gen conformer
            if ent[1] not in decoy_smis_gen: # Check conf and not a decoy for another ligand
                decoy_smis_gen.update([ent[1]])
                count +=1
                if count >= num_dec_per_act:
                    break
            elif ent[1] in decoy_smis_gen:
                dupes_wanted +=1
            else:
                # NOTE(review): unreachable — the `if`/`elif` conditions above
                # are complementary, so embed_fails always stays 0.
                embed_fails += 1
# =============================================================================
#         active_smis_gen.append(act_res[0][0])
# =============================================================================
    decoy_smis_gen = list(decoy_smis_gen)
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
# =============================================================================
#     active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]
# =============================================================================
    active_mols_gen = set_in_mols
    # Evaluation always uses the DUD-E branch below (overrides `dataset`).
    dataset = 'dude'
    print('Calc props for chosen decoys')
    if dataset == "dude_ext":
        actives_feat = decoy_utils.calc_dataset_props_dude_extended(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_dude_extended(decoy_mols_gen, verbose=True)
    elif dataset == "dekois":
        actives_feat = decoy_utils.calc_dataset_props_dekois(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_dekois(decoy_mols_gen, verbose=True)
    elif dataset == "MUV":
        actives_feat = decoy_utils.calc_dataset_props_muv(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_muv(decoy_mols_gen, verbose=True)
    elif dataset == "ALL":
        actives_feat = decoy_utils.calc_dataset_props_all(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_all(decoy_mols_gen, verbose=True)
    elif dataset == "dude":
        actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
        decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)
    else:
        print("Incorrect dataset")
        exit()

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(list(decoy_utils.calc_xval_performance(
        actives_feat, decoys_feat, n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])
    
    # Save intermediate performance results in unique file
    #with open(output_loc+'results_'+f+'.csv', 'w') as csvfile:
    #    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    #    writer.writerow(dec_results)

    print('Save decoy mols')
    output_name = output_loc + \
        f'/{target}_{idx}_selected_{num_dec_per_act}_{num_cand_dec_per_act}.smi'
    with open(output_name, 'w') as outfile:
        for i, smi in enumerate(decoy_smis_gen):
            # NOTE(review): pairs decoys back to actives purely by position,
            # assuming exactly num_dec_per_act decoys per active and that
            # set ordering matches the selection order — verify.
            outfile.write(set_in_smis[i//num_dec_per_act] + ' ' + smi + '\n')
    print(dec_results)
    # Mean of three adjacent metric entries; the hard-coded slice assumes a
    # fixed dec_results layout — NOTE(review): verify the offsets if the
    # number of values returned by calc_xval_performance changes.
    GM = np.mean(dec_results[7+1:7+1+3])
    print(f'GM: {GM:.4f}')
    dec_results.append(GM)
    return dec_results
Пример #6
0
def select_and_evaluate_decoys(f,
                               file_loc='./',
                               dataset='dude',
                               num_cand_dec_per_act=100,
                               num_dec_per_act=50,
                               max_idx_cmpd=10000):
    """Evaluate every decoy listed in a paired file against its actives.

    No decoy selection is performed: after filtering out pairs whose active
    fails RDKit parsing or is too small, all generated decoys are evaluated
    against the deduplicated actives using DUD-E property features.
    Returns a flat list of metric values.

    NOTE(review): relies on the module-level global ``min_active_size``;
    the num_*/max_idx_cmpd parameters are accepted but unused.
    """
    print("Processing: ", f)

    dec_results = [f, dataset]

    # Load active/decoy SMILES pairs.
    data = decoy_utils.read_paired_file(file_loc + f)

    # Record the number of distinct actives before filtering.
    dec_results.append(len({pair[0] for pair in data}))

    # Keep pairs whose active parses and exceeds the minimum size.
    kept = []
    for pair in data:
        mol = Chem.MolFromSmiles(pair[0])
        if mol is not None and mol.GetNumHeavyAtoms() > min_active_size:
            kept.append(pair)
    data = kept

    # Order-preserving de-duplication of the (act, dec) pairs.
    seen = set()
    unique_data = []
    for pair in data:
        key = tuple(pair)
        if key not in seen:
            seen.add(key)
            unique_data.append(pair)

    in_smis = [pair[0] for pair in data]
    gen_smis = [pair[1] for pair in data]
    dec_results.extend([len(set(in_smis)), len(data), len(unique_data)])

    print('Calculate properties of in_smis and gen_mols')

    # Order-preserving de-duplication of the active SMILES.
    in_smis_set = list(dict.fromkeys(in_smis))

    active_mols_gen = [Chem.MolFromSmiles(smi) for smi in in_smis_set]
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in gen_smis]

    # Evaluation always uses the DUD-E property set regardless of `dataset`.
    dataset = 'dude'

    print('Calc props for chosen decoys')
    actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
    decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)

    print('ML model performance')
    dec_results.extend(
        list(
            decoy_utils.calc_xval_performance(actives_feat,
                                              decoys_feat,
                                              n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])

    print(dec_results)

    return dec_results