def dg_score(active_mols, decoy_mols):
    """Score how closely the decoys resemble each active (similar to DEKOIS).

    For every active, the highest Tanimoto similarity to any decoy is kept,
    together with the position of that decoy.  Lower is better (decoys are
    less like the actives), higher is worse (more like the actives).

    :param active_mols: RDKit molecules of the actives.
    :param decoy_mols: RDKit molecules of the decoys; ``None`` entries are
        accepted and contribute a similarity of 0.
    :return: tuple ``(closest_sims, closest_sims_id)`` of NumPy arrays with
        one entry per active.
    """
    # Feature-based Morgan fingerprints of radius 3 (roughly FCFP_6).
    fps_of_actives = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 3, useFeatures=True)
        for mol in active_mols
    ]
    fps_of_decoys = [
        None if mol is None
        else AllChem.GetMorganFingerprintAsBitVect(mol, 3, useFeatures=True)
        for mol in decoy_mols
    ]

    best_values = []
    best_positions = []
    for query_fp in fps_of_actives:
        # A missing decoy fingerprint counts as similarity 0.
        row = [
            0 if candidate_fp is None
            else DataStructs.TanimotoSimilarity(query_fp, candidate_fp)
            for candidate_fp in fps_of_decoys
        ]
        best_values.append(max(row))
        best_positions.append(np.argmax(row))

    return np.array(best_values), np.array(best_positions)
def dg_score_rev(actives, decoys):
    """Score how closely each decoy resembles the actives (similar to DEKOIS).

    For every decoy, the highest Tanimoto similarity to any active is kept,
    together with the position of that active.  Lower is better (less like
    the actives), higher is worse (more like the actives).

    :param actives: SMILES strings of the actives.
    :param decoys: SMILES strings of the decoys.
    :return: tuple ``(closest_sims, closest_sims_id)`` of plain lists with
        one entry per decoy.
    """

    def _fcfp6_like(smi):
        # Feature-based Morgan fingerprint of radius 3 (roughly FCFP_6).
        return AllChem.GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smi), 3, useFeatures=True)

    fps_of_actives = [_fcfp6_like(smi) for smi in actives]
    fps_of_decoys = [_fcfp6_like(smi) for smi in decoys]

    best_values = []
    best_positions = []
    for candidate_fp in fps_of_decoys:
        row = [
            DataStructs.TanimotoSimilarity(reference_fp, candidate_fp)
            for reference_fp in fps_of_actives
        ]
        best_values.append(max(row))
        best_positions.append(np.argmax(row))

    return best_values, best_positions
def tanimoto(self, mol):
    """Return the Tanimoto similarity between ``mol`` and the query fingerprint.

    The 2D fingerprint of ``mol`` is generated with ``self.sigFactory`` under
    a one-second ``Timeout`` guard and compared against ``self.query_fp``.

    :param mol: RDKit molecule to score.
    :return: the Tanimoto similarity, or ``0`` if a ``TimeoutError`` is raised.
    """
    try:
        with Timeout(seconds=1):
            fp = Generate.Gen2DFingerprint(mol, self.sigFactory)
            return DataStructs.TanimotoSimilarity(fp, self.query_fp)
    except TimeoutError:
        # The SMILES must be passed through a %s placeholder: logging formats
        # lazily with ``msg % args`` and reports an error if the message has
        # no conversion specifier for the extra argument.
        logging.debug("SMILES Pharmacophore timeout: %s",
                      Chem.MolToSmiles(mol, isomericSmiles=False))
        return 0
def __call__(self, smiles: List[str]) -> dict: mols = [Chem.MolFromSmiles(smile) for smile in smiles] valid = [1 if mol is not None else 0 for mol in mols] valid_idxs = [idx for idx, boolean in enumerate(valid) if boolean == 1] valid_mols = [mols[idx] for idx in valid_idxs] fps = [AllChem.GetMorganFingerprint(mol, 2, useCounts=False, useFeatures=True) for mol in valid_mols] tanimoto_dist = np.array([DataStructs.TanimotoSimilarity(self.query_fp, fp, returnDistance=True) for fp in fps]) score = np.full(len(smiles), 0, dtype=np.float32) for idx, value in zip(valid_idxs, tanimoto_dist): score[idx] = value return {"total_score": np.array(score, dtype=np.float32)}
def compute_similarity_kernel_matrices(dataset):
    """
    Computes the drug-drug and protein-protein kernel matrices for kernel-based methods (e.g. Kron-RLS)

    Drug-drug entries are Tanimoto similarities of the compounds'
    ``fingerprint`` attributes.  Protein-protein entries are local
    (Smith-Waterman) alignment scores of ``sequence[1]`` normalized by the
    square roots of the two self-alignment scores.

    :param dataset: object whose ``X`` attribute iterates over
        (compound, protein) pairs.
    :return: tuple ``(comps_mat, prots_mat)`` of dicts keyed by ``Pair``.
    """
    start = time.time()
    print("Computing kernel matrices (KD_dict, KT_dict)")

    all_comps = set()
    all_prots = set()
    for mol, prot in dataset.X:
        all_comps.add(mol)
        all_prots.add(prot)

    # compounds / drugs
    comps_mat = {}
    for c1 in all_comps:
        fp1 = c1.fingerprint
        for c2 in all_comps:
            # Tanimoto coefficient
            comps_mat[Pair(c1, c2)] = DataStructs.TanimotoSimilarity(fp1, c2.fingerprint)

    # proteins / targets
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'  # SW algorithm

    # Self-alignment scores depend on a single protein only, so compute them
    # once instead of once per (p1, p2) combination.
    self_scores = {
        prot: aligner.score(prot.sequence[1], prot.sequence[1])
        for prot in all_prots
    }

    prots_mat = {}
    for p1 in all_prots:
        seq1 = p1.sequence[1]
        p1_score = self_scores[p1]
        for p2 in all_prots:
            p2_score = self_scores[p2]
            score = aligner.score(seq1, p2.sequence[1])
            # Normalized SW score
            prots_mat[Pair(p1, p2)] = score / (sqrt(p1_score) * sqrt(p2_score))

    print("Kernel entities: Drugs={}, Prots={}".format(len(all_comps), len(all_prots)))
    duration = time.time() - start
    print("Kernel matrices computation finished in: {:.0f}m {:.0f}s".format(
        duration // 60, duration % 60))
    return comps_mat, prots_mat
def generate_fingerprints_and_create_list(self):
    """Annotate ``self.predicted`` with similarity and drug-likeness columns.

    Adds the columns ``similarities`` (highest Tanimoto similarity of each
    predicted molecule to any molecule in ``self.true_pos``), ``sa_score``,
    ``qeds`` and ``logp`` to ``self.predicted``, stores the BRENK filter
    matches in ``self.brenk`` and builds a boolean shortlist mask.
    """
    predicted_mols = self.predicted['molecules']

    # Morgan fingerprints (radius 2, 2048 bits) of predicted and known ligands.
    morgan = rdFingerprintGenerator.GetMorganGenerator(fpSize=2048, radius=2)
    fps_predicted = [morgan.GetFingerprint(mol) for mol in predicted_mols]
    fps_known = [morgan.GetFingerprint(mol) for mol in self.true_pos['molecules']]

    # For every predicted ligand keep its highest similarity to a known ligand.
    similarities = []
    for query_fp in fps_predicted:
        row = [DataStructs.TanimotoSimilarity(query_fp, known_fp) for known_fp in fps_known]
        similarities.append(row[np.argmax(row)])

    # Synthetic accessibility; module code is in:
    # https://github.com/rdkit/rdkit/tree/master/Contrib/SA_Score
    sa_score = [sascorer.calculateScore(mol) for mol in list(predicted_mols)]

    # QED drug-likeness score, reference: https://doi.org/10.1038/nchem.1243
    qeds = [qed(mol) for mol in predicted_mols]

    # logP of every predicted molecule
    logp = [Descriptors.MolLogP(mol) for mol in predicted_mols]

    # Filter catalog usage instructions: https://github.com/rdkit/rdkit/pull/536
    brenk_params = FilterCatalogParams()
    brenk_params.AddCatalog(FilterCatalogParams.FilterCatalogs.BRENK)
    brenk_catalog = FilterCatalog(brenk_params)
    self.brenk = np.array([brenk_catalog.HasMatch(mol) for mol in predicted_mols])

    # Attach the computed lists as columns of the 'predicted' pd.DataFrame.
    self.predicted['similarities'] = similarities
    self.predicted['sa_score'] = sa_score
    self.predicted['qeds'] = qeds
    self.predicted['logp'] = logp
    print(self.predicted['logp'] < 6)

    # NOTE(review): the mask is built but not used in the visible part of
    # this method - confirm whether the remainder was cut off.
    frame = self.predicted
    shortlist_mask = (
        (frame['similarities'] < 0.2)
        & (frame['sa_score'] < 4)
        & (frame['qeds'] > 0.25)
        & (frame['logp'] < 6)
        & (~self.brenk)
    )
def distij(i, j, features=features):
    """Return the Tanimoto distance (1 - similarity) between two entries.

    ``i`` and ``j`` are converted with ``int()`` before indexing ``features``,
    which defaults to the ``features`` object bound when this function is
    defined.
    """
    first = features[int(i)]
    second = features[int(j)]
    similarity = DataStructs.TanimotoSimilarity(first, second)
    return 1.0 - similarity
def select_and_evaluate_decoys(f, target, idx, file_loc='./', output_loc='./', dataset='ALL', num_cand_dec_per_act=100, num_dec_per_act=50, max_idx_cmpd=10000):
    """Select decoys from a paired (active, generated) file and evaluate them.

    Reads ``file_loc + f`` with ``decoy_utils.read_paired_file``, filters the
    pairs, scores every generated molecule against its active, picks up to
    ``num_dec_per_act`` decoys per active, writes them to a ``.smi`` file in
    ``output_loc`` and returns a list of summary values.

    :param f: name of the paired input file (appended to ``file_loc``).
    :param target: used only in the output file name.
    :param idx: used only in the output file name.
    :param file_loc: prefix prepended to ``f`` when reading.
    :param output_loc: prefix of the output ``.smi`` file.
    :param dataset: property set used while ranking candidates; one of
        "dude_ext", "dekois", "MUV", "ALL", "dude".  Any other value prints
        a message and calls ``exit()``.
    :param num_cand_dec_per_act: number of candidates per active that must
        pass the thresholds before the thresholds stop being widened.
    :param num_dec_per_act: maximum number of decoys kept per active.
    :param max_idx_cmpd: only the first ``max_idx_cmpd`` candidates of each
        active are considered.
    :return: ``dec_results``: ``[f, dataset, <counts>, <cross-validation
        results>, DOE score, mean LADS, mean DG, max DG, GM]``.
    """
    print("Processing: ", f)
    dec_results = [f]
    dec_results.append(dataset)
    # Read data
    data = decoy_utils.read_paired_file(file_loc+f)
    # =============================================================================
    #
    # =============================================================================
    # Attach the parsed molecule of the second column to every row, score it
    # with LADS against the distinct first-column SMILES and keep rows whose
    # score is below 0.5.
    data = [d+[Chem.MolFromSmiles(d[1])] for d in data]
    lads_scores = decoy_utils.lads_score_v2(
        [Chem.MolFromSmiles(smi) for smi in list(set([d[0] for d in data]))],
        [d[2] for d in data])
    data = [d for idx, d in enumerate(data) if lads_scores[idx]<0.5]
    # =============================================================================
    # data = [d for d in data if AllChem.EmbedMolecule(
    #     Chem.AddHs(d[2]), randomSeed=42) != -1]
    # =============================================================================
    # Drop the temporary molecule column again.
    data = [d[:2] for d in data]
    # =============================================================================
    #
    # =============================================================================
    # Filter dupes and actives that are too small
    dec_results.append(len(set([d[0] for d in data])))
    seen = set()
    tmp = [Chem.MolFromSmiles(d[0]) for d in data]
    # NOTE(review): min_active_size is not defined in this function;
    # presumably a module-level constant - confirm.
    data = [d for idx, d in enumerate(data) if tmp[idx] is not None \
            and tmp[idx].GetNumHeavyAtoms()>min_active_size]
    # Order-preserving de-duplication; only its length is used below.
    unique_data = [x for x in data if not (tuple(x) in seen or seen.add(tuple(x)))]

    in_smis = [d[0] for d in data]
    in_mols = [Chem.MolFromSmiles(smi) for smi in in_smis]
    set_in_smis = list(set(in_smis))
    set_in_mols = [Chem.MolFromSmiles(smi) for smi in set_in_smis]
    gen_smis = [d[1] for d in data]
    gen_mols = [Chem.MolFromSmiles(smi) for smi in gen_smis]
    dec_results.extend([len(set(in_smis)), len(data), len(unique_data)])

    print('Calculate properties of in_smis and gen_mols')
    # Distinct input SMILES in first-seen order.
    used = set([])
    in_smis_set = [x for x in in_smis if x not in used and (used.add(x) or True)]
    in_mols_set = [Chem.MolFromSmiles(smi) for smi in in_smis_set]
    if dataset == "dude_ext":
        in_props_temp = decoy_utils.calc_dataset_props_dude_extended(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dude_extended(gen_mols, verbose=True)
    elif dataset == "dekois":
        in_props_temp = decoy_utils.calc_dataset_props_dekois(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dekois(gen_mols, verbose=True)
    elif dataset == "MUV":
        in_props_temp = decoy_utils.calc_dataset_props_muv(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_muv(gen_mols, verbose=True)
    elif dataset == "ALL":
        in_props_temp = decoy_utils.calc_dataset_props_all(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_all(gen_mols, verbose=True)
    elif dataset == "dude":
        in_props_temp = decoy_utils.calc_dataset_props_dude(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dude(gen_mols, verbose=True)
    else:
        print("Incorrect dataset")
        exit()

    # Expand the per-distinct-active properties back to one entry per row.
    in_mols_temp = list(in_smis_set) # copy
    in_props = []
    for i, smi in enumerate(in_smis):
        in_props.append(in_props_temp[in_mols_temp.index(smi)])

    in_basic_temp = decoy_utils.calc_dataset_props_basic(in_mols_set, verbose=True)
    in_mols_temp = list(in_smis_set) # copy
    in_basic = []
    for i, smi in enumerate(in_smis):
        in_basic.append(in_basic_temp[in_mols_temp.index(smi)])

    gen_basic_props = decoy_utils.calc_dataset_props_basic(gen_mols, verbose=True)

    print('Scale properties based on in_mols props')
    # Each of these lists receives exactly one entry below.
    active_props_scaled_all = []
    decoy_props_scaled_all = []
    active_min_all = []
    active_max_all = []
    active_scale_all = []

    active_props = in_props_temp
    print('Exclude errors from min/max calc')
    act_prop = np.array(active_props)
    active_maxes = np.amax(act_prop, axis=0)
    active_mins = np.amin(act_prop, axis=0)
    active_max_all.append(active_maxes)
    active_min_all.append(active_mins)

    # Per-property scale: the range of the actives, or the (constant) value
    # itself when max == min; a resulting 0 is replaced by 1 to avoid
    # dividing by zero.
    scale = []
    for (a_max, a_min) in zip(active_maxes,active_mins):
        if a_max != a_min:
            scale.append(a_max - a_min)
        else:
            scale.append(a_min)
    scale = np.array(scale)
    scale[scale == 0.0] = 1.0
    active_scale_all.append(scale)
    active_props_scaled = (active_props - active_mins) / scale
    active_props_scaled_all.append(active_props_scaled)

    # Calc SA scores
    in_sa_temp = [sascorer.calculateScore(mol) for mol in set_in_mols]
    in_smis_temp = list(set(in_smis))
    in_sa = []
    for i, smi in enumerate(in_smis):
        in_sa.append(in_sa_temp[in_smis_temp.index(smi)])
    gen_sa_props = [sascorer.calculateScore(mol) for mol in gen_mols]

    print('Calc Morgan fingerprints')
    in_fps = []
    for i, mol in enumerate(in_mols):
        in_fps.append(
            AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024))
    gen_fps = []
    for i, mol in enumerate(gen_mols):
        gen_fps.append(
            AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024))

    print('Calc DG scores')
    dg_scores, dg_ids = decoy_utils.dg_score_rev(set_in_mols, gen_mols)

    print('Calc LADS scores')
    lads_scores = decoy_utils.lads_score_v2(set_in_mols, gen_mols)

    print('Construct dictionary of results')
    # results_dict maps an input SMILES to a list of candidate records:
    # [in_smi, gen_smi, in_props, gen_props, prop_diff, sim, basic_diff,
    #  sa_diff, dg_score, lads_score, gen_mol]  (indices 0..10).
    results_dict = {}
    for i in range(len(in_smis)):
        # Get scaling
        in_props_scaled = (in_props[i] - active_min_all) / active_scale_all
        gen_props_scaled = (np.array(gen_props[i]) - active_min_all) / active_scale_all
        prop_diff = np.linalg.norm(np.array(in_props_scaled)-np.array(gen_props_scaled))

        # Get basic props diff
        basic_diff = np.sum(abs(np.array(in_basic[i])-np.array(gen_basic_props[i])))

        if in_smis[i] in results_dict:
            sim = DataStructs.TanimotoSimilarity(in_fps[i], gen_fps[i])
            results_dict[in_smis[i]].append(
                [in_smis[i], gen_smis[i], in_props[i], gen_props[i], prop_diff,
                 sim, basic_diff, abs(gen_sa_props[i]-in_sa[i]),
                 dg_scores[i], lads_scores[i], gen_mols[i]])
        else:
            sim = DataStructs.TanimotoSimilarity(in_fps[i], gen_fps[i])
            results_dict[in_smis[i]] = [
                [in_smis[i], gen_smis[i], in_props[i], gen_props[i], prop_diff,
                 sim, basic_diff, abs(gen_sa_props[i]-in_sa[i]),
                 dg_scores[i], lads_scores[i], gen_mols[i]] ]

    print('Get decoy matches')
    results = []
    results_success_only = []
    sorted_mols_success = []
    for key in results_dict:
        # Set initial criteria - Note most of these are relatively weak
        prop_max_diff = 5
        max_basic_diff = 3
        max_sa_diff = 1.51
        max_dg_score = 0.35
        max_lads_score = 0.2# 5# 0.1
        # Widen all thresholds until enough candidates pass, or until
        # max_dg_score reaches 1, which ends the widening.
        while True:
            count_success = sum([i[4]<prop_max_diff \
                and i[6]<max_basic_diff and i[7]<max_sa_diff \
                and i[8]<max_dg_score and i[9]<max_lads_score \
                for i in results_dict[key][0:max_idx_cmpd]])
            # Adjust criteria if not enough successes
            if count_success < num_cand_dec_per_act and max_dg_score<1:
                #print("Widening search", count_success)
                prop_max_diff *= 1.1
                max_basic_diff += 1
                max_sa_diff *= 1.1
                max_dg_score *= 1.1
                max_lads_score *= 1.1
            else:
                #print("Reached threshold", count_success)
                # Sort by sum of LADS and property difference (smaller better)
                sorted_mols_success.append(
                    [(i[0], i[1], i[4], i[9], i[4]+i[9], i[10]) \
                     for i in sorted(results_dict[key][0:max_idx_cmpd],
                                     key=lambda i: i[4]+i[9], reverse=False)
                     if i[4]<prop_max_diff \
                     and i[6]<max_basic_diff and i[7]<max_sa_diff \
                     and i[8]<max_dg_score and i[9]<max_lads_score])
                #assert count_success == len(sorted_mols_success[-1])
                break

    print('Choose decoys')
    # =============================================================================
    # active_smis_gen = []
    # =============================================================================
    decoy_smis_gen = set()
    embed_fails = 0
    dupes_wanted = 0
    for act_res in sorted_mols_success:
        count = 0
        # Greedy selection based on sum of LADS score and property difference (smaller better)
        for ent in act_res:
            # Check can gen conformer
            if ent[1] not in decoy_smis_gen: # Check conf and not a decoy for another ligand
                decoy_smis_gen.update([ent[1]])
                count +=1
                if count >= num_dec_per_act:
                    break
            elif ent[1] in decoy_smis_gen:
                dupes_wanted +=1
            else:
                # NOTE(review): the two conditions above are complementary,
                # so this branch cannot be reached as written.
                embed_fails += 1
        # =============================================================================
        # active_smis_gen.append(act_res[0][0])
        # =============================================================================
    decoy_smis_gen = list(decoy_smis_gen)
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
    # =============================================================================
    # active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]
    # =============================================================================
    active_mols_gen = set_in_mols
    # The evaluation below always uses the "dude" property set, regardless of
    # the ``dataset`` argument.
    dataset = 'dude'
    print('Calc props for chosen decoys')
    if dataset == "dude_ext":
        actives_feat = decoy_utils.calc_dataset_props_dude_extended(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_dude_extended(decoy_mols_gen, verbose=True)
    elif dataset == "dekois":
        actives_feat = decoy_utils.calc_dataset_props_dekois(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_dekois(decoy_mols_gen, verbose=True)
    elif dataset == "MUV":
        actives_feat = decoy_utils.calc_dataset_props_muv(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_muv(decoy_mols_gen, verbose=True)
    elif dataset == "ALL":
        actives_feat = decoy_utils.calc_dataset_props_all(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_all(decoy_mols_gen, verbose=True)
    elif dataset == "dude":
        actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
        decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)
    else:
        print("Incorrect dataset")
        exit()

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(list(decoy_utils.calc_xval_performance(
        actives_feat, decoys_feat, n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])

    # Save intermediate performance results in unique file
    #with open(output_loc+'results_'+f+'.csv', 'w') as csvfile:
    #    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    #    writer.writerow(dec_results)

    print('Save decoy mols')
    output_name = output_loc + \
        f'/{target}_{idx}_selected_{num_dec_per_act}_{num_cand_dec_per_act}.smi'
    with open(output_name, 'w') as outfile:
        # NOTE(review): decoy_smis_gen comes from list(set(...)), so its order
        # is not tied to the active each decoy was selected for; the active
        # written on each line is chosen purely by position
        # (i // num_dec_per_act) - confirm this pairing is intended.
        for i, smi in enumerate(decoy_smis_gen):
            outfile.write(set_in_smis[i//num_dec_per_act] + ' ' + smi + '\n')

    print(dec_results)
    # Mean of entries 8, 9 and 10 of dec_results.
    # NOTE(review): which metrics these are depends on what
    # calc_xval_performance returns - confirm.
    GM = np.mean(dec_results[7+1:7+1+3])
    print(f'GM: {GM:.4f}')
    dec_results.append(GM)
    return dec_results
def calc_similarity(self, mol1, mol2):
    """Return the Tanimoto similarity of the MACCS-keys fingerprints of two molecules."""
    maccs_first, maccs_second = (
        rdMolDescriptors.GetMACCSKeysFingerprint(mol) for mol in (mol1, mol2)
    )
    return DataStructs.TanimotoSimilarity(maccs_first, maccs_second)
def calc_tanimoto(m1, m2):
    """Return the Tanimoto coefficient of the radius-2 Morgan bit vectors of two molecules."""
    first_fp, second_fp = (
        AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in (m1, m2)
    )
    return DataStructs.TanimotoSimilarity(first_fp, second_fp)
import pandas as pd
# seaborn is required by sns.pairplot() at the end of the script; without
# this import the script fails with a NameError at the plotting step.
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import RDConfig
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import GetUSRScore, GetUSRCAT
from rdkit.Chem import DataStructs

# Compare 2D fingerprint similarity (Tanimoto on Morgan bit vectors) with the
# USRCAT score for every pair of molecules in cdk2.sdf and plot the result.
print(rdBase.rdkitVersion)

mols = [mol for mol in Chem.SDMolSupplier("cdk2.sdf")]
# Embed a conformer into every molecule before computing USRCAT descriptors.
for mol in mols:
    AllChem.EmbedMolecule(mol, useExpTorsionAnglePrefs=True, useBasicKnowledge=True)

usrcats = [GetUSRCAT(mol) for mol in mols]
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]

data = {"tanimoto": [], "usrscore": []}
# Visit every unordered pair (i, j) with j < i exactly once.
for i in range(len(usrcats)):
    for j in range(i):
        tc = DataStructs.TanimotoSimilarity(fps[i], fps[j])
        score = GetUSRScore(usrcats[i], usrcats[j])
        data["tanimoto"].append(tc)
        data["usrscore"].append(score)
        print(score, tc)

df = pd.DataFrame(data)
fig = sns.pairplot(df)
fig.savefig('plot.png')
# Parse the three SMILES strings.
mol1, mol2, mol3 = (
    Chem.MolFromSmiles(smi) for smi in (smiles1, smiles2, smiles3)
)

# Morgan fingerprints of radius 3, chirality-aware, without feature invariants.
fp1, fp2, fp3 = (
    AllChem.GetMorganFingerprint(
        mol,
        3,
        #nBits = 1024,
        useFeatures=False,
        useChirality=True,
    )
    for mol in (mol1, mol2, mol3)
)

# Print the three pairwise Tanimoto similarities: 1-2, 1-3, 2-3.
for _fp_a, _fp_b in ((fp1, fp2), (fp1, fp3), (fp2, fp3)):
    print(DataStructs.TanimotoSimilarity(_fp_a, _fp_b))
def __call__(self, molecule):
    """Return the Tanimoto similarity of ``molecule`` to ``self.reference``.

    Hydrogens are removed from ``molecule`` first; its Morgan fingerprint is
    then computed with radius ``self.radius``.
    """
    stripped = Chem.RemoveHs(molecule)
    candidate_fp = AllChem.GetMorganFingerprint(stripped, self.radius)
    return DataStructs.TanimotoSimilarity(self.reference, candidate_fp)
def compute_simboost_drug_target_features(dataset, mf_simboost_data_dict, nbins=10, sim_threshold=0.5):
    """
    Constructs the type 1,2, and 3 features (with the matrix factorization part) of SimBoost as described in:
    https://jcheminf.biomedcentral.com/articles/10.1186/s13321-017-0209-z
    The Matrix Factorization part is deferred to the mf.py script.

    :param dataset: object whose ``itersamples()`` yields ``(x, y, w, id)``
        tuples where ``x`` is a (compound, protein) pair and ``y`` its value.
    :param mf_simboost_data_dict: dict with the keys ``comp_mat``,
        ``prot_mat``, ``comp_index`` and ``prot_index`` from the matrix
        factorization stage.
    :param nbins: forwarded to ``compute_type1_features``.
    :param sim_threshold: minimum similarity for two distinct entities to be
        connected in the drug-drug / target-target graphs.
    :return: dict mapping ``Pair(comp, prot)`` to its feature list; all lists
        are zero-padded to the same length.
    """
    assert isinstance(
        mf_simboost_data_dict,
        dict), "Drug-Target features dictionary must be provided."

    print('SimBoost Drug-Target feature vector computation started')
    print('Processing M matrix')
    all_comps = set()
    all_prots = set()
    pair_to_value_y = {}
    Mgraph = nx.Graph(name='drug_target_network')
    Mrows = defaultdict(list)
    Mcols = defaultdict(list)
    # Sample weight and identifier are not needed here.
    for x, y, _w, _sample_id in tqdm(dataset.itersamples()):
        mol, prot = x
        all_comps.add(mol)
        all_prots.add(prot)
        pair_to_value_y[Pair(mol, prot)] = y
        Mrows[mol].append(y)
        Mcols[prot].append(y)
        Mgraph.add_edge(mol, prot, weight=y)
    print('Number of compounds = %d' % len(all_comps))
    print('Number of targets = %d' % len(all_prots))

    # compounds / drugs
    print('Processing drug similarity matrix')
    D = {}
    Dgraph = nx.Graph(name='drug_drug_network')
    for c1 in tqdm(all_comps):
        fp1 = c1.fingerprint
        for c2 in all_comps:
            fp2 = c2.fingerprint
            # Tanimoto coefficient
            score = DataStructs.TanimotoSimilarity(fp1, fp2)
            D[Pair(c1, c2)] = score
            Dgraph.add_nodes_from([c1, c2])
            if score >= sim_threshold and c1 != c2:
                Dgraph.add_edge(c1, c2)
    comp_feats = compute_type2_features(
        compute_type1_features(Mrows, all_comps, D, nbins), D, Dgraph)

    # proteins / targets
    print('Processing target similarity matrix')
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'  # SW algorithm
    # Self-alignment scores depend on one protein only; compute each once
    # rather than once per (p1, p2) combination.
    self_scores = {
        prot: aligner.score(prot.sequence[1], prot.sequence[1])
        for prot in all_prots
    }
    T = {}
    Tgraph = nx.Graph(name='target_target_network')
    for p1 in tqdm(all_prots):
        seq1 = p1.sequence[1]
        p1_score = self_scores[p1]
        for p2 in all_prots:
            seq2 = p2.sequence[1]
            p2_score = self_scores[p2]
            score = aligner.score(seq1, seq2)
            # Normalized SW score
            normalized_score = score / (sqrt(p1_score) * sqrt(p2_score))
            T[Pair(p1, p2)] = normalized_score
            Tgraph.add_nodes_from([p1, p2])
            if normalized_score >= sim_threshold and p1 != p2:
                Tgraph.add_edge(p1, p2)
    prot_feats = compute_type2_features(
        compute_type1_features(Mcols, all_prots, T, nbins), T, Tgraph)

    pbar = UnboundedProgressbar()
    pbar.start()

    print('Processing type 3 features')
    # Type 3 features
    btw_cent = nx.betweenness_centrality(Mgraph)
    cls_cent = nx.closeness_centrality(Mgraph)
    # eig_cent = nx.eigenvector_centrality(Mgraph, tol=1e-3, max_iter=500)
    # pagerank = nx.pagerank(Mgraph, tol=1e-3, max_iter=1000)
    drug_target_feats_dict = defaultdict(list)
    vec_lengths = []

    # Retrieve data from the Matrix Factorization stage
    comp_mat = mf_simboost_data_dict['comp_mat']
    prot_mat = mf_simboost_data_dict['prot_mat']
    comp_index = mf_simboost_data_dict['comp_index']
    prot_index = mf_simboost_data_dict['prot_index']

    for pair in tqdm(pair_to_value_y):
        comp, prot = pair.p1, pair.p2
        # The defaultdict creates the (initially empty) feature list here.
        feat = drug_target_feats_dict[Pair(comp, prot)]

        # mf
        cidx = comp_index[comp]
        pidx = prot_index[prot]
        c_vec = comp_mat[cidx].tolist()
        p_vec = prot_mat[pidx].tolist()
        mf = c_vec + p_vec
        feat.extend(mf)

        # d.t.ave
        d_av_lst = []
        for n in Mgraph.neighbors(prot):
            if Pair(comp, n) in pair_to_value_y:
                d_av_lst.append(pair_to_value_y[Pair(comp, n)])
        if len(d_av_lst) > 0:
            feat.append(np.mean(d_av_lst))

        # t.d.ave
        t_av_lst = []
        for n in Mgraph.neighbors(comp):
            if Pair(n, prot) in pair_to_value_y:
                t_av_lst.append(pair_to_value_y[Pair(n, prot)])
        if len(t_av_lst) > 0:
            feat.append(np.mean(t_av_lst))

        # d.t.bt, d.t.cl, d.t.ev
        feat.append(btw_cent[comp])
        feat.append(btw_cent[prot])
        feat.append(cls_cent[comp])
        feat.append(cls_cent[prot])
        # feat.append(eig_cent[comp])
        # feat.append(eig_cent[prot])

        # d.t.pr
        # feat.append(pagerank[comp])
        # feat.append(pagerank[prot])

        # add type 1 features
        feat.extend(comp_feats[comp])
        feat.extend(prot_feats[prot])

        vec_lengths.append(len(feat))

    # zero-pad all vectors to be of the same dimension; the optional average
    # features above make the raw lengths differ between pairs
    dim = max(vec_lengths)
    for k in drug_target_feats_dict:
        feat = drug_target_feats_dict[k]
        pvec = [0] * (dim - len(feat))
        feat.extend(pvec)

    pbar.stop()
    pbar.join()
    print(
        'SimBoost Drug-Target feature vector computation finished. Vector dimension={}'
        .format(dim))
    return drug_target_feats_dict
def get_scaffold_simi(fp1, fp2):
    """Return the Tanimoto similarity between two fingerprints."""
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity