def filter_actions(smiles, valid_actions, target_fps, target_atoms, target_bonds, target_C_envs, radius):
    """Select the candidate actions that bring ``smiles`` closer to the target.

    The current molecule is fingerprinted (1024-bit Morgan bit vector with the
    given ``radius``) and its similarity to ``target_fps`` is the baseline.
    A candidate from ``valid_actions`` is kept when its similarity to the
    target is strictly greater than the baseline and ``mol_violation`` reports
    no violation against the target atoms, bonds and C environments.

    Returns:
        A ``(actions, reach)`` tuple. ``reach`` is True when a kept candidate
        has similarity exactly 1; in that case ``actions`` contains only that
        candidate and the remaining candidates are not examined.
    """
    current_mol = Chem.MolFromSmiles(smiles)
    current_fps = AllChem.GetMorganFingerprintAsBitVect(current_mol, radius=radius, nBits=1024)
    base_similarity = DataStructs.FingerprintSimilarity(current_fps, target_fps)

    kept = []
    for candidate in valid_actions:
        cand_fps, cand_atoms, cand_bonds, cand_C_envs = get_mol_infos(candidate, radius)
        cand_similarity = DataStructs.FingerprintSimilarity(cand_fps, target_fps)

        # Only strictly improving candidates are considered at all.
        if not cand_similarity > base_similarity:
            continue
        if mol_violation(cand_atoms, cand_bonds, cand_C_envs,
                         target_atoms, target_bonds, target_C_envs):
            continue

        if cand_similarity == 1:
            # Target reached: this candidate is the only action worth taking.
            return [candidate], True
        kept.append(candidate)

    return kept, False
def rd_kit(dir_sdf = "../data/sdf/"):
    """Plot histograms of fingerprint similarity between each SDF file and a baseline.

    The first file (alphabetical order) in ``dir_sdf`` is the baseline. For the
    first molecule of every file, Morgan, RDKit, Avalon and layered
    fingerprints are computed and compared to the baseline with Tanimoto
    similarity. The Morgan and RDKit distributions are plotted; the mean RDKit
    similarity and the number of files are printed.

    Args:
        dir_sdf: directory containing the SDF files.

    Raises:
        IndexError: if ``dir_sdf`` contains no (non-hidden) files.
    """
    bit_length = 1024
    # The baseline and the compared molecules must be fingerprinted with the
    # same parameters, otherwise the Tanimoto values are not comparable (the
    # baseline would not even score 1.0 against itself).
    morgan_radius = 2
    rdk_max_path = 2

    # List the directory directly rather than shelling out to ``ls``: no shell
    # string is built from ``dir_sdf`` and file names with spaces stay intact.
    # Hidden files are skipped and names are sorted to mirror ``ls`` output.
    sdf_names = sorted(name for name in os.listdir(dir_sdf) if not name.startswith("."))

    baseline = SDMolSupplier(os.path.join(dir_sdf, sdf_names[0]))
    baseline_morgan = AllChem.GetMorganFingerprintAsBitVect(baseline[0], morgan_radius, nBits=bit_length)
    baseline_rdk = AllChem.RDKFingerprint(baseline[0], maxPath=rdk_max_path)
    baseline_aval = pyAvalonTools.GetAvalonFP(baseline[0], 128)
    baseline_layer = AllChem.LayeredFingerprint(baseline[0])

    sim_matrix_morgan = []
    sim_matrix_rdk = []
    sim_matrix_aval = []
    sim_matrix_layer = []
    count = 0

    for item in sdf_names:
        suppl = SDMolSupplier(os.path.join(dir_sdf, item))
        count += 1
        mol = suppl[0]

        fp_bit = AllChem.GetMorganFingerprintAsBitVect(mol, morgan_radius, nBits=bit_length)
        fp_rdk = AllChem.RDKFingerprint(mol, maxPath=rdk_max_path)
        fp_aval = pyAvalonTools.GetAvalonFP(mol, 128)
        fp_layer = AllChem.LayeredFingerprint(mol)

        sim_matrix_morgan.append(
            DataStructs.FingerprintSimilarity(baseline_morgan, fp_bit, metric=DataStructs.TanimotoSimilarity))
        sim_matrix_rdk.append(
            DataStructs.FingerprintSimilarity(baseline_rdk, fp_rdk, metric=DataStructs.TanimotoSimilarity))
        sim_matrix_aval.append(
            DataStructs.FingerprintSimilarity(baseline_aval, fp_aval, metric=DataStructs.TanimotoSimilarity))
        sim_matrix_layer.append(
            DataStructs.FingerprintSimilarity(baseline_layer, fp_layer, metric=DataStructs.TanimotoSimilarity))

    sim_matrix_morgan = np.array(sim_matrix_morgan)
    sim_matrix_rdk = np.array(sim_matrix_rdk)
    sim_matrix_aval = np.array(sim_matrix_aval)
    sim_matrix_layer = np.array(sim_matrix_layer)

    label_morgan = "morgan" + str(bit_length)
    plt.hist(sim_matrix_morgan, label = label_morgan)
    plt.hist(sim_matrix_rdk, label = "rdk2")
    # Avalon and layered distributions are computed but not plotted by default:
    #plt.hist(sim_matrix_aval, label = "avalon128")
    #plt.hist(sim_matrix_layer, label = "layer")

    print(np.mean(sim_matrix_rdk))
    print(count)

    plt.xlabel("Similarity to Baseline")
    plt.ylabel("Counts")
    plt.title("Different Fingerprinting Methods, Similarity to Baseline")
    plt.legend()
    plt.show()
def chemical_random_episode(env, search_dict, target_fps, target_atoms, target_bonds, target_C_envs, radius):
    """Run one random episode through the environment, pruning the search dictionary.

    Starting from ``env.reset()``, each step filters the available actions
    with ``filter_actions`` and picks one uniformly at random. ``search_dict``
    maps a state to the actions recorded for it and is updated in place:

    * a state seen for the first time gets its filtered actions recorded;
    * a state with no filtered actions is removed from the dictionary and
      from its predecessor's action list, and the episode ends;
    * when ``filter_actions`` reports the target as reached, the chosen next
      state is removed from the current state's action list and the episode
      ends.

    If the dictionary has been reduced to ``{initial_state: []}`` the string
    ``'terminate'`` is returned in place of the dictionary.

    Returns:
        ``(episode, reach, search_dict)`` where ``episode`` is the list of
        visited states, ``reach`` is the flag from the last ``filter_actions``
        call, and ``search_dict`` is the updated dictionary or ``'terminate'``.
    """
    initial_state = env.reset()
    state = pre_state = initial_state
    episode = [state]
    reach = False
    target_args = (target_fps, target_atoms, target_bonds, target_C_envs, radius)

    while True:
        if state not in search_dict:
            # First visit: record the filtered actions for this state.
            valid_actions, reach = filter_actions(state, env._get_valid_actions(), *target_args)
            search_dict[state] = valid_actions
        elif search_dict == {initial_state: []}:
            # Everything reachable from the start has been pruned away.
            search_dict = 'terminate'
            break
        else:
            # Known state: re-filter the (possibly pruned) recorded actions.
            valid_actions, reach = filter_actions(state, search_dict[state], *target_args)

        nA = len(valid_actions)
        if nA == 0:
            # Dead end: forget this state and the edge that led to it.
            # NOTE(review): on the very first step pre_state is the state
            # itself, so the lookup below would raise KeyError after the pop
            # - confirm whether callers can hit this.
            search_dict.pop(state)
            search_dict[pre_state].remove(state)
            dead_end_mol = Chem.MolFromSmiles(state)
            dead_end_fps = AllChem.GetMorganFingerprintAsBitVect(dead_end_mol, radius=radius)
            dead_end_similarity = DataStructs.FingerprintSimilarity(dead_end_fps, target_fps)
            print('No action space, Last action: %s, similarity: %.3f' % (state, dead_end_similarity))
            break

        action = np.random.randint(nA)
        next_state, reward, done = env.step(valid_actions, action)
        episode.append(next_state)

        if reach == True:
            search_dict[state].remove(next_state)
            final_mol = Chem.MolFromSmiles(next_state)
            final_fps = AllChem.GetMorganFingerprintAsBitVect(final_mol, radius=radius)
            final_similarity = DataStructs.FingerprintSimilarity(final_fps, target_fps)
            print('Reach, last action: %s, similarity: %.3f' % (next_state, final_similarity))
            break

        pre_state, state = state, next_state

    return episode, reach, search_dict
def measure_similarity(self, db_fps, sim_metric=DataStructs.TanimotoSimilarity, th=0.8):
    """Compare one database fingerprint against every user fingerprint.

    Matches with similarity ``>= th`` are recorded in the module-level
    ``fps_matches`` dict as ``fps_matches[db_cntr] -> [(user_index, sim), ...]``.
    The module-level ``db_cntr`` is advanced by one on every call, including
    calls where ``db_fps`` is None, so that it keeps tracking the position of
    the database compound being processed.

    Args:
        db_fps: fingerprint of the current database compound, or None.
        sim_metric: similarity metric passed to ``DataStructs.FingerprintSimilarity``.
        th: minimum similarity for a pair to be recorded.
    """
    global user_ip_fps
    global db_cntr
    global fps_matches

    if db_cntr % 10000 == 0:
        self.jlogger.info("Completed checking similarity with {} compound of db".format(db_cntr))

    if db_fps is None:
        self.jlogger.debug("DB Finger print is unavailable, skipping this compound {}".format(db_cntr))
    else:
        for u_fps_cntr, u_fps in enumerate(user_ip_fps):
            try:
                if u_fps is None:
                    self.jlogger.debug(
                        "User Finger print is unavailable, skipping this compound {}".format(u_fps_cntr))
                    continue
                sim = DataStructs.FingerprintSimilarity(u_fps, db_fps, metric=sim_metric)
                if sim >= th:
                    fps_matches.setdefault(db_cntr, []).append((u_fps_cntr, sim))
            except Exception:
                # Best effort: one failing pair must not abort the whole scan.
                logger.exception(
                    "Error measuring similarity of compound db_cntr {} and u_fps_cntr {}".format(db_cntr, u_fps_cntr))
                self.jlogger.debug(
                    "Error measuring similarity of compound db_cntr {} and u_fps_cntr {}".format(db_cntr, u_fps_cntr))

    # Advance even when the DB fingerprint was missing; otherwise every later
    # match would be keyed one position too early.
    db_cntr += 1
def TakeInput(filepath, hmdb_filepath, OR_name):
    """Find HMDB molecules similar to the ligands in ``filepath`` and save the results.

    Every distinct (Smiles, Ligand) pair from ``filepath`` is compared with
    every HMDB entry using ``FingerprintMols`` fingerprints; pairs with
    similarity ``>= 0.85`` are written to ``Final_test_set_<OR_name>.csv``.
    The ligands that produced at least one match are written (one row per
    ligand) to ``Shortlisted_Metabolites<OR_name>.csv``.

    Args:
        filepath: CSV with ``Smiles`` and ``Ligand`` columns.
        hmdb_filepath: CSV with ``SMILES`` and ``NAME`` columns.
        OR_name: suffix used in both output file names.
    """
    # NOTE(review): this result is immediately replaced by the raw CSV read
    # below; the call is kept in case extractPositiveOnes has side effects -
    # confirm which of the two was intended.
    positive_Cancer = extractPositiveOnes(filepath)
    data_hmdb = pd.read_csv(hmdb_filepath, encoding="ISO-8859-1")
    positive_Cancer = pd.read_csv(filepath, encoding="ISO-8859-1")

    hmdb_data = data_hmdb[['SMILES', 'NAME']].reset_index(drop=True)
    Cancer_clean_data = (
        positive_Cancer[['Smiles', 'Ligand']].drop_duplicates().reset_index(drop=True)
    )

    # Fingerprint every HMDB entry once instead of once per ligand. Entries
    # that cannot be parsed are reported and skipped in the comparison.
    hmdb_fps = []
    for hmdb_smiles in hmdb_data['SMILES']:
        try:
            hmdb_fps.append(FingerprintMols.FingerprintMol(Chem.MolFromSmiles(hmdb_smiles)))
        except Exception:
            print("WARNING")
            hmdb_fps.append(None)

    result_columns = [
        "Cancer_Molecule", "Cancer_SMILES", "HMDB_Molecule", "HMDB_SMILES",
        "TANIMOTO_Similarity_Value",
    ]
    rows = []
    ligand_pairs = zip(Cancer_clean_data['Ligand'], Cancer_clean_data['Smiles'])
    for i, (ligand_name, ligand_smiles) in enumerate(ligand_pairs):
        fps1 = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(ligand_smiles))
        for fps2, hmdb_name, hmdb_smiles in zip(hmdb_fps, hmdb_data['NAME'], hmdb_data['SMILES']):
            if fps2 is None:
                continue
            try:
                sim_val = DataStructs.FingerprintSimilarity(fps1, fps2)
            except Exception:
                print("WARNING")
                continue
            if sim_val >= 0.85:  # threshold for similarity value
                rows.append([ligand_name, ligand_smiles, hmdb_name, hmdb_smiles, sim_val])
        print("Comparison Done for Ligand :" + str(i))

    df1 = pd.DataFrame(rows, columns=result_columns)
    df1.to_csv("Final_test_set_" + OR_name + ".csv")

    # Build the shortlist from the columns df1 actually has. The activation
    # status is not known at this point, so the column is left empty
    # (previously zipping with an empty list made the whole table empty).
    Shortlisted_Metabolites = df1[["Cancer_SMILES", "Cancer_Molecule"]].rename(
        columns={"Cancer_SMILES": "Smiles", "Cancer_Molecule": "Ligand"})
    Shortlisted_Metabolites["Activation Status"] = None
    Shortlisted_Metabolites = Shortlisted_Metabolites.drop_duplicates(
        subset='Ligand', keep='first')

    Shortlisted_Metabolites.to_csv("Shortlisted_Metabolites" + OR_name + ".csv")
    print("Shortlisted_Metabolites" + OR_name + ".csv" + " has been saved")
    print("Congrats! Final_test_set_" + OR_name + ".csv has been successfully saved!")
def label_switching_decoder(key_smiles, bit_list, nmol_df):
    '''
    Decode a sequence of labels into text using a key molecule.

    :param key_smiles: SMILES of the key molecule
    :param bit_list: model predictions; each item is converted with int()
    :param nmol_df: data frame whose SMILES column supplies the 128
        reference ('neighbor') molecules
    :return: str; one character (chr of the decoded code) per item of bit_list
    '''
    labels = [int(item) for item in bit_list]

    key_mol = Chem.MolFromSmiles(key_smiles)
    key_fp = MACCSkeys.GenMACCSKeys(key_mol)

    # Both seeds are derived from the key molecule: its integer exact
    # molecular weight and its atom count.
    root_seed = int(Chem.Descriptors.ExactMolWt(key_mol))
    rotor_seed = key_mol.GetNumAtoms()

    # Draw the 128 reference molecules with the root seed.
    np.random.seed(root_seed)
    ref_smiles = np.random.choice(nmol_df.SMILES, size=128, replace=False)

    # Similarity of the key molecule to each reference molecule.
    dist = [
        DataStructs.FingerprintSimilarity(
            key_fp, MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(ref)))
        for ref in ref_smiles
    ]

    decoded_codes = []
    for index, label in enumerate(labels):
        # A fresh seed per position gives each position its own permutation.
        np.random.seed(root_seed + index * rotor_seed)
        shuffled = np.random.choice(dist, size=len(dist), replace=False)

        # rank_of[p] is the rank of shuffled[p] in ascending order.
        ascending = sorted(range(len(shuffled)), key=lambda pos: shuffled[pos])
        rank_of = [0] * len(dist)
        for rank, pos in enumerate(ascending):
            rank_of[pos] = rank

        # Map the original labels 0..127 onto the ranks and look the label up.
        label_to_code = dict(zip(range(128), rank_of))
        decoded_codes.append(label_to_code.get(label))

    return ''.join([chr(code) for code in decoded_codes])
def rd_kit_rd(dir_sdf = "../data/sdf/"):
    """Return the RDKit-fingerprint Tanimoto similarity of each SDF file to the first one.

    The first file (alphabetical order) in ``dir_sdf`` is the baseline. The
    first molecule of every file is fingerprinted with
    ``AllChem.RDKFingerprint(maxPath=2)`` and compared with the baseline.

    Args:
        dir_sdf: directory containing the SDF files.

    Returns:
        numpy array with one similarity per file, in alphabetical file order;
        an empty array when the directory has no (non-hidden) files.
    """
    # List the directory directly rather than shelling out to ``ls``; hidden
    # files are skipped and names sorted to mirror ``ls`` output.
    sdf_names = sorted(name for name in os.listdir(dir_sdf) if not name.startswith("."))
    if not sdf_names:
        return np.array([])

    # Files are opened from ``dir_sdf`` itself (the path used to be hard-coded,
    # which silently ignored the argument).
    baseline = SDMolSupplier(os.path.join(dir_sdf, sdf_names[0]))
    baseline_rdk = AllChem.RDKFingerprint(baseline[0], maxPath=2)

    sim_matrix_rdk = []
    for item in sdf_names:
        suppl = SDMolSupplier(os.path.join(dir_sdf, item))
        fp_rdk = AllChem.RDKFingerprint(suppl[0], maxPath=2)
        sim_matrix_rdk.append(
            DataStructs.FingerprintSimilarity(baseline_rdk, fp_rdk, metric=DataStructs.TanimotoSimilarity))

    return np.array(sim_matrix_rdk)
def _searchMorganFP (self, cutoff, numsel, metric):
    ''' Search the reference space for the compounds most similar to each
        query fingerprint in self.X, using Tanimoto similarity.

        cutoff: similarities less than or equal to this value are discarded
        numsel: maximum number of hits returned per query compound
        metric: requested metric; None or an incompatible one falls back to
                'Tanimoto' (the only similarity computed here)

        Returns (True, results), where results has one dictionary per query
        compound holding 'distances' plus one list per key of self.objinforef,
        ordered from most to least similar.
    '''
    incompatible = ['Substructural']

    if metric is None:
        metric = 'Tanimoto'
    elif metric in incompatible:
        LOG.warning (f'Metric {metric} is not compatible with the descriptors present in this space')
        metric = 'Tanimoto'

    results = []
    t1 = time.time()

    # for each compound in the search set
    for ivector in self.X:
        bitestring = "".join(ivector.astype(str))
        ifp = DataStructs.cDataStructs.CreateFromBitString(bitestring)

        # (similarity, reference index) pairs, kept sorted best first
        selected = []

        #TODO Check speed BulkTanimoto
        # for each compound in the space
        for j, jvector in enumerate(self.Xref):
            d = DataStructs.FingerprintSimilarity(ifp, jvector, metric=DataStructs.TanimotoSimilarity)

            if d <= cutoff:
                continue

            if len(selected) < numsel:
                # results set is not completed yet: add
                selected.append((d, j))
            elif selected and d > selected[-1][0]:
                # better than the worst selected compound: replace it
                selected[-1] = (d, j)
            else:
                continue

            selected.sort(reverse=True)

            # Once the set is full and even its worst member is identical to
            # the query, the search cannot improve. The set must be full:
            # stopping earlier would drop hits still to come.
            if len(selected) >= numsel and selected[-1][0] == 1.000:
                break

        # results for molecule i are stored in a dictionary
        results_info = {}
        results_info['distances'] = [sd for sd, _ in selected]  # distances are always stored
        for oi in self.objinforef:
            # all the objects information (name, smiles, ID, activity, etc.)
            results_info[oi] = [self.objinforef[oi][si] for _, si in selected]

        results.append(results_info)

    LOG.info (f'search completed in time: {time.time()-t1:.4f} secs')

    return True, results
baseline_rdk = AllChem.RDKFingerprint(baseline[0], maxPath=2) baseline_aval = pyAvalonTools.GetAvalonFP(baseline[0], 128) baseline_layer = AllChem.LayeredFingerprint(baseline[0]) for item in temp: suppl = SDMolSupplier("../data/sdf/" + item) fp = AllChem.GetMorganFingerprint(suppl[0], 2) fp_bit = AllChem.GetMorganFingerprintAsBitVect(suppl[0], 2, nBits=bit_length) fp_rdk = AllChem.RDKFingerprint(suppl[0], maxPath=2) fp_aval = pyAvalonTools.GetAvalonFP(suppl[0], 128) fp_layer = AllChem.LayeredFingerprint(suppl[0]) sim_matrix_morgan.append( DataStructs.FingerprintSimilarity(baseline_morgan, fp_bit, metric=DataStructs.TanimotoSimilarity)) sim_matrix_rdk.append( DataStructs.FingerprintSimilarity(baseline_rdk, fp_rdk, metric=DataStructs.TanimotoSimilarity)) sim_matrix_aval.append( DataStructs.FingerprintSimilarity(baseline_aval, fp_aval, metric=DataStructs.TanimotoSimilarity)) sim_matrix_layer.append( DataStructs.FingerprintSimilarity(baseline_layer, fp_layer, metric=DataStructs.TanimotoSimilarity)) sim_matrix_morgan = np.array(sim_matrix_morgan) sim_matrix_rdk = np.array(sim_matrix_rdk) sim_matrix_aval = np.array(sim_matrix_aval) sim_matrix_layer = np.array(sim_matrix_layer) label_morgan = "morgan" + str(bit_length) plt.hist(sim_matrix_morgan, label = label_morgan) plt.hist(sim_matrix_rdk, label = "rdk2")
def search (self, cutoff, numsel, metric):
    ''' This function searches for compounds in the chemical space similar
        to the compounds of the input file, already characterized by the
        self.X matrix.

        cutoff: similarities less than or equal to this value are discarded
                (defaults to 0.0)
        numsel: maximum number of hits returned per query compound
                (defaults to 10)
        metric: 'Tanimoto' or 'Euclidean'; non-fingerprint spaces always use
                'Euclidean', fingerprint spaces default to 'Tanimoto'

        Returns (True, results), where results has one dictionary per query
        compound holding 'distances' plus one list per key of self.objinfo,
        ordered from most to least similar.

        Raises ValueError if a comparison is attempted with an unsupported
        metric.
    '''
    # load pickle with reference space
    self.load_space()

    # set defaults
    if cutoff is None:
        cutoff = 0.0

    if numsel is None:
        numsel = 10

    # float variables only can be compared using euclidean
    if self.isFingerprint is False:
        metric = 'Euclidean'
    elif metric is None:
        metric = 'Tanimoto' if self.isFingerprint else 'Euclidean'

    results = []

    # for each compound in the search set
    for ivector in self.X:
        if self.isFingerprint:
            bitestring = "".join(ivector.astype(str))
            ifp = DataStructs.cDataStructs.CreateFromBitString(bitestring)

        # (similarity, reference index) pairs, kept sorted best first
        selected = []

        # for each compound in the space
        for j, jvector in enumerate(self.Xref):
            if metric == 'Tanimoto':
                d = DataStructs.FingerprintSimilarity(ifp, jvector, metric=DataStructs.TanimotoSimilarity)
            elif metric == 'Euclidean':
                d = 1.000 - (distance.euclidean(ivector, jvector) / self.Dmax)
            else:
                raise ValueError('Unsupported metric: {}'.format(metric))

            if d <= cutoff:
                continue

            if len(selected) < numsel:
                # results set is not completed yet: add
                selected.append((d, j))
            elif selected and d > selected[-1][0]:
                # better than the worst selected compound: replace it
                selected[-1] = (d, j)
            else:
                continue

            selected.sort(reverse=True)

            # Once the set is full and even its worst member is identical to
            # the query, the search cannot improve. The set must be full:
            # stopping earlier would drop hits still to come.
            if len(selected) >= numsel and selected[-1][0] == 1.000:
                break

        # results for molecule i are stored in a dictionary
        results_info = {}
        results_info['distances'] = [sd for sd, _ in selected]  # distances are always stored
        for oi in self.objinfo:
            # all the objects information (name, smiles, ID, activity, etc.)
            results_info[oi] = [self.objinfo[oi][si] for _, si in selected]

        results.append(results_info)

    return True, results
def search (self, X, cutoff, numsel, metric):
    ''' This function searches for compounds in the chemical space similar
        to the compounds of the input file, already characterized by the
        X matrix.

        X:      matrix with one row per query compound
        cutoff: similarities less than or equal to this value are discarded
                (defaults to 0.0)
        numsel: maximum number of hits returned per query compound
                (defaults to 10)
        metric: 'Tanimoto' or 'Euclidean'; defaults to 'Tanimoto' when the
                'computeMD_method' parameter is ['morganFP'], otherwise to
                'Euclidean'

        Returns (True, results), where results has one dictionary per query
        compound with the keys 'distances', 'names', 'ids' and 'SMILES',
        ordered from most to least similar.

        Raises ValueError if a comparison is attempted with an unsupported
        metric.
    '''
    # load pickle with reference space
    self.load_space()

    # True for fingerprint MD
    isFingerprint = (self.param.getVal('computeMD_method') == ['morganFP'])

    # set defaults
    if cutoff is None:
        cutoff = 0.0

    if numsel is None:
        numsel = 10

    if metric is None:
        metric = 'Tanimoto' if isFingerprint else 'Euclidean'

    results = []

    # for each compound in the search set
    for ivector in X:
        if isFingerprint:
            bitestring = "".join(ivector.astype(str))
            ifp = DataStructs.cDataStructs.CreateFromBitString(bitestring)

        # (similarity, reference index) pairs, kept sorted best first
        selected = []

        # for each compound in the space
        for j, jvector in enumerate(self.X):
            if metric == 'Tanimoto':
                d = DataStructs.FingerprintSimilarity(ifp, jvector, metric=DataStructs.TanimotoSimilarity)
            elif metric == 'Euclidean':
                d = 1.000 - (distance.euclidean(ivector, jvector) / self.Dmax)
            else:
                raise ValueError('Unsupported metric: {}'.format(metric))

            if d <= cutoff:
                continue

            if len(selected) < numsel:
                # results set is not completed yet: add
                selected.append((d, j))
            elif selected and d > selected[-1][0]:
                # better than the worst selected compound: replace it
                selected[-1] = (d, j)
            else:
                continue

            selected.sort(reverse=True)

            # Once the set is full and even its worst member is identical to
            # the query, the search cannot improve. The set must be full:
            # stopping earlier would drop hits still to come.
            if len(selected) >= numsel and selected[-1][0] == 1.000:
                break

        results.append({
            'distances': [sd for sd, _ in selected],
            'names': [self.names[si] for _, si in selected],
            'ids': [self.ids[si] for _, si in selected],
            'SMILES': [self.SMILES[si] for _, si in selected],
        })

    return True, results
) fp2 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) DataStructs.ConvertToNumpyArray(fp2, xmatrix[1]) with open('nfingers.pkl', 'wb') as fo: pickle.dump(xmatrix, fo) with open('rfingers.pkl', 'wb') as fo: pickle.dump(fp1, fo) pickle.dump(fp2, fo) # xmatrix = np.vstack((xmatrix, fp2)) print('start') for i in range(1000000): d = DataStructs.FingerprintSimilarity( fp1, fp2, metric=DataStructs.TanimotoSimilarity) print(d) # d = DataStructs.FingerprintSimilarity(xmatrix[0],xmatrix[1], metric=DataStructs.TanimotoSimilarity) print('start') x1 = xmatrix[0] x2 = xmatrix[1] for i in range(100): d = 1.0 - distance.jaccard(x1, x2) print(d) # fp1 = np.array(AllChem.GetMorganFingerprintAsBitVect(mol1, 8), dtype='bool') # fp2 = np.array(AllChem.GetMorganFingerprintAsBitVect(mol2, 8), dtype='bool')