def filter_actions(smiles, valid_actions, target_fps, target_atoms,
                   target_bonds, target_C_envs, radius):
    """Keep only the candidate actions that move closer to the target.

    A candidate SMILES is accepted when its fingerprint similarity to
    ``target_fps`` is strictly greater than that of the current molecule
    ``smiles`` and ``mol_violation`` reports no violation against the target
    atoms, bonds and C environments.  As soon as an accepted candidate has a
    similarity of exactly 1, the scan stops and that candidate becomes the
    only action returned.

    Returns:
        ``(actions, reach)``: the accepted SMILES and a flag that is True
        when a candidate with similarity 1 was found.
    """
    current_mol = Chem.MolFromSmiles(smiles)
    current_fp = AllChem.GetMorganFingerprintAsBitVect(
        current_mol, radius=radius, nBits=1024)
    base_similarity = DataStructs.FingerprintSimilarity(current_fp, target_fps)

    accepted = []
    for candidate in valid_actions:
        cand_fp, cand_atoms, cand_bonds, cand_envs = get_mol_infos(
            candidate, radius)
        cand_similarity = DataStructs.FingerprintSimilarity(
            cand_fp, target_fps)

        # Cheap similarity test first; the structural check only runs for
        # candidates that actually improve on the current molecule.
        if not cand_similarity > base_similarity:
            continue
        if mol_violation(cand_atoms, cand_bonds, cand_envs,
                         target_atoms, target_bonds, target_C_envs):
            continue

        accepted.append(candidate)
        if cand_similarity == 1:
            # Target reached: discard everything else collected so far.
            return [candidate], True

    return accepted, False
def create_rxn_Morgan2FP_separately(rsmi, psmi, rxnfpsize=gc.fingerprint_bits,
                                    pfpsize=gc.fingerprint_bits,
                                    useFeatures=False, calculate_rfp=True,
                                    useChirality=False):
    """Build radius-2 Morgan fingerprints for a reactant and a product SMILES.

    The two SMILES are handled separately and the fingerprints are returned
    separately as float32 numpy arrays of length ``rxnfpsize`` (reactant) and
    ``pfpsize`` (product).

    ``calculate_rfp`` is accepted but never read by this function.

    Returns:
        ``[pfp, rfp]`` on success, or ``None`` when a molecule or fingerprint
        could not be built (a message is printed in every failure case except
        a product-parsing exception, which is silent).
    """
    reactant_smiles = rsmi.encode('utf-8')
    product_smiles = psmi.encode('utf-8')

    # --- reactant ---------------------------------------------------------
    try:
        reactant_mol = Chem.MolFromSmiles(reactant_smiles)
    except Exception as e:
        print(e)
        return None
    try:
        reactant_bits = AllChem.GetMorganFingerprintAsBitVect(
            mol=reactant_mol, radius=2, nBits=rxnfpsize,
            useFeatures=useFeatures, useChirality=useChirality)
        rfp = np.empty(rxnfpsize, dtype='float32')
        DataStructs.ConvertToNumpyArray(reactant_bits, rfp)
    except Exception as e:
        print("Cannot build reactant fp due to {}".format(e))
        return None

    # --- product ----------------------------------------------------------
    try:
        product_mol = Chem.MolFromSmiles(product_smiles)
    except Exception as e:
        return None
    try:
        product_bits = AllChem.GetMorganFingerprintAsBitVect(
            mol=product_mol, radius=2, nBits=pfpsize,
            useFeatures=useFeatures, useChirality=useChirality)
        pfp = np.empty(pfpsize, dtype='float32')
        DataStructs.ConvertToNumpyArray(product_bits, pfp)
    except Exception as e:
        print("Cannot build product fp due to {}".format(e))
        return None

    return [pfp, rfp]
def fps_to_nparr(x):
    """Decode a base64-encoded fingerprint string into a numpy array.

    The decoded bytes are handed to ``DataStructs.ExplicitBitVect`` and the
    resulting bit vector is written into a numpy array by
    ``DataStructs.ConvertToNumpyArray``.
    """
    import base64
    from rdkit.Chem import DataStructs

    raw = base64.b64decode(x)
    bit_vect = DataStructs.ExplicitBitVect(raw)
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def getFpArr( fps ):
    """Turn base64-encoded 4096-bit fingerprints into a list of numpy arrays."""

    def _decode(encoded):
        # Populate a fresh 4096-bit vector from its base64 text form.
        bit_vect = DataStructs.ExplicitBitVect(4096)
        DataStructs.ExplicitBitVect.FromBase64(bit_vect, encoded)
        out = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out

    return [_decode(item) for item in fps]
def rd_kit(dir_sdf = "../data/sdf/"):
    """Plot how similar each SDF molecule in a directory is to a baseline.

    The first molecule of the first file (sorted by name) in ``dir_sdf`` is
    the baseline.  For the first molecule of every file, Morgan, RDKit-path,
    Avalon and layered fingerprints are built and compared with the
    baseline's by Tanimoto similarity.  Histograms of the Morgan and
    RDKit-path similarities are shown; the mean RDKit-path similarity and the
    number of files processed are printed.

    The baseline and the compared molecules are fingerprinted with the SAME
    parameters (Morgan radius 2, RDKit maxPath 2); otherwise the similarities
    are not comparable and the baseline would not score 1.0 against itself.

    Args:
        dir_sdf: Directory holding the SDF files.

    Raises:
        ValueError: If ``dir_sdf`` contains no (non-hidden) entries.
    """
    bit_length = 1024
    morgan_radius = 2
    rdk_max_path = 2  # matches the "rdk2" histogram label below
    avalon_bits = 128

    # List the directory without spawning a shell: safe for paths containing
    # spaces or shell metacharacters.  Hidden entries are skipped and names
    # are sorted so the baseline choice is deterministic.
    sdf_names = sorted(
        name for name in os.listdir(dir_sdf) if not name.startswith("."))
    if not sdf_names:
        raise ValueError("No SDF files found in {}".format(dir_sdf))

    def _first_mol(name):
        # os.path.join tolerates dir_sdf with or without a trailing slash.
        return SDMolSupplier(os.path.join(dir_sdf, name))[0]

    def _fingerprints(mol):
        # One parameter set shared by the baseline and every compared mol.
        return (
            AllChem.GetMorganFingerprintAsBitVect(
                mol, morgan_radius, nBits=bit_length),
            AllChem.RDKFingerprint(mol, maxPath=rdk_max_path),
            pyAvalonTools.GetAvalonFP(mol, avalon_bits),
            AllChem.LayeredFingerprint(mol),
        )

    baseline_fps = _fingerprints(_first_mol(sdf_names[0]))

    # One similarity list per fingerprint type: morgan, rdk, avalon, layered.
    sims = ([], [], [], [])
    for name in sdf_names:
        for bucket, base_fp, fp in zip(
                sims, baseline_fps, _fingerprints(_first_mol(name))):
            bucket.append(DataStructs.FingerprintSimilarity(
                base_fp, fp, metric=DataStructs.TanimotoSimilarity))
    count = len(sdf_names)

    sim_matrix_morgan, sim_matrix_rdk, sim_matrix_aval, sim_matrix_layer = (
        np.array(bucket) for bucket in sims)

    label_morgan = "morgan" + str(bit_length)
    plt.hist(sim_matrix_morgan, label = label_morgan)
    plt.hist(sim_matrix_rdk, label = "rdk2")
    # The Avalon ("avalon128") and layered ("layer") similarities are
    # computed above but intentionally not plotted.

    print(np.mean(sim_matrix_rdk))
    print(count)
    plt.xlabel("Similarity to Baseline")
    plt.ylabel("Counts")
    plt.title("Different Fingerprinting Methods, Similarity to Baseline")
    plt.legend()
    plt.show()
def chemical_random_episode(env, search_dict, target_fps, target_atoms,
                            target_bonds, target_C_envs, radius):
    """Run one random episode through the environment, pruning ``search_dict``.

    Starting from ``env.reset()``, the loop repeatedly filters the actions
    available in the current state with ``filter_actions``, picks one of the
    remaining actions uniformly at random and steps the environment.
    ``search_dict`` maps a state to the actions still considered for it and
    is mutated in place across calls: dead-end states are removed, and the
    action that reached the target is removed from its parent state.

    Returns:
        ``(episode, reach, search_dict)`` where ``episode`` is the list of
        visited states, ``reach`` is the flag from the last ``filter_actions``
        call, and ``search_dict`` is either the (mutated) dictionary or the
        string ``'terminate'`` when only the initial state with an empty
        action list was left.
    """
    initial_state = env.reset()
    state = initial_state
    pre_state = initial_state
    episode = [state]
    reach = False
    while True:
        if state not in search_dict:
            valid_actions = env._get_valid_actions()
            valid_actions, reach = filter_actions(state, valid_actions,
                                                  target_fps, target_atoms,
                                                  target_bonds, target_C_envs,
                                                  radius)  # filter actions
            # First visit of this state: record its possible actions.
            search_dict[state] = valid_actions
        elif search_dict == {initial_state: []}:
            # Nothing left to explore: signal the caller with a sentinel
            # string in place of the dictionary.
            search_dict = 'terminate'
            break
        else:
            valid_actions = search_dict[state]  # load the stored actions
            valid_actions, reach = filter_actions(state, valid_actions,
                                                  target_fps, target_atoms,
                                                  target_bonds, target_C_envs,
                                                  radius)  # filter again
        nA = len(valid_actions)
        if nA == 0:
            # Dead end: drop this state from the dictionary and from its
            # predecessor's action list so it is never tried again.
            search_dict.pop(state)
            # NOTE(review): if the very first state has no valid action,
            # state == pre_state here and the entry was just popped, so this
            # lookup looks like it would raise KeyError -- confirm.
            search_dict[pre_state].remove(state)
            mol1 = Chem.MolFromSmiles(state)
            fps1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=radius)
            print('No action space, Last action: %s, similarity: %.3f' %
                  (state, DataStructs.FingerprintSimilarity(fps1, target_fps)))
            break
        action = np.random.randint(nA)
        # reward and done are returned by the environment but not used here.
        next_state, reward, done = env.step(valid_actions, action)
        episode.append(next_state)
        if reach == True:
            # Target reached: remove the successful action from its parent.
            search_dict[state].remove(next_state)
            mol2 = Chem.MolFromSmiles(next_state)
            fps2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=radius)
            print('Reach, last action: %s, similarity: %.3f' %
                  (next_state,
                   DataStructs.FingerprintSimilarity(fps2, target_fps)))
            break
        pre_state = state
        state = next_state
    return episode, reach, search_dict
def fit_model(self, toxicity_data):
    """Fit the toxicity model from a tab-separated training file.

    Each row must provide a ``toxicity`` value (its natural log is the
    regression target) and an ``InChI`` string, which is converted to a
    1024-bit fingerprint through ``self.calculate_ECFP``.

    Side effects:
        Sets ``self.log_loading``, ``self.log_score`` and ``self.model``.

    Args:
        toxicity_data: Path to the tab-delimited file with a header row.
    """
    y = []
    rows = []
    # Loading data
    with open(toxicity_data, "r") as file_hdl:
        reader = csv.DictReader(file_hdl, delimiter='\t')
        for row in reader:
            y.append(math.log(float(row["toxicity"])))
            arr = np.zeros((1, ))
            fp = self.calculate_ECFP(row["InChI"])
            DataStructs.ConvertToNumpyArray(fp, arr)
            # The reshape also checks the fingerprint really has 1024 bits.
            rows.append(np.reshape(arr, (1, 1024)))

    # Stack once at the end; concatenating inside the loop copies the whole
    # matrix for every row (quadratic).  X stays None for an empty file.
    X = np.concatenate(rows, axis=0) if rows else None

    self.log_loading = "Loaded {} compounds from {}".format(
        len(y), toxicity_data)
    y = np.array(y)

    # Fitting model: the score returned by the selection step is not used,
    # the reported score is the R2 of the chosen model on its training data.
    best_model, _ = self.select_current_best_model(X, y, models_number=10)
    y_pred = best_model.predict(X)
    score = sklearn.metrics.r2_score(y, y_pred)
    self.log_score = "The toxicity model has a R2 score of {} on itself".format(
        round(score, 2))
    self.model = best_model
def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """Compute the RDKit topological (Daylight-like) fingerprint as an array.

    Args:
        mol: Molecule object passed to ``RDKFingerprint``.
        nBits: Fingerprint size in bits.
        return_bitInfo: When True, also return the ``bitInfo`` dictionary
            filled by ``RDKFingerprint``.

    Returns:
        A boolean numpy array, or ``(array, bitInfo)`` when
        ``return_bitInfo`` is True.
    """
    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    # Builtin bool instead of np.bool: the alias was deprecated in NumPy
    # 1.20 and removed in 1.24.
    arr = np.zeros((0, ), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # Return the collected dictionary, not the boolean flag.
        return arr, bitInfo
    return arr
def measure_similarity(self, db_fps, sim_metric=DataStructs.TanimotoSimilarity, th=0.8):
    """Compare one database fingerprint with every user fingerprint.

    Matches with similarity >= ``th`` are recorded in the module-level
    ``fps_matches`` dictionary under the current ``db_cntr`` as
    ``(user_index, similarity)`` tuples.  ``db_cntr`` is incremented only
    when ``db_fps`` is available.
    """
    global user_ip_fps
    global db_cntr
    global fps_matches

    if db_cntr % 10000 == 0:
        self.jlogger.info("Completed checking similarity with {} compound of db".format(db_cntr))

    if db_fps is None:
        self.jlogger.debug("DB Finger print is unavailable, skipping this compound {}".format(db_cntr))
        return

    for u_fps_cntr, u_fps in enumerate(user_ip_fps):
        try:
            if u_fps is None:
                self.jlogger.debug(
                    "User Finger print is unavailable, skipping this compound {}".format(u_fps_cntr))
            else:
                sim = DataStructs.FingerprintSimilarity(u_fps, db_fps, metric=sim_metric)
                if sim >= th:
                    hit = (u_fps_cntr, sim)
                    if db_cntr in fps_matches:
                        fps_matches[db_cntr].append(hit)
                    else:
                        fps_matches[db_cntr] = [hit]
        except Exception:
            failure = "Error measuring similarity of compound db_cntr {} and u_fps_cntr {}".format(db_cntr, u_fps_cntr)
            logger.exception(failure)
            self.jlogger.debug(failure)

    db_cntr += 1
def get_vars_for_sim_calc(self, fp1, fp2):
    """Return the on-bit counts needed by bit-vector similarity formulas.

    If the fingerprints differ in size, the larger one is folded down by the
    integer size ratio first.

    Returns:
        ``(a, b, c)``: on-bits in ``fp1``, on-bits in ``fp2`` and the number
        of on-bits they have in common.
    """
    # ref: https://github.com/rdkit/rdkit-orig/blob/master/rdkit/DataStructs/__init__.py
    size_1, size_2 = fp1.GetNumBits(), fp2.GetNumBits()
    if size_1 < size_2:
        fp2 = DataStructs.FoldFingerprint(fp2, size_2 // size_1)
    elif size_2 < size_1:
        fp1 = DataStructs.FoldFingerprint(fp1, size_1 // size_2)

    on_bits_1 = fp1.GetNumOnBits()
    on_bits_2 = fp2.GetNumOnBits()
    shared = DataStructs.OnBitsInCommon(fp1, fp2)
    return on_bits_1, on_bits_2, len(shared)
def dg_score(active_mols, decoy_mols):
    """For each active, find its most similar decoy (similar to DEKOIS).

    Lower is better (decoys less like actives), higher is worse.  Decoys
    that are ``None`` contribute a similarity of 0.

    Returns:
        Two numpy arrays, one entry per active: the highest Tanimoto
        similarity to any decoy and the index of that decoy.
    """

    def _feature_morgan(mol):
        # Radius-3 feature-based Morgan fingerprint, roughly FCFP_6.
        return AllChem.GetMorganFingerprintAsBitVect(mol, 3, useFeatures=True)

    active_fps = [_feature_morgan(mol) for mol in active_mols]
    decoys_fps = [None if mol is None else _feature_morgan(mol)
                  for mol in decoy_mols]

    closest_sims, closest_sims_id = [], []
    for active_fp in active_fps:
        sims_to_decoys = [
            0 if decoy_fp is None
            else DataStructs.TanimotoSimilarity(active_fp, decoy_fp)
            for decoy_fp in decoys_fps
        ]
        closest_sims.append(max(sims_to_decoys))
        closest_sims_id.append(np.argmax(sims_to_decoys))

    return np.array(closest_sims), np.array(closest_sims_id)
def compound_scoring(compound):
    """Predict a score for ``compound`` from its ECFP fingerprint.

    The fingerprint from ``compound._get_ECFP()`` is converted to a
    ``(1, 1024)`` array and passed to ``self.model.predict``.
    """
    # NOTE(review): ``self`` is not a parameter of this function; presumably
    # it is captured from an enclosing method's scope -- confirm.
    fingerprint = compound._get_ECFP()
    features = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(fingerprint, features)
    features = np.reshape(features, (1, 1024))
    return self.model.predict(features)
def __call__(self, smiles: List[str]) -> dict:
    """Score SMILES by how close their nearest-reference similarity is to 0.5.

    For every parsable SMILES, the highest Tanimoto similarity to
    ``self.ref_fps`` is computed on radius-3 count Morgan fingerprints and
    mapped through ``max(1 - 2 * |0.5 - t|, 0)``.  Unparsable SMILES score 0.

    Returns:
        ``{"total_score": <float32 array, one entry per input SMILES>}``.
    """
    mols = [Chem.MolFromSmiles(smile) for smile in smiles]
    valid_idxs = [idx for idx, mol in enumerate(mols) if mol is not None]

    fps = [
        AllChem.GetMorganFingerprint(mols[idx], 3, useCounts=True,
                                     useFeatures=False)
        for idx in valid_idxs
    ]
    nearest = np.array([
        np.max(DataStructs.BulkTanimotoSimilarity(fp, self.ref_fps))
        for fp in fps
    ])
    # Triangular profile peaking at a similarity of 0.5.
    tanimoto = np.maximum(1 - 2 * np.absolute(0.5 - nearest), 0)

    score = np.full(len(smiles), 0, dtype=np.float32)
    for idx, value in zip(valid_idxs, tanimoto):
        score[idx] = value
    return {"total_score": np.array(score, dtype=np.float32)}
def _compute_fps(self) -> None:
    """Compute a numpy array of fingerprint vectors and store it in ``self.fps``.

    ``self.fp_type`` selects the fingerprint: ``'morgan'`` (radius
    ``self.fp_rad``) or ``'rdkit'`` (path length fixed to ``self.fp_rad``).
    One row of ``self.fp_bits`` values is produced per molecule in
    ``self.data.mol``.

    Raises:
        ValueError: If ``self.fp_type`` is neither ``'morgan'`` nor
            ``'rdkit'``.
    """
    # Pick the fingerprint function once, up front.  With two independent
    # ``if`` tests inside the loop, an unknown fp_type left the fingerprint
    # variable unbound instead of reporting a clear error.
    if self.fp_type == 'morgan':
        def make_fp(mol):
            return rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, self.fp_rad, self.fp_bits)
    elif self.fp_type == 'rdkit':
        def make_fp(mol):
            return Chem.RDKFingerprint(
                mol,
                minPath=self.fp_rad,
                maxPath=self.fp_rad,
                fpSize=self.fp_bits,
            )
    else:
        raise ValueError(
            "Unsupported fp_type: {!r}".format(self.fp_type))

    fp_vects = []
    for mol in tqdm.tqdm(self.data.mol,
                         desc='Computing fingerprints',
                         disable=self.prog):
        array = np.zeros((0, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(make_fp(mol), array)
        fp_vects.append(array)

    self.fps = np.zeros((len(fp_vects), self.fp_bits))
    for i, fp_vect in enumerate(fp_vects):
        self.fps[i, :] = fp_vect
def dg_score_rev(actives, decoys):
    """For each decoy SMILES, find its most similar active (similar to DEKOIS).

    Lower is better (decoys less like actives), higher is worse.

    Returns:
        Two lists, one entry per decoy: the highest Tanimoto similarity to
        any active and the index of that active.
    """

    def _feature_morgan(smi):
        # Radius-3 feature-based Morgan fingerprint, roughly FCFP_6.
        return AllChem.GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smi), 3, useFeatures=True)

    active_fps = [_feature_morgan(smi) for smi in actives]
    decoys_fps = [_feature_morgan(smi) for smi in decoys]

    closest_sims, closest_sims_id = [], []
    for decoy_fp in decoys_fps:
        sims_to_actives = [
            DataStructs.TanimotoSimilarity(active_fp, decoy_fp)
            for active_fp in active_fps
        ]
        closest_sims.append(max(sims_to_actives))
        closest_sims_id.append(np.argmax(sims_to_actives))

    return closest_sims, closest_sims_id
def chemical_space(fname):
    """Create a 2D chemical-space embedding from a text file of SMILES.

    Every non-blank line of ``fname`` is comma-separated; the third field is
    the SMILES and the first two fields form the molecule name.  A radius-2
    Morgan fingerprint is computed for each molecule and the fingerprints
    are embedded with UMAP using ``tanimoto_dist`` as the metric.

    :param fname: path to the comma-separated text file
    :return: the array returned by ``umap.UMAP(...).fit_transform``
    :raises ValueError: if a SMILES string cannot be parsed
    """
    ligands = []
    with open(fname, "r") as f:
        for line_no, entry in enumerate(f.read().splitlines(), start=1):
            if not entry.strip():
                continue  # tolerate blank / trailing empty lines
            fields = entry.split(",")
            smiles = fields[2]
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                raise ValueError(
                    "Cannot parse SMILES {!r} on line {} of {}".format(
                        smiles, line_no, fname))
            mol.SetProp("_Name", str(fields[0] + "/" + fields[1]))
            ligands.append(mol)

    X = []
    for ligand in ligands:
        AllChem.Compute2DCoords(ligand)
        arr = np.zeros((0,))
        # Fingerprint the molecule of THIS iteration; using the leftover
        # variable from the parsing loop gave every row the fingerprint of
        # the last molecule in the file.
        fp = AllChem.GetMorganFingerprintAsBitVect(ligand, 2)
        DataStructs.ConvertToNumpyArray(fp, arr)
        X.append(arr)

    return umap.UMAP(n_neighbors=5, min_dist=0.2,
                     metric=tanimoto_dist).fit_transform(X)
def search_by_mols(self, mols, topk=10):
    '''
    Search the index for the nearest neighbours of each query molecule.

    :param mols: a list of molecules
    :param topk: number of neighbours requested from the index per molecule
    :return: one list per query, sorted by descending score:
             [[{"id": xx, "smiles": xx, "score": xx}, {}, ...], []]
    '''
    mols_vec = []
    for query in mols:
        maccs_bits = np.array([])
        DataStructs.ConvertToNumpyArray(
            rdMolDescriptors.GetMACCSKeysFingerprint(query), maccs_bits)
        mols_vec.append(self.vec2bytes(maccs_bits))

    ret_dists, ret_ids = self.index.search(
        np.array(mols_vec).astype("uint8"), topk)

    rets = []
    # The distances from the index are not used; hits are re-scored below.
    for query, _dists, hit_ids in zip(mols, ret_dists, ret_ids):
        hits = []
        for hit_id in hit_ids:
            row = self.df_zinc.iloc[hit_id]
            hit_smiles = row["smiles"]
            hits.append({
                "id": row["zinc_id"],
                "smiles": hit_smiles,
                "score": self.calc_similarity(
                    query, Chem.MolFromSmiles(hit_smiles)),
            })
        hits.sort(key=lambda item: item["score"], reverse=True)
        rets.append(hits)
    return rets
def convert_fps(fp):
    """
    Convert an RDKit fingerprint to a one-element list holding its bits
    as a string of digits.
    """
    bits = numpy.zeros((1, ))
    DataStructs.ConvertToNumpyArray(fp, bits)
    bit_string = ''.join(str(int(bit)) for bit in bits)
    return [bit_string]
def getFpArr( mols, nBits = 1024 ):
    """Return radius-2 Morgan fingerprints of ``mols`` stacked in one array."""

    def _to_array(fp):
        out = np.zeros( (1,) )
        DataStructs.ConvertToNumpyArray( fp, out )
        return out

    # All fingerprints are built before any conversion, as a separate pass.
    fps = [ AllChem.GetMorganFingerprintAsBitVect( mol, 2, nBits=nBits )
            for mol in mols ]
    return np.array( [ _to_array( fp ) for fp in fps ] )
def calc_fp_arr( mols ):
    """Return radius-2 Morgan fingerprints of ``mols`` as one numpy array."""

    def _fingerprint_row( mol ):
        row = np.zeros( (1,) )
        bit_vect = AllChem.GetMorganFingerprintAsBitVect( mol, 2 )
        DataStructs.ConvertToNumpyArray( bit_vect, row )
        return row

    return np.asarray( [ _fingerprint_row( mol ) for mol in mols ] )
def convert_reaction_to_fp(rsmi, psmi, fpsize=2048):
    """Build product and reaction-difference fingerprints for a reaction.

    Both SMILES are turned into chirality-aware radius-2 Morgan bit vectors
    stored as int8 arrays of length ``fpsize``.  The reaction fingerprint is
    the element-wise difference ``product - reactant``.

    Returns:
        ``(pfp, rxnfp)`` on success, or ``None`` (after printing a message)
        when a molecule or fingerprint could not be built.
    """
    # --- reactant ---------------------------------------------------------
    reactant_smiles = rsmi.encode('utf-8')
    try:
        reactant_mol = Chem.MolFromSmiles(reactant_smiles)
    except Exception as e:
        print("Cannot build reactant mol due to {}".format(e))
        return None
    try:
        reactant_bits = AllChem.GetMorganFingerprintAsBitVect(
            reactant_mol, radius=2, nBits=fpsize,
            useFeatures=False, useChirality=True)
        rfp = np.empty(fpsize, dtype='int8')
        DataStructs.ConvertToNumpyArray(reactant_bits, rfp)
    except Exception as e:
        print("Cannot build reactant fp due to {}".format(e))
        print(reactant_smiles)
        return None

    # --- product ----------------------------------------------------------
    product_smiles = psmi.encode('utf-8')
    try:
        product_mol = Chem.MolFromSmiles(product_smiles)
    except Exception as e:
        print("Cannot build product mol due to {}".format(e))
        return None
    try:
        product_bits = AllChem.GetMorganFingerprintAsBitVect(
            product_mol, radius=2, nBits=fpsize,
            useFeatures=False, useChirality=True)
        pfp = np.empty(fpsize, dtype='int8')
        DataStructs.ConvertToNumpyArray(product_bits, pfp)
    except Exception as e:
        print("Cannot build product fp due to {}".format(e))
        return None

    rxnfp = pfp - rfp
    return np.asarray(pfp), np.asarray(rxnfp)
def smiles2fps(self, smiles):
    """Return the radius-3 Morgan fingerprint of ``smiles`` as a 1-row array.

    Hydrogens are added explicitly before fingerprinting; the fingerprint
    has ``self.state_size`` bits.
    """
    mol_with_hs = AllChem.AddHs(Chem.MolFromSmiles(smiles))
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        mol_with_hs, 3, nBits=self.state_size)
    bits = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return np.array([bits])
def pka_similarities(smile, mol_set, n):
    """Return the first ``n`` sorted (similarity, value) pairs, flattened.

    The Dice similarity between the hashed atom-pair fingerprint of
    ``smile`` and ``entry[2]`` of every entry in ``mol_set`` is paired with
    ``entry[1]``; the pairs are sorted in ascending order and the first
    ``n`` are returned as a flat numpy array.
    """
    query_mol = Chem.MolFromSmiles(smile)
    query_fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(query_mol)

    scored = [
        [DataStructs.DiceSimilarity(query_fp, entry[2]), entry[1]]
        for entry in mol_set
    ]
    # NOTE(review): ascending sort keeps the n LOWEST similarities --
    # confirm that this, not the n most similar, is what callers expect.
    scored.sort()
    return np.asarray(scored[:n]).flatten()
def TakeInput(filepath, hmdb_filepath, OR_name):
    """Shortlist HMDB metabolites that resemble known ligands.

    Every unique (Smiles, Ligand) pair from ``filepath`` is compared with
    every HMDB entry from ``hmdb_filepath`` using the fingerprint from
    ``FingerprintMols.FingerprintMol`` and ``DataStructs.FingerprintSimilarity``.
    Pairs with a similarity of at least 0.85 are written to
    ``Final_test_set_<OR_name>.csv``; the distinct ligands among them are
    written to ``Shortlisted_Metabolites<OR_name>.csv`` with an empty
    ``Activation Status`` column.

    Args:
        filepath: CSV with ``Smiles`` and ``Ligand`` columns.
        hmdb_filepath: CSV with ``SMILES`` and ``NAME`` columns.
        OR_name: Suffix used in the two output file names.
    """
    similarity_threshold = 0.85

    def _fingerprint(smiles):
        # Fingerprint for *smiles*, or None (after a warning) on failure.
        try:
            return FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))
        except Exception:
            print("WARNING")
            return None

    # NOTE(review): the result of this call was always overwritten by the
    # read_csv below; the call is kept in case it has side effects -- confirm.
    extractPositiveOnes(filepath)

    data_hmdb = pd.read_csv(hmdb_filepath, encoding="ISO-8859-1")
    positive_Cancer = pd.read_csv(filepath, encoding="ISO-8859-1")

    hmdb_data = pd.concat([data_hmdb['SMILES'], data_hmdb['NAME']],
                          axis=1).reset_index(drop=True)
    Cancer_clean_data = pd.concat(
        [positive_Cancer['Smiles'], positive_Cancer["Ligand"]],
        axis=1).drop_duplicates().reset_index(drop=True)

    # Fingerprint each HMDB entry once instead of once per ligand.
    hmdb_smiles = list(hmdb_data['SMILES'])
    hmdb_names = list(hmdb_data['NAME'])
    hmdb_fps = [_fingerprint(smi) for smi in hmdb_smiles]

    matches = []
    ligand_rows = zip(Cancer_clean_data['Smiles'], Cancer_clean_data['Ligand'])
    for i, (ligand_smiles, ligand_name) in enumerate(ligand_rows):
        fps1 = _fingerprint(ligand_smiles)
        if fps1 is not None:
            for fps2, name, smi in zip(hmdb_fps, hmdb_names, hmdb_smiles):
                if fps2 is None:
                    continue
                sim_val = DataStructs.FingerprintSimilarity(fps1, fps2)
                if sim_val >= similarity_threshold:
                    matches.append(
                        [ligand_name, ligand_smiles, name, smi, sim_val])
        print("Comparison Done for Ligand :" + str(i))

    # Build the frame in one go; growing it row by row with .loc is slow.
    df1 = pd.DataFrame(matches, columns=[
        "Cancer_Molecule",
        "Cancer_SMILES",
        "HMDB_Molecule",
        "HMDB_SMILES",
        "TANIMOTO_Similarity_Value",
    ])
    df1.to_csv("Final_test_set_" + OR_name + ".csv")

    # Use the column names df1 really has.  The activation status is not
    # known here, so it is written as an empty placeholder column; zipping
    # with an empty list would drop every row.
    Shortlisted_Metabolites = pd.DataFrame({
        'Smiles': df1["Cancer_SMILES"],
        'Ligand': df1["Cancer_Molecule"],
        'Activation Status': [None] * len(df1),
    })
    Shortlisted_Metabolites = Shortlisted_Metabolites.drop_duplicates(
        subset='Ligand', keep='first')

    Shortlisted_Metabolites.to_csv("Shortlisted_Metabolites" + OR_name + ".csv")
    print("Shortlisted_Metabolites" + OR_name + ".csv" + " has been saved")
    print("Congrats! Final_test_set_" + OR_name + ".csv has been successfully saved!")
def GetAvalonFPs(mol, nBits=2048):
    '''
    Avalon fingerprint of ``mol`` as a boolean numpy array of ``nBits`` bits.

    Avalon_fingerprints: https://pubs.acs.org/doi/pdf/10.1021/ci050413p
    '''
    fp = GAFP(mol, nBits = nBits)
    # Builtin bool instead of np.bool: the alias was deprecated in NumPy
    # 1.20 and removed in 1.24.
    arr = np.zeros((0,), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def GetMACCSFPs(mol):
    '''
    MACCS keys fingerprint (166 keys) of ``mol`` as a boolean numpy array.
    '''
    fp = AllChem.GetMACCSKeysFingerprint(mol)
    # Builtin bool instead of np.bool: the alias was deprecated in NumPy
    # 1.20 and removed in 1.24.
    arr = np.zeros((0, ), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def tanimoto(self, mol):
    """Tanimoto similarity between ``mol`` and the query pharmacophore.

    The 2D pharmacophore fingerprint of ``mol`` is generated with
    ``self.sigFactory`` under a one-second ``Timeout`` and compared with
    ``self.query_fp``.

    Returns:
        The similarity, or 0 if the computation timed out.
    """
    try:
        with Timeout(seconds=1):
            fp = Generate.Gen2DFingerprint(mol, self.sigFactory)
            return DataStructs.TanimotoSimilarity(fp, self.query_fp)
    except TimeoutError:
        # The SMILES is passed as a lazy %-argument, so the message needs a
        # placeholder; without one the log record fails to format.
        logging.debug("SMILES Pharmacophore timeout: %s",
                      Chem.MolToSmiles(mol, isomericSmiles=False))
        return 0
def GetTorsionFPs(mol, nBits = 2048, binary = True):
    '''
    Hashed topological-torsion fingerprint of ``mol`` as a numpy array.

    With ``binary`` True the array has a boolean dtype, otherwise int8.
    '''
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits = nBits)
    # Builtin bool instead of np.bool: the alias was deprecated in NumPy
    # 1.20 and removed in 1.24.
    dtype = bool if binary else np.int8
    arr = np.zeros((0,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def build_mol_features(in_file, out_file):
    """Compute MACCS-key fingerprints for a zipped CSV of SMILES and save them.

    The ``smiles`` column of ``in_file`` is read, one MACCS keys fingerprint
    is computed per entry, and the stacked array is written to ``out_file``
    with ``np.save``.
    """
    df_zinc = pd.read_csv(in_file, compression="zip")

    def _maccs_array(smi):
        out = np.array([])
        DataStructs.ConvertToNumpyArray(
            rdMolDescriptors.GetMACCSKeysFingerprint(Chem.MolFromSmiles(smi)),
            out)
        return out

    progress = tqdm.tqdm(df_zinc["smiles"], total=len(df_zinc))
    fp_arr = np.array([_maccs_array(smi) for smi in progress])
    np.save(out_file, fp_arr)
def get_on_bits(self, mol):
    """Return the indices of the scaffolds contained in ``mol``.

    ``mol`` may be a molecule object or a SMILES string.  A scaffold counts
    as present when all bits of its fingerprint are set in the molecule's
    RDKit fingerprint (cheap pre-filter) and the molecule really has a
    substructure match for it.
    """
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    mol_fp = Chem.RDKFingerprint(mol)

    return [
        idx
        for idx, scaffold_fp in enumerate(self.scaffold_fps)
        if DataStructs.AllProbeBitsMatch(scaffold_fp, mol_fp)
        and mol.HasSubstructMatch(self.scaffolds[idx])
    ]