def _smiles_to_morgan_array(smi, nbits, useFeatures, useChirality, label):
    """Build a radius-2 Morgan bit fingerprint for *smi* as a float32 array.

    :param smi: SMILES passed straight to ``Chem.MolFromSmiles``
    :param nbits: length of the fingerprint
    :param useFeatures: forwarded to ``GetMorganFingerprintAsBitVect``
    :param useChirality: forwarded to ``GetMorganFingerprintAsBitVect``
    :param label: "reactant" or "product"; only used in the error message
    :return: numpy array of length *nbits*, or None when the molecule or the
        fingerprint could not be built (a diagnostic is printed)
    """
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception as e:
        print(e)
        return None
    try:
        fp_bit = AllChem.GetMorganFingerprintAsBitVect(
            mol=mol, radius=2, nBits=nbits,
            useFeatures=useFeatures, useChirality=useChirality)
        fp = np.empty(nbits, dtype='float32')
        DataStructs.ConvertToNumpyArray(fp_bit, fp)
    except Exception as e:
        print("Cannot build {} fp due to {}".format(label, e))
        return None
    return fp


def create_rxn_Morgan2FP_separately(rsmi, psmi, rxnfpsize=gc.fingerprint_bits,
                                    pfpsize=gc.fingerprint_bits,
                                    useFeatures=False, calculate_rfp=True,
                                    useChirality=False):
    """Fingerprint reactant and product SMILES separately.

    :param rsmi: reactant SMILES (text; it is UTF-8 encoded before parsing)
    :param psmi: product SMILES (text; it is UTF-8 encoded before parsing)
    :param rxnfpsize: number of bits of the reactant fingerprint
    :param pfpsize: number of bits of the product fingerprint
    :param useFeatures: forwarded to the Morgan fingerprint call
    :param calculate_rfp: accepted for interface compatibility; this function
        does not read it
    :param useChirality: forwarded to the Morgan fingerprint call
    :return: ``[pfp, rfp]`` (product first), or None if either fingerprint
        could not be built
    """
    # Both inputs are encoded up front so a non-text argument fails before
    # any fingerprinting work is done.
    rsmi = rsmi.encode('utf-8')
    psmi = psmi.encode('utf-8')

    rfp = _smiles_to_morgan_array(
        rsmi, rxnfpsize, useFeatures, useChirality, "reactant")
    if rfp is None:
        return

    pfp = _smiles_to_morgan_array(
        psmi, pfpsize, useFeatures, useChirality, "product")
    if pfp is None:
        return

    return [pfp, rfp]
def _compute_fps(self) -> None:
    """Compute ``self.fps``, a 2-D array holding one fingerprint per row.

    The fingerprint flavour is selected by ``self.fp_type``: ``'morgan'``
    (radius ``self.fp_rad``) or ``'rdkit'`` (path length fixed to
    ``self.fp_rad``). Both use ``self.fp_bits`` bits.

    Raises:
        ValueError: if ``self.fp_type`` is neither ``'morgan'`` nor
            ``'rdkit'`` and there is at least one molecule to process.
    """
    fp_vects = []
    # NOTE(review): the progress bar is *disabled* when self.prog is truthy;
    # confirm that this is the intended polarity.
    for mol in tqdm.tqdm(self.data.mol, desc='Computing fingerprints',
                         disable=self.prog):
        if self.fp_type == 'morgan':
            fp_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, self.fp_rad, self.fp_bits)
        elif self.fp_type == 'rdkit':
            fp_vect = Chem.RDKFingerprint(
                mol,
                minPath=self.fp_rad,
                maxPath=self.fp_rad,
                fpSize=self.fp_bits,
            )
        else:
            # Previously this fell through with fp_vect unbound (or stale
            # from an earlier iteration).
            raise ValueError(
                "Unsupported fingerprint type: {!r}".format(self.fp_type))

        array = np.zeros((0, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp_vect, array)
        fp_vects.append(array)

    self.fps = np.zeros((len(fp_vects), self.fp_bits))
    for i, fp_vect in enumerate(fp_vects):
        self.fps[i, :] = fp_vect
def fit_model(self, toxicity_data):
    """Fit the toxicity model from a tab-separated file.

    Each row must provide a ``toxicity`` column (its natural log is the
    regression target) and an ``InChI`` column (fingerprinted with
    ``self.calculate_ECFP``). The best model returned by
    ``self.select_current_best_model`` is stored in ``self.model``, and
    ``self.log_loading`` / ``self.log_score`` receive human-readable
    summaries.

    :param toxicity_data: path of the tab-separated training file
    """
    y = []
    rows = []
    # Loading data
    with open(toxicity_data, "r") as file_hdl:
        reader = csv.DictReader(file_hdl, delimiter='\t')
        for row in reader:
            y.append(math.log(float(row["toxicity"])))
            arr = np.zeros((1, ))
            fp = self.calculate_ECFP(row["InChI"])
            DataStructs.ConvertToNumpyArray(fp, arr)
            # The reshape fixes the fingerprint length at 1024 bits.
            rows.append(np.reshape(arr, (1, 1024)))

    # Stack once instead of re-concatenating the growing matrix per row.
    # X stays None for a file without data rows, as it did before.
    X = np.concatenate(rows, axis=0) if rows else None

    self.log_loading = "Loaded {} compounds from {}".format(
        len(y), toxicity_data)
    y = np.array(y)

    # Fitting model: the selection score is discarded because the reported
    # score is the R2 of the chosen model on its own training data.
    best_model, _ = self.select_current_best_model(X, y, models_number=10)
    y_pred = best_model.predict(X)
    score = sklearn.metrics.r2_score(y, y_pred)
    self.log_score = "The toxicity model has a R2 score of {} on itself".format(
        round(score, 2))
    self.model = best_model
def compound_scoring(compound):
    """Predict a score for *compound* with the enclosing ``self.model``.

    The compound's ECFP is converted to a (1, 1024) feature row and handed
    to ``self.model.predict``; the prediction is returned unchanged.
    ``self`` is taken from the enclosing scope.
    """
    fingerprint = compound._get_ECFP()
    features = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(fingerprint, features)
    # The model is given a single-row 2-D input.
    feature_row = np.reshape(features, (1, 1024))
    return self.model.predict(feature_row)
def chemical_space(fname):
    """
    from text file with smiles data, create a chemical space representation

    Every line of the file is split on commas: fields 0 and 1 are joined
    with "/" to form the molecule name and field 2 is the SMILES string.

    :param fname: path of the comma-separated input file
    :return: the UMAP embedding of the Morgan (radius 2) fingerprints, one
        row per input line
    """
    with open(fname, "r") as f:
        entries = f.read().splitlines()

    ligands = []
    for e in entries:
        fields = e.split(",")
        mol = Chem.MolFromSmiles(fields[2])
        mol.SetProp("_Name", str(fields[0] + "/" + fields[1]))
        ligands.append(mol)

    X = []
    for ligand in ligands:
        AllChem.Compute2DCoords(ligand)
        arr = np.zeros((0,))
        # Fingerprint the ligand being iterated. The previous code reused
        # the last molecule parsed above, so every row of X was identical.
        fp = AllChem.GetMorganFingerprintAsBitVect(ligand, 2)
        DataStructs.ConvertToNumpyArray(fp, arr)
        X.append(arr)

    # Alternative embedding kept for reference:
    # return TSNE(n_components=3, metric=tanimoto_dist).fit_transform(X)
    return umap.UMAP(n_neighbors=5, min_dist=0.2,
                     metric=tanimoto_dist).fit_transform(X)
def search_by_mols(self, mols, topk=10):
    '''
    Look up the nearest indexed molecules for each query molecule.

    :param mols: a list of molecules
    :param topk: number of neighbours requested from the index per query
    :return: one list per query, sorted by descending score:
             [[{"id": xx, "smiles": xx, "score": xx}, {}, ...], []]
    '''
    def _maccs_bytes(query):
        # MACCS keys -> numpy array -> packed representation for the index
        bits = np.array([])
        DataStructs.ConvertToNumpyArray(
            rdMolDescriptors.GetMACCSKeysFingerprint(query), bits)
        return self.vec2bytes(bits)

    query_matrix = np.array([_maccs_bytes(m) for m in mols]).astype("uint8")
    ret_dists, ret_ids = self.index.search(query_matrix, topk)

    rets = []
    # The index distances are not used: hits are re-scored with
    # self.calc_similarity and ordered by that score instead.
    for query, _dists, hit_ids in zip(mols, ret_dists, ret_ids):
        hits = []
        for hit_id in hit_ids:
            record = self.df_zinc.iloc[hit_id]
            hit_smiles = record["smiles"]
            hits.append({
                "id": record["zinc_id"],
                "smiles": hit_smiles,
                "score": self.calc_similarity(
                    query, Chem.MolFromSmiles(hit_smiles)),
            })
        hits = sorted(hits, key=lambda item: item["score"], reverse=True)
        rets.append(hits)
    return rets
def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """
    #################################################################
    Calculate the RDKit topological (Daylight-like) fingerprint.

    Usage:

        arr = GetRDkitFPs(mol)
        arr, bitInfo = GetRDkitFPs(mol, return_bitInfo=True)

        Input: mol is a molecule object.
               nBits is the fingerprint length (default 2048).
               return_bitInfo selects whether the bit information
               collected by RDKFingerprint is returned as well.

        Output: a boolean numpy array holding the fingerprint and, when
                return_bitInfo is true, the bitInfo dict filled in by
                RDKFingerprint.
    #################################################################
    """
    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    # Builtin bool instead of the np.bool alias, which is not available in
    # every NumPy release.
    arr = np.zeros((0, ), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # Return the collected dict, not the boolean flag.
        return arr, bitInfo
    return arr
def calc_fp_arr(mols):
    """Return the radius-2 Morgan bit fingerprints of *mols* as one array.

    Each molecule contributes one row; the rows are gathered with
    ``np.asarray``.
    """
    def _to_row(mol):
        row = np.zeros((1,))
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
        DataStructs.ConvertToNumpyArray(bit_vect, row)
        return row

    return np.asarray([_to_row(mol) for mol in mols])
def convert_fps(fp):
    """
    Converts an RDKit fingerprint to a string of its integer values

    The fingerprint is expanded into a numpy array and every element is
    rendered as an integer digit; the joined string is returned wrapped in
    a single-element list.
    """
    bits = numpy.zeros((1, ))
    DataStructs.ConvertToNumpyArray(fp, bits)
    bit_string = ''.join(str(int(bit)) for bit in bits)
    return [bit_string]
def convert_reaction_to_fp(rsmi, psmi, fpsize=2048):
    """Return the product fingerprint and the product-minus-reactant
    difference fingerprint of a reaction.

    Both SMILES are UTF-8 encoded, parsed and fingerprinted with a radius-2,
    chirality-aware Morgan bit vector of ``fpsize`` bits stored as int8.
    Any failure prints a message and makes the function return None.

    :return: ``(pfp, rxnfp)`` where ``rxnfp = pfp - rfp``, or None on failure
    """
    morgan_opts = dict(radius=2, nBits=fpsize,
                       useFeatures=False, useChirality=True)

    # --- reactant side ---
    rsmi = rsmi.encode('utf-8')
    try:
        reactant = Chem.MolFromSmiles(rsmi)
    except Exception as err:
        print("Cannot build reactant mol due to {}".format(err))
        return
    try:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(reactant, **morgan_opts)
        rfp = np.empty(fpsize, dtype='int8')
        DataStructs.ConvertToNumpyArray(bit_vect, rfp)
    except Exception as err:
        print("Cannot build reactant fp due to {}".format(err))
        print(rsmi)
        return

    # --- product side ---
    psmi = psmi.encode('utf-8')
    try:
        product = Chem.MolFromSmiles(psmi)
    except Exception as err:
        print("Cannot build product mol due to {}".format(err))
        return
    try:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(product, **morgan_opts)
        pfp = np.empty(fpsize, dtype='int8')
        DataStructs.ConvertToNumpyArray(bit_vect, pfp)
    except Exception as err:
        print("Cannot build product fp due to {}".format(err))
        return

    difference = pfp - rfp
    return np.asarray(pfp), np.asarray(difference)
def getFpArr(mols, nBits=1024):
    """Return a numpy array with one radius-2 Morgan fingerprint per molecule.

    :param mols: iterable of molecules
    :param nBits: fingerprint length passed to the Morgan generator
    """
    rows = []
    for mol in mols:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits)
        row = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, row)
        rows.append(row)
    return np.array(rows)
def fps_to_nparr(x):
    """
    Convert a base64-encoded fingerprint string to a numpy array.

    The string is base64-decoded, the resulting bytes are used to construct
    an ExplicitBitVect, and the bit vector is expanded into the returned
    array.
    """
    import base64
    from rdkit.Chem import DataStructs

    raw = base64.b64decode(x)
    bit_vect = DataStructs.ExplicitBitVect(raw)
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def getFpArr(fps):
    """Decode base64-encoded bit vectors into a list of numpy arrays.

    :param fps: iterable of base64 strings, each loaded into an
        ExplicitBitVect via ``FromBase64``
    :return: list with one numpy array per input item
    """
    def _decode(encoded):
        # A 4096-bit vector is created first and then populated from the
        # encoded string.
        bit_vect = DataStructs.ExplicitBitVect(4096)
        bit_vect.FromBase64(encoded)
        out = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out

    return [_decode(item) for item in fps]
def smiles2fps(self, smiles):
    """Return the Morgan fingerprint of *smiles* as a single-row 2-D array.

    The molecule is parsed, hydrogens are added, and a radius-3 Morgan bit
    vector of ``self.state_size`` bits is computed. The result is wrapped
    so that the returned array has one row.
    """
    mol_with_hs = AllChem.AddHs(Chem.MolFromSmiles(smiles))
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        mol_with_hs, 3, nBits=self.state_size)
    bits = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return np.array([bits])
def GetMACCSFPs(mol):
    '''
    MACCS keys fingerprint of *mol* as a boolean numpy array.

    The array length is whatever the RDKit MACCS generator returns
    (166 keys; RDKit is expected to emit 167 bits with bit 0 unused --
    verify against the installed RDKit version).
    '''
    fp = AllChem.GetMACCSKeysFingerprint(mol)
    # Builtin bool instead of the np.bool alias, which is not available in
    # every NumPy release.
    arr = np.zeros((0, ), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def GetAvalonFPs(mol, nBits=2048):
    '''
    Avalon_fingerprints: https://pubs.acs.org/doi/pdf/10.1021/ci050413p

    Returns the fingerprint produced by ``GAFP`` (presumably an alias of
    RDKit's Avalon fingerprint generator -- confirm at the import site) as
    a boolean numpy array.

    :param mol: molecule object
    :param nBits: fingerprint length passed to ``GAFP``
    '''
    fp = GAFP(mol, nBits=nBits)
    # Builtin bool instead of the np.bool alias, which is not available in
    # every NumPy release.
    arr = np.zeros((0,), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def build_mol_features(in_file, out_file):
    """Compute MACCS fingerprints for a zipped CSV and save them with np.save.

    :param in_file: zip-compressed CSV with a ``smiles`` column
    :param out_file: destination passed to ``np.save``
    """
    def _maccs_array(smi):
        bits = np.array([])
        maccs = rdMolDescriptors.GetMACCSKeysFingerprint(
            Chem.MolFromSmiles(smi))
        DataStructs.ConvertToNumpyArray(maccs, bits)
        return bits

    df_zinc = pd.read_csv(in_file, compression="zip")
    smiles_iter = tqdm.tqdm(df_zinc["smiles"], total=len(df_zinc))
    fp_arr = np.array([_maccs_array(smi) for smi in smiles_iter])
    np.save(out_file, fp_arr)
def GetTorsionFPs(mol, nBits=2048, binary=True):
    '''
    Hashed topological torsion fingerprint of *mol* as a numpy array.

    :param mol: molecule object
    :param nBits: length of the hashed fingerprint
    :param binary: when true the result has dtype bool, otherwise the
        fingerprint values are stored as int8
    '''
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits=nBits)
    # Builtin bool instead of the np.bool alias, which is not available in
    # every NumPy release.
    dtype = bool if binary else np.int8
    arr = np.zeros((0,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def getFpArrSmiles(smiles, radius=2, nBits=1024):
    """Return Morgan bit fingerprints for the parseable entries of *smiles*.

    :param smiles: iterable of SMILES strings
    :param radius: Morgan radius (previously ignored in favour of a
        hard-coded 2, which is still the default)
    :param nBits: fingerprint length
    :return: list of numpy arrays, one per entry that could be parsed;
        malformed entries are skipped
    """
    X = []
    for line in smiles:
        try:
            m = Chem.MolFromSmiles(line)
            if m is None:
                continue  # mis-formed SMILES: no molecule was produced
            fp = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nBits)
        except TypeError:
            # Boost.Python's ArgumentError (raised for arguments of the
            # wrong type) is expected to derive from TypeError; the old
            # ``except Boost.Python.ArgumentError`` clause referenced a name
            # that is not importable and would itself have failed.
            continue
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        X.append(arr)
    return X
def transform(self):
    """Compute RDKit fingerprints for ``self.structures``.

    Sets ``self.features`` (one int8 fingerprint row per structure),
    ``self.mol_names`` (the ``_Name`` property of every structure, in the
    same order) and ``self.columns`` (the column indices as strings).

    :return: ``self.features``
    """
    super().transform()
    fts = []
    self.mol_names = []
    for mol in self.structures:
        fp = RDKFingerprint(mol)
        arr = np.zeros((0, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fts.append(arr)
        # Record the name alongside its fingerprint so rows and names
        # stay aligned.
        self.mol_names.append(mol.GetProp("_Name"))

    # Build the matrix once, after all fingerprints have been collected.
    self.features = np.array(fts)
    # With no structures the array is 1-D and has no column axis.
    n_columns = self.features.shape[1] if self.features.ndim > 1 else 0
    self.columns = [str(i) for i in range(n_columns)]
    return self.features
def get_ecfp_count_vector(smiles: str, radius: int, nbits: int) -> np.ndarray:
    """
    Build the hashed count-based Morgan (ECFP) vector of a molecule.

    :param smiles: Smiles string
    :param radius: Radius handed to the hashed Morgan fingerprint
    :param nbits: Length of the hashed fingerprint
    :return: the counts as an int8 numpy array
    """
    molecule = Chem.MolFromSmiles(smiles)
    hashed_fp = AllChem.GetHashedMorganFingerprint(molecule, radius, nbits)
    counts = np.zeros((0, ), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(hashed_fp, counts)
    return counts
def maccs_molstring(molecule, fptype):
    """
    Build the MACCS keys fingerprint of a molecule as an integer array

    :param molecule: molecule object
    :param fptype: type, radius and size of fingerprint (not read here)
    :type fptype: dict
    :return: integer numpy array with the MACCS fingerprint
    """
    maccs_keys = MACCSkeys.GenMACCSKeys(molecule)
    molstring = np.zeros((1, ), dtype=int)
    DataStructs.ConvertToNumpyArray(maccs_keys, molstring)
    return molstring
def fingerprint_features(smile_string, radius=2, size=256):
    """Return an int8 Morgan bit fingerprint for a SMILES string.

    The molecule is renumbered with its canonical atom ranks before a
    chirality- and bond-type-aware Morgan bit vector is computed.

    :param smile_string: SMILES of the molecule
    :param radius: Morgan radius
    :param size: number of bits
    """
    parsed = MolFromSmiles(smile_string)
    canonical_order = rdmolfiles.CanonicalRankAtoms(parsed)
    renumbered = rdmolops.RenumberAtoms(parsed, canonical_order)

    bit_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        renumbered,
        radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )
    features = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vect, features)
    return features
def calc_descriptors(rdmol):
    """Return physico-chemical descriptors plus the Morgan fingerprint.

    :return: ``[logp, mwt, rtb, hbd, hba, tpsa, np_fp]`` where ``np_fp`` is
        the radius-2, ``N_BITS``-long Morgan bit vector as a numpy array
    """
    bit_vect = Chem.GetMorganFingerprintAsBitVect(
        rdmol, radius=2, nBits=N_BITS, useFeatures=False)
    fingerprint = np.zeros(N_BITS)
    # ConvertToNumpyArray fills `fingerprint` in place; its return value is
    # not needed.
    DataStructs.ConvertToNumpyArray(bit_vect, fingerprint)

    # Same order as the returned list: logp, mwt, rtb, hbd, hba, tpsa.
    descriptor_fns = (
        Descriptors.MolLogP,
        Descriptors.MolWt,
        Descriptors.NumRotatableBonds,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
        Descriptors.TPSA,
    )
    values = [compute(rdmol) for compute in descriptor_fns]
    values.append(fingerprint)
    return values
def rdk_molstring(molecule, fptype):
    """
    Build the RDKit fingerprint of a molecule as an integer array

    :param molecule: molecule object
    :param fptype: type, radius and size of fingerprint; only ``'Size'`` is
        read here
    :type fptype: dict
    :return: integer numpy array with the RDKit fingerprint
    """
    rdk_fp = RDKFingerprint(molecule, fpSize=fptype['Size'])
    molstring = np.zeros((1, ), dtype=int)
    DataStructs.ConvertToNumpyArray(rdk_fp, molstring)
    return molstring
def get_smiles2mol(smiles):
    """Parse SMILES, compute 2D coordinates and Morgan fingerprints.

    :param smiles: SMILES strings
    :return: ``(X, smiles, smiles2mol)`` -- the list of radius-2 Morgan
        fingerprint arrays, the input itself, and a SMILES -> molecule dict
    """
    def _fingerprint(mol):
        bits = np.zeros((0, ))
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, 2), bits)
        return bits

    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    for mol in mols:
        AllChem.Compute2DCoords(mol)
    smiles2mol = {smi: mol for smi, mol in zip(smiles, mols)}

    X = [_fingerprint(mol) for mol in mols]
    print('{} mols loaded'.format(len(X)))
    return X, smiles, smiles2mol
def GetMorganFPs(mol, nBits=2048, radius=2, return_bitInfo=False):
    """
    Morgan bit fingerprint of *mol* as a boolean numpy array.

    ECFP4: radius=2

    :param mol: molecule object
    :param nBits: fingerprint length
    :param radius: Morgan radius
    :param return_bitInfo: when true, also return the bitInfo dict filled
        in by the fingerprint call
    :return: ``arr`` or ``(arr, bitInfo)``
    """
    bitInfo = {}
    fp = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=radius, bitInfo=bitInfo, nBits=nBits)
    # Builtin bool instead of the np.bool alias, which is not available in
    # every NumPy release.
    arr = np.zeros((0,), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        return arr, bitInfo
    return arr
def avalon_molstring(molecule, fptype):
    """
    Build the Avalon count fingerprint of a molecule as an integer array

    :param molecule: molecule object
    :param fptype: type, radius and size of fingerprint; only ``'Size'`` is
        read here
    :type fptype: dict
    :return: integer numpy array with the Avalon count fingerprint
    """
    avalon_fp = GetAvalonCountFP(molecule, nBits=fptype['Size'])
    molstring = np.zeros((1, ), dtype=int)
    DataStructs.ConvertToNumpyArray(avalon_fp, molstring)
    return molstring
def pharma_molstring(molecule, fptype):
    """
    Build the 2D pharmacophore fingerprint of a molecule as an integer array

    The fingerprint is generated with the Gobbi_Pharm2D factory and
    converted to an explicit bit vector before being expanded.

    :param molecule: molecule object
    :param fptype: type, radius and size of fingerprint (not read here)
    :type fptype: dict
    :return: integer numpy array with the pharmacophore fingerprint
    """
    pharm_fp = Generate.Gen2DFingerprint(molecule, Gobbi_Pharm2D.factory)
    explicit_fp = ConvertToExplicit(pharm_fp)
    molstring = np.zeros((1, ), dtype=int)
    DataStructs.ConvertToNumpyArray(explicit_fp, molstring)
    return molstring
def mapperfunc(mol):
    """Draw a similarity map for *mol* and save it under ``res/``.

    The map is produced with the probability function of the enclosing
    ``cls`` model. The model's prediction on the molecule's radius-2 Morgan
    fingerprint selects the file prefix: ``act_`` when the first predicted
    value equals 1, ``nonact_`` otherwise. The fingerprint object, its
    array form and the SMILES are printed along the way.
    """
    def _model_proba(candidate_fp):
        return getProba(candidate_fp, cls.predict_proba)

    fig, _weight = SimilarityMaps.GetSimilarityMapForModel(
        mol,
        SimilarityMaps.GetMorganFingerprint,
        _model_proba,
        colorMap=cm.bwr,
    )

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    print(fp)
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, features)
    print(features)

    prediction = cls.predict(features)
    smi = Chem.MolToSmiles(mol)
    print(smi)

    # The SMILES string is embedded verbatim in the output file name.
    prefix = "res/act_" if prediction[0] == 1 else "res/nonact_"
    fig.savefig(prefix + smi + "_.png", bbox_inches="tight")