def similarity(individual):
    """Return the Avalon/Tanimoto dissimilarity of a decoded individual.

    The values of ``individual`` are scattered, in order, into a 256-element
    vector at the positions contained in the module-level ``non_zero_index``
    (every other position stays 0.0). The vector is decoded to a SMILES
    string with ``latent_to_smiles`` and the resulting molecule is compared
    with a hard-coded reference SMILES using 512-bit Avalon fingerprints.

    Parameters
    ----------
    individual : iterable
        Values to place at the ``non_zero_index`` positions.

    Returns
    -------
    tuple
        One-element tuple ``(1 - TanimotoSimilarity,)``. The sentinel
        ``(9999,)`` is returned when the decoded SMILES is empty, has a
        single character, cannot be parsed by RDKit, or when fingerprinting
        raises.
    """
    latent_size = 256
    individual_latent_vector = list(individual)

    # Scatter the individual's values into the full-size vector.
    final_vector = [0.0] * latent_size
    counter = 0
    for i in range(latent_size):
        if i in non_zero_index:
            final_vector[i] = individual_latent_vector[counter]
            counter += 1
    final_vector = np.reshape(final_vector, (1, latent_size))

    smiles = latent_to_smiles(charset, smiles_len, char_to_int, int_to_char,
                              latent_to_states_model, sample_model,
                              final_vector, type='2_layers')
    molecule = Chem.MolFromSmiles(smiles)

    # Compare by value: `is not ''` tested object identity, which only works
    # by accident of string interning.
    if not molecule or smiles == '' or len(smiles) == 1:
        return 9999,

    try:
        mol_fp = GetAvalonFP(molecule, 512)
        ref = GetAvalonFP(
            Chem.MolFromSmiles('CC(C)Cc1ccc(cc1)[C@@H](C)C(=O)O'), 512)
        dissimilarity_to_ref = 1 - TanimotoSimilarity(mol_fp, ref)
        print(Chem.MolToSmiles(molecule))
        print(dissimilarity_to_ref)
        return dissimilarity_to_ref,
    except Exception:
        # Still best-effort, but no longer traps KeyboardInterrupt/SystemExit.
        return 9999,
def make_fingerprints(data, length=512, verbose=False):
    """Build a fixed set of ``fingerprint`` objects and apply each to ``data``.

    Parameters
    ----------
    data
        Passed unchanged to ``apply_fp`` of every fingerprint object.
    length : int, optional
        Bit/feature size forwarded to the Morgan, Avalon and RDKit generators.
    verbose : bool, optional
        When true, print the name of each fingerprint before applying it.

    Returns
    -------
    list
        The ``fingerprint`` objects, in definition order, after ``apply_fp``.
    """
    specs = [
        (Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect,
         "Torsion "),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
         "Morgan"),
        (FingerprintMol, "Estate (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length), "Avalon bit based (2006)"),
        (lambda x: np.append(GetAvalonFP(x, nBits=length),
                             Descriptors.MolWt(x)),
         "Avalon+mol. weight"),
        (lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length), "RDKit fingerprint"),
        (lambda x: MACCSkeys.GenMACCSKeys(x), "MACCS fingerprint"),
        (lambda x: get_fingerprint(x, fp_type='pubchem'), "PubChem"),
        # (lambda x: get_fingerprint(x, fp_type='FP4'), "FP4")
        (lambda x: Generate.Gen2DFingerprint(
            x, Gobbi_Pharm2D.factory, dMat=Chem.Get3DDistanceMatrix(x)),
         "3D pharmacophore"),
    ]
    fp_list = [fingerprint(func, label) for func, label in specs]

    for entry in fp_list:
        if verbose:
            print("doing", entry.name)
        entry.apply_fp(data)

    return fp_list
def get_fingerprints(smiles_df, r=2, length=512, type_='morgan'):
    """Compute one fingerprint per molecule in ``smiles_df['mol']``.

    Parameters
    ----------
    smiles_df
        Object indexable by ``'mol'`` (molecules to fingerprint) and
        ``'drug'`` (whose ``.values`` are used as drug names).
    r : int, optional
        Radius for the ``'morgan'`` and ``'fcpf'`` fingerprints.
    length : int, optional
        Fingerprint size (``nBits`` / ``fpSize``).
    type_ : str, optional
        One of ``'morgan'``, ``'fcpf'`` (Morgan with ``useFeatures=True``),
        ``'atom pair'``, ``'avalon'``, ``'torsion'`` or ``'rdkit'``.

    Returns
    -------
    The result of ``fp_to_pandas(fp=..., drug_names=...)``.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported values.
    """
    # Lambdas keep the RDKit names lazily resolved, so validating ``type_``
    # never touches a generator that is not going to be used.
    builders = {
        'morgan': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, nBits=length),
        'fcpf': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, useFeatures=True, nBits=length),
        'atom pair': lambda m: GetHashedAtomPairFingerprintAsBitVect(
            m, nBits=length),
        'avalon': lambda m: GetAvalonFP(m, nBits=length),
        'torsion': lambda m: GetHashedTopologicalTorsionFingerprintAsBitVect(
            m, nBits=length),
        'rdkit': lambda m: RDKFingerprint(m, fpSize=length),
    }
    try:
        build = builders[type_]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which the old if/elif chain
        # also rejected with ValueError.
        raise ValueError("Possible values: morgan, fcpf, atom pair, avalon, "
                         "torsion and rdkit") from None

    fp = [build(m) for m in smiles_df['mol']]
    drug_names = smiles_df['drug'].values
    return fp_to_pandas(fp=fp, drug_names=drug_names)
def make_fingerprints(mols, length=1024, verbose=False):
    """Create the standard ``fingerprint`` objects and apply them to ``mols``.

    Parameters
    ----------
    mols
        Passed unchanged to ``apply_fp`` of every fingerprint object.
    length : int, optional
        Size forwarded as ``nBits`` / ``fpSize`` to the generators that
        accept it.
    verbose : bool, optional
        When true, print each fingerprint's name before it is applied.

    Returns
    -------
    list
        The ``fingerprint`` objects, in definition order, after ``apply_fp``.
    """
    specs = [
        # (lambda x: GetBPFingerprint(x, fpfn=GetHashedAtomPairFingerprintAsBitVect),
        #  "Physiochemical properties (1996)"),  ##NOTE: takes a long time to compute
        (lambda x: GetHashedAtomPairFingerprintAsBitVect(x, nBits=length),
         "Atom pair (1985)"),
        (lambda x: GetHashedTopologicalTorsionFingerprintAsBitVect(
            x, nBits=length),
         "Topological Torsion (1987)"),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
         "ECFPs/Morgan Circular (2010) "),
        (fp_Estate_ints, "E-state (fixed length) (1995)"),
        # (fp_Estate_and_mw, "E-state + MW weight (1995)"),
        # (FingerprintMol, "E-state, index sum (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length), "Avalon (2006)"),
        # (lambda x: np.append(GetAvalonFP(x, nBits=length), Descriptors.MolWt(x)),
        #  "Avalon+mol. weight"),
        (lambda x: GetErGFingerprint(x), "ErG (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length),
         "RDKit topological (2006)"),
    ]
    fp_list = [fingerprint(func, label) for func, label in specs]

    for entry in fp_list:
        if verbose:
            print("doing", entry.name)
        entry.apply_fp(mols)

    return fp_list
def make_fingerprints(length, verbose, mols, chosen=None):
    """Build the single fingerprint selected by ``chosen`` and apply it.

    Parameters
    ----------
    length : int
        Size forwarded as ``nBits`` / ``fpSize`` to generators that take it.
    verbose : bool
        When true, print the fingerprint's name before applying it.
    mols
        Passed unchanged to ``apply_fp``.
    chosen : int, optional
        Selector: 1 atom pair, 2 topological torsion, 3 Morgan (radius 3),
        4 E-state, 5 Avalon, 6 Avalon + molecular weight, 7 ErG,
        8 RDKit, 9 ``FingerprintMols.FingerprintMol``. Any other value
        (including the default ``None``) selects MACCS keys.

    Returns
    -------
    list
        A one-element list holding the applied ``fingerprint`` object.
    """
    # Only the selected branch is evaluated, so generators that are not
    # chosen are never looked up.
    if chosen == 1:
        # The label used to start with a mangled HTML entity ("&qfuot;").
        func, label = (
            lambda x: GetHashedAtomPairFingerprintAsBitVect(x, nBits=length),
            "Atom pair (1985)")
    elif chosen == 2:
        func, label = (
            lambda x: GetHashedTopologicalTorsionFingerprintAsBitVect(
                x, nBits=length),
            "Topological torsion (1987)")
    elif chosen == 3:
        func, label = (
            lambda x: GetMorganFingerprintAsBitVect(x, 3, nBits=length),
            "Morgan circular ")
    elif chosen == 4:
        func, label = FingerprintMol, "Estate (1995)"
    elif chosen == 5:
        func, label = (
            lambda x: GetAvalonFP(x, nBits=length),
            "Avalon bit based (2006)")
    elif chosen == 6:
        func, label = (
            lambda x: np.append(GetAvalonFP(x, nBits=length),
                                Descriptors.MolWt(x)),
            "Avalon+mol. weight")
    elif chosen == 7:
        func, label = (
            lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)")
    elif chosen == 8:
        func, label = (
            lambda x: RDKFingerprint(x, fpSize=length), "RDKit fingerprint")
    elif chosen == 9:
        func, label = (
            lambda x: FingerprintMols.FingerprintMol(x), "RDKit fingerprint2")
    else:
        func, label = (
            lambda x: MACCSkeys.GenMACCSKeys(x), "RDKit MACCSkeys")

    fp_list = [fingerprint(func, label)]
    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(mols)
    return fp_list
def predict(self, mol, selected_descriptors):
    """Run the models needed for the requested descriptors.

    Parameters
    ----------
    mol
        Molecule passed to the RDKit descriptor functions and the models.
    selected_descriptors : iterable of str
        Any of ``'logP'``, ``'sol'``, ``'mp'``, ``'pka'``, ``'mol_wt'``.
        Unrecognised entries are ignored.

    Returns
    -------
    dict
        Maps descriptor name to the computed value. Because the models are
        chained, requesting ``'sol'`` also returns ``'logP'``, and requesting
        ``'mp'`` also returns ``'logP'`` and ``'sol'``.
    """
    requested = list(selected_descriptors)

    # Dependency chain: mp needs sol, sol needs logP.
    need_mp = 'mp' in requested
    need_sol = need_mp or 'sol' in requested
    need_logp = need_sol or 'logP' in requested
    need_pka = 'pka' in requested
    need_wt = 'mol_wt' in requested

    results = {}
    fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)

    if need_logp:
        logP = self.logP_model.run(fp)
        results['logP'] = logP

    if need_sol:
        logP_sol = self.logP_solubility_model.run(logP)
        atom_pair_sol = self.atom_pair_sol_model.run(fp)
        combined_sol = self.combined_model.run(
            mol, logP, logP_sol, atom_pair_sol)
        results['sol'] = logs_to_mg_ml(combined_sol, mol)

    if need_mp:
        results['mp'] = self.melting_point_model.run(combined_sol, logP)

    if need_pka:
        avalon = GetAvalonFP(mol)
        maacs = MACCSkeys.GenMACCSKeys(mol)
        # The pKa model takes the three fingerprints joined with `+`.
        results['pka'] = self.pKa_model.run(avalon + maacs + fp)

    if need_wt:
        results['mol_wt'] = rdMolDescriptors.CalcExactMolWt(mol)

    return results
def avalon_fp(mol):
    """Return the Avalon fingerprint of ``mol`` as a numpy array.

    The fingerprint is produced by
    `rdkit.Avalon.pyAvalonTools.GetAvalonFP` with its default settings and
    converted with ``_rdkit_fp_to_np_arr``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        `rdkit` mol object.

    Returns
    -------
    fp_arr : np.ndarray, shape(512,)
        Fingerprint expressed as a numpy row vector.
    """
    return _rdkit_fp_to_np_arr(GetAvalonFP(mol))
def avalon(m: list) -> list:
    """Return the 1024-bit Avalon fingerprint of every molecule in ``m``."""
    fingerprints = []
    for molecule in m:
        fingerprints.append(GetAvalonFP(molecule, nBits=1024))
    return fingerprints
def avalon(mol, **kwargs):
    """Return the on-bit indices of the Avalon fingerprint of ``mol``.

    Extra keyword arguments are forwarded to ``GetAvalonFP``.
    """
    fingerprint = GetAvalonFP(mol, **kwargs)
    on_bits = fingerprint.GetOnBits()
    return list(on_bits)
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded as ``fpSize``
            for kind 'rdkit' and as ``nBits`` otherwise; the 'estate', 'erg'
            and 'maccs' entries do not use it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint
    """
    # Each entry takes (mol, params); params holds the size keyword built in
    # __init__ and is ignored by the fixed-size fingerprints.
    MAPPING = OrderedDict(
        # global_properties=lambda x, params: augmented_mol_properties(x),
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
            GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING.keys())):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up on ``self`` so a subclass overriding MAPPING is honoured;
        # the second check covers a subclass that drops a key.
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        self._params = {}
        self._params.update({
            ('fpSize' if kind == 'rdkit' else 'nBits'): length
        })

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: np.ndarray or list
                The computed fingerprint: the result of
                ``explicit_bit_vect_to_array`` for ``ExplicitBitVect``
                fingerprints, a plain list otherwise.
        """
        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (see below)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the
                requested length of features and N is the number of input
                molecules that have been successfully featurized.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`
        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop molecules that could not be converted. An explicit identity
        # test replaces ``filter(None.__ne__, ...)``, which relied on the
        # truthiness of ``NotImplemented``.
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)
        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable.
                Expect a tensor if you provide a torch dtype, a numpy array
                if you provide a numpy dtype. Any other dtype raises
                ``TypeError``.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters for transform (see below)

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the number
                of input molecules that have been successfully featurized.
            ids: array
                all valid molecule positions that did not failed during featurization

        Raises
        ------
            TypeError
                If ``dtype`` is neither a numpy nor a torch dtype.

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`
        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
""" Predict the pKa of an acid from SMILES string Returns both the fingerprint model prediction and similarity model predictions """ from chemical_models import AcidSimilarity, AcidpKa from rdkit import Chem from rdkit.Chem import rdMolDescriptors, MACCSkeys from rdkit.Avalon.pyAvalonTools import GetAvalonFP import sys # Load models sim_model = AcidSimilarity('acid_sim') fp_model = AcidpKa('pKa_acid') # Set of acids required for similarity model acid_data = open('data/pKa/formatted_acidic.txt', 'r') acids = [] mol = Chem.MolFromSmiles(sys.argv[1]) # Read acids from file for line in acid_data.readlines(): split = line.split(' ') acids.append([split[0], float(split[1][:-1]), rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(Chem.MolFromSmiles(split[0]))]) # Run the models and print results print("Similarity based model: " + str(sim_model.run(sys.argv[1], acids))) print("Fingerprint based model: " + str(fp_model.run(GetAvalonFP(mol) + MACCSkeys.GenMACCSKeys(mol) + rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol))))
class FingerprintsTransformer(MoleculeTransformer):
    """Molecule transformer into molecular fingerprint

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
            'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'},
            optional, default='morgan_circular'
        Name of the fingerprinting technique used
    length : int, optional, default=2000
        Length of the fingerprint to use. Forwarded as ``fpSize`` for kind
        'rdkit' and as ``nBits`` otherwise; the 'estate', 'erg' and 'maccs'
        entries do not use it.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        function to call to compute the fingerprint
    """
    # Every entry is called as fpfun(mol, params).
    mapping = OrderedDict(
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
            GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        morgan_circular=lambda x, params: GetMorganFingerprintAsBitVect(
            x, 2, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='morgan_circular', length=2000):
        super(FingerprintsTransformer, self).__init__()

        kind_is_known = (isinstance(kind, str) and
                         kind in FingerprintsTransformer.mapping)
        if not kind_is_known:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.mapping.keys()))

        self.kind = kind
        self.length = length

        self.fpfun = self.mapping.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))

        # RDKit's topological fingerprint names its size argument differently.
        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Transform a molecule into a fingerprint vector

        Parameters
        ----------
        mol: rdkit.Chem.Mol or None
            The molecule of interest. ``None`` triggers a warning and yields
            a zero vector of size ``self.length``.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is None:
            warnings.warn("None value received for argument mol")
            return np.zeros(self.length)

        raw = self.fpfun(mol, self._params)
        if isinstance(raw, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw)
        return np.array(list(raw))

    def transform(self, mols):
        """Transform a batch of molecule into a fingerprint vectors

        Parameters
        ----------
        mols: (str or rdkit.Chem.Mol) list
            The list of smiles or molecule

        Returns
        -------
        fp : 2d np.ndarray
            The computed fingerprint vectors
        """
        batch = super(FingerprintsTransformer, self).transform(
            mols, as_numpy=True)
        return np.array(batch)
def avalon(n_bits=2048):
    """Return a one-argument callable computing an ``n_bits`` Avalon fingerprint."""
    def compute(x):
        return GetAvalonFP(x, nBits=n_bits)

    return compute
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
from rdkit.Chem import AllChem

y_name = 'boiling_point'
fingerprint_type = 0  # 0: MACCS key, 1: RDKit, 2: Morgan (≒ECFP4), 3: Avalon

# Fail early on an unsupported selector; otherwise no fingerprint would be
# collected and the table built below could not line up with the SMILES index.
if fingerprint_type not in (0, 1, 2, 3):
    raise ValueError(
        'fingerprint_type must be 0, 1, 2 or 3, got {!r}'.format(fingerprint_type))

sdf = Chem.SDMolSupplier('boiling_point.sdf')  # load the SDF file

# Compute the fingerprints.
# For each molecule, append the property value to the list y, the computed
# fingerprint to fingerprints, and the SMILES string to smiles.
fingerprints, y, smiles = [], [], []
n_molecules = len(sdf)
print('分子の数 :', n_molecules)
for index, molecule in enumerate(sdf):
    print(index + 1, '/', n_molecules)
    if molecule is None:
        # SDMolSupplier yields None for records RDKit cannot parse; skip them
        # so y, smiles and fingerprints stay the same length.
        print('skipping unreadable record', index + 1)
        continue
    y.append(float(molecule.GetProp(y_name)))
    smiles.append(Chem.MolToSmiles(molecule))
    if fingerprint_type == 0:
        fingerprints.append(AllChem.GetMACCSKeysFingerprint(molecule))
    elif fingerprint_type == 1:
        fingerprints.append(Chem.RDKFingerprint(molecule))
    elif fingerprint_type == 2:
        fingerprints.append(AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048))
    elif fingerprint_type == 3:
        fingerprints.append(GetAvalonFP(molecule))
fingerprints = pd.DataFrame(np.array(fingerprints, int), index=smiles)
y = pd.DataFrame(y, index=smiles, columns=[y_name])

# Save
fingerprints_with_y = pd.concat([y, fingerprints], axis=1)  # join y with the descriptors
# Write to a csv file. Note that an existing file with the same name is overwritten.
fingerprints_with_y.to_csv('fingerprints_with_y.csv')
# NOTE(review): the next line is the tail of a statement whose beginning is
# not visible in this part of the file; it is kept verbatim.
rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)])

# Models whose predictions are used as features below.
acid_model = AcidpKa('pKa_acid')
sim_model = AcidSimilarity('acid_sim')

# X collects one feature row per input line, Y the matching target value.
X = []
Y = []

# For x combine predictions and descriptors, for y append actual pKa
for line in data.readlines():
    # Fields are separated by a single space: SMILES first, pKa second.
    split = line.split(' ')
    mol = Chem.MolFromSmiles(split[0])
    fingerprint = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)
    # Fingerprint-model prediction from the joined Avalon, MACCS and
    # atom-pair fingerprints.
    pKa = acid_model.run(GetAvalonFP(mol) + MACCSkeys.GenMACCSKeys(mol) + fingerprint)
    # Similarity-model prediction against `acids` (built before this point).
    sim_pKa = sim_model.run(split[0], acids)
    X.append([pKa, sim_pKa, Lipinski.NumHDonors(mol), Lipinski.NumHAcceptors(mol), Lipinski.NHOHCount(mol)])
    # [:-1] drops the last character of the second field (presumably the
    # trailing newline - confirm the data file always ends with one).
    Y.append(float(split[1][:-1]))

# Fit the scaler on the feature rows and replace X with the scaled array.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(np.asarray(X))
Y = np.asarray(Y)
# Predict the pKa of any molecule given as a SMILES string on the command line
from chemical_models import GeneralPKa
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, MACCSkeys
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
import sys

# Load the model
model = GeneralPKa('pKa')

# Parse the query molecule from the first command-line argument
mol = Chem.MolFromSmiles(sys.argv[1])

# Model input: Avalon, MACCS and atom-pair fingerprints joined with `+`
combined_fingerprint = (
    GetAvalonFP(mol)
    + MACCSkeys.GenMACCSKeys(mol)
    + rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)
)

# Run the model and print the result
predicted_pka = model.run(combined_fingerprint)
print("Predicted pKa: " + str(predicted_pka))
def convert_to_avalon(SMILES):
    """Return the Avalon fingerprint of ``SMILES`` as a list of float bits.

    The SMILES string is turned into a molecule with ``MS``, fingerprinted
    with ``GetAvalonFP`` (default settings), and each character of the
    fingerprint's bit string is converted to a float.
    """
    bit_string = GetAvalonFP(MS(SMILES)).ToBitString()
    bits_as_float = np.asarray(list(bit_string), dtype=float)
    return list(bits_as_float)
def default_FP_Func(smiles):
    """Return the Avalon fingerprint of ``smiles`` as a list of ints.

    The molecule is built with ``mol_from_smiles`` and fingerprinted with
    ``GetAvalonFP`` using its default settings.
    """
    molecule = mol_from_smiles(smiles)
    return list(map(int, GetAvalonFP(molecule)))
def main():
    """Command-line entry point: write fingerprints for a CSV of SMILES.

    Reads the ``smiles`` column of the input CSV and, for each requested
    fingerprint type, writes one row per unique SMILES (the SMILES followed
    by one column per fingerprint bit) to the output CSV. A SMILES that
    cannot be converted is written as ``<smiles>,NA`` and reported on stdout.

    NOTE: every fingerprint type is written to the same output file, opened
    in ``'w'`` mode, so with several types only the last one is kept.
    """
    parser = argparse.ArgumentParser(
        description='Generate chemical fingerprints from smiles strings')
    parser.add_argument('-S', '--smiles', action='store', nargs=1,
                        dest='smiles', required=True,
                        help='List of smiles strings to convert to chemical '
                             'fingerprint - should be in a column named '
                             '"smiles" (.csv format)')
    parser.add_argument('-f', '--fingerprint', action='store', nargs='*',
                        dest='fingerprints', required=True,
                        help='Desired fingerprint type '
                             '(avalon, ecfp, fcfp, or maccs)')
    parser.add_argument('-n', '--name', action='store', nargs=1, dest='name',
                        required=True,
                        help='Name of fingerprint csv file to write')
    parser.add_argument('-i', '--input_directory', action='store', nargs=1,
                        dest='input', default=['./'],
                        help='Directory where input files are stored')
    parser.add_argument('-o', '--output_directory', action='store', nargs=1,
                        dest='output', default=['./'],
                        help='Directory where output files should be written')
    args = vars(parser.parse_args())

    # One builder per supported type; lambdas keep the RDKit lookups lazy.
    builders = {
        'avalon': lambda mol: GetAvalonFP(mol, nBits=2048),
        'ecfp': lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2),
        'fcfp': lambda mol: GetMorganFingerprintAsBitVect(
            mol, radius=2, useFeatures=True),
        'maccs': lambda mol: MACCSkeys.GenMACCSKeys(mol),
    }

    # Reject unknown types before writing anything. Previously an unknown
    # type either reused the fingerprint left over from the preceding type or
    # produced "NA" rows through a swallowed NameError.
    unknown = [fptype for fptype in args['fingerprints']
               if fptype not in builders]
    if unknown:
        parser.error('Unknown fingerprint type(s): ' + ', '.join(unknown))

    # Input and output paths do not depend on the fingerprint type.
    data = pd.read_csv(args['input'][0] + args['smiles'][0],
                       usecols=['smiles'])
    ofile = args['output'][0] + args['name'][0]

    for fptype in args['fingerprints']:
        build = builders[fptype]
        time_start = time.time()
        with open(ofile, 'w') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', lineterminator='\n')
            for smiles in data.smiles.unique():
                try:
                    fp = build(MolFromSmiles(smiles))
                    row = [smiles] + list(fp.ToBitString())
                except Exception:
                    # Best effort per molecule: record the failure, carry on.
                    writer.writerow((smiles, "NA"))
                    print('Issue with conversion to ' + fptype +
                          ' fingerprint: ' + str(smiles))
                else:
                    writer.writerow(row)
        print('Done writing ' + fptype + ' fingerprints! Time elapsed: '
              '{} seconds'.format(time.time() - time_start))
def default_FP_Func(m):
    """Return the Avalon fingerprint of molecule ``m`` with default settings."""
    fingerprint = GetAvalonFP(m)
    return fingerprint