def _smilarity_between_two_mols(mol1, mol2):
    """Return the Tanimoto similarity of two molecules' Morgan fingerprints.

    Each molecule is fingerprinted with radius 4, folded into 512 bits.

    :param mol1: first RDKit molecule
    :param mol2: second RDKit molecule
    :return: result of ``DataStructs.TanimotoSimilarity`` on the two bit vectors
    """
    # Callers holding SMILES strings should parse them first, e.g.
    # mol1, mol2 = Chem.MolFromSmiles(smi1), Chem.MolFromSmiles(smi2)
    fingerprints = [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 4, nBits=512)
        for mol in (mol1, mol2)
    ]
    return DataStructs.TanimotoSimilarity(*fingerprints)
def calculate_fp(smi: str, fp_type: str):
    """Calculate the fingerprint selected by ``fp_type`` for a SMILES string.

    Supported ``fp_type`` values:

    * circular: ``"ECFP4"``, ``"ECFP6"``
    * structural: ``"Avalon"``, ``"MACCSkeys"``
    * path-based: ``"hashAP"``, ``"hashTT"``, ``"RDK5"``, ``"RDK6"``, ``"RDK7"``

    Args:
        smi: SMILES string of the molecule.
        fp_type: name of the fingerprint to compute (see list above).

    Returns:
        The fingerprint as a numpy array reshaped to a single row, or
        ``None`` when the SMILES string cannot be parsed.

    Raises:
        ValueError: if ``fp_type`` is not one of the supported names.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None

    # Lambdas keep every RDKit lookup lazy: only the selected generator runs.
    generators = {
        # Circular fingerprints
        "ECFP4": lambda m: rdMolDescriptors.GetMorganFingerprintAsBitVect(
            m, radius=2, nBits=1024),
        "ECFP6": lambda m: rdMolDescriptors.GetMorganFingerprintAsBitVect(
            m, radius=3, nBits=1024),
        # Structural fingerprints
        "Avalon": lambda m: pyAvalonTools.GetAvalonFP(m, nBits=1024),
        "MACCSkeys": lambda m: rdkit.Chem.rdMolDescriptors.GetMACCSKeysFingerprint(m),
        # Path-based fingerprints
        "hashAP": lambda m: rdkit.Chem.rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
            m, nBits=1024),
        "hashTT": lambda m: rdkit.Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
            m, nBits=1024),
        "RDK5": lambda m: rdkit.Chem.rdmolops.RDKFingerprint(
            m, maxPath=5, fpSize=1024, nBitsPerHash=2),
        "RDK6": lambda m: rdkit.Chem.rdmolops.RDKFingerprint(
            m, maxPath=6, fpSize=1024, nBitsPerHash=2),
        "RDK7": lambda m: rdkit.Chem.rdmolops.RDKFingerprint(
            m, maxPath=7, fpSize=1024, nBitsPerHash=2),
    }
    generate = generators.get(fp_type)
    if generate is None:
        raise ValueError(
            "unsupported fingerprint type %r; expected one of %s"
            % (fp_type, ", ".join(generators)))

    return np.asarray(generate(mol)).reshape(1, -1)
def __init__(self, fp_type, fp_bits=2048):
    """
    Register the available fingerprint generators and select one of them.

    Exits the process (after printing a message) when ``fp_type`` is not a
    registered name.

    :param fp_type: fingerprint type ('morgan2', 'morgan3', 'ap' or 'rdk5')
    :param fp_bits: number of fingerprint bits
    """
    self.fp_type = fp_type
    # Every entry is a two-item list: [generator taking a molecule, bit count].
    self.fp_dict = {
        'morgan2': [
            lambda m: rdmd.GetMorganFingerprintAsBitVect(m, 2, nBits=fp_bits),
            fp_bits,
        ],
        'morgan3': [
            lambda m: rdmd.GetMorganFingerprintAsBitVect(m, 3, nBits=fp_bits),
            fp_bits,
        ],
        'ap': [
            lambda m: rdmd.GetHashedAtomPairFingerprintAsBitVect(
                m, nBits=fp_bits),
            fp_bits,
        ],
        'rdk5': [
            lambda m: Chem.RDKFingerprint(
                m, maxPath=5, fpSize=fp_bits, nBitsPerHash=2),
            fp_bits,
        ],
    }

    selected = self.fp_dict.get(fp_type)
    if not selected:
        print("invalid fingerprint type: %s" % fp_type)
        sys.exit(0)
    self.fp_function = selected
def mol_train_test(dataset, labels, test_size=0.1, random_state=2019, nbits=1024):
    """Split a SMILES dataset into train/test sets of Morgan fingerprints.

    Rows whose SMILES string cannot be parsed are removed from both
    ``dataset`` and ``labels`` before splitting, and the index of each is
    reset.

    NOTE(review): rows are dropped with ``DataFrame.drop`` using the
    positions of the unparsable SMILES, so this presumably expects
    ``dataset`` and ``labels`` to carry a default 0..n-1 index -- confirm
    against callers.

    Parameters
    ----------
    dataset : object with a ``'SMILES'`` column supporting ``drop``
    labels : labels aligned with ``dataset``, supporting ``drop``
    test_size : forwarded to ``train_test_split``
    random_state : forwarded to ``train_test_split``
    nbits : number of bits of each Morgan (radius 2) fingerprint

    Returns
    -------
    tuple
        ``(dataset, labels, all_mols, y_train, y_test, train_fps_array,
        test_np_fps_array)`` where the two fingerprint entries are lists of
        numpy arrays, one per molecule.
    """

    def _fingerprint_arrays(mols):
        # One numpy array per molecule; ConvertToNumpyArray fills ``arr``.
        arrays = []
        for mol in mols:
            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, radius=2, nBits=nbits)
            arr = np.zeros((1, ), dtype=int)
            DataStructs.ConvertToNumpyArray(fp, arr)
            arrays.append(arr)
        return arrays

    # Parse every SMILES once; unparsable strings come back as None.
    all_mols = [
        Chem.MolFromSmiles(smiles_string)
        for smiles_string in dataset['SMILES']
    ]
    drop_index = [i for i, mol in enumerate(all_mols) if mol is None]

    # Drop the invalid rows from the molecules, the labels and the dataset.
    # Filtering the parsed list avoids parsing every SMILES a second time.
    if drop_index:
        labels = labels.drop(drop_index).reset_index(drop=True)
        dataset = dataset.drop(drop_index).reset_index(drop=True)
        all_mols = [mol for mol in all_mols if mol is not None]

    train_mols, test_mols, y_train, y_test = train_test_split(
        all_mols, labels, test_size=test_size, random_state=random_state)

    train_fps_array = _fingerprint_arrays(train_mols)
    test_np_fps_array = _fingerprint_arrays(test_mols)

    return (dataset, labels, all_mols, y_train, y_test, train_fps_array,
            test_np_fps_array)
def maxmin_picker(dataset: list,
                  input_format='smiles',
                  n=3,
                  seed=123,
                  radius=2,
                  nBits=1024):
    """Select a diverse subset of molecules with RDKit's MaxMin picker.

    http://rdkit.blogspot.com/2014/08/optimizing-diversity-picking-in-rdkit.html

    Args:
        dataset: SMILES strings (``input_format='smiles'``) or RDKit
            molecules (``input_format='mol'``). Unparsable SMILES are
            skipped.
        input_format: ``'smiles'`` or ``'mol'``.
        n: number of molecules to pick.
        seed: random seed handed to the picker so that picks are
            reproducible (it was previously accepted but ignored).
        radius: Morgan fingerprint radius.
        nBits: Morgan fingerprint size in bits.

    Returns:
        List of the picked RDKit molecules.

    Raises:
        ValueError: if ``input_format`` is not recognized. (Previously a
            bare ``raise`` produced an unrelated ``RuntimeError``.)
    """
    if input_format == 'smiles':
        # Parse each SMILES exactly once and keep only the valid molecules.
        parsed = (Chem.MolFromSmiles(smi) for smi in dataset)
        mols = [mol for mol in parsed if mol is not None]
    elif input_format == 'mol':
        mols = dataset
    else:
        raise ValueError(f'Format not recognized: {input_format!r}')

    fps = [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(
            m, radius=radius, nBits=nBits) for m in mols
    ]
    picker = SimDivFilters.MaxMinPicker()
    ids = picker.LazyBitVectorPick(fps, len(fps), n, seed=seed)
    return [mols[i] for i in ids]
def _featurize(self, mol):
    """
    Calculate circular fingerprint.

    With ``self.sparse`` unset, a folded bit vector of ``self.size`` bits is
    returned. With ``self.sparse`` set, a dict of the non-zero fingerprint
    elements is returned; when ``self.smiles`` is also set, every entry is
    replaced by ``{'smiles': ..., 'count': ...}`` describing the fragment.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.
    """
    if not self.sparse:
        return rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)

    bit_info = {}
    sparse_fp = rdMolDescriptors.GetMorganFingerprint(
        mol,
        self.radius,
        useChirality=self.chiral,
        useBondTypes=self.bonds,
        useFeatures=self.features,
        bitInfo=bit_info)
    counts = sparse_fp.GetNonzeroElements()  # plain dict: id -> count
    if not self.smiles:
        return counts

    # Attach a SMILES string to each fragment, built from the first
    # (atom, radius) environment recorded for that fragment id.
    annotated = {}
    for fragment_id, count in counts.items():
        root, radius = bit_info[fragment_id][0]
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
        frag = Chem.PathToSubmol(mol, env)
        annotated[fragment_id] = {
            'smiles': Chem.MolToSmiles(frag),
            'count': count,
        }
    return annotated
def calculate_fingerprint(smiles, radi):
    """Count Morgan environments per bit, separately for each radius.

    The molecule is parsed from ``smiles`` and given explicit hydrogens. For
    every radius ``r`` from 0 to ``radi`` a 2048-bit Morgan fingerprint is
    computed, and for each set bit the number of recorded environments whose
    radius is exactly ``r`` is stored.

    Args:
        smiles: SMILES string of the molecule.
        radi: largest Morgan radius to consider.

    Returns:
        Tuple ``(formula, binary)``:

        * ``formula`` -- shape ``(1, 2048)``, counts for radius 0;
        * ``binary`` -- shape ``(1, 2048 * radi)``, counts for radius ``r``
          stored in columns ``2048 * (r - 1)`` onwards. (The shape was
          previously hard-coded to ``(1, 4096)``, which only worked for
          ``radi == 2``.)
    """
    n_bits = 2048
    binary = np.zeros(n_bits * radi, int)
    formula = np.zeros(n_bits, int)

    mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
    mol_bi = {}
    for r in range(radi + 1):
        mol_fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol, radius=r, bitInfo=mol_bi, nBits=n_bits)
        for bit in mol_fp.GetOnBits():
            # Entries are indexed as entry[1] == radius; count only those
            # generated at the current radius.
            count = sum(1 for entry in mol_bi[bit] if entry[1] == r)
            if not count:
                continue
            if r == 0:
                formula[bit] = count
            else:
                binary[n_bits * (r - 1) + bit] = count

    return formula.reshape(1, n_bits), binary.reshape(1, n_bits * radi)
def smiles2fp(smiles, radius=2, n_bits=1024):
    """Return the Morgan bit-vector fingerprint for a SMILES string.

    If the fingerprint call raises, the exception is printed and ``-1`` is
    returned instead.
    """
    molecule = Chem.MolFromSmiles(smiles)
    try:
        fingerprint = desc.GetMorganFingerprintAsBitVect(
            molecule, radius, n_bits)
    except Exception as error:
        print(error)
        return -1
    return fingerprint
def testDrawMorgan(self):
    """DrawMorganBit and DrawMorganEnv must render the same picture."""
    bit_id = 872
    grey_fill = "style='fill:#CCCCCC;"
    yellow_fill = "style='fill:#E5E533;"
    blue_fill = "style='fill:#9999E5;"

    mol = Chem.MolFromSmiles('c1ccccc1CC1CC1')
    bit_info = {}
    rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2,
                                                   bitInfo=bit_info)
    self.assertIn(bit_id, bit_info)

    # Default colours: all three fills are present.
    svg_bit = Draw.DrawMorganBit(mol, bit_id, bit_info)
    atom_id, env_radius = bit_info[bit_id][0]
    svg_env = Draw.DrawMorganEnv(mol, atom_id, env_radius)
    self.assertEqual(svg_bit, svg_env)
    for fill in (grey_fill, yellow_fill, blue_fill):
        self.assertIn(fill, svg_bit)

    # centerColor=None: the blue fill must be absent.
    svg_bit = Draw.DrawMorganBit(mol, bit_id, bit_info, centerColor=None)
    atom_id, env_radius = bit_info[bit_id][0]
    svg_env = Draw.DrawMorganEnv(mol, atom_id, env_radius, centerColor=None)
    self.assertEqual(svg_bit, svg_env)
    self.assertIn(grey_fill, svg_bit)
    self.assertIn(yellow_fill, svg_bit)
    self.assertNotIn(blue_fill, svg_bit)

    # A bit that is missing from bit_info is reported as a KeyError.
    with self.assertRaises(KeyError):
        Draw.DrawMorganBit(mol, 32, bit_info)

    if hasattr(Draw, 'MolDraw2DCairo'):
        # Github #3796: make sure we aren't trying to generate metadata:
        png = Draw.DrawMorganBit(mol, bit_id, bit_info, useSVG=False)
        self.assertIn(b'PNG', png)
        self.assertIsNone(Chem.MolFromPNGString(png))
def GenerateMorganFeaturesFingerprints(Mols):
    """Generate MorganFeatures fingerprints.

    Radius, chirality handling and fingerprint size are read from
    OptionsInfo["FingerprintsParams"]["MorganFeatures"]. Bit vectors are
    produced when OptionsInfo["SpecifiedFingerprintsType"] is "BitVect"
    (case-insensitive); otherwise sparse fingerprints are produced.

    Returns a list with one fingerprint per molecule in Mols.
    """

    FingerprintsType = OptionsInfo["SpecifiedFingerprintsType"]
    MiscUtil.PrintInfo("\nGenerating MorganFeatures %s fingerprints..." %
                       FingerprintsType)

    # Setup fingerprints parameters...
    Params = OptionsInfo["FingerprintsParams"]["MorganFeatures"]
    Radius = Params["Radius"]
    UseChirality = Params["UseChirality"]
    FPSize = Params["FPSize"]
    UseFeatures = True

    if not re.match("^BitVect$", FingerprintsType, re.I):
        # Generate UIntSparseIntVect fingerprints...
        return [
            rdMolDescriptors.GetMorganFingerprint(Mol,
                                                  Radius,
                                                  useFeatures=UseFeatures,
                                                  useChirality=UseChirality)
            for Mol in Mols
        ]

    # Generate ExplicitBitVect fingerprints...
    MiscUtil.PrintInfo("FPSize: %s" % (FPSize))
    return [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(
            Mol,
            Radius,
            useFeatures=UseFeatures,
            useChirality=UseChirality,
            nBits=FPSize) for Mol in Mols
    ]
def GenerateMorganFeaturesFingerprints(Mols):
    """Generate MorganFeatures fingerprints.

    Radius and chirality handling are read from
    OptionsInfo["FingerprintsParams"]["MorganFeatures"]. When
    OptionsInfo["GenerateBitVectFingerints"] is set, 2048-bit vectors are
    produced; otherwise sparse fingerprints are produced.

    Returns a list with one fingerprint per molecule in Mols.
    """

    MiscUtil.PrintInfo("\nGenerating MorganFeatures fingerprints...")

    # Setup fingerprints parameters...
    Params = OptionsInfo["FingerprintsParams"]["MorganFeatures"]
    Radius = Params["Radius"]
    UseChirality = Params["UseChirality"]
    UseFeatures = True

    if not OptionsInfo["GenerateBitVectFingerints"]:
        # Generate UIntSparseIntVect fingerprints...
        return [
            rdMolDescriptors.GetMorganFingerprint(Mol,
                                                  Radius,
                                                  useFeatures=UseFeatures,
                                                  useChirality=UseChirality)
            for Mol in Mols
        ]

    # Generate ExplicitBitVect fingerprints...
    FPSize = 2048
    return [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(
            Mol,
            Radius,
            useFeatures=UseFeatures,
            useChirality=UseChirality,
            nBits=FPSize) for Mol in Mols
    ]
def dmat_sim(smiles_target, ntopick=10):
    """
    Pick mutually dissimilar compounds from a set using MaxMin picking.

    Adapted from:
    http://rdkit.blogspot.com/2014/08/optimizing-diversity-picking-in-rdkit.html

    Note that the index of ``smiles_target`` is reset in place.

    Args:
        smiles_target: DataFrame of compound-target activity pairs. The
            compounds are read as SMILES strings from the column named
            "smiles".
        ntopick: The number of dissimilar compounds to pick.

    Returns:
        A DataFrame holding the rows of ``smiles_target`` that were picked.
    """
    smiles_target.reset_index(drop=True, inplace=True)
    fingerprints = [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(MolFromSmiles(smi), 2)
        for smi in smiles_target['smiles']
    ]

    # Flattened lower triangle of the Tanimoto distance matrix: for each
    # fingerprint, its distances to all fingerprints that precede it.
    distances = [
        distance for row in range(1, len(fingerprints))
        for distance in DataStructs.BulkTanimotoSimilarity(
            fingerprints[row], fingerprints[:row], returnDistance=True)
    ]

    picker = SimDivFilters.MaxMinPicker()
    picked = picker.Pick(np.array(distances), len(fingerprints), ntopick)
    return smiles_target.iloc[list(picked)]
def _compute_fps(self) -> None:
    """Compute a numpy array of fingerprint vectors and store it in ``self.fps``.

    One fingerprint is computed for every molecule in ``self.data.mol``:
    a Morgan bit vector when ``self.fp_type == 'morgan'`` or an RDKit
    fingerprint (``minPath == maxPath == self.fp_rad``) when
    ``self.fp_type == 'rdkit'``. ``self.fps`` ends up with shape
    ``(number of molecules, self.fp_bits)``.

    Raises:
        ValueError: if ``self.fp_type`` is neither ``'morgan'`` nor
            ``'rdkit'`` and there is at least one molecule to process.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    fp_vects = []
    for mol in tqdm.tqdm(self.data.mol,
                         desc='Computing fingerprints',
                         disable=self.prog):
        if self.fp_type == 'morgan':
            fp_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, self.fp_rad, self.fp_bits)
        elif self.fp_type == 'rdkit':
            fp_vect = Chem.RDKFingerprint(
                mol,
                minPath=self.fp_rad,
                maxPath=self.fp_rad,
                fpSize=self.fp_bits,
            )
        else:
            raise ValueError(
                f"unsupported fingerprint type: {self.fp_type!r}")

        # ConvertToNumpyArray fills ``array`` with the fingerprint bits.
        array = np.zeros((0, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp_vect, array)
        fp_vects.append(array)

    # Stack the per-molecule vectors into one 2-D array, one row each.
    self.fps = np.zeros((len(fp_vects), self.fp_bits))
    for i, fp_vect in enumerate(fp_vects):
        self.fps[i, :] = fp_vect
def calculateMol(self, m, smiles, internalParsing=False):
    """Return the Morgan fingerprint of ``m`` as a list of bits.

    Feature invariants of ``m`` are used as the atom invariants. ``smiles``
    and ``internalParsing`` are accepted but not used by this method.
    """
    feature_invariants = rd.GetFeatureInvariants(m)
    bit_vector = rd.GetMorganFingerprintAsBitVect(
        m,
        radius=self.radius,
        nBits=self.nbits,
        invariants=feature_invariants)
    return list(bit_vector)
def testDrawMorgan(self):
    """DrawMorganBit and DrawMorganEnv must render the same picture."""
    from rdkit.Chem import rdMolDescriptors

    bit_id = 872
    grey_fill = "style='fill:#CCCCCC;"
    yellow_fill = "style='fill:#E5E533;"
    blue_fill = "style='fill:#9999E5;"

    mol = Chem.MolFromSmiles('c1ccccc1CC1CC1')
    bit_info = {}
    rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2,
                                                   bitInfo=bit_info)
    self.assertIn(bit_id, bit_info)

    # Default colours: all three fills are present.
    svg_bit = Draw.DrawMorganBit(mol, bit_id, bit_info)
    atom_id, env_radius = bit_info[bit_id][0]
    svg_env = Draw.DrawMorganEnv(mol, atom_id, env_radius)
    self.assertEqual(svg_bit, svg_env)
    for fill in (grey_fill, yellow_fill, blue_fill):
        self.assertIn(fill, svg_bit)

    # centerColor=None: the blue fill must be absent.
    svg_bit = Draw.DrawMorganBit(mol, bit_id, bit_info, centerColor=None)
    atom_id, env_radius = bit_info[bit_id][0]
    svg_env = Draw.DrawMorganEnv(mol, atom_id, env_radius, centerColor=None)
    self.assertEqual(svg_bit, svg_env)
    self.assertIn(grey_fill, svg_bit)
    self.assertIn(yellow_fill, svg_bit)
    self.assertNotIn(blue_fill, svg_bit)

    # A bit that is missing from bit_info is reported as a KeyError.
    with self.assertRaises(KeyError):
        Draw.DrawMorganBit(mol, 32, bit_info)
def smi2vec(smi):
    """Return the radius-4 Morgan fingerprint of a SMILES string as a list.

    The fingerprint length is taken from the module-level ``n_bit``; the
    result holds one entry per bit position.
    """
    molecule = Chem.MolFromSmiles(smi)
    fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        molecule, 4, nBits=n_bit)
    return [fingerprint[position] for position in range(n_bit)]
def calc_morgan_fp(smiles):
    """Return the Morgan fingerprint of a SMILES string as a float32 array.

    Radius and size come from the module-level ``RADIUS`` and ``FP_SIZE``.
    """
    fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smiles), RADIUS, nBits=FP_SIZE)
    # ConvertToNumpyArray fills the destination array with the bits.
    bits = np.zeros((0, ), dtype=np.float32)
    Chem.DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits
def reward_target_molecule_similarity(mol,
                                      target,
                                      radius=2,
                                      nBits=2048,
                                      useChirality=True):
    """
    Reward based on the Tanimoto similarity between the Morgan (ECFP)
    bit-vector fingerprints of a molecule and of a target molecule.

    :param mol: rdkit mol object
    :param target: rdkit mol object
    :param radius: Morgan fingerprint radius
    :param nBits: fingerprint size in bits
    :param useChirality: forwarded to the fingerprint call
    :return: float, [0.0, 1.0]
    """
    mol_fp, target_fp = (
        rdMolDescriptors.GetMorganFingerprintAsBitVect(
            molecule, radius=radius, nBits=nBits, useChirality=useChirality)
        for molecule in (mol, target))
    return DataStructs.TanimotoSimilarity(mol_fp, target_fp)
def getFingerprintFromMolecule(moles, nBits=2048):
    """Return the radius-2 Morgan fingerprints of ``moles`` as one numpy array.

    Every molecule is fingerprinted into ``nBits`` bits, each fingerprint is
    converted to a numpy array, and the per-molecule arrays are combined
    with ``np.array``.
    """
    bit_vectors = [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits)
        for mol in moles
    ]

    converted = []
    for bit_vector in bit_vectors:
        # ConvertToNumpyArray fills the destination array with the bits.
        destination = np.zeros((1, ))
        DataStructs.cDataStructs.ConvertToNumpyArray(bit_vector, destination)
        converted.append(destination)
    return np.array(converted)
def morgan_fingerprinter(mol):
    """Return the Morgan bit-vector fingerprint of ``mol`` via ``_fp_to_bytes``.

    The fingerprint settings (``radius``, ``fpSize``, ``useChirality``,
    ``useBondTypes``, ``useFeatures``) are names defined outside this
    function.
    """
    options = dict(
        nBits=fpSize,
        useChirality=useChirality,
        useBondTypes=useBondTypes,
        useFeatures=useFeatures,
    )
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius, **options)
    return _fp_to_bytes(bit_vector)
def features_ext(smile_string, radius=2, nBits=256):
    """Return the Morgan fingerprint of a SMILES string as a numpy array.

    :param smile_string: SMILES string of the molecule
    :param radius: Morgan fingerprint radius
    :param nBits: fingerprint size in bits
    :return: ``np.array`` built from the fingerprint bit vector
    """
    molecule = Chem.rdmolfiles.MolFromSmiles(smile_string)
    # The bit-info dictionary is handed over but not kept afterwards.
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        molecule, radius=radius, bitInfo={}, nBits=nBits)
    return np.array(bit_vector)
def pred_(model, string_mol, inputopt):
    """Predict functional-group labels for a single molecule.

    The molecule is read from ``string_mol``, converted to a 1024-bit
    Morgan (radius 2) fingerprint and passed to ``model.predict`` as a
    batch of one.

    Args:
        model: object exposing ``predict(array, batch_size=..., verbose=...)``
            that returns one score per label.
        string_mol: the molecule as an InChI or SMILES string.
        inputopt: ``'InChI'`` or ``'SMILES'``, naming the format of
            ``string_mol``.

    Returns:
        Tuple ``(result_display, result_confidence)``:

        * ``result_display`` -- comma-separated names of the labels whose
          rounded score is non-zero, or ``'No functional groups found'``;
        * ``result_confidence`` -- two-column array of (label, score
          formatted as a percentage), ordered by descending score.

    Raises:
        ValueError: if ``inputopt`` is neither ``'InChI'`` nor ``'SMILES'``.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    # Labels
    labels = np.array([
        'Alcohol', 'Aldehyde', 'Alicycle', 'Amide', 'Aromatic', 'Carbocycle',
        'Carboxylic acid', 'Chiral', 'Ester', 'Ether', 'Fused rings',
        'Ketone', 'Lactame', 'Metal-organic', 'Nitrogen heterocycle',
        'Oxygen heterocycle', 'Sulfide', 'Sulfur heterocycle', 'Thiol', 'Urea'
    ])

    # Read molecule input
    if inputopt == 'InChI':
        molecule = Chem.inchi.MolFromInchi(string_mol)
    elif inputopt == 'SMILES':
        molecule = Chem.MolFromSmiles(string_mol)
    else:
        raise ValueError(
            "inputopt must be 'InChI' or 'SMILES', got %r" % (inputopt, ))

    # Convert input molecule to descriptors
    morganFP = rdMolDescriptors.GetMorganFingerprintAsBitVect(molecule,
                                                              radius=2,
                                                              nBits=1024)
    morganFP_array = np.zeros((1, ), dtype=int)
    DataStructs.ConvertToNumpyArray(morganFP, morganFP_array)
    train_fps_array = [morganFP_array]

    # Classification
    prediction = model.predict(np.array(train_fps_array),
                               batch_size=1,
                               verbose=1)
    result = pd.DataFrame(prediction, columns=labels)
    result_bin = result.round(0).astype(int)
    result_labels = result_bin.astype(bool).to_numpy().tolist()
    result_confidence = result.to_numpy()

    # Display Result: boolean-mask the labels with the rounded prediction.
    result_display = labels[tuple(result_labels)]
    result_confidence_float = list(result_confidence[0])  # used for sorting

    # Transforming into percentages (strings, used for stacking)
    result_confidence = [format(n, '.2%') for n in result_confidence_float]

    # Formatting results
    result_display = ', '.join(result_display)
    if result_display == '':
        result_display = 'No functional groups found'

    result_confidence = np.column_stack((labels, result_confidence))  # Stack
    descending = np.argsort(result_confidence_float)[::-1]
    result_confidence = result_confidence[descending]  # Order descending

    return result_display, result_confidence
def fingerprint_features(smile_string, radius=2, size=2048):
    """Return the Morgan bit-vector fingerprint of a SMILES string.

    The parsed molecule is renumbered with the output of
    ``CanonicalRankAtoms`` before fingerprinting. Chirality and bond types
    are taken into account; feature invariants are not used.

    :param smile_string: SMILES string of the molecule
    :param radius: Morgan fingerprint radius
    :param size: fingerprint size in bits
    """
    parsed = MolFromSmiles(smile_string)
    renumbered = rdmolops.RenumberAtoms(
        parsed, rdmolfiles.CanonicalRankAtoms(parsed))
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        renumbered,
        radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False)
def smiles_to_fingerprint_bin(smiles, trust_smiles=False):
    """Return the radius-2 Morgan fingerprint of ``smiles`` as binary text.

    With ``trust_smiles`` set, sanitization is skipped while parsing and
    only the property cache and a fast ring search are run before
    fingerprinting. The fingerprint size is the module-level ``BITCOUNT``.

    Returns ``None`` when the SMILES string cannot be parsed.
    """
    sanitize = not trust_smiles
    molecule = Chem.MolFromSmiles(smiles, sanitize=sanitize)
    if molecule is None:
        return None

    if trust_smiles:
        # Sanitization was skipped, so run these two steps explicitly
        # before fingerprinting.
        molecule.UpdatePropertyCache()
        Chem.FastFindRings(molecule)

    fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        molecule, 2, BITCOUNT)
    return DataStructs.BitVectToBinaryText(fingerprint)
def smiles_to_ecfp_list(smi_list):
    """Convert SMILES strings to lists of 0/1 integers.

    Each SMILES string is parsed and fingerprinted as a 512-bit Morgan bit
    vector with radius 6; the bit string is then split into integers.

    Returns a list holding one list of ints per input SMILES.
    """
    from rdkit.Chem import rdMolDescriptors

    def _bits(smi):
        # One integer per character of the fingerprint's bit string.
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smi), radius=6, nBits=512)
        return [int(char) for char in fingerprint.ToBitString()]

    return [_bits(smi) for smi in smi_list]
def draw_fragment(self,
                  fragment_id: Union[str, int],
                  show_zscore: bool = True) -> str:
    """Draw a specified fragment.

    Args:
        fragment_id (Union[str, int]): User-defined fragment string, or
            position of the fingerprint bit to be drawn.
        show_zscore (bool, optional): Annotate drawing with zscore.
            Defaults to True.

    Returns:
        str: Molecule drawing SVG.
            NOTE(review): the user-defined branch calls
            ``Draw.MolsToGridImage`` without ``useSVG``, so its return
            type may differ from the SVG of the other branches -- confirm.

    Raises:
        ValueError: if fragments are auto-generated and ``self.fp_type`` is
            neither ``"morgan"`` nor ``"rdkit"``. (Previously this surfaced
            as an ``UnboundLocalError``.)
    """
    # images will be annotated with zscore
    legend = f"zscore = {self.zscores[fragment_id]:.2f}" if show_zscore else ""

    # handle drawing of user-defined fragments
    if self.user_frags:
        mol = Chem.MolFromSmarts(fragment_id)
        # Return here: previously execution fell through to the
        # fingerprint-bit drawing below, which overwrote or broke this image.
        return Draw.MolsToGridImage([mol],
                                    molsPerRow=1,
                                    subImgSize=(200, 200),
                                    legends=[legend])

    # handle drawing of auto-generated fragments
    mol = self._get_mol_with_frag(fragment_id)
    bit_info = {}
    if self.fp_type == "morgan":
        # The fingerprint itself is not needed, only the populated bit_info.
        rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,
                                                       radius=self.fp_rad,
                                                       nBits=self.fp_bits,
                                                       bitInfo=bit_info)
        return Draw.DrawMorganBit(mol,
                                  fragment_id,
                                  bit_info,
                                  useSVG=True,
                                  legend=legend)
    if self.fp_type == "rdkit":
        Chem.RDKFingerprint(
            mol,
            minPath=self.fp_rad,
            maxPath=self.fp_rad,
            fpSize=self.fp_bits,
            bitInfo=bit_info,
        )
        return Draw.DrawRDKitBit(mol,
                                 fragment_id,
                                 bit_info,
                                 useSVG=True,
                                 legend=legend)

    raise ValueError(f"unsupported fingerprint type: {self.fp_type!r}")
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Calculate circular fingerprint.

    With ``self.sparse`` unset, the folded bit vector is returned as a
    float numpy array. With ``self.sparse`` set, a dict of the non-zero
    fingerprint elements is returned instead; when ``self.smiles`` is also
    set, every entry is replaced by ``{'smiles': ..., 'count': ...}``.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
      RDKit Mol object
    **kwargs
      ``mol`` is accepted as a deprecated alias of ``datapoint``; using it
      emits a ``DeprecationWarning``.

    Returns
    -------
    np.ndarray
      A numpy array of circular fingerprint.

    Raises
    ------
    ImportError
      If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdMolDescriptors
    except ModuleNotFoundError as err:
        raise ImportError(
            "This class requires RDKit to be installed.") from err

    if 'mol' in kwargs:
        import warnings

        datapoint = kwargs.get("mol")
        # Warn instead of raising: raising made the deprecated keyword
        # unusable and the assignment above dead code.
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    if self.sparse:
        info: Dict = {}
        fp = rdMolDescriptors.GetMorganFingerprint(
            datapoint,
            self.radius,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features,
            bitInfo=info)
        fp = fp.GetNonzeroElements()  # convert to a dict

        # generate SMILES for fragments
        if self.smiles:
            fp_smiles = {}
            for fragment_id, count in fp.items():
                # Use the first (atom, radius) environment recorded for
                # this fragment id.
                root, radius = info[fragment_id][0]
                env = Chem.FindAtomEnvironmentOfRadiusN(
                    datapoint, radius, root)
                frag = Chem.PathToSubmol(datapoint, env)
                smiles = Chem.MolToSmiles(frag)
                fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
            fp = fp_smiles
    else:
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            datapoint,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)
        fp = np.asarray(fp, dtype=float)
    return fp
def get_input_features(self, mol):
    """Return the Morgan fingerprint of ``mol`` as a float32 numpy array.

    The fingerprint is computed with radius ``self.radius``.

    Raises:
        MolFeatureExtractionError: if the fingerprint cannot be computed;
            the original exception is attached as its cause.
    """
    try:
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol, self.radius)
    except Exception as e:
        logger = getLogger(__name__)
        # The placeholder is required: passing ``e`` as an extra argument
        # without one makes the logging module report a formatting error.
        logger.debug('exception caught at ECFPPreprocessor: %s', e)
        # Extracting feature failed
        raise MolFeatureExtractionError from e
    # TODO(Nakago): Test it.
    return numpy.asarray(fp, numpy.float32)
def get_morgan(molecule, length=512):
    """Return the radius-2 Morgan bit-vector fingerprint of ``molecule``.

    ``length`` is the fingerprint size in bits. If the fingerprint call
    raises, the exception and the molecule are printed and ``np.nan`` is
    returned instead.
    """
    # radius=2 = ECFP4, radius=3 = ECFP6, etc.
    ecfp_radius = 2
    try:
        return rdMolDescriptors.GetMorganFingerprintAsBitVect(
            molecule, ecfp_radius, nBits=length)
    except Exception as error:
        print(error)
        print('error ' + str(molecule))
        return np.nan
def index_row(key, smiles):
    """Build an index record holding the Morgan fingerprint of ``smiles``.

    Returns a dict with the keys ``'key'``, ``'morgan_fp'`` (the radius-2
    fingerprint as Base64, or ``''`` on failure) and ``'error'`` (a
    message describing the exception, or ``''`` on success).
    """
    row = {'key': key, 'morgan_fp': '', 'error': ''}
    try:
        molecule = Chem.MolFromSmiles(smiles)
        fingerprint = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            molecule, radius=2)
        row['morgan_fp'] = fingerprint.ToBase64()
    except Exception as exc:
        row['error'] = f'Exception {exc} processing {smiles}'
    return row