def _morgan(self, molecules): if self.vector == 'int': from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint self.fps_ = [ GetMorganFingerprint(self._sanitary(mol), self.radius, **self.kwargs) for mol in molecules ] # get nonzero elements as a dictionary for each molecule dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps_] # pairScores = [] # for fp in dict_nonzero: # pairScores += list(fp) data = pd.DataFrame( dict_nonzero) #, columns=list(set(pairScores))) data.fillna(0, inplace=True) return data elif self.vector == 'bit': from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect self.fps_ = [ GetMorganFingerprintAsBitVect(self._sanitary(mol), self.radius, nBits=self.n_bits, **self.kwargs) for mol in molecules ] data = np.array(self.fps_) data = pd.DataFrame(data) return data
def test_keep_similar_samples(self):
    """Unfiltered sampling must not keep every pair below 0.85 similarity.

    Draws 100 samples with ``filter_similar=False``, scores every unordered
    pair with ``FingerprintSimilarity`` on radius-4, 1024-bit Morgan bit
    vectors, and asserts that not all of the scores are below 0.85.
    """
    smiles = self.sampler.sample(100, filter_similar=False, verbose=False)
    total = len(smiles)
    similarities = []

    for first in range(total - 1):
        fp_first = GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smiles[first]), 4, nBits=1024)
        for second in range(first + 1, total):
            fp_second = GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(smiles[second]), 4, nBits=1024)
            similarities.append(FingerprintSimilarity(fp_first, fp_second))

    self.assertFalse(all(value < 0.85 for value in similarities))
def morgan_fingerprint(
    df: pd.DataFrame,
    mols_col: str,
    radius: int = 3,
    nbits: int = 2048,
    kind: str = "counts",
) -> pd.DataFrame:
    """
    Convert a column of RDKIT Mol objects into Morgan Fingerprints.

    Returns a new dataframe without any of the original data. This is
    intentional, as Morgan fingerprints are usually high-dimensional
    features.

    Method chaining usage:

    .. code-block:: python

        df = pd.DataFrame(...)
        morgans = df.morgan_fingerprint(mols_col='mols', radius=3, nbits=2048)

    If you wish to join the Morgans back into the original dataframe, this
    can be accomplished by doing a `join`, because the indices are
    preserved:

    .. code-block:: python

        joined = df.join(morgans)

    An empty input column yields an empty dataframe with ``nbits`` columns
    instead of failing while stacking zero fingerprints.

    :param df: A pandas DataFrame.
    :param mols_col: The name of the column that has the RDKIT mol objects
    :param radius: Radius of Morgan fingerprints. Defaults to 3.
    :param nbits: The length of the fingerprints. Defaults to 2048.
    :param kind: Whether to return counts or bits. Defaults to counts.
    :returns: A pandas DataFrame
    :raises ValueError: if ``kind`` is neither ``"counts"`` nor ``"bits"``.
    """
    acceptable_kinds = ["counts", "bits"]
    if kind not in acceptable_kinds:
        raise ValueError(f"`kind` must be one of {acceptable_kinds}")

    if kind == "bits":
        fps = [
            GetMorganFingerprintAsBitVect(m, radius, nbits)
            for m in df[mols_col]
        ]
    else:  # kind == "counts"
        fps = [
            GetHashedMorganFingerprint(m, radius, nbits)
            for m in df[mols_col]
        ]

    # np.vstack refuses an empty sequence, so short-circuit the no-row case.
    if not fps:
        return pd.DataFrame(np.zeros((0, nbits)), index=df.index)

    np_fps = []
    for fp in fps:
        # ConvertToNumpyArray resizes the destination to the fingerprint
        # length, so a one-element placeholder is sufficient.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)

    fpdf = pd.DataFrame(np.vstack(np_fps))
    fpdf.index = df.index
    return fpdf
def test__string_output_format__binary(self) -> None:
    """Check the ``sparse_string`` output of the folded binary fingerprint.

    Every on-bit of the RDKit reference fingerprint must appear in the
    container parsed from the featurizer's string for the same molecule.
    """
    featurizer = CircularFPFeaturizer(output_format="sparse_string",
                                      fp_mode="binary_folded")
    encoded = featurizer.fit_transform(self.smis)  # using SMILES

    # Output shape
    self.assertEqual(self.n_mols, len(encoded))

    # Fingerprint matrix structure
    for row, mol in enumerate(self.mols):
        reference = GetMorganFingerprintAsBitVect(
            mol,
            radius=featurizer.radius,
            useFeatures=featurizer.use_features_,
            useChirality=featurizer.use_chirality,
            nBits=featurizer.n_bits_,
        )

        # NOTE(review): eval() on the featurizer's own output; the string is
        # wrapped in braces so it parses as a Python literal container.
        decoded = eval("{" + encoded[row] + "}")

        for on_bit in reference.GetOnBits():
            self.assertIn(on_bit, decoded)
def test_correct_filter_similar_samples(self):
    """Filtered sampling must keep every pair below the 0.3 threshold.

    Draws 60 samples with ``filter_similar=True`` and ``threshold=0.3``,
    scores every unordered pair with ``FingerprintSimilarity`` on radius-4,
    2048-bit Morgan bit vectors, and asserts that all scores are below 0.3.
    """
    smiles = self.sampler.sample(60,
                                 filter_similar=True,
                                 threshold=0.3,
                                 verbose=False)
    total = len(smiles)
    similarities = []

    for first in range(total - 1):
        fp_first = GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(smiles[first]), 4, nBits=2048)
        for second in range(first + 1, total):
            fp_second = GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(smiles[second]), 4, nBits=2048)
            similarities.append(FingerprintSimilarity(fp_first, fp_second))

    self.assertTrue(all(value < 0.3 for value in similarities))
def __represent(self, smiles):
    """Turn a SMILES string into its Morgan bit fingerprint as a NumPy array.

    Surrounding whitespace is stripped before parsing.

    Raises:
        ValueError: if the SMILES string cannot be parsed into a molecule.
    """
    # The descriptor must be a binary Morgan fingerprint with radius 2 and 1024 bits.
    molecule = Chem.MolFromSmiles(smiles.strip())
    if molecule is None:
        raise ValueError('%s is not a valid SMILES representation' % smiles)

    bit_vector = GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=1024)
    return np.array(bit_vector)
def ecfp(mol, r=3, nBits=4096, errors_as_zeros=True):
    """Return the Morgan bit fingerprint of ``mol`` as a float32 array.

    Args:
        mol: An RDKit ``Mol``; anything else is handed to
            ``Chem.MolFromSmiles`` first.
        r: Fingerprint radius.
        nBits: Length of the bit vector.
        errors_as_zeros: Selects the fallback used when the fingerprint
            cannot be computed: an all-zero vector of length ``nBits`` when
            true, ``nan`` otherwise.

    Returns:
        A ``float32`` array holding the fingerprint bits, or the fallback
        value described above.
    """
    if not isinstance(mol, rdkit.Chem.rdchem.Mol):
        mol = Chem.MolFromSmiles(mol)

    try:
        # ConvertToNumpyArray resizes the destination array itself.
        arr = np.zeros((1,))
        ConvertToNumpyArray(GetMorganFingerprintAsBitVect(mol, r, nBits), arr)
        return arr.astype(np.float32)
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still propagate.
        if errors_as_zeros:
            return np.zeros((nBits,), dtype=np.float32)
        # `np.nan` is the spelling that survives NumPy 2.0, which removed the
        # `np.NaN` alias.
        return np.nan
def _transform_mol(self, mol):
    """Compute the Morgan fingerprint of a single skchem molecule.

    This is the per-molecule worker; `transform` is the public entry point
    that accepts iterables of mols.

    Args:
        mol (skchem.Mol):
            Molecule to calculate fingerprint for.

    Returns:
        np.array or dict:
            Fingerprint as an array (or a dict if sparse).
    """
    shared_options = dict(useFeatures=self.use_features,
                          useBondTypes=self.use_bond_types,
                          useChirality=self.use_chirality)

    # Fixed-length bit vector.
    if self.as_bits and self.n_feats > 0:
        bit_vector = GetMorganFingerprintAsBitVect(mol,
                                                   self.radius,
                                                   nBits=self.n_feats,
                                                   **shared_options)
        dense = np.array(0)
        ConvertToNumpyArray(bit_vector, dense)
        return dense.astype(np.uint8)

    # Sparse fingerprint returned as a dict of its non-zero elements.
    if self.n_feats <= 0:
        sparse = GetMorganFingerprint(mol, self.radius,
                                      **shared_options).GetNonzeroElements()
        if self.as_bits:
            # Collapse counts to presence flags.
            sparse = {key: int(count > 0) for key, count in sparse.items()}
        return sparse

    # Fixed-length hashed fingerprint.
    hashed = GetHashedMorganFingerprint(mol,
                                        self.radius,
                                        nBits=self.n_feats,
                                        **shared_options)
    return np.array(list(hashed))
def mol_to_1jxx_feats(mol: MyMol, atom0: int, atom1: int):
    """Build the feature dict for the atom pair ``(atom0, atom1)`` of ``mol``.

    The nodes of interest are the atoms on ``mol.path(atom0, atom1)`` (named
    by their position on the path) followed by the nodes returned by
    ``determine_surrounding_nodes_1jhx``. The result merges, in this order:
    per-node features, pairwise 3D distances, angles, distance statistics,
    dihedral statistics and angle statistics against H/C/N/O, then the
    non-feature entries ``'fp'``, ``'molecule_name'`` and one ``'n<name>'``
    entry per node. Later entries override earlier ones on key clashes.
    """
    path = mol.path(atom0, atom1)
    assert path[0] == atom0, 'wrong path'

    surrounding = determine_surrounding_nodes_1jhx(mol.G, path)
    on_path = [
        NamedNode(name=str(position), n=node)
        for position, node in enumerate(path)
    ]
    nodes_in_interest = on_path + surrounding
    atom_symbols = ['H', 'C', 'N', 'O']

    feats = {}
    for node in nodes_in_interest:
        feats.update(node_to_feat(mol, node.n, node.name))

    feats.update(calc_all_3d_dist(mol, nodes_in_interest))
    feats.update(calc_all_angle_feats(mol, nodes_in_interest))
    feats.update(calc_dist_stats_feats(mol, nodes_in_interest))

    for node, symbol in product(nodes_in_interest, atom_symbols):
        feats.update(calc_node_to_atom_dihedral_stats(mol, node, symbol))
    for node, symbol in product(nodes_in_interest, atom_symbols):
        feats.update(calc_node_to_atom_angle_stats(mol, node, symbol))

    # Not feature
    feats['fp'] = GetMorganFingerprintAsBitVect(mol.mol, 2, fromAtoms=path)
    feats['molecule_name'] = mol.name
    for node in nodes_in_interest:
        feats['n{}'.format(node.name)] = node.n

    return feats
def create_circular_fingerprint(mol, radius, size, chirality):
    """
    Compute the Morgan bit fingerprint of a molecule.

    :param mol: molecule passed to RDKit's fingerprint function
    :param radius: fingerprint radius
    :param size: number of bits in the fingerprint (``nBits``)
    :param chirality: forwarded as ``useChirality``
    :return: np array of morgan fingerprint
    """
    return np.array(
        GetMorganFingerprintAsBitVect(mol,
                                      radius,
                                      nBits=size,
                                      useChirality=chirality))
def CalculateECFP(self, mol):
    """Compute the ECFP (Morgan bit-vector) fingerprint of one molecule.

    Uses ``self.radius`` and ``self.nBits``; ``useFeatures`` is not passed,
    so RDKit's default for it applies.

    :param mol: molecule
    :type mol: rdkit.Chem.rdchem.Mol
    :return: the fingerprint bits, one list element per bit
    :rtype: list
    """
    bit_vector = GetMorganFingerprintAsBitVect(mol,
                                               radius=self.radius,
                                               nBits=self.nBits)
    return list(bit_vector)
def test__folded_binary_fingerprints__ecfp(self) -> None:
    """Folded binary fingerprints must match RDKit bit for bit.

    The featurizer is run on both the SMILES and the Mol inputs; each
    resulting matrix must have one row per molecule, ``n_bits_folded``
    columns, and truthy entries exactly at the reference on-bits.
    """
    featurizer = CircularFPFeaturizer(fp_mode="binary_folded",
                                      n_bits_folded=512)
    from_smiles = featurizer.fit_transform(self.smis)  # using SMILES
    from_mols = featurizer.fit_transform(self.mols)  # using Mol objects

    # Output shape
    self.assertEqual(from_smiles.shape[0], self.n_mols)
    self.assertEqual(from_smiles.shape[1], featurizer.n_bits_folded)
    self.assertEqual(from_mols.shape[0], self.n_mols)
    self.assertEqual(from_mols.shape[1], featurizer.n_bits_folded)

    # Fingerprint matrix structure
    for row, mol in enumerate(self.mols):
        reference = GetMorganFingerprintAsBitVect(
            mol,
            radius=featurizer.radius,
            useFeatures=featurizer.use_features_,
            useChirality=featurizer.use_chirality,
            nBits=featurizer.n_bits_folded,
        )
        expected_on = set(reference.GetOnBits())

        for col in range(featurizer.n_bits_folded):
            check = self.assertTrue if col in expected_on else self.assertFalse
            check(from_smiles[row, col])
            check(from_mols[row, col])
def CalculateFCFP(self, mol):
    """
    Compute the feature-based Morgan bit fingerprint of one molecule
    (``useFeatures=True``), using ``self.radius`` and ``self.nBits``.

    Parameters:
    -----------
    mol: rdkit.Chem.rdchem.Mol
        The molecule to fingerprint

    Return:
    -----------
    fps: list
        The fingerprint bits, one list element per bit
    """
    bit_vector = GetMorganFingerprintAsBitVect(mol,
                                               radius=self.radius,
                                               nBits=self.nBits,
                                               useFeatures=True)
    return list(bit_vector)
def CalculateECFP(self, mol):
    """
    Compute the Morgan bit fingerprint of one molecule, using
    ``self.radius`` and ``self.nBits``.

    Parameters:
    -----------
    mol: rdkit.Chem.rdchem.Mol
        The molecule to fingerprint

    Return:
    -----------
    fps: list
        The fingerprint bits, one list element per bit
    """
    return list(
        GetMorganFingerprintAsBitVect(mol,
                                      radius=self.radius,
                                      nBits=self.nBits))
def _sample_w_filter(self, n_samples, data, threshold, verbose):
    """Draw ``n_samples`` entries from ``data``, rejecting similar molecules.

    Each round picks a random entry, takes the text before the first comma
    as a SMILES string and computes its radius-4, 1024-bit Morgan bit
    vector. Entries whose SMILES cannot be parsed, or that
    ``self._are_similar`` reports as similar to an already selected
    fingerprint, are removed from ``data``; accepted SMILES are appended to
    ``self._samples``.

    Args:
        n_samples: Number of SMILES to accept before returning.
        data: Mutable list of comma-separated records. It is modified in
            place: rejected records are removed and the order is not kept.
        threshold: Forwarded to ``self._are_similar``.
        verbose: Show a tqdm progress bar when true.

    Raises:
        ValueError: from ``random.sample`` once ``data`` is exhausted before
            ``n_samples`` entries were accepted.
    """

    def _discard(position):
        # Swap-remove: move the last record into `position`. When `position`
        # already is the last record, popping alone removes it; writing back
        # to data[position] at that point would raise IndexError.
        last = data.pop()
        if position < len(data):
            data[position] = last

    count = 0
    selected_mols_fp = []
    pb = tqdm(total=n_samples, ascii=True, desc="Sampling") if verbose else None

    try:
        while count < n_samples:
            idx = random.sample(range(len(data)), 1)[0]
            smiles = data[idx].split(",")[0]

            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                _discard(idx)
                continue

            fp = GetMorganFingerprintAsBitVect(mol, 4, nBits=1024)
            if self._are_similar(fp, selected_mols_fp, threshold):
                _discard(idx)
                continue

            self._samples.append(smiles)
            selected_mols_fp.append(fp)
            if pb is not None:
                pb.update(1)
            count += 1
    finally:
        # Release the progress bar even when sampling aborts.
        if pb is not None:
            pb.close()
def GetFoldedCircularFragment(mol,
                              minRadius=1,
                              maxRadius=2,
                              nBits=1024,
                              maxFragment=True,
                              disposed=True):
    """Get folded circular fragment

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol object
        Compound to be calculated
    minRadius : int, optional
        The probable minimum radius of circular fragment, by default 1
    maxRadius : int, optional
        The probable maximum radius of circular fragment, by default 2
    nBits : int, optional
        The number of bits of the Morgan fingerprint, by default 1024
    maxFragment : bool, optional
        Whether only return the maximum fragment at a center atom,
        by default True
    disposed : bool, optional
        Whether dispose the original bitinfo, by default True

    Returns
    -------
    fragments
        The result of ``_DisposeCircularBitInfo`` when ``disposed`` is true,
        otherwise the raw ``bitInfo`` dict filled in by RDKit.
    """
    bit_info = {}
    # Called only for its side effect of populating `bit_info`.
    GetMorganFingerprintAsBitVect(mol,
                                  radius=maxRadius,
                                  nBits=nBits,
                                  bitInfo=bit_info)

    if not disposed:
        return bit_info
    return _DisposeCircularBitInfo(bit_info, minRadius, maxFragment)
def get_ecfp(mol):
    """Return the radius-2, 2048-bit Morgan bit fingerprint of ``mol``.

    The array is built from the characters of the fingerprint's bit string,
    so its elements are one-character strings rather than numbers.
    """
    from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

    fingerprint = GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    characters = list(fingerprint.ToBitString())
    return np.array(characters)
)
from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import (
    ExplicitBitVect,
    BulkTanimotoSimilarity,
    BulkDiceSimilarity,
    BulkTverskySimilarity,
)
from rdkit.ML.Cluster import Butina

# Module-wide debug switch.
DEBUG = True

# Descriptor name -> callable that turns a molecule into a fingerprint.
DESCRIPTORS = {
    'path': RDKFingerprint,
    # Morgan bit vector, radius 2, RDKit's default bit count.
    'ecfp4': lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2),
    # Same as 'ecfp4' but with 512 bits.
    'zinc': lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512),
    'apair': lambda mol: GetAtomPairFingerprint(mol)
}

# Coefficient name -> callable scoring fingerprint `x` against the list `ys`.
# Every entry tolerates extra positional arguments so all coefficients can be
# called the same way; only 'tversky' consumes the two extra weights `a`, `b`.
COEFFICIENTS = {
    'tanimoto': lambda x, ys, *args: BulkTanimotoSimilarity(x, ys),
    'dice': lambda x, ys, *args: BulkDiceSimilarity(x, ys),
    'tversky': lambda x, ys, a, b, *args: BulkTverskySimilarity(x, ys, a, b),
}

# Names of the clustering approaches.
CLUSTERING_APPROACHES = [
    'butina',
    'cassidy',
]
def convert_to_morgan(mol):
    """Return the radius-2, 1024-bit Morgan bit-vector fingerprint of ``mol``."""
    return GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
def Fingerprint(self):
    """Compute the fingerprint selected by ``self.FPtype`` for ``self.molecules``.

    ``self.FPtype`` chooses the fingerprint family and ``self.vector`` the
    encoding (``'int'`` or ``'bit'``). The RDKit fingerprint objects are
    stored on ``self.fps`` and a DataFrame with one row per molecule is
    returned.

    Supported ``FPtype`` values:

    * ``'Hashed_atom_pair'`` / ``'HAP'`` -- ``'int'`` or ``'bit'``, uses
      ``self.nBits``.
    * ``'Atom_pair'`` / ``'AP'`` -- ``'int'`` or ``'bit'``; only the
      positions that are non-zero in at least one molecule become columns.
    * ``'MACCS'`` -- ``'bit'`` only.
    * ``'Morgan'`` -- ``'int'`` or ``'bit'``, uses ``self.radius`` (and
      ``self.nBits`` for ``'bit'``).
    * ``'Hashed_topological_torsion'`` / ``'HTT'`` -- ``'int'`` or
      ``'bit'``, uses ``self.nBits``.
    * ``'Topological_torsion'`` / ``'TT'`` -- ``'int'`` only.

    Sparse results have missing entries filled with 0.

    Returns:
        pandas.DataFrame: one row per molecule.

    Raises:
        ValueError: for an unknown ``FPtype``, an unknown ``vector``, or an
            ``FPtype``/``vector`` combination without an RDKit function.
    """

    def _frame_from_rows(rows, columns=None):
        # Build a zero-filled frame from per-molecule {key: value} mappings.
        # Without explicit columns, use every key seen in any molecule.
        if columns is None:
            seen = [key for row in rows for key in row]
            columns = list(set(seen))
        frame = pd.DataFrame(rows, columns=columns)
        frame.fillna(0, inplace=True)
        return frame

    def _nonzero_rows():
        # Non-zero elements of every stored fingerprint.
        return [fp.GetNonzeroElements() for fp in self.fps]

    def _dense_frame():
        # Stored bit vectors converted through np.array, one row each.
        return pd.DataFrame(np.array(self.fps))

    if self.FPtype in ('Hashed_atom_pair', 'HAP'):
        if self.vector == 'int':
            from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
            self.fps = [
                GetHashedAtomPairFingerprint(m, nBits=self.nBits)
                for m in self.molecules
            ]
            return _frame_from_rows(_nonzero_rows(), range(self.nBits))
        elif self.vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
            self.fps = [
                GetHashedAtomPairFingerprintAsBitVect(m, nBits=self.nBits)
                for m in self.molecules
            ]
            return _dense_frame()
        else:
            msg = "The argument vector can be 'int' or 'bit'"
            raise ValueError(msg)

    elif self.FPtype in ('Atom_pair', 'AP'):
        if self.vector == 'int':
            from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
            self.fps = [GetAtomPairFingerprint(m) for m in self.molecules]
            return _frame_from_rows(_nonzero_rows())
        elif self.vector == 'bit':
            from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect
            self.fps = [
                GetAtomPairFingerprintAsBitVect(m) for m in self.molecules
            ]
            # Keep only the set positions of each bit vector. The earlier
            # dense describe()/drop() pass and its debug prints were removed:
            # its result was discarded and it used Python 2 print statements.
            set_bits = [
                {i: el for i, el in enumerate(fp) if el != 0}
                for fp in self.fps
            ]
            return _frame_from_rows(set_bits)
        else:
            msg = "The argument vector can be 'int' or 'bit'"
            raise ValueError(msg)

    elif self.FPtype == 'MACCS':
        if self.vector == 'int':
            msg = "There is no RDKit function to encode int vectors for MACCS keys"
            raise ValueError(msg)
        elif self.vector == 'bit':
            from rdkit.Chem.MACCSkeys import GenMACCSKeys
            self.fps = [GenMACCSKeys(mol) for mol in self.molecules]
            return _dense_frame()
        else:
            msg = "The vector argument can only be 'int' or 'bit'"
            raise ValueError(msg)

    elif self.FPtype == 'Morgan':
        if self.vector == 'int':
            from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
            self.fps = [
                GetMorganFingerprint(mol, self.radius)
                for mol in self.molecules
            ]
            return _frame_from_rows(_nonzero_rows())
        elif self.vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
            self.fps = [
                GetMorganFingerprintAsBitVect(mol,
                                              self.radius,
                                              nBits=self.nBits)
                for mol in self.molecules
            ]
            return _dense_frame()
        else:
            msg = "The argument vector can only be 'int' or 'bit'"
            raise ValueError(msg)

    elif self.FPtype in ('Hashed_topological_torsion', 'HTT'):
        if self.vector == 'int':
            from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprint
            self.fps = [
                GetHashedTopologicalTorsionFingerprint(m, nBits=self.nBits)
                for m in self.molecules
            ]
            return _frame_from_rows(_nonzero_rows(), range(self.nBits))
        elif self.vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
            self.fps = [
                GetHashedTopologicalTorsionFingerprintAsBitVect(
                    m, nBits=self.nBits) for m in self.molecules
            ]
            return _dense_frame()
        else:
            msg = "The argument vector can be 'int' or 'bit'"
            raise ValueError(msg)

    elif self.FPtype in ('Topological_torsion', 'TT'):
        if self.vector == 'int':
            from rdkit.Chem.AtomPairs.Torsions import GetTopologicalTorsionFingerprintAsIntVect
            self.fps = [
                GetTopologicalTorsionFingerprintAsIntVect(mol)
                for mol in self.molecules
            ]
            return _frame_from_rows(_nonzero_rows())
        elif self.vector == 'bit':
            msg = "There is no RDKit function to encode bit vectors for Topological Torsion Fingerprints"
            raise ValueError(msg)
        else:
            msg = "The argument vector can only be 'int'"
            raise ValueError(msg)

    else:
        msg = "The type argument '%s' is not a valid fingerprint type" % self.FPtype
        raise ValueError(msg)
def morgan_fingerprint(
    df: pd.DataFrame,
    mols_column_name: str,
    radius: int = 3,
    nbits: int = 2048,
    kind: str = "counts",
) -> pd.DataFrame:
    """
    Convert a column of RDKIT Mol objects into Morgan Fingerprints.

    Returns a new dataframe without any of the original data. This is
    intentional, as Morgan fingerprints are usually high-dimensional
    features.

    This method does not mutate the original DataFrame.

    Functional usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.chemistry

        df = pd.DataFrame(...)

        # For "counts" kind
        morgans = janitor.chemistry.morgan_fingerprint(
            df=df.smiles2mol('smiles', 'mols'),
            mols_column_name='mols',
            radius=3,      # Defaults to 3
            nbits=2048,    # Defaults to 2048
            kind='counts'  # Defaults to "counts"
        )

        # For "bits" kind
        morgans = janitor.chemistry.morgan_fingerprint(
            df=df.smiles2mol('smiles', 'mols'),
            mols_column_name='mols',
            radius=3,      # Defaults to 3
            nbits=2048,    # Defaults to 2048
            kind='bits'    # Defaults to "counts"
        )

    Method chaining usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.chemistry

        df = pd.DataFrame(...)

        # For "counts" kind
        morgans = (
            df.smiles2mol('smiles', 'mols')
              .morgan_fingerprint(mols_column_name='mols',
                                  radius=3,      # Defaults to 3
                                  nbits=2048,    # Defaults to 2048
                                  kind='counts'  # Defaults to "counts"
              )
        )

        # For "bits" kind
        morgans = (
            df.smiles2mol('smiles', 'mols')
              .morgan_fingerprint(mols_column_name='mols',
                                  radius=3,    # Defaults to 3
                                  nbits=2048,  # Defaults to 2048
                                  kind='bits'  # Defaults to "counts"
              )
        )

    If you wish to join the morgan fingerprints back into the original
    dataframe, this can be accomplished by doing a `join`,
    because the indices are preserved:

    .. code-block:: python

        joined = df.join(morgans)

    An empty input column yields an empty dataframe with ``nbits`` columns
    instead of failing while stacking zero fingerprints.

    :param df: A pandas DataFrame.
    :param mols_column_name: The name of the column that has the RDKIT
        mol objects
    :param radius: Radius of Morgan fingerprints. Defaults to 3.
    :param nbits: The length of the fingerprints. Defaults to 2048.
    :param kind: Whether to return counts or bits. Defaults to counts.
    :returns: A new pandas DataFrame of Morgan fingerprints.
    :raises ValueError: if ``kind`` is neither ``"counts"`` nor ``"bits"``.
    """
    acceptable_kinds = ["counts", "bits"]
    if kind not in acceptable_kinds:
        raise ValueError(f"`kind` must be one of {acceptable_kinds}")

    if kind == "bits":
        fps = [
            GetMorganFingerprintAsBitVect(m, radius, nbits, useChirality=True)
            for m in df[mols_column_name]
        ]
    else:  # kind == "counts"
        fps = [
            GetHashedMorganFingerprint(m, radius, nbits, useChirality=True)
            for m in df[mols_column_name]
        ]

    # np.vstack refuses an empty sequence, so short-circuit the no-row case.
    if not fps:
        return pd.DataFrame(np.zeros((0, nbits)), index=df.index)

    np_fps = []
    for fp in fps:
        # ConvertToNumpyArray resizes the destination to the fingerprint
        # length, so a one-element placeholder is sufficient.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)

    fpdf = pd.DataFrame(np.vstack(np_fps))
    fpdf.index = df.index
    return fpdf
from rdkit import Chem from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect from rdkit import DataStructs from tqdm import tqdm if __name__ == "__main__": with open("./test.csv", "r") as f: header, *data = f.readlines() i = 0 pb1 = tqdm(total=len(data), ascii=True, desc="Main progress") while i < len(data): pb2 = tqdm(total=len(data), ascii=True, desc="Look for similar") m1 = Chem.MolFromSmiles(data[i].split(",")[0]) fp1 = GetMorganFingerprintAsBitVect(m1, 4, nBits=2048) j = i + 1 while j < len(data): m2 = Chem.MolFromSmiles(data[j].split(",")[0]) if m2 is None: data[j] = data.pop() pb2.update(1) continue fp2 = GetMorganFingerprintAsBitVect(m2, 4, nBits=2048) similarity = DataStructs.FingerprintSimilarity(fp1, fp2) if similarity > 0.85: if j == len(data) - 1: data.pop() else: data[j] = data.pop() else: