def getAtomPairFingerPrintFromSmile(smile):
    """Return the hashed atom-pair fingerprint of a SMILES string as a bit string.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and fingerprinted with
    ``GetHashedAtomPairFingerprintAsBitVect`` using that function's default
    settings.

    Args:
        smile: SMILES representation of the molecule.

    Returns:
        The value of ``ToBitString()`` called on the fingerprint.

    Raises:
        Exception: If parsing or fingerprinting fails for any reason. The
            message is always ``"Not able to Generate FigerPrint"``.
    """
    try:
        molecule = Chem.MolFromSmiles(smile)
        atomPairFP = GetHashedAtomPairFingerprintAsBitVect(molecule)
        return atomPairFP.ToBitString()
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a
        # fingerprinting failure. The message text is kept verbatim because
        # callers may match on it.
        raise Exception("Not able to Generate FigerPrint")
def get_fp(mols):
    """Compute one fingerprint per usable molecule, of the type in ``args.fpType``.

    The fingerprint flavour is read from the module-level ``args`` object
    rather than passed in. Falsy entries of ``mols`` are skipped, so the result
    can be shorter than the input. An unrecognised ``args.fpType`` yields an
    empty list.

    Args:
        mols: Iterable of molecule objects; falsy entries are ignored.

    Returns:
        list: Fingerprints in input order, one per truthy molecule.
    """
    # Label -> per-molecule calculator. The lambdas keep the lookup of the
    # RDKit names lazy: a name is only resolved when its flavour is selected
    # and there is at least one molecule to process.
    calculators = (
        ('ECFP4', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2)),
        ('ECFP6', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3)),
        ('ECFP12', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 6)),
        ('MACCS', lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol)),
        ('Daylight', lambda mol: FingerprintMols.FingerprintMol(mol)),
        ('AP', lambda mol: GetHashedAtomPairFingerprintAsBitVect(mol, nBits=4096)),
    )

    fps = []
    for label, compute in calculators:
        if args.fpType == label:
            fps.extend(compute(mol) for mol in mols if mol)
    return fps
def get_fingerprints(smiles_df, r=2, length=512, type_='morgan'):
    """Fingerprint every molecule in ``smiles_df['mol']`` and tabulate the result.

    Args:
        smiles_df: Table-like object indexed by column name. ``smiles_df['mol']``
            is iterated for the molecules and ``smiles_df['drug'].values``
            supplies the names (presumably a pandas DataFrame; confirm
            against callers).
        r: Radius passed to the Morgan fingerprint for the ``'morgan'`` and
            ``'fcpf'`` types; unused by the other types.
        length: Fingerprint size, passed as ``nBits`` (``fpSize`` for
            ``'rdkit'``).
        type_: One of ``'morgan'``, ``'fcpf'``, ``'atom pair'``, ``'avalon'``,
            ``'torsion'`` or ``'rdkit'``. ``'fcpf'`` is the Morgan fingerprint
            with ``useFeatures=True``; the spelling is kept as-is because it is
            part of the accepted interface.

    Returns:
        Whatever ``fp_to_pandas(fp=..., drug_names=...)`` returns for the
        computed fingerprints.

    Raises:
        ValueError: If ``type_`` is not one of the supported values. This is
            raised before any molecule is touched.
    """
    builders = {
        'morgan': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, nBits=length),
        'fcpf': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, useFeatures=True, nBits=length),
        'atom pair': lambda m: GetHashedAtomPairFingerprintAsBitVect(
            m, nBits=length),
        'avalon': lambda m: GetAvalonFP(m, nBits=length),
        'torsion': lambda m: GetHashedTopologicalTorsionFingerprintAsBitVect(
            m, nBits=length),
        'rdkit': lambda m: RDKFingerprint(m, fpSize=length),
    }

    try:
        build = builders[type_]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are just as invalid.
        # The message lists the keys that are really accepted ('torsion').
        raise ValueError(
            "Possible values: morgan, fcpf, atom pair, avalon, torsion and rdkit")

    fp = [build(m) for m in smiles_df['mol']]
    drug_names = smiles_df['drug'].values
    return fp_to_pandas(fp=fp, drug_names=drug_names)
def _transform_mol(self, mol):
    """Private method to transform a skchem molecule.

    Use `transform` for the public method, which genericizes the argument to
    iterables of mols.

    Three output forms are produced, selected by ``self.n_feats`` and
    ``self.as_bits``:

    * ``n_feats > 0`` and ``as_bits``: hashed bit vector as a ``uint8`` array.
    * ``n_feats > 0`` and not ``as_bits``: hashed count vector as an array.
    * ``n_feats <= 0``: unhashed fingerprint as a dict of its nonzero
      elements; values are clipped to 0/1 when ``as_bits`` is set.

    Args:
        mol (skchem.Mol):
            Molecule to calculate fingerprint for.

    Returns:
        np.array or dict:
            Fingerprint as an array (or a dict if sparse).
    """
    # Options shared by every atom-pair variant.
    shared = dict(minLength=self.min_length,
                  maxLength=self.max_length,
                  includeChirality=self.use_chirality)
    hashed = self.n_feats > 0

    if hashed and self.as_bits:
        fp = GetHashedAtomPairFingerprintAsBitVect(
            mol, nBits=self.n_feats, **shared)
        # NOTE(review): the placeholder array is presumably resized in place
        # by ConvertToNumpyArray; kept as it was, confirm against RDKit.
        res = np.array(0)
        ConvertToNumpyArray(fp, res)
        return res.astype(np.uint8)

    if hashed:
        fp = GetHashedAtomPairFingerprint(mol, nBits=self.n_feats, **shared)
        return np.array(list(fp))

    # Unhashed fingerprint: no size applies here, so ``nBits`` is not passed
    # (GetAtomPairFingerprint does not take that keyword).
    res = GetAtomPairFingerprint(mol, **shared).GetNonzeroElements()
    if self.as_bits:
        res = {k: int(v > 0) for k, v in res.items()}
    return res
def get_fp(mols):
    """Compute one fingerprint/descriptor per usable molecule.

    The flavour is taken from the module-level ``args.fpType``. Falsy entries
    of ``mols`` are skipped, so the result can be shorter than the input. An
    unrecognised ``args.fpType`` yields an empty list.

    Args:
        mols: Iterable of molecule objects; falsy entries are ignored.

    Returns:
        list: Results in input order, one per truthy molecule.
    """
    # Flavours that need nothing but the molecule. The lambdas keep the lookup
    # of the RDKit names lazy until a molecule is actually processed.
    per_molecule = (
        ('ECFP4', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2)),
        ('ECFP6', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3)),
        ('ECFP12', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 6)),
        ('MACCS', lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol)),
        ('Daylight', lambda mol: FingerprintMols.FingerprintMol(mol)),
        ('AP', lambda mol: GetHashedAtomPairFingerprintAsBitVect(mol, nBits=4096)),
    )

    fps = []
    for label, compute in per_molecule:
        if args.fpType == label:
            fps.extend(compute(mol) for mol in mols if mol)

    # 'simple' is handled apart: one describer is created up front (even for
    # an empty input) and reused for every molecule.
    if args.fpType == 'simple':
        describer = MUVDescriptors()
        fps.extend(describer.calculate_descriptors(mol) for mol in mols if mol)

    return fps
def _hap(self, molecules):
    """Build a DataFrame of hashed atom-pair fingerprints.

    Every molecule is first passed through ``self._sanitary``. The computed
    fingerprint objects are stored on ``self.fps_``.

    Args:
        molecules: Iterable of molecules to fingerprint.

    Returns:
        pandas.DataFrame or None: For ``self.vector == 'int'``, one row per
        molecule built from each fingerprint's nonzero elements, with missing
        entries filled with 0. For ``self.vector == 'bit'``, a frame built
        from the array of bit vectors. ``None`` for any other ``vector``.
    """
    mode = self.vector

    if mode == 'int':
        from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint

        counted = [
            GetHashedAtomPairFingerprint(
                self._sanitary(mol), nBits=self.n_bits, **self.kwargs)
            for mol in molecules
        ]
        self.fps_ = counted
        # One dict of {index: count} per molecule; absent indices become NaN
        # in the frame and are then zero-filled.
        rows = [fingerprint.GetNonzeroElements() for fingerprint in counted]
        table = pd.DataFrame(rows)
        table.fillna(0, inplace=True)
        return table

    if mode == 'bit':
        from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect

        bit_vectors = [
            GetHashedAtomPairFingerprintAsBitVect(
                self._sanitary(mol), nBits=self.n_bits, **self.kwargs)
            for mol in molecules
        ]
        self.fps_ = bit_vectors
        return pd.DataFrame(np.array(bit_vectors))

    return None
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded as ``nBits``
            (``fpSize`` for 'rdkit'); the 'estate', 'erg' and 'maccs'
            entries do not forward it to their fingerprint function.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint
    """
    # Each entry takes (mol, params); params is the keyword dict built in
    # __init__. Entries that ignore params produce fixed-size output.
    MAPPING = OrderedDict(
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING.keys())):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up on the instance so a subclass-level MAPPING is honoured;
        # the second check covers a kind missing from such an override.
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        self._params = {}
        self._params.update({
            ('fpSize' if kind == 'rdkit' else 'nBits'): length
        })

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: np.ndarray or list
                The computed fingerprint: the output of
                ``explicit_bit_vect_to_array`` for bit vectors, otherwise a
                plain list of the fingerprint's elements.
        """
        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (currently unused here)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the
                requested length of features and N is the number of input
                molecules that have been successfully featurized.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`
        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop molecules that could not be converted. An explicit identity
        # test is used: filter(None.__ne__, ...) depends on the truth value
        # of NotImplemented, which newer Python versions reject.
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)
        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable. A dtype accepted by
                ``is_dtype_numpy_array`` yields a numpy array, one accepted
                by ``is_dtype_torch_tensor`` yields a tensor.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent ``__call__``

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the
                number of input molecules that have been successfully
                featurized.
            ids: array
                all valid molecule positions that did not failed during
                featurization

        Raises
        ------
            TypeError
                If ``dtype`` is neither a supported numpy nor torch dtype.

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`
        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
def convert_to_atompair(SMILES):
    """Convert a SMILES string into an atom-pair fingerprint as a list of floats.

    The molecule is built with ``MS`` and fingerprinted with ``AtomPair``
    (both are aliases defined elsewhere in this module). The fingerprint's
    bit string is then turned into one float per character.

    Args:
        SMILES: SMILES representation of the molecule.

    Returns:
        list: One numpy float per position of the fingerprint's bit string.
    """
    fingerprint = AtomPair(MS(SMILES))
    bit_characters = list(fingerprint.ToBitString())
    # numpy parses each character of the bit string into a float.
    as_floats = np.asarray(bit_characters, dtype=float)
    return list(as_floats)
def Fingerprint(self):
    """Compute fingerprints for ``self.molecules`` and return them as a DataFrame.

    The fingerprint family is chosen by ``self.FPtype`` and the encoding by
    ``self.vector`` (``'int'`` for counts, ``'bit'`` for bit vectors). The
    fingerprint objects are stored on ``self.fps``.

    Supported ``FPtype`` values:

    * ``'Hashed_atom_pair'`` / ``'HAP'``: int or bit, sized by ``self.nBits``.
    * ``'Atom_pair'`` / ``'AP'``: int or bit; only the keys/bits that occur in
      at least one molecule become columns.
    * ``'MACCS'``: bit only.
    * ``'Morgan'``: int or bit, using ``self.radius`` (and ``self.nBits`` for
      bit vectors).
    * ``'Hashed_topological_torsion'`` / ``'HTT'``: int or bit, sized by
      ``self.nBits``.
    * ``'Topological_torsion'`` / ``'TT'``: int only.

    Returns:
        pandas.DataFrame: One row per molecule. Sparse (count) encodings have
        missing entries filled with 0.

    Raises:
        ValueError: If ``FPtype`` is unknown, or ``vector`` is not valid for
            the chosen ``FPtype``. These checks happen before RDKit is
            imported.
    """

    def _sparse_frame(rows, columns=None):
        # Tabulate a list of {key: value} dicts. Without explicit columns,
        # the columns are the keys seen in at least one row.
        if columns is None:
            seen = [key for row in rows for key in row]
            columns = list(set(seen))
        frame = pd.DataFrame(rows, columns=columns)
        frame.fillna(0, inplace=True)
        return frame

    def _dense_frame(fps):
        # One column per position of the (fixed-size) bit vectors.
        return pd.DataFrame(np.array(fps))

    fp_type = self.FPtype
    vector = self.vector

    if fp_type == 'Hashed_atom_pair' or fp_type == 'HAP':
        if vector == 'int':
            from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
            self.fps = [
                GetHashedAtomPairFingerprint(m, nBits=self.nBits)
                for m in self.molecules
            ]
            rows = [fp.GetNonzeroElements() for fp in self.fps]
            return _sparse_frame(rows, columns=range(self.nBits))
        if vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
            self.fps = [
                GetHashedAtomPairFingerprintAsBitVect(m, nBits=self.nBits)
                for m in self.molecules
            ]
            return _dense_frame(self.fps)
        msg = "The argument vector can be 'int' or 'bit'"
        raise ValueError(msg)

    if fp_type == 'Atom_pair' or fp_type == 'AP':
        if vector == 'int':
            from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
            self.fps = [GetAtomPairFingerprint(m) for m in self.molecules]
            rows = [fp.GetNonzeroElements() for fp in self.fps]
            return _sparse_frame(rows)
        if vector == 'bit':
            from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect
            self.fps = [
                GetAtomPairFingerprintAsBitVect(m) for m in self.molecules
            ]
            # Only the set bits are tabulated. No dense matrix of the full
            # unhashed bit vectors is built: the result never depended on
            # one, it only fed debug output.
            rows = [{i: el for i, el in enumerate(fp) if el != 0}
                    for fp in self.fps]
            return _sparse_frame(rows)
        msg = "The argument vector can be 'int' or 'bit'"
        raise ValueError(msg)

    if fp_type == 'MACCS':
        if vector == 'int':
            msg = "There is no RDKit function to encode int vectors for MACCS keys"
            raise ValueError(msg)
        if vector == 'bit':
            from rdkit.Chem.MACCSkeys import GenMACCSKeys
            self.fps = [GenMACCSKeys(mol) for mol in self.molecules]
            return _dense_frame(self.fps)
        msg = "The vector argument can only be 'int' or 'bit'"
        raise ValueError(msg)

    if fp_type == 'Morgan':
        if vector == 'int':
            from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
            self.fps = [
                GetMorganFingerprint(mol, self.radius)
                for mol in self.molecules
            ]
            rows = [fp.GetNonzeroElements() for fp in self.fps]
            return _sparse_frame(rows)
        if vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
            self.fps = [
                GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.nBits)
                for mol in self.molecules
            ]
            return _dense_frame(self.fps)
        msg = "The argument vector can only be 'int' or 'bit'"
        raise ValueError(msg)

    if fp_type == 'Hashed_topological_torsion' or fp_type == 'HTT':
        if vector == 'int':
            from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprint
            self.fps = [
                GetHashedTopologicalTorsionFingerprint(m, nBits=self.nBits)
                for m in self.molecules
            ]
            rows = [fp.GetNonzeroElements() for fp in self.fps]
            return _sparse_frame(rows, columns=range(self.nBits))
        if vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
            self.fps = [
                GetHashedTopologicalTorsionFingerprintAsBitVect(
                    m, nBits=self.nBits) for m in self.molecules
            ]
            return _dense_frame(self.fps)
        msg = "The argument vector can be 'int' or 'bit'"
        raise ValueError(msg)

    if fp_type == 'Topological_torsion' or fp_type == 'TT':
        if vector == 'int':
            from rdkit.Chem.AtomPairs.Torsions import GetTopologicalTorsionFingerprintAsIntVect
            self.fps = [
                GetTopologicalTorsionFingerprintAsIntVect(mol)
                for mol in self.molecules
            ]
            rows = [fp.GetNonzeroElements() for fp in self.fps]
            return _sparse_frame(rows)
        if vector == 'bit':
            msg = "There is no RDKit function to encode bit vectors for Topological Torsion Fingerprints"
            raise ValueError(msg)
        msg = "The argument vector can only be 'int'"
        raise ValueError(msg)

    msg = "The type argument '%s' is not a valid fingerprint type" % self.FPtype
    raise ValueError(msg)
class FingerprintsTransformer(MoleculeTransformer):
    """Molecule transformer producing molecular fingerprints.

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
            'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'},
            optional, default='morgan_circular'
        Name of the fingerprinting technique used
    length : int, optional, default=2000
        Length of the fingerprint to use. Forwarded as ``nBits`` (``fpSize``
        for 'rdkit'); the 'estate', 'erg' and 'maccs' entries do not forward
        it to their fingerprint function.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        function to call to compute the fingerprint
    """

    # Each entry is called as fpfun(mol, params), params being the keyword
    # dict assembled in __init__.
    mapping = OrderedDict(
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        morgan_circular=lambda x, params: GetMorganFingerprintAsBitVect(
            x, 2, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='morgan_circular', length=2000):
        super(FingerprintsTransformer, self).__init__()

        known = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known.keys()))

        self.kind = kind
        self.length = length

        # Resolved through the instance, so a subclass-level ``mapping`` is
        # honoured; the check below covers a kind absent from such a mapping.
        self.fpfun = self.mapping.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))

        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Transform a molecule into a fingerprint vector

        Parameters
        ----------
        mol: rdkit.Chem.Mol or None
            The molecule of interest. ``None`` triggers a warning and yields
            an all-zero vector of ``self.length`` entries.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is None:
            warnings.warn("None value received for argument mol")
            return np.zeros(self.length)

        raw = self.fpfun(mol, self._params)
        if isinstance(raw, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw)
        return np.array(list(raw))

    def transform(self, mols):
        """Transform a batch of molecule into a fingerprint vectors

        Parameters
        ----------
        mols: (str or rdkit.Chem.Mol) list
            The list of smiles or molecule

        Returns
        -------
        fp : 2d np.ndarray
            The computed fingerprint vectors
        """
        batch = super(FingerprintsTransformer, self).transform(
            mols, as_numpy=True)
        return np.array(batch)