Exemplo n.º 1
0
def truncated_Estate_featurizer(mol_list, return_names=False):
    """Build truncated E-state features for a collection of molecules.

    For every molecule the first element of ``FingerprintMol(mol)`` is
    computed and only the entries with index 6..36 (31 values) are kept.

    Parameters
    ----------
    mol_list : iterable
        Molecules accepted by ``FingerprintMol``.
    return_names : bool, optional
        When truthy, the labels of the 31 retained columns are returned
        together with the feature matrix. (Default value = False)

    Returns
    -------
    X : np.ndarray
        One row per molecule, 31 columns. An empty ``mol_list`` yields an
        array of shape ``(0, 31)`` so the column count stays consistent.
    (Estate_names, X) : tuple
        Returned instead of ``X`` alone when ``return_names`` is truthy.
    """
    first, last = 6, 37  # slice of the fingerprint that is retained

    X = np.array([FingerprintMol(mol)[0][first:last] for mol in mol_list])
    if X.size == 0:
        # np.array([]) is 1-D; keep the 2-D (n_molecules, n_features) layout.
        X = X.reshape(0, last - first)

    if not return_names:
        return X

    # NOTE(review): '=0' and '-0-' are spelled with the digit zero and '=c='
    # is lower case. The labels are left untouched because callers may match
    # on them -- confirm whether 'O' / 'C' was intended.
    Estate_names = ['-CH3', '=CH2', '—CH2—', '\\#CH', '=CH-', 'aCHa', '>CH-', '=c=', '\\#C-', '=C$<$', 'aCa',
                    'aaCa', '$>$C$<$', '-NH3[+1]', '-NH2', '-NH2-[+1]', '=NH', '-NH-', 'aNHa', '\\#N', '$>$NH-[+1]',
                    '=N—', 'aNa', '$>$N—', '—N$<$$<$', 'aaNs', '$>$N$<$[+1]', '-OH', '=0', '-0-', 'aOa']
    return Estate_names, X
Exemplo n.º 2
0
# Walk over the SMILES strings of the data set (the commented-out prints can
# be re-enabled to verify that special SMILES characters were read correctly)
# and compute three fingerprints per molecule: Morgan, Estate and RDKit.
# NOTE(review): Morgan_fingerprint, Estate_fingerprint and RDKit_fingerprint
# are appended to here, so they must already exist as lists -- they are
# created outside this excerpt.
for Gap_opt_RF_SMILE in RF_on_gap_data1['SMILE']:
    #print (Gap_opt_RF_SMILE)
    # build the molecule object from its SMILES string
    mol = Chem.MolFromSmiles(Gap_opt_RF_SMILE)

    # Morgan fingerprint as a bit vector (radius argument 2, nBits=1024)
    #print(AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024).ToBitString())
    Morgan_fingerprint.append(
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))

    # Estate fingerprint: only the first element returned by FingerprintMol
    # is stored
    #print(FingerprintMol(mol)[0])
    Estate_fingerprint.append(FingerprintMol(mol)[0])

    # RDKit fingerprint with fpSize=1024
    RDKit_fingerprint.append(RDKFingerprint(mol, fpSize=1024))

# Fit a random-forest regressor on the Morgan fingerprints against
# Experimental_Gap. GridSearchCV (cv=8) scans n_estimators over 25 evenly
# spaced values between 50 and 300 (cast to int) and scores each candidate
# with the negative mean absolute error.
# use sorted(sklearn.metrics.SCORERS.keys()) to find what are available in sklearn lib
RF_on_gap_Morgan = GridSearchCV(
    RandomForestRegressor(),
    cv=8,
    param_grid={"n_estimators": np.linspace(50, 300, 25).astype('int')},
    scoring='neg_mean_absolute_error',
    n_jobs=-1)
RF_on_gap_Morgan.fit(Morgan_fingerprint, Experimental_Gap)

# Keep the best estimator found by the grid search.
Best_RF_on_gap_Morgan = RF_on_gap_Morgan.best_estimator_
Exemplo n.º 3
0
def fp_Estate_and_mw(mol):
    """Return the truncated E-state slice of ``mol`` with its weight appended.

    The slice is entries 6..36 of the first element returned by
    ``FingerprintMol``; ``Descriptors.MolWt(mol)`` is added as the final
    value of the resulting array.
    """
    estate_slice = FingerprintMol(mol)[0][6:37]
    mol_weight = Descriptors.MolWt(mol)
    return np.append(estate_slice, mol_weight)
Exemplo n.º 4
0
def fp_Estate_reals(mol):
    """Return entries 6..36 of the second element of ``FingerprintMol(mol)``."""
    second_component = FingerprintMol(mol)[1]
    return second_component[6:37]
Exemplo n.º 5
0
def fp_Estate_ints(mol):
    """Return entries 6..36 of the first element of ``FingerprintMol(mol)``."""
    first_component = FingerprintMol(mol)[0]
    return first_component[6:37]
Exemplo n.º 6
0
def truncated_Estate_fp(mol_list):
    """Stack the truncated E-state slice of every molecule into one array.

    Each row is entries 6..36 of the first element of ``FingerprintMol(mol)``
    for the corresponding molecule in ``mol_list``.
    """
    rows = []
    for mol in mol_list:
        rows.append(FingerprintMol(mol)[0][6:37])
    return np.array(rows)
Exemplo n.º 7
0
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded to the
            fingerprint function as ``fpSize`` for 'rdkit' and as ``nBits``
            otherwise; the 'estate', 'erg' and 'maccs' entries ignore it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint
    """
    # Every entry takes (mol, params); entries that accept no size argument
    # simply ignore ``params``.
    MAPPING = OrderedDict(
        # disabled entries:
        # global_properties=lambda x, params: augmented_mol_properties(x),
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        r"""
        Select the fingerprint function and its size parameter.
        :raises ValueError: when `kind` is not a string naming an entry of MAPPING

        Arguments
        ----------
            kind: str, optional
                Key of MAPPING selecting the fingerprint
                (Default value = 'ecfp2')
            length: int, optional
                Requested fingerprint length
                (Default value = 4096)
        """
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING)):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up on the instance so that a subclass overriding MAPPING is
        # honoured; the check below guards a subclass that dropped `kind`.
        self.fpfun = self.MAPPING.get(kind)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint names its size argument differently from the others.
        self._params = {('fpSize' if kind == 'rdkit' else 'nBits'): length}

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: sequence
                The computed fingerprint: the output of
                `explicit_bit_vect_to_array` for bit-vector fingerprints,
                a plain list otherwise

        """

        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            return explicit_bit_vect_to_array(fp)
        return list(fp)

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (not used by this
                implementation)

        Returns
        -------
            fp: tensor
                computed fingerprints (float32, converted with `totensor`)
                of size NxD, where D is the length of the fingerprint and
                N is the number of input molecules that have been
                successfully converted to a molecule.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`

        """
        converted = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop entries that could not be converted. An explicit identity test
        # is used instead of ``filter(None.__ne__, ...)``: ``None.__ne__``
        # returns NotImplemented for non-None values, and relying on the truth
        # value of NotImplemented is deprecated (an error on recent Pythons).
        mol_list = [mol for mol in converted if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)

        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.
        :raises TypeError: when `dtype` is neither a numpy nor a torch data type

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable.
                A numpy array is returned when `is_dtype_numpy_array(dtype)`
                holds, a tensor when `is_dtype_torch_tensor(dtype)` holds.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent `__call__`

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the number
                of input molecules that have been successfully featurized.
            ids: array
                all valid molecule positions that did not fail during featurization

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`

        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
Exemplo n.º 8
0
def estate_fingerprint(mol):
    """Return the first element of the ``FingerprintMol`` result for ``mol``."""
    first_component = FingerprintMol(mol)[0]
    return first_component
Exemplo n.º 9
0
# Build the molecule objects from `cno`.
mymols = make_molecules(cno)

# Sum-over-bonds descriptor: computed for all molecules and written to disk
# as a space-delimited text file.
bond_types, bonds_in_molecule = sum_over_bonds(mymols)
np.savetxt("sum_over_bonds.out", bonds_in_molecule, delimiter=" ")

#*********** Generate Estate indices************************
#
# Note that there are 79 possible Estate descriptors, however only a subset
# is non-zero for the Huang-Massa/Mathieu dataset, so the all-zero columns
# are removed afterwards with scrub_null_columns().
# NOTE(review): the rows are driven by `smi` while `mymols` above is built
# from `cno` -- presumably both describe the same molecules in the same
# order; confirm against the code that defines them.
num_smiles = len(smi)
icount = 0
estate_fingers = np.zeros((num_smiles, 79))  # one row per SMILES, 79 descriptor columns
while icount < num_smiles:
    m = Chem.MolFromSmiles(smi[icount])
    # FingerprintMol yields two values; only `counts` is stored below.
    counts, sums = FingerprintMol(m)
    estate_fingers[icount, :] = np.transpose(
        counts)  # `sums` could be used as the descriptor instead
    icount += 1
nz_estate = scrub_null_columns(estate_fingers)
np.savetxt("nz_estate.out", nz_estate, delimiter=" ")
#
#
#**********Done with Estate Generation**************************

# Make Morgan fingerprints using Dan's code; the `.x` attribute of the third
# returned item is converted to an array and written to disk.
dan_prints = make_fingerprints(mymols)
morgan_prints = np.asarray(dan_prints[2].x)
np.savetxt("morgan_prints.out", morgan_prints, delimiter=" ")
Exemplo n.º 10
0
class FingerprintsTransformer(MoleculeTransformer):
    """Molecule transformer producing molecular fingerprints.

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
        'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}, optional, default='morgan_circular'
        Name of the fingerprinting technique used
    length : int, optional, default=2000
        Length of the fingerprint to use; passed to the fingerprint function
        as ``fpSize`` for 'rdkit' and as ``nBits`` otherwise

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        function to call to compute the fingerprint
    """
    # Each entry is called as fpfun(mol, params); entries without a size
    # argument ignore ``params``.
    mapping = OrderedDict(
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        morgan_circular=lambda x, params: GetMorganFingerprintAsBitVect(
            x, 2, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='morgan_circular', length=2000):
        """Validate ``kind`` and store the fingerprint function and size.

        Raises
        ------
        ValueError
            If ``kind`` is not a string naming an entry of ``mapping``.
        """
        super(FingerprintsTransformer, self).__init__()
        known = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known.keys()))
        self.kind = kind
        self.length = length
        self.fpfun = self.mapping.get(kind)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint names its size argument differently from the others.
        size_key = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_key: length}

    def _transform(self, mol):
        """Compute the fingerprint vector of a single molecule.

        Parameters
        ----------
        mol : rdkit.Chem.Mol or None
            The molecule of interest; it is handed to the fingerprint
            function unchanged. ``None`` triggers a warning and produces a
            zero vector of ``self.length`` entries.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is None:
            warnings.warn("None value received for argument mol")
            # NOTE(review): the placeholder always has self.length entries;
            # kinds that ignore the size parameter may produce a different
            # length for real molecules -- confirm this is acceptable.
            fp = np.zeros(self.length)
        else:
            fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            return explicit_bit_vect_to_array(fp)
        return np.array(list(fp))

    def transform(self, mols):
        """Compute the fingerprint vectors of a batch of molecules.

        Parameters
        ----------
        mols : (str or rdkit.Chem.Mol) list
            The list of smiles or molecules

        Returns
        -------
        fp : np.ndarray
            The parent's ``transform(mols, as_numpy=True)`` output wrapped
            in ``np.array``
        """
        batch = super(FingerprintsTransformer, self).transform(mols,
                                                               as_numpy=True)
        return np.array(batch)