示例#1
0
    def molecule_descriptors(self):
        """
        Compute the RDKit molecular descriptors of ``self.smiles``.

        One value is calculated for every entry of ``Descriptors._descList``.

        Returns
        -------
        np.ndarray or None
            Object-dtype array holding one row with the descriptor values.
            ``None`` is returned when the SMILES cannot be parsed, when the
            calculation fails, or when the largest descriptor value exceeds
            ``10**30``.
        """
        try:
            mol = Chem.MolFromSmiles(self.smiles)
            if mol is None:
                # An unparsable SMILES yields None instead of an exception.
                return None

            calc = MoleculeDescriptors.MolecularDescriptorCalculator(
                [x[0] for x in Descriptors._descList])
            ds = list(calc.CalcDescriptors(mol))

            # Descriptor vectors with overflowing values are rejected.
            if max(ds) > 10**30:
                return None

            row = np.asarray(ds)
        except Exception:
            # Best effort: any failure is reported as "no descriptors".
            # Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit propagating.
            return None

        return np.asarray([row], dtype=object)
 def setUp(self):
     """Create a two-descriptor calculator and the reference data set."""
     descriptor_names = ['MolLogP', 'Chi1v']
     self.descs = descriptor_names
     self.vers = ('1.1.0', '1.0.0')
     self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(
         descriptor_names)
     # (SMILES, value pair) entries; presumably the expected
     # (MolLogP, Chi1v) results -- confirm against the tests using them.
     self.testD = [
         ('CCOC', (0.6527, 1.40403)),
         ('CC=O', (0.2052, 0.81305)),
         ('CCC(=O)O', (0.481, 1.48839)),
     ]
示例#3
0
  def predict(self, model_number):
    """Predict the activity of every entry in ``self.sd_entries``.

    The model at index ``model_number`` of ``self.model`` is applied to the
    RDKit descriptors of each entry.  The predicted class is stored on the
    entry as the property ``TL_prediction``.

    If ``model_number`` is not smaller than ``len(self.model)`` a message is
    written to stderr and the process exits with status -1.

    Returns a tuple ``(active, inactive)`` counting the entries predicted as
    class 1 and class 0 respectively.
    """
    if len(self.model) <= model_number:
      sys.stderr.write("\nModel-Number %d doesn't exist, there are just %d Models\n" %
                       (model_number, len(self.model)))
      sys.exit(-1)

    descriptors = [D[0] for D in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
    clf_RF = self.model[model_number]

    active, inactive = 0, 0
    for sample in self.sd_entries:
      try:
        pattern = calculator.CalcDescriptors(sample)
      except Exception:
        # The original "except e:" raised NameError as soon as an error
        # occurred; entries that cannot be processed are skipped instead.
        sys.stderr.write("Error computing descriptors for %s, skip" % sample)
        continue

      # A single sample is passed as one row of a 2-D array
      # (assumes a scikit-learn style estimator -- TODO confirm).
      dataDescrs_array = np.asarray([pattern])
      y_predict = int(clf_RF.predict(dataDescrs_array)[0])
      if y_predict == 0:
        inactive += 1
      if y_predict == 1:
        active += 1
      sample.SetProp("TL_prediction", str(y_predict))
    return (active, inactive)
def calc_rdkit(molecules, name_col='CASRN'):
    """
    Build a descriptor table for a list of rdkit molecules.

    :param molecules: List of rdkit molecule objects; must not contain None
    :param name_col: Molecule property used as row label.  Molecules without that property are labelled ''

    :return: pandas DataFrame with one row per molecule and one column per descriptor.  NaN entries are replaced by
             the column mean and every column containing an infinite value is removed
    """

    # Input validation
    assert None not in molecules, 'The list of molecules entered contains None values.'

    # One calculator covering every descriptor listed by rdkit
    descriptor_names = [desc[0] for desc in Descriptors.descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

    # Raw descriptor table
    rows = [list(calculator.CalcDescriptors(mol)) for mol in molecules]
    labels = [mol.GetProp(name_col) if mol.HasProp(name_col) else '' for mol in molecules]
    X = pd.DataFrame(rows, index=labels, columns=list(calculator.GetDescriptorNames()))

    # Mean imputation of missing values
    imputed = X.fillna(X.mean())

    # Drop every descriptor that still holds +/- infinity
    has_infinity = imputed.isin([inf, -inf]).any(axis=0)
    desc_matrix = imputed.loc[:, ~has_infinity]

    # Output validation
    assert len(desc_matrix.columns) != 0, 'All features contained at least one null value. No descriptor matrix ' \
                                          'could be generated.'

    return desc_matrix
示例#5
0
def _RDKit_descriptors(ifile, **kwargs) -> (bool, (np.ndarray, list, list)):
    '''
    computes RDKit descriptors for the molecules of the SD file provided as
    argument (extra keyword arguments are accepted but not used)

    output is a tuple (success, payload):
      - on success, payload is a dictionary with the keys
        'matrix' (descriptor values of the molecules processed successfully),
        'names' (descriptor names) and
        'success_arr' (one boolean per molecule read from the file)
      - on failure, payload is an error message
    '''
    try:
        suppl = Chem.SDMolSupplier(ifile)
    except Exception as e:
        LOG.error(f'Unable to create supplier with exception {e}')
        return False, 'Unable to compute RDKit MD'

    LOG.info('Computing RDKit descriptors...')
    # names of all descriptors registered in RDKit
    nms = [x[0] for x in Descriptors._descList]

    md = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
    success_list = []
    xmatrix = []
    num_obj = 0  # number of molecules processed successfully so far

    try:
        for position, mol in enumerate(suppl, start=1):
            if mol is None:
                LOG.error(f'Unable to process molecule #{position} in {ifile}')
                success_list.append(False)
                continue

            descriptors = md.CalcDescriptors(mol)
            if np.isnan(descriptors).any():
                # molecules with any NaN descriptor are left out of the matrix
                success_list.append(False)
                continue

            if num_obj == 0:
                # NOTE: with a single valid molecule the matrix stays this
                # 1-D vector; it only becomes 2-D from the second valid
                # molecule on (behaviour kept from the original code)
                xmatrix = descriptors
                LOG.debug(
                    f'first descriptor vector computet with shape {np.shape(xmatrix)}'
                )
            else:
                xmatrix = np.vstack((xmatrix, descriptors))

            success_list.append(True)
            num_obj += 1

    except Exception as e:  # any failure aborts the whole file
        LOG.error(f'Failed computing RDKit descriptors with exception {e}')
        return False, 'Failed computing RDKit descriptors for molecule ' + str(
            num_obj + 1) + ' in file ' + ifile

    LOG.debug(
        f'computed RDKit descriptors matrix with shape {np.shape(xmatrix)}')
    if num_obj == 0:
        return False, 'Unable to compute RDKit properties for molecule ' + ifile

    results = {'matrix': xmatrix, 'names': nms, 'success_arr': success_list}

    return True, results
示例#6
0
    def _get_descriptors(self, smiles_list):
        '''Calculates the descriptors of every molecule in smiles_list.
           It uses the rdkit package.

           The descriptor names are read from the 'descriptor_names' entry
           of the object returned by load('meta').

           Returns a DataFrame with a leading 'smiles' column followed by
           one column per descriptor, containing only the molecules that
           could be processed. Returns None (after printing a message)
           when no molecule could be processed.
        '''

        meta = load('meta')
        descriptor_names = meta['descriptor_names'].values
        desc_object = MoleculeDescriptors.MolecularDescriptorCalculator(
            descriptor_names)

        invalid_molecules = []
        ids, descriptors = [], []
        for smiles in smiles_list:
            try:
                mol = Chem.MolFromSmiles(smiles)
                # An unparsable SMILES gives None rather than an exception;
                # it has to be reported as invalid too.
                desc_list = (None if mol is None else
                             list(desc_object.CalcDescriptors(mol)))
            except Exception:
                desc_list = None

            if desc_list is None:
                invalid_molecules.append(smiles)
                continue

            descriptors.append(desc_list)
            ids.append(smiles)

        if not descriptors:
            print('No molecules could be processed.')
            return None

        if invalid_molecules:
            print('Some molecules could not be processed.')
            print(invalid_molecules)

        df = pd.DataFrame(descriptors, columns=descriptor_names)
        df.insert(0, 'smiles', ids)
        return df
示例#7
0
    def __init__(self,
                 n_jobs=-1,
                 *,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any'):
        """
        Featurizer computing all descriptors in RDKit (length = 200) [may include NaN]
            see https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors for the full list

        Parameters
        ----------
        n_jobs: int
            Number of parallel jobs used for both fit and predict. -1 (default) uses all cpu cores.
        input_type: string
            Kind of input accepted by ``transform``.
            ``mol`` (default): ``rdkit.Chem.rdchem.Mol`` objects.
            ``smiles``: a list of SMILES, converted internally with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
            ``any``: both kinds are accepted.
        on_errors: string
            Handling of exceptions raised during feature calculation: 'nan', 'keep' or 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) re-raises the exception.
        return_type: string
            Passed unchanged to the parent constructor.
        """
        super().__init__(n_jobs=n_jobs,
                         on_errors=on_errors,
                         return_type=return_type)
        self.input_type = input_type
        descriptor_names = [entry[0] for entry in Descriptors._descList]
        self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            descriptor_names)
示例#8
0
    def __init__(self,
                 n_jobs=-1,
                 *,
                 input_type='mol',
                 on_errors='raise',
                 return_type='any',
                 target_col=None,
                 desc_list='all',
                 add_Hs=False):
        """
        Featurizer computing RDKit descriptors (length = 200 for the classic list) [may include NaN]
            see https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors for the full list

        Parameters
        ----------
        n_jobs: int
            Number of parallel jobs used for both fit and predict.
            Can be -1 or the number of cpus. -1 (default) uses all cpu cores.
        input_type: string
            Kind of input accepted by ``transform``.
            ``mol`` (default): ``rdkit.Chem.rdchem.Mol`` objects.
            ``smiles``: a list of SMILES, converted internally with ``Chem.MolFromSmiles``;
            a ``ValueError`` is raised when the conversion returns ``None``.
            ``any``: both kinds are accepted.
        on_errors: string
            Handling of exceptions raised during feature calculation: 'nan', 'keep' or 'raise'.
            'nan' returns a column of ``np.nan`` whose length equals the number of feature labels.
            'keep' returns a column holding the exception objects.
            'raise' (default) re-raises the exception.
        return_type: string
            Passed unchanged to the parent constructor.
        target_col
            Only relevant when the input is a pd.DataFrame, otherwise ignored.
            Single column to be used for the transformation.
            If ``None`` (default), all columns of the pd.DataFrame are used.
        desc_list: string or list
            Descriptor names handed to rdkit.
            ``all`` (default): every descriptor available in the installed rdkit
            (length = 208 in rdkit v.2020.09.xx).
            ``classic``: the list stored in ``self.classic`` (full list of rdkit v.2020.03.xx, length = 200).
            Any other value is used directly as the list of names.
        add_Hs: boolean
            Add hydrogen atoms to the mol format in RDKit or not.
            This may affect a few physical descriptors (e.g., charge related ones).
        """
        super().__init__(n_jobs=n_jobs,
                         on_errors=on_errors,
                         return_type=return_type,
                         target_col=target_col)
        self.input_type = input_type
        self.add_Hs = add_Hs

        # Resolve the requested descriptor names.
        if desc_list == 'all':
            names = [entry[0] for entry in ChemDesc._descList]
        elif desc_list == 'classic':
            names = self.classic
        else:
            names = desc_list
        self.nms = names

        self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(self.nms)
        self.__authors__ = ['Stephen Wu', 'TsumiNa']
示例#9
0
 def get_moldescs(self, rdmols):
     """Return the RDKit descriptor values of ``rdmols`` as an array.

     One row is produced per molecule, one column per entry of
     ``Descriptors._descList``.
     """
     calc = MoleculeDescriptors.MolecularDescriptorCalculator(
         [entry[0] for entry in Descriptors._descList])
     rows = []
     for mol in rdmols:
         rows.append(calc.CalcDescriptors(mol))
     return pd.DataFrame(rows).values
示例#10
0
 def __init__(self, auto_correct=True, dict_mode=True):
     """Set up a calculator for every descriptor in ``Descriptors.descList``.

     ``self.desc_list`` holds the descriptor names prefixed with
     ``RDKit_desc_``; ``auto_correct`` and ``dict_mode`` are stored as given.
     """
     raw_names = [entry[0] for entry in Descriptors.descList]
     self.calculator = MoleculeDescriptors.MolecularDescriptorCalculator(
         raw_names)
     self.desc_list = ["RDKit_desc_" + raw_name for raw_name in raw_names]
     self.auto_correct = auto_correct
     self.dict_mode = dict_mode
示例#11
0
 def predict(self, path_to_model):
     """Predict a value for ``self.smiles`` with a pickled model.

     All descriptors of ``Descriptors._descList`` except ``ExactMolWt`` are
     computed and handed to the model loaded from ``path_to_model``.  The
     loaded model is stored in ``self.model`` and the raw prediction in
     ``self.predicted``.

     Returns the first element of the prediction.

     Raises ``ValueError`` when ``self.smiles`` cannot be parsed.
     """
     m = Chem.MolFromSmiles(str(self.smiles))
     if m is None:
         # Without this check the unparsable molecule would be passed on to
         # the descriptor calculator and the model.
         raise ValueError("Cannot parse SMILES: %r" % (self.smiles,))

     descr_names = [i[0] for i in Descriptors._descList if i[0] != 'ExactMolWt']
     calc = MoleculeDescriptors.MolecularDescriptorCalculator(descr_names)
     descr = list(calc.CalcDescriptors(m))

     # SECURITY: unpickling executes arbitrary code; path_to_model must
     # point to a trusted file.
     with open(path_to_model, 'rb') as f:
         self.model = pickle.load(f)

     # NOTE(review): the descriptors are passed as a flat list; confirm the
     # model accepts a single sample in this form.
     self.predicted = self.model.predict(descr)
     return self.predicted[0]
示例#12
0
    def test_github3511(self):
        """A molecule must stay picklable after its descriptors were computed."""
        methane = Chem.MolFromSmiles('C')
        names = [desc_name for desc_name, _func in Chem.Descriptors.descList]
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(names)
        calc.CalcDescriptors(methane)

        # Pickling must succeed without raising.
        pickle.dumps(methane)
示例#13
0
 def step_6_calc_descriptors(self):
   """Attach every descriptor of Descriptors._descList to each compound.

   Each value is stored as a string property, named after its descriptor,
   on the entries of ``self.sd_entries``.  Always returns True.
   """
   nms = [entry[0] for entry in Descriptors._descList]
   calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
   for compound in self.sd_entries:
     values = calc.CalcDescriptors(compound)
     for name, value in zip(nms, values):
       compound.SetProp(str(name), str(value))
   return True
示例#14
0
def calculate_descriptors(mols, names=None, ipc_avg=False):
    """Return a DataFrame of RDKit descriptors, one row per molecule.

    ``names`` selects the descriptors (default: every entry of
    ``Descriptors._descList``).  When ``ipc_avg`` is true and ``'Ipc'`` is
    among the names, that column is recomputed with
    ``Descriptors.Ipc(mol, avg=True)``.
    """
    if names is None:
        names = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(names)

    rows = []
    for mol in mols:
        rows.append(calc.CalcDescriptors(mol))
    table = pd.DataFrame(rows, columns=names)

    if 'Ipc' in names and ipc_avg:
        table['Ipc'] = [Descriptors.Ipc(mol, avg=True) for mol in mols]
    return table
示例#15
0
文件: ChemX.py 项目: zinph/ChemX
    def __init__(self, target_compound, name):
        """Store the target compound and prepare the working state.

        ``fragment_database()`` is run, a calculator for every descriptor of
        ``Descriptors._descList`` is created, and the file ``data/<name>`` is
        opened in ``a+`` mode and kept in ``self.chembank``.
        """
        self.directory = 'G:/My Drive/NCSU/DiamondHacks/ChemX/'
        self.target = target_compound

        self.fragment_database()

        descriptor_names = [entry[0] for entry in Descriptors._descList]
        self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            descriptor_names)

        # The handle stays open for the lifetime of the object.
        self.chembank = open('data/' + name, 'a+')
        self.templates = []
示例#16
0
def descriptors(df):
    """Compute the RDKit descriptors for the SMILES column of ``df``.

    Works on a copy of ``df`` that is re-indexed by its ``CID`` column.
    Returns a DataFrame indexed by ``CID`` with one column per entry of
    ``Descriptors.descList``.
    """
    dfrandom = df.copy()
    dfrandom.index = dfrandom.CID
    # Each SMILES is parsed exactly once (the original parsed it twice).
    dfrandom['rdkit'] = [mol if mol else None
                         for mol in map(Chem.MolFromSmiles, dfrandom.SMILES)]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Descriptors.descList])
    X = pd.DataFrame([list(calc.CalcDescriptors(mol)) for mol in dfrandom.rdkit],
                     columns=list(calc.GetDescriptorNames()),
                     index=dfrandom.index)
    return X
示例#17
0
    def get_descs(self, smi, return_desc=False):
        """Return the descriptors ``self.descs`` of the SMILES ``smi`` as a list.

        Infinite and NaN values are replaced by 0.  ``return_desc`` is
        accepted but currently unused.
        """
        names = self.descs
        mol = Chem.MolFromSmiles(smi)
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(names)
        return [
            0 if (np.isinf(value) or math.isnan(value)) else value
            for value in calc.CalcDescriptors(mol)
        ]
示例#18
0
 def compute_2Drdkit(self, name):
     """Write the RDKit descriptors of ``self.mols`` to a CSV file.

     The table starts with a ``smiles`` column taken from ``self.smiles``.
     The output path is ``name`` without its last four characters plus
     ``_RDKit_2D.csv``.
     """
     calc = MoleculeDescriptors.MolecularDescriptorCalculator(
         [entry[0] for entry in Descriptors._descList])
     header = calc.GetDescriptorNames()
     rows = [calc.CalcDescriptors(mol) for mol in self.mols]

     table = pd.DataFrame(rows, columns=header)
     table.insert(loc=0, column='smiles', value=self.smiles)

     out_path = name[:-4] + '_RDKit_2D.csv'
     table.to_csv(out_path, index=False)
示例#19
0
def _calculate_pc_descriptors(mol):
    """Return the ``DESCRIPTORS`` values of ``mol`` as a numpy array.

    Non-finite entries are reported on stdout and replaced by 0.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(DESCRIPTORS)

    values = np.array(calc.CalcDescriptors(mol))
    finite = np.isfinite(values)
    if not finite.all():
        print(f'{mol} contains an NAN physchem descriptor')
        values[~finite] = 0

    return values
示例#20
0
 def load_rdkit(self):
     """Compute the RDKit descriptors for the SMILES column of ``self.ds``.

     Works on a copy of ``self.ds``.  Returns a DataFrame sharing the index
     of ``self.ds`` with one column per entry of ``Descriptors.descList``.
     """
     ds = self.ds.copy()
     # Each SMILES is parsed exactly once (the original parsed it twice).
     ds['rdkit'] = [
         mol if mol else None
         for mol in map(Chem.MolFromSmiles, ds.SMILES)
     ]
     calc = MoleculeDescriptors.MolecularDescriptorCalculator(
         [desc[0] for desc in Descriptors.descList])
     X = pd.DataFrame([list(calc.CalcDescriptors(mol)) for mol in ds.rdkit],
                      columns=list(calc.GetDescriptorNames()),
                      index=ds.index)
     return X
示例#21
0
    def setup(self):
        """ prepares classifier names, a PubChem data set and its descriptor table """

        self.clfs = [entry[0] for entry in SKLearnModels.CLASSIFIERS]
        self.ds = PubChemDataSet(1224861).load()

        molecules = [Chem.MolFromSmiles(smi) for smi in self.ds.SMILES]
        descriptor_names = [desc[0] for desc in Descriptors.descList]
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            descriptor_names)

        rows = [list(calc.CalcDescriptors(mol)) for mol in molecules]
        self.X = pd.DataFrame(rows,
                              columns=list(calc.GetDescriptorNames()),
                              index=self.ds.index)
示例#22
0
def get_fps(mol):
    """Return the EState fingerprint of ``mol`` followed by its RDKit descriptors.

    Both parts are joined into a single flat numpy array.
    """
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    descriptor_values = np.asarray(calc.CalcDescriptors(mol))

    # EState fingerprints: only the first element of the result is used
    estate = Fingerprinter.FingerprintMol(mol)[0]

    return np.append(estate, descriptor_values)
def calculate_descriptors(df, molecule_column='mol'):
    """
    Uses RDKit to compute various descriptors for compounds in the given data frame. Expects
    compounds to be represented by RDKit Mol objects in the column given by molecule_column.
    The descriptor columns are added to the input data frame in place, and the same data
    frame is returned.
    """

    descriptors = [x[0] for x in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
    for i in df.index:
        cd = calculator.CalcDescriptors(df.at[i, molecule_column])
        for desc, d in zip(descriptors, cd):
            df.at[i, desc] = d

    # The original fell off the end and returned None despite its docstring.
    return df
示例#24
0
def _calculate_pc_descriptors(smiles: str, pc_descriptors: List[str]) -> np.array:
    """Return the descriptors ``pc_descriptors`` of ``smiles`` as a numpy array.

    ``None`` is returned when the SMILES cannot be parsed.  Non-finite
    entries are logged as a warning and replaced by 0.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(pc_descriptors)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    values = np.array(calc.CalcDescriptors(mol))
    finite = np.isfinite(values)
    if not finite.all():
        logger.warning(f'{smiles} contains an NAN physchem descriptor')
        values[~finite] = 0

    return values
示例#25
0
def generate_molecular_descriptors(molecules):
    """
    Computes the RDKit descriptors of every molecule

    :param molecules: list of RDKit molecules
    :return: numpy array with one row of descriptor values per molecule; NaN entries are set to 0
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList])
    rows = []
    for mol in molecules:
        rows.append(calc.CalcDescriptors(mol))
    matrix = np.asarray(rows)
    nan_mask = np.isnan(matrix)
    matrix[nan_mask] = 0
    return matrix
示例#26
0
def calc_builtin_props(m):
    """Attaches FractionCSP3, MolWt and RingCount to a molecule

    Each value is stored as a string property named after its descriptor.

    @param m: molecule for which to perform calculations
    @return: the same molecule with the properties attached
    """

    nms = ('FractionCSP3', 'MolWt', 'RingCount')
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)

    for name, value in zip(nms, calc.CalcDescriptors(m)):
        m.SetProp(str(name), str(value))

    return m
示例#27
0
def generate_descriptors(data, descriptor_list):
    """
    Add one column per descriptor in ``descriptor_list`` to ``data``.

    The descriptors are computed from the ``SMILES`` column of ``data``;
    ``data`` is modified in place and returned.
    """

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_list)
    rows = []
    for sm in data.SMILES:
        mol = Chem.MolFromSmiles(sm)
        rows.append(list(calc.CalcDescriptors(mol)))
    feature_matrix = np.asarray(rows)

    for column, descriptor in enumerate(descriptor_list):
        data[descriptor] = feature_matrix[:, column]
    return data
示例#28
0
def calc_208descriptors(smiles):
    """Return a DataFrame of RDKit descriptors, one row per SMILES.

    Every non-empty name of ``Descriptors._descList`` becomes a column.
    A progress line is printed after each 1000 processed SMILES.
    """
    desc_names = [entry[0] for entry in Descriptors._descList if entry[0]]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)

    matrix = []
    for count, smile in enumerate(smiles, start=1):
        mol = Chem.MolFromSmiles(smile)
        matrix.append(list(calc.CalcDescriptors(mol)))
        if count % 1000 == 0:
            print("{} smiles processed in calc_208descriptors...".format(
                count))

    return pd.DataFrame(matrix, columns=desc_names)
示例#29
0
def descriptor_ad(smiles, enzyme):
    """Check whether a compound lies inside the descriptor ranges of a data set.

    ``smiles`` is the SMILES of the compound to be tested and ``enzyme`` any
    of these strings: "catb", "catl", "cats", "mpro".  For each descriptor
    the value must not be below element 0 nor above element 1 of
    ``descs_df.loc[enzyme, descriptor]``.

    Returns True when no descriptor falls outside its range.
    """
    mol = Chem.MolFromSmiles(smiles)
    descs = ["MolLogP", "MolWt", "NumHAcceptors",
             "NumHDonors", "NumRotatableBonds", "TPSA"]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descs)
    mol_descs_dict = dict(zip(descs, calc.CalcDescriptors(mol)))

    # Count the descriptors lying outside the range of the data set
    fail = 0
    for desc in descs:
        value = mol_descs_dict[desc]
        if value < descs_df.loc[enzyme, desc][0] or value > descs_df.loc[enzyme, desc][1]:
            fail += 1

    return fail < 1
示例#30
0
def getDescDiff(outMols):
    """Count the differing RDKit descriptors for every pair of molecules.

    The pairs are visited in the order (0, 1), (0, 2), ..., (1, 2), ...
    Every differing descriptor is reported on stdout with its name and
    both values.

    Returns a list with the number of differing descriptors per pair.
    """
    nms = [x[0] for x in Descriptors._descList]
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)

    # Descriptors are computed once per molecule; the original recomputed
    # both vectors for every pair.
    allDescrs = [calc.CalcDescriptors(mol) for mol in outMols]

    descDiffList = []
    for outIdx in range(len(allDescrs)):
        descrs1 = allDescrs[outIdx]
        for inIdx in range(outIdx + 1, len(allDescrs)):
            descrs2 = allDescrs[inIdx]
            numDiff = 0
            for name, val1, val2 in zip(nms, descrs1, descrs2):
                if val1 != val2:
                    # Single-argument print() gives the same output under
                    # Python 2 and Python 3.
                    print("Differing descriptor  %s" % name)
                    print("%s %s" % (val1, val2))
                    numDiff += 1
            descDiffList.append(numDiff)
    return descDiffList