示例#1
0
def use_mordred(mols, descs=None):
    """Compute Mordred descriptors (3D ignored) as a float64 DataFrame.

    Parameters
    ----------
    mols : iterable
        Molecules handed straight to ``Calculator.pandas``.
    descs : container of str, optional
        When given, only descriptors whose ``str()`` name is ``in descs``
        are kept on the calculator; otherwise every descriptor is used.

    Returns
    -------
    pandas.DataFrame
        Result of ``fill_missing()`` with every column that still contains
        a missing value dropped, cast to ``np.float64``.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    if descs is not None:
        # Restrict the calculator to the requested descriptor names.
        calc.descriptors = [d for d in calc.descriptors if str(d) in descs]
    filled = calc.pandas(mols, quiet=True).fill_missing()
    return pd.DataFrame(filled.dropna(axis='columns'), dtype=np.float64)
示例#2
0
def calculation(mol_list, smiles_list, index_list, descriptor_type):
    """Compute Mordred descriptors and return them as a float DataFrame.

    Parameters
    ----------
    mol_list : list
        Molecules handed to ``Calculator.pandas``.
    smiles_list : list
        Values stored in the ``SMILES`` column of the result.
    index_list : list
        Values used as the index of the returned frame.
    descriptor_type : str
        ``'2D'`` builds the calculator with ``ignore_3D=True``; ``'3D'``
        builds it with the calculator's default settings.

    Returns
    -------
    pandas.DataFrame
        Descriptor values as floats (cells whose text contains a letter
        become NaN), plus a ``SMILES`` column, indexed by ``index_list``.

    Raises
    ------
    ValueError
        If ``descriptor_type`` is neither ``'2D'`` nor ``'3D'``.
    """
    if descriptor_type == '2D':
        calc = Calculator(descriptors, ignore_3D=True)
    elif descriptor_type == '3D':
        calc = Calculator(descriptors)
    else:
        # Previously this fell through and failed later with an unbound
        # local ``calc``; fail early with a message the caller can act on.
        raise ValueError(
            "descriptor_type must be '2D' or '3D', got {!r}".format(
                descriptor_type))

    df = calc.pandas(mol_list).astype(str)
    # Any cell whose string form contains a letter (error messages, 'nan',
    # 'True'/'False', ...) is blanked so the frame can be cast to float.
    # NOTE(review): values rendered in scientific notation (e.g. '1e-05')
    # also contain a letter and are blanked too - confirm this is intended.
    masks = df.apply(lambda d: d.str.contains('[a-zA-Z]', na=False))
    df = df[~masks].astype(float)

    # Attach the SMILES column and replace the positional index.
    df['SMILES'] = smiles_list
    df['index'] = index_list
    return df.set_index('index')
示例#3
0
def AutoCorrMordred(mol):
    """Return a vector of Mordred autocorrelation descriptors for ``mol``.

    The descriptor family is chosen by the free variable ``metric``
    ('MATS', 'ATSC', 'AATS', 'AATSC'; anything else selects 'ATS') and the
    maximum lag by the free variable ``maxBonds``; both are read from the
    surrounding scope, not passed as arguments.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to describe.

    Returns
    -------
    list
        Descriptor values for the keys ``<family><lag><property>``, ordered
        by sorted key name. Expected keys that the calculator did not
        produce are printed and left out of the vector.
    """
    from mordred import Calculator, Autocorrelation

    # The family name doubles as the prefix of the descriptor names, so the
    # keys must be built from it; a fixed 'ATS' prefix matched nothing for
    # every other family and produced an empty vector.
    if metric in ('MATS', 'ATSC', 'AATS', 'AATSC'):
        prefix = metric
    else:
        prefix = 'ATS'
    descriptor = getattr(Autocorrelation, prefix)

    calc = Calculator()
    calc.register(descriptor)
    res = calc(mol).fill_missing()

    # Z: atomic num, pe=pauling electronegativity, p=polarizability,
    # v=vdw-volume, dv=nValence, d=nsigmaelectrons
    props = ['Z', 'pe', 'p', 'v', 'd', 'dv']
    keys = [
        '{m}{d}{p}'.format(m=prefix, d=d, p=p)
        for d in range(maxBonds + 1)
        for p in props
    ]
    wanted = set(keys)
    # ``items()`` and ``print(...)`` behave the same on Python 2 and 3.
    res = {k: v for k, v in res.asdict().items() if k in wanted}
    for key in keys:
        if key not in res:
            print(key)
    vector = [value for (key, value) in sorted(res.items())]
    return vector
示例#4
0
def mordred_descriptors(mol):
    """
    Function to get chemical descriptors from Mordred

    Parameters
    ----------
    mol : object :: rdkit.Chem.rdchem.Mol, or list of them
        mol object (or list of mol objects) from rdkit

    Returns
    -------
    dict
        For a single mol: dictionary containing the chemical descriptor
        name and the chemical descriptor value.
        For a list: dictionary keyed by the row label of each molecule in
        the descriptor table, each value being such a name -> value
        dictionary.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    if isinstance(mol, list):
        # Transposed so that each molecule becomes one column -> one dict.
        return calc.pandas(mol, nproc=1).T.to_dict()
    # Single molecule: wrap it, then unwrap the only column (label 0).
    return calc.pandas([mol], nproc=1).T.to_dict()[0]
示例#5
0
def get_md(smi_path, data_path='./'):
    """Compute Mordred descriptors for every SMILES line in a file.

    Parameters
    ----------
    smi_path : str or pathlib.Path
        Text file with one SMILES string per line.
    data_path : str or pathlib.Path, optional
        Directory the CSV is written to (default: current directory).

    Returns
    -------
    pandas.DataFrame
        Descriptor columns followed by ``cindex`` (``<file stem>_<line
        number>``) and ``smiles``. The same frame is written to
        ``<data_path>/<file stem>_md.csv``.
    """
    smi_path = Path(smi_path)

    # Read-only access is enough, and the file is read a single time.
    with open(str(smi_path), 'r') as f:
        lines = f.readlines()

    smiles_df = pd.DataFrame({
        'cindex': [
            smi_path.stem + '_' + str(idx) for idx in range(len(lines))
        ],
        'smiles': [content.strip('\n') for content in lines],
    })

    mols = [Chem.MolFromSmiles(smi) for smi in smiles_df['smiles']]
    calc = Calculator(descriptors)
    md = calc.pandas(mols)
    data = pd.concat([md, smiles_df], axis=1)
    data.to_csv(str(Path(data_path) / (smi_path.stem + '_md.csv')))
    return data
示例#6
0
def sms_bandgap(sms):
    """Predict the band gap of a molecule given as a SMILES string.

    Parameters
    ----------
    sms : str
        SMILES string of the chemical to predict, e.g. ``'c1ccccc1'``.

    Returns
    -------
    pandas.DataFrame
        One row (label 0) with columns ``substance`` (the input SMILES)
        and ``bandgap`` (first value returned by the model's ``predict``).

    Raises
    ------
    TypeError
        If the SMILES string cannot be parsed.
    """
    bandgap = pd.DataFrame(columns=['substance', 'bandgap'])
    bandgap.loc[0, 'substance'] = sms
    freeze_support()

    # transform smiles string to molecular structure (parsed once, reused)
    mol = Chem.MolFromSmiles(sms)
    if mol is None:
        raise TypeError('Invalid Smiles String')

    calc = Calculator(descriptors)
    raw_data = calc.pandas([mol])  # calculate descriptors

    # extract the five descriptors the model is fed with
    feature_names = [
        'AXp-0d', 'AXp-1d', 'AXp-2d', 'ETA_eta_L', 'ETA_epsilon_3'
    ]
    new = {name: raw_data[name].values for name in feature_names}
    new_data = pd.DataFrame(index=[1], data=new)

    regressor2 = load_model()
    bandgap.loc[0, 'bandgap'] = regressor2.predict(new_data)[0]
    return bandgap
示例#7
0
def compute_descript(smile, walltime=1):
    """Compute Mordred descriptors (3D ignored) for one SMILES and pickle them.

    Parameters
    ----------
    smile : str
        SMILES string to parse with RDKit.
    walltime : int, optional
        Not used inside this function.

    Returns
    -------
    bytes
        ``pickle.dumps`` of a flat float32 array of descriptor values, or
        ``pickle.dumps(None)`` when the SMILES cannot be parsed (a message
        is printed in that case).
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle

    # Rebuilt on every call; the original author noted it could probably
    # be created once and shared instead.
    calculator = Calculator(descriptors, ignore_3D=True)

    molecule = Chem.MolFromSmiles(smile)
    if molecule is None:
        print("Error processing mol")
        return pickle.dumps(None)

    flat_values = np.array(calculator(molecule)).flatten()
    # float32 for now; the original author considered FP16 as an option.
    payload = flat_values.astype(np.float32)
    # Pickled by hand to work around a serialization issue the original
    # comment attributes to Parsl.
    return pickle.dumps(payload)
示例#8
0
  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """
    Calculate Mordred descriptors.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      1D array of Mordred descriptors for `mol`.
      If ignore_3D is True, the length is 1613.
      If ignore_3D is False, the length is 1826.
    """
    if self.calc is None:
      # Mordred is imported on first use and the calculator is cached on
      # the instance for subsequent calls.
      try:
        from mordred import Calculator, descriptors, is_missing
        self.is_missing = is_missing
        self.calc = Calculator(descriptors, ignore_3D=self.ignore_3D)
        self.descriptors = list(descriptors.__all__)
      except ModuleNotFoundError:
        raise ImportError("This class requires Mordred to be installed.")

    # Missing values and string results are replaced by zero.
    cleaned = []
    for val in self.calc(mol):
      if self.is_missing(val) or isinstance(val, str):
        cleaned.append(0.0)
      else:
        cleaned.append(val)
    return np.asarray(cleaned)
示例#9
0
 def __init__(self, s, e, n, table):
     """Store the given parameters and load the data sources.

     ``s``, ``e`` and ``n`` are stored unchanged; ``table`` is read with
     ``pd.read_csv``. An array is also loaded from the fixed file name
     ``data.npy``, and a Mordred calculator (3D ignored) is created.
     """
     self.s, self.e, self.n = s, e, n
     # Loaded from a fixed relative path, i.e. the current directory.
     self.data = np.load("data.npy")
     self.table = pd.read_csv(table)
     self.calc = Calculator(descriptors, ignore_3D=True)
示例#10
0
def test_VEA():
    """Yield one almost-equal check per reference value found in ``data``.

    Each line of ``data`` is a SMILES followed by reference fields; the
    fields are parsed with ``parse_reference`` into ``(decimal, desired)``
    pairs and matched to the names in ``descs``. Entries whose desired
    value is ``None`` are skipped.
    """
    calc = Calculator([AdjacencyMatrix, DistanceMatrix])

    for record in data:
        fields = record.strip().split()
        smi = fields[0]
        mol = Chem.MolFromSmiles(smi)

        desireds = {
            name: parse_reference(raw)
            for name, raw in zip(descs, fields[1:])
        }
        actuals = dict(zip(map(str, calc.descriptors), calc(mol)))

        for desc in descs:
            actual = actuals[desc]
            decimal, desired = desireds[desc]
            if desired is not None:
                assert not is_missing(actual), actual

                message = "{} of {}".format(desc, smi)
                yield (assert_almost_equal, actual, desired, decimal, message)
示例#11
0
def FilterItLogS(mol):
    '''
    Fragment-based solubility value: the Filter-it™ LogS descriptor, based
    on a simple fragment-based method.
    http://silicos-it.be.s3-website-eu-west-1.amazonaws.com/software/filter-it/1.0.2/filter-it.html#installation

    Returns the value stored under the key 'FilterItLogS', or None when
    that key is absent from the result.
    '''
    logs_calc = Calculator(descriptors.LogS)
    values = logs_calc(mol).asdict()
    return values.get('FilterItLogS')
def determine_descriptors_from_mordred(smile_series):
    """Compute Mordred descriptors (3D ignored) for a Series of SMILES.

    Parameters
    ----------
    smile_series : pandas.Series
        SMILES strings; duplicates are dropped before any work is done.

    Returns
    -------
    tuple
        ``(descriptor_dataframe, True)`` when every entry is valid, with
        the frame indexed by the de-duplicated SMILES; otherwise
        ``(bad_smiles, False)`` where ``bad_smiles`` lists the entries
        that are not strings or that RDKit could not parse.
    """
    # no need for duplicates they just take time, we can merge by smiles
    # later to sort it out
    smile_series = smile_series.drop_duplicates()

    molecule_objects = []
    bad_smiles = []
    for smile in smile_series:
        if not isinstance(smile, str):
            bad_smiles.append(smile)
            continue
        # a "SMILES Parse Error" does not raise; the result is just falsy
        to_check = Chem.MolFromSmiles(smile)
        if to_check:
            molecule_objects.append(to_check)
        else:
            bad_smiles.append(smile)
    if bad_smiles:
        return bad_smiles, False

    # The calculator is only built once the input is known to be valid, so
    # a rejected input costs nothing beyond the parsing above.
    calc = Calculator(descriptors, ignore_3D=True)
    descriptor_dataframe = calc.pandas(molecule_objects)
    # index the rows by their SMILES so callers can merge on it
    descriptor_dataframe = descriptor_dataframe.set_index(smile_series)
    return descriptor_dataframe, True
示例#13
0
def test_pickle_calculator():
    """Check that a calculator survives a pickle round trip.

    Derived descriptors built with arithmetic operators are registered
    first, then the descriptor lists and the computed values of the
    original and the unpickled calculator are compared pairwise.
    """
    orig = Calculator(descriptors)
    d0 = orig.descriptors[0]
    d1 = orig.descriptors[1]

    derived = [
        d0 + d1,
        d0 - d1,
        d0 * d1,
        d0 // d1,
        d0 % d1,
        d0**d1,
        -d0,
        +d1,
        abs(d0),
        math.trunc(d0),
    ]
    orig.register(derived)

    if six.PY3:
        orig.register([math.ceil(d0), math.floor(d1)])

    restored = pickle.loads(pickle.dumps(orig))

    mol = Chem.MolFromSmiles("c1ccccc1C(O)O")

    for before, after in zip(orig.descriptors, restored.descriptors):
        yield eq_, before, after

    for before, after in zip(orig(mol), restored(mol)):
        if not isinstance(before, MissingValueBase):
            yield assert_almost_equal, before, after
        else:
            # Missing values are compared by class only.
            yield eq_, before.__class__, after.__class__
示例#14
0
def smiles_to_mordred(smiles, features=None):
    """Embed molecules in 3D and compute their Mordred descriptors.

    Parameters
    ----------
    smiles : sequence of str
        SMILES strings, indexable by position.
    features : list, optional
        When given, only these descriptor columns are retained.

    Returns
    -------
    pandas.DataFrame
        Descriptor table indexed by SMILES. Molecules for which salt
        stripping, hydrogen addition, embedding or optimization raised are
        reported on stdout and left out.
    """
    # create descriptor calculator with all descriptors
    calc = Calculator(all_descriptors)
    print("Convering SMILES string to Mol format...")
    mols_raw = [Chem.MolFromSmiles(smi) for smi in smiles]
    print("Computing 3D coordinates...")
    remover = SaltRemover.SaltRemover()
    embedded = {}
    total = len(mols_raw)
    progress = ProgressBar(total)
    for i, raw_mol in enumerate(mols_raw):
        progress.animate(i, status="Embedding %s" % smiles[i])
        try:
            mol = remover.StripMol(raw_mol, dontRemoveEverything=True)
            mol = Chem.AddHs(mol)
            AllChem.Compute2DCoords(mol)
            AllChem.EmbedMolecule(mol)
            AllChem.UFFOptimizeMolecule(mol)  # Is this deterministic?
        except Exception:
            print("Exception for %s" % smiles[i])
            continue
        embedded[smiles[i]] = mol
    progress.animate(total, status="Finished embedding all molecules")
    print("\nComputing Mordred features...")
    df = calc.pandas(embedded.values())
    if features is not None:
        # Retain only the specified features
        df = df[features]
    mordred = pd.DataFrame(df.values,
                           index=embedded.keys(),
                           columns=df.columns)
    print("There are %d molecules and %d features" % mordred.shape)
    return mordred
示例#15
0
    def calculate(SMILEs, filter=None):
        """Compute Mordred descriptors (3D ignored) for a list of SMILES.

        Parameters
        ----------
        SMILEs : sequence of str
            SMILES strings; also used as the row index of the table.
        filter : optional
            Column selection applied with ``.loc[:, filter]`` when truthy.

        Returns
        -------
        numpy.ndarray
            Descriptor values; anything non-numeric is coerced to NaN and
            then replaced by 0.

        Raises
        ------
        ValueError
            If a SMILES cannot be parsed or its descriptors cannot be
            computed. The underlying error is chained as the cause.
        """
        calc = Calculator(descriptors, ignore_3D=True)
        d = []
        for smi in SMILEs:
            try:
                m = Chem.MolFromSmiles(smi)
                if m is None:
                    raise ValueError("RDKit could not parse the SMILES")
                row = calc(m)
            except Exception as err:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not turned into a
                # ValueError; ``str(smi)`` keeps the message buildable
                # for non-string input.
                raise ValueError("Bad SMILEs Detected. Please Check: " +
                                 str(smi)) from err
            d.append(row)

        d_df = pd.DataFrame(
            d,
            index=SMILEs,
            columns=[str(e_d) for e_d in calc.descriptors],
        ).apply(pd.to_numeric, errors='coerce')
        if filter:
            d_df = d_df.loc[:, filter]

        # Not in place: the frame may be a slice of the full table here.
        d_df = d_df.fillna(0)
        return d_df.values
示例#16
0
def mordred_fingerprint2d(mols):
    """Compute the ``descriptors2d`` Mordred descriptors for ``mols``.

    Returns
    -------
    tuple
        ``(result, header)``: a float32 array with one row per molecule
        and one column per descriptor (missing values stored as NaN), and
        an array with the descriptor names.
    """
    n_rows, n_cols = len(mols), len(descriptors2d)
    result = np.zeros((n_rows, n_cols), dtype=np.float32)
    calc = Calculator(descriptors2d)
    for row, mol in enumerate(tqdm.tqdm(mols)):
        for col, value in enumerate(calc(mol)):
            result[row, col] = np.nan if is_missing(value) else value
    header = np.array(list(map(str, descriptors2d)))
    return result, header
示例#17
0
def calc_mordred_desc(mols: list):
    """Compute Mordred descriptors (3D ignored) for ``mols``.

    The raw descriptor table is passed through ``_convert_error_columns``
    before being returned.
    """
    from mordred import Calculator, descriptors

    raw_table = Calculator(descriptors, ignore_3D=True).pandas(mols)
    return _convert_error_columns(raw_table)
def get_descriptors():
    """Return the list of Mordred descriptor names (3D ignored).

    The names are the keys of ``asdict(False)`` of the result obtained by
    running the calculator on benzene.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    benzene = Chem.MolFromSmiles('c1ccccc1')
    # Iterating a dict yields its keys in order.
    return list(calc(benzene).asdict(False))
示例#19
0
def load_data(dir, filename, fingerprint_size=100, prediction=False):
    """Featurize a CSV of SMILES and save graph features and fingerprints.

    Parameters
    ----------
    dir : str
        Output prefix. It is concatenated directly with the dataset name
        (``dir + dataset + "/smiles.npy"``), so it must end with a path
        separator.
    filename : str
        CSV file with a ``SMILES`` column and, unless ``prediction`` is
        set, a ``score`` column.
    fingerprint_size : int, optional
        Number of leading Mordred descriptor values kept per molecule.
    prediction : bool, optional
        When true, the whole file is saved as ``predict_data`` with zero
        scores. Otherwise the shuffled rows are split into ``test_data``
        (first tenth), ``valid_data`` (next tenth plus one row) and
        ``train_data`` (the rest).

    Returns
    -------
    None
        Everything is written through ``save_feature`` and ``np.save``.
    """
    # Load GA dataset (shuffled)
    data = pd.read_csv(filename)
    data = data.sample(frac=1)
    smiles = data['SMILES'].to_list()
    featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    conv = [Chem.MolFromSmiles(smile) for smile in smiles]
    graphs = featurizer.featurize(conv)

    if prediction:
        scores = [0] * len(smiles)
        splits = [('predict_data', NumpyDataset(graphs, scores, ids=smiles))]
    else:
        scores = data['score'].to_list()
        step = len(smiles) // 10
        valid_end = 2 * step + 1
        test_dataset = NumpyDataset(graphs[:step], scores[:step],
                                    ids=smiles[:step])
        valid_dataset = NumpyDataset(graphs[step:valid_end],
                                     scores[step:valid_end],
                                     ids=smiles[step:valid_end])
        train_dataset = NumpyDataset(graphs[valid_end:], scores[valid_end:],
                                     ids=smiles[valid_end:])
        splits = [
            ('train_data', train_dataset),
            ('valid_data', valid_dataset),
            ('test_data', test_dataset),
        ]

    # Create files of graph information. Every split is saved with its own
    # adjacency data; the valid and test splits used to be written with
    # the adjacency data of the training split.
    extracted = [(name, get_feature(dataset)) for name, dataset in splits]
    for name, feature_set in extracted:
        features, adj, edge, full_feature, interactions, ids = feature_set
        save_feature(dir, features, adj, interactions, ids, edge,
                     full_feature,
                     dataset=name)

    # Create the fingerprints based on mordred
    calc = Calculator(descriptors, ignore_3D=True)
    for name, _ in splits:
        saved_smiles = np.load(dir + name + "/smiles.npy")
        alldes = [
            calc(Chem.MolFromSmiles(smi))[:fingerprint_size]
            for smi in saved_smiles
        ]
        np.save(dir + name + "/fingerprint_stand.npy", np.array(alldes))
示例#20
0
 def transform(self):
     """Compute Mordred descriptors (3D ignored) for ``self.structures``.

     Stores the descriptor table in ``self.df``, its column labels in
     ``self.columns``, its values in ``self.features`` and each
     structure's ``_Name`` property in ``self.mol_names``.

     Returns
     -------
     The descriptor values (``self.features``).
     """
     super().transform()
     self.mol_names = []
     frame = Calculator(descriptors, ignore_3D=True).pandas(self.structures)
     self.df = frame
     self.columns = frame.columns
     self.features = frame.values
     self.mol_names = [mol.GetProp("_Name") for mol in self.structures]
     return self.features
示例#21
0
def test_descriptor_order():
    """Check that descriptors are ordered by non-decreasing module name."""
    calc = Calculator(descriptors)
    module_names = (desc.__module__ for desc in calc.descriptors)
    previous = next(module_names)
    for module_name in module_names:
        assert previous <= module_name, "{!r} > {!r}".format(
            previous, module_name)
        previous = module_name
示例#22
0
 def transform(self, molecules):
     """Compute Mordred descriptors (3D ignored) for the given molecules.

     Parameters
     ----------
     molecules : mapping/DataFrame
         Must provide a ``"molecules"`` entry supporting ``.tolist()``.

     Returns
     -------
     pandas.DataFrame
         Descriptor values cast to float. When ``self.headers`` is truthy
         an extra ``headers`` column holds the list of descriptor column
         names in every row.

     Side effects
     ------------
     Writes the column names of the returned frame to
     ``2D_descriptors.txt`` in the current directory.
     """
     print("\tBuilding Descriptors")
     df = pd.DataFrame()
     molecules = molecules["molecules"].tolist()
     if self.descriptors:
         print(self.descriptors)
         calcs = Calculator(self.descriptors, ignore_3D=True)
     else:
         calcs = Calculator(descriptors, ignore_3D=True)
     # Cast to float BEFORE the optional headers column is attached: that
     # column holds lists, and casting it to float always failed.
     descriptors_df = pd.concat([df, calcs.pandas(molecules)],
                                axis=1).astype(float)
     if self.headers:
         descriptors_df["headers"] = (
             [list(descriptors_df)] * descriptors_df.shape[0])
     np.savetxt("2D_descriptors.txt", list(descriptors_df), fmt="%s")
     return descriptors_df
示例#23
0
def test_ETA():
    """Yield an almost-equal check (2 decimals) per reference ETA value."""
    calc = Calculator(ExtendedTopochemicalAtom)

    for smi, desireds in references.items():
        mol = Chem.MolFromSmiles(smi)
        actuals = dict(zip(map(str, calc.descriptors), calc(mol)))

        for name, desired in desireds.items():
            actual = actuals[name]
            message = "{} of {}".format(name, smi)
            yield assert_almost_equal, actual, desired, 2, message
示例#24
0
def calculate_molecular_descriptors(df: pd.DataFrame) -> pd.DataFrame:
    """Append Mordred descriptors (3D ignored) to the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``SMILES`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows reported by ``get_invalid_smiles_indices``,
        joined with the descriptor columns of the remaining molecules.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    mols = [Chem.MolFromSmiles(smi) for smi in df.SMILES]
    invalid_indices = get_invalid_smiles_indices(mols)
    invalid_positions = set(invalid_indices)  # O(1) membership tests
    mols_without_invalid = [
        mol for index, mol in enumerate(mols)
        if index not in invalid_positions
    ]
    valid_df = df.drop(df.index[invalid_indices])
    descriptor_df = calc.pandas(mols_without_invalid)
    # The descriptor table comes back with a fresh positional index, while
    # ``valid_df`` keeps the original labels. Give it the same labels so
    # that the index-based join pairs every molecule with its own row.
    descriptor_df.index = valid_df.index
    return valid_df.join(descriptor_df)
def smile_to_mordred(smi, imputer_dict=None):
    """Return the Mordred descriptors (3D ignored) of a SMILES as float32.

    Parameters
    ----------
    smi : str
        SMILES string, parsed with RDKit.
    imputer_dict : sequence, optional
        When given, its first element must provide ``'imputer'`` and
        ``'scaler'`` objects; the descriptor row is passed through the
        imputer and then the scaler.

    Returns
    -------
    numpy.ndarray
        Flat float32 vector.
    """
    mol = Chem.MolFromSmiles(smi)
    calc = Calculator(descriptors, ignore_3D=True)
    row = np.array(list(calc(mol).values())).reshape(1, -1)
    if imputer_dict is not None:
        pipeline = imputer_dict[0]
        scale = pipeline['scaler'].transform
        row = scale(pipeline['imputer'].transform(row))
    return row.flatten().astype(np.float32)
示例#26
0
 def get_MD(self, ignore_3D=True):
     """
     Get MD ONLY for non-error cases

     The float64 descriptor table is stored in ``self._MD``; nothing is
     returned.
     """
     calc = Calculator(descriptors, ignore_3D=ignore_3D)
     error_cases = np.squeeze(np.argwhere(self._error_mask))
     # Per the original note: an entry is taken when its index is NOT
     # among error_cases.
     mol_noError = list_where(self._mol_lst, error_cases, False)
     self._MD = calc.pandas(mol_noError).astype("float64")
示例#27
0
def mol_to_mordred(mols, features=None):
    """Compute Mordred descriptors for a mapping of molecules.

    Parameters
    ----------
    mols : dict
        Its values are the molecules; its keys become the row index.
    features : list, optional
        When given, only these descriptor columns are retained.

    Returns
    -------
    pandas.DataFrame
        Descriptor table after ``fill_missing()``, indexed by the keys of
        ``mols``.
    """
    calc = Calculator(all_descriptors)
    print("\nComputing Mordred features...")
    # Use NaN instead of Missing object
    df = calc.pandas(mols.values()).fill_missing()
    if features is not None:
        # Retain only the specified features
        df = df[features]
    mordred = pd.DataFrame(df.values, index=mols.keys(), columns=df.columns)
    print("There are %d molecules and %d features" % mordred.shape)
    return mordred
示例#28
0
 def __init__(self, rank=None, args=None):
     """Create the Mordred calculator and store ``rank`` and ``args``.

     Raises
     ------
     ValueError
         If ``rank`` or ``args`` is ``None``.
     """
     self.calc = Calculator(descriptors, ignore_3D=True)
     if rank is None:
         raise ValueError('rank is not set properly')
     self.rank = rank
     if args is None:
         raise ValueError('args is not set properly')
     self.args = args
示例#29
0
 def __init__(self, dict_mode=True, auto_correct=True, ignore_3D=True):
     """Set up the Mordred calculator and the prefixed descriptor names.

     ``self.desc_list`` holds the column names obtained by running the
     calculator on the molecule built from SMILES ``"C"``, each prefixed
     with ``"Mordred_desc_"``.
     """
     from mordred import Calculator, descriptors
     super(RDKitDescriptors, self).__init__()
     self.dict_mode = dict_mode
     self.calculator = Calculator(descriptors, ignore_3D=ignore_3D)
     self.auto_correct = auto_correct
     # One throwaway calculation, used only to read the column names.
     probe_columns = list(
         self.calculator.pandas([mol_from_smiles("C")]).columns)
     self.desc_list = ["Mordred_desc_" + name for name in probe_columns]
示例#30
0
def smile_to_mordred(smi, imputer_dict=None, userdkit=False):
    """Return Mordred descriptors (3D ignored) as a float32 vector.

    Parameters
    ----------
    smi
        Passed to the calculator as is, unless ``userdkit`` is set, in
        which case it is first parsed with ``Chem.MolFromSmiles``.
    imputer_dict : sequence, optional
        When given, its first element must provide ``'imputer'`` and
        ``'scaler'`` objects; the descriptor row is passed through the
        imputer and then the scaler.
    userdkit : bool, optional
        Parse ``smi`` with RDKit before computing (asserts that parsing
        succeeded).

    Returns
    -------
    numpy.ndarray
        Flat float32 vector; NaN and infinities are replaced by 0 before
        the optional imputer/scaler step.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    mol = smi
    if userdkit:
        mol = Chem.MolFromSmiles(smi)
        assert mol is not None
    values = list(calc(mol).values())
    row = np.array(values).reshape(1, -1).astype(np.float32)
    row = np.nan_to_num(row, posinf=0, neginf=0, nan=0)
    if imputer_dict is not None:
        pipeline = imputer_dict[0]
        scale = pipeline['scaler'].transform
        row = scale(pipeline['imputer'].transform(row))
    return row.flatten().astype(np.float32)