def use_mordred(mols, descs=None):
    """Compute 2D Mordred descriptors for ``mols`` as a float64 DataFrame.

    Parameters
    ----------
    mols : iterable
        Molecules handed to ``Calculator.pandas``.
    descs : container of str, optional
        When given, only descriptors whose ``str()`` is ``in descs`` are
        calculated; otherwise every registered descriptor is used.

    Returns
    -------
    pandas.DataFrame
        Result of ``fill_missing()`` with every column that still holds a
        missing value dropped, converted to ``np.float64``.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    if descs is not None:
        # Narrow the calculator to the requested descriptor names.
        calc.descriptors = [d for d in calc.descriptors if str(d) in descs]
    raw = calc.pandas(mols, quiet=True)
    cleaned = raw.fill_missing().dropna(axis='columns')
    return pd.DataFrame(cleaned, dtype=np.float64)
def calculation(mol_list, smiles_list, index_list, descriptor_type):
    """Compute Mordred descriptors and return them as a float DataFrame.

    Parameters
    ----------
    mol_list : list
        Molecules passed to ``Calculator.pandas``.
    smiles_list : list
        Values stored in the ``'SMILES'`` column, one per molecule.
    index_list : list
        Values used as the index of the returned frame.
    descriptor_type : str
        ``'2D'`` builds the calculator with ``ignore_3D=True``; ``'3D'``
        builds it with the library default.

    Returns
    -------
    pandas.DataFrame
        Descriptor columns as floats (cells that are not plain numbers become
        NaN), plus a ``'SMILES'`` column, indexed by ``index_list``.

    Raises
    ------
    ValueError
        If ``descriptor_type`` is neither ``'2D'`` nor ``'3D'``.
    """
    # Validate first: previously an unknown type left ``calc`` unbound and
    # failed later with an UnboundLocalError.
    if descriptor_type not in ('2D', '3D'):
        raise ValueError(
            "descriptor_type must be '2D' or '3D', got {!r}".format(
                descriptor_type))

    if descriptor_type == '2D':
        calc = Calculator(descriptors, ignore_3D=True)
    else:
        calc = Calculator(descriptors)

    df = calc.pandas(mol_list)
    df = df.astype(str)
    # Keep only cells that are plain decimal / scientific-notation numbers.
    # Testing for "contains a letter" also discarded valid values such as
    # '1e-05'; error text, 'nan', 'inf' and booleans are still blanked.
    is_number = df.apply(lambda d: d.str.contains(
        r'^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$', na=False))
    df = df[is_number]
    df = df.astype(float)

    # reset index
    df['SMILES'] = smiles_list
    df['index'] = index_list
    df = df.set_index('index')
    return df
def AutoCorrMordred(mol):
    """Return selected Mordred autocorrelation values for ``mol`` as a list.

    The descriptor family is chosen by ``metric`` and the largest lag by
    ``maxBonds``. Neither is a parameter of this function; both are resolved
    from the enclosing/module scope when the function runs.

    Parameters
    ----------
    mol : molecule object accepted by a Mordred ``Calculator``.

    Returns
    -------
    list
        Values of the retained descriptors, ordered by descriptor name.
        Names expected but absent from the result are printed.
    """
    from mordred import Calculator, Autocorrelation

    calc = Calculator()
    # ATS, ATSC, AATS, AATSC ?
    if metric == 'MATS':
        descriptor = Autocorrelation.MATS
    elif metric == 'ATSC':
        descriptor = Autocorrelation.ATSC
    elif metric == 'AATS':
        descriptor = Autocorrelation.AATS
    elif metric == 'AATSC':
        descriptor = Autocorrelation.AATSC
    else:
        descriptor = Autocorrelation.ATS
    calc.register(descriptor)
    res = calc(mol)
    res = res.fill_missing()

    # Z: atomic num, pe=pauling electronegativity, p=polarizability,
    # x=unweighted(identity), v=vdw-volume
    # dv= nValence d=nsigmaelectrons
    props = ['Z', 'pe', 'p', 'v', 'd', 'dv']
    keys = ['ATS{d}{p}'.format(d=d, p=p)
            for d in range(maxBonds + 1) for p in props]
    # NOTE(review): the key prefix is always 'ATS' regardless of ``metric``;
    # confirm whether other families are meant to be filtered by their own
    # prefix.
    wanted = set(keys)
    # ``dict.items()`` replaces the Python 2 only ``iteritems()``.
    res = {k: v for k, v in res.asdict().items() if k in wanted}
    for key in keys:
        if key not in res:
            print(key)
    vector = [value for (key, value) in sorted(res.items())]
    return vector
def mordred_descriptors(mol):
    """
    Function to get chemical descriptors from Mordred (3D descriptors ignored)

    Parameters
    ----------
    mol : object :: rdkit.Chem.rdchem.Mol or list of such objects
        mol object(s) from rdkit

    Returns
    -------
    dict
        For a single molecule: dictionary containing the chemical descriptor
        name and the chemical descriptor value.
        For a list: dictionary keyed by the row label of each molecule in the
        calculated table, each value being such a name -> value dictionary.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    # isinstance also accepts list subclasses; the leftover debug prints
    # ("here", the molecule list, "here2") were removed.
    if isinstance(mol, list):
        return calc.pandas(mol, nproc=1).T.to_dict()
    return calc.pandas([mol], nproc=1).T.to_dict()[0]
def get_md(smi_path, data_path='./'):
    """Compute Mordred descriptors for every line of a SMILES file.

    Parameters
    ----------
    smi_path : str or pathlib.Path
        File with one SMILES string per line.
    data_path : str
        Directory prefix for the output; the table is written to
        ``data_path + '/' + <file stem> + '_md.csv'``.

    Returns
    -------
    pandas.DataFrame
        Descriptor columns followed by ``'cindex'`` (``<stem>_<line number>``)
        and ``'smiles'``. The same frame is written to the CSV file.
    """
    if isinstance(smi_path, str):
        smi_path = Path(smi_path)

    # The file is only read, so open it read-only ('r+' needlessly required
    # write permission), and read it once instead of twice.
    with open(str(smi_path), 'r') as f:
        lines = f.readlines()
    smiles_table = pd.DataFrame({
        'cindex': [smi_path.stem + '_' + str(idx)
                   for idx in range(len(lines))],
        'smiles': [content.strip('\n') for content in lines],
    })

    mols = [Chem.MolFromSmiles(smi) for smi in smiles_table['smiles']]
    calc = Calculator(descriptors)
    md = calc.pandas(mols)
    data = pd.concat([md, smiles_table], axis=1)
    data.to_csv(data_path + '/' + smi_path.stem + '_md.csv')
    return data
def sms_bandgap(sms):
    """Predict the band gap of the chemical given by the SMILES string ``sms``.

    Five Mordred descriptors ('AXp-0d', 'AXp-1d', 'AXp-2d', 'ETA_eta_L',
    'ETA_epsilon_3') are calculated and fed to the model returned by
    ``load_model()``.

    Parameters
    ----------
    sms : str
        SMILES string of the chemical, e.g. ``'c1ccccc1'``.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with columns ``'substance'`` (the input string) and
        ``'bandgap'`` (first element of the model prediction).

    Raises
    ------
    TypeError
        If RDKit cannot parse ``sms``.
    """
    bandgap = pd.DataFrame(columns=['substance', 'bandgap'])
    bandgap.loc[0, 'substance'] = sms
    freeze_support()
    # transform smiles string to molecular structure (parsed once and reused)
    mol = Chem.MolFromSmiles(sms)
    if mol is None:
        raise TypeError('Invalid Smiles String')

    calc = Calculator(descriptors)
    raw_data = calc.pandas([mol])  # calculate descriptors
    # extract the five most useful descriptors data
    feature_names = ['AXp-0d', 'AXp-1d', 'AXp-2d', 'ETA_eta_L',
                     'ETA_epsilon_3']
    new = {name: raw_data[name].values for name in feature_names}
    new_data = pd.DataFrame(index=[1], data=new)
    regressor2 = load_model()
    bandgap.loc[0, 'bandgap'] = regressor2.predict(new_data)[0]  # calculate bandgap
    return bandgap
def compute_descript(smile, walltime=1):
    """Compute 2D Mordred descriptors for one SMILES string, pickled.

    Parameters
    ----------
    smile : str
        SMILES string to parse with RDKit.
    walltime : int
        Not used inside this function.

    Returns
    -------
    bytes
        ``pickle.dumps`` of a flat ``float32`` array of descriptor values, or
        ``pickle.dumps(None)`` when the SMILES cannot be parsed.
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle

    # This object doesn't need to be created every time; it could probably be
    # made global.
    calc = Calculator(descriptors, ignore_3D=True)

    # read smiles
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        print("Error processing mol")
        return pickle.dumps(None)

    values = np.array(calc(mol)).flatten()
    # could run in FP16 UNO, something to think about
    data = values.astype(np.float32)
    # Pickled explicitly to avoid a bug in Parsl's serialization routines
    # (per the original author's note).
    return pickle.dumps(data)
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """
    Calculate Mordred descriptors.

    The calculator is created lazily on first use and cached on ``self``.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        1D array of Mordred descriptors for `mol`.
        If ignore_3D is True, the length is 1613.
        If ignore_3D is False, the length is 1826.
    """
    if self.calc is None:
        try:
            from mordred import Calculator, descriptors, is_missing
            self.is_missing = is_missing
            self.calc = Calculator(descriptors, ignore_3D=self.ignore_3D)
            self.descriptors = list(descriptors.__all__)
        except ModuleNotFoundError:
            raise ImportError("This class requires Mordred to be installed.")

    # convert errors to zero
    cleaned = []
    for val in self.calc(mol):
        unusable = self.is_missing(val) or isinstance(val, str)
        cleaned.append(0.0 if unusable else val)
    return np.asarray(cleaned)
def __init__(self, s, e, n, table):
    """Store the settings and load the data this object works on.

    ``s``, ``e`` and ``n`` are stored unchanged. ``"data.npy"`` is loaded from
    the current working directory, ``table`` is read as CSV, and a Mordred
    calculator restricted to non-3D descriptors is created.
    """
    self.s, self.e, self.n = s, e, n
    self.data = np.load("data.npy")
    self.table = pd.read_csv(table)
    self.calc = Calculator(descriptors, ignore_3D=True)
def test_VEA():
    """Yield one almost-equal check per reference value in ``data``.

    Each record of ``data`` is whitespace separated: a SMILES string followed
    by one reference field per name in ``descs``. Fields that
    ``parse_reference`` maps to a ``None`` desired value are skipped.
    """
    calc = Calculator([AdjacencyMatrix, DistanceMatrix])
    for record in data:
        fields = record.strip().split()
        smi = fields[0]
        mol = Chem.MolFromSmiles(smi)
        desireds = dict(zip(descs, map(parse_reference, fields[1:])))
        actuals = {str(k): v for k, v in zip(calc.descriptors, calc(mol))}
        for desc in descs:
            actual = actuals[desc]
            decimal, desired = desireds[desc]
            if desired is not None:
                assert not is_missing(actual), actual
                yield (
                    assert_almost_equal,
                    actual,
                    desired,
                    decimal,
                    "{} of {}".format(desc, smi),
                )
def FilterItLogS(mol):
    """Return the Filter-it(TM) LogS descriptor of ``mol``.

    A fragment-based solubility value, based on a simple fragment-based method:
    http://silicos-it.be.s3-website-eu-west-1.amazonaws.com/software/filter-it/1.0.2/filter-it.html#installation

    Returns ``None`` when the result has no ``'FilterItLogS'`` entry.
    """
    calc = Calculator(descriptors.LogS)
    values = calc(mol).asdict()
    return values.get('FilterItLogS')
def determine_descriptors_from_mordred(smile_series):
    """Compute Mordred descriptors (3D ignored) for a series of SMILES.

    Parameters
    ----------
    smile_series : pandas.Series
        SMILES strings; duplicates are dropped before processing.

    Returns
    -------
    tuple
        ``(bad_smiles, False)`` if any entry is not a string or cannot be
        parsed by RDKit, where ``bad_smiles`` lists the offending entries;
        otherwise ``(descriptor_dataframe, True)`` with the frame indexed by
        the de-duplicated SMILES.
    """
    # no need for duplicates they just take time, we can merge by smiles later
    # to sort it out
    smile_series = smile_series.drop_duplicates()

    molecule_objects = []
    bad_smiles = []
    for smile in smile_series:
        if not isinstance(smile, str):
            bad_smiles.append(smile)
            continue
        # a "SMILES Parse Error" does not raise; the result is just falsy
        to_check = Chem.MolFromSmiles(smile)
        if to_check:
            molecule_objects.append(to_check)
        else:
            bad_smiles.append(smile)
    if bad_smiles:
        return bad_smiles, False

    # Build the calculator only once the input is known to be valid, so the
    # early return above does not pay for it.
    calc = Calculator(descriptors, ignore_3D=True)
    # so long as all smiles are valid this should be fine
    descriptor_dataframe = calc.pandas(molecule_objects)
    # need to merge smiles with the descriptor_dataframe
    descriptor_dataframe = descriptor_dataframe.set_index(smile_series)
    return descriptor_dataframe, True
def test_pickle_calculator():
    """Check that a Calculator survives a pickle round trip.

    Arithmetic combinations of the first two descriptors are registered
    first, so composed descriptors are covered too. Yields equality checks
    for the descriptor lists and for the values computed on one molecule.
    """
    orig = Calculator(descriptors)
    d0 = orig.descriptors[0]
    d1 = orig.descriptors[1]
    composed = [
        d0 + d1,
        d0 - d1,
        d0 * d1,
        d0 // d1,
        d0 % d1,
        d0**d1,
        -d0,
        +d1,
        abs(d0),
        math.trunc(d0),
    ]
    orig.register(composed)
    if six.PY3:
        orig.register([math.ceil(d0), math.floor(d1)])

    pickled = pickle.loads(pickle.dumps(orig))
    mol = Chem.MolFromSmiles("c1ccccc1C(O)O")

    for before, after in zip(orig.descriptors, pickled.descriptors):
        yield eq_, before, after

    for expected, restored in zip(orig(mol), pickled(mol)):
        if not isinstance(expected, MissingValueBase):
            yield assert_almost_equal, expected, restored
        else:
            # Missing values are compared by type only.
            yield eq_, expected.__class__, restored.__class__
def smiles_to_mordred(smiles, features=None):
    """Embed molecules from SMILES and compute their Mordred features.

    Parameters
    ----------
    smiles : indexable sequence of str
        SMILES strings; also used (by position) as row labels.
    features : optional
        Column selection applied to the computed feature table.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES whose preparation succeeded, one column per
        feature. SMILES for which any preparation step raised are reported
        on stdout and left out.
    """
    # create descriptor calculator with all descriptors
    calc = Calculator(all_descriptors)

    print("Convering SMILES string to Mol format...")
    mols_raw = [Chem.MolFromSmiles(smi) for smi in smiles]

    print("Computing 3D coordinates...")
    remover = SaltRemover.SaltRemover()
    embedded = {}
    total = len(mols_raw)
    progress = ProgressBar(total)
    for pos, raw_mol in enumerate(mols_raw):
        progress.animate(pos, status="Embedding %s" % smiles[pos])
        try:
            prepared = remover.StripMol(raw_mol, dontRemoveEverything=True)
            prepared = Chem.AddHs(prepared)
            AllChem.Compute2DCoords(prepared)
            AllChem.EmbedMolecule(prepared)
            AllChem.UFFOptimizeMolecule(prepared)  # Is this deterministic?
        except Exception:
            print("Exception for %s" % smiles[pos])
        else:
            embedded[smiles[pos]] = prepared
    progress.animate(total, status="Finished embedding all molecules")

    print("\nComputing Mordred features...")
    df = calc.pandas(embedded.values())
    if features is not None:
        # Retain only the specified features
        df = df[features]
    mordred = pd.DataFrame(df.values, index=embedded.keys(),
                           columns=df.columns)
    print("There are %d molecules and %d features" % mordred.shape)
    return mordred
def calculate(SMILEs, filter=None):
    """Compute 2D Mordred descriptors for a sequence of SMILES strings.

    Parameters
    ----------
    SMILEs : sequence of str
        SMILES strings; also used as the row index of the intermediate table.
    filter : optional
        Column selection applied with ``.loc[:, filter]`` when truthy.

    Returns
    -------
    numpy.ndarray
        Descriptor values, one row per SMILES. Non-numeric and missing
        entries are coerced to NaN and then replaced by 0.

    Raises
    ------
    ValueError
        If a SMILES cannot be parsed or its descriptors cannot be computed.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    d = []
    for smi in SMILEs:
        # str() keeps the message buildable when the bad entry is not a str.
        message = "Bad SMILEs Detected. Please Check: " + str(smi)
        try:
            m = Chem.MolFromSmiles(smi)
            # MolFromSmiles signals a parse failure by returning None.
            row = calc(m) if m is not None else None
        except Exception as err:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            raise ValueError(message) from err
        if row is None:
            # The input SMILEs is invalid
            raise ValueError(message)
        d.append(row)

    d_df = pd.DataFrame(
        d, index=SMILEs,
        columns=[str(e_d) for e_d in calc.descriptors],
    ).apply(pd.to_numeric, errors='coerce')
    if filter:
        d_df = d_df.loc[:, filter]
    # Not in place: ``d_df`` may be a slice of the full table at this point.
    d_df = d_df.fillna(0)
    return d_df.values
def mordred_fingerprint2d(mols):
    """Compute the ``descriptors2d`` Mordred descriptors for each molecule.

    Returns
    -------
    tuple
        ``(result, header)``: ``result`` is a float32 array with one row per
        molecule and one column per descriptor, missing values stored as NaN;
        ``header`` is an array of the descriptor names.
    """
    n_rows, n_cols = len(mols), len(descriptors2d)
    result = np.zeros((n_rows, n_cols), dtype=np.float32)
    calc = Calculator(descriptors2d)
    for row, mol in enumerate(tqdm.tqdm(mols)):
        for col, value in enumerate(calc(mol)):
            result[row, col] = np.nan if is_missing(value) else value
    header = np.array([str(d) for d in descriptors2d])
    return result, header
def calc_mordred_desc(mols: list):
    """Compute 2D Mordred descriptors for ``mols``.

    The raw descriptor table is passed through ``_convert_error_columns``
    before being returned.
    """
    from mordred import Calculator, descriptors

    table = Calculator(descriptors, ignore_3D=True).pandas(mols)
    return _convert_error_columns(table)
def get_descriptors():
    """Return the names of all 2D Mordred descriptors.

    The names are taken from the result of running the calculator on benzene
    (``'c1ccccc1'``).
    """
    calc = Calculator(descriptors, ignore_3D=True)
    mol = Chem.MolFromSmiles('c1ccccc1')
    by_name = calc(mol).asdict(False)
    return list(by_name.keys())
def load_data(dir, filename, fingerprint_size=100, prediction=False):
    """Featurize a SMILES dataset and save graph features and fingerprints.

    Parameters
    ----------
    dir : str
        Output prefix. Paths are built by plain concatenation
        (``dir + dataset + "/smiles.npy"``), so it must end with a separator.
    filename : str
        CSV file with a ``'SMILES'`` column and, unless ``prediction`` is
        true, a ``'score'`` column.
    fingerprint_size : int
        Number of leading Mordred descriptor values kept per molecule.
    prediction : bool
        When true, all rows are saved as one ``'predict_data'`` set with
        zero scores. Otherwise the shuffled rows are split into
        ``'test_data'`` (first tenth), ``'valid_data'`` (next tenth plus one
        row) and ``'train_data'`` (remainder).

    Returns
    -------
    None
        Results are written through ``save_feature`` and ``np.save``.
    """
    # Load GA dataset
    data = pd.read_csv(filename)
    data = data.sample(frac=1)
    smiles = data['SMILES'].to_list()

    featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    conv = [Chem.MolFromSmiles(smile) for smile in smiles]
    graphs = featurizer.featurize(conv)

    if prediction:
        scores = [0 for _ in range(len(smiles))]
        splits = [('predict_data', NumpyDataset(graphs, scores, ids=smiles))]
    else:
        scores = data['score'].to_list()
        step = len(smiles) // 10
        test_dataset = NumpyDataset(graphs[:step], scores[:step],
                                    ids=smiles[:step])
        valid_dataset = NumpyDataset(graphs[step:2 * step + 1],
                                     scores[step:2 * step + 1],
                                     ids=smiles[step:2 * step + 1])
        train_dataset = NumpyDataset(graphs[2 * step + 1:],
                                     scores[2 * step + 1:],
                                     ids=smiles[2 * step + 1:])
        splits = [('train_data', train_dataset),
                  ('valid_data', valid_dataset),
                  ('test_data', test_dataset)]

    # Create files of graph information. Features are extracted for every
    # split first and saved afterwards, as before.
    extracted = [(name, get_feature(dataset)) for name, dataset in splits]
    for name, parts in extracted:
        features, adj, edge, full_feature, interactions, ids = parts
        # Each split is saved with its OWN adjacency data; previously the
        # valid and test splits were saved with the training adjacency.
        save_feature(dir, features, adj, interactions, ids, edge,
                     full_feature, dataset=name)

    # Create the fingerprints based on mordred
    calc = Calculator(descriptors, ignore_3D=True)
    for name, _ in splits:
        saved_smiles = np.load(dir + name + "/smiles.npy")
        alldes = []
        for smi in saved_smiles:
            mol = Chem.MolFromSmiles(smi)
            alldes.append(calc(mol)[:fingerprint_size])
        np.save(dir + name + "/fingerprint_stand.npy", np.array(alldes))
def transform(self):
    """Compute 2D Mordred descriptors for ``self.structures``.

    Stores the descriptor table in ``self.df``, its column labels in
    ``self.columns``, its values in ``self.features`` and each structure's
    ``"_Name"`` property in ``self.mol_names``.

    Returns
    -------
    The descriptor values (``self.features``).
    """
    super().transform()
    self.mol_names = []
    table = Calculator(descriptors, ignore_3D=True).pandas(self.structures)
    self.df = table
    self.columns = self.df.columns
    self.features = self.df.values
    names = []
    for mol in self.structures:
        names.append(mol.GetProp("_Name"))
    self.mol_names = names
    return self.features
def test_descriptor_order():
    """Assert that the calculator's descriptors are ordered by module name."""
    calc = Calculator(descriptors)
    remaining = iter(calc.descriptors)
    previous = next(remaining).__module__
    for desc in remaining:
        module = desc.__module__
        assert previous <= module, "{!r} > {!r}".format(previous, module)
        previous = module
def transform(self, molecules):
    """Build a table of 2D Mordred descriptors for ``molecules["molecules"]``.

    Uses ``self.descriptors`` when it is truthy (and prints it), otherwise
    the full ``descriptors`` set. The column names of the resulting table are
    written to ``2D_descriptors.txt`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        The descriptor table converted to float.
    """
    print("\tBuilding Descriptors")
    mol_list = molecules["molecules"].tolist()

    if self.descriptors:
        print(self.descriptors)
        calcs = Calculator(self.descriptors, ignore_3D=True)
    else:
        calcs = Calculator(descriptors, ignore_3D=True)
    # Disabled alternative kept from the original: a hand-picked module list
    # (md.CarbonTypes, md.LogS, md.ABCIndex, md.BondCount, md.ZagrebIndex,
    # md.WienerIndex, md.TopologicalCharge, md.InformationContent,
    # md.AcidBase, md.RingCount, md.AtomCount, md.Polarizability,
    # md.HydrogenBond, md.SLogP, md.RotatableBond, md.Aromatic, md.CPSA).

    descriptors_df = pd.concat([pd.DataFrame(), calcs.pandas(mol_list)],
                               axis=1)
    if self.headers:
        # NOTE(review): this column holds lists, so the float conversion
        # below presumably fails when headers are enabled -- confirm.
        n_rows = descriptors_df.shape[0]
        descriptors_df["headers"] = [list(descriptors_df)] * n_rows
    column_names = list(descriptors_df)
    np.savetxt("2D_descriptors.txt", column_names, fmt="%s")
    return descriptors_df.astype(float)
def test_ETA():
    """Yield an almost-equal check (2 decimals) per value in ``references``.

    ``references`` maps a SMILES string to a mapping of descriptor name to
    desired value.
    """
    calc = Calculator(ExtendedTopochemicalAtom)
    for smi, desireds in references.items():
        mol = Chem.MolFromSmiles(smi)
        actuals = dict(zip(map(str, calc.descriptors), calc(mol)))
        for name, desired in desireds.items():
            label = "{} of {}".format(name, smi)
            yield assert_almost_equal, actuals[name], desired, 2, label
def calculate_molecular_descriptors(df: pd.DataFrame) -> pd.DataFrame:
    """Append 2D Mordred descriptor columns to the rows with valid SMILES.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``SMILES`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows reported by ``get_invalid_smiles_indices``,
        joined with the descriptors of the remaining molecules.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    mols = [Chem.MolFromSmiles(smi) for smi in df.SMILES]
    invalid_indices = get_invalid_smiles_indices(mols)
    # Set for O(1) membership tests while filtering.
    invalid_positions = set(invalid_indices)
    mols_without_invalid = [
        mol for index, mol in enumerate(mols)
        if index not in invalid_positions
    ]
    descriptor_df = calc.pandas(mols_without_invalid)

    valid_df = df.drop(df.index[invalid_indices])
    # The descriptor table comes back with its own fresh index, so an
    # index-based join would attach descriptors to the wrong rows once any
    # row has been dropped. Give it the labels of the rows it was computed
    # from.
    descriptor_df.index = valid_df.index
    return valid_df.join(descriptor_df)
def smile_to_mordred(smi, imputer_dict=None):
    """Return the 2D Mordred descriptors of a SMILES string as float32.

    Parameters
    ----------
    smi : str
        SMILES string, parsed with RDKit.
    imputer_dict : sequence, optional
        When given, its first element must provide ``'imputer'`` and
        ``'scaler'`` entries; the descriptor row is passed through the
        imputer's and then the scaler's ``transform``.

    Returns
    -------
    numpy.ndarray
        Flat float32 array of descriptor values.
    """
    mol = Chem.MolFromSmiles(smi)
    calc = Calculator(descriptors, ignore_3D=True)
    values = list(calc(mol).values())
    row = np.array(values).reshape(1, -1)
    if imputer_dict is not None:
        steps = imputer_dict[0]
        row = steps['scaler'].transform(steps['imputer'].transform(row))
    return row.flatten().astype(np.float32)
def get_MD(self, ignore_3D=True):
    """ Get MD ONLY for non-error cases

    The float64 descriptor table is stored in ``self._MD``; nothing is
    returned.
    """
    calc = Calculator(descriptors, ignore_3D=ignore_3D)
    error_positions = np.squeeze(np.argwhere(self._error_mask))
    # Per the original note: an entry is taken if it is not in the
    # index (error_cases).
    valid_mols = list_where(self._mol_lst, error_positions, False)
    descriptor_table = calc.pandas(valid_mols)
    self._MD = descriptor_table.astype("float64")
def mol_to_mordred(mols, features=None):
    """Compute Mordred features for a mapping of label -> molecule.

    Parameters
    ----------
    mols : mapping
        Its values are the molecules; its keys become the row labels.
    features : optional
        Column selection applied to the computed feature table.

    Returns
    -------
    pandas.DataFrame
        One row per molecule, one column per feature.
    """
    calc = Calculator(all_descriptors)
    print("\nComputing Mordred features...")
    table = calc.pandas(mols.values())
    # Use NaN instead of Missing object
    table = table.fill_missing()
    if features is not None:
        # Retain only the specified features
        table = table[features]
    mordred = pd.DataFrame(table.values, index=mols.keys(),
                           columns=table.columns)
    print("There are %d molecules and %d features" % mordred.shape)
    return mordred
def __init__(self, rank=None, args=None):
    """Create the 2D Mordred calculator and store ``rank`` and ``args``.

    Raises
    ------
    ValueError
        If ``rank`` or ``args`` is ``None``.
    """
    self.calc = Calculator(descriptors, ignore_3D=True)

    if rank is None:
        raise ValueError('rank is not set properly')
    self.rank = rank

    if args is None:
        raise ValueError('args is not set properly')
    self.args = args
def __init__(self, dict_mode=True, auto_correct=True, ignore_3D=True):
    """Set up the Mordred calculator and the list of descriptor names.

    ``self.desc_list`` holds the column names obtained by running the
    calculator on the molecule for SMILES ``"C"``, each prefixed with
    ``"Mordred_desc_"``.
    """
    from mordred import Calculator, descriptors

    super(RDKitDescriptors, self).__init__()
    self.dict_mode = dict_mode
    self.calculator = Calculator(descriptors, ignore_3D=ignore_3D)
    self.auto_correct = auto_correct

    probe = self.calculator.pandas([mol_from_smiles("C")])
    raw_names = list(probe.columns)
    self.desc_list = ["Mordred_desc_" + desc_name for desc_name in raw_names]
def smile_to_mordred(smi, imputer_dict=None, userdkit=False):
    """Return 2D Mordred descriptors as a flat float32 array.

    Parameters
    ----------
    smi :
        A SMILES string when ``userdkit`` is true; otherwise it is passed to
        the calculator as-is.
    imputer_dict : sequence, optional
        When given, its first element must provide ``'imputer'`` and
        ``'scaler'`` entries; the descriptor row is passed through the
        imputer's and then the scaler's ``transform``.
    userdkit : bool
        Parse ``smi`` with RDKit first.

    Returns
    -------
    numpy.ndarray
        Flat float32 array; NaN and infinities are replaced by 0 before any
        imputer/scaler is applied.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    if userdkit:
        smi = Chem.MolFromSmiles(smi)
        assert smi is not None
    values = list(calc(smi).values())
    row = np.array(values).reshape(1, -1).astype(np.float32)
    row = np.nan_to_num(row, posinf=0, neginf=0, nan=0)
    if imputer_dict is not None:
        steps = imputer_dict[0]
        row = steps['scaler'].transform(steps['imputer'].transform(row))
    return row.flatten().astype(np.float32)