def calc_properties(smiles_list):
    """Tabulate logP and exact molecular weight for a collection of SMILES.

    Every entry is parsed with ``Chem.MolFromSmiles``.  ``MolLogP`` is called
    with ``True`` as its second positional argument and ``ExactMolWt`` with
    the molecule alone.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, processed in iteration order.

    Returns
    -------
    pandas.DataFrame
        Columns ``'logp'`` and ``'mw'``, one row per input string.
    """
    # Collect (logp, mw) pairs in one pass, then split them into columns.
    pairs = []
    for entry in smiles_list:
        parsed = Chem.MolFromSmiles(entry)
        pairs.append((MolLogP(parsed, True), ExactMolWt(parsed)))

    return pd.DataFrame({
        'logp': [logp for logp, _ in pairs],
        'mw': [mw for _, mw in pairs],
    })
def score(self, smiles):
    """Return the sum of three standardised terms for a SMILES string.

    The terms are:

    * ``MolLogP`` of the parsed molecule (``-1000`` if that call raises),
    * the negated ``sascorer.calculateScore`` value,
    * a ring penalty: minus the number of atoms by which the largest cycle
      in the molecule's cycle basis exceeds 6 (zero when there is no cycle
      or the largest one has at most 6 members).

    Each term is standardised with the matching ``self._*_mean`` /
    ``self._*_std`` attributes before the three are added.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to score.

    Returns
    -------
    The sum ``sa_score_norm + logp_norm + cycle_score_norm``.
    """
    mol = Chem.MolFromSmiles(smiles)

    try:
        logp = MolLogP(mol)
    except Exception:
        # Keep the original sentinel, but do not swallow KeyboardInterrupt /
        # SystemExit the way a bare ``except:`` does.
        logp = -1000

    sa_score = -sascorer.calculateScore(mol)

    # Size of the largest ring in a cycle basis of the molecular graph.
    cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    largest_cycle = max(len(cycle) for cycle in cycle_list) if cycle_list else 0
    # Only the excess over 6 atoms is penalised.
    cycle_score = -max(largest_cycle - 6, 0)

    sa_score_norm = (sa_score - self._sa_mean) / self._sa_std
    logp_norm = (logp - self._logp_mean) / self._logp_std
    cycle_score_norm = (cycle_score - self._cycle_mean) / self._cycle_std

    return sa_score_norm + logp_norm + cycle_score_norm
def eval_function(text):
    """Score a generated token sequence by its scaled, min-max normalised logP.

    The tokens are joined, decoded and sanitised with
    ``DeepSMILESLanguageModelUtils``.  If either step raises, the score is 0.
    Otherwise the score is
    ``(factor * MolLogP(mol) - logp_min) / (logp_max - logp_min)``, where
    ``factor``, ``logp_min`` and ``logp_max`` come from the enclosing scope.

    Parameters
    ----------
    text : iterable of str
        Generated tokens; they are concatenated with ``''.join``.

    Returns
    -------
    0 when decoding/sanitising fails, otherwise the normalised logP score.
    """
    generated = ''.join(text)
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        # Keep the sanitised SMILES; previously it was thrown away here and
        # the decode + sanitize pair was executed a second time below.
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        return 0

    # Disabled length-based reward, kept for reference:
    # extracted = DeepSMILESLanguageModelUtils.extract(generated)
    # tokenized = DeepSMILESTokenizer(extracted)
    # len_score = len(tokenized.get_tokens()) / (text_length - 1)  # provide more reward for longer text sequences

    mol = Chem.MolFromSmiles(smiles)
    logp = factor * MolLogP(mol)
    logp_score = (logp - logp_min) / (logp_max - logp_min)  # normalize logP between 0 and 1
    score = logp_score  # (logp_score * 0.5) + (len_score * 0.5)

    # Lazy %-arguments produce the same message as the eager formatting did.
    logger.info("%s, %s", generated, score)
    return score
def save(self, force_insert=False, force_update=False, using=None,
         update_fields=None, *args, **kwargs):
    """Fill the RDKit-derived fields from ``self.smiles`` and save the record.

    When ``self.smiles`` is truthy, the molecule, mol block, exact molecular
    weight, logP, H-bond acceptor/donor counts, TPSA and rotatable-bond count
    are assigned, the record is saved, and then the formula and the Morgan
    bit-vector fingerprint expression are assigned.  A ``ValueError`` or
    ``TypeError`` raised anywhere in that sequence is reported on stdout and
    the remaining assignments are skipped; fields assigned before the error
    keep their new values.  The record is always saved once more at the end.

    NOTE(review): ``force_insert``, ``force_update``, ``using`` and
    ``update_fields`` are accepted but never passed to the parent ``save``;
    only ``*args`` / ``**kwargs`` are forwarded.  Forwarding them is not
    trivial because the parent ``save`` may run twice per call — confirm the
    intended behaviour before changing it.
    """
    smiles = self.smiles
    if smiles:
        try:
            self.mol = Chem.MolFromSmiles(smiles)
            self.mol_block = Chem.MolToMolBlock(self.mol)
            self.mol_weight = Descriptors.ExactMolWt(self.mol)
            self.alogp = MolLogP(self.mol)
            self.hba = NumHAcceptors(self.mol)
            self.hbd = NumHDonors(self.mol)
            self.psa = Chem.MolSurf.TPSA(self.mol)
            self.rtb = NumRotatableBonds(self.mol)
            # Intermediate save before the formula / fingerprint assignment.
            super(Compound, self).save(*args, **kwargs)
            self.formula = Chem.rdMolDescriptors.CalcMolFormula(self.mol)
            self.bfp = MORGANBV_FP(Value(smiles))
        except (ValueError, TypeError):
            # Call form of print: prints the same text on Python 2 and is
            # valid syntax on Python 3 (the statement form is not).
            print("Error when storing mol object")
    super(Compound, self).save(*args, **kwargs)
def save(self, force_insert=False, force_update=False, using=None, update_fields=None):
    """Set the ChEMBL URL and RDKit-derived fields, then save the record.

    The record is saved right after the URL is built from
    ``self.molecule_chembl_id``.  If ``self.molecule_smile`` is truthy, the
    molecule, mol block, exact molecular weight, logP, H-bond
    acceptor/donor counts, TPSA and rotatable-bond count are assigned and
    the record is saved again before the formula and fingerprint expression
    are assigned.  A ``ValueError`` / ``TypeError`` during that sequence is
    reported on stdout.  A final save always follows.

    NOTE(review): the four keyword parameters are accepted but not passed to
    the parent ``save`` — confirm whether that is intended.
    """
    # One bound reference to the parent implementation, reused for each save.
    persist = super(ChEMBL_small_molecule, self).save

    self.molecule_chembl_id_url = 'https://www.ebi.ac.uk/chembl/compound/inspect/{}'.format(
        self.molecule_chembl_id)
    persist()

    smiles = self.molecule_smile
    if smiles:
        try:
            self.mol = Chem.MolFromSmiles(smiles)
            self.mol_block = Chem.MolToMolBlock(self.mol)
            self.mol_weight = Descriptors.ExactMolWt(self.mol)
            self.alogp = MolLogP(self.mol)
            self.hba = NumHAcceptors(self.mol)
            self.hbd = NumHDonors(self.mol)
            self.psa = Chem.MolSurf.TPSA(self.mol)
            self.rtb = NumRotatableBonds(self.mol)
            persist()
            self.formula = Chem.rdMolDescriptors.CalcMolFormula(self.mol)
            self.bfp = MORGANBV_FP(Value(smiles))
        except (ValueError, TypeError):
            print('Error when storing mol object')

    persist()
def get_logP(smi_list):
    """Return the ``MolLogP`` value of every SMILES string in ``smi_list``.

    Each string is parsed with ``Chem.MolFromSmiles`` and the parsed molecule
    is passed straight to ``MolLogP``.  The result is a list in input order.
    """
    return [MolLogP(Chem.MolFromSmiles(entry)) for entry in smi_list]
def preprocess(dataset, dir_input):
    """Featurise a SMILES/adduct/CCS table and write the arrays to ``dir_input``.

    Parameters
    ----------
    dataset :
        Mapping-like object indexed by the keys ``'SMILES'``, ``'Adducts'``
        and ``'CCS'``.
    dir_input : str
        Output directory; created if missing.

    Side effects
    ------------
    Writes ``Smiles.txt``, the ``molecules``, ``adducts``, ``adjacencies``,
    ``properties``, ``descriptors``, ``mean`` and ``std`` arrays (via
    ``np.save``) and ``fingerprint_dict.pickle`` into ``dir_input``.

    Notes
    -----
    * Rows whose SMILES contains ``'.'`` are skipped.
    * ``radius`` and ``fingerprint_dict`` are read from the enclosing scope.
    * The CCS values are standardised with the mean and standard deviation
      of the retained rows; both statistics are saved as well.
    * NOTE(review): ``adducts`` is encoded for *every* input row, while the
      other outputs omit the skipped rows.  If any SMILES contains ``'.'``
      the saved arrays will differ in length — confirm how downstream code
      handles this.
    """
    train_smiles = list(dataset['SMILES'])
    train_adducts = dataset['Adducts']
    train_ccs = list(dataset['CCS'])

    adducts_encoder = AdductToOneHotEncoder()
    adducts_encoder.fit(train_adducts)
    adducts = adducts_encoder.transform(train_adducts)

    # Collect lines and join once instead of growing a string with ``+=``.
    smiles_lines = []
    molecules, adjacencies, properties, descriptors = [], [], [], []

    for i, smi in enumerate(train_smiles):
        if '.' in smi:
            continue

        # Round-trip through RDKit so the stored SMILES is RDKit's own output.
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
        mol = Chem.AddHs(Chem.MolFromSmiles(smi))

        atoms = create_atoms(mol)
        i_jbond_dict = create_ijbonddict(mol)
        fingerprints = extract_fingerprints(atoms, i_jbond_dict, radius)
        adjacency = create_adjacency(mol)

        smiles_lines.append(smi + '\n')
        molecules.append(fingerprints)
        adjacencies.append(adjacency)
        properties.append([[train_ccs[i]]])
        descriptors.append([
            ExactMolWt(mol),
            MolLogP(mol),
            GetFormalCharge(mol),
            CalcNumRings(mol),
            CalcNumRotatableBonds(mol),
            CalcLogS(mol),
            AcidCount(mol),
            BaseCount(mol),
            APolar(mol),
            BPolar(mol),
        ])

    properties = np.array(properties)
    mean, std = np.mean(properties), np.std(properties)
    properties = np.array((properties - mean) / std)

    os.makedirs(dir_input, exist_ok=True)

    def _target(name):
        # os.path.join keeps outputs inside ``dir_input`` even when the
        # caller omits the trailing separator; plain ``+`` did not.
        return os.path.join(dir_input, name)

    with open(_target('Smiles.txt'), 'w') as f:
        f.write(''.join(smiles_lines))
    np.save(_target('molecules'), molecules)
    np.save(_target('adducts'), adducts)
    np.save(_target('adjacencies'), adjacencies)
    np.save(_target('properties'), properties)
    np.save(_target('descriptors'), descriptors)
    np.save(_target('mean'), mean)
    np.save(_target('std'), std)
    dump_dictionary(fingerprint_dict, _target('fingerprint_dict.pickle'))
def calc_properties(smi):
    """Return logP, TPSA, MW and MR for one SMILES value.

    ``smi.numpy()`` is passed to ``Chem.MolFromSmiles``; the parsed molecule
    is then fed to ``MolLogP``, ``CalcTPSA``, ``ExactMolWt`` and ``MolMR`` in
    that order.  The synthetic-accessibility score is not computed.

    Returns
    -------
    tuple
        ``(logP, tpsa, mw, mr)``, each wrapped with ``np.asarray``.
    """
    mol = Chem.MolFromSmiles(smi.numpy())
    raw_values = [descriptor(mol) for descriptor in (MolLogP, CalcTPSA, ExactMolWt, MolMR)]
    return tuple(np.asarray(value) for value in raw_values)
def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
    """Build a table of descriptors and model scores for the given SMILES.

    Parameters
    ----------
    smiles : List[str]
        SMILES strings to analyse.
    only_drugs : bool
        When true, keep only rows with weight < 500, at most 5 H-bond
        donors, at most 10 H-bond acceptors, logP <= 5, and safety and
        feasibility scores above 0.75; the index is then renumbered.

    Returns
    -------
    pd.DataFrame
        Columns ``key``, ``smiles``, ``weight``, ``logp``, ``hdonors``,
        ``hacceptors``, ``safety``, ``feasibility`` and ``bbbp``.

    Raises
    ------
    ValueError
        If ``MolFromSmiles`` yields a falsy result for any entry.
    """
    features = self.preprocessor.transform(smiles)

    # RDKit molecular properties, one tuple per molecule:
    # (inchikey, weight, logp, hdonors, hacceptors)
    per_molecule = []
    for example in smiles:
        mol = MolFromSmiles(example)
        if not mol:
            raise ValueError("Malformed molecule passed in to analyze")
        per_molecule.append((
            MolToInchiKey(mol),
            ExactMolWt(mol),
            MolLogP(mol),
            NumHDonors(mol),
            NumHAcceptors(mol),
        ))

    # Scores
    safety = self.safety.predict(features)
    feasibility = self.feasibility.predict(features)
    bbbp = self.bbbp.predict_proba(features)

    dataframe = pd.DataFrame(
        {
            "key": [entry[0] for entry in per_molecule],
            "smiles": smiles,
            "weight": [entry[1] for entry in per_molecule],
            "logp": [entry[2] for entry in per_molecule],
            "hdonors": [entry[3] for entry in per_molecule],
            "hacceptors": [entry[4] for entry in per_molecule],
            "safety": safety,
            "feasibility": feasibility,
            # Second element of each predict_proba row.
            "bbbp": [proba[1] for proba in bbbp],
        }
    )

    if only_drugs:
        keep = (
            # Lipinski's rules
            (dataframe.weight < 500)
            & (dataframe.hdonors <= 5)
            & (dataframe.hacceptors <= 10)
            & (dataframe.logp <= 5)
            # Filter too toxic and infeasible compounds
            & (dataframe.safety > 0.75)
            & (dataframe.feasibility > 0.75)
        )
        dataframe = dataframe[keep].reset_index(drop=True)

    return dataframe
def process_smile(row):
    """Return ``(smiles, logP, length)`` for one input row.

    ``row`` is stripped of surrounding whitespace, parsed with
    ``Chem.MolFromSmiles`` and passed to ``MolLogP``.  ``length`` is the
    number of characters in the stripped string.

    Returns
    -------
    tuple
        ``(smi, logP, length)`` on success, or ``(None, None, None)`` if any
        step raises (for example when ``row`` has no ``strip`` method).
    """
    try:
        smi = row.strip()
        mol = Chem.MolFromSmiles(smi)
        logP = MolLogP(mol)
    except Exception:
        # Best-effort contract kept: any ordinary failure yields the
        # all-None triple.  Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        return None, None, None
    return smi, logP, len(smi)
def check_lipinski(mol):
    """Check donor/acceptor/logP thresholds on ``mol.rdmol``.

    The check passes when there are at most 5 H-bond donors, at most 5
    H-bond acceptors and logP is below 5.  When it passes and the molecular
    weight is at least 450, a random entry of the ``'terminal_fg'``
    functional-group collection is joined onto ``mol``.

    NOTE(review): the acceptor threshold here is 5; Lipinski's rule is
    usually quoted with 10 — confirm this stricter bound is intended.

    Returns
    -------
    tuple
        ``(True, False)`` when the thresholds are met, else ``(False, False)``.
    """
    fgs = load_functional_groups()

    h_donors = Lipinski.NumHDonors(mol.rdmol)
    h_acceptors = Lipinski.NumHAcceptors(mol.rdmol)
    log_p = MolLogP(mol.rdmol)
    wt = MolWt(mol.rdmol)

    within_limits = h_donors <= 5 and h_acceptors <= 5 and log_p < 5
    if not within_limits:
        return False, False

    if wt >= 450:
        mol.join(fgs['terminal_fg'].get_random())
    return True, False
def read_ZINC_smiles(file_name, num_mol):
    """Read the first ``num_mol`` SMILES from a file and compute their logP.

    Parameters
    ----------
    file_name : str
        Path of a text file with one SMILES string per line.
    num_mol : int
        Number of leading lines to process.  An ``IndexError`` is raised if
        the file has fewer lines.

    Returns
    -------
    tuple
        ``(smi_list, logP_list)``: the stripped SMILES strings and a float
        NumPy array of their ``MolLogP`` values.
    """
    # Context manager so the handle is closed; it used to be left open.
    with open(file_name, 'r') as f:
        contents = f.readlines()

    smi_list = []
    logP_list = []
    for i in tqdm_notebook(range(num_mol), desc='Reading Data'):
        smi = contents[i].strip()
        m = Chem.MolFromSmiles(smi)
        smi_list.append(smi)
        logP_list.append(MolLogP(m))

    logP_list = np.asarray(logP_list).astype(float)
    return smi_list, logP_list
def read_ZINC_smiles(num_mol):
    """Read the first ``num_mol`` SMILES of ``ZINC.smiles`` with logP and TPSA.

    The file ``ZINC.smiles`` is opened relative to the current working
    directory and is expected to hold one SMILES string per line.

    Parameters
    ----------
    num_mol : int
        Number of leading lines to process.  An ``IndexError`` is raised if
        the file has fewer lines.

    Returns
    -------
    tuple
        ``(smi_list, logP_list, tpsa_list)``: the stripped SMILES strings and
        float NumPy arrays of ``MolLogP`` and ``CalcTPSA`` values.
    """
    # Context manager so the handle is closed; it used to be left open.
    with open('ZINC.smiles', 'r') as f:
        contents = f.readlines()

    smi_list = []
    logP_list = []
    tpsa_list = []
    for i in range(num_mol):
        smi = contents[i].strip()
        m = Chem.MolFromSmiles(smi)
        smi_list.append(smi)
        logP_list.append(MolLogP(m))
        tpsa_list.append(CalcTPSA(m))

    logP_list = np.asarray(logP_list).astype(float)
    tpsa_list = np.asarray(tpsa_list).astype(float)
    return smi_list, logP_list, tpsa_list
def get_global_features(mol):
    """Assemble the molecule-level feature vector.

    Parameters
    ----------
    mol : rdkit mol

    Returns
    -------
    np.ndarray
        ``float32`` array ``[MolWt, CalcTPSA, MolLogP, NumHDonors]``
        evaluated on ``mol``, in that order.
    """
    # MW, TPSA, logP, n.hdonors
    feature_values = [MolWt(mol), CalcTPSA(mol), MolLogP(mol), NumHDonors(mol)]
    return np.array(feature_values, dtype=np.float32)
def calc_properties(smi):
    """Return standardised logP, TPSA, MW and MR for one SMILES value.

    ``smi.numpy()`` is parsed with ``Chem.MolFromSmiles``.  logP is
    standardised directly with ``LOGP_MEAN`` / ``LOGP_STD``.  TPSA, MW and
    MR are first transformed with ``log10(x + 1)`` and then standardised
    with their own ``*_MEAN`` / ``*_STD`` constants.  The
    synthetic-accessibility score is not computed.

    Returns
    -------
    tuple
        ``(logP, tpsa, mw, mr)``.
    """
    mol = Chem.MolFromSmiles(smi.numpy())

    def _log_standardise(raw, mean, std):
        # log10(x + 1) followed by (x - mean) / std.
        return (np.log10(np.asarray(raw) + 1) - mean) / std

    logP = (np.asarray(MolLogP(mol)) - LOGP_MEAN) / LOGP_STD
    tpsa = _log_standardise(CalcTPSA(mol), TPSA_MEAN, TPSA_STD)
    mw = _log_standardise(ExactMolWt(mol), MW_MEAN, MW_STD)
    mr = _log_standardise(MolMR(mol), MR_MEAN, MR_STD)
    return logP, tpsa, mw, mr
def read_ZINC(num_mol):
    """Read the first ``num_mol`` molecules of ``ZINC.smiles`` as features.

    The file ``ZINC.smiles`` is opened relative to the current working
    directory and is expected to hold one SMILES string per line.  For each
    molecule a Morgan bit-vector fingerprint (radius argument 2) is
    converted to a NumPy array, and ``MolLogP`` and ``CalcTPSA`` are
    evaluated.

    Parameters
    ----------
    num_mol : int
        Number of leading lines to process.  An ``IndexError`` is raised if
        the file has fewer lines.

    Returns
    -------
    tuple
        ``(fps, logP, tpsa)`` as NumPy arrays.
    """
    # Context manager so the handle is closed; it used to be left open.
    with open('ZINC.smiles', 'r') as f:
        contents = f.readlines()

    fps = []
    logP = []
    tpsa = []
    for i in range(num_mol):
        smi = contents[i].strip()
        m = Chem.MolFromSmiles(smi)

        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2)
        # Destination array filled in place by ConvertToNumpyArray.
        arr = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(fp, arr)

        fps.append(arr)
        logP.append(MolLogP(m))
        tpsa.append(CalcTPSA(m))

    fps = np.asarray(fps)
    logP = np.asarray(logP)
    tpsa = np.asarray(tpsa)
    return fps, logP, tpsa
def calc_properties(smi):
    """
    Compute standardised descriptors for one SMILES value.

    ``smi.numpy()`` is parsed with ``Chem.MolFromSmiles``.  logP is
    standardised with ``LOGP_MEAN`` / ``LOGP_STD``; TPSA, MW and MR go
    through ``log10(x + 1)`` before being standardised with their own
    ``*_MEAN`` / ``*_STD`` constants.  The synthetic-accessibility score is
    not computed.

    :param smi: object whose ``numpy()`` result is a SMILES accepted by
        ``Chem.MolFromSmiles``
    :return: logP, TPSA, MR, MW
    """
    mol = Chem.MolFromSmiles(smi.numpy())

    logP = (np.asarray(MolLogP(mol)) - LOGP_MEAN) / LOGP_STD

    log_tpsa = np.log10(np.asarray(CalcTPSA(mol)) + 1)
    tpsa = (log_tpsa - TPSA_MEAN) / TPSA_STD

    log_mw = np.log10(np.asarray(ExactMolWt(mol)) + 1)
    mw = (log_mw - MW_MEAN) / MW_STD

    log_mr = np.log10(np.asarray(MolMR(mol)) + 1)
    mr = (log_mr - MR_MEAN) / MR_STD

    # MR comes before MW in the returned tuple.
    return logP, tpsa, mr, mw
def read_ZINC(num_mol):
    """Read the first ``num_mol`` SMILES of the ZINC file with logP and TPSA.

    The file ``../Data/logP/ZINC.smiles`` is opened relative to the current
    working directory and is expected to hold one SMILES string per line.

    Parameters
    ----------
    num_mol : int
        Number of leading lines to process.  An ``IndexError`` is raised if
        the file has fewer lines.

    Returns
    -------
    tuple
        ``(list_smi, logP, tpsa)``: the stripped SMILES strings and float
        NumPy arrays of ``MolLogP`` and ``CalcTPSA`` values.

    Notes
    -----
    A Morgan fingerprint used to be computed per molecule here but was
    never returned or otherwise used, so that dead computation is gone.
    """
    # Context manager so the handle is closed; it used to be left open.
    with open('../Data/logP/ZINC.smiles', 'r') as f:
        contents = f.readlines()

    list_smi = []
    logP = []
    tpsa = []
    for i in tqdm_notebook(range(num_mol)):
        smi = contents[i].strip()
        list_smi.append(smi)
        m = Chem.MolFromSmiles(smi)
        logP.append(MolLogP(m))
        tpsa.append(CalcTPSA(m))

    logP = np.asarray(logP).astype(float)
    tpsa = np.asarray(tpsa).astype(float)
    return list_smi, logP, tpsa
# Per-molecule accumulators for the summary table built below.
# NOTE(review): `mws` is appended to in the loop but is not initialised in
# this excerpt — presumably it is defined just above; confirm.
logps = []
nhdonors = []
values = []
dataset = []
# Visit every dataset named in LABEL_GUIDE, plus the extra "cyp" dataset.
for data in list(LABEL_GUIDE.keys()) + ["cyp"]:
    # Each file unpickles to a pair: the InChI strings and their values.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use it on trusted data files.
    with open(os.path.join(DATA_PATH, data, f"data_{data}.pt"), "rb") as handle:
        inchis, v = pickle.load(handle)
    values.extend(v)
    for inchi in tqdm(inchis):
        mol = MolFromInchi(inchi)
        mws.append(MolWt(mol))
        logps.append(MolLogP(mol))
        nhdonors.append(NumHDonors(mol))
        # One dataset label per molecule so every column has equal length.
        dataset.append(DATASET_GUIDE[data])
# One row per molecule: three descriptors, the stored value and the dataset.
df = pd.DataFrame({
    "Molecular weight (gr./mol)": mws,
    r"aLog$P$": logps,
    "No. hydrogen donors": nhdonors,
    "values": values,
    "dataset": dataset,
})
# A 1x3 grid of axes; grid lines are switched on for the first two here.
f, axs = plt.subplots(1, 3, figsize=(18, 6))
axs[0].grid(alpha=0.5)
axs[1].grid(alpha=0.5)
import numpy as np
from utils import *
from rdkit import Chem
from rdkit.Chem.Crippen import MolLogP

# Script: compute MolLogP for the first N encoded molecules and save the
# values to ./inputs/logP.npy.
molecules = np.load("./inputs/molecules.npy")
char = np.load("./inputs/char.npy")

N = 200000

# Each encoded row is turned back into SMILES, parsed, and scored.
logP_list = [
    MolLogP(Chem.MolFromSmiles(convert_to_smiles(molecules[idx], char)))
    for idx in range(N)
]

np.save("./inputs/logP.npy", np.asarray(logP_list))
def cal_prop(s):
    """Return a tuple of descriptors for the SMILES ``s``, or ``None``.

    ``None`` is returned when ``Chem.MolFromSmiles`` yields ``None``.
    Otherwise the result is ``(Chem.MolToSmiles(mol), ExactMolWt(mol),
    MolLogP(mol), CalcNumHBD(mol), CalcNumHBA(mol), CalcTPSA(mol))``.
    """
    mol = Chem.MolFromSmiles(s)
    if mol is not None:
        return (
            Chem.MolToSmiles(mol),
            ExactMolWt(mol),
            MolLogP(mol),
            CalcNumHBD(mol),
            CalcNumHBA(mol),
            CalcTPSA(mol),
        )
    return None
def convert_to_clogp(SMILES):
    """Return ``MolLogP`` of the molecule obtained by calling ``MS(SMILES)``."""
    return MolLogP(MS(SMILES))
def logP_benchmark(smi):
    """Return ``MolLogP`` for one SMILES value, wrapped with ``np.asarray``.

    ``smi.numpy()`` is what gets passed to ``Chem.MolFromSmiles``.
    """
    mol = Chem.MolFromSmiles(smi.numpy())
    return np.asarray(MolLogP(mol))
# Sampling script: restore the model, draw latent vectors, decode them to
# SMILES and write the valid, de-duplicated molecules with MW / logP / TPSA.
_, _, char, vocab, _, _ = load_data(args.prop_file, args.seq_length)
vocab_size = len(char)

model = GNMTP(vocab_size, args)
model.restore(args.save_file)

# The same target-property row and start symbol are repeated for each batch entry.
target_prop = np.array([[float(p) for p in args.target_prop.split()] for _ in range(args.batch_size)])
start_codon = np.array([np.array(list(map(vocab.get, 'X'))) for _ in range(args.batch_size)])

smiles = []
for _ in range(args.num_iteration):
    # The unused alias `s` that was bound alongside latent_vector is dropped.
    latent_vector = np.random.normal(args.mean, args.stddev, (args.batch_size, args.latent_size))
    generated = model.sample(latent_vector, target_prop, start_codon, args.seq_length)
    smiles += [convert_to_smiles(sample, char) for sample in generated]

print ('number of trial : ', len(smiles))

# Keep the text before the first 'E' of each sample and drop duplicates.
smiles = list({s.split('E')[0] for s in smiles})
print ('number of generated smiles : ', len(smiles))

ms = [Chem.MolFromSmiles(s) for s in smiles]
ms = [m for m in ms if m is not None]
print ('number of valid smiles : ', len(ms))

with open(args.result_filename, 'w') as w:
    w.write('smiles\t MW\t LogP\t TPSA\n')
    for m in ms:
        try:
            w.write('%s\t%.3f\t%.3f\t%.3f\n' % (Chem.MolToSmiles(m), ExactMolWt(m), MolLogP(m), CalcTPSA(m)))
        except Exception:
            # Skip molecules whose descriptors cannot be computed, but let
            # KeyboardInterrupt / SystemExit through (a bare except did not).
            continue
def lipinski_filter(smiles):
    """Return whether the SMILES passes the four descriptor thresholds.

    The molecule passes when ``MolLogP <= 5``, ``NumHAcceptors <= 10``,
    ``NumHDonors <= 5`` and ``100 <= ExactMolWt <= 500``.  The checks run in
    that order and stop at the first failure.
    """
    mol = MolFromSmiles(smiles)
    if not MolLogP(mol) <= 5:
        return False
    if not NumHAcceptors(mol) <= 10:
        return False
    if not NumHDonors(mol) <= 5:
        return False
    return 100 <= ExactMolWt(mol) <= 500
def get_logP(mol):
    '''
    clogP or LogP: return the value ``MolLogP`` computes for ``mol``.
    '''
    logp_value = MolLogP(mol)
    return logp_value
# Tail of a list comprehension whose opening bracket lies above this
# excerpt: it keeps, from the first 20000 SMILES, those with fewer than
# 50 atoms.
    s for s in smiles[:20000] if Chem.MolFromSmiles(s).GetNumAtoms() < 50
]
print('Number of smiles:', len(smiles))

# Compute the MolLogP value of up to `num_data` SMILES and time the loop.
Y = []
num_data = 20000
st = time.time()
for s in smiles[:num_data]:
    m = Chem.MolFromSmiles(s)
    logp = MolLogP(m)
    Y.append(logp)
end = time.time()
print(f'Time:{(end-st):.3f}')

# Dataset
from torch.utils.data import Dataset, DataLoader
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
# The class body continues beyond this excerpt.
class MolDataset(Dataset):
score = logp_score # (logp_score * 0.5) + (len_score * 0.5) logger.info("%s, %s" % (generated, str(score))) return score # mcts = LanguageModelMCTSWithUCB1(lm, width, text_length, eval_function) mcts = LanguageModelMCTSWithPUCT(lm, width, text_length, eval_function, cpuct=5) state = start_state logger.info("beginning search...") mcts.search(state, num_simulations) best = mcts.get_best_sequence() generated_text = ''.join(best[0]) logger.info("generated text: %s (score: %s, perplexity: %s)" % (generated_text, str(best[1]), lm.perplexity(generated_text))) decoded = DeepSMILESLanguageModelUtils.decode(generated_text, start='<s>', end='</s>') smiles = DeepSMILESLanguageModelUtils.sanitize(decoded) mol = Chem.MolFromSmiles(smiles) logp = MolLogP(mol) logger.info("SMILES: %s, logP: %s" % (smiles, logp))
# Random-search baseline: sample 1000 sequences from the language model and
# keep the valid molecule with the LOWEST MolLogP value.
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

current_best_score = None
current_best_smiles = None


def beats_current(score):
    """Return True when ``score`` is lower than the current best score."""
    return score < current_best_score


for _ in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)
        mol = Chem.MolFromSmiles(sanitized)
        logp_score = MolLogP(mol)
    except Exception as e:
        # Failed samples are still skipped, but the reason is now recorded
        # at debug level instead of being silently discarded.
        logger.debug("skipping %r: %s", generated, e)
        continue

    logger.info("successful: %s , score: %s", sanitized, logp_score)
    if current_best_score is None or beats_current(logp_score):
        current_best_score = logp_score
        current_best_smiles = sanitized

logger.info("best: %s , score: %s", current_best_smiles, current_best_score)
from rdkit import Chem
from rdkit.Chem.Crippen import MolLogP

# Script: each line of ../id_smiles.txt holds an id and two SMILES separated
# by whitespace.  For every line where both SMILES parse, write the id and
# the two MolLogP values, tab-separated, to data.txt.
with open('../id_smiles.txt') as f, open('data.txt', 'w') as w:
    for line in f:
        # A blank line (e.g. at the end of the file) would otherwise make
        # the three-way unpacking below raise ValueError.
        if not line.strip():
            continue
        m_id, s1, s2 = line.split()
        m1, m2 = Chem.MolFromSmiles(s1), Chem.MolFromSmiles(s2)
        if m1 is None or m2 is None:
            # Skip pairs where either SMILES fails to parse.
            continue
        c1, c2 = MolLogP(m1), MolLogP(m2)
        w.write(m_id + '\t' + str(c1) + '\t' + str(c2) + '\n')