def __init__(self, num_iterations, keep_top_n, time_per_iteration_minutes,
             kenlm_bin_dir="/Users/luis/kenlm/build/bin"):
    """Store the search settings and build the KenLM trainer and converter.

    Args:
        num_iterations: Stored on the instance as ``num_iterations``.
        keep_top_n: Stored on the instance as ``keep_top_n``.
        time_per_iteration_minutes: Stored on the instance as
            ``time_per_iteration_minutes``.
        kenlm_bin_dir: Directory prepended to ``PATH`` in the environment
            handed to ``KenLMTrainer``.  Defaults to the location that was
            previously hard-coded; pass ``None`` or ``""`` to leave ``PATH``
            untouched.
    """
    self.num_iterations = num_iterations
    self.keep_top_n = keep_top_n
    self.time_per_iteration_minutes = time_per_iteration_minutes
    # No language model yet; it is assigned outside this constructor.
    self.lm = None

    # Work on a copy so the current process environment is not modified.
    env = os.environ.copy()
    # env.get() avoids a KeyError when PATH is not set, and empty parts are
    # dropped so the result never has a leading or trailing separator.
    env["PATH"] = os.pathsep.join(
        part for part in (kenlm_bin_dir, env.get("PATH", "")) if part)
    self.lm_trainer = KenLMTrainer(env)
    self.converter = Converter(rings=True, branches=True)
class DeepSmilesEncoder(FunctionApplier):
    """Apply a DeepSMILES converter's ``encode`` to SMILES input.

    The bound :meth:`encode` method is what gets handed to
    ``FunctionApplier.__init__``.
    """

    def __init__(self, astype=Series, branches=True, rings=True):
        """Create the encoder.

        Args:
            astype: Not read anywhere in this class.
                NOTE(review): possibly meant to be forwarded to
                ``FunctionApplier`` -- confirm against the base class.
            branches: Forwarded to the converter's ``branches`` option.
            rings: Forwarded to the converter's ``rings`` option.
        """
        super().__init__(self.encode)
        # Pass the flags by keyword.  deepsmiles' Converter declares them as
        # (rings, branches), so if DeepSmilesConverter is that class the old
        # positional call (branches, rings) would have swapped them.
        self.encoder = DeepSmilesConverter(branches=branches, rings=rings)

    def encode(self, smiles):
        """Return ``smiles`` encoded by the configured converter."""
        return self.encoder.encode(smiles)
# Write the run configuration to the log. The values are literal text inside
# the messages, not read from variables.
logger.info("TanimotoScorer(abilify, radius=6)")
logger.info("num_iterations = 100")
logger.info("simulations_per_iteration = 50000")
logger.info("keep_top_n = 5000")

# Load the vocabulary from the .arpa file and build the language model from
# the matching .klm file.
logger.info("loading language model...")
vocab = get_arpa_vocab(
    '../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.arpa')
lm = KenLMDeepSMILESLanguageModel(
    '../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.klm', vocab)

# Target string handed to the scorer (presumably a SMILES string -- confirm).
abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
scorer = TanimotoScorer(abilify, radius=6)
converter = Converter(rings=True, branches=True)

# Copy of the process environment with a machine-specific KenLM directory
# prepended to PATH; the copy is what the trainer receives.
# NOTE(review): env["PATH"] raises KeyError if PATH is not set.
env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def log_best(j, all_best, n_valid, lggr):
    """Log progress when ``j`` is a multiple of 1000.

    Writes the iteration number and ``n_valid`` to ``lggr`` and then calls
    ``log_top_best(all_best, 5, lggr)``.

    NOTE(review): the indentation was reconstructed from a flattened line;
    all three calls are assumed to sit under the ``j % 1000`` guard.
    """
    if j % 1000 == 0:
        lggr.info("--iteration: %d--" % j)
        lggr.info("num valid: %d" % n_valid)
        log_top_best(all_best, 5, lggr)


# NOTE(review): only the first statement of this function is visible in this
# section; the remainder of its body lies outside the view.
def smiles_to_deepsmiles(smiles):
    canonical = pybel.readstring(
        "smi", smiles).write("can").strip() # TODO do we need to canonicalize?
def __init__(self, astype=Series, branches=True, rings=True):
    """Register ``self.encode`` with the base class and build the converter.

    Args:
        astype: Not read anywhere in this method.
            NOTE(review): possibly meant to be forwarded to the base class
            -- confirm.
        branches: Forwarded to the converter's ``branches`` option.
        rings: Forwarded to the converter's ``rings`` option.
    """
    super().__init__(self.encode)
    # Pass the flags by keyword.  deepsmiles' Converter declares them as
    # (rings, branches), so if DeepSmilesConverter is that class the old
    # positional call (branches, rings) would have swapped them.
    self.encoder = DeepSmilesConverter(branches=branches, rings=rings)
# For every line of 'ms-nmr.txt' that contains '[', decode the DeepSMILES
# text found after the first '],', count C/H/O atoms and compute three
# substructure flags, then build a new record string (l1).
# NOTE(review): the statement that uses l1 (presumably a write to f1) is
# outside the visible section, and neither file is closed in the visible part.
from rdkit import Chem
from rdkit.Chem import AllChem
from deepsmiles import Converter
f = open('ms-nmr.txt')
f1 = open('ms-m-ir-nmr.txt', 'w')
# Flags computed below, per the original note: alcohol, ether, carbonyl
for line in f:
    if '[' in line:
        # The encoded molecule is the text after the first '],' on the line.
        smi = Converter(rings=True, branches=True).decode(line.split('],')[1])
        mol = Chem.MolFromSmiles(smi)
        # Add explicit hydrogens so that the 'H' count below sees them.
        mol = AllChem.AddHs(mol)
        # print(smi,mol.HasSubstructMatch(Chem.MolFromSmarts('[$(C[OX2H1]);!$([CX3](=O)[OX2H1])]')))
        # Debug output: the decoded SMILES and whether the c[OX2H1] pattern matches.
        print(smi, mol.HasSubstructMatch(Chem.MolFromSmarts('[$(c[OX2H1])]')))
        # print(smi,mol.HasSubstructMatch(Chem.MolFromSmarts('[$(C[OX2H1])&(c[OX2H1])]')))
        # o: 1 when either the C[OX2H1] or the c[OX2H1] pattern matches.
        o = 1 if mol.HasSubstructMatch(Chem.MolFromSmarts(
            '[$(C[OX2H1])]')) == True or mol.HasSubstructMatch(
                Chem.MolFromSmarts('[$(c[OX2H1])]')) == True else 0
        # e: 1 when either the [o]([c])[c] or the [O]([C])[C] pattern matches.
        e = 1 if mol.HasSubstructMatch(Chem.MolFromSmarts(
            '[o]([c])[c]')) == True or mol.HasSubstructMatch(
                Chem.MolFromSmarts('[O]([C])[C]')) == True else 0
        # carb: 1 when the [CX3]=[OX1] / [CX3+]-[OX1-] pattern matches.
        carb = 1 if mol.HasSubstructMatch(
            Chem.MolFromSmarts(
                '[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')) == True else 0
        atoms = mol.GetAtoms()
        # Element symbol of every atom, used for the counts below.
        l = [a.GetSymbol() for a in atoms]
        cs = l.count('C')
        hs = l.count('H')
        # NOTE(review): this name would shadow the stdlib `os` module if the
        # script ever imported it.
        os = l.count('O')
        #print(line,cs,hs,os)
        # New record: the text before the first ']', then ', ', then the six
        # numbers followed by ']' (str(list) with its leading '[' removed),
        # then the text between the first and second ']' of the input line.
        l1 = str(line.split(']')[0]) + ', ' + str(
            [cs, hs, os, o, e, carb]).split('[')[1] + line.split(']')[1]
class ChemgramsGoalDirectedGenerator(GoalDirectedGenerator):
    """Goal-directed generator that alternates sampling and retraining.

    Each iteration samples strings from the current KenLM DeepSMILES language
    model for a fixed time budget, scores the ones that decode and sanitize,
    and then trains a new language model on the best-scoring molecules seen
    so far.
    """

    # Directory holding the per-iteration datasets and retrained models.
    _MODEL_DIR = '../models/molexit'

    def __init__(self, num_iterations, keep_top_n, time_per_iteration_minutes,
                 kenlm_bin_dir="/Users/luis/kenlm/build/bin"):
        """Store the search settings and build the KenLM trainer and converter.

        Args:
            num_iterations: Maximum number of sample/retrain iterations.
            keep_top_n: Number of best-scoring molecules written to each
                retraining dataset.
            time_per_iteration_minutes: Sampling time budget per iteration,
                in minutes.
            kenlm_bin_dir: Directory prepended to ``PATH`` in the environment
                handed to ``KenLMTrainer``.  Defaults to the location that
                was previously hard-coded; pass ``None`` or ``""`` to leave
                ``PATH`` untouched.
        """
        self.num_iterations = num_iterations
        self.keep_top_n = keep_top_n
        self.time_per_iteration_minutes = time_per_iteration_minutes
        # Set in generate_optimized_molecules() and replaced by retrain().
        self.lm = None

        # Work on a copy so the current process environment is not modified.
        env = os.environ.copy()
        # env.get() avoids a KeyError when PATH is not set, and empty parts
        # are dropped so the result never has a stray separator.
        env["PATH"] = os.pathsep.join(
            part for part in (kenlm_bin_dir, env.get("PATH", "")) if part)
        self.lm_trainer = KenLMTrainer(env)
        self.converter = Converter(rings=True, branches=True)

    @staticmethod
    def _top_scoring(smiles_and_scores, limit):
        """Return up to ``limit`` (smiles, score) pairs, best score first."""
        # Ascending stable sort followed by a reversal, as in the original
        # reversed(sorted(...)): among equal scores the most recently added
        # pair comes first.  sorted(reverse=True) would order ties the other
        # way round.
        ranked = sorted(smiles_and_scores, key=lambda pair: pair[1])
        ranked.reverse()
        return ranked[:limit]

    def generate_optimized_molecules(self,
                                     scoring_function,
                                     number_molecules,
                                     starting_population=None):
        """Run the sample/score/retrain loop and return the best molecules.

        Args:
            scoring_function: Object whose ``score(smiles)`` method returns
                the score of a molecule.
            number_molecules: Maximum number of SMILES strings to return.
            starting_population: Accepted but not used by this generator.

        Returns:
            A list of at most ``number_molecules`` SMILES strings, best score
            first, taken from every molecule scored during the run.
        """
        self.new_model_dir()
        vocab = get_arpa_vocab(
            '../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
        self.lm = KenLMDeepSMILESLanguageModel(
            '../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

        print("generating %s samples..." % number_molecules)
        smiles_and_scores = []
        seconds_per_iteration = self.time_per_iteration_minutes * 60
        found = False

        for n in range(1, self.num_iterations + 1):
            print("iteration %s" % n)
            num_valid = 0
            start = time.time()
            while time.time() - start < seconds_per_iteration:
                try:
                    generated = self.lm.generate(num_chars=100,
                                                 text_seed='<s>')
                    decoded = DeepSMILESLanguageModelUtils.decode(
                        generated, start='<s>', end='</s>')
                    smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
                    score = scoring_function.score(smiles)
                except Exception:
                    # Deliberate best effort: a sample that cannot be
                    # generated, decoded, sanitized or scored is discarded
                    # and sampling continues.
                    continue

                num_valid += 1
                smiles_and_scores.append((smiles, score))
                if score == 1.0:
                    # A perfect score ends the whole search early.
                    found = True
                    break

            print("num valid: %s" % num_valid)
            if found:
                break
            self.retrain(n, self.keep_top_n, smiles_and_scores)

        return [
            smiles for smiles, _ in self._top_scoring(smiles_and_scores,
                                                      number_molecules)
        ]

    def new_model_dir(self):
        """Delete the model directory if it exists and create it empty."""
        print(
            "deleting any existing molexit directory, and creating a new one..."
        )
        path = Path(self._MODEL_DIR)
        if path.is_dir():
            shutil.rmtree(path)
        path.mkdir(parents=True, exist_ok=True)

    def retrain(self, n, keep_top_n, smiles_and_scores):
        """Train a new language model on the best molecules and switch to it.

        Writes the ``keep_top_n`` best-scoring molecules, canonicalized,
        DeepSMILES-encoded and tokenized one per line, to
        ``molexit-<n>.txt`` in the model directory, trains on that file and
        replaces ``self.lm`` with the resulting model.

        Args:
            n: Iteration number, used in the dataset and model file names.
            keep_top_n: Maximum number of molecules written to the dataset.
            smiles_and_scores: List of ``(smiles, score)`` pairs.
        """
        best = self._top_scoring(smiles_and_scores, keep_top_n)
        if not best:
            # Previously this fell through to np.max([]) and raised
            # ValueError; with nothing to train on, keep the current model.
            print("no scored molecules to retrain on; keeping the current LM")
            return

        print("writing dataset...")
        name = 'molexit-%d' % n
        dataset = '%s/%s.txt' % (self._MODEL_DIR, name)
        dataset_scores = []
        with open(dataset, 'w') as f:
            for smi, score in best:
                canonical = pybel.readstring(
                    "smi", smi.strip()).write("can").strip()
                dsmi = self.converter.encode(canonical)
                tokens = DeepSMILESTokenizer(dsmi).get_tokens()
                f.write(' '.join(t.value for t in tokens))
                f.write("\n")
                dataset_scores.append(score)

        print('dataset: size: %s, mean score: %s, max score: %s' %
              (len(dataset_scores), np.mean(dataset_scores),
               np.max(dataset_scores)))
        print('training new LM...')
        # NOTE(review): 10 is presumably the n-gram order -- confirm against
        # KenLMTrainer.train.
        self.lm_trainer.train(10, dataset, self._MODEL_DIR, name)

        vocab = get_arpa_vocab('%s/%s.arpa' % (self._MODEL_DIR, name))
        self.lm = KenLMDeepSMILESLanguageModel(
            '%s/%s.klm' % (self._MODEL_DIR, name), vocab)