def eval_function(text):
    """Score a generated token sequence by its normalized (scaled) logP.

    Args:
        text: iterable of string tokens; they are concatenated into a single
            DeepSMILES string delimited by '<s>' / '</s>'.

    Returns:
        0 when the sequence cannot be decoded/sanitized or RDKit cannot parse
        the resulting SMILES; otherwise ``factor * MolLogP(mol)`` min-max
        normalized with the externally defined ``logp_min`` / ``logp_max``.
    """
    generated = ''.join(text)

    # Decode and sanitize exactly once; the result is reused for scoring
    # instead of repeating both calls after validation.
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        return 0

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparsable SMILES with None rather than raising;
        # treat it like any other invalid sequence.
        return 0

    logp = factor * MolLogP(mol)
    # normalize logP using the configured bounds
    score = (logp - logp_min) / (logp_max - logp_min)

    logger.info("%s, %s" % (generated, str(score)))
    return score
def eval_function(text):
    """Score a generated token sequence by aromatic-atom count and perplexity.

    Args:
        text: iterable of string tokens; concatenated into one DeepSMILES
            string and also passed as-is to ``lm.perplexity``.

    Returns:
        0 when the sequence cannot be decoded/sanitized or parsed by RDKit;
        otherwise the equally weighted mean of the aromatic reward
        (aromatic atoms / 23) and the squashed perplexity ``p / (1 + p)``.
    """
    generated = ''.join(text)

    # Decode and sanitize once and keep the SMILES, rather than validating
    # first and then repeating both calls.
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        return 0

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None on parse failure; score as invalid.
        return 0

    # Iterate the atoms directly instead of indexing them one by one.
    num_aromatic_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetIsAromatic())
    # NOTE(review): 23 is a hard-coded normalizer from the original code;
    # its origin is not visible here.
    arom_reward = num_aromatic_atoms / 23

    perplexity = lm.perplexity(text)
    perplexity_reward = perplexity / (1 + perplexity)

    score = (perplexity_reward * 0.5) + (arom_reward * 0.5)

    logger.info("%s, %s" % (generated, str(score)))
    return score
def eval_function(text):
    """Evaluate one generated sequence with a weighted multi-term score.

    Increments the module-level counters ``i`` (every call) and ``num_valid``
    (valid sequences only), records the result in ``all_smiles`` and calls
    ``log_best`` before returning.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 if decoding or sanitizing fails, otherwise
        0.75 * distance + 0.15 * (1 - SA/10) + 0.10 * (1 - squashed cycle score).
    """
    global i, num_valid, all_smiles

    i += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        log_best(i, all_smiles, num_valid, logger)
        return -1.0

    num_valid += 1

    # Synthetic accessibility: per the original note, 1 (easy) .. 10 (hard);
    # divided by 10 before use.
    sa_fraction = sascorer.calculateScore(Chem.MolFromSmiles(smiles)) / 10.

    # Cycle score squashed with x / (1 + x).
    raw_cycle = cycle_scorer.score(smiles)
    cycle_fraction = raw_cycle / (1 + raw_cycle)

    distance_score = distance_scorer.score(smiles)

    score = (0.75 * distance_score) + (0.15 * (1 - sa_fraction)) + (0.10 * (1 - cycle_fraction))

    all_smiles[smiles] = (score, generated)
    logger.debug("%s, %s" % (smiles, score))
    log_best(i, all_smiles, num_valid, logger)
    return score
def eval_function(text):
    """Reward a generated sequence for being a valid, not-yet-seen molecule.

    Updates the module-level counters ``i`` and ``num_valid``, the
    ``all_smiles`` collection and the ``elapsed`` clock.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        1.0 the first time a sanitized SMILES is produced; -1.0 for a repeat
        or for a sequence that fails to decode/sanitize.

    Raises:
        StopTreeSearch: when ``elapsed`` has reached ``TIME_LIMIT``.
    """
    global i, num_valid, all_smiles, elapsed

    if elapsed >= TIME_LIMIT:
        raise StopTreeSearch()

    i += 1
    generated = ''.join(text)

    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        # Refresh the clock on the failure path too; otherwise a run of
        # invalid sequences would never advance `elapsed` and the time
        # limit above could not fire.
        elapsed = time.time() - start
        return -1.0

    num_valid += 1

    if smiles in all_smiles:
        score = -1.0
    else:
        score = 1.0
        all_smiles.add(smiles)

    elapsed = time.time() - start
    return score
def eval_function(text):
    """Score a generated sequence with ``jscorer``, penalizing repeats.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid or previously recorded molecule; otherwise the
        jscore squashed as ``j / (1 + |j|)``.

    Raises:
        StopTreeSearch: when the module-level ``elapsed`` reaches ``TIME_LIMIT``.
    """
    global i, num_valid, all_smiles, elapsed

    if elapsed >= TIME_LIMIT:
        raise StopTreeSearch()

    i += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        elapsed = time.time() - start
        return -1.0

    num_valid += 1

    # Start from the repeat penalty and overwrite it for new molecules.
    score = -1.0
    if smiles not in all_smiles:
        raw = jscorer.score(smiles)
        score = raw / (1 + np.abs(raw))
        # The unsquashed value is what gets stored alongside the raw text.
        all_smiles[smiles] = (raw, generated)

    logger.debug("%s, %s" % (smiles, score))
    elapsed = time.time() - start
    return score
def eval_function(text):
    """Score a generated sequence by distance score, penalizing repeats.

    Updates module-level bookkeeping: ``simulations``, ``num_valid``,
    ``all_unique``, ``all_valid``, the current best score/SMILES and
    ``elapsed``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a molecule already in ``all_unique``;
        otherwise the distance score mapped with ``2 * s - 1`` (the original
        comment describes this as rescaling [0,1] to [-1,1]). A negative
        raw score is returned unchanged.

    Raises:
        StopTreeSearch: when ``elapsed`` has reached ``TIME_LIMIT``.
    """
    global simulations, num_valid, all_unique, all_valid, elapsed, current_best_score, current_best_smiles, beats_current

    if elapsed >= TIME_LIMIT:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    num_valid += 1
    score = distance_scorer.score_mol(mol)

    # Test for a repeat BEFORE recording the molecule. Checking after the
    # insert would find every molecule and penalize all of them with -1.0.
    already_generated = smiles in all_unique
    all_unique[smiles] = (score, generated)

    if current_best_score is None or beats_current(score):
        current_best_score = score
        current_best_smiles = smiles

    all_valid.append((smiles, score))

    ret_score = -1.0 if already_generated else score
    # rescale score from [0,1] to [-1,1]; negative values pass through
    ret_score = (ret_score * 2) + (-1) if ret_score >= 0. else ret_score

    elapsed = time.time() - start
    return ret_score
def eval_function(text):
    """Score a generated sequence with ``scorer``, penalizing repeats.

    Bumps the module-level ``i`` on every call and ``num_valid`` on every
    valid sequence, and calls ``log_best`` on both the failure and the
    success path.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES already in ``all_smiles``;
        otherwise ``scorer.score(smiles)``.
    """
    global i, num_valid, all_smiles

    i += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        log_best(i, all_smiles, num_valid, logger)
        return -1.0

    num_valid += 1

    # Default to the repeat penalty; only new molecules are scored and stored.
    score = -1.0
    if smiles not in all_smiles:
        score = scorer.score(smiles)
        all_smiles[smiles] = (score, generated)

    logger.debug("%s, %s" % (smiles, score))
    log_best(i, all_smiles, num_valid, logger)
    return score
def generate_optimized_molecules(self, scoring_function, number_molecules, starting_population=None):
    """Sample molecules from a KenLM model, retraining between iterations.

    For up to ``self.num_iterations`` rounds, samples sequences for
    ``self.time_per_iteration_minutes`` minutes, scores every valid one with
    ``scoring_function.score`` and then calls ``self.retrain``. Stops early
    (skipping the retrain) as soon as a score of exactly 1.0 is seen.

    Args:
        scoring_function: object exposing ``score(smiles)``.
        number_molecules: how many top-scoring SMILES to return.
        starting_population: accepted but not used by this implementation.

    Returns:
        list of SMILES strings, best score first, at most ``number_molecules`` long.
    """
    self.new_model_dir()

    arpa_path = '../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa'
    klm_path = '../resources/chembl_25_deepsmiles_klm_10gram_200503.klm'
    vocab = get_arpa_vocab(arpa_path)
    self.lm = KenLMDeepSMILESLanguageModel(klm_path, vocab)

    print("generating %s samples..." % number_molecules)

    scored_pairs = []
    seconds_per_iteration = self.time_per_iteration_minutes * 60
    target_hit = False

    for iteration in range(1, self.num_iterations + 1):
        print("iteration %s" % iteration)
        valid_count = 0
        start = time.time()
        elapsed = time.time() - start

        while elapsed < seconds_per_iteration:
            try:
                generated = self.lm.generate(num_chars=100, text_seed='<s>')
                smiles = DeepSMILESLanguageModelUtils.sanitize(
                    DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
                score = scoring_function.score(smiles)
                valid_count += 1
                scored_pairs.append((smiles, score))
                if score == 1.0:
                    target_hit = True
                    break
            except Exception:
                # Any failure in generation/decoding/scoring just skips this sample.
                pass
            elapsed = time.time() - start

        print("num valid: %s" % valid_count)

        if target_hit:
            break

        self.retrain(iteration, self.keep_top_n, scored_pairs)

    # Ascending sort followed by an in-place reverse keeps the same ordering
    # among equal scores as reversing a sorted copy.
    ranked = sorted(scored_pairs, key=lambda pair: pair[1])
    ranked.reverse()
    return [pair[0] for pair in ranked[:number_molecules]]
def eval_function(text):
    """Score a generated sequence with a weighted distance/SA/cycle reward.

    Tracks the best *distance* score seen so far and records every valid
    molecule in ``all_unique``, ``all_valid`` and ``seen``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a molecule already in ``all_unique``;
        otherwise the weighted score mapped with ``2 * s - 1`` when it is
        non-negative (left unchanged when negative).

    Raises:
        StopTreeSearch: when ``elapsed`` reaches ``time_limit`` or ``seen``
            holds exactly ``max_gen`` entries.
    """
    global simulations, all_unique, elapsed, current_best_score, current_best_smiles, beats_current

    if elapsed >= time_limit or len(seen) == max_gen:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    # Synthetic accessibility: per the original note, 1 (easy) .. 10 (hard).
    sa_fraction = sascorer.calculateScore(mol) / 10.

    # Cycle score squashed with x / (1 + x).
    raw_cycle = cycle_scorer.score_mol(mol)
    cycle_fraction = raw_cycle / (1 + raw_cycle)

    distance_score = distance_scorer.score_mol(mol)
    weighted_score = (0.75 * distance_score) + (0.15 * (1 - sa_fraction)) + (0.10 * (1 - cycle_fraction))

    if current_best_score is None or beats_current(distance_score):
        current_best_score = distance_score
        current_best_smiles = smiles

    if distance_score == 1.0:
        logger.info("FOUND!")

    # Repeats get the flat penalty; new molecules are mapped from [0,1] to
    # [-1,1] (a negative weighted score is passed through untouched).
    if smiles in all_unique:
        ret_score = -1.0
    elif weighted_score >= 0.:
        ret_score = (weighted_score * 2) + (-1)
    else:
        ret_score = weighted_score

    all_unique[smiles] = (distance_score, generated)
    all_valid.append((smiles, distance_score))
    seen[smiles] = distance_score

    elapsed = time.time() - start
    return ret_score
def eval_function(text):
    """Score a new molecule with a weighted distance/SA/cycle reward.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES already in ``seen``;
        otherwise the weighted score mapped with ``2 * s - 1`` when it is
        non-negative (left unchanged when negative).

    Raises:
        StopTreeSearch: when ``elapsed`` reaches ``TIME_PER_ITERATION``.
    """
    global simulations, num_valid, all_smiles, elapsed

    if elapsed >= TIME_PER_ITERATION:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    num_valid += 1

    if smiles in seen:
        # The penalty is negative, so the rescale step would leave it as is.
        elapsed = time.time() - start
        return -1.0

    # Synthetic accessibility: per the original note, 1 (easy) .. 10 (hard).
    sa_fraction = sascorer.calculateScore(mol) / 10.

    # Cycle score squashed with x / (1 + x).
    raw_cycle = cycle_scorer.score_mol(mol)
    cycle_fraction = raw_cycle / (1 + raw_cycle)

    distance_score = distance_scorer.score_mol(mol)
    score = (0.75 * distance_score) + (0.15 * (1 - sa_fraction)) + (0.10 * (1 - cycle_fraction))

    seen.add(smiles)
    all_smiles[smiles] = (score, generated)

    if distance_score == 1.0:
        logger.info("FOUND!")

    # Map [0,1] onto [-1,1]; negative scores pass through.
    if score >= 0.:
        ret_score = (score * 2) + (-1)
    else:
        ret_score = score

    elapsed = time.time() - start
    return ret_score
def eval_function(text):
    """Return the squashed ``jscorer`` score of a generated sequence.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 if decoding or sanitizing fails, otherwise ``j / (1 + |j|)``
        where ``j`` is ``jscorer.score(smiles)``.
    """
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return -1.0

    raw = jscorer.score(smiles)
    # x / (1 + |x|) keeps the result strictly inside (-1, 1).
    score = raw / (1 + np.abs(raw))

    logger.info("%s, %s" % (generated, score))
    return score
def generate(self, number_samples):
    """Draw ``number_samples`` sequences from ``self.lm`` and return them as SMILES.

    Args:
        number_samples: number of sequences to generate.

    Returns:
        list with one entry per sample: the sanitized SMILES, or the string
        "invalid" when generation, decoding or sanitizing raised.
    """
    print("generating %s samples..." % number_samples)

    samples = []
    for _ in range(number_samples):
        try:
            generated = self.lm.generate(num_chars=100, text_seed='<s>')
            outcome = DeepSMILESLanguageModelUtils.sanitize(
                DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        except Exception:
            # Keep a placeholder so the output length always matches the request.
            outcome = "invalid"
        samples.append(outcome)

    return samples
def eval_function(text):
    """Reward novel valid molecules, augmented with the prior's log probability.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid or already generated molecule; otherwise
        ``prior_log_prob + sigma * 1.0`` linearly mapped from
        [-45 - sigma, sigma] onto [-1, 1].
    """
    global i, num_valid, all_smiles

    i += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return -1.0

    num_valid += 1

    if smiles in all_smiles:
        return -1.0

    # A valid, never-before-seen molecule earns a flat 1.0 here; any other
    # property-based reward could be substituted.
    score = 1.0
    all_smiles.add(smiles)

    # Following Olivecrona et al., "Molecular de-novo design through deep
    # reinforcement learning": add the prior's log probability of the sequence.
    sentence = DeepSMILESLanguageModelUtils.extract_sentence(text, join_on=' ', start='<s>', end='</s>')
    prior_log_prob = prior.log_prob(sentence)
    tot_score = prior_log_prob + sigma * score

    # Bounds for the rescale: log probs are (per the original note) rarely
    # below -45, and never above log(1) == 0.
    lower_bound = -45 - sigma
    upper_bound = sigma

    if tot_score < lower_bound:
        logger.info("WARNING: total score lower than %s" % lower_bound)

    # Linear map of x from [min, max] into [a, b] with a = -1, b = 1.
    fraction = (tot_score - lower_bound) / (upper_bound - lower_bound)
    return (1 - (-1)) * fraction + (-1)
def eval_function(text):
    """Reward valid sequences in proportion to their token count.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        0 if decoding or sanitizing fails; otherwise the number of tokens in
        the extracted sequence divided by ``text_length - 1``.
    """
    generated = ''.join(text)

    # Decode + sanitize act purely as a validity gate; their result is unused.
    try:
        DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return 0

    extracted = DeepSMILESLanguageModelUtils.extract(generated, start='<s>', end='</s>')
    token_count = len(DeepSMILESTokenizer(extracted).get_tokens())

    # Longer sequences earn a proportionally larger reward.
    score = token_count / (text_length - 1)

    logger.info("%s, %s" % (generated, score))
    return score
def eval_function(text):
    """Score a generated sequence by distance score, penalizing repeats.

    Tracks the best score seen so far and records every valid molecule in
    ``all_unique``, ``all_valid`` and ``seen``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a molecule already in ``all_unique``;
        otherwise the distance score mapped with ``2 * s - 1`` when it is
        non-negative (left unchanged when negative).

    Raises:
        StopTreeSearch: when ``elapsed`` reaches ``time_limit`` or ``seen``
            holds exactly ``max_gen`` entries.
    """
    global simulations, all_unique, elapsed, current_best_score, current_best_smiles, beats_current

    if elapsed >= time_limit or len(seen) == max_gen:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    score = distance_scorer.score_mol(mol)

    if current_best_score is None or beats_current(score):
        current_best_score = score
        current_best_smiles = smiles

    if score == 1.0:
        logger.info("FOUND!")

    # Repeats get the flat penalty; new molecules are mapped from [0,1] to
    # [-1,1] (a negative score is passed through untouched).
    if smiles in all_unique:
        ret_score = -1.0
    elif score >= 0.:
        ret_score = (score * 2) + (-1)
    else:
        ret_score = score

    all_unique[smiles] = (score, generated)
    all_valid.append((smiles, score))
    seen[smiles] = score

    elapsed = time.time() - start
    return ret_score
def eval_function(text):
    """Reward novel valid molecules and log every outcome in ``samples``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        1.0 the first time a sanitized SMILES appears; -1.0 for a repeat or
        an invalid sequence (which is recorded as "invalid" in ``samples``).
    """
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        samples.append("invalid")
        return -1.0

    samples.append(smiles)

    if smiles in all_smiles:
        return -1.0

    all_smiles.add(smiles)
    return 1.0
def eval_function(text):
    """Score a generated sequence with ``qedscorer``, penalizing repeats.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES already in ``all_smiles``;
        otherwise ``q / (1 + |q|)`` where ``q`` is ``qedscorer.score(smiles)``.
    """
    global all_smiles

    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return -1.0

    # Default to the repeat penalty; only unseen molecules are scored.
    score = -1.0
    if smiles not in all_smiles:
        raw = qedscorer.score(smiles)
        score = raw / (1 + np.abs(raw))
        # The unsquashed value is what gets stored.
        all_smiles[smiles] = raw

    logger.info("%s, %s" % (smiles, score))
    return score
def eval_function(text):
    """Score a generated sequence by distance score, penalizing repeats.

    Updates module-level bookkeeping: ``simulations``, ``num_valid``,
    ``seen``, ``all_unique``, ``all_valid``, the current best score/SMILES
    and ``elapsed``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES that was already in ``seen``
        before this call; otherwise ``distance_scorer.score_mol(mol)``.

    Raises:
        StopTreeSearch: when ``elapsed`` reaches ``TIME_PER_ITERATION``.
    """
    global simulations, num_valid, all_unique, elapsed, current_best_score, current_best_smiles, beats_current

    if elapsed >= TIME_PER_ITERATION:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    num_valid += 1
    score = distance_scorer.score_mol(mol)

    # Test for a repeat BEFORE adding to `seen`. Checking after the add
    # would find every molecule and return -1.0 for all of them.
    already_seen = smiles in seen
    seen.add(smiles)
    all_unique[smiles] = (score, generated)

    if current_best_score is None or beats_current(score):
        current_best_score = score
        current_best_smiles = smiles

    all_valid.append((smiles, score))

    if score == 1.0:
        logger.info("FOUND!")

    ret_score = -1.0 if already_seen else score

    elapsed = time.time() - start
    return ret_score
def eval_function(text):
    """Score a generated sequence and track the best result on ``self``.

    Relies on ``self``, ``scoring_function`` and ``all_smiles`` from the
    enclosing scope.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES already in ``all_smiles``;
        otherwise ``scoring_function.score(smiles)``.
    """
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return -1.0

    # Default to the repeat penalty; only unseen molecules are scored.
    score = -1.0
    if smiles not in all_smiles:
        score = scoring_function.score(smiles)
        all_smiles.add(smiles)

    if score > self.best_score:
        self.best_score = score
        self.best_smiles = smiles

    return score
def eval_function(text):
    """Reward a generated sequence for being a valid, not-yet-seen molecule.

    Increments the module-level ``i`` on every call and ``num_valid`` on
    every sequence that decodes and sanitizes.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        1.0 for a new SMILES (which is added to ``all_smiles``); -1.0 for a
        repeat or an invalid sequence.
    """
    global i, num_valid, all_smiles

    i += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return -1.0

    num_valid += 1

    if smiles in all_smiles:
        return -1.0

    all_smiles.add(smiles)
    return 1.0
# Baseline: draw 1000 sequences from an untrained ("empty") language model
# and remember the valid molecule with the lowest logP.
logger = logger()

vocab = get_arpa_vocab('../resources/chemts_250k_deepsmiles_klm_6gram_190414.arpa')
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

current_best_score = None
current_best_smiles = None


def beats_current(score):
    """Return True when ``score`` is strictly lower than the current best."""
    return score < current_best_score


for i in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:
        sanitized = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        logp_score = MolLogP(Chem.MolFromSmiles(sanitized))
        logger.info("successful: %s , score: %s" % (sanitized, logp_score))
        if current_best_score is None or beats_current(logp_score):
            current_best_score = logp_score
            current_best_smiles = sanitized
    except Exception:
        # Invalid samples are simply skipped.
        continue
def eval_function(text):
    """Score by distance plus the prior's log probability, penalizing repeats.

    Tracks the best distance score and records each valid molecule in
    ``all_unique``, ``all_valid`` and ``seen``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES already in ``seen``;
        otherwise ``prior_log_prob + sigma * distance_score`` linearly mapped
        from [-45 - sigma, sigma] onto [-1, 1].

    Raises:
        StopTreeSearch: when ``elapsed`` reaches ``TIME_PER_ITERATION``.
    """
    global simulations, num_valid, all_unique, elapsed, current_best_score, current_best_smiles, beats_current

    if elapsed >= TIME_PER_ITERATION:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    num_valid += 1

    distance_score = distance_scorer.score_mol(mol)
    if distance_score == 1.0:
        logger.info("FOUND!")

    # Following Olivecrona et al., "Molecular de-novo design through deep
    # reinforcement learning": add the prior's log probability of the sequence.
    sentence = DeepSMILESLanguageModelUtils.extract_sentence(text, join_on=' ', start='<s>', end='</s>')
    prior_log_prob = prior.log_prob(sentence)
    tot_score = prior_log_prob + sigma * distance_score

    # Bounds for the rescale: log probs are (per the original note) rarely
    # below -45, and never above log(1) == 0.
    lower_bound = -45 - sigma
    upper_bound = sigma

    if tot_score < lower_bound:
        logger.info("WARNING: total score lower than %s" % lower_bound)

    # Linear map of x from [min, max] into [a, b] with a = -1, b = 1.
    fraction = (tot_score - lower_bound) / (upper_bound - lower_bound)
    ret_score = (1 - (-1)) * fraction + (-1)

    # Repeats are penalized regardless of the computed value.
    if smiles in seen:
        ret_score = -1.0

    if current_best_score is None or beats_current(distance_score):
        current_best_score = distance_score
        current_best_smiles = smiles

    all_unique[smiles] = (distance_score, generated)
    all_valid.append((smiles, distance_score))
    seen.add(smiles)

    elapsed = time.time() - start
    return ret_score
def eval_function(text):
    """Blend the rescaled prior log probability with the rescaled distance score.

    Tracks the best distance score and records each valid molecule in
    ``all_unique``, ``all_valid`` and ``seen``.

    Args:
        text: iterable of string tokens forming a DeepSMILES sequence.

    Returns:
        -1.0 for an invalid sequence or a SMILES already in ``seen``;
        otherwise ``(1 - sigma) * rescaled_log_prob + sigma * rescaled_distance``.

    Raises:
        StopTreeSearch: when ``elapsed`` reaches ``time_limit`` or
            ``all_valid`` holds exactly ``max_gen`` entries.
    """
    global simulations, all_unique, elapsed, current_best_score, current_best_smiles, beats_current

    if elapsed >= time_limit or len(all_valid) == max_gen:
        raise StopTreeSearch()

    simulations += 1
    generated = ''.join(text)

    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise Exception
    except Exception:
        elapsed = time.time() - start
        return -1.0

    score = distance_scorer.score_mol(mol)

    if current_best_score is None or beats_current(score):
        current_best_score = score
        current_best_smiles = smiles

    if score == 1.0:
        logger.info("FOUND!")

    # Following Olivecrona et al., "Molecular de-novo design through deep
    # reinforcement learning": factor in the prior's log probability.
    sentence = DeepSMILESLanguageModelUtils.extract_sentence(text, join_on=' ', start='<s>', end='</s>')
    prior_log_prob = prior.log_prob(sentence)

    rescaled_distance_score = (score * 2) + (-1)

    # Bounds for rescaling the log prob: rarely below -45 (per the original
    # note) and never above log(1) == 0.
    lower_bound = -45
    upper_bound = 0.0

    if prior_log_prob < lower_bound:
        logger.info("WARNING: prior log prob lower than %s" % lower_bound)

    # Linear map of x from [min, max] into [a, b] with a = -1, b = 1.
    fraction = (prior_log_prob - lower_bound) / (upper_bound - lower_bound)
    rescaled_log_prob = (1 - (-1)) * fraction + (-1)

    if smiles in seen:
        ret_score = -1.0
    else:
        ret_score = (1 - sigma) * rescaled_log_prob + sigma * rescaled_distance_score

    all_unique[smiles] = (score, generated)
    all_valid.append((smiles, score))
    seen.add(smiles)

    elapsed = time.time() - start
    return ret_score