def generate_optimized_molecules(self, scoring_function, number_molecules, starting_population=None):
    """Search for high-scoring molecules with language-model-guided MCTS.

    Runs a fixed-budget tree search starting from the ``<s>`` token. Every
    generated sequence is decoded and sanitized into a SMILES string; each
    distinct SMILES string is scored once with ``scoring_function``.

    Args:
        scoring_function: Object exposing ``score(smiles)``.
        number_molecules: Maximum number of molecules to return.
        starting_population: Accepted for interface compatibility; unused.

    Returns:
        A list of up to ``number_molecules`` distinct SMILES strings, ordered
        from highest to lowest score (ties keep first-seen order). The list
        is shorter -- possibly empty -- when the search produced fewer valid
        molecules. Previously a single-element list was always returned,
        which contained ``None`` when nothing valid was found.

    Side effects:
        Resets and then updates ``self.best_smiles`` and ``self.best_score``
        with the best molecule whose score exceeded -1.0.
    """
    print("generating %s samples..." % number_molecules)

    # Search settings, fixed for this generator.
    width = 24
    max_depth = 100
    c = 5
    num_simulations = 10000

    self.best_smiles = None
    self.best_score = -1.0

    # Sanitized SMILES -> score, in first-seen order. Doubles as the record
    # of molecules that have already been evaluated.
    scored = {}

    def eval_function(text):
        """Return the reward for one generated token sequence."""
        generated = ''.join(text)
        try:
            decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
            smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
        except Exception:
            # Deliberate best effort: a sequence that cannot be decoded or
            # sanitized receives the minimum reward instead of aborting.
            return -1.0

        if smiles in scored:
            # Repeats receive the minimum reward and are not re-scored.
            return -1.0

        score = scoring_function.score(smiles)
        scored[smiles] = score
        if self.best_score < score:
            self.best_score = score
            self.best_smiles = smiles
        return score

    mcts = LanguageModelMCTSWithPUCTTerminating(
        self.lm, width, max_depth, eval_function, cpuct=c, terminating_symbol='</s>'
    )
    mcts.search(["<s>"], num_simulations)

    # sorted() is stable, so among equal scores the earliest-seen molecule
    # comes first, matching how best_smiles is chosen above.
    ranked = sorted(scored, key=scored.get, reverse=True)
    return ranked[:max(number_molecules, 0)]
def generate(self, number_samples):
    """Sample molecules by running a language-model-guided tree search.

    The search is run for ``number_samples`` simulations from the ``<s>``
    token. Each evaluated sequence contributes one entry to the result:
    its sanitized SMILES string, or the literal ``"invalid"`` when decoding
    or sanitizing raised.

    Args:
        number_samples: Simulation budget handed to the search.

    Returns:
        The list of recorded entries, in evaluation order. Repeated SMILES
        strings are recorded again each time they are produced.
    """
    print("generating %s samples..." % number_samples)

    branching = 24
    depth_limit = 100
    exploration = 5

    collected = []
    seen = set()

    def reward(tokens):
        """Record the sequence's outcome: 1.0 for a new molecule, else -1.0."""
        sequence = ''.join(tokens)
        try:
            smiles = DeepSMILESLanguageModelUtils.sanitize(
                DeepSMILESLanguageModelUtils.decode(sequence, start='<s>', end='</s>')
            )
        except Exception:
            collected.append("invalid")
            return -1.0
        else:
            collected.append(smiles)

        if smiles in seen:
            return -1.0
        seen.add(smiles)
        return 1.0

    searcher = LanguageModelMCTSWithPUCTTerminating(
        self.lm,
        branching,
        depth_limit,
        reward,
        cpuct=exploration,
        terminating_symbol='</s>',
    )
    searcher.search(["<s>"], number_samples)
    return collected
num_valid += 1 if smiles in all_smiles: score = -1.0 else: score = scorer.score(smiles) all_smiles[smiles] = (score, generated) logger.debug("%s, %s" % (smiles, str(score))) log_best(i, all_smiles, num_valid, logger) return score mcts = LanguageModelMCTSWithPUCTTerminating(lm, width, max_depth, eval_function, cpuct=c, terminating_symbol='</s>') state = start_state logger.info("beginning search...") start = time.time() mcts.search(state, num_simulations) end = time.time() logger.info("--done--") logger.info("num valid: %d" % num_valid) best = mcts.get_best_sequence() generated_text = ''.join(best[0]) logger.info("best generated text: %s" % generated_text)
# in practice, the log probs are rarely less than -45; so the min tot_score can be: -45 + (sigma*-1.0) rescale_min = -45 - sigma if tot_score < rescale_min: logger.info("WARNING: total score lower than %s" % rescale_min) # because probabilities are in the range [0,1], the max log prob is log(1) i.e. 0 # so the max tot_score can be: 0 + sigma*1.0 rescale_max = sigma # scaling x into [a,b]: (b-a)*((x - min(x))/(max(x) - min(x))+a ret_score = (1 - (-1)) * ((tot_score - rescale_min) / (rescale_max - rescale_min)) + (-1) return ret_score mcts = LanguageModelMCTSWithPUCTTerminating(prior, width, max_depth, eval_function, cpuct=c, terminating_symbol='</s>') state = start_state logger.info("beginning search...") start = time.time() mcts.search(state, num_simulations) t.cancel() end = time.time() logger.info("--done--") logger.info("num valid: %d" % num_valid) logger.info("num unique: %s" % len(all_smiles))