Example #1
    def generate_optimized_molecules(
            self,
            scoring_function: ScoringFunction,
            number_molecules: int,
            starting_population: Optional[List[str]] = None) -> List[str]:
        """
        Will iterate through the reference set of SMILES strings and select the best molecules.

        It will create a heap and keep it to the required size so that minimal memory is used.
        """
        top_molecules: List[Tuple[float, str]] = []

        for smiles in self.smiles_reader:
            score = scoring_function.score(smiles)

            # Put the score first in the (score, smiles) tuple so that the heap's tuple comparison orders items by score.
            item = (score, smiles)

            if len(top_molecules) < number_molecules:
                heapq.heappush(top_molecules, item)
            else:
                # Equivalent to a push, then a pop, but faster
                # NB: pop removes the smallest value, i.e. in this case the molecule with the lowest score.
                heapq.heappushpop(top_molecules, item)

        return [x[1] for x in top_molecules]
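
A minimal, self-contained sketch of the bounded-heap pattern used above: stream (score, SMILES) pairs and keep only the N best in memory. The data below are illustrative placeholders.

import heapq

def top_n_scored(pairs, n):
    # pairs: iterable of (score, smiles) tuples; at most n items are held in memory
    heap = []
    for item in pairs:
        if len(heap) < n:
            heapq.heappush(heap, item)
        else:
            # push the new item and pop the current minimum in a single call
            heapq.heappushpop(heap, item)
    # highest-scoring SMILES first
    return [smiles for _, smiles in sorted(heap, reverse=True)]

print(top_n_scored([(0.1, 'CCO'), (0.9, 'c1ccccc1'), (0.5, 'CCN')], n=2))

Example #2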
    def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int,
                                     starting_population: Optional[List[str]] = None) -> List[str]:
        cuda_available = torch.cuda.is_available()
        device = "cuda" if cuda_available else "cpu"
        model_def = Path(self.pretrained_model_path).with_suffix('.json')

        smiles_rnn = load_rnn_model(model_def, self.pretrained_model_path, device, copy_to_cpu=True)
        model = SmilesRnnActorCritic(smiles_rnn=smiles_rnn).to(device)

        generator = PPOMoleculeGenerator(model=model,
                                         max_seq_length=self.max_seq_len,
                                         device=device)

        molecules = generator.optimise(objective=scoring_function,
                                       start_population=[],
                                       **self.model_args)

        # take the molecules seen during the hill-climbing, and also sample from the final model
        samples = [m.smiles for m in molecules]
        if self.sample_final_model_only:
            samples.clear()
        samples += generator.sample(max(number_molecules, self.number_final_samples))

        # calculate the scores and return the best ones
        samples = canonicalize_list(samples)
        scores = scoring_function.score_list(samples)

        scored_molecules = zip(samples, scores)
        sorted_scored_molecules = sorted(scored_molecules, key=lambda x: (x[1], hash(x[0])), reverse=True)

        top_scored_molecules = sorted_scored_molecules[:number_molecules]

        return [x[0] for x in top_scored_molecules]
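
The last few lines above implement a generic score-sort-slice selection with a hash tie-break. A standalone sketch of that step, assuming plain lists of SMILES and scores; note that hash() on strings is only stable across runs if PYTHONHASHSEED is fixed, so the tie-break is repeatable within a run but not necessarily between runs.

def select_top(samples, scores, n):
    # rank by score (descending); hash() gives a repeatable tie-break within one run
    ranked = sorted(zip(samples, scores), key=lambda x: (x[1], hash(x[0])), reverse=True)
    return [smiles for smiles, _ in ranked[:n]]

print(select_top(['CCO', 'CCN', 'c1ccccc1'], [0.3, 0.3, 0.8], n=2))
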
Example #3
    def generate_optimized_molecules(self,
                                     scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[
                                         List[str]] = None,
                                     get_history=False) -> List[str]:

        # fetch initial population?
        if starting_population is None:
            print('selecting initial population...')
            if self.random_start:
                starting_population = []
            else:
                all_smiles = self.load_smiles_from_file(self.smi_file)
                starting_population = self.top_k(all_smiles, scoring_function,
                                                 self.mols_to_sample)

        cuda_available = torch.cuda.is_available()
        device = "cuda" if cuda_available else "cpu"
        model_def = Path(self.pretrained_model_path).with_suffix('.json')

        model = load_rnn_model(model_def,
                               self.pretrained_model_path,
                               device,
                               copy_to_cpu=True)

        generator = SmilesRnnMoleculeGenerator(model=model,
                                               max_len=self.max_len,
                                               device=device)

        molecules, smiles_history = generator.optimise(
            objective=scoring_function,
            start_population=starting_population,
            n_epochs=self.n_epochs,
            mols_to_sample=self.mols_to_sample,
            keep_top=self.keep_top,
            optimize_batch_size=self.optimize_batch_size,
            optimize_n_epochs=self.optimize_n_epochs,
            pretrain_n_epochs=self.pretrain_n_epochs)

        # take the molecules seen during the hill-climbing, and also sample from the final model
        samples = [m.smiles for m in molecules]
        if self.sample_final_model_only:
            samples.clear()
        samples += generator.sample(
            max(number_molecules, self.number_final_samples))

        # calculate the scores and return the best ones
        samples = canonicalize_list(samples)
        scores = scoring_function.score_list(samples)

        scored_molecules = zip(samples, scores)
        sorted_scored_molecules = sorted(scored_molecules,
                                         key=lambda x: (x[1], hash(x[0])),
                                         reverse=True)

        top_scored_molecules = sorted_scored_molecules[:number_molecules]

        if get_history:
            return smiles_history

        return [x[0] for x in top_scored_molecules]
    def pretrain_on_initial_population(self, scoring_function: ScoringFunction,
                                       start_population,
                                       pretrain_epochs) -> List[OptResult]:
        """
        Takes an objective and tries to optimise it
        :param scoring_function: MPO
        :param start_population: Initial compounds (list of smiles) or request new (random?) population
        :param pretrain_epochs: number of epochs to finetune with start_population
        :return: Candidate molecules
        """
        seed: List[OptResult] = []

        start_population_size = len(start_population)

        training = canonicalize_list(start_population,
                                     include_stereocenters=True)

        if len(training) != start_population_size:
            logger.warning(
                "Some entries for the start population are invalid or duplicated"
            )
            start_population_size = len(training)

        if start_population_size == 0:
            return seed

        logger.info("finetuning with {} molecules for {} epochs".format(
            start_population_size, pretrain_epochs))

        scores = scoring_function.score_list(training)
        seed.extend(
            OptResult(smiles=smiles, score=score)
            for smiles, score in zip(training, scores))

        train_seqs, _ = load_smiles_from_list(training, max_len=self.max_len)
        train_set = get_tensor_dataset(train_seqs)

        batch_size = min(len(training), 32)

        print_every = len(training) // batch_size

        losses = self.trainer.fit(train_set,
                                  train_set,
                                  batch_size=batch_size,
                                  n_epochs=pretrain_epochs,
                                  print_every=print_every,
                                  valid_every=print_every)
        logger.info(losses)
        return seed
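    # --- Not part of the original listing -----------------------------------
    # `OptResult` is referenced above but not defined here.  A plausible sketch,
    # assuming it simply pairs a SMILES string with its score and compares by
    # score (the `sorted(..., reverse=True)` calls below rely on that ordering):
    #
    #     from functools import total_ordering
    #
    #     @total_ordering
    #     class OptResult:
    #         def __init__(self, smiles: str, score: float) -> None:
    #             self.smiles = smiles
    #             self.score = score
    #
    #         def __eq__(self, other):
    #             return (self.score, self.smiles) == (other.score, other.smiles)
    #
    #         def __lt__(self, other):
    #             return (self.score, self.smiles) < (other.score, other.smiles)
    # -------------------------------------------------------------------------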
    def optimise(self, objective: ScoringFunction, start_population, keep_top,
                 n_epochs, mols_to_sample, optimize_n_epochs,
                 optimize_batch_size, pretrain_n_epochs) -> List[OptResult]:
        """
        Takes an objective and tries to optimise it
        :param objective: MPO
        :param start_population: Initial compounds (list of smiles) or request new (random?) population
        :param kwargs need to contain:
                keep_top: number of molecules to keep at each iterative finetune step
                mols_to_sample: number of molecules to sample at each iterative finetune step
                optimize_n_epochs: number of episodes to finetune
                optimize_batch_size: batch size for fine-tuning
                pretrain_n_epochs: number of epochs to pretrain on start population
        :param get_history: If true also return intermediate samples as well
        :return: Candidate molecules
        """

        int_results = self.pretrain_on_initial_population(
            objective, start_population, pretrain_epochs=pretrain_n_epochs)

        results: List[OptResult] = []
        seen: Set[str] = set()

        for k in int_results:
            if k.smiles not in seen:
                results.append(k)
                seen.add(k.smiles)

        smiles_history = []
        for epoch in range(1, 1 + n_epochs):

            t0 = time.time()
            samples = self.sampler.sample(self.model,
                                          mols_to_sample,
                                          max_seq_len=self.max_len)
            t1 = time.time()

            canonicalized_samples = set(
                canonicalize_list(samples, include_stereocenters=True))
            smiles_history.append(list(canonicalized_samples))
            payload = list(canonicalized_samples.difference(seen))
            payload.sort()  # necessary for reproducibility between different runs

            seen.update(canonicalized_samples)

            scores = objective.score_list(payload)
            int_results = [
                OptResult(smiles=smiles, score=score)
                for smiles, score in zip(payload, scores)
            ]

            t2 = time.time()

            results.extend(sorted(int_results, reverse=True)[0:keep_top])
            results.sort(reverse=True)
            subset = [i.smiles for i in results][0:keep_top]

            np.random.shuffle(subset)

            sub_train = subset[0:int(3 * len(subset) / 4)]
            sub_test = subset[int(3 * len(subset) / 4):]

            train_seqs, _ = load_smiles_from_list(sub_train,
                                                  max_len=self.max_len)
            valid_seqs, _ = load_smiles_from_list(sub_test,
                                                  max_len=self.max_len)

            train_set = get_tensor_dataset(train_seqs)
            valid_set = get_tensor_dataset(valid_seqs)

            opt_batch_size = min(len(sub_train), optimize_batch_size)

            print_every = int(len(sub_train) / opt_batch_size)

            if optimize_n_epochs > 0:
                self.trainer.fit(train_set,
                                 valid_set,
                                 n_epochs=optimize_n_epochs,
                                 batch_size=opt_batch_size,
                                 print_every=print_every,
                                 valid_every=print_every)

            t3 = time.time()

            logger.info(f'Generation {epoch} --- timings: '
                        f'sample: {(t1 - t0):.3f} s, '
                        f'score: {(t2 - t1):.3f} s, '
                        f'finetune: {(t3 - t2):.3f} s')

            top4 = '\n'.join(f'\t{result.score:.3f}: {result.smiles}'
                             for result in results[:4])

            logger.info(f'Top 4:\n{top4}')
            print(f'Top 4:\n{top4}')

        return sorted(results, reverse=True), smiles_history
    def generate_optimized_molecules(
            self,
            scoring_function: ScoringFunction,
            number_molecules: int,
            starting_population: Optional[List[str]] = None) -> List[str]:

        if number_molecules > self.population_size:
            self.population_size = number_molecules
            print(
                f'Benchmark requested more molecules than expected: new population is {number_molecules}'
            )

        # fetch initial population?
        if starting_population is None:
            print('selecting initial population...')
            init_size = self.population_size + self.n_mutations
            all_smiles = copy.deepcopy(self.all_smiles)
            if self.random_start:
                starting_population = np.random.choice(all_smiles, init_size)
            else:
                starting_population = self.top_k(all_smiles, scoring_function,
                                                 init_size)

        # The smiles GA cannot deal with '%' in SMILES strings (used for two-digit ring numbers).
        starting_population = [
            smiles for smiles in starting_population if '%' not in smiles
        ]

        # calculate initial genes
        initial_genes = [
            cfg_to_gene(cfg_util.encode(s), max_len=self.gene_size)
            for s in starting_population
        ]

        # score initial population
        initial_scores = scoring_function.score_list(starting_population)
        population = [
            Molecule(*m)
            for m in zip(initial_scores, starting_population, initial_genes)
        ]
        population = sorted(population, key=lambda x: x.score,
                            reverse=True)[:self.population_size]
        population_scores = [p.score for p in population]

        # evolution: go go go!!
        t0 = time()

        patience = 0

        for generation in range(self.generations):

            old_scores = population_scores
            # select random genes
            all_genes = [molecule.genes for molecule in population]
            choice_indices = np.random.choice(len(all_genes),
                                              self.n_mutations,
                                              replace=True)
            genes_to_mutate = [all_genes[i] for i in choice_indices]

            # evolve genes
            joblist = (delayed(mutate)(g, scoring_function)
                       for g in genes_to_mutate)
            new_population = self.pool(joblist)

            # join and dedup
            population += new_population
            population = deduplicate(population)

            # survival of the fittest
            population = sorted(population,
                                key=lambda x: x.score,
                                reverse=True)[:self.population_size]

            # stats
            gen_time = time() - t0
            mol_sec = (self.population_size + self.n_mutations) / gen_time
            t0 = time()

            population_scores = [p.score for p in population]

            # early stopping
            if population_scores == old_scores:
                patience += 1
                print(f'Failed to progress: {patience}')
                if patience >= self.patience:
                    print(f'No more patience, bailing...')
                    break
            else:
                patience = 0

            print(f'{generation} | '
                  f'max: {np.max(population_scores):.3f} | '
                  f'avg: {np.mean(population_scores):.3f} | '
                  f'min: {np.min(population_scores):.3f} | '
                  f'std: {np.std(population_scores):.3f} | '
                  f'{gen_time:.2f} sec/gen | '
                  f'{mol_sec:.2f} mol/sec')

        # finally
        return [molecule.smiles for molecule in population[:number_molecules]]
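
The GA above distributes mutation over a worker pool with joblib's delayed-generator idiom (`self.pool(joblist)`), which presumably wraps a joblib.Parallel instance. A minimal sketch of that pattern with a toy function standing in for `mutate`:

from joblib import Parallel, delayed

def toy_mutate(gene):
    # stand-in for the real mutate(gene, scoring_function)
    return gene + 1

pool = Parallel(n_jobs=2)
joblist = (delayed(toy_mutate)(g) for g in [1, 2, 3])
print(pool(joblist))  # -> [2, 3, 4]
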
Example #7
    def generate_optimized_molecules(
            self,
            scoring_function: ScoringFunction,
            number_molecules: int,
            starting_population: Optional[List[str]] = None) -> List[str]:
        """The function called by the benchmarking software: All backend has to be controlled here"""
        # starting population is provided for some benchmarks.
        if number_molecules > self.population:
            self.population = number_molecules
            print(
                f'Benchmark requested more molecules than expected: new population is {number_molecules}'
            )

        self.task += 1
        if self.task < self.start_task:
            return ['CCC']
        self.init_deriver()

        scored_population = []
        if starting_population is None:
            print('selecting initial population...')
            if self.random_start:
                all_smiles = self.load_smiles_from_file(self.smi_file)
                selected_smiles = np.random.choice(all_smiles, self.population)
                scored_population = [(s, scoring_function.score(s))
                                     for s in selected_smiles]
            else:
                # just take the top-scoring molecules; their scores were computed beforehand, so load them from file
                scored_population = self.get_precomputed_scores(
                    self.population)
            self.deriver.set_seeds([s[0] for s in scored_population])
        elif len(starting_population) == 1:
            self.deriver.set_seeds(starting_population)
            scored_population = [(s, scoring_function.score(s))
                                 for s in starting_population]

        # allow self-mating in deriver for new methods
        #if len(scored_population) == 1:
        #    scored_population = [scored_population[0], scored_population[0]]

        best = max([s[1] for s in scored_population])
        p_max = best
        no_progress_counter = 0
        old_avg = 0
        mean_scores_by_gen = []
        best_scores_by_gen = []
        worst_scores_by_gen = []
        temperature = self.initial_temperature
        early_stop_annealing = False
        anneal_counter = 0
        filter_enabled = False
        current_population = self.population
        if self.derive_population:
            self.derive_size = self.population

        for generation in range(self.generations):
            # filter annealing #######################################################################
            if ((generation >= (self.generations * self.delayed_filtering))
                    and self.delayed_filtering) or early_stop_annealing:
                if (not anneal_counter) and (not filter_enabled):
                    filter_enabled = True
                    print(f'Enabling filter at generation {generation}')
                    self.deriver.enable_and_expand_filter()
                    alerts = pd.read_csv('data/alert_collection.csv')
                    sure_chembl = set(
                        alerts.loc[alerts['rule_set_name'] == 'SureChEMBL',
                                   'smarts'])
                    bai = set(alerts.loc[alerts['rule_set_name'] == 'BAI',
                                         'smarts'])
                    self.deriver.set_must_not_have_patterns(
                        list(sure_chembl.union(bai)))
                    print(
                        'Expanding population to introduce filtered candidates'
                    )
                    current_population *= 2
                print('Generating filtered candidates...')
                anneal_counter += 1
            ###########################################################################################

            scored_seeds = make_mating_pool(
                scored_population=scored_population,
                selection_size=self.selection_size,
                method=self.selection_method,
                best=best,
                temperature=temperature)

            # we want the best score from the previous generation, else there can only be ties
            best = p_max

            good_children = derive(
                deriver=self.deriver,
                seeds=[s[0] for s in scored_seeds],
                mut_rate=self.mut_rate,
                n_brics=self.derive_size * self.brics_proportion,
                n_selfies=self.derive_size * self.selfies_proportion,
                n_smiles_gb=self.derive_size * self.smiles_gb_proportion,
                n_selfies_gb=self.derive_size * self.selfies_gb_proportion,
                scanner=self.enable_scanner)

            scored_children = self.rank_and_score(good_children,
                                                  scoring_function)
            scored_population = list(
                (set(scored_children)).union(set(scored_population)))
            scored_population = sorted(scored_population,
                                       key=lambda x: x[1],
                                       reverse=True)[:current_population]
            relevant_scores = [s[1] for s in scored_population
                               ][:max([100, number_molecules])]

            # summarization
            p_max, p_avg, p_min, p_std, p_sum = summarize_results(
                generation, relevant_scores)
            mean_scores_by_gen.append(p_avg)
            best_scores_by_gen.append(p_max)
            worst_scores_by_gen.append(p_min)

            if early_stop_annealing:
                p_avg = np.mean([s[1] for s in scored_population])
                print(f'Population mean: {p_avg}')
            else:
                print(
                    f'Population mean: {np.mean([s[1] for s in scored_population])}'
                )

            # early stopping
            if p_avg == old_avg:
                no_progress_counter += 1
            else:
                no_progress_counter = 0
                anneal_counter = 0
            if self.task < 4 and max(relevant_scores) == 1:
                if self.delayed_filtering:
                    early_stop_annealing = True
                else:
                    print('Finished early on a rediscovery benchmark!')
                    break
            if (no_progress_counter >= self.patience) or (
                    p_avg == 1 and len(scored_population) > 1):
                if self.delayed_filtering:
                    early_stop_annealing = True
                else:
                    print("Finished early!")
                    break
            if (anneal_counter == self.patience) and self.delayed_filtering:
                print("Converged after filtering!")
                break

            old_avg = p_avg
            temperature *= self.temperature_decay
            self.clean_up()

        if self.delayed_filtering:
            scored_population = self.filter_mols(scored_population,
                                                 number_molecules)
        if self.counterscreen:
            self.deriver.enable_and_expand_filter()
            alerts = pd.read_csv('data/alert_collection.csv')
            sure_chembl = set(
                alerts.loc[alerts['rule_set_name'] == 'SureChEMBL', 'smarts'])
            bai = set(alerts.loc[alerts['rule_set_name'] == 'BAI', 'smarts'])
            self.deriver.set_must_not_have_patterns(
                list(sure_chembl.union(bai)))
            scored_population = self.filter_mols(scored_population,
                                                 number_molecules,
                                                 add_bad=False)

        return save_and_exit(scored_population, number_molecules,
                             mean_scores_by_gen, best_scores_by_gen,
                             worst_scores_by_gen)
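
The alert-loading logic above (read data/alert_collection.csv and collect the SureChEMBL and BAI SMARTS patterns) appears twice in this example. A small helper that could factor it out; it assumes the CSV has the rule_set_name and smarts columns used above, and the function name is illustrative:

import pandas as pd

def load_alert_smarts(path='data/alert_collection.csv', rule_sets=('SureChEMBL', 'BAI')):
    # collect the SMARTS strings belonging to the requested rule sets
    alerts = pd.read_csv(path)
    patterns = set()
    for name in rule_sets:
        patterns.update(alerts.loc[alerts['rule_set_name'] == name, 'smarts'])
    return list(patterns)

# e.g. deriver.set_must_not_have_patterns(load_alert_smarts())
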
Example #8
    def generate_optimized_molecules(self,
                                     scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[
                                         List[str]] = None,
                                     get_history=False) -> List[str]:

        if number_molecules > self.population_size:
            self.population_size = number_molecules
            print(
                f'Benchmark requested more molecules than expected: new population is {number_molecules}'
            )

        # fetch initial population?
        if starting_population is None:
            print('selecting initial population...')
            if self.random_start:
                starting_population = np.random.choice(self.all_smiles,
                                                       self.population_size)
            else:
                starting_population = self.top_k(self.all_smiles,
                                                 scoring_function,
                                                 self.population_size)

        # select initial population
        # this is also slow
        # population_smiles = heapq.nlargest(self.population_size, starting_population, key=scoring_function.score)
        starting_scores = scoring_function.score_list(starting_population)
        population_smiles = [
            x for _, x in sorted(zip(starting_scores, starting_population),
                                 key=lambda pair: pair[0],
                                 reverse=True)
        ]

        population_mol = [Chem.MolFromSmiles(s) for s in population_smiles]

        # this is slow; possibly because pickling the scoring classifiers for the worker pool is expensive
        # population_scores_old = self.pool(delayed(score_mol)(m, scoring_function.score) for m in population_mol)
        population_scores = scoring_function.score_list(
            mols2smiles(population_mol))

        # evolution: go go go!!
        t0 = time()

        patience = 0

        population_history = []
        population_history.append(
            [Chem.MolToSmiles(m) for m in population_mol])

        for generation in range(self.generations):
            # generate offspring from the mating pool
            mating_pool = make_mating_pool(population_mol, population_scores,
                                           self.offspring_size)
            offspring_mol = self.pool(
                delayed(reproduce)(mating_pool, self.mutation_rate)
                for _ in range(self.population_size))

            # add new_population
            population_mol += offspring_mol
            population_mol = sanitize(population_mol)

            # stats
            gen_time = time() - t0
            mol_sec = self.population_size / gen_time
            t0 = time()

            old_scores = population_scores
            # population_scores = self.pool(delayed(score_mol)(m, scoring_function.score) for m in population_mol)
            population_scores = scoring_function.score_list(
                [Chem.MolToSmiles(m) for m in population_mol])
            population_tuples = list(zip(population_scores, population_mol))
            population_tuples = sorted(population_tuples,
                                       key=lambda x: x[0],
                                       reverse=True)[:self.population_size]
            population_mol = [t[1] for t in population_tuples]
            population_scores = [t[0] for t in population_tuples]

            # early stopping
            if population_scores == old_scores:
                patience += 1
                print(f'Failed to progress: {patience}')
                if patience >= self.patience:
                    print(f'No more patience, bailing...')
                    break
            else:
                patience = 0

            res_time = time() - t0

            print(f'{generation} | '
                  f'max: {np.max(population_scores):.3f} | '
                  f'avg: {np.mean(population_scores):.3f} | '
                  f'min: {np.min(population_scores):.3f} | '
                  f'std: {np.std(population_scores):.3f} | '
                  f'sum: {np.sum(population_scores):.3f} | '
                  f'{gen_time:.2f} sec/gen | '
                  f'{mol_sec:.2f} mol/sec | '
                  f'{res_time:.2f} rest ')

            population_history.append(
                [Chem.MolToSmiles(m) for m in population_mol])

        # finally
        if get_history:
            return population_history
        else:
            return [Chem.MolToSmiles(m)
                    for m in population_mol][:number_molecules]
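
make_mating_pool is not shown in this listing. A plausible sketch of what it might do, assuming fitness-proportional sampling as in typical graph-GA baselines (an assumption, not taken from this example):

import numpy as np

def make_mating_pool(population_mol, population_scores, offspring_size):
    # sample parents with probability proportional to their score (assumed behaviour)
    scores = np.asarray(population_scores, dtype=float)
    probs = scores / scores.sum()
    indices = np.random.choice(len(population_mol), size=offspring_size, p=probs)
    return [population_mol[i] for i in indices]
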
Example #9
    def generate_optimized_molecules(
            self,
            scoring_function: ScoringFunction,
            number_molecules: int,
            starting_population: Optional[List[str]] = None) -> List[str]:

        instance = self.pop_alg.copy_instance_with_parameters()

        # Updating benchmark id
        self.curr_benchmark_id += 1

        # Extracting benchmark name
        curr_benchmark_name = self._get_benchmark_name(self.curr_benchmark_id)

        # Setting folder to save the results
        instance.output_folder_path = join(self.output_save_path,
                                           curr_benchmark_name)
        # instance.output_folder_path = join(self.output_save_path, name)

        # Extracting GuacaMol evaluation function
        # guacamol_evaluation_strategy = GuacamolEvaluationStrategy(scoring_function, name)
        guacamol_evaluation_strategy = GuacamolEvaluationStrategy(
            scoring_function, curr_benchmark_name)

        # Merging the evaluation strategy of the PopAlg instance to the GuacaMol objective
        if isinstance(instance.evaluation_strategy,
                      UndefinedGuacaMolEvaluationStrategy):
            instance.evaluation_strategy = guacamol_evaluation_strategy
        else:
            define_GuacaMol_evaluation_strategies(
                instance.evaluation_strategy, guacamol_evaluation_strategy)

        # Updating mutation strategy evaluator
        instance.mutation_strategy.evaluation_strategy = instance.evaluation_strategy

        # Setting additional stop criterion, stopping the execution when best possible score is obtained
        instance.kth_score_to_record_key = curr_benchmark_name
        # instance.kth_score_to_record_key = name
        additional_stop_criterion = KthScoreMaxValue(1, round=3)
        instance.stop_criterion_strategy.set_additional_strategy(
            additional_stop_criterion)
        instance.stop_criterion_strategy.set_pop_alg_instance(instance)

        # Setting kth score to record
        instance.kth_score_to_record = number_molecules

        # PopAlg instance initialization
        instance.initialize()

        # Population initialization
        if self.guacamol_init_top_100:

            # Extracting the top 100 SMILES for the property from ChEMBL and setting them as the initial population
            # From https://github.com/BenevolentAI/guacamol_baselines/blob/master/graph_ga/goal_directed_generation.py
            with open(self.init_pop_path, "r") as f:

                smiles_list = f.readlines()
                scores = [scoring_function.score(s) for s in smiles_list]
                top_100_smiles = np.array(smiles_list)[np.argsort(scores)[::-1]
                                                       [:100]]
                instance.load_pop_from_smiles_list(smiles_list=top_100_smiles)
        else:
            instance.load_pop_from_smiles_list(smiles_list=["C"])

        # Running EvoMol
        instance.run()

        # Extracting the vector containing the GuacaMol objective property value for all individuals
        if instance.kth_score_to_record_key == "total":
            obj_prop_vector = instance.curr_total_scores
        else:
            obj_prop_vector = instance.curr_scores[
                instance.kth_score_to_record_idx]

        # Extracting best individuals
        ind_to_return_indices = np.argsort(
            obj_prop_vector)[::-1].flatten()[:number_molecules]
        output_population = []
        for ind_idx in ind_to_return_indices:
            output_population.append(
                instance.pop[ind_idx].to_aromatic_smiles())

        # Returning optimized population
        return output_population
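
The final selection above relies on np.argsort over the objective vector, reversed to get best-first indices. A tiny standalone illustration with placeholder scores:

import numpy as np

obj_prop_vector = np.array([0.21, 0.93, 0.48, 0.77])
best_first = np.argsort(obj_prop_vector)[::-1][:2]
print(best_first)                     # -> [1 3]
print(obj_prop_vector[best_first])    # -> [0.93 0.77]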