def assess_model(
        self, model: DistributionMatchingGenerator
    ) -> DistributionLearningBenchmarkResult:
        start_time = time.time()
        molecules = sample_valid_molecules(
            model=model, number_molecules=self.number_samples)
        end_time = time.time()

        if len(molecules) != self.number_samples:
            logger.warning(
                'The model could not generate enough valid molecules. The score will be penalized.'
            )

        # canonicalize_list removes duplicates (and invalid molecules, but there shouldn't be any)
        unique_molecules = canonicalize_list(molecules,
                                             include_stereocenters=False)

        unique_ratio = len(unique_molecules) / self.number_samples
        metadata = {
            'number_samples': self.number_samples,
            'number_unique': len(unique_molecules)
        }

        return DistributionLearningBenchmarkResult(benchmark_name=self.name,
                                                   score=unique_ratio,
                                                   sampling_time=end_time - start_time,
                                                   metadata=metadata)

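# Usage note (a minimal sketch, not part of the original example): the benchmark above only
# needs a model exposing the DistributionMatchingGenerator interface, assumed here (as in
# guacamol) to consist of a single generate(number_samples) method returning SMILES strings.
# FixedSmilesGenerator is a hypothetical stub introduced purely for illustration.
from typing import List

from guacamol.distribution_matching_generator import DistributionMatchingGenerator


class FixedSmilesGenerator(DistributionMatchingGenerator):
    """Toy generator that cycles through a fixed list of SMILES strings."""

    def __init__(self, smiles: List[str]) -> None:
        self.smiles = smiles

    def generate(self, number_samples: int) -> List[str]:
        # Repeat the fixed list until the requested number of samples is reached.
        repeated = self.smiles * (number_samples // len(self.smiles) + 1)
        return repeated[:number_samples]
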
    def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int,
                                     starting_population: Optional[List[str]] = None) -> List[str]:
        cuda_available = torch.cuda.is_available()
        device = "cuda" if cuda_available else "cpu"
        model_def = Path(self.pretrained_model_path).with_suffix('.json')

        smiles_rnn = load_rnn_model(model_def, self.pretrained_model_path, device, copy_to_cpu=True)
        model = SmilesRnnActorCritic(smiles_rnn=smiles_rnn).to(device)

        generator = PPOMoleculeGenerator(model=model,
                                         max_seq_length=self.max_seq_len,
                                         device=device)

        molecules = generator.optimise(objective=scoring_function,
                                       start_population=[],
                                       **self.model_args)

        # take the molecules seen during the hill-climbing, and also sample from the final model
        samples = [m.smiles for m in molecules]
        if self.sample_final_model_only:
            samples.clear()
        samples += generator.sample(max(number_molecules, self.number_final_samples))

        # calculate the scores and return the best ones
        samples = canonicalize_list(samples)
        scores = scoring_function.score_list(samples)

        scored_molecules = zip(samples, scores)
        sorted_scored_molecules = sorted(scored_molecules, key=lambda x: (x[1], hash(x[0])), reverse=True)

        top_scored_molecules = sorted_scored_molecules[:number_molecules]

        return [x[0] for x in top_scored_molecules]
Example #3
    def generate_optimized_molecules(self,
                                     scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[List[str]] = None,
                                     get_history=False) -> List[str]:

        # fetch initial population?
        if starting_population is None:
            print('selecting initial population...')
            if self.random_start:
                starting_population = []
            else:
                all_smiles = self.load_smiles_from_file(self.smi_file)
                starting_population = self.top_k(all_smiles, scoring_function,
                                                 self.mols_to_sample)

        cuda_available = torch.cuda.is_available()
        device = "cuda" if cuda_available else "cpu"
        model_def = Path(self.pretrained_model_path).with_suffix('.json')

        model = load_rnn_model(model_def,
                               self.pretrained_model_path,
                               device,
                               copy_to_cpu=True)

        generator = SmilesRnnMoleculeGenerator(model=model,
                                               max_len=self.max_len,
                                               device=device)

        molecules, smiles_history = generator.optimise(
            objective=scoring_function,
            start_population=starting_population,
            n_epochs=self.n_epochs,
            mols_to_sample=self.mols_to_sample,
            keep_top=self.keep_top,
            optimize_batch_size=self.optimize_batch_size,
            optimize_n_epochs=self.optimize_n_epochs,
            pretrain_n_epochs=self.pretrain_n_epochs)

        # take the molecules seen during the hill-climbing, and also sample from the final model
        samples = [m.smiles for m in molecules]
        if self.sample_final_model_only:
            samples.clear()
        samples += generator.sample(
            max(number_molecules, self.number_final_samples))

        # calculate the scores and return the best ones
        samples = canonicalize_list(samples)
        scores = scoring_function.score_list(samples)

        scored_molecules = zip(samples, scores)
        sorted_scored_molecules = sorted(scored_molecules,
                                         key=lambda x: (x[1], hash(x[0])),
                                         reverse=True)

        top_scored_molecules = sorted_scored_molecules[:number_molecules]

        if get_history:
            return smiles_history

        return [x[0] for x in top_scored_molecules]

    def __init__(self, number_samples: int,
                 training_set: Iterable[str]) -> None:
        """
        Args:
            number_samples: number of samples to generate from the model
            training_set: molecules from the training set
        """
        super().__init__(name='Novelty', number_samples=number_samples)
        self.training_set_molecules = set(
            canonicalize_list(training_set, include_stereocenters=False))
Example #5
def test_list_canonicalization_removes_none():
    m1 = 'CCC(OCOCO)CC(=O)NCC'
    m2 = 'this.is.not.a.molecule'
    m3 = 'c1ccccc1'
    m4 = 'CC(OCON=N)CC'

    molecules = [m1, m2, m3, m4]
    canonicalized_molecules = canonicalize_list(molecules)

    valid_molecules = [m1, m3, m4]
    expected = [canonicalize(smiles) for smiles in valid_molecules]

    assert canonicalized_molecules == expected
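# For context (a sketch under the assumption that these helpers are thin RDKit wrappers, as in
# guacamol.utils.chemistry): canonicalize returns None for unparsable SMILES, and
# canonicalize_list drops those None entries along with duplicates, which is the behaviour
# the test above relies on. The _sketch names below are illustrative, not the library's code.
from typing import List, Optional

from rdkit import Chem


def canonicalize_sketch(smiles: str, include_stereocenters: bool = True) -> Optional[str]:
    """Return the canonical SMILES, or None if the string cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, isomericSmiles=include_stereocenters)


def canonicalize_list_sketch(smiles_list: List[str],
                             include_stereocenters: bool = True) -> List[str]:
    """Canonicalize a list of SMILES, dropping invalid entries and duplicates (order preserved)."""
    seen = set()
    result = []
    for smi in smiles_list:
        canonical = canonicalize_sketch(smi, include_stereocenters)
        if canonical is not None and canonical not in seen:
            seen.add(canonical)
            result.append(canonical)
    return result
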
    def pretrain_on_initial_population(self, scoring_function: ScoringFunction,
                                       start_population,
                                       pretrain_epochs) -> List[OptResult]:
        """
        Takes an objective and tries to optimise it
        :param scoring_function: MPO
        :param start_population: Initial compounds (list of smiles) or request new (random?) population
        :param pretrain_epochs: number of epochs to finetune with start_population
        :return: Candidate molecules
        """
        seed: List[OptResult] = []

        start_population_size = len(start_population)

        training = canonicalize_list(start_population,
                                     include_stereocenters=True)

        if len(training) != start_population_size:
            logger.warning(
                "Some entries for the start population are invalid or duplicated"
            )
            start_population_size = len(training)

        if start_population_size == 0:
            return seed

        logger.info("finetuning with {} molecules for {} epochs".format(
            start_population_size, pretrain_epochs))

        scores = scoring_function.score_list(training)
        seed.extend(
            OptResult(smiles=smiles, score=score)
            for smiles, score in zip(training, scores))

        train_seqs, _ = load_smiles_from_list(training, max_len=self.max_len)
        train_set = get_tensor_dataset(train_seqs)

        batch_size = min(len(training), 32)

        print_every = len(training) // batch_size

        losses = self.trainer.fit(train_set,
                                  train_set,
                                  batch_size=batch_size,
                                  n_epochs=pretrain_epochs,
                                  print_every=print_every,
                                  valid_every=print_every)
        logger.info(losses)
        return seed

    def __init__(self, number_samples: int, training_set: List[str]) -> None:
        """
        Args:
            number_samples: number of samples to generate from the model
            training_set: molecules from the training set
        """
        super().__init__(name='KL divergence', number_samples=number_samples)
        self.training_set_molecules = canonicalize_list(
            get_random_subset(training_set, self.number_samples, seed=42),
            include_stereocenters=False)
        self.pc_descriptor_subset = [
            'BertzCT', 'MolLogP', 'MolWt', 'TPSA', 'NumHAcceptors',
            'NumHDonors', 'NumRotatableBonds', 'NumAliphaticRings',
            'NumAromaticRings'
        ]
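# Illustrative sketch only: the descriptor names listed above correspond to functions in RDKit's
# Descriptors module, so calculate_pc_descriptors can be pictured as building an
# (n_molecules, n_descriptors) array along these lines. The actual guacamol helper may differ
# in details such as error handling.
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors


def calculate_pc_descriptors_sketch(smiles_list, descriptor_names) -> np.ndarray:
    """Compute the named RDKit descriptors for each parsable SMILES string."""
    rows = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue  # skip unparsable SMILES
        rows.append([getattr(Descriptors, name)(mol) for name in descriptor_names])
    return np.array(rows)
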
Example #8
def derive(deriver, seeds, mut_rate, n_brics, n_selfies, n_smiles_gb,
           n_selfies_gb, scanner):
    print('Deriving new mols...')
    all_mols = set()
    deriver.set_seeds(seeds)

    if scanner:
        good_scanner, _ = deriver.scan_selfies()
        print(f'Generated {len(good_scanner)} scanner mols.')
        all_mols.update(good_scanner)

    if n_brics > 0:
        try:
            good_brics, _ = deriver.derive_brics(n_children=int(n_brics))
            print(f'Generated {len(good_brics)} brics mols.')
            all_mols.update(good_brics)
        except ZeroDivisionError:
            print('No valid seed fragments could be generated for BRICS.')

    if n_selfies > 0:
        good_selfies, _ = deriver.derive_selfies(n_children=int(n_selfies))
        print(f'Generated {len(good_selfies)} selfies mols.')
        all_mols.update(good_selfies)

    if n_selfies_gb > 0:
        good_selfies_gb, _ = deriver.derive_gb(n_children=int(n_selfies_gb),
                                               mut_rate=mut_rate,
                                               kind='selfies')
        print(f'Generated {len(good_selfies_gb)} selfies_gb mols.')
        all_mols.update(good_selfies_gb)

    if n_smiles_gb > 0:
        good_smiles_gb, _ = deriver.derive_gb(n_children=int(n_smiles_gb),
                                              mut_rate=mut_rate,
                                              kind='smiles')
        print(f'Generated {len(good_smiles_gb)} smiles_gb mols.')
        all_mols.update(good_smiles_gb)

    return canonicalize_list(list(all_mols), False)
Example #9
    def assess_model(
        self, model: DistributionMatchingGenerator
    ) -> DistributionLearningBenchmarkResult:
        """
        Assess a distribution-matching generator model.

        Args:
            model: model to assess
        """
        start_time = time.time()
        molecules = sample_unique_molecules(
            model=model, number_molecules=self.number_samples, max_tries=2)
        end_time = time.time()

        if len(molecules) != self.number_samples:
            logger.warning(
                "The model could not generate enough unique molecules. The score will be penalized."
            )

        # canonicalize_list in order to remove stereo information (also removes duplicates and invalid molecules, but there shouldn't be any)
        unique_molecules = set(
            canonicalize_list(molecules, include_stereocenters=False))

        novel_molecules = unique_molecules.difference(
            self.training_set_molecules)

        novel_ratio = len(novel_molecules) / self.number_samples

        metadata = {
            "number_samples": self.number_samples,
            "number_novel": len(novel_molecules)
        }

        return DistributionLearningBenchmarkResult(
            benchmark_name=self.name,
            score=novel_ratio,
            sampling_time=end_time - start_time,
            metadata=metadata,
        )
Example #10
def save_metrics(model: EstimatorGenerator,
                 training_set_file: Union[str, Path],
                 output_file: Union[str, Path]) -> None:
    training_set = [s.strip() for s in open(training_set_file).readlines()]
    training_set_molecules = set(
        canonicalize_list(training_set, include_stereocenters=False))
    LOG.info('Loaded %d unique molecules from %s', len(training_set_molecules),
             training_set_file)

    metrics = GraphMolecularMetrics(None, None)
    gen_molecules = sample_unique_molecules(model, 10000)
    pbar = tqdm(gen_molecules, desc='Computing metrics', total=10000)

    indices = []
    samples = defaultdict(lambda: [])
    for i, smi in enumerate(pbar):
        if smi is None or not ValidityScore.is_valid_smiles(smi):
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue

        values = metrics.get_validation_metrics([mol])
        values['SMILES'] = smi
        values['is_novel'] = 0 if smi in training_set_molecules else 1

        for key, val in values.items():
            if isinstance(val, list):
                assert len(val) == 1
                val = val[0]
            samples[key].append(val)
        indices.append(i)

    df = pd.DataFrame.from_dict(samples)
    df.index = indices
    LOG.info('Saving metrics to %s', output_file)
    df.to_csv(output_file)
    def optimise(self, objective: ScoringFunction, start_population, keep_top,
                 n_epochs, mols_to_sample, optimize_n_epochs,
                 optimize_batch_size, pretrain_n_epochs) -> Tuple[List[OptResult], List[List[str]]]:
        """
        Takes an objective and tries to optimise it.

        :param objective: MPO
        :param start_population: initial compounds (list of SMILES), or an empty list to request a new (random) population
        :param keep_top: number of molecules to keep at each iterative finetune step
        :param n_epochs: number of generations (iterative finetune steps)
        :param mols_to_sample: number of molecules to sample at each iterative finetune step
        :param optimize_n_epochs: number of epochs to finetune at each step
        :param optimize_batch_size: batch size for fine-tuning
        :param pretrain_n_epochs: number of epochs to pretrain on the start population
        :return: candidate molecules sorted by score, and the list of sampled SMILES per generation
        """

        int_results = self.pretrain_on_initial_population(
            objective, start_population, pretrain_epochs=pretrain_n_epochs)

        results: List[OptResult] = []
        seen: Set[str] = set()

        for k in int_results:
            if k.smiles not in seen:
                results.append(k)
                seen.add(k.smiles)

        smiles_history = []
        for epoch in range(1, 1 + n_epochs):

            t0 = time.time()
            samples = self.sampler.sample(self.model,
                                          mols_to_sample,
                                          max_seq_len=self.max_len)
            t1 = time.time()

            canonicalized_samples = set(
                canonicalize_list(samples, include_stereocenters=True))
            smiles_history.append(list(canonicalized_samples))
            payload = list(canonicalized_samples.difference(seen))
            # sorting is necessary for reproducibility between different runs
            payload.sort()

            seen.update(canonicalized_samples)

            scores = objective.score_list(payload)
            int_results = [
                OptResult(smiles=smiles, score=score)
                for smiles, score in zip(payload, scores)
            ]

            t2 = time.time()

            results.extend(sorted(int_results, reverse=True)[0:keep_top])
            results.sort(reverse=True)
            subset = [i.smiles for i in results][0:keep_top]

            np.random.shuffle(subset)

            sub_train = subset[0:int(3 * len(subset) / 4)]
            sub_test = subset[int(3 * len(subset) / 4):]

            train_seqs, _ = load_smiles_from_list(sub_train,
                                                  max_len=self.max_len)
            valid_seqs, _ = load_smiles_from_list(sub_test,
                                                  max_len=self.max_len)

            train_set = get_tensor_dataset(train_seqs)
            valid_set = get_tensor_dataset(valid_seqs)

            opt_batch_size = min(len(sub_train), optimize_batch_size)

            print_every = int(len(sub_train) / opt_batch_size)

            if optimize_n_epochs > 0:
                self.trainer.fit(train_set,
                                 valid_set,
                                 n_epochs=optimize_n_epochs,
                                 batch_size=opt_batch_size,
                                 print_every=print_every,
                                 valid_every=print_every)

            t3 = time.time()

            logger.info(f'Generation {epoch} --- timings: '
                        f'sample: {(t1 - t0):.3f} s, '
                        f'score: {(t2 - t1):.3f} s, '
                        f'finetune: {(t3 - t2):.3f} s')

            top4 = '\n'.join(f'\t{result.score:.3f}: {result.smiles}'
                             for result in results[:4])

            logger.info(f'Top 4:\n{top4}')
            print(f'Top 4:\n{top4}')

        return sorted(results, reverse=True), smiles_history
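# OptResult is not shown in this listing; the sorting calls above only require it to be a small,
# orderable (smiles, score) container. A plausible sketch (an assumption, not the project's
# exact class):
from functools import total_ordering


@total_ordering
class OptResultSketch:
    """Minimal result container ordered by (score, smiles)."""

    def __init__(self, smiles: str, score: float) -> None:
        self.smiles = smiles
        self.score = score

    def __eq__(self, other) -> bool:
        return (self.score, self.smiles) == (other.score, other.smiles)

    def __lt__(self, other) -> bool:
        return (self.score, self.smiles) < (other.score, other.smiles)
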
Example #12
def main():
    """ Get Chembl-23.

    Preprocessing steps:

    1) filter SMILES shorter than 5 and longer than 200 chars and those with forbidden symbols
    2) canonicalize, neutralize, only permit smiles shorter than 100 chars
    3) shuffle, write files, check if they are consistently hashed.
    """
    setup_default_logger()

    argparser = get_argparser()
    args = argparser.parse_args()

    # Set constants
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    print("Preprocessing ChEMBL molecules...")

    chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

    data = (
        pkgutil.get_data("guacamol.data", "holdout_set_gcm_v1.smiles").decode("utf-8").splitlines()
    )

    holdout_mols = [i.split(" ")[0] for i in data]
    holdout_set = set(canonicalize_list(holdout_mols, False))
    holdout_fps = get_fingerprints_from_smileslist(holdout_set)

    # Download Chembl23 if needed.
    download_if_not_present(chembl_file, uri=CHEMBL_URL)
    raw_smiles = get_raw_smiles(
        chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open, extract_fn=extract_chembl
    )

    file_prefix = "chembl24_canon"

    print(
        f"and standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores, "
        f"and excluding molecules based on ECFP4 similarity of > {TANIMOTO_CUTOFF} to the holdout set."
    )

    # Process all the SMILES in parallel
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)

    joblist = (
        delayed(filter_and_canonicalize)(
            smiles_str, holdout_set, holdout_fps, neutralization_rxns, TANIMOTO_CUTOFF, False
        )
        for smiles_str in raw_smiles
    )

    output = runner(joblist)

    # Put all nonzero molecules in a list, remove duplicates, sort and shuffle

    all_good_mols = sorted(list(set([item[0] for item in output if item])))
    np.random.shuffle(all_good_mols)
    print(f"Ended up with {len(all_good_mols)} molecules. Preparing splits...")

    # Split into train-dev-test
    # Check whether the md5-hashes of the generated smiles files match
    # the precomputed hashes, this ensures everyone works with the same splits.

    VALID_SIZE = int(0.05 * len(all_good_mols))
    TEST_SIZE = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:VALID_SIZE]
    dev_path = os.path.join(args.destination, f"{file_prefix}_dev-valid.smiles")
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[VALID_SIZE : VALID_SIZE + TEST_SIZE]
    test_path = os.path.join(args.destination, f"{file_prefix}_test.smiles")
    write_smiles(test_set, test_path)

    train_set = all_good_mols[VALID_SIZE + TEST_SIZE :]
    train_path = os.path.join(args.destination, f"{file_prefix}_train.smiles")
    write_smiles(train_set, train_path)

    # check the hashes
    valid_hashes = [
        compare_hash(train_path, TRAIN_HASH),
        compare_hash(dev_path, VALID_HASH),
        compare_hash(test_path, TEST_HASH),
    ]

    if not all(valid_hashes):
        raise SystemExit(f"Invalid hashes for the dataset files")

    print("Dataset generation successful. You are ready to go.")
Example #13
def main():
    """ Get Chembl-23.

    Preprocessing steps:

    1) filter SMILES shorter than 5 and longer than 200 chars and those with forbidden symbols
    2) canonicalize, neutralize, only permit smiles shorter than 100 chars
    3) shuffle, write files, check if they are consistently hashed.
    """
    argparser = get_argparser()
    args = argparser.parse_args()

    # Set constants
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    tanimoto_cutoff = args.tanimoto_cutoff

    # Either use chembl, or supplied SMILES file.

    print('Preprocessing molecules...')

    if args.chembl:

        print('Using Chembl')

        chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

        data = pkgutil.get_data('guacamol.data', 'holdout_set_gcm_v1.smiles').decode('utf-8').splitlines()

        holdout_mols = [i.split(' ')[0] for i in data]
        holdout_set = set(canonicalize_list(holdout_mols, False))
        holdout_fps = get_fingerprints_from_smileslist(holdout_set)

        # Download Chembl23 if needed.
        download_if_not_present(chembl_file,
                                uri=CHEMBL_URL)
        raw_smiles = get_raw_smiles(chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open,
                                    extract_fn=extract_chembl)

        file_prefix = 'chembl24_canon'

        print(f'Excluding molecules based on ECFP4 similarity of > {tanimoto_cutoff} to the holdout set')

    else:
        if args.input is None:
            raise IOError(
                'You need to specify an input smiles file with -i {file} or --input {file}. \n'
                'Alternatively, provide the --chembl flag to download and process molecules from ChEMBL24 (recommended)')

        raw_smiles = get_raw_smiles(args.input, smiles_char_dict=smiles_dict, open_fn=open,
                                    extract_fn=extract_smilesfile)
        tanimoto_cutoff = 100  # effectively no cutoff
        holdout_set = set([])
        holdout_fps = []
        file_prefix = args.output_prefix

    print()
    print(f'Standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores...')

    # Process all the SMILES in parallel
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)

    joblist = (delayed(filter_and_canonicalize)(smiles_str,
                                                holdout_set,
                                                holdout_fps,
                                                neutralization_rxns,
                                                tanimoto_cutoff,
                                                False)
               for smiles_str in raw_smiles)

    output = runner(joblist)

    # Put all nonzero molecules in a list, remove duplicates, sort and shuffle

    all_good_mols = sorted(list(set([item[0] for item in output if item])))
    np.random.shuffle(all_good_mols)
    print(f'Ended up with {len(all_good_mols)} molecules. Preparing splits...')

    # Split into train-dev-test
    # Check whether the md5-hashes of the generated smiles files match
    # the precomputed hashes, this ensures everyone works with the same splits.

    VALID_SIZE = int(0.05 * len(all_good_mols))
    TEST_SIZE = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:VALID_SIZE]
    dev_path = os.path.join(args.destination, f'{file_prefix}_dev-valid.smiles')
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[VALID_SIZE:VALID_SIZE + TEST_SIZE]
    test_path = os.path.join(args.destination, f'{file_prefix}_test.smiles')
    write_smiles(test_set, test_path)

    train_set = all_good_mols[VALID_SIZE + TEST_SIZE:]
    train_path = os.path.join(args.destination, f'{file_prefix}_train.smiles')
    write_smiles(train_set, train_path)

    # for chembl, check the hashes
    if args.chembl:
        compare_hash(train_path, TRAIN_HASH)
        compare_hash(dev_path, VALID_HASH)
        compare_hash(test_path, TEST_HASH)

        print('The train/test/dev-file md5 hashes match the expected hashes.')

    print('You are ready to go.')
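# compare_hash is not shown here; as used above it presumably computes an md5 digest of the
# written file and compares it to a known hex digest. A hypothetical sketch of that idea:
import hashlib


def compare_hash_sketch(file_path: str, expected_hash: str) -> bool:
    """Return True if the file's md5 hex digest matches the expected value."""
    with open(file_path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    if digest != expected_hash:
        print(f'{file_path}: hash mismatch ({digest} != {expected_hash})')
    return digest == expected_hash
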
    def assess_model(
            self, model: GoalDirectedGenerator) -> GoalDirectedBenchmarkResult:
        """
        Assess the given model by asking it to generate molecules optimizing a scoring function.
        The number of molecules to generate is determined automatically from the score contribution specification.

        Args:
            model: model to assess
        """
        number_molecules_to_generate = max(
            self.contribution_specification.top_counts)
        start_time = time.time()
        molecules = model.generate_optimized_molecules(
            scoring_function=self.wrapped_objective,
            number_molecules=number_molecules_to_generate,
            starting_population=self.starting_population,
        )
        end_time = time.time()

        canonicalized_molecules = canonicalize_list(
            molecules, include_stereocenters=False)
        unique_molecules = remove_duplicates(canonicalized_molecules)
        scores = self.objective.score_list(unique_molecules)

        if len(unique_molecules) != number_molecules_to_generate:
            number_missing = number_molecules_to_generate - len(
                unique_molecules)
            logger.warning(
                f"An incorrect number of distinct molecules was generated: "
                f"{len(unique_molecules)} instead of {number_molecules_to_generate}. "
                f"Padding scores with {number_missing} zeros...")
            scores.extend([0.0] * number_missing)

        global_score, top_x_dict = compute_global_score(
            self.contribution_specification, scores)

        scored_molecules = zip(unique_molecules, scores)
        sorted_scored_molecules = sorted(scored_molecules,
                                         key=lambda x: (x[1], x[0]),
                                         reverse=True)

        internal_similarities = calculate_internal_pairwise_similarities(
            unique_molecules)

        # accumulate internal_similarities in metadata
        int_simi_histogram = np.histogram(internal_similarities,
                                          bins=10,
                                          range=(0, 1),
                                          density=True)

        metadata: Dict[str, Any] = {}
        metadata.update(top_x_dict)
        metadata["internal_similarity_max"] = internal_similarities.max()
        metadata["internal_similarity_mean"] = internal_similarities.mean()
        metadata["internal_similarity_histogram_density"] = (
            int_simi_histogram[0].tolist(), )
        metadata["internal_similarity_histogram_bins"] = (
            int_simi_histogram[1].tolist(), )

        return GoalDirectedBenchmarkResult(
            benchmark_name=self.name,
            score=global_score,
            optimized_molecules=sorted_scored_molecules,
            execution_time=end_time - start_time,
            number_scoring_function_calls=self.wrapped_objective.evaluations,
            metadata=metadata,
        )
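# compute_global_score is not part of this listing. As a rough mental model (an assumption,
# not guacamol's exact implementation), it can be pictured as averaging the mean score of each
# requested top-k slice defined by the contribution specification:
from typing import Dict, List, Tuple

import numpy as np


def compute_global_score_sketch(top_counts: List[int],
                                scores: List[float]) -> Tuple[float, Dict[str, float]]:
    """Average the mean of the top-k scores over every requested k."""
    sorted_scores = sorted(scores, reverse=True)
    top_x_dict = {}
    for k in top_counts:
        top_x_dict[f'top_{k}'] = float(np.mean(sorted_scores[:k]))
    global_score = float(np.mean(list(top_x_dict.values())))
    return global_score, top_x_dict
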
    def assess_model(
        self, model: DistributionMatchingGenerator
    ) -> DistributionLearningBenchmarkResult:
        """
        Assess a distribution-matching generator model.

        Args:
            model: model to assess
        """
        start_time = time.time()
        molecules = sample_unique_molecules(
            model=model, number_molecules=self.number_samples, max_tries=2)
        end_time = time.time()

        if len(molecules) != self.number_samples:
            logger.warning(
                'The model could not generate enough unique molecules. The score will be penalized.'
            )

        # canonicalize_list in order to remove stereo information (also removes duplicates and invalid molecules, but there shouldn't be any)
        unique_molecules = set(
            canonicalize_list(molecules, include_stereocenters=False))

        # first we calculate the descriptors, which are np.arrays of size n_samples x n_descriptors
        d_sampled = calculate_pc_descriptors(unique_molecules,
                                             self.pc_descriptor_subset)
        d_chembl = calculate_pc_descriptors(self.training_set_molecules,
                                            self.pc_descriptor_subset)

        kldivs = {}

        # now we calculate the kl divergence for the float valued descriptors ...
        for i in range(4):
            kldiv = continuous_kldiv(X_baseline=d_chembl[:, i],
                                     X_sampled=d_sampled[:, i])
            kldivs[self.pc_descriptor_subset[i]] = kldiv

        # ... and for the int valued ones.
        for i in range(4, 9):
            kldiv = discrete_kldiv(X_baseline=d_chembl[:, i],
                                   X_sampled=d_sampled[:, i])
            kldivs[self.pc_descriptor_subset[i]] = kldiv

        # pairwise similarity

        chembl_sim = calculate_internal_pairwise_similarities(
            self.training_set_molecules)
        chembl_sim = chembl_sim.max(axis=1)

        sampled_sim = calculate_internal_pairwise_similarities(
            unique_molecules)
        sampled_sim = sampled_sim.max(axis=1)

        kldiv_int_int = continuous_kldiv(X_baseline=chembl_sim,
                                         X_sampled=sampled_sim)
        kldivs['internal_similarity'] = kldiv_int_int

        # for some reason, this runs into problems when both sets are identical.
        # cross_set_sim = calculate_pairwise_similarities(self.training_set_molecules, unique_molecules)
        # cross_set_sim = cross_set_sim.max(axis=1)
        #
        # kldiv_ext = discrete_kldiv(chembl_sim, cross_set_sim)
        # kldivs['external_similarity'] = kldiv_ext
        # kldiv_sum += kldiv_ext

        metadata = {'number_samples': self.number_samples, 'kl_divs': kldivs}

        # Each KL divergence value is transformed to be in [0, 1].
        # Then their average delivers the final score.
        partial_scores = [np.exp(-score) for score in kldivs.values()]
        score = sum(partial_scores) / len(partial_scores)

        return DistributionLearningBenchmarkResult(benchmark_name=self.name,
                                                   score=score,
                                                   sampling_time=end_time - start_time,
                                                   metadata=metadata)
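# For orientation (a sketch, not the exact guacamol helper): continuous_kldiv can be
# approximated by fitting kernel density estimates to both samples and evaluating a discretized
# KL divergence between them; the np.exp(-kldiv) transform above then maps each divergence
# into (0, 1] so that identical distributions score 1.
import numpy as np
from scipy.stats import entropy, gaussian_kde


def continuous_kldiv_sketch(X_baseline: np.ndarray, X_sampled: np.ndarray) -> float:
    """Approximate KL(P_baseline || Q_sampled) from two 1-D samples via Gaussian KDEs."""
    kde_p = gaussian_kde(X_baseline)
    kde_q = gaussian_kde(X_sampled)
    x_eval = np.linspace(min(X_baseline.min(), X_sampled.min()),
                         max(X_baseline.max(), X_sampled.max()), num=1000)
    p = kde_p(x_eval) + 1e-10  # smoothing to avoid zeros in the ratio
    q = kde_q(x_eval) + 1e-10
    return float(entropy(p, q))
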
Example #16
    def optimise(self, objective, start_population, keep_top, n_epochs,
                 mols_to_sample, optimize_n_epochs, optimize_batch_size,
                 pretrain_n_epochs) -> List[OptResult]:
        """
        Takes an objective and tries to optimise it
        :param objective: MPO
        :param start_population: Initial compounds (list of smiles) or request new (random?) population
        :param kwargs need to contain:
                keep_top: number of molecules to keep at each iterative finetune step
                mols_to_sample: number of molecules to sample at each iterative finetune step
                optimize_n_epochs: number of episodes to finetune
                optimize_batch_size: batch size for fine-tuning
                pretrain_n_epochs: number of epochs to pretrain on start population
        :return: Candidate molecules
        """

        # int_results = self.pretrain_on_initial_population(objective, start_population,
        #                                                   pretrain_epochs=pretrain_n_epochs)

        results: List[OptResult] = []
        seen: Set[str] = set()

        # for k in int_results:
        #     if k.smiles not in seen:
        #         results.append(k)
        #         seen.add(k.smiles)

        oracle_num = 0
        smiles2score = {}
        result_folder = "/project/molecular_data/graphnn/pyscreener/smiles_lstm_hc/results.run.3/"
        for epoch in tqdm(range(1, 1 + n_epochs)):

            t0 = time.time()
            samples = self.sampler.sample(self.model,
                                          mols_to_sample,
                                          max_seq_len=self.max_len)
            t1 = time.time()

            canonicalized_samples = set(
                canonicalize_list(samples, include_stereocenters=True))
            payload = list(canonicalized_samples.difference(seen))
            # sorting is necessary for reproducibility between different runs
            payload.sort()

            seen.update(canonicalized_samples)

            print('payload', len(payload))
            # scores = objective.score_list(payload)
            scores = []
            for smiles in payload:
                if smiles not in smiles2score:
                    try:
                        score = objective(smiles)
                    except Exception:
                        # scoring failures are treated as a score of zero
                        score = 0.0
                    oracle_num += 1
                    smiles2score[smiles] = score
                    print('oracle_num', oracle_num)
                    if oracle_num % 50 == 0:
                        pickle.dump(
                            smiles2score,
                            open(result_folder + str(oracle_num) + '.pkl',
                                 'wb'))
                else:
                    score = smiles2score[smiles]
                scores.append(score)

            int_results = [
                OptResult(smiles=smiles, score=score)
                for smiles, score in zip(payload, scores)
            ]

            t2 = time.time()

            results.extend(sorted(int_results, reverse=True)[0:keep_top])
            results.sort(reverse=True)
            subset = [i.smiles for i in results][0:keep_top]

            np.random.shuffle(subset)

            sub_train = subset[0:int(3 * len(subset) / 4)]
            sub_test = subset[int(3 * len(subset) / 4):]

            train_seqs, _ = load_smiles_from_list(sub_train,
                                                  max_len=self.max_len)
            valid_seqs, _ = load_smiles_from_list(sub_test,
                                                  max_len=self.max_len)

            train_set = get_tensor_dataset(train_seqs)
            valid_set = get_tensor_dataset(valid_seqs)

            opt_batch_size = min(len(sub_train), optimize_batch_size)

            print_every = int(len(sub_train) / opt_batch_size)

            if optimize_n_epochs > 0:
                self.trainer.fit(train_set,
                                 valid_set,
                                 n_epochs=optimize_n_epochs,
                                 batch_size=opt_batch_size,
                                 print_every=print_every,
                                 valid_every=print_every)

            t3 = time.time()

            logger.info(f'Generation {epoch} --- timings: '
                        f'sample: {(t1 - t0):.3f} s, '
                        f'score: {(t2 - t1):.3f} s, '
                        f'finetune: {(t3 - t2):.3f} s')

            top4 = '\n'.join(f'\t{result.score:.3f}: {result.smiles}'
                             for result in results[:4])
            logger.info(f'Top 4:\n{top4}')

        return sorted(results, reverse=True)