def test_sample_unique_molecules_if_not_enough_unique_generated():
    # Must not raise when the generator cannot deliver enough unique
    # molecules within the allowed number of tries.
    smiles_pool = ['CO'] * 20
    smiles_pool[-1] = 'CC'

    # With max_tries=9 at most 9*2 molecules are sampled; every one of them
    # is 'CO', so the second unique molecule is simply never reached.
    generator = MockGenerator(smiles_pool)
    sampled = sample_unique_molecules(generator, 2, max_tries=9)
    assert sampled == ['CO']

    # With the default of 10 tries, 10*2 samples are drawn, which is enough
    # to pick up both unique molecules.
    generator = MockGenerator(smiles_pool)
    sampled = sample_unique_molecules(generator, 2)
    assert sampled == ['CO', 'CC']
def assess_model(
        self, model: DistributionMatchingGenerator
) -> DistributionLearningBenchmarkResult:
    """
    Assess a distribution-matching generator model.

    Args:
        model: model to assess
    """
    sampling_start = time.time()
    generated = sample_unique_molecules(
        model=model, number_molecules=self.number_samples, max_tries=2)
    sampling_end = time.time()

    if len(generated) != self.number_samples:
        logger.warning(
            "The model could not generate enough unique molecules. The score will be penalized."
        )

    # canonicalize_list in order to remove stereo information (also removes
    # duplicates and invalid molecules, but there shouldn't be any)
    canonical_molecules = set(
        canonicalize_list(generated, include_stereocenters=False))
    novel_molecules = canonical_molecules.difference(
        self.training_set_molecules)

    # Dividing by the requested number of samples (not the achieved count)
    # penalizes models that failed to produce enough unique molecules.
    novel_ratio = len(novel_molecules) / self.number_samples

    return DistributionLearningBenchmarkResult(
        benchmark_name=self.name,
        score=novel_ratio,
        sampling_time=sampling_end - sampling_start,
        metadata={
            "number_samples": self.number_samples,
            "number_novel": len(novel_molecules),
        },
    )
def save_metrics(model: EstimatorGenerator,
                 training_set_file: Union[str, Path],
                 output_file: Union[str, Path]) -> None:
    """Sample molecules from a model, compute per-molecule metrics, and save them as CSV.

    Args:
        model: generator to sample 10000 molecules from.
        training_set_file: path to a file with one SMILES string per line,
            used to decide novelty of sampled molecules.
        output_file: destination CSV path for the metrics table.
    """
    # Use a context manager so the training-set file handle is closed
    # (the previous open(...).readlines() leaked the handle).
    with open(training_set_file) as f:
        training_set = [s.strip() for s in f]
    training_set_molecules = set(
        canonicalize_list(training_set, include_stereocenters=False))
    LOG.info('Loaded %d unique molecules from %s',
             len(training_set_molecules), training_set_file)

    metrics = GraphMolecularMetrics(None, None)
    gen_molecules = sample_unique_molecules(model, 10000)
    pbar = tqdm(gen_molecules, desc='Computing metrics', total=10000)

    indices = []
    # Column name -> list of per-molecule values; defaultdict(list) is the
    # idiomatic form of defaultdict(lambda: []).
    samples = defaultdict(list)
    for i, smi in enumerate(pbar):
        # Skip anything the generator failed to produce or that is not a
        # valid SMILES string.
        if smi is None or not ValidityScore.is_valid_smiles(smi):
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        values = metrics.get_validation_metrics([mol])
        values['SMILES'] = smi
        values['is_novel'] = 0 if smi in training_set_molecules else 1
        for key, val in values.items():
            # Metrics computed on a single-molecule batch come back as
            # one-element lists; unwrap them.
            if isinstance(val, list):
                assert len(val) == 1
                val = val[0]
            samples[key].append(val)
        indices.append(i)

    df = pd.DataFrame.from_dict(samples)
    # Keep the original sampling index so rows can be traced back.
    df.index = indices
    LOG.info('Saving metrics to %s', output_file)
    df.to_csv(output_file)
def test_sample_unique_molecules_with_duplicate_molecules():
    # 'C(O)' denotes the same molecule as 'CO', so it counts as a duplicate
    # and the sampler moves on to the next distinct molecule.
    generator = MockGenerator(['CO', 'C(O)', 'CCCC', 'CC'])
    sampled = sample_unique_molecules(generator, 2)
    assert sampled == ['CO', 'CCCC']
def test_sample_unique_molecules_with_invalid_molecules():
    # Strings that are not valid SMILES are discarded; only the parseable
    # molecules are returned.
    generator = MockGenerator(['invalid1', 'invalid2', 'inv3', 'CCCC', 'CC'])
    sampled = sample_unique_molecules(generator, 2)
    assert sampled == ['CCCC', 'CC']
def test_sample_unique_molecules_for_valid_only():
    # When every generated molecule is valid and unique, they are all
    # returned in generation order.
    generator = MockGenerator(['CCCC', 'CC'])
    sampled = sample_unique_molecules(generator, 2)
    assert sampled == ['CCCC', 'CC']
def assess_model(
        self, model: DistributionMatchingGenerator
) -> DistributionLearningBenchmarkResult:
    """
    Assess a distribution-matching generator model.

    Args:
        model: model to assess
    """
    sampling_start = time.time()
    generated = sample_unique_molecules(
        model=model, number_molecules=self.number_samples, max_tries=2)
    sampling_end = time.time()

    if len(generated) != self.number_samples:
        logger.warning(
            'The model could not generate enough unique molecules. The score will be penalized.'
        )

    # canonicalize_list in order to remove stereo information (also removes
    # duplicates and invalid molecules, but there shouldn't be any)
    unique_molecules = set(
        canonicalize_list(generated, include_stereocenters=False))

    # Physico-chemical descriptors: np.arrays of size n_samples x n_descriptors.
    d_sampled = calculate_pc_descriptors(unique_molecules,
                                         self.pc_descriptor_subset)
    d_chembl = calculate_pc_descriptors(self.training_set_molecules,
                                        self.pc_descriptor_subset)

    # One KL divergence per descriptor: the first 4 descriptors are
    # float-valued (continuous estimate), the next 5 are int-valued
    # (discrete estimate).
    kldivs = {}
    for idx in range(9):
        kl_fn = continuous_kldiv if idx < 4 else discrete_kldiv
        kldivs[self.pc_descriptor_subset[idx]] = kl_fn(
            X_baseline=d_chembl[:, idx], X_sampled=d_sampled[:, idx])

    # Compare the distributions of nearest-neighbor internal similarities.
    chembl_sim = calculate_internal_pairwise_similarities(
        self.training_set_molecules).max(axis=1)
    sampled_sim = calculate_internal_pairwise_similarities(
        unique_molecules).max(axis=1)
    kldivs['internal_similarity'] = continuous_kldiv(X_baseline=chembl_sim,
                                                     X_sampled=sampled_sim)

    # for some reason, this runs into problems when both sets are identical.
    # cross_set_sim = calculate_pairwise_similarities(self.training_set_molecules, unique_molecules)
    # cross_set_sim = cross_set_sim.max(axis=1)
    #
    # kldiv_ext = discrete_kldiv(chembl_sim, cross_set_sim)
    # kldivs['external_similarity'] = kldiv_ext
    # kldiv_sum += kldiv_ext

    metadata = {'number_samples': self.number_samples, 'kl_divs': kldivs}

    # Each KL divergence value is transformed to be in [0, 1].
    # Then their average delivers the final score.
    partial_scores = [np.exp(-value) for value in kldivs.values()]
    final_score = sum(partial_scores) / len(partial_scores)

    return DistributionLearningBenchmarkResult(
        benchmark_name=self.name,
        score=final_score,
        sampling_time=sampling_end - sampling_start,
        metadata=metadata)