Example #1
def test_internal_sim():
    # four distinct molecules -> a 4x4 similarity matrix
    mols = ['OCCCF', 'c1cc(F)ccc1', 'c1cnc(CO)cc1', 'FOOF']
    sim = calculate_internal_pairwise_similarities(mols)

    assert sim.shape[0] == 4
    assert sim.shape[1] == 4
    # check elements: the matrix must be symmetric, off-diagonal entries must
    # be strictly below 1.0 (all molecules are distinct), and the diagonal is
    # zero because self-similarity is not computed
    for i in range(sim.shape[0]):
        for j in range(sim.shape[1]):
            assert sim[i, j] == sim[j, i]
            if i != j:
                assert sim[i, j] < 1.0
            else:
                assert sim[i, j] == 0
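
For context, here is a minimal sketch of what calculate_internal_pairwise_similarities could look like, assuming RDKit Morgan fingerprints and Tanimoto similarity; the fingerprint radius and bit size are illustrative choices, not confirmed by the source. The zero diagonal matches the assertion in the test above.

from typing import List

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem


def calculate_internal_pairwise_similarities(smiles_list: List[str]) -> np.ndarray:
    """Symmetric n x n Tanimoto similarity matrix; the diagonal is left at zero."""
    mols = [Chem.MolFromSmiles(s) for s in smiles_list]
    fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048) for m in mols]

    n = len(fps)
    similarities = np.zeros((n, n))
    # fill the strict lower triangle row by row, then mirror it
    for i in range(1, n):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        similarities[i, :i] = sims
        similarities[:i, i] = sims
    return similarities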
Example #2
    def assess_model(
            self, model: GoalDirectedGenerator) -> GoalDirectedBenchmarkResult:
        """
        Assess the given model by asking it to generate molecules optimizing a scoring function.
        The number of molecules to generate is determined automatically from the score contribution specification.

        Args:
            model: model to assess
        """
        number_molecules_to_generate = max(
            self.contribution_specification.top_counts)
        start_time = time.time()
        molecules = model.generate_optimized_molecules(
            scoring_function=self.wrapped_objective,
            number_molecules=number_molecules_to_generate,
            starting_population=self.starting_population,
        )
        end_time = time.time()

        canonicalized_molecules = canonicalize_list(
            molecules, include_stereocenters=False)
        unique_molecules = remove_duplicates(canonicalized_molecules)
        scores = self.objective.score_list(unique_molecules)

        if len(unique_molecules) != number_molecules_to_generate:
            number_missing = number_molecules_to_generate - len(
                unique_molecules)
            logger.warning(
                f"An incorrect number of distinct molecules was generated: "
                f"{len(unique_molecules)} instead of {number_molecules_to_generate}. "
                f"Padding scores with {number_missing} zeros...")
            scores.extend([0.0] * number_missing)

        global_score, top_x_dict = compute_global_score(
            self.contribution_specification, scores)

        scored_molecules = zip(unique_molecules, scores)
        sorted_scored_molecules = sorted(scored_molecules,
                                         key=lambda x: (x[1], x[0]),
                                         reverse=True)

        internal_similarities = calculate_internal_pairwise_similarities(
            unique_molecules)

        # summarize the distribution of internal similarities for the metadata
        int_simi_histogram = np.histogram(internal_similarities,
                                          bins=10,
                                          range=(0, 1),
                                          density=True)

        metadata: Dict[str, Any] = {}
        metadata.update(top_x_dict)
        metadata["internal_similarity_max"] = internal_similarities.max()
        metadata["internal_similarity_mean"] = internal_similarities.mean()
        metadata["internal_similarity_histogram_density"] = (
            int_simi_histogram[0].tolist(), )
        metadata["internal_similarity_histogram_bins"] = (
            int_simi_histogram[1].tolist(), )

        return GoalDirectedBenchmarkResult(
            benchmark_name=self.name,
            score=global_score,
            optimized_molecules=sorted_scored_molecules,
            execution_time=end_time - start_time,
            number_scoring_function_calls=self.wrapped_objective.evaluations,
            metadata=metadata,
        )
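
A hedged usage sketch: a toy GoalDirectedGenerator that ignores the scoring function, just to show the contract assess_model relies on. The import paths follow guacamol's module layout and should be treated as assumptions.

from typing import List, Optional

from guacamol.goal_directed_generator import GoalDirectedGenerator
from guacamol.scoring_function import ScoringFunction


class FixedSmilesGenerator(GoalDirectedGenerator):
    """Toy generator: returns the same molecules regardless of the objective."""

    def generate_optimized_molecules(self, scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[List[str]] = None) -> List[str]:
        pool = ['OCCCF', 'c1cc(F)ccc1', 'c1cnc(CO)cc1']
        # repeat the pool until enough molecules are available, then truncate
        return (pool * (number_molecules // len(pool) + 1))[:number_molecules]


# 'benchmark' is assumed to be a configured GoalDirectedBenchmark instance:
# result = benchmark.assess_model(FixedSmilesGenerator())
# print(result.score, result.execution_time)

Because assess_model deduplicates the generated list, this toy generator exercises the zero-padding branch whenever more than three molecules are requested.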
Example #3
    def assess_model(
        self, model: DistributionMatchingGenerator
    ) -> DistributionLearningBenchmarkResult:
        """
        Assess a distribution-matching generator model.

        Args:
            model: model to assess
        """
        start_time = time.time()
        molecules = sample_unique_molecules(
            model=model, number_molecules=self.number_samples, max_tries=2)
        end_time = time.time()

        if len(molecules) != self.number_samples:
            logger.warning(
                'The model could not generate enough unique molecules. The score will be penalized.'
            )

        # canonicalize_list removes stereo information (it also drops duplicates
        # and invalid molecules, though there should not be any at this point)
        unique_molecules = set(
            canonicalize_list(molecules, include_stereocenters=False))

        # first we calculate the descriptors, which are np.arrays of size n_samples x n_descriptors
        d_sampled = calculate_pc_descriptors(unique_molecules,
                                             self.pc_descriptor_subset)
        d_chembl = calculate_pc_descriptors(self.training_set_molecules,
                                            self.pc_descriptor_subset)

        kldivs = {}

        # now we calculate the KL divergence for the float-valued descriptors
        # (a sketch of continuous_kldiv and discrete_kldiv follows this example) ...
        for i in range(4):
            kldiv = continuous_kldiv(X_baseline=d_chembl[:, i],
                                     X_sampled=d_sampled[:, i])
            kldivs[self.pc_descriptor_subset[i]] = kldiv

        # ... and for the integer-valued ones.
        for i in range(4, 9):
            kldiv = discrete_kldiv(X_baseline=d_chembl[:, i],
                                   X_sampled=d_sampled[:, i])
            kldivs[self.pc_descriptor_subset[i]] = kldiv

        # pairwise similarity: compare the nearest-neighbor internal similarity
        # distributions of the training set and of the generated set

        chembl_sim = calculate_internal_pairwise_similarities(
            self.training_set_molecules)
        chembl_sim = chembl_sim.max(axis=1)

        sampled_sim = calculate_internal_pairwise_similarities(
            unique_molecules)
        sampled_sim = sampled_sim.max(axis=1)

        kldiv_int_int = continuous_kldiv(X_baseline=chembl_sim,
                                         X_sampled=sampled_sim)
        kldivs['internal_similarity'] = kldiv_int_int

        # for some reason, this runs into problems when both sets are identical.
        # cross_set_sim = calculate_pairwise_similarities(self.training_set_molecules, unique_molecules)
        # cross_set_sim = cross_set_sim.max(axis=1)
        #
        # kldiv_ext = discrete_kldiv(chembl_sim, cross_set_sim)
        # kldivs['external_similarity'] = kldiv_ext
        # kldiv_sum += kldiv_ext

        metadata = {'number_samples': self.number_samples, 'kl_divs': kldivs}

        # Each KL divergence value kl is mapped into (0, 1] via exp(-kl):
        # kl = 0 gives 1.0, large divergences approach 0.0.
        # The final score is the average of these partial scores.
        partial_scores = [np.exp(-score) for score in kldivs.values()]
        score = sum(partial_scores) / len(partial_scores)

        return DistributionLearningBenchmarkResult(
            benchmark_name=self.name,
            score=score,
            sampling_time=end_time - start_time,
            metadata=metadata,
        )
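
The two KL helpers used above are not defined in this snippet. A minimal sketch, assuming a kernel-density estimate for continuous descriptors and normalized histograms for discrete ones; the evaluation grid, bin count, and epsilon smoothing are illustrative choices, not necessarily what guacamol does.

import numpy as np
from scipy.stats import entropy, gaussian_kde


def continuous_kldiv(X_baseline: np.ndarray, X_sampled: np.ndarray) -> float:
    kde_P = gaussian_kde(X_baseline)
    kde_Q = gaussian_kde(X_sampled)
    x_eval = np.linspace(min(X_baseline.min(), X_sampled.min()),
                         max(X_baseline.max(), X_sampled.max()), num=1000)
    P = kde_P(x_eval) + 1e-10  # epsilon keeps the log inside entropy() finite
    Q = kde_Q(x_eval) + 1e-10
    return entropy(P, Q)  # scipy normalizes both distributions internally


def discrete_kldiv(X_baseline: np.ndarray, X_sampled: np.ndarray) -> float:
    # bin both sets with edges derived from the baseline distribution
    bins = np.histogram_bin_edges(X_baseline, bins=10)
    P, _ = np.histogram(X_baseline, bins=bins, density=True)
    Q, _ = np.histogram(X_sampled, bins=bins, density=True)
    return entropy(P + 1e-10, Q + 1e-10)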
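
And a toy DistributionMatchingGenerator showing the interface this assess_model expects; the import path follows guacamol's layout and is an assumption.

from typing import List

from guacamol.distribution_matching_generator import DistributionMatchingGenerator


class ReplayGenerator(DistributionMatchingGenerator):
    """Toy generator that replays SMILES from a fixed list."""

    def __init__(self, smiles: List[str]) -> None:
        self.smiles = smiles

    def generate(self, number_samples: int) -> List[str]:
        # cycle through the stored list until enough samples are produced
        reps = number_samples // len(self.smiles) + 1
        return (self.smiles * reps)[:number_samples]


# 'benchmark' is assumed to be a configured distribution-learning benchmark:
# result = benchmark.assess_model(ReplayGenerator(['CCO', 'c1ccccc1', 'CC(=O)O']))
# print(result.score)

Note that sample_unique_molecules deduplicates, so a three-molecule replay list can never reach number_samples unique molecules; this deliberately exercises the warning branch above.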