def test_subset_with_no_seed():
    """Two unseeded draws from the same dataset must not compare equal."""
    population = list(np.random.rand(100))

    first_draw = get_random_subset(population, 10)
    second_draw = get_random_subset(population, 10)

    assert first_draw != second_draw
def test_subset_with_random_seed():
    """Equal seeds must give equal subsets; a different seed must not."""
    population = list(np.random.rand(100))

    # Two draws sharing seed 33, then one draw with seed 43.
    seeded_a = get_random_subset(population, 10, seed=33)
    seeded_b = get_random_subset(population, 10, seed=33)
    other_seed = get_random_subset(population, 10, seed=43)

    assert seeded_a == seeded_b
    assert seeded_a != other_seed
def test_subset():
    """Every element of a drawn subset must come from the source dataset."""
    population = list(np.random.rand(100))
    drawn = get_random_subset(population, 10)

    assert all(item in population for item in drawn)
def __init__(self, training_set: List[str],
             chemnet_model_filename='ChemNet_v0.13_pretrained.h5',
             sample_size=10000) -> None:
    """
    Store the configuration and draw the reference molecules.

    Args:
        training_set: molecules from the training set
        chemnet_model_filename: file name of the trained ChemNet model.
            It must be present in the 'fcd' package, since it will be
            loaded directly from there.
        sample_size: number of molecules from which the distribution
            statistics are generated (both reference data and model)
    """
    self.chemnet_model_filename = chemnet_model_filename
    self.sample_size = sample_size

    # The parent initializer receives the sample size as its number of samples.
    super().__init__(name='Frechet ChemNet Distance',
                     number_samples=self.sample_size)

    # The subset is drawn with a fixed seed (42).
    reference_subset = get_random_subset(training_set, self.sample_size, seed=42)
    self.reference_molecules = reference_subset
def __init__(self, number_samples: int, training_set: List[str]) -> None:
    """
    Initialize the benchmark and prepare the training-set sample.

    Args:
        number_samples: number of samples to generate from the model
        training_set: molecules from the training set
    """
    super().__init__(name='KL divergence', number_samples=number_samples)

    # Draw the subset with a fixed seed (42), then canonicalize it with
    # stereocenters excluded. self.number_samples is read after the parent
    # initializer has run, exactly as in the original statement order.
    sampled_molecules = get_random_subset(training_set, self.number_samples, seed=42)
    self.training_set_molecules = canonicalize_list(
        sampled_molecules, include_stereocenters=False)

    # Descriptor names kept on the instance; their use is outside this block.
    self.pc_descriptor_subset = [
        'BertzCT',
        'MolLogP',
        'MolWt',
        'TPSA',
        'NumHAcceptors',
        'NumHDonors',
        'NumRotatableBonds',
        'NumAliphaticRings',
        'NumAromaticRings',
    ]
def test_subset_if_dataset_too_small():
    """Requesting more elements than the dataset holds must raise."""
    population = list(np.random.rand(100))
    oversized_request = 1000

    with pytest.raises(Exception):
        get_random_subset(population, oversized_request)