def test_builder_returns_false_when_smiles_already_present():
    """Appending a SMILES the builder already holds yields a falsy result."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=1)

    # Second append of the same SMILES, this time with a different column.
    was_added = molecules_builder.append('C', col2=2)

    assert not was_added
def test_to_csv_call_with_without_columns_and_columns_raises(tmp_path):
    """``to_csv`` raises ``ValueError`` when both ``columns`` and ``without_columns`` are passed."""
    # NOTE(review): another function with this exact name appears later in
    # this file. If both live in the same module, the later definition
    # replaces this one and pytest never collects it -- confirm and rename.
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0, col2=1)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.to_csv('dummy_path', columns=[], without_columns=[])
def test_top_fraction_raises_with_fraction_greater_than_one():
    """``get_first_fraction`` rejects a fraction of 1.2 with ``ValueError``."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.get_first_fraction(1.2, by_column='col1')
def test_first_n_raises_with_zero_n():
    """``get_first_n`` rejects ``n == 0`` with ``ValueError``."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.get_first_n(0, by_column='col1')
def test_first_fraction_raises_with_negative_fraction():
    """``get_first_fraction`` rejects a negative fraction with ``ValueError``."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.get_first_fraction(-1, by_column='col1')
def test_most_similar_tanimoto_multiple_elements_happy_path():
    """``most_similar_tanimoto`` reports, per stored SMILES, the closest reference and its score."""
    molecules_builder = OptimizedMolecules.Builder()
    for smiles in ('C', 'Cl', 'F'):
        molecules_builder.append(smiles, col1=0)
    molecules = molecules_builder.build()

    # Row order matters for DataFrame.equals, so the dict keeps the
    # C, F, Cl insertion order.
    expected_rows = {
        'C': {
            'tanimoto_similarity': 1.0,
            'most_similar_smiles': 'C'
        },
        'F': {
            'tanimoto_similarity': 1.0,
            'most_similar_smiles': 'F'
        },
        'Cl': {
            'tanimoto_similarity': 0.0,
            'most_similar_smiles': 'C'
        },
    }
    expected = pd.DataFrame.from_dict(expected_rows, orient='index')

    similarities = molecules.most_similar_tanimoto(['C', 'F'])

    assert similarities.equals(expected)
def test_builder_in_operator_returns_true_when_smiles_already_present():
    """The ``in`` operator on a builder is truthy for a SMILES that was appended."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col2=2)

    is_present = 'C' in molecules_builder

    assert is_present
def test_top_fraction_raises_with_non_existing_column():
    """``get_first_fraction`` raises ``ValueError`` when ``by_column`` names an unknown column."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.get_first_fraction(1, by_column='no_such_column')
def test_rmse_raises_when_col2_contains_nans():
    """``rmse`` raises ``ValueError`` when one molecule has no value for the second column."""
    molecules_builder = OptimizedMolecules.Builder()
    # 'C' is appended without col2, so that column is incomplete.
    molecules_builder.append('C', col1=1)
    molecules_builder.append('H', col1=1, col2=2)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.rmse('col1', 'col2')
def test_rmse_raises_when_col2_not_in_molecules():
    """``rmse`` raises ``ValueError`` when the second column was never appended."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col=1)
    molecules_builder.append('H', col=2)
    molecules = molecules_builder.build()

    with pytest.raises(ValueError):
        molecules.rmse('col', 'no_such_column')
def test_builder_does_not_calculate_validity_when_total_samples_is_zero():
    """No ``'validity'`` metric is produced when ``total_samples`` is left untouched."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=1)
    molecules_builder.append('H', col2=2)

    molecules = molecules_builder.build()

    assert 'validity' not in molecules.metrics
def test_builder_happy_path():
    """Built molecules expose a DataFrame indexed by SMILES with one column per keyword."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=1, col2=2)
    molecules_builder.append('H', col1=3, col2=4)
    expected_rows = {
        'C': {'col1': 1, 'col2': 2},
        'H': {'col1': 3, 'col2': 4},
    }
    expected = pd.DataFrame.from_dict(expected_rows, orient='index')

    built = molecules_builder.build()

    assert built.molecules.equals(expected)
def test_rmse_no_error_happy_path():
    """``rmse`` is 0 when both columns hold identical values."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0, col2=0)
    molecules_builder.append('H', col1=1, col2=1)
    molecules = molecules_builder.build()

    error = molecules.rmse('col1', 'col2')

    assert error == 0
def test_builder_calculates_validity_when_total_samples_positive():
    """With two molecules and ``total_samples`` raised by 10, ``'validity'`` is 0.2."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=1)
    molecules_builder.append('H', col2=2)
    molecules_builder.total_samples += 10

    built = molecules_builder.build()

    assert built.metrics['validity'] == 0.2
def test_builder_happy_path_with_missing_data():
    """Molecules appended with disjoint columns build the same frame as ``from_dict`` on sparse rows."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=1)
    molecules_builder.append('H', col2=2)
    sparse_rows = {
        'C': {'col1': 1},
        'H': {'col2': 2},
    }
    expected = pd.DataFrame.from_dict(sparse_rows, orient='index')

    built = molecules_builder.build()

    assert built.molecules.equals(expected)
def test_top_n_happy_path_descending():
    """``get_first_fraction(.5, sort_ascending=False)`` keeps the row with the larger ``col1``."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0)
    molecules_builder.append('H', col1=1)
    molecules = molecules_builder.build()
    expected = pd.DataFrame.from_dict({'H': {'col1': 1}}, orient='index')

    top_half = molecules.get_first_fraction(
        .5,
        by_column='col1',
        sort_ascending=False,
    )

    assert top_half.equals(expected)
def test_to_csv_happy_path(tmp_path):
    """``to_csv`` writes a ``SMILES`` header column followed by every data column."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0, col2=1)
    molecules_builder.append('H', col1=2.5, col2=1)
    molecules = molecules_builder.build()
    csv_path = tmp_path / 'tmp_file.csv'
    expected = textwrap.dedent('''\
        SMILES,col1,col2
        C,0.0,1
        H,2.5,1
        ''')

    molecules.to_csv(csv_path)

    assert csv_path.read_text() == expected
def descent_steps(self, smiles_docking_score_fn, size):
    """Collect per-molecule trajectories of docked SMILES along the latent update path.

    Repeatedly draws 32 distinct rows from ``self.train``, encodes them with
    ``self.cvae``, and applies ``self.descent_iterations`` updates of
    ``latents += self.mlp.gradient(latents) * self.descent_lr``. The latents
    at every step (including the starting point) are decoded to SMILES and
    canonicalized. For each sampled row, every valid SMILES along its path is
    docked and appended to a fresh ``OptimizedMolecules.Builder``. A
    trajectory is kept only if its builder ends up with more than 5 entries.

    Args:
        smiles_docking_score_fn: Callable invoked as
            ``fn(smi, n_cpu=self.docking_n_cpu)``. Its return value is
            unpacked with ``**`` into the builder, so it must be a mapping.
        size: Minimum number of trajectories to collect.

    Returns:
        A list of built trajectories. Each batch is processed to completion,
        so the list may contain more than ``size`` entries.
    """
    assert smiles_docking_score_fn is not None
    assert size is not None
    logger.info('Descent steps start')
    batch_size = 32
    min_valid_steps = 5
    results = []
    while len(results) < size:
        logger.info(f'Valid descent steps results: {len(results)} / {size}')
        one_hots = self.train[np.random.choice(self.train.shape[0], batch_size, replace=False), :]
        latents = self.cvae.encode(one_hots)
        # Snapshot the starting latents. ``latents`` is updated in place
        # below, so storing the object itself would make step 0 silently
        # track the final values instead of the initial ones.
        latent_changes = [np.copy(latents)]
        for _ in range(self.descent_iterations):
            latents += self.mlp.gradient(latents) * self.descent_lr
            latent_changes.append(np.copy(latents))
        decoded_series = [
            self.cvae.hot_to_smiles(self.cvae.decode(delta), strip=True, numpy=True)
            for delta in latent_changes
        ]
        smiles_changes = [
            [canonicalize(smi) for smi in smi_list]
            for smi_list in decoded_series
        ]
        # smiles_changes[j][i] is the SMILES of sample i after j updates.
        for i in range(batch_size):
            descent_results = OptimizedMolecules.Builder()
            for j in range(self.descent_iterations + 1):
                smi = smiles_changes[j][i]
                if smi is not None and is_valid(smi):
                    try:
                        docking_score = smiles_docking_score_fn(smi, n_cpu=self.docking_n_cpu)
                        descent_results.append(
                            smi,
                            latent_vector=latent_changes[j][i],
                            step=j,
                            **docking_score
                        )
                    except (ValueError, RuntimeError, TypeError):
                        logger.error('Docking failed for %s', smi)
            if descent_results.size > min_valid_steps:
                results.append(descent_results.build())
    return results
def test_to_csv_with_without_columns(tmp_path):
    """``to_csv(without_columns=['col2'])`` omits ``col2`` from the written file."""
    molecules_builder = OptimizedMolecules.Builder()
    molecules_builder.append('C', col1=0, col2=1)
    molecules_builder.append('H', col1=2.5, col2=1)
    molecules = molecules_builder.build()
    csv_path = tmp_path / 'tmp_file.csv'
    expected = textwrap.dedent('''\
        SMILES,col1
        C,0.0
        H,2.5
        ''')

    molecules.to_csv(csv_path, without_columns=['col2'])

    assert csv_path.read_text() == expected
def generate_optimized_molecules(self, number_molecules, smiles_docking_score_fn):
    """Sample latents, update them along the MLP gradient, and dock the decoded molecules.

    Each round draws ``self.batch_size`` latent vectors of width
    ``self.latent`` from a standard normal distribution and applies
    ``self.descent_iterations`` updates of
    ``latents += self.mlp.gradient(latents) * self.descent_lr``. The result
    is decoded to SMILES and canonicalized. A SMILES that is valid, passes
    ``lipinski_filter`` and is not yet in the builder is docked and stored
    together with its latent vector and predicted score.

    Args:
        number_molecules: Number of distinct molecules to collect.
        smiles_docking_score_fn: Callable invoked as
            ``fn(smi, output_path=..., n_cpu=self.docking_n_cpu)``. Its
            return value is unpacked with ``**`` into the builder, so it must
            be a mapping.

    Returns:
        The result of ``OptimizedMolecules.Builder.build()`` for the
        collected molecules.
    """
    results_builder = OptimizedMolecules.Builder()
    while results_builder.size < number_molecules:
        logger.info(f'Generated {results_builder.size} / {number_molecules}')
        latents = np.random.normal(size=(self.batch_size, self.latent))
        # Scores of the raw samples, kept only to log the improvement later.
        before = [self.mlp.latent_score(latents[i].reshape(1, -1)) for i in range(self.batch_size)]
        for _ in range(self.descent_iterations):
            latents += self.mlp.gradient(latents) * self.descent_lr
        try:
            smiles = [
                canonicalize(smi)
                for smi in self.cvae.hot_to_smiles(self.cvae.decode(latents), strip=True, numpy=True)
            ]
        except (RuntimeError, ValueError):
            logger.error('Decoding failed')
            continue
        for i, smi in enumerate(smiles):
            try:
                if smi is not None and is_valid(smi) and lipinski_filter(smi):
                    if smi not in results_builder:
                        latent_score = self.mlp.latent_score(latents[i].reshape(1, -1))
                        logger.info(f'Optimized from {before[i]} to {latent_score}')
                        output_path = os.path.join(
                            self.output_dir,
                            f'{results_builder.size}.mol2') if self.output_dir is not None else None
                        docking_score = smiles_docking_score_fn(
                            smi, output_path=output_path, n_cpu=self.docking_n_cpu)
                        results_builder.append(
                            smi,
                            latent_vector=latents[i],
                            predicted_score=latent_score,
                            **docking_score
                        )
                    else:
                        logger.info('Generated SMILES %s already present in OptimizedMoleculesBuilder', smi)
            except Exception:
                # Best-effort: one bad sample must not abort the run. Any
                # call in the try body can land here, so log the traceback
                # to show what actually failed. Lazy %s formatting also
                # avoids a TypeError from string concatenation in the handler.
                logger.exception('Docking failed for %s', smi)
            if results_builder.size >= number_molecules:
                logger.info('Generating finished')
                break
        results_builder.total_samples += self.batch_size
    return results_builder.build()
def random_gauss(self, smiles_docking_score_fn, size):
    """Dock molecules decoded from latents drawn straight from a standard normal.

    Each round draws ``self.batch_size`` latent vectors of width
    ``self.latent``, decodes them to SMILES and canonicalizes the result. A
    SMILES that is valid, passes ``lipinski_filter`` and is not yet in the
    builder is docked and stored together with its latent vector and the
    MLP's predicted score. No gradient updates are applied.

    Args:
        smiles_docking_score_fn: Callable invoked as
            ``fn(smi, output_path=..., n_cpu=self.docking_n_cpu)``. Its
            return value is unpacked with ``**`` into the builder, so it must
            be a mapping.
        size: Number of distinct molecules to collect.

    Returns:
        The result of ``OptimizedMolecules.Builder.build()`` for the
        collected molecules.
    """
    assert smiles_docking_score_fn is not None
    assert size is not None
    logger.info('Random gauss sampling start')
    results_builder = OptimizedMolecules.Builder()
    while results_builder.size < size:
        logger.info(f'Random sampled {results_builder.size} / {size}')
        latents = np.random.normal(size=(self.batch_size, self.latent))
        smiles = [
            canonicalize(smi)
            for smi in self.cvae.hot_to_smiles(self.cvae.decode(latents), strip=True, numpy=True)
        ]
        for i, smi in enumerate(smiles):
            try:
                if smi is not None and is_valid(smi) and lipinski_filter(smi):
                    if smi not in results_builder:
                        output_path = os.path.join(
                            self.output_dir,
                            f'{results_builder.size}.gauss.mol2') if self.output_dir is not None else None
                        docking_score = smiles_docking_score_fn(
                            smi, output_path=output_path, n_cpu=self.docking_n_cpu)
                        results_builder.append(
                            smi,
                            latent_vector=latents[i],
                            predicted_score=self.mlp.latent_score(latents[i].reshape(1, -1)),
                            **docking_score
                        )
                    else:
                        logger.info('Generated SMILES %s already present in OptimizedMoleculesBuilder', smi)
            except Exception:
                # Best-effort: one bad sample must not abort the run. Any
                # call in the try body can land here, so log the traceback
                # to show what actually failed. Lazy %s formatting also
                # avoids a TypeError from string concatenation in the handler.
                logger.exception('Docking failed for %s', smi)
            if results_builder.size >= size:
                logger.info('Random sampling finished')
                break
        results_builder.total_samples += self.batch_size
    logger.info('Random gauss sampling finished')
    return results_builder.build()
def test_builder_should_raise_when_smiles_is_not_str():
    """``Builder.append`` raises ``TypeError`` when given an int instead of a SMILES string."""
    molecules_builder = OptimizedMolecules.Builder()

    with pytest.raises(TypeError):
        molecules_builder.append(34)
def test_builder_should_raise_on_empty_smiles():
    """``Builder.append`` raises ``ValueError`` for an empty SMILES string."""
    molecules_builder = OptimizedMolecules.Builder()

    with pytest.raises(ValueError):
        molecules_builder.append('')
def test_builder_in_operator_returns_false_when_smiles_not_already_present():
    """The ``in`` operator on a fresh builder is falsy for any SMILES."""
    molecules_builder = OptimizedMolecules.Builder()

    is_present = 'C' in molecules_builder

    assert not is_present
def test_to_csv_call_with_without_columns_and_columns_raises(tmp_path):
    """``to_csv`` on an empty build raises ``ValueError`` when both column filters are passed."""
    # NOTE(review): an earlier function in this file has this exact name.
    # If both live in the same module, this definition replaces the earlier
    # one and pytest only collects this variant -- confirm and rename.
    empty_molecules = OptimizedMolecules.Builder().build()

    with pytest.raises(ValueError):
        empty_molecules.to_csv('dummy_path', columns=[], without_columns=[])