def test_isomeric_canonicalisation():
    """Check canonicalize() output with and without stereocenters."""
    # Input SMILES carrying both chiral-centre and double-bond stereo marks.
    smiles = r'OC(=O)[C@H]5C2\C=C/C3[C@@H]5CC4[C@H](C\C=C\C=C\c1ccccc1)[C@@H]2[C@@H]34'

    # Run both canonicalizations before asserting anything.
    isomeric = canonicalize(smiles, include_stereocenters=True)
    non_isomeric = canonicalize(smiles, include_stereocenters=False)

    assert isomeric == 'O=C(O)[C@H]1C2C=CC3[C@@H]1CC1[C@H](C/C=C/C=C/c4ccccc4)[C@@H]2[C@@H]31'
    assert non_isomeric == 'O=C(O)C1C2C=CC3C1CC1C(CC=CC=Cc4ccccc4)C2C31'
def process_smis(smis, scoring_function, pool, canonicalization,
                 duplicate_removal, scoring_parallelization, max_smi_len=100):
    """Optionally canonicalize, de-duplicate and score a batch of SMILES.

    Args:
        smis: iterable of SMILES strings.
        scoring_function: callable applied to each SMILES, or None to skip
            scoring.
        pool: callable that consumes an iterable of ``delayed`` tasks and
            returns their results.
        canonicalization: if truthy, canonicalize every SMILES without
            stereocenters, then drop None results and strings whose length
            is not below ``max_smi_len``.
        duplicate_removal: if truthy, collapse duplicates through a set
            (the original order is not kept).
        scoring_parallelization: if truthy, score through ``pool``;
            otherwise score sequentially in this process.
        max_smi_len: exclusive length bound, applied only when
            ``canonicalization`` is enabled.

    Returns:
        The processed SMILES list when ``scoring_function`` is None,
        otherwise the ``(smis, scores)`` pair produced by
        ``filter_by_score(smis, scores, -1e-8)``.
    """
    if canonicalization:
        def _strip_stereo(smi):
            return canonicalize(smi, include_stereocenters=False)

        canonical = pool(delayed(_strip_stereo)(smi) for smi in smis)
        smis = [
            smi for smi in canonical
            if smi is not None and len(smi) < max_smi_len
        ]

    if duplicate_removal:
        smis = list(set(smis))

    # Without a scoring function only the SMILES are returned.
    if scoring_function is None:
        return smis

    if scoring_parallelization:
        scores = pool(delayed(scoring_function)(smi) for smi in smis)
    else:
        scores = [scoring_function(smi) for smi in smis]

    kept_smis, kept_scores = filter_by_score(smis, scores, -1e-8)
    return kept_smis, kept_scores
def vec2smiles(self, vec, rem_bos, rem_eos):
    """Decode a vector into a canonical SMILES string.

    Args:
        vec: vector forwarded to ``self.vec2string``.
        rem_bos: forwarded unchanged to ``self.vec2string``.
        rem_eos: forwarded unchanged to ``self.vec2string``.

    Returns:
        The canonical SMILES, or None when decoding fails, canonicalization
        fails or yields an empty string, or the result is longer than
        ``self.max_smiles_length``.
    """
    decoded = self.vec2string(vec, rem_bos, rem_eos)
    if decoded is None:
        return None

    smiles = canonicalize(decoded)
    # Accept only non-empty results within the configured length limit.
    if smiles is None or not 0 < len(smiles) <= self.max_smiles_length:
        return None
    return smiles
def test_list_canonicalization_removes_none():
    """canonicalize_list should drop entries that cannot be canonicalized."""
    invalid = 'this.is.not.a.molecule'
    valid = ['CCC(OCOCO)CC(=O)NCC', 'c1ccccc1', 'CC(OCON=N)CC']

    # The invalid entry sits between valid ones, in second position.
    molecules = [valid[0], invalid, valid[1], valid[2]]

    result = canonicalize_list(molecules)

    # Expected output: each valid molecule canonicalized on its own, in order.
    expected = [canonicalize(smiles) for smiles in valid]
    assert result == expected
def canonicalize_and_score_smiles(self, smis, scoring_function, pool):
    """Canonicalize SMILES, score them and keep those above the corrupt score.

    Args:
        smis: iterable of SMILES strings.
        scoring_function: object exposing ``score(smi)`` and
            ``scoring_function.corrupt_score``.
        pool: callable that consumes an iterable of ``delayed`` tasks and
            returns their results.

    Returns:
        A ``(smis, scores)`` pair of lists holding the entries whose score is
        strictly greater than the corrupt score; two empty lists when
        nothing is kept.
    """
    def _strip_stereo(smi):
        return canonicalize(smi, include_stereocenters=False)

    canonical = pool(delayed(_strip_stereo)(smi) for smi in smis)

    # Drop failed canonicalizations and strings rejected by the char dict.
    candidates = [
        smi for smi in canonical
        if smi is not None and self.char_dict.allowed(smi)
    ]
    scores = pool(delayed(scoring_function.score)(smi) for smi in candidates)

    kept = [
        (smi, score)
        for smi, score in zip(candidates, scores)
        if score > scoring_function.scoring_function.corrupt_score
    ]
    if not kept:
        return [], []

    kept_smis, kept_scores = map(list, zip(*kept))
    return kept_smis, kept_scores
def sample_unique_molecules(model: DistributionMatchingGenerator, number_molecules: int, max_tries=10) -> List[str]:
    """
    Draw samples from the generator until enough distinct molecules are found.

    Duplicates (compared in canonical form) and samples that cannot be
    canonicalized are ignored.

    Args:
        model: model to sample from
        number_molecules: number of unique (distinct) molecules to generate
        max_tries: bounds the number N of samples requested,
            N = number_molecules * max_tries

    Returns:
        A list of unique molecules in canonicalized form, in generation
        order. If the request budget runs out first, the list may be shorter
        than number_molecules.
    """
    sample_budget = max_tries * number_molecules
    requested_so_far = 0

    ordered: List[str] = []
    seen: Set[str] = set()

    while len(ordered) < number_molecules and requested_so_far < sample_budget:
        missing = number_molecules - len(ordered)

        batch = model.generate(missing)
        # The budget counts requested samples, not the number returned.
        requested_so_far += missing

        for raw_smiles in batch:
            canonical = canonicalize(raw_smiles)
            if canonical is None or canonical in seen:
                continue
            seen.add(canonical)
            ordered.append(canonical)

    # Invariant: the set and the list always hold the same molecules.
    assert len(seen) == len(ordered)

    return ordered
def mutate(p_gene, scoring_function):
    """Build a scored child molecule by mutating a parent gene.

    Args:
        p_gene: parent gene passed to ``mutation``.
        scoring_function: object exposing ``score(smiles)``.

    Returns:
        ``Molecule(score, smiles, gene)`` for the mutated child.
    """
    child_gene = mutation(p_gene)

    # gene -> CFG -> decoded string -> canonical SMILES
    child_smiles = canonicalize(cfg_util.decode(gene_to_cfg(child_gene)))

    child_score = scoring_function.score(child_smiles)
    return Molecule(child_score, child_smiles, child_gene)
def _canonicalize(self): if self._canonical_smiles is not None: return canonical = [canonicalize(mol) for mol in self._smiles] self._canonical_smiles = [s for s in canonical if s is not None]