def _to_sliced_mol(row):
    """
    Builds a SlicedMol from one tab-separated text row.
    :param row: String with exactly three tab-separated fields: the scaffold SMILES, the decoration
                SMILES joined by ";", and a third field that is ignored.
    :return : A usc.SlicedMol built from the scaffold molecule and a dict that maps the position of
              each decoration in the row to its molecule.
    """
    scaffold_field, decorations_field, _ = row.split("\t")
    scaffold_mol = uc.to_mol(scaffold_field)

    # decorations are keyed by the order in which they appear in the row
    decoration_mols = {}
    for position, dec_smi in enumerate(decorations_field.split(";")):
        decoration_mols[position] = uc.to_mol(dec_smi)

    return usc.SlicedMol(scaffold_mol, decoration_mols)
def add_attachment_point_numbers(mol_or_smi, canonicalize=True):
    """
    Adds consecutive numbers (starting at 0) to the attachment points of a molecule.
    :param mol_or_smi: Either a SMILES string or a molecule object. Anything that is not a str is
                       treated as a molecule.
    :param canonicalize: Round-trip the input through to_smiles/uc.to_mol before numbering so that
                         the attachment points are always in the same order.
    :return : A converted SMILES string.
    """
    if not isinstance(mol_or_smi, str):
        # Molecule input: label the attachment atoms in atom iteration order.
        # Without canonicalization the labels are set on the molecule that was passed in.
        mol = uc.to_mol(to_smiles(mol_or_smi)) if canonicalize else mol_or_smi
        next_label = 0
        for atom in mol.GetAtoms():
            if atom.GetSymbol() != ATTACHMENT_POINT_TOKEN:
                continue
            atom.SetProp("molAtomMapNumber", str(next_label))
            next_label += 1
        return to_smiles(mol)

    # String input: only add numbers ordered by the SMILES ordering.
    smiles = to_smiles(uc.to_mol(mol_or_smi)) if canonicalize else mol_or_smi
    last_label = -1

    def _number_attachment_point(_match):
        nonlocal last_label
        last_label += 1
        return f"[{ATTACHMENT_POINT_TOKEN}:{last_label}]"

    return re.sub(ATTACHMENT_POINT_REGEXP, _number_attachment_point, smiles)
def join(scaffold_smi, decoration_smi, keep_label_on_atoms=False):
    """
    Joins a SMILES scaffold with a decoration. They must be labelled.
    :param scaffold_smi: SMILES of the scaffold.
    :param decoration_smi: SMILES of the decoration. It must contain exactly one attachment point.
    :param keep_label_on_atoms: Add the labels to the atoms after attaching the molecule.
                                This is useful when debugging, but it can give problems.
    :return: A Mol object of the joined scaffold, or None when either SMILES cannot be converted,
             the attachment points do not match as expected, or sanitization fails.
    """
    scaffold = uc.to_mol(scaffold_smi)
    decoration = uc.to_mol(decoration_smi)
    if not (scaffold and decoration):
        return None

    # obtain the label of the (single) attachment point in the decoration
    try:
        decoration_labels = [atom.GetProp("molAtomMapNumber") for atom in decoration.GetAtoms()
                             if atom.GetSymbol() == ATTACHMENT_POINT_TOKEN]
    except KeyError:
        return None  # an attachment point without a label
    if len(decoration_labels) != 1:
        return None  # more than one attachment point...
    label = decoration_labels[0]

    combined = rkc.RWMol(rkc.CombineMols(decoration, scaffold))

    def _is_labelled_attachment(atom):
        return (atom.GetSymbol() == ATTACHMENT_POINT_TOKEN
                and atom.HasProp("molAtomMapNumber")
                and atom.GetProp("molAtomMapNumber") == label)

    # exactly two atoms must carry the label: one from the decoration, one from the scaffold
    matching = [atom for atom in combined.GetAtoms() if _is_labelled_attachment(atom)]
    if len(matching) != 2:
        return None  # something weird

    neighbors = []
    for atom in matching:
        if atom.GetDegree() != 1:
            return None  # the attachment is wrongly generated
        neighbors.append(atom.GetNeighbors()[0])

    # the new bond is double as soon as one of the two attachment bonds is double
    attachment_bonds = [atom.GetBonds()[0] for atom in matching]
    if any(bond.GetBondType() == rkc.BondType.DOUBLE for bond in attachment_bonds):
        bond_type = rkc.BondType.DOUBLE
    else:
        bond_type = rkc.BondType.SINGLE

    combined.AddBond(neighbors[0].GetIdx(), neighbors[1].GetIdx(), bond_type)
    # indices are queried right before each removal, in the same order as before
    combined.RemoveAtom(matching[0].GetIdx())
    combined.RemoveAtom(matching[1].GetIdx())

    if keep_label_on_atoms:
        for neighbor in neighbors:
            _add_attachment_point_num(neighbor, label)

    joined = combined.GetMol()
    try:
        rkc.SanitizeMol(joined)
    except ValueError:  # sanitization error
        return None
    return joined
def _generate_randomized_not_repeated(
        smi, num_rand=self.num_randomized_smiles, max_rand=self.max_randomized_smiles_sample):
    """
    Collects distinct SMILES of one molecule generated with variant="random".
    :param smi: SMILES string, converted with uc.to_mol.
    :param num_rand: Number of distinct SMILES at which the sampling stops early.
    :param max_rand: Maximum number of SMILES that are generated.
    :return : A list with the distinct SMILES found. It is built from a set, so the order is
              not defined and it may hold fewer than num_rand entries.
    """
    mol = uc.to_mol(smi)
    distinct_smiles = set()
    for _attempt in range(max_rand):
        candidate = usc.to_smiles(mol, variant="random")
        distinct_smiles.add(candidate)
        if len(distinct_smiles) == num_rand:
            break
    return list(distinct_smiles)
def join_joined_attachments(scaffold_smi, decorations_smi):
    """
    Joins a scaffold with all of its decorations, given as a single string.
    :param scaffold_smi: SMILES of the scaffold. Its attachment points are numbered with
                         add_attachment_point_numbers before joining.
    :param decorations_smi: Decoration SMILES separated by ATTACHMENT_SEPARATOR_TOKEN. The i-th
                            decoration gets the number i on its first attachment point.
    :return : The joined molecule, or None when the number of decorations differs from the number
              of attachment points in the scaffold or when any single join fails.
    """
    numbered_decorations = []
    for position, dec in enumerate(decorations_smi.split(ATTACHMENT_SEPARATOR_TOKEN)):
        numbered_decorations.append(add_first_attachment_point_number(dec, position))

    numbered_scaffold = add_attachment_point_numbers(scaffold_smi)
    attachment_points = get_attachment_points(numbered_scaffold)
    if len(attachment_points) != len(numbered_decorations):
        return None

    # attach the decorations one by one, feeding the result of each join into the next
    joined = uc.to_mol(numbered_scaffold)
    for decoration in numbered_decorations:
        joined = join(to_smiles(joined), decoration)
        if not joined:
            return None
    return joined
def enumerate(row: ps.Row, enumerator: FragmentReactionSliceEnumerator, max_cuts: int) -> List[ps.Row]:
    """
    Slices the molecule of one input row and returns one output row per slice.
    :param row: Tab-separated text whose first field is the SMILES of the molecule.
    :param enumerator: Object whose enumerate(mol, cuts=...) yields the sliced molecules.
    :param max_cuts: Value passed as cuts to the enumerator and stored in every output row.
    :return : A list of ps.Row with the scaffold (attachment point numbers removed), the
              decorations, the original SMILES and max_cuts. Empty when the SMILES cannot be
              converted to a molecule.
    """
    # NOTE(review): this name shadows the built-in enumerate wherever the definition is in scope.
    # NOTE(review): row is annotated as ps.Row but is used as a string (row.split) - confirm.
    attachments = AttachmentPoints()
    smiles = row.split("\t")[0]
    mol = uc.to_mol(smiles)
    if not mol:
        return []

    return [
        ps.Row(**{
            DataframeColumnsEnum.SCAFFOLDS:
                attachments.remove_attachment_point_numbers(sliced_mol.scaffold_smiles),
            DataframeColumnsEnum.DECORATIONS: sliced_mol.decorations_smiles,
            DataframeColumnsEnum.ORIGINAL: sliced_mol.original_smiles,
            DataframeColumnsEnum.MAX_CUTS: max_cuts,
        })
        for sliced_mol in enumerator.enumerate(mol, cuts=max_cuts)
    ]
def collect_failures(self, row: ps.Row, enumerator: FailingReactionsEnumerator) -> List[ps.Row]:
    """
    Collects the failed reactions reported by the enumerator for the molecule of one input row.
    :param row: Tab-separated text whose first field is the SMILES of the molecule.
    :param enumerator: Object whose enumerate(mol, failures_limit=...) yields the failed reactions.
    :return : A list of ps.Row holding the reaction SMIRKS and the molecule SMILES of each failure.
              Collection stops once self.configuration.failures_limit rows were gathered. The list
              is empty when the SMILES cannot be converted to a molecule.
    """
    # NOTE(review): row is annotated as ps.Row but is used as a string (row.split) - confirm.
    smiles = row.split("\t")[0]
    mol = uc.to_mol(smiles)
    failures = []
    if not mol:
        return failures

    failed_reactions = enumerator.enumerate(mol, failures_limit=self.configuration.failures_limit)
    for failed_reaction in failed_reactions:
        columns = {
            self._columns.REACTION: failed_reaction.reaction_smirks,
            self._columns.ORIGINAL: failed_reaction.molecule_smiles,
        }
        print("found failed reaction")
        failures.append(ps.Row(**columns))
        # stop as soon as the configured number of failures was collected
        if len(failures) >= self.configuration.failures_limit:
            break
    return failures
def _enumerate(row, max_cuts=self.max_cuts, enumerator=self.enumerator):
    """
    Slices the molecule of one input row with 1..max_cuts cuts and returns one row per slice.
    :param row: Tab-separated text whose first field is the SMILES of the molecule.
    :param max_cuts: Highest number of cuts to try; every value from 1 to max_cuts is enumerated.
    :param enumerator: Object whose enumerate(mol, cuts=...) yields the sliced molecules.
    :return : A list of ps.Row with the fields scaffold, decorations (a list ordered by
              decoration key), smiles (the SMILES of the whole molecule) and cuts. Empty when
              the SMILES cannot be converted to a molecule.
    """
    smiles = row.split("\t")[0]
    mol = uc.to_mol(smiles)
    if not mol:
        return []

    # The SMILES of the whole molecule is the same for every slice, so it is computed once here
    # instead of once per generated row.
    mol_smi = uc.to_smiles(mol)

    out_rows = []
    for cuts in range(1, max_cuts + 1):
        for sliced_mol in enumerator.enumerate(mol, cuts=cuts):
            # normalize scaffold and decorations
            scaff_smi, dec_smis = sliced_mol.to_smiles()
            ordered_dec_smis = [smi for _, smi in sorted(dec_smis.items())]
            out_rows.append(
                ps.Row(scaffold=scaff_smi, decorations=ordered_dec_smis, smiles=mol_smi, cuts=cuts))
    return out_rows
def __init__(self, model_path, log_path, validation_set_path, training_set_path, epoch,
             sample_size=10000, summary_writer=None, with_weights=False, smiles_type="smiles"):
    """
    Creates a CollectStatsFromModelRunner.
    :param model_path: The input model path. The model is loaded with sampling_mode=True.
    :param log_path: Directory used as log_dir when a new summary writer has to be created.
    :param validation_set_path: Path of the validation set (stored as is).
    :param training_set_path: Path of the training set (stored as is).
    :param epoch: Epoch value (stored as is).
    :param sample_size: Sample size; values below 1 are raised to 1.
    :param summary_writer: Existing summary writer to reuse. When it is falsy a new
                           tbx.SummaryWriter is created.
    :param with_weights: Flag stored as is.
    :param smiles_type: When it starts with "deepsmiles" it must have the form
                        "deepsmiles.<converter>" and molecules are read through
                        uc.from_deepsmiles with that converter; otherwise uc.to_mol is used.
    :return:
    """
    self._log_path = log_path
    self._training_set_path = training_set_path
    self._validation_set_path = validation_set_path
    self._with_weights = with_weights
    self._model = mm.Model.load_from_file(model_path, sampling_mode=True)
    self._epoch = epoch
    self._sample_size = max(sample_size, 1)

    # optionally reuse summary writer to prevent device errors
    self.summary_writer = summary_writer if summary_writer else tbx.SummaryWriter(log_dir=self._log_path)

    self.data = {}

    if not smiles_type.startswith("deepsmiles"):
        self._to_mol_func = uc.to_mol
    else:
        # the part after the dot selects the deepsmiles converter
        _, deepsmiles_type = smiles_type.split(".")

        def _deepsmiles_to_mol(deepsmi):
            return uc.to_mol(uc.from_deepsmiles(deepsmi, converter=deepsmiles_type))

        self._to_mol_func = _deepsmiles_to_mol
def _generate_randomized_repeated(smi, num_rand=self.num_randomized_smiles):
    """
    Generates a fixed number of SMILES of one molecule with variant="random".
    :param smi: SMILES string, converted with uc.to_mol.
    :param num_rand: Number of SMILES to generate.
    :return : A list with num_rand SMILES; duplicates are not filtered out.
    """
    mol = uc.to_mol(smi)
    randomized_smiles = []
    for _draw in range(num_rand):
        randomized_smiles.append(usc.to_smiles(mol, variant="random"))
    return randomized_smiles
def _format_attachment_point(smi, num):
    """
    Numbers the first attachment point of a SMILES and re-generates the SMILES from the molecule.
    :param smi: SMILES string to label.
    :param num: Number passed to usc.add_first_attachment_point_number.
    :return : The SMILES obtained after converting the labelled SMILES to a molecule and back.
    """
    labelled_smi = usc.add_first_attachment_point_number(smi, num)
    labelled_mol = uc.to_mol(labelled_smi)
    return usc.to_smiles(labelled_mol)  # canonicalize
def _cleanup_decoration(dec_smi):
    """
    Removes the attachment point numbers from a decoration SMILES.
    :param dec_smi: SMILES of the decoration.
    :return : The SMILES of the decoration after usc.remove_attachment_point_numbers was applied
              to its molecule, or None when the SMILES cannot be converted to a molecule.
    """
    dec_mol = uc.to_mol(dec_smi)
    if dec_mol:
        unnumbered_mol = usc.remove_attachment_point_numbers(dec_mol)
        return usc.to_smiles(unnumbered_mol)
    return None