def normalize(smiles):
    """Return the SMILES of the standardized form of ``smiles``.

    The parsed molecule goes, in order, through uncharging,
    largest-fragment selection, sanitization, normalization and tautomer
    canonicalization before being written back out as SMILES.

    :param smiles: SMILES string handed to ``Chem.MolFromSmiles``
    :return: SMILES produced by ``Chem.MolToSmiles`` for the final molecule
    """
    parsed = Chem.MolFromSmiles(smiles)

    # Neutralize first, then keep the fragment picked by the chooser.
    neutral = Uncharger().uncharge(parsed)
    fragment = LargestFragmentChooser()(neutral)

    # The return value is not needed; the molecule itself is sanitized.
    Chem.SanitizeMol(fragment)

    normalized = Normalizer().normalize(fragment)
    canonical = TautomerCanonicalizer().canonicalize(normalized)
    return Chem.MolToSmiles(canonical)
def canonicalize(mols, ntauts):
    """Add the canonical tautomer of every molecule to ``mols`` in place.

    :param mols: mapping whose values are dictionaries holding a molecule
        under the ``"mol"`` key; each of them gains a ``"mol_can"`` key
    :type mols: dict
    :param ntauts: value passed as ``max_tautomers`` to
        ``TautomerCanonicalizer``
    :type ntauts: int
    """
    # The canonicalizer depends only on ntauts, so it is built once instead
    # of once per molecule.
    canonicalizer = TautomerCanonicalizer(max_tautomers=ntauts)
    for n in mols:
        entry = mols[n]
        mol = entry["mol"]
        mol_can = canonicalizer.canonicalize(mol)
        # When canonicalization does not change the SMILES, store the
        # original molecule object rather than the new one.
        if Chem.MolToSmiles(mol) == Chem.MolToSmiles(mol_can):
            entry["mol_can"] = mol
        else:
            entry["mol_can"] = mol_can
def FullStandardization(smi):
    """Standardize a SMILES string and return the resulting SMILES.

    Pipeline: parse, ``Standardizer``, ``LargestFragmentChooser``, element
    whitelist check, ``Uncharger``, ``Reionizer``, stereochemistry removal
    and ``TautomerCanonicalizer``.

    :param smi: SMILES string to standardize
    :return: the standardized SMILES on success, otherwise one of the
        status strings
        ``"Error 1"`` (the SMILES could not be parsed),
        ``"Error 2"`` (the largest fragment contains an element outside
        the allowed set) or
        ``"Check manually"`` (any step raised an exception)
    """
    allowed_elements = {
        "H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "Se", "Br", "I"
    }
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # If rdkit could not parse the smiles, returns Error 1
            return "Error 1"

        mol = Standardizer()(mol)
        mol = LargestFragmentChooser()(mol)

        actual_elements = {atom.GetSymbol() for atom in mol.GetAtoms()}
        if not actual_elements <= allowed_elements:
            # If molecule contains other than the allowed elements,
            # returns "Error 2"
            return "Error 2"

        mol = Uncharger()(mol)
        mol = Reionizer()(mol)
        RemoveStereochemistry(mol)  # return value unused; mol is reused
        mol = TautomerCanonicalizer()(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        # Deliberate catch-all so one bad record yields a status string,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return "Check manually"
def query_canonicalize(mol, ntauts):
    """Return the canonical tautomer of the charge parent of ``mol``.

    :param mol: molecule to process
    :param ntauts: value passed as ``max_tautomers`` to
        ``TautomerCanonicalizer``
    :return: the canonicalized molecule, or ``mol`` itself (unchanged) if
        any step raises
    """
    try:
        # One Standardizer serves both parent calculations.
        standardizer = Standardizer()
        mol_sta = standardizer.charge_parent(
            standardizer.fragment_parent(mol), skip_standardize=True)
        return TautomerCanonicalizer(
            max_tautomers=ntauts).canonicalize(mol_sta)
    except Exception:
        # Best effort: fall back to the input molecule. Narrowed from a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        return mol
def normalize(smiles):
    """Return the SMILES of a best-effort standardized form of ``smiles``.

    Uncharging, largest-fragment selection, sanitization and normalization
    are each attempted in turn; when one of them raises, that step is
    skipped and the molecule from the previous step is carried forward.
    Tautomer canonicalization and SMILES generation are not guarded, so
    their exceptions propagate to the caller.

    :param smiles: SMILES string handed to ``Chem.MolFromSmiles``
    :return: SMILES produced by ``Chem.MolToSmiles`` for the final molecule
    """
    from copy import deepcopy

    # Generate Mol
    mol = Chem.MolFromSmiles(smiles)

    # Uncharge. The result is now actually used: previously a stray
    # ``mol2 = mol`` after this step put the original molecule back.
    uncharger = Uncharger()
    try:
        uncharged = uncharger.uncharge(mol)
    except Exception:
        uncharged = mol

    # LargestFragmentChooser
    fragment_chooser = LargestFragmentChooser()
    try:
        fragment = fragment_chooser(uncharged)
    except Exception:
        fragment = uncharged

    # Sanitize. A copy taken beforehand is restored if sanitization raises.
    backup = deepcopy(fragment)
    try:
        Chem.SanitizeMol(fragment, catchErrors=False)
    except Exception:
        fragment = backup

    # Normalize
    normalizer = Normalizer()
    try:
        normalized = normalizer.normalize(fragment)
    except Exception:
        normalized = fragment

    canonical = TautomerCanonicalizer().canonicalize(normalized)
    return Chem.MolToSmiles(canonical)
class TautomerCheck(CurationStep):
    """Curation step that standardizes tautomers."""

    def __init__(self, **kwargs):
        """Create the step.

        :param kwargs: forwarded unchanged to ``TautomerCanonicalizer``
        """
        self.tautomerizer = TautomerCanonicalizer(**kwargs)

    def filterFunction(self, cmp):
        """Return the canonical tautomer of the compound ``cmp``."""
        return self.tautomerizer.canonicalize(cmp)

    def runStep(self, df, cmp_index):
        """Canonicalize every compound in one column of ``df`` in place.

        :param df: data frame holding the compounds
        :param cmp_index: positional index of the compound column
        :return: the same ``df`` object, with the column overwritten
        """
        # Read and write through the same positional indexer. The column
        # used to be read with df[cmp_index] (label lookup) and written with
        # df.iloc (positional), which only agree when the column labels are
        # 0..n-1.
        compounds = df.iloc[:, cmp_index]
        df.iloc[:, cmp_index] = [self.filterFunction(mol) for mol in compounds]
        return df
def do_standard_mp(mol: Chem.rdchem.Mol, n: int = 0, ntauts: int = 100) -> tuple:
    """
    Function to standardize and canonicalize an RDKit Mol.

    :param mol: molecule to be standardized
    :type mol: rdkit.Chem.rdchem.Mol
    :param n: molecule number, default = 0
    :type n: int
    :param ntauts: maximum number of tautomers to be enumerated in canonicalization step, default = 100
    :type ntauts: int
    :return: tuple containing the molecule number, the standardized molecule and the canonical tautomer
        (n, Mol, Mol); if any step raises, the input molecule is returned in both molecule slots
    :rtype: tuple
    """
    try:
        # One Standardizer serves both parent calculations.
        standardizer = Standardizer()
        mol_sta = standardizer.charge_parent(standardizer.fragment_parent(mol), skip_standardize=True)
        mol_can = TautomerCanonicalizer(max_tautomers=ntauts).canonicalize(mol_sta)
        # Same SMILES means canonicalization changed nothing: reuse the standardized object.
        if Chem.MolToSmiles(mol_sta) == Chem.MolToSmiles(mol_can):
            mol_can = mol_sta
    except Exception:
        # Best effort fallback; narrowed from a bare except so that
        # KeyboardInterrupt/SystemExit still reach the caller.
        mol_sta = mol
        mol_can = mol
    return (n, mol_sta, mol_can)
def do_standard(mols, ntauts=100):
    """
    Adds a standardized molecule and canonicalized tautomer to the database

    For every entry the ``"mol"`` value is replaced by its standardized form and a ``"mol_can"`` key is added.
    If any step raises for an entry, ``"mol_can"`` is set to whatever ``"mol"`` currently holds for that entry.

    :param mols: dictionary containing rdkit.Chem.rdchem.Mol objects as follows:
        {0: {"mol": rdkit.Chem.rdchem.Mol}, 1: {"mol": rdkit.Chem.rdchem.Mol} ...}
    :type mols: dict
    :param ntauts: maximum number of tautomers to be enumerated in canonicalization step, default = 100
    :type ntauts: int
    """
    for n in mols:
        entry = mols[n]
        try:
            # One Standardizer serves both parent calculations.
            standardizer = Standardizer()
            mol_sta = standardizer.charge_parent(standardizer.fragment_parent(entry["mol"]),
                                                 skip_standardize=True)
            mol_can = TautomerCanonicalizer(max_tautomers=ntauts).canonicalize(mol_sta)
            entry["mol"] = mol_sta
            # Same SMILES means canonicalization changed nothing: reuse the standardized object.
            if Chem.MolToSmiles(mol_sta) == Chem.MolToSmiles(mol_can):
                entry["mol_can"] = mol_sta
            else:
                entry["mol_can"] = mol_can
        except Exception:
            # Best effort fallback; narrowed from a bare except so that
            # KeyboardInterrupt/SystemExit still reach the caller.
            entry["mol_can"] = entry["mol"]
def canonicalize_mp(mol, n, ntauts):
    """Return ``(n, canonical tautomer of mol)``.

    :param mol: molecule to canonicalize
    :param n: molecule number, passed through unchanged
    :param ntauts: value passed as ``max_tautomers`` to
        ``TautomerCanonicalizer``
    :return: tuple ``(n, Mol)``; the second item is ``mol`` itself when the
        canonical tautomer has the same SMILES as ``mol``
    """
    canonicalizer = TautomerCanonicalizer(max_tautomers=ntauts)
    candidate = canonicalizer.canonicalize(mol)
    unchanged = Chem.MolToSmiles(mol) == Chem.MolToSmiles(candidate)
    return (n, mol if unchanged else candidate)
def standardize_mols(jobs, mol_counter, num_mols, results, start_time, vendors, max_stereo_isomers, failures,
                     tautomer, verbose):
    """
    This function passes molecules to the standardization functions.

    Jobs are popped from the front of `jobs` until none are left. Each molecule of a job is standardized, reduced
    to its largest fragment, optionally tautomer-canonicalized and protonated. Molecules with an exact molecular
    weight below 1200 are stored as rows of the form [smiles, '', ..., identifier, ..., ''], with the identifier
    placed in the column of the job's vendor; heavier molecules are skipped without a failure record.

    Parameters
    ----------
    jobs: multiprocessing.manager.list
        A list containing job information as dictionaries. The keys read here are 'vendor', 'sdf_path',
        'mol_start', 'mol_end' (inclusive) and 'identifier_field'.

    mol_counter: multiprocessing.manager.value
        A counter keeping track of processed molecules.

    num_mols: int
        Total number of molecules to be processed.

    results: multiprocessing.manager.list
        A list containing lists describing the processed molecules.

    start_time: float
        Starting time of molecule processing.

    vendors: list
        List of vendors.

    max_stereo_isomers: int
        Maximal number of stereo isomers to generate per molecule. If not greater than 0, no stereo isomers are
        enumerated and the molecule itself is stored.

    failures: list
        Receives one 'standardize_error <smiles> <vendor> <identifier>' string per molecule that failed.

    tautomer: bool
        If tautomers should be canonicalized.

    verbose : bool
        If RDKit warning should be displayed.
    """
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    processed_mols = []
    job = 'initiate'
    while job is not None:
        # The IndexError raised by pop() on an empty job list ends the loop. NOTE: the handler spans the whole job
        # body, so an IndexError raised while processing a job ends this loop as well.
        try:
            job = jobs.pop(0)
            vendor_position = vendors.index(job['vendor'])
            supplier = Chem.SDMolSupplier(job['sdf_path'])
            for mol_id in range(job['mol_start'], job['mol_end'] + 1):
                mol = supplier[mol_id]
                if job['identifier_field'] == 'None':
                    identifier = 'unknown'
                else:
                    try:
                        identifier = mol.GetProp(job['identifier_field'])
                    except (AttributeError, KeyError):
                        # AttributeError: the supplier returned None for this record.
                        # KeyError: the record lacks the identifier property.
                        identifier = 'unknown'
                # placeholder for the failure record in case SMILES generation itself fails
                smiles = 'unknown'
                try:
                    # generate smiles for error catching
                    smiles = Chem.MolToSmiles(mol)
                    # default standardization from molvs
                    mol = Standardizer().standardize(mol)
                    # choose largest fragment
                    mol = LargestFragmentChooser().choose(mol)
                    # canonicalize tautomer
                    if tautomer:
                        mol = TautomerCanonicalizer().canonicalize(mol)
                    # protonate mol
                    mol = protonate_mol(mol)
                    # molecular weight will not change anymore
                    if ExactMolWt(mol) < 1200:
                        # enumerate stereo isomers (if requested) and append mols
                        if max_stereo_isomers > 0:
                            products = enumerate_stereo_isomers(mol, max_stereo_isomers)
                        else:
                            products = [mol]
                        for product in products:
                            mol_as_list = [Chem.MolToSmiles(product)] + [''] * len(vendors)
                            mol_as_list[1 + vendor_position] = identifier
                            processed_mols.append(mol_as_list)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt/SystemExit can still stop the worker.
                    failures.append(' '.join(['standardize_error', smiles, job['vendor'], identifier]))
                with mol_counter.get_lock():
                    mol_counter.value += 1
                    # snapshot under the lock so the progress report uses one consistent value
                    processed = mol_counter.value
                update_progress(processed / num_mols, 'Progress of standardization',
                                ((time.time() - start_time) / processed) * (num_mols - processed))
        except IndexError:
            job = None
    results += processed_mols
    return
def get_tautomer(mol):
    """Return the result of calling a default ``TautomerCanonicalizer`` on ``mol``."""
    return TautomerCanonicalizer()(mol)
def __init__(self, **kwargs):
    """Store a ``TautomerCanonicalizer`` built from ``kwargs`` on the instance.

    :param kwargs: forwarded unchanged to ``TautomerCanonicalizer``
    """
    canonicalizer = TautomerCanonicalizer(**kwargs)
    self.tautomerizer = canonicalizer