def test6Charge(self):
    """Check Reionizer and Uncharger results against expected SMILES."""
    charged = Chem.MolFromSmiles("C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O")

    # Reionizer built with the default acid/base pair library.
    default_reionizer = rdMolStandardize.Reionizer()
    reionized = default_reionizer.reionize(charged)
    self.assertEqual(Chem.MolToSmiles(reionized),
                     "O=S(O)c1ccc(S(=O)(=O)[O-])cc1")

    # Reionizer built from an alternative acid/base pair file that lacks
    # the right pairs, so the expected charge placement differs.
    pairs_path = os.path.join(RDConfig.RDDataDir, 'MolStandardize',
                              'acid_base_pairs2.txt')
    custom_reionizer = rdMolStandardize.Reionizer(pairs_path)
    reionized_custom = custom_reionizer.reionize(charged)
    self.assertEqual(Chem.MolToSmiles(reionized_custom),
                     "O=S([O-])c1ccc(S(=O)(=O)O)cc1")

    # Plain Uncharger on a simple carboxylate.
    benzoate = Chem.MolFromSmiles("O=C([O-])c1ccccc1")
    neutral = rdMolStandardize.Uncharger().uncharge(benzoate)
    self.assertEqual(Chem.MolToSmiles(neutral), "O=C(O)c1ccccc1")

    # Uncharger with and without canonical ordering on the same input:
    # the two settings are expected to protonate different carboxylates.
    zwitterion = Chem.MolFromSmiles("C[N+](C)(C)CC(C(=O)[O-])CC(=O)[O-]")
    expectations = (
        (False, "C[N+](C)(C)CC(CC(=O)[O-])C(=O)O"),
        (True, "C[N+](C)(C)CC(CC(=O)O)C(=O)[O-]"),
    )
    for canonical, expected_smiles in expectations:
        ordered_uncharger = rdMolStandardize.Uncharger(canonicalOrder=canonical)
        result = ordered_uncharger.uncharge(zwitterion)
        self.assertEqual(Chem.MolToSmiles(result), expected_smiles)
def preprocess_smi(smi):
    """Clean a SMILES string by running it through four successive filters.

    The filters are applied in order: canonicalization, salt removal,
    charge neutralization and tautomer canonicalization.

    Parameters
    ----------
    smi : str
        Input SMILES string.

    Returns
    -------
    str or None
        The cleaned SMILES string, or None when the input cannot be
        parsed or converted to canonical SMILES.
    """
    # Filter 1 - Convert to canonical SMILES.
    try:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles reports a parse failure by returning None, so test
        # for it explicitly instead of relying on a later call to blow up.
        if mol is None:
            return None
        can_smi = Chem.MolToSmiles(mol, True)
    except Exception:
        # Non-string input or a conversion error; system-exiting
        # exceptions such as KeyboardInterrupt are no longer swallowed.
        return None

    # Filter 2 - Remove salt.
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(can_smi)
    res, _deleted = remover.StripMolWithDeleted(mol, dontRemoveEverything=True)
    removed_salt_smi = Chem.MolToSmiles(res)

    # Filter 3 - Remove charge.
    uncharger = rdMolStandardize.Uncharger()
    m = Chem.MolFromSmiles(removed_salt_smi)
    p = uncharger.uncharge(m)
    uncharged_smi = Chem.MolToSmiles(p)

    # Filter 4 - Standardize the tautomer.
    clean_smi = MolStandardize.canonicalize_tautomer_smiles(uncharged_smi)
    return clean_smi
def predict(mol, uncharged=True):
    """Run the base and acid predictors on a molecule with hydrogens added.

    :param mol: molecule to analyse.
    :param uncharged: when true, the molecule is first passed through
        ``rdMolStandardize.Uncharger`` before hydrogens are added.
    :returns: tuple ``(base_dict, acid_dict)`` holding the results of
        ``predict_base`` and ``predict_acid`` respectively.
    """
    if uncharged:
        mol = rdMolStandardize.Uncharger().uncharge(mol)

    mol_with_hs = AllChem.AddHs(mol)

    # Base prediction runs before acid prediction, as in the original flow.
    bases = predict_base(mol_with_hs)
    acids = predict_acid(mol_with_hs)
    return bases, acids
def run_filter_mol(smiles_info, child_dict):
    """
    This takes a ligand's SMILES info and the selected filter list
    (child_dict) and runs the ligand through the selected filters.

    Inputs:
    :param list smiles_info: A list with info about a ligand, the SMILES
        string is idx=0 and the name/ID is idx=1.
        example: smiles_info ["CCCCCCC","zinc123"]
    :param dict child_dict: This dictionary contains all the names of the
        chosen filters as keys and the filter objects as the items,
        or None if the user specifies no filters.

    Returns:
    :returns: list result: a two-item list ``[smiles_info, status]`` where
        status is
        "Filter_Passed" if the mol passed every selected filter (or no
        filters were selected),
        "Sanitize_fail" if the mol could not be parsed, sanitized,
        deprotonated or uncharged,
        "Filter_fail" if the mol fails one or more filters.
    """
    smiles_string = smiles_info[0]

    mol = Chem.MolFromSmiles(smiles_string, sanitize=False)
    # MolFromSmiles returns None for an unparsable SMILES string; stop here
    # rather than handing None to the helpers below.
    if mol is None:
        return [smiles_info, "Sanitize_fail"]

    # Remove charge from mol objects as the final preparation step. This
    # affects some properties such as logP, Mol refractivity and polar
    # surface area, which can impact filters such as Ghose and
    # VandeWaterbeemd. This is done because logP is traditionally applied
    # to neutral molecules.
    uncharger_obj = rdMolStandardize.Uncharger()

    # Sanitization is necessary later, so it is checked both before and
    # after the deprotonation attempt. Any step yielding None is a failure.
    preparation_steps = (
        MOH.check_sanitization,
        MOH.try_deprotanation,
        MOH.check_sanitization,
        uncharger_obj.uncharge,
    )
    for step in preparation_steps:
        mol = step(mol)
        if mol is None:
            return [smiles_info, "Sanitize_fail"]

    # Run through the filters only when the user selected some.
    if child_dict is not None:
        filter_result = run_all_selected_filters(mol, child_dict)
        if filter_result is False:
            return [smiles_info, "Filter_fail"]

    # Either every filter passed or there were no filters to run.
    return [smiles_info, "Filter_Passed"]
def standardize_format(mol):
    """Return a cleaned-up, standardized version of ``mol``.

    The molecule goes through ``rdMolStandardize.Cleanup``, is reduced via
    ``get_biggest_component``, neutralized with an ``Uncharger`` and finally
    handed to ``remove_isotopes``.
    """
    cleaned = rdMolStandardize.Cleanup(mol)
    largest = get_biggest_component(cleaned)
    neutral = rdMolStandardize.Uncharger().uncharge(largest)
    # Return value is not used here; presumably this edits the molecule
    # in place -- verify against remove_isotopes.
    remove_isotopes(neutral)
    return neutral
def _collect_canonical_smiles(mols):
    """Standardize every molecule in ``mols`` and return their canonical SMILES as a set."""
    charger = rdMolStandardize.Uncharger()
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()

    smiles = set()
    for mol in tqdm(mols):
        try:
            mol = disconnector.Disconnect(mol)
            mol = normalizer.normalize(mol)
            mol = chooser.choose(mol)
            mol = charger.uncharge(mol)
            # Disconnect and normalize a second time after fragment
            # selection and neutralization.
            mol = disconnector.Disconnect(mol)
            mol = normalizer.normalize(mol)
            smileR = Chem.MolToSmiles(mol, 0)
            smiles.add(Chem.CanonSmiles(smileR))
        except Exception:
            # Best effort: skip entries that cannot be processed, but let
            # KeyboardInterrupt / SystemExit propagate.
            print('Parsing Error:')
    return smiles


def corpus(input, output, suffix='sdf'):
    """Build a token vocabulary and a SMILES corpus from a molecule file.

    Parameters
    ----------
    input : str
        Path of the input file. With ``suffix == 'sdf'`` it is opened with
        ``gzip.open`` and read as an SD file; otherwise it is read with
        ``pandas.read_table`` and its ``Smiles`` column is used.
    output : str
        Prefix of the output files: ``<output>_voc.txt`` receives the sorted
        token vocabulary (one token per line) and ``<output>_corpus.txt`` a
        tab-separated table with ``Smiles`` and ``Token`` columns.
    suffix : str
        Input format selector, ``'sdf'`` by default.
    """
    voc = Voc('data/voc_smiles.txt')

    if suffix == 'sdf':
        # ForwardSDMolSupplier is consumed while iterating, so the gzip
        # handle has to stay open for the whole pass and is closed after.
        with gzip.open(input) as inf:
            smiles = _collect_canonical_smiles(Chem.ForwardSDMolSupplier(inf))
    else:
        df = pd.read_table(input).Smiles.dropna()
        smiles = _collect_canonical_smiles([Chem.MolFromSmiles(s) for s in df])

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        # Skip entries without any carbon token.
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        # Skip entries that still contain sodium or zinc tokens.
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        # Keep only sequences of more than 10 and at most 100 tokens.
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    # drop_duplicates returns a new frame; keep the result so the call
    # actually takes effect.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
def uncharge(mol):
    """Neutralize a molecule with RDKit's ``Uncharger``.

    Charges are neutralized where possible by adding and/or removing
    hydrogens.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        Molecule whose charges should be neutralized.

    Returns
    -------
    mol: rdkit.Chem.Mol
        The molecule returned by ``Uncharger.uncharge``.
    """
    uncharger = rdMolStandardize.Uncharger()
    neutral_mol = uncharger.uncharge(mol)
    return neutral_mol
def standardize_mol(
    mol: Chem.rdchem.Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
):
    r"""
    Return a standardized copy of the given molecule.

    Each step is optional and the enabled steps run in the order of the
    arguments below. The work is done on a copy obtained from `copy_mol`.

    Arguments:
        mol: The molecule to standardize.
        disconnect_metals: Whether to disconnect the metallic atoms from non-metals.
        normalize: Whether to apply normalization (correct functional groups and
            recombine charges).
        reionize: Whether to apply molecule reionization.
        uncharge: Whether to remove all charge from the molecule.
        stereo: Whether to attempt to assign stereochemistry.

    Returns:
        mol: The standardized molecule.
    """
    result = copy_mol(mol)

    if disconnect_metals:
        result = rdMolStandardize.MetalDisconnector().Disconnect(result)

    if normalize:
        result = rdMolStandardize.Normalize(result)

    if reionize:
        result = rdMolStandardize.Reionizer().reionize(result)

    if uncharge:
        result = rdMolStandardize.Uncharger().uncharge(result)

    if stereo:
        # Called for its effect on `result`; the return value is not used.
        Chem.AssignStereochemistry(result, force=False, cleanIt=True)

    return result
def standardize_mol(mol):
    """
    Run a fixed standardization pipeline on a molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to standardize.

    Returns
    -------
    rdkit.Chem.rdchem.Mol or None
        The standardized molecule, or None if any step raised an exception
        (the error is printed).
    """
    try:
        # Sanitize; called for its effect on `mol`, the result is unused.
        Chem.SanitizeMol(mol)

        # Strip hydrogens with Chem.RemoveHs.
        without_hs = Chem.RemoveHs(mol)

        # Disconnect metals from the molecule.
        disconnector = rdMolStandardize.MetalDisconnector()
        disconnected = disconnector.Disconnect(without_hs)

        # Normalize, then reionize.
        normalized = rdMolStandardize.Normalize(disconnected)
        reionized = rdMolStandardize.Reionize(normalized)

        # Uncharge (this helps to standardize protonation states).
        standardized = rdMolStandardize.Uncharger().uncharge(reionized)

        # Assign stereochemistry on the final molecule.
        Chem.AssignStereochemistry(standardized, force=True, cleanIt=True)
    except Exception as e:
        print(f"ERROR in standardization: {e}")
        return None
    else:
        return standardized
def uncharge_mol(m):
    """Neutralize ``m`` with a canonical-order Uncharger and return the result.

    >>> def uncharge_smiles(smi): return Chem.MolToSmiles(uncharge_mol(Chem.MolFromSmiles(smi)))
    >>> uncharge_smiles('[NH3+]CCC')
    'CCCN'
    >>> uncharge_smiles('[NH3+]CCC[O-]')
    'NCCCO'
    >>> uncharge_smiles('C[N+](C)(C)CCC[O-]')
    'C[N+](C)(C)CCC[O-]'
    >>> uncharge_smiles('CC[NH+](C)C.[Cl-]')
    'CCN(C)C.Cl'
    >>> uncharge_smiles('CC(=O)[O-]')
    'CC(=O)O'
    >>> uncharge_smiles('CC(=O)[O-].[Na+]')
    'CC(=O)[O-].[Na+]'
    >>> uncharge_smiles('[NH3+]CC(=O)[O-].[Na+]')
    'NCC(=O)[O-].[Na+]'
    >>> uncharge_smiles('CC(=O)[O-].C[NH+](C)C')
    'CC(=O)O.CN(C)C'

    Alcohols are protonated before acids:

    >>> uncharge_smiles('[O-]C([N+](C)C)CC(=O)[O-]')
    'C[N+](C)C(O)CC(=O)[O-]'

    The neutralization is carried out in a canonical order, so the atom
    ordering of the input structure does not matter:

    >>> uncharge_smiles('C[N+](C)(C)CC([O-])CC[O-]')
    'C[N+](C)(C)CC([O-])CCO'
    >>> uncharge_smiles('C[N+](C)(C)CC(CC[O-])[O-]')
    'C[N+](C)(C)CC([O-])CCO'

    """
    neutralized = rdMolStandardize.Uncharger(canonicalOrder=True).uncharge(m)
    # Refresh the property cache in non-strict mode before handing it back.
    neutralized.UpdatePropertyCache(strict=False)
    return neutralized
frag = r.StripMol(frag) if frag.GetNumAtoms() == 0: continue elif is_nonorganic(frag): continue else: nonorg = contains_nonorg(frag) try: frag = rdMolStandardize.Normalize(frag) except ValueError as e: stand_mol_list.append(("Failed at normalize", index, None, mixture, nonorg, str(e))) continue try: frag = rdMolStandardize.Uncharger().uncharge(frag) except ValueError as e: stand_mol_list.append( ("Failed at neutralising", index, None, mixture, nonorg, str(e))) continue if flow_variables['stereo'] == "Remove": try: Chem.RemoveStereochemistry(frag) except ValueError as e: stand_mol_list.append( ("Failed at stereochem remove", index, None, mixture, nonorg, str(e))) continue
:return: """ out_dict = {} conf = mol.GetConformer() atoms = mol.GetAtoms() for atom in atoms: # Get res_name res_name, atom_name, position = get_res_atom_name(atom, conf) if res_name in out_dict: out_dict[res_name][atom_name] = position else: out_dict[res_name] = {atom_name: position} return out_dict uncharger = rdMolStandardize.Uncharger() def standardize(mol): mol = rdMolStandardize.Cleanup(mol) mol = fragment(mol) mol = uncharger.uncharge(mol) remove_isotopes(mol) return mol def remove_isotopes(mol): for atom in mol.GetAtoms(): atom.SetIsotope(0)