def write_smiles_id_file(meas, target, activity_type, max_heavy):
    """
    Format dataset for MMP analysis and write to temp file

    Two files are written to the current working directory:

    * ``<target>_<activity_type>_ligands.smi`` -- one ``"<smiles> <id>"`` line
      per accepted compound (salt-stripped, canonical SMILES).
    * ``<target>_<activity_type>_problem_smiles.smi`` -- one
      ``"<original smiles> <id>"`` line per compound whose processing raised.

    Parameters
    ----------
    meas : mapping of compound id -> sequence whose first item is a dict
        holding the compound's SMILES under the key ``'smiles'``.
    target, activity_type : str
        Used to build the two output file names.
    max_heavy : int
        Compounds whose stripped molecule has more atoms than this are
        silently skipped.
    """
    from rdkit import Chem
    from rdkit.Chem import SaltRemover

    remover = SaltRemover.SaltRemover()
    smifile = target + "_" + activity_type + "_ligands.smi"
    error_files = target + "_" + activity_type + "_problem_smiles.smi"

    # Context managers guarantee both files are closed even if an unexpected
    # error escapes the loop.
    with open(smifile, 'w') as f, open(error_files, 'w') as g:
        for mol in meas:
            try:
                cpd = Chem.MolFromSmiles(meas[mol][0]['smiles'])
                res = remover.StripMol(cpd)  # Remove Salts
                if res.GetNumAtoms() > max_heavy:
                    continue
                smiles = Chem.MolToSmiles(res, True)  # Canonicalize smiles
                if "." in smiles:
                    # More than one fragment survived salt stripping.
                    print("Found unknown salt in ", mol, ": ", smiles)
                    print("This compound will be ignored in all further calculations.")
                    continue
                f.write(smiles + " " + mol + '\n')
            except Exception:
                # Anything that fails to parse/strip is recorded for manual
                # inspection; KeyboardInterrupt/SystemExit are no longer
                # swallowed.
                g.write(meas[mol][0]['smiles'] + " " + mol + "\n")
def smiles_to_mol(smiles: list, max_attempts: int = 10,
                  use_random_coords: bool = False,
                  deisomerize=False) -> dict:
    """Build 3D-embedded, UFF-optimized molecules for a list of SMILES.

    Each SMILES is parsed (optionally after ``deisomerize_smiles``), stripped
    of salts, protonated, embedded and optimized. SMILES for which any step
    fails are logged with a warning and left out of the result.

    Parameters
    ----------
    smiles : list of str
        Input SMILES strings; they are also used as the keys of the result.
    max_attempts : int
        Forwarded to ``AllChem.EmbedMolecule`` as ``maxAttempts``.
    use_random_coords : bool
        Forwarded to ``AllChem.EmbedMolecule`` as ``useRandomCoords``.
    deisomerize : bool
        If true, every SMILES is passed through ``deisomerize_smiles`` before
        parsing.

    Returns
    -------
    dict
        Maps each successfully processed input SMILES to its molecule. A
        SMILES that occurs more than once keeps the last molecule built.
    """
    if deisomerize:
        mols_raw = [Chem.MolFromSmiles(deisomerize_smiles(smi)) for smi in smiles]
    else:
        mols_raw = [Chem.MolFromSmiles(smi) for smi in smiles]

    logger.info("Computing 3D coordinates...")
    remover = SaltRemover.SaltRemover()
    mols = {}

    # The context manager closes the progress bar when the loop is done (the
    # bar used to be left open).
    with tqdm(total=len(mols_raw)) as pbar:
        for smi, mol in zip(smiles, mols_raw):
            pbar.update()
            logger.debug("Embedding %s", smi)
            try:
                mol = remover.StripMol(mol, dontRemoveEverything=True)
                mol = Chem.AddHs(mol)
                AllChem.Compute2DCoords(mol)
                conf_id = AllChem.EmbedMolecule(
                    mol,
                    maxAttempts=max_attempts,
                    useRandomCoords=use_random_coords)
                if conf_id < 0:
                    # EmbedMolecule signals failure with -1 instead of
                    # raising; make the failure explicit rather than relying
                    # on the optimizer to choke on a conformer-less molecule.
                    raise ValueError("3D embedding failed")
                AllChem.UFFOptimizeMolecule(mol)  # Is this deterministic?
            except Exception as e:
                logger.warning("Exception for %s: %s", smi, e)
            else:
                mols[smi] = mol

    logger.info("Finished embedding all molecules")
    return mols
def RemoveSaltsFromFrame(frame, molCol='ROMol'):
    '''
    Removes salts from mols in pandas DataFrame's ROMol column

    The column named by ``molCol`` is replaced in place with the result of
    ``SaltRemover.StripMol`` for each molecule; nothing is returned. The
    module-level ``_saltRemover`` is created on first use and reused
    afterwards.
    '''
    global _saltRemover
    if _saltRemover is None:
        from rdkit.Chem import SaltRemover
        _saltRemover = SaltRemover.SaltRemover()
    # Map over the molecule column only. A row-wise ``frame.apply(..., axis=1)``
    # builds a Series per row and, for an empty frame, returns a DataFrame
    # that cannot be assigned back to a single column.
    frame[molCol] = frame[molCol].apply(_saltRemover.StripMol)
def process_mol(mol):
    """Standardize ``mol`` and return the InChI of the processed structure.

    Steps, in order: largest-fragment selection, organic check, salt
    stripping, normalization, tautomer canonicalization, metal disconnection
    and a final ``check_valid_atoms`` call.

    Raises
    ------
    ManualReviewException
        If the molecule is not organic or any step returns ``None``.
    """
    # removal of mixtures
    fragmenter_object = molvs.fragment.LargestFragmentChooser(
        prefer_organic=True)
    mol = fragmenter_object.choose(mol)
    if mol is None:
        logging.info("Mixture removal failed for molecule")
        # Previously execution continued and crashed inside is_organic(None);
        # fail the same way every other step does.
        raise ManualReviewException("Mixture removal failed for molecule")

    # removal of inorganics
    if not molvs.fragment.is_organic(mol):
        raise ManualReviewException("Molecule is not organic")

    # removal of salts
    remover = SaltRemover.SaltRemover()
    mol = remover.StripMol(
        mol, dontRemoveEverything=True)  # tartrate is listed as a salt? what do?
    if mol is None:
        raise ManualReviewException("Salt removal failed for molecule")

    # structure normalization
    normalizer = molvs.normalize.Normalizer(
        normalizations=molvs.normalize.NORMALIZATIONS,
        max_restarts=molvs.normalize.MAX_RESTARTS)
    mol = normalizer.normalize(mol)
    if mol is None:
        raise ManualReviewException("Normalization failed for molecule")

    # tautomer selection
    tautomerizer = molvs.tautomer.TautomerCanonicalizer(
        transforms=molvs.tautomer.TAUTOMER_TRANSFORMS,
        scores=molvs.tautomer.TAUTOMER_SCORES,
        max_tautomers=molvs.tautomer.MAX_TAUTOMERS)
    # The canonicalizer used to be constructed but never applied, which made
    # the None check below unreachable.
    mol = tautomerizer.canonicalize(mol)
    if mol is None:
        raise ManualReviewException("Tautomerization failed for molecule")

    # disconnect metals
    metal_remover = molvs.metal.MetalDisconnector()
    mol = metal_remover.disconnect(mol)
    if mol is None:
        raise ManualReviewException("Metal removal failed for molecule")

    # final check for only valid atoms
    check_valid_atoms(mol)

    return Chem.MolToInchi(mol)
def initialize_ChEMBL_PDB_conversion():
    """
    Return ChEMBL to PDB conversion and vice versa via uniprot IDs

    Reads three files from the current working directory:

    * ``cc-to-pdb.txt`` -- whitespace separated; first token is a ligand id,
      the remaining tokens are PDB ids.
    * ``pdbtosp.txt`` -- the first 24 and last 6 lines are discarded; only
      records whose second token is ``"X-ray"`` are used.
    * ``Components-smiles-stereo-oe.smi`` -- ``<smiles> <ligand id>`` lines.

    Returns
    -------
    tuple
        ``(PDB_LIG_ID_to_PDB_PROT, PDB_PROT_to_PDB_LIG_ID, PDB_RES,
        PDB_to_UNIPROT, UNIPROT_to_PDB, pdb_ligands)``. The two reverse
        mappings are ``defaultdict(list)`` instances. ``pdb_ligands`` maps a
        ligand id to ``{"smiles": ..., "fp": None}``; the fingerprint slot is
        left empty here.
    """
    # Ligand id <-> PDB entries
    with open("cc-to-pdb.txt", 'r') as f:
        cc_rows = [line.split() for line in f.readlines()]
    PDB_LIG_ID_to_PDB_PROT = dict((row[0], row[1:]) for row in cc_rows)
    PDB_PROT_to_PDB_LIG_ID = defaultdict(list)
    for row in cc_rows:
        for pdb_id in row[1:]:
            PDB_PROT_to_PDB_LIG_ID[pdb_id].append(row[0])

    # PDB entry <-> UniProt accessions
    with open("pdbtosp.txt", "r") as f:
        text = f.read()
    text = text.replace("\n ", "")  # join continuation lines onto their record
    records = text.split("\n")[24:-6]  # drop file header and footer

    PDB_RES = {}
    PDB_to_UNIPROT = {}
    UNIPROT_to_PDB = defaultdict(list)
    for record in records:
        fields = record.split()
        if fields[1] != "X-ray":
            continue
        # str.lower() replaces the Python-2-only string.lower() function.
        PDB_RES[fields[0].lower()] = fields[2]
        pdb_code = record[:4].lower()
        # Everything from column 41 on is split at "(" and the first six
        # characters of every piece are kept (this includes the piece that
        # precedes the first parenthesis).
        uniprot_ids = [piece[:6] for piece in record[41:].split("(")]
        PDB_to_UNIPROT[pdb_code] = uniprot_ids
        for uniprot_id in uniprot_ids:
            UNIPROT_to_PDB[uniprot_id].append(pdb_code)

    # Read PDB Ligand file; fingerprints are not computed here ("fp" is None)
    pdb_ligands = {}
    with open("Components-smiles-stereo-oe.smi", "r") as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 2:
                pdb_ligands[fields[1]] = {"smiles": fields[0], "fp": None}

    return (PDB_LIG_ID_to_PDB_PROT, PDB_PROT_to_PDB_LIG_ID, PDB_RES,
            PDB_to_UNIPROT, UNIPROT_to_PDB, pdb_ligands)
def screen_organic(smiles):
    """
    Heuristic to determine if a input SMILES string is considered as only
    organic matter.

    A SMILES passes only if it parses, salt stripping removes no atoms, it
    consists of a single fragment, and every atom is one of
    H, B, C, N, O, F, P, S, Cl, Br, I.

    Parameters
    -----------
    smiles : str

    Returns
    ------------
    is_organic : bool
    """
    if smiles is None:
        return False

    salt_stripper = SaltRemover.SaltRemover()

    # SMARTS matching any atom that is NOT one of the organic elements
    # H, B, C, N, O, F, P, S, Cl, Br, I
    non_organic_atom = Chem.MolFromSmarts(
        '[!$([#1,#5,#6,#7,#8,#9,#15,#16,#17,#35,#53])]')

    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    if mol is None:
        return False

    # reject anything the salt remover would shrink
    stripped = salt_stripper.StripMol(mol)
    lost_atoms = (stripped is not None
                  and stripped.GetNumAtoms() < mol.GetNumAtoms())
    if lost_atoms:
        return False

    # reject multi-fragment inputs
    if len(AllChem.GetMolFrags(mol, asMols=True)) > 1:
        return False

    # organic means: no atom outside the allowed element list
    return not mol.HasSubstructMatch(non_organic_atom)
def filter_ions(df):
    """Strip common counter-ions and drop compounds that become duplicates.

    Reads molecules from the ``mol_send`` column and adds two columns to
    ``df``: ``mol_strip`` (salt-stripped molecule) and ``smilesf`` (the
    longest ``.``-separated component of the stripped molecule's SMILES).
    Rows with a repeated ``smilesf`` are then dropped in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place; must contain a ``mol_send`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # print() calls replace the Python 2 print statements.
    print('You provided {} molecules to the salt remover'.format(df.shape[0]))

    # Strip common ions out of molecule objects
    remover = SaltRemover.SaltRemover(
        defnData="[Li,Na,K,Rb,Cs,Mg,Ca,Sr,Ba,Zn,Cl,Br,F,I]")
    df['mol_strip'] = df['mol_send'].map(remover.StripMol)
    df['smilesf'] = df['mol_strip'].map(Chem.MolToSmiles)
    # Keep only the longest fragment string if several fragments remain.
    df['smilesf'] = df['smilesf'].map(lambda x: max(x.split('.'), key=len))

    print("CAUTION. You are removing ions and other fragments. "
          "However, the fingerprints used to calculate diversity "
          "were determined before removal. "
          "Consider recalculating fingerprints.")
    print('Filter will try to remove duplicates after de-salting...')

    # remove duplicates remaining after removing counterions
    df.drop_duplicates(inplace=True, subset='smilesf')
    print('After duplicate removal, there are {} molecules'.format(df.shape[0]))
    print('...')
    return df
def get_pdbs_with_similar_ligands(circles, target, tc, pdbids,
                                  PDB_PROT_to_PDB_LIG_ID, pdb_ligands, meas):
    """
    Find similar ligands within pdb files

    For every compound of every circle, the salt-stripped compound is
    fingerprinted and compared with the ligands of each PDB id in ``pdbids``.

    Parameters
    ----------
    circles : iterable of iterables of compound ids (keys of ``meas``).
    target : unused by this function.
    tc : similarity threshold; hits need ``similarity >= tc``.
    pdbids : iterable of PDB ids to look up in ``PDB_PROT_to_PDB_LIG_ID``.
    PDB_PROT_to_PDB_LIG_ID : mapping of PDB id -> list of ligand ids.
    pdb_ligands : dict of ligand id -> ``{"smiles": ..., "fp": ...}``.
        Used as a cache and MUTATED: fingerprints are filled in on first use
        and ligands that fail processing are replaced by ``{"fp": "skip"}``.
    meas : mapping of compound id -> sequence whose first item holds the
        compound's SMILES under ``'smiles'``.

    Returns
    -------
    list
        One list per circle, holding one list per compound of
        ``(pdbid, similarity)`` tuples.
    """
    remover = SaltRemover.SaltRemover()
    similar_pdbs = []
    for circle in circles:
        circle_pdbs = []
        for cpd in circle:
            t_smi = meas[cpd][0]['smiles']
            t_cpd = Chem.MolFromSmiles(t_smi)
            t_res = remover.StripMol(t_cpd)  # Remove Salts
            t_fp = FingerprintMols.FingerprintMol(t_res)
            pdbs = []
            for pdbid in pdbids:
                for lig in PDB_PROT_to_PDB_LIG_ID[pdbid]:
                    try:
                        if pdb_ligands[lig]["fp"] == "skip":
                            continue
                        if pdb_ligands[lig]["fp"] is None:
                            # Separate name: this used to overwrite the
                            # compound loop variable ``cpd``.
                            lig_mol = Chem.MolFromSmiles(
                                pdb_ligands[lig]["smiles"])
                            res = remover.StripMol(lig_mol)  # Remove Salts
                            # NOTE(review): max_heavy is not defined in this
                            # function; it must exist at module level,
                            # otherwise the NameError is caught below and
                            # every ligand is marked "skip" -- confirm.
                            if res.GetNumAtoms() > max_heavy:
                                continue  # if the ligand is too large
                            smiles = Chem.MolToSmiles(res)  # Canonicalize smiles
                            fp = FingerprintMols.FingerprintMol(res)
                            pdb_ligands[lig] = ({"smiles": smiles,
                                                 "fp": fp,
                                                 "mol": lig_mol})
                    except Exception:
                        # Unknown or unparsable ligand: remember it so it is
                        # not retried. KeyboardInterrupt/SystemExit are no
                        # longer swallowed.
                        pdb_ligands[lig] = ({"fp": "skip"})
                        continue
                    sim = DataStructs.FingerprintSimilarity(
                        t_fp, pdb_ligands[lig]["fp"])
                    if sim >= tc:
                        pdbs.append((pdbid, sim))
            circle_pdbs.append(pdbs)
        similar_pdbs.append(circle_pdbs)
    return similar_pdbs
def main(prm_file):
    """Clean an SDF library, shuffle it and write it out split by weight.

    Molecules are salt-stripped, reduced to their largest fragment and
    re-serialized as SMILES; ``ID`` becomes ``"CB_" + CB_ID``. The frame is
    shuffled, ``MW`` is taken from ``QED.properties`` and the rows are
    written to ``<prefix>.<bin>.csv.bz2`` and ``<prefix>.<bin>.smi`` for the
    bins ``frag`` (150 < MW <= 300), ``lead`` (300 < MW <= 400),
    ``drug`` (MW > 400) and ``small`` (MW <= 150), where ``<prefix>`` is
    ``prm_file`` up to the first ``.sdf``.

    Parameters
    ----------
    prm_file : str
        Path of the input SDF file.
    """
    pref = prm_file.split('.sdf')[0]

    print('## Reading file...')
    prm_df = PandasTools.LoadSDF(prm_file,
                                 smilesName='SMILES',
                                 molColName='MOL',
                                 includeFingerprints=False)
    print(prm_df[:10])

    ## remove salts and rename the smiles
    print('## Cleaning moleucles...')
    remover = SaltRemover.SaltRemover()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)
    prm_df['molx'] = prm_df.MOL.apply(remover.StripMol)
    prm_df['mol'] = prm_df.molx.apply(chooser.choose)
    prm_df['smiles'] = prm_df.mol.apply(Chem.MolToSmiles)

    def add_cb(inp):
        return 'CB_' + str(inp)

    prm_df['ID'] = prm_df.CB_ID.apply(add_cb)

    ## shuffle
    print('## Shuffling molecules...')
    # The shuffled frame used to be stored in an unused variable, so the
    # output was never actually shuffled.
    prm_df = prm_df.sample(frac=1).reset_index(drop=True)
    print(prm_df[:10])

    ## recalculate molecular properties
    print('## Calculating properties...')
    prm_df['qed'] = prm_df.mol.apply(QED.properties)
    prm_df['MW'] = prm_df.qed.apply(lambda x: x.MW)
    # NOTE(review): the other property columns written below (HA, logP,
    # LogS, HBA, ...) are not computed here; presumably they come from the
    # SDF tags -- confirm.
    print(prm_df[:10])
    print(' > number of molecules... ', len(prm_df))

    ## print out molecule properties and smiles (shuffled)
    print('## Writing results...')
    Cols_csv = [
        'ID', 'MW', 'HA', 'logP', 'LogS', 'HBA', 'HBD', 'PSA', 'ROTB', 'AROM',
        'SaltType', 'smiles'
    ]
    Cols_smi = ['smiles', 'ID']

    mw = prm_df.MW
    weight_bins = [
        ('frag', (mw > 150.) & (mw <= 300.)),
        ('lead', (mw > 300.) & (mw <= 400.)),
        ('drug', mw > 400.),
        ('small', mw <= 150.),
    ]
    for label, mask in weight_bins:
        subset = prm_df.loc[mask]
        subset.to_csv(pref + '.' + label + '.csv.bz2',
                      sep=',',
                      float_format='%.2f',
                      columns=Cols_csv,
                      index=False)
        subset.to_csv(pref + '.' + label + '.smi',
                      sep='\t',
                      columns=Cols_smi,
                      index=False)
def process_mol(self):
    """Run the standardization pipeline on ``self.mol``.

    Steps, in order: largest-fragment selection, organic check, salt
    stripping, normalization, tautomer canonicalization and metal
    disconnection. Whenever a step changes the molecule's InChI a note is
    added to ``self.history``. If a step returns ``None`` or the molecule is
    not organic, a ``REJECT`` note is recorded, ``self.rejected`` is set and
    ``False`` is returned. ``self.mol`` is only replaced once every step and
    ``self.check_valid_atoms()`` have succeeded.

    Returns
    -------
    bool
        ``True`` if the molecule passed, ``False`` otherwise.
    """

    def note(message):
        self.history.add_modification(text=message)

    def reject(message):
        note(message)
        self.rejected = True
        return False

    def advance(current, transform, failure_text, change_text):
        # Apply one step; returns the new molecule, or None after recording
        # the rejection.
        candidate = transform(current)
        if candidate is None:
            reject(failure_text)
            return None
        if Chem.MolToInchi(candidate) != Chem.MolToInchi(current):
            note(change_text)
        return candidate

    # Each step builds its helper object only when it is actually reached.
    def choose_fragment(m):
        chooser = molvs.fragment.LargestFragmentChooser(prefer_organic=True)
        return chooser.choose(m)

    def strip_salts(m):
        # tartrate is listed as a salt? what do?
        return SaltRemover.SaltRemover().StripMol(
            m, dontRemoveEverything=True)

    def normalize(m):
        normalizer = molvs.normalize.Normalizer(
            normalizations=molvs.normalize.NORMALIZATIONS,
            max_restarts=molvs.normalize.MAX_RESTARTS)
        return normalizer.normalize(m)

    def canonicalize_tautomer(m):
        tautomerizer = molvs.tautomer.TautomerCanonicalizer(
            transforms=molvs.tautomer.TAUTOMER_TRANSFORMS,
            scores=molvs.tautomer.TAUTOMER_SCORES,
            max_tautomers=molvs.tautomer.MAX_TAUTOMERS)
        return tautomerizer(m)

    def disconnect_metals(m):
        return molvs.metal.MetalDisconnector().disconnect(m)

    # removal of mixtures
    mol = advance(self.mol, choose_fragment,
                  "REJECT: Fragment chooser failed",
                  "Detected mixture, chose largest fragment")
    if mol is None:
        return False

    # removal of inorganics
    if not molvs.fragment.is_organic(mol):
        return reject("REJECT: Molecule is not organic")

    remaining_steps = (
        (strip_salts,
         "REJECT: Salt removal failed",
         "Detected salts, removed"),
        (normalize,
         "REJECT: Normalization failed",
         "Normalization(s) applied"),
        (canonicalize_tautomer,
         "REJECT: Tautomerization failed",
         "Tautomer(s) canonicalized"),
        (disconnect_metals,
         "REJECT: Metal removal failed",
         "Metal(s) disconnected"),
    )
    for transform, failure_text, change_text in remaining_steps:
        mol = advance(mol, transform, failure_text, change_text)
        if mol is None:
            return False

    # final check for only valid atoms
    # NOTE(review): this runs before self.mol is replaced below, so it
    # presumably inspects the unprocessed molecule -- confirm against
    # check_valid_atoms.
    if not self.check_valid_atoms():
        return False

    note("Passed validation")
    self.mol = mol
    return True
def remove_salts(mol):
    """Return ``mol`` after ``StripMol`` of a default-configured SaltRemover.

    The molecule object is returned, not a SMILES string.
    """
    return SaltRemover.SaltRemover().StripMol(mol)
def _remove_salts(mol):
    """Strip salts from ``mol`` with ``dontRemoveEverything=True``."""
    stripper = SaltRemover.SaltRemover()
    stripped = stripper.StripMol(mol, dontRemoveEverything=True)
    return stripped
def build_ligand_dictionary_from_infile(
    infile: str,
    error_path: str,
    props,
    units,
    *,
    delimiter=None,
    series_column=None,
):
    """
    Read input file and assemble dictionaries

    Parameters
    ----------
    infile : str
        Delimited text file with one header line and one compound per line.
        Lines whose first column starts with ``#`` and blank lines are
        skipped.
    error_path : str
        File that receives every SMILES that could not be processed.
    props : list of str
        Header names of the activity columns. If empty, the third column is
        used.
    units : list of str
        One entry per activity column, or empty. ``"noconv"`` stores the
        value unchanged as pAct, ``"log10"`` stores ``log10(value)``, any
        other entry is looked up in ``unit_conv`` and the value is stored as
        ``-log10(value * factor)``. With no units given, columns that need
        conversion are treated as uM.
    delimiter : str, optional
        ``"comma"``, ``"tab"``, ``"space"``, ``"semicolon"`` or a literal
        delimiter string. Defaults to tab.
    series_column : str, optional
        Header name of a column copied into each compound's ``"series"``.

    Returns
    -------
    tuple
        ``(meas, props, units)``. ``meas`` maps compound id to a dict with
        keys ``smiles``, ``Act``, ``pAct``, ``qualifiers``, ``mwt`` and
        ``series``. ``units`` is filled with ``"noconv"`` if it was empty.

    Raises
    ------
    RuntimeError
        On ambiguous columns, duplicate identifiers, short lines and
        activity values that cannot be interpreted.
    """
    delimiter = {
        "comma": ",",
        "tab": "\t",
        None: "\t",
        "space": " ",
        "semicolon": ";",
    }.get(delimiter, delimiter)

    na_values = ("NA", "", "No Value")
    qualifier_chars = (">", "<", "*")

    with open(infile, "r") as f, open(error_path, "w") as g:
        ########
        # Process header
        header = [
            name.strip('"')
            for name in f.readline().rstrip("\n").split(delimiter)
        ]

        ########
        # Figure out Column ID of SMILES and ID column
        id_candidates = [
            i for i, name in enumerate(header) if "SRN" in name or "ID" in name
        ]
        id_col = id_candidates[0] if id_candidates else 0
        smi_candidates = [
            i for i, name in enumerate(header) if "smiles" in name.lower()
        ]
        smi_col = smi_candidates[0] if smi_candidates else 1
        if series_column:
            ser_col = header.index(series_column)

        ########
        # Figure out target column ids
        if not props:
            act_col = [2]
            props = [header[2]]
        else:
            try:
                act_col = [header.index(name) for name in props]
            except Exception:
                print(
                    "Could not find all given Activity columns in file header."
                )
                raise

        ########
        # Figure out conversion of target columns
        # Columns whose name contains one of these flags are taken to be on
        # a log scale already and are not converted.
        log_flags = [
            "pIC50", "pEC50", "pKi", "pKd", "pCC50", "pIC20", "pID50", "noconv"
        ]
        col_convert = [
            not any(flag.lower() in name.lower() for flag in log_flags)
            for name in props
        ]
        log10 = [False for _ in props]

        #########
        # Write Identified Columns to STDOUT
        print("Identifier Column found: " + header[id_col])
        print("Smiles column found: " + header[smi_col])
        for i, name in enumerate(props):
            if len(units) > 0:
                if units[i] == "noconv":
                    col_convert[i] = False
                elif units[i] == "log10":
                    col_convert[i] = False
                    log10[i] = True
            if col_convert[i]:
                print("Activity column #" + str(i + 1) + ": " + name +
                      " will be converted to -log10(" + name + ")")
            elif log10[i]:
                print("Activity column #" + str(i + 1) + ": " + name +
                      " will be converted to log10(" + name + ")")
            else:
                print("Activity column #" + str(i + 1) + ": " + name)
        if series_column:
            print("Series Column found: " + header[ser_col])

        if id_col == smi_col or id_col in act_col or smi_col in act_col:
            print(
                "Was not able to cleanly distinguish ID, SMILES, and activity columns."
            )
            print(
                "Please assign unambiguous names (no overlap in 'SMILES', 'ID', 'SRN'."
            )
            raise RuntimeError

        ########
        # Assemble data
        salt_defns = os.path.join(
            RDConfig.RDDataDir,
            "Salts.txt")  # replace if you have more specific definitions
        remover = SaltRemover.SaltRemover(defnFilename=salt_defns)

        meas = dict()
        smiles_registered = dict()

        for raw_line in f:
            raw_line = raw_line.rstrip("\n")
            if not raw_line.strip():
                # Blank lines (e.g. at the end of the file) used to crash
                # with an IndexError.
                continue
            line = [cell.strip('"') for cell in raw_line.split(delimiter)]
            # startswith() also copes with an empty first column.
            if line[0].startswith("#"):  # skip commented-out compounds
                continue

            # Validate the width before indexing into the line so short
            # lines give the intended message instead of an IndexError.
            if len(line) < len(props) + 2:
                print("Could not properly read line:")
                print(line)
                raise RuntimeError

            compound_id = line[id_col]
            if compound_id in meas:
                print("Two or more entries for the same identifier: " +
                      compound_id)
                print("Please fix.")
                raise RuntimeError

            smiles = line[smi_col].replace("\\\\", "\\")
            try:
                mol = Chem.MolFromSmiles(smiles)
                res = remover.StripMol(mol)  # Remove Salts
                smiles = Chem.MolToSmiles(res, True)  # Canonicalize smiles
                # NOTE(review): the weight is computed on the molecule as
                # read, not on the salt-stripped one -- confirm intent.
                mwt = Descriptors.MolWt(mol)
            except Exception:
                print("Could not properly read SMILES " + smiles +
                      "(see error SMILES file)")
                print("This compound will be ignored.")
                g.write(smiles + "\n")
                continue

            if "." in smiles:
                print("Found unknown salt in " + line[id_col] + ": " + smiles)
                print("This compound will be ignored.")
                continue

            if smiles in smiles_registered:
                print("Two entries with the same structure: " +
                      smiles_registered[smiles] + " and " + compound_id)
                print(
                    "Nonadd will use the first compound and discard the second.\n"
                )
                continue
            smiles_registered[smiles] = compound_id

            entry = dict(smiles=smiles,
                         Act=[],
                         pAct=[],
                         qualifiers=[],
                         mwt=mwt,
                         series=None)
            meas[compound_id] = entry
            if series_column:
                entry["series"] = line[ser_col]

            for i, col in enumerate(act_col):
                raw = line[col]

                u_conv = None
                if col_convert[i]:
                    u_conv = unit_conv["uM"]
                    if len(units) > 0:
                        try:
                            u_conv = unit_conv[units[i]]
                        except (KeyError, IndexError, TypeError):
                            # Best effort: warn and keep the uM default.
                            print("Given unit " + units[i] +
                                  " has not been recognized.")
                            print(
                                "Please give one out of [M, mM, uM, nM, pM, noconv]"
                            )

                # Missing measurement
                if raw in na_values:
                    entry["qualifiers"].append("")
                    entry["Act"].append("NA")
                    entry["pAct"].append("NA")
                    continue

                # Split an optional leading qualifier from the number
                if is_number(raw):
                    qualifier, value = "", float(raw)
                elif raw[0] in qualifier_chars and is_number(raw[1:]):
                    qualifier, value = raw[0], float(raw[1:])
                else:
                    print("Did not recognize number " + str(raw))
                    print(" in line: " + " ".join(line))
                    print("Please fix.")
                    raise RuntimeError

                if col_convert[i] or log10[i]:
                    if value <= 0.0:
                        # A logarithm needs a positive value. This now also
                        # covers qualified values (previously a bare math
                        # domain error) and an empty ``units`` list
                        # (previously an IndexError while building the
                        # message).
                        unit_label = units[i] if len(units) > 0 else "uM"
                        print("Cannot interpret measured activity of " + raw +
                              unit_label + " for compound " + compound_id)
                        print("Please fix.")
                        raise RuntimeError
                    act = value
                    if col_convert[i]:
                        pact = (-1) * math.log10(value * u_conv)
                    else:
                        # log10 columns: same sign with and without a
                        # qualifier (the qualified case used to be negated).
                        pact = math.log10(value)
                else:
                    # Value is already on the p-scale.
                    if len(units) > 0 and units[i] == "noconv":
                        act = ""
                    else:
                        act = 1e6 * 10**((-1) * value)
                    pact = value

                entry["qualifiers"].append(qualifier)
                entry["Act"].append(act)
                entry["pAct"].append(pact)

    if len(units) == 0:
        units = ["noconv" for _ in props]
    return meas, props, units