def read_rd_mols_from_sdf_file(sdf_file, sanitize=True):
    """Read every record of an SD file and wrap each one in ``Molecule``.

    Args:
        sdf_file: Path of the SD file. A name ending in ``.gz`` is opened
            with ``gzip`` and read through ``Chem.ForwardSDMolSupplier``;
            any other name is handed to ``Chem.SDMolSupplier``.
        sanitize: Forwarded unchanged to the RDKit supplier.

    Returns:
        A list with ``Molecule(item)`` for each item the supplier yields.
        Items are not filtered, so whatever the supplier yields for an
        unparsable record is passed to ``Molecule`` as well.
    """
    if not sdf_file.endswith('.gz'):
        plain_supplier = Chem.SDMolSupplier(sdf_file, sanitize=sanitize)
        return [Molecule(entry) for entry in plain_supplier]

    # The forward supplier reads lazily, so the list has to be built while
    # the gzip stream is still open.
    with gzip.open(sdf_file) as gz_stream:
        gz_supplier = Chem.ForwardSDMolSupplier(gz_stream, sanitize=sanitize)
        return [Molecule(entry) for entry in gz_supplier]
def ExtractMol(List, score_best, prefix):
    """Look up the structure of each scored entry and write them out by score.

    Args:
        List: Iterable of indexable records. The code reads element 0 as the
            sort key for the output, element 1 as the molecule name,
            element 2 as the identifier of the source file and element 3 as
            an extra value that is carried along.
        score_best: Accepted for interface compatibility; not used here.
        prefix: Output path without extension; ``prefix + '.sdf'`` is written.

    The process exits through ``sys.exit`` when no SD file matching a
    record's file identifier can be found.
    """
    # Bucket the records by source-file identifier, visiting them in sorted
    # identifier order so the buckets are created in that order.
    by_file = {}
    for record in sorted(List, key=lambda tup: tup[2]):
        by_file.setdefault(record[2], []).append(record)

    collected = []
    for file_id in tqdm(by_file, total=len(by_file)):
        # The SD file is expected next to the score file: everything before
        # the first 'txt' in the identifier, followed by 'sdf*'.
        file_prefix = file_id.split('txt')[0]
        matches = glob.glob(file_prefix + 'sdf*')
        if len(matches) == 0:
            sys.exit('{0} or related SD file not found.'.format(file_prefix +
                                                                'sdf*'))

        handle = file_handle(matches[0])
        by_name = {}
        for mol in Chem.ForwardSDMolSupplier(handle, removeHs=False):
            if mol is None:
                continue
            name = mol.GetProp('_Name')
            # Names coming out of docking carry a ':' suffix; key on the
            # part before the first colon in that case.
            key = name.split(':')[0] if re.search(r':', name) else name
            by_name[key] = mol

        for record in by_file[file_id]:
            if record[1] not in by_name:
                print('{0} is not registered in database. Skip.'.format(
                    record[1]))
                continue
            collected.append(
                [record[0], record[1], by_name[record[1]], record[3]])
        del by_name

    # Write every collected molecule, ordered by its score (element 0).
    saved_sdf = Chem.SDWriter(prefix + '.sdf')
    for item in sorted(collected, key=lambda tup: tup[0]):
        saved_sdf.write(item[2])
    saved_sdf.flush()
    saved_sdf.close()
def LoadSDF(filename, idName='ID', molColName='ROMol',
            includeFingerprints=False, isomericSmiles=True, smilesName=None,
            embedProps=False, removeHs=True, strictParsing=True):
    '''Read file in SDF format and return as Pandas data frame.

    If embedProps=True all properties also get embedded in Mol objects in
    the molecule column. If molColName=None molecules would not be present
    in resulting DataFrame (only properties would be read).

    filename may be a path (a name ending in ".gz", case-insensitively, is
    opened with gzip) or an already open binary file object. A file opened
    here is always closed again, also when parsing raises; a file object
    passed in by the caller is left open.

    Records that the supplier yields as None are skipped; the index of the
    returned frame is the position of each kept record in the file.
    '''
    if isinstance(filename, str):
        if filename.lower()[-3:] == ".gz":
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        close = f.close
    else:
        f = filename
        close = None  # don't close an open file that was passed in

    records = []
    indices = []
    try:
        supplier = Chem.ForwardSDMolSupplier(f,
                                             sanitize=(molColName is not None),
                                             removeHs=removeHs,
                                             strictParsing=strictParsing)
        for i, mol in enumerate(supplier):
            if mol is None:
                continue
            # Snapshot the properties before they are (optionally) cleared.
            row = {k: mol.GetProp(k) for k in mol.GetPropNames()}
            if molColName is not None and not embedProps:
                for prop in mol.GetPropNames():
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                try:
                    row[smilesName] = Chem.MolToSmiles(
                        mol, isomericSmiles=isomericSmiles)
                except Exception:
                    # Best effort: keep the row, just without a SMILES.
                    log.warning(
                        'No valid smiles could be generated for molecule %s',
                        i)
                    row[smilesName] = None
            if molColName is not None and not includeFingerprints:
                row[molColName] = mol
            elif molColName is not None:
                row[molColName] = _MolPlusFingerprint(mol)
            records.append(row)
            indices.append(i)
    finally:
        if close is not None:
            close()

    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)
def get_dataframe_from_library(file_path, library, value, index='VPC ID'):
    """Build one frame from the per-molecule rows of an SD file.

    Args:
        file_path: Passed straight to ``Chem.ForwardSDMolSupplier``.
        library, value, index: Forwarded to ``get_molstring_from_library``
            for each molecule.

    Returns:
        The row-wise (``axis=0``) concatenation of whatever
        ``get_molstring_from_library`` returns for each parsed molecule.
    """
    # Read the whole file first; records the supplier yields as None are
    # dropped.
    parsed = [
        entry for entry in Chem.ForwardSDMolSupplier(file_path)
        if entry is not None
    ]
    frames = [
        get_molstring_from_library(entry, library, value, index=index)
        for entry in parsed
    ]
    return pd.concat(frames, axis=0)
def load_from_gzip(input_filepath, filename):
    """Load a gzipped SD file into a plain list of molecules.

    The path is ``os.path.join(input_filepath, filename)``. Records that the
    supplier yields as ``None`` are left out, so the returned list contains
    no ``None`` entries.
    """
    gz_path = os.path.join(input_filepath, filename)
    kept = []
    with gzip.open(gz_path) as gz_stream:
        for entry in Chem.ForwardSDMolSupplier(gz_stream):
            if entry is not None:
                kept.append(entry)
    return kept
def process(
    refmol_filename,
    inputs_filename,
    outputs_filename,
    refmol_index=None,
    refmol_format=None,
    tani=False,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):
    """Score every input molecule against a reference and write the results.

    Args:
        refmol_filename: File holding the reference molecule; read through
            ``utils.read_single_molecule``.
        inputs_filename: SD input, opened with
            ``utils.open_file_for_reading``.
        outputs_filename: SD output, opened with
            ``utils.open_file_for_writing``.
        refmol_index: Forwarded to ``utils.read_single_molecule`` as
            ``index``.
        refmol_format: Forwarded to ``utils.read_single_molecule`` as
            ``format``.
        tani: Forwarded to ``get_SucosScore``; also decides whether the third
            score is stored as ``SuCOS_Tanimoto_Score`` (True) or
            ``SuCOS_Protrude_Score`` (False).
        score_mode: Forwarded to ``get_SucosScore``.

    Each successfully scored molecule is written with the properties
    ``SuCOS_Score``, ``SuCOS_FeatureMap_Score`` and the third score. A
    ``ValueError`` while scoring is logged and counted, and that molecule is
    not written. Input, output and writer are closed even when an unexpected
    exception propagates.
    """
    ref_mol = utils.read_single_molecule(refmol_filename,
                                         index=refmol_index,
                                         format=refmol_format)
    ref_features = getRawFeatures(ref_mol)

    count = 0   # records seen in the input, including unparsable ones
    total = 0   # molecules scored and written
    errors = 0  # molecules whose scoring raised ValueError

    input_file = utils.open_file_for_reading(inputs_filename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputs_filename)
        try:
            writer = Chem.SDWriter(output_file)
            try:
                for mol in suppl:
                    count += 1
                    if mol is None:
                        continue
                    try:
                        sucos_score, fm_score, val3 = get_SucosScore(
                            ref_mol,
                            mol,
                            tani=tani,
                            ref_features=ref_features,
                            score_mode=score_mode,
                        )
                        mol.SetDoubleProp("SuCOS_Score", sucos_score)
                        mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
                        if tani:
                            mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
                        else:
                            mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
                        utils.log("Scores:", sucos_score, fm_score, val3)
                        writer.write(mol)
                        total += 1
                    except ValueError as e:
                        errors += 1
                        # Exceptions have no ``.message`` on Python 3.
                        utils.log("Molecule", count, "failed to score:",
                                  str(e))
                writer.flush()
            finally:
                writer.close()
        finally:
            output_file.close()
    finally:
        input_file.close()

    # ``count`` is what was read and ``total`` is what was written.
    utils.log("Completed.", count, "processed, ", total, "succeeded, ",
              errors, "errors")
def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Read the molecules stored in an SDF file.

    Args:
        urlpath: Either a file-like object (anything deriving from
            `io.IOBase`) or a path, which is opened through `fsspec` and may
            therefore be local or remote. Paths ending in `.gz` or `.gzip`
            are additionally decompressed with `gzip`.
        as_df: Return a pandas DataFrame instead of a list of mols.
        smiles_column: Name of the SMILES column. Only used when `as_df`
            is True.
        mol_column: Name of the mol column. Only used when `as_df` is True.
        include_private: Include private properties in the columns. Only
            used when `as_df` is True.
        include_computed: Include computed properties in the columns. Only
            used when `as_df` is True.

    Returns:
        The molecules that could be parsed, as a list or, when `as_df` is
        True, as the result of `dm.to_df`.
    """
    if not isinstance(urlpath, io.IOBase):
        # Regular local or remote path.
        path_text = str(urlpath)
        with fsspec.open(urlpath) as stream:
            if path_text.endswith(".gz") or path_text.endswith(".gzip"):
                stream = gzip.open(stream)
            mols = [
                entry for entry in Chem.ForwardSDMolSupplier(stream)
                if entry is not None
            ]
    else:
        # Already an open file-like object.
        mols = [
            entry for entry in Chem.ForwardSDMolSupplier(urlpath)
            if entry is not None
        ]

    if not as_df:
        return mols

    return dm.to_df(
        mols,
        smiles_column=smiles_column,
        mol_column=mol_column,
        include_private=include_private,
        include_computed=include_computed,
    )  # type: ignore
def read_molecules(infile=None, stream=None, molecules=None):
    """Obtain a list of molecules from a stream, a ready list or a file.

    The sources are tried in this order:

    1. ``stream`` (when truthy) is read with ``Chem.ForwardSDMolSupplier``.
    2. ``molecules``, when it is a ``list``, is returned as is.
    3. ``infile`` is read; it must end in ``.sdf.gz`` or ``.sdf``.

    Records the supplier yields as ``None`` are dropped.

    Returns:
        The list of molecules, or the integer ``1`` (after printing
        ``Wrong Format!``) when ``infile`` has an unsupported extension.
    """
    if stream:
        mols = [x for x in Chem.ForwardSDMolSupplier(stream) if x is not None]
    elif isinstance(molecules, list):
        mols = molecules
    elif infile.endswith('.sdf.gz'):
        # Read everything while the gzip handle is open, then release it.
        with gzip.open(infile) as gz_handle:
            mols = [
                x for x in Chem.ForwardSDMolSupplier(gz_handle)
                if x is not None
            ]
    elif infile.endswith('.sdf'):
        mols = [x for x in Chem.SDMolSupplier(infile) if x is not None]
    else:
        print('Wrong Format!')
        return 1  # legacy error code, kept for existing callers

    LOGGER.info('{} valid molecules in {} dataset'.format(len(mols), infile))
    return mols
def load_shard(shard, shards_dir, id_prefix):
    """Load one gzipped SD shard and convert it with ``mols_to_dict``.

    Args:
        shard: File name of the shard; names that do not contain ``sdf.gz``
            are ignored.
        shards_dir: Directory the shard lives in.
        id_prefix: Forwarded to ``mols_to_dict``.

    Returns:
        The result of ``mols_to_dict`` for the parsed molecules, or ``None``
        when the shard name is ignored.
    """
    if "sdf.gz" in shard:
        print("Processing shard %s" % shard)
        shard_path = os.path.join(shards_dir, shard)
        with gzip.open(shard_path) as gz_stream:
            parsed = [
                entry for entry in Chem.ForwardSDMolSupplier(gz_stream)
                if entry is not None
            ]
        return mols_to_dict(parsed, id_prefix)
    return None
def makePrints(s):
    """Compute fingerprints for every molecule of a gzipped SD file.

    Args:
        s: Path of the gzipped SD file.

    Returns:
        A DataFrame built from ``finger(mol)`` for each parsed molecule,
        with rows containing missing values dropped; ``None`` (after
        printing ``Unable to open...``) when anything goes wrong.
    """
    try:
        # The context manager releases the gzip handle once all records
        # have been read.
        with gzip.open(s) as inf:
            mols = [x for x in Chem.ForwardSDMolSupplier(inf) if x is not None]
        prints = [finger(mol) for mol in mols]
        return pd.DataFrame(prints).dropna()
    except Exception:
        # Deliberately broad best-effort handling, but no longer swallowing
        # KeyboardInterrupt / SystemExit.
        print('Unable to open...')
        return None
def default_open_input_sdf(inputDef):
    """Open the input as a SD file (possibly gzipped if ending with .gz)
    according to RDKit's ForwardSDMolSupplier

    :param inputDef: The name of the file. If None then STDIN is used (and
        assumed not to be gzipped)
    :return: tuple of (the opened input, the ForwardSDMolSupplier reading it)
    """
    if inputDef:
        source = open_file(inputDef)
    else:
        # On Python 3 sys.stdin is a text stream, while the supplier needs
        # bytes: use the underlying binary buffer when there is one, and
        # fall back to sys.stdin itself (Python 2).
        source = getattr(sys.stdin, 'buffer', sys.stdin)
    suppl = Chem.ForwardSDMolSupplier(source)
    return source, suppl
def generate_pdbs(ligand_file, out_dir, out_template):
    """Generate pdb files for ligands.

    Args:
        ligand_file: Path of a gzipped SD file with the ligands.
        out_dir: Directory the PDB files are written to.
        out_template: ``%``-style template that receives the zero-based
            position of the molecule and yields the PDB file name.
    """
    with gzip.open(ligand_file) as inf:
        gzsuppl = Chem.ForwardSDMolSupplier(inf)
        mols = [x for x in gzsuppl if x is not None]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Number molecules: " + str(len(mols)))
    for index, mol in enumerate(mols):
        ligand_pdb = os.path.join(out_dir, out_template % index)
        print("writing " + ligand_pdb)
        w = Chem.PDBWriter(ligand_pdb)
        try:
            w.write(mol)
        finally:
            # Close explicitly so the file is flushed and released right
            # away instead of whenever the writer is garbage collected.
            w.close()
def sdf_mol_supplier(
    filename: str, gen_ids: bool, **kwargs
) -> IterableType[Tuple[int, Chem.Mol]]:
    """Generator function that reads from a .sdf file.

    Parameters
    ----------
    filename : str
        .sdf filename; a name ending in ``.gz`` is read through gzip.

    gen_ids: bool
        generate ids or not. When True the id is the 1-based position of
        the record in the file; otherwise it is read from the molecule
        property named by ``kwargs["mol_id_prop"]``.

    Yields
    ------
    tuple
        id and rdkit mol. An id read from a property is checked to be
        convertible to ``int`` but is yielded as read.

    Raises
    ------
    Exception
        If an id read from a property cannot be converted to ``int``.
    """
    gzf = None
    if filename.endswith('.gz'):
        import gzip
        gzf = gzip.open(filename)
        suppl = Chem.ForwardSDMolSupplier(gzf)
    else:
        suppl = Chem.ForwardSDMolSupplier(filename)

    try:
        for new_mol_id, rdmol in enumerate(suppl, 1):
            if not rdmol:
                continue
            if gen_ids:
                mol_id = new_mol_id
            else:
                mol_id = rdmol.GetProp(kwargs["mol_id_prop"])
            try:
                int(mol_id)
            except ValueError:
                raise Exception(
                    "FPSim only supports integer ids for molecules, "
                    "cosinder setting gen_ids=True when running "
                    "create_db_file to autogenerate them."
                )
            yield mol_id, rdmol
    finally:
        # Runs when the generator is exhausted, closed or fails, so the
        # gzip handle no longer leaks.
        if gzf is not None:
            gzf.close()
def parse_sdfgz(filename):
    """Yield ``parse_molobj(mol)`` for every molecule of a gzipped SD file.

    Molecules are read with ``removeHs=False`` and ``sanitize=True``;
    records the supplier yields as ``None`` are skipped. The gzip handle is
    closed when the generator is exhausted or closed.
    """
    with gzip.open(filename) as f:
        suppl = Chem.ForwardSDMolSupplier(f, removeHs=False, sanitize=True)
        for molobj in suppl:
            if molobj is None:
                continue
            yield parse_molobj(molobj)
def save_sdf(mol_paths, mol_names, out_name=''):
    """Write the first molecule of each input file into one combined SD file.

    Args:
        mol_paths: Paths of the input files. Paths containing ``.sdfgz`` or
            ``.sdf.gz`` are read through gzip, other paths containing
            ``.sdf`` are read directly, and anything else is skipped.
        mol_names: Names, paired positionally with ``mol_paths``, that are
            stored as the ``_Name`` of each written molecule.
        out_name: Base name of the output; the file is
            ``<abspath(sys.argv[1])>/<out_name>.sdf``.

    Only the first record of each file is used. NOTE(review): a first
    record that fails to parse comes back as ``None`` from the supplier and
    will raise on ``SetProp`` - confirm whether that should be skipped.
    """
    # Setup writer
    out_file = os.path.join(os.path.abspath(sys.argv[1]), f'{out_name}.sdf')
    writer = AllChem.SDWriter(out_file)
    try:
        for path, name in zip(mol_paths, mol_names):
            if ('.sdfgz' in path) or ('.sdf.gz' in path):
                opener = gzip.open
            elif '.sdf' in path:
                opener = open
            else:
                continue
            # ForwardSDMolSupplier needs a binary file object, so plain
            # files are opened in 'rb' as well (gzip.open defaults to it).
            with opener(path, 'rb') as rf:
                suppl = Chem.ForwardSDMolSupplier(rf, removeHs=False)
                mol = next(suppl)  # Grab first mol
                mol.SetProp('_Name', name)
                writer.write(mol)
        writer.flush()
    finally:
        writer.close()
    return
def __read_stdin_sdf(sanitize=True):
    """Yield ``(mol, title)`` for each SD record arriving on standard input.

    Lines are accumulated until a line that is exactly ``$$$$`` followed by
    a newline; the accumulated text is then parsed as one record. The title
    is the first line of the record, or the result of
    ``__get_smi_as_molname(mol)`` when that line is empty. Trailing text
    without a terminating ``$$$$`` line is discarded.
    """
    pending = []
    while True:
        line = sys.stdin.readline()
        if not line:
            break
        pending.append(line)
        if line != '$$$$\n':
            continue

        molblock = ''.join(pending)
        pending = []
        supplier = Chem.ForwardSDMolSupplier(BytesIO(molblock.encode('utf-8')),
                                             sanitize=sanitize)
        mol = list(supplier)[0]
        mol_title = molblock.partition('\n')[0]
        if not mol_title:
            mol_title = __get_smi_as_molname(mol)
        yield mol, mol_title
def corpus(input, output, suffix='sdf'):
    """Standardise molecules and write a token vocabulary and a corpus table.

    Args:
        input: Path of the input. With ``suffix == 'sdf'`` it is read as a
            gzipped SD file; otherwise as a table (``pd.read_table``) whose
            ``Smiles`` column holds the structures.
        output: Prefix of the outputs: ``<output>_voc.txt`` receives the
            sorted tokens, one per line, and ``<output>_corpus.txt`` a
            tab-separated table with the columns ``Smiles`` and ``Token``.
        suffix: Selects the input format as described above.

    Each molecule is passed through metal disconnection, normalisation,
    largest-fragment selection and uncharging before its canonical SMILES
    is taken. SMILES whose tokens contain neither ``C`` nor ``c``, contain
    ``[Na]`` or ``[Zn]``, or whose token count is outside ``(10, 100]``
    are left out.
    """
    gz_handle = None
    if suffix == 'sdf':
        gz_handle = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(gz_handle)
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]

    smiles = set()
    try:
        voc = Voc('data/voc_smiles.txt')
        charger = rdMolStandardize.Uncharger()
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()
        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Best effort: unparsable or unstandardisable entries are
                # reported and skipped.
                print('Parsing Error:')
    finally:
        # The supplier reads lazily, so the handle can only be released
        # after the loop.
        if gz_handle is not None:
            gz_handle.close()

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    table = pd.DataFrame()
    table['Smiles'] = canons
    table['Token'] = tokens
    # drop_duplicates returns a new frame; the result used to be discarded.
    table = table.drop_duplicates(subset='Smiles')
    table.to_csv(output + '_corpus.txt', sep='\t', index=False)
def getFPdict_sdf(fh, molID=None, fpType='ecfp', radius=4):
    """Map molecule names to ``[fingerprint, SMILES]`` for an SD source.

    Args:
        fh: Source handed to ``Chem.ForwardSDMolSupplier`` (read with
            ``removeHs=False``).
        molID: Forwarded to ``getName`` together with the molecule and its
            1-based position in the file.
        fpType, radius: Forwarded to ``getFP``.

    Returns:
        dict whose keys are the names from ``getName`` and whose values are
        two-item lists: the fingerprint of the hydrogen-added molecule and
        the SMILES of the molecule as read. Unparsable records are skipped
        but still advance the position counter.
    """
    fingerprints = {}
    records = Chem.ForwardSDMolSupplier(fh, removeHs=False)
    for position, mol in enumerate(records, start=1):
        if mol is None:
            continue
        name = getName(mol, position, molID)
        mol.UpdatePropertyCache(strict=False)
        with_hs = Chem.AddHs(mol, addCoords=True)
        fingerprint = getFP(with_hs, fpType, radius)
        fingerprints[name] = [fingerprint, Chem.MolToSmiles(mol)]
    return fingerprints
def get_molecules_from_sdf_bytes(dataset):
    """
    Build RDKit molecules from the bytes of an SD file.

    :param dataset: bytearray with molecules
    :return: list of RDKit molecules; falsy items yielded by the supplier
        are left out
    :type dataset: bytearray
    :rtype: list
    """
    supplier = Chem.ForwardSDMolSupplier(io.BytesIO(dataset))
    # filter(None, ...) keeps exactly the truthy items.
    return list(filter(None, supplier))
def rdkit_open(File_Tuple):
    """Read the molecules of several files into one flat list.

    Each name in ``File_Tuple`` is tested against three independent
    patterns, and a later match replaces the result of an earlier one:

    * ``.sdf``: read from ``file_handle(name)`` with the forward supplier
      when the name ends in ``.gz``/``.bz2``, otherwise with
      ``Chem.SDMolSupplier``; hydrogens are kept.
    * ``.smi``: the first line decides whether the file has a title line
      (it does when the line contains ``smiles``, case-insensitively).
    * ``.mol2``: read with ``Mol2MolSupplier`` (not an official RDKit
      function, may fail).

    NOTE(review): the dots in the patterns are unescaped, so they match any
    character; confirm before tightening.

    Returns:
        list of every molecule that parsed, in file order.
    """
    collected = []
    for f in File_Tuple:
        handle = file_handle(f)

        if re.search(r'.sdf', f):
            if re.search(r'.gz$|.bz2$', f):
                sd_supplier = Chem.ForwardSDMolSupplier
            else:
                sd_supplier = Chem.SDMolSupplier
            found = [
                x for x in sd_supplier(handle, removeHs=False)
                if x is not None
            ]

        if re.search(r'.smi', f):
            with handle as fi:
                first_line = fi.readline()
            has_title = bool(re.search(r'smiles', first_line, re.IGNORECASE))
            found = [
                x for x in Chem.SmilesMolSupplier(
                    f, titleLine=has_title, delimiter=' |\t|,')
                if x is not None
            ]

        ## not the official RDkit function, may fail
        if re.search(r'.mol2', f):
            found = [
                x for x in Mol2MolSupplier(f, removeHs=False) if x is not None
            ]

        print("# Found mol in {0}: {1}".format(f, len(found)))
        collected.extend(found)
        gc.collect()
    return collected
def readAndCreateFingerprint(file_name, counts=False):
    """Compute radius-2 Morgan fingerprints for the molecules of an SD file.

    Args:
        file_name: Source handed to ``Chem.ForwardSDMolSupplier`` (read
            with ``removeHs=False``).
        counts: When False, 1024-bit bit vectors are produced with
            ``GetMorganFingerprintAsBitVect``; when True, the result of
            ``GetMorganFingerprint`` is used instead.

    Returns:
        list of fingerprints, one per molecule that could be parsed.
    """
    # The counts branch used to call the misspelled
    # ``Chem.ForwardSDMOLSupplier`` and therefore always raised.
    molecules = (
        m for m in Chem.ForwardSDMolSupplier(file_name, removeHs=False)
        if m is not None
    )
    if not counts:
        return [
            AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024)
            for m in molecules
        ]
    return [AllChem.GetMorganFingerprint(m, 2) for m in molecules]
def extractField(args):
    """Write a two-column, tab-separated table of name and one SD field.

    ``args`` must provide ``out`` (object with ``write``), ``sdf`` (source
    for ``Chem.ForwardSDMolSupplier``) and ``field`` (property name). The
    header line is ``Name`` and the field name; molecules lacking the field
    get ``NA``. Unparsable records are skipped but still advance the
    1-based counter that is passed to ``mh.getName``.
    """
    emit = args.out.write

    # Header line
    emit('{}\n'.format('\t'.join(['Name', args.field])))

    # One line per parsed molecule
    for position, mol in enumerate(Chem.ForwardSDMolSupplier(args.sdf),
                                   start=1):
        if mol is None:
            continue
        name = mh.getName(mol, position)
        value = mol.GetProp(args.field) if mol.HasProp(args.field) else 'NA'
        emit('{}\n'.format('\t'.join([name, value])))
def __read_sdf(fname, input_format, id_field_name=None, sanitize=True):
    """Yield ``(PropertyMol(mol), title)`` for each molecule of an SD file.

    Args:
        fname: Path of the file.
        input_format: ``'sdf'`` or ``'sdf.gz'``; for any other value the
            generator yields nothing.
        id_field_name: When given, the title is read from this property;
            otherwise the ``_Name`` property is used, falling back to the
            isomeric SMILES when the name is empty.
        sanitize: Forwarded to the RDKit supplier.

    Records the supplier yields as ``None`` are skipped. The gzip handle of
    a ``'sdf.gz'`` input is closed when the generator finishes or is closed.
    """
    gz_handle = None
    if input_format == 'sdf':
        suppl = Chem.SDMolSupplier(fname, sanitize=sanitize)
    elif input_format == 'sdf.gz':
        gz_handle = gzip.open(fname)
        suppl = Chem.ForwardSDMolSupplier(gz_handle, sanitize=sanitize)
    else:
        return

    try:
        for mol in suppl:
            if mol is None:
                continue
            if id_field_name is not None:
                mol_title = mol.GetProp(id_field_name)
            else:
                mol_title = (mol.GetProp("_Name")
                             or Chem.MolToSmiles(mol, isomericSmiles=True))
            yield PropertyMol(mol), mol_title
    finally:
        if gz_handle is not None:
            gz_handle.close()
def main(directory: str, chebml_version: str):
    """Download the ChEBML data."""
    # Steps, each skipped when its output already exists in ``directory``:
    #   1. download the file behind ``bradley_url`` (failure is reported,
    #      not fatal),
    #   2. download chembl_<version>.sdf.gz,
    #   3. build a substructure library from all molecules with at most 50
    #      atoms and pickle it as chembl<version>_sssdata.pkl.
    os.makedirs(directory, exist_ok=True)

    bradley_path = os.path.join(directory, 'jm020472j_s2.xls')
    if not os.path.exists(bradley_path):
        try:
            wget.download(bradley_url, out=directory)
        except Exception:
            # Best effort download; Ctrl-C and SystemExit are no longer
            # swallowed here.
            click.echo('There goes ACS stopping science')

    chembl_url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{chebml_version}/chembl_{chebml_version}.sdf.gz')
    sdf_path = os.path.join(directory, f'chembl_{chebml_version}.sdf.gz')
    if not os.path.exists(sdf_path):
        wget.download(chembl_url, out=directory)

    sss_path = os.path.join(directory, f'chembl{chebml_version}_sssdata.pkl')
    if not os.path.exists(sss_path):
        click.echo(f'RDKit Version: {rdBase.rdkitVersion}')
        data = []
        with gzip.GzipFile(sdf_path) as gz:
            suppl = Chem.ForwardSDMolSupplier(gz)
            for mol in tqdm(suppl,
                            desc=f'Processing ChEBML {chebml_version}',
                            unit_scale=True):
                if mol is None or mol.GetNumAtoms() > 50:
                    continue
                fp = Chem.PatternFingerprint(mol)
                smi = Chem.MolToSmiles(mol)
                data.append((smi, fp))

        click.echo(f'Outputting to {sss_path}')
        with open(sss_path, 'wb') as file:
            mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
            fps = rdSubstructLibrary.PatternHolder()
            for smi, fp in data:
                mols.AddSmiles(smi)
                fps.AddFingerprint(fp)
            library = rdSubstructLibrary.SubstructLibrary(mols, fps)
            pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)

    click.echo('Done ;)')
def default_open_input_sdf(inputDef):
    """Open an SD input and attach RDKit's ForwardSDMolSupplier to it.

    :param inputDef: Name of the file, opened through ``utils.open_file``
        (possibly gzipped if ending with .gz). If None (or otherwise
        falsy) STDIN is used, and assumed not to be gzipped.
    :return: tuple of (the opened input, the supplier reading from it)
    """
    if inputDef:
        source = utils.open_file(inputDef)
    elif sys.version_info[0] >= 3:
        # RDKit needs the binary representation of the input stream on
        # Python 3, which is stdin.buffer.
        source = sys.stdin.buffer
    else:
        source = sys.stdin
    return source, Chem.ForwardSDMolSupplier(source)
def __read_stdin_sdf(sanitize=True, removeHs=True):
    """Yield ``(mol, title)`` for each SD record arriving on standard input.

    Lines are collected until a line that is exactly ``$$$$`` followed by a
    newline, then parsed as one record with the given ``sanitize`` and
    ``removeHs`` settings. The title is the first line of the record, or
    the isomeric SMILES of the molecule when that line is empty. Trailing
    text without a terminating ``$$$$`` line is discarded.
    """
    pending = []
    while True:
        line = sys.stdin.readline()
        if not line:
            break
        pending.append(line)
        if line != '$$$$\n':
            continue

        molblock = ''.join(pending)
        pending = []
        record_stream = BytesIO(molblock.encode('utf-8'))
        supplier = Chem.ForwardSDMolSupplier(record_stream,
                                             sanitize=sanitize,
                                             removeHs=removeHs)
        mol = list(supplier)[0]
        mol_title = molblock.partition('\n')[0]
        if not mol_title:
            mol_title = Chem.MolToSmiles(mol, isomericSmiles=True)
        yield mol, mol_title
def zinc(first=-1, *args, **kwargs):
    """ ZINC collection.

    ..[1] Irwin, John J, and Brian K Shoichet.
    “ZINC --a free database of commercially available compounds
    for virtual screening.”
    Journal of chemical information and modeling
    vol. 45,1 (2005): 177-82. doi:10.1021/ci049714+

    The ``parm_at_Frosst.tgz`` archive is downloaded into the working
    directory unless a file of that name already exists, and
    ``parm_at_Frosst/zinc.sdf`` is read from it.

    Parameters
    ----------
    first : int
        Stop once this many molecules have been converted; ``-1`` (default)
        reads the whole file.
    *args, **kwargs
        Forwarded to ``esp.data.dataset.GraphDataset``.

    Returns
    -------
    esp.data.dataset.GraphDataset
        Dataset of the molecules that could be converted; molecules that
        fail to convert are skipped silently.
    """
    import tarfile
    from os.path import exists
    from openff.toolkit.topology import Molecule
    from rdkit import Chem

    fname = "parm_at_Frosst.tgz"
    url = "http://www.ccl.net/cca/data/parm_at_Frosst/parm_at_Frosst.tgz"
    if not exists(fname):
        import urllib.request
        urllib.request.urlretrieve(url, fname)

    count = 0
    gs = []
    # The context manager closes the archive once reading is done.
    with tarfile.open(fname) as archive:
        zinc_file = archive.extractfile("parm_at_Frosst/zinc.sdf")
        _mols = Chem.ForwardSDMolSupplier(zinc_file, removeHs=False)
        for mol in _mols:
            try:
                gs.append(
                    esp.Graph(
                        Molecule.from_rdkit(mol,
                                            allow_undefined_stereo=True)))
                count += 1
            except Exception:
                # Deliberate best effort: unconvertible molecules are
                # skipped, but Ctrl-C is no longer swallowed.
                pass
            if first != -1 and count >= first:
                break

    return esp.data.dataset.GraphDataset(gs, *args, **kwargs)
def processSDF(cursor, filename, split, storeMolblock):
    """Store every record of an SD file through ``addMol`` / ``addProp``.

    Args:
        cursor: Passed through to ``addMol`` and ``addProp``.
        filename: Path of the SD file; a name ending in ``.gz`` is read
            through gzip.
        split: When true, each property value is split on ``,`` and every
            piece is stored as its own property row.
        storeMolblock: When true and the supplier offers ``GetItemText``,
            the text of the record up to and including the ``M END``
            marker is handed to ``addMol``. Not available for gzip input
            (a warning is printed to stderr).

    Returns:
        Number of records read, including unparsable ones (those are
        passed to ``addMol`` too and reported on stderr).
    """
    nmol = 0
    inf = None
    if filename.endswith(".gz"):
        import gzip
        inf = gzip.open(filename)
        suppl = Chem.ForwardSDMolSupplier(inf)
        if storeMolblock:
            print("Warning: cannot store molblock from gzip file",
                  file=sys.stderr)
    else:
        suppl = Chem.SDMolSupplier(filename)

    # Whether raw record text can be fetched does not change per record.
    has_item_text = hasattr(suppl, "GetItemText")
    try:
        for mol in suppl:
            molstore = None
            if has_item_text:
                # NOTE(review): SD files normally end the mol block with
                # 'M  END' (two spaces) - confirm this literal is intended.
                (molblock, sep,
                 moldata) = suppl.GetItemText(nmol).partition('M END')
                if storeMolblock:
                    molstore = molblock + sep
            nmol += 1
            imol = addMol(cursor, mol, molstore, nmol)
            if mol:
                for p in mol.GetPropNames():
                    if split:
                        for sp in mol.GetProp(p).split(","):
                            addProp(cursor, imol, p, sp)
                    else:
                        addProp(cursor, imol, p, mol.GetProp(p))
            else:
                print("Error adding molecule #%d" % nmol, file=sys.stderr)
    finally:
        # The gzip handle used to be left open.
        if inf is not None:
            inf.close()
    return nmol
def usrcat_write_binary(sdf_file_path: Path, gzip_output_binary: bool = False):
    """Write USRCAT descriptors of an SD file to a binary plus a SMILES index.

    Args:
        sdf_file_path: Existing SD file; a name ending in ``.gz`` is read
            through gzip.
        gzip_output_binary: When True, ``.gz`` is appended to the name of
            the binary output.

    Outputs, both derived from ``sdf_file_path``:

    * ``<sdf>.usrcatsl.bin`` (``.gz`` appended on request): one packed
      record per kept molecule in ``usrcat_binary_struct_format_string``
      layout, holding the 1-based running number followed by the
      descriptors.
    * ``<sdf>.usrcatsl.smi``: ``<SMILES> <name>`` per kept molecule, in the
      same order.

    Only molecules that parse and have more than two heavy atoms are kept.

    Raises:
        AssertionError: if the SD file is missing or the binary output
            already exists.
    """
    from contextlib import ExitStack

    assert sdf_file_path.exists(), "SDF file not found"
    binary_file_path = Path(str(sdf_file_path) + ".usrcatsl.bin")
    if gzip_output_binary:
        binary_file_path = Path(str(binary_file_path) + ".gz")
    assert not Path(binary_file_path).exists(), "Output binary exists"

    num_gets = 0
    num_good_mols = 0
    # ExitStack closes everything opened below, also on errors.
    with ExitStack() as stack:
        output_binary = open_file_may_be_gzipped(binary_file_path, "wb")
        stack.callback(output_binary.close)
        output_smiles_index = open_file_may_be_gzipped(
            Path(str(sdf_file_path) + ".usrcatsl.smi"), "w")
        stack.callback(output_smiles_index.close)

        bar = progressbar.ProgressBar(prefix="Generating binary")
        pos_and_desc_bytes = bytearray(
            struct.calcsize(usrcat_binary_struct_format_string))

        if str(sdf_file_path).endswith(".gz"):
            gz_compressed_file = gzip.open(str(sdf_file_path))
            stack.callback(gz_compressed_file.close)
            sdf_reader = Chem.ForwardSDMolSupplier(gz_compressed_file)
        else:
            sdf_reader = Chem.SDMolSupplier(str(sdf_file_path))

        for mol in sdf_reader:
            num_gets += 1
            if mol is None or mol.GetNumHeavyAtoms() <= 2:
                continue
            num_good_mols += 1
            usrcat_descriptos = GetUSRCAT(mol)
            # Note we use num_good_mols, this means that the first line is
            # #1, not 0 - the smiles lines are not zero-indexed.
            struct.pack_into(usrcat_binary_struct_format_string,
                             pos_and_desc_bytes, 0, num_good_mols,
                             *usrcat_descriptos)
            output_binary.write(pos_and_desc_bytes)
            output_smiles_index.write(
                Chem.MolToSmiles(mol) + " " + mol.GetProp("_Name") + "\n")
            if num_good_mols % 1000 == 0:
                bar.update(num_gets)
        bar.update(num_gets)

    print("Num gets", num_gets)
    print("Num good mols", num_good_mols)
def LoadSDF(filename, smilesName='SMILES', idName='ID', molColName='ROMol',
            includeFingerprints=False):
    """ Read file in SDF format and return as Panda data frame

    filename may be a path, which is opened in binary mode, or an already
    open file object. As before, the file is closed once reading is done,
    including a file object passed in by the caller; this now also happens
    when parsing raises.

    Every parsed molecule contributes one row with its SD properties, its
    name under idName (when the molecule has one) and its SMILES under
    smilesName; the row index is the position of the record in the file.
    The molecule column is then added by AddMoleculeColumnToFrame. When no
    molecule could be read an empty DataFrame is returned.
    """
    if isinstance(filename, str):
        f = open(filename, 'rb')
    else:
        f = filename

    # Collect plain dicts and build the frame once: DataFrame.append per
    # row copied the whole frame each time and was removed in pandas 2.0.
    records = []
    indices = []
    try:
        for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
            if mol is None:
                continue
            row = {k: mol.GetProp(k) for k in mol.GetPropNames()}
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            row[smilesName] = Chem.MolToSmiles(mol)
            records.append(row)
            indices.append(i)
    finally:
        f.close()

    if not records:
        # Nothing to attach molecules to.
        return pd.DataFrame()

    df = pd.DataFrame(records, index=indices)
    AddMoleculeColumnToFrame(df,
                             smilesCol=smilesName,
                             molCol=molColName,
                             includeFingerprints=includeFingerprints)
    return df