def filter_molecules(input_molstream, output_molstream, allow_repeats=False,
                     allow_warnings=False, max_heavy_atoms=100,
                     remove_smirks=None, max_metals=0, explicitHs=True,
                     elements=None, check_type=None):
    """
    Copy molecules from an input stream to an output stream, dropping those
    that fail the given criteria, and print a summary of what was done.

    Parameters
    ----------
    input_molstream : oechem.oemolistream
        Open stream the molecules are read from.
    output_molstream : oechem.oemolostream
        Open stream the kept molecules are written to.
    allow_repeats : bool
        If False, a molecule whose isomeric SMILES matches an already saved
        molecule is skipped without being evaluated.
    allow_warnings : bool
        If False, a molecule is skipped when the captured toolkit output
        contains the word "warning" after it is read.
    max_heavy_atoms, remove_smirks, max_metals, elements, check_type
        Forwarded unchanged to ``keep_molecule``, which makes the keep/drop
        decision. ``remove_smirks=None`` is treated as an empty list.
    explicitHs : bool
        If True, explicit hydrogens are added to the copy that is evaluated
        and written.

    Notes
    -----
    Neither stream is closed here; that is left to the caller.
    """
    # A fresh list per call avoids sharing one default list between calls.
    if remove_smirks is None:
        remove_smirks = []

    # Capture toolkit messages so parse warnings can be detected per molecule.
    errs = oechem.oeosstream()
    oechem.OEThrow.SetOutputStream(errs)

    molecule = oechem.OECreateOEGraphMol()
    # A set gives constant-time duplicate checks on large inputs.
    seen_smiles = set()

    count = 0
    warning_count = 0
    smile_count = 0
    saved = 0

    try:
        while oechem.OEReadMolecule(input_molstream, molecule):
            count += 1
            if ("warning" in errs.str().lower()) and not allow_warnings:
                warning_count += 1
                errs.clear()
                continue

            smi = oechem.OECreateIsoSmiString(molecule)
            # Work on a copy: `molecule` is reused by the next read.
            mol_copy = oechem.OEMol(molecule)
            if explicitHs:
                oechem.OEAddExplicitHydrogens(mol_copy)

            new_smile = smi not in seen_smiles
            if not new_smile:
                smile_count += 1

            if new_smile or allow_repeats:
                keep = keep_molecule(mol_copy, max_heavy_atoms, remove_smirks,
                                     max_metals, elements, check_type)
                if keep:
                    seen_smiles.add(smi)
                    oechem.OEWriteMolecule(output_molstream, mol_copy)
                    saved += 1

            # Drop messages from this molecule so they are not attributed
            # to the next one.
            errs.clear()
    finally:
        # `errs` is local to this call; point the toolkit back at its
        # standard error stream instead of leaving it attached.
        oechem.OEThrow.SetOutputStream(oechem.oeerr)

    print(f"{count} molecules in input stream")
    print(f"{warning_count} molecules resulted in warnings when parsing")
    print(f"{smile_count} molecules were had repeated isomeric SMILES")
    print(f"{saved} molecules saved")
def read_molecules(file_path, verbose=True):
    """
    Load every molecule from a file readable by the OpenEye toolkit.

    A ``PendingDeprecationWarning`` is issued on every call.

    Parameters
    ----------
    file_path : str
        File to read (e.g. mol2, sdf). If it does not exist as given, it is
        looked up as ``molecules/<file_path>`` through ``get_data_file_path``.
    verbose : bool
        If True, print the resolved path, the number of molecules read and
        the elapsed time.

    Returns
    -------
    molecules : list of OEMol
        Independent copies of the molecules, in file order.

    Raises
    ------
    Exception
        If the file is found neither at ``file_path`` nor via the
        ``get_data_file_path`` fallback.
    """
    warnings.warn(DEPRECATION_WARNING_TEXT, PendingDeprecationWarning)

    from openeye import oechem
    from openforcefield.utils import get_data_file_path

    # Resolve the location: explicit path first, packaged data second.
    if not os.path.exists(file_path):
        packaged_path = get_data_file_path(f"molecules/{file_path}")
        if not os.path.exists(packaged_path):
            raise Exception(f"File '{file_path}' not found.")
        file_path = packaged_path

    if verbose:
        print(f"Loading molecules from '{file_path}'...")
    t_begin = time.time()

    loaded = []
    istream = oechem.oemolistream(file_path)
    istream.SetFlavor(
        oechem.OEFormat_MOL2,
        oechem.OEIFlavor_Generic_Default
        | oechem.OEIFlavor_MOL2_Default
        | oechem.OEIFlavor_MOL2_Forcefield,
    )

    scratch = oechem.OECreateOEGraphMol()
    while oechem.OEReadMolecule(istream, scratch):
        # Untitled molecules take their title from the SD 'name' tag.
        if scratch.GetTitle() == '':
            scratch.SetTitle(oechem.OEGetSDData(scratch, 'name').strip())
        # `scratch` is overwritten by the next read, so store a copy.
        loaded.append(oechem.OEMol(scratch))
    istream.close()

    if verbose:
        print(f"{len(loaded)} molecules read")
    elapsed = time.time() - t_begin
    if verbose:
        print(f"{elapsed:.3f} s elapsed")

    return loaded
def mols_from_file(mol_file):
    """
    Read a molecule file with the OpenEye toolkit and wrap each entry as a
    chemper Mol.

    Parameters
    ----------
    mol_file: str
        relative or full path to the molecule file; if it does not exist
        as given, it is looked up under 'molecules' via
        chemper's ``get_data_path``

    Returns
    -------
    mols: list of chemper Mols
        one chemper Mol per molecule in the file, in file order

    Raises
    ------
    IOError
        if the file is found in neither location
    """
    import os

    if not os.path.exists(mol_file):
        from chemper.chemper_utils import get_data_path
        packaged = get_data_path(os.path.join('molecules', mol_file))
        if not os.path.exists(packaged):
            raise IOError(
                "File '%s' not found locally or in chemper/data/molecules."
                % mol_file)
        mol_file = packaged

    # OpenEye input stream plus a scratch molecule reused for every read
    stream = oechem.oemolistream(mol_file)
    scratch = oechem.OECreateOEGraphMol()

    chemper_mols = []
    while oechem.OEReadMolecule(stream, scratch):
        # SD files may carry the molecule name in a tag instead of the title
        if scratch.GetTitle() == '':
            scratch.SetTitle(oechem.OEGetSDData(scratch, 'name').strip())
        # copy before wrapping, since scratch is overwritten by the next read
        chemper_mols.append(Mol(oechem.OEMol(scratch)))
    stream.close()

    return chemper_mols
def file_to_oemols(filename, title=True, verbose=False):
    """Read every molecule in a file and return them as normalized OEMols.

    Parameters
    ----------
    filename: str
        Path to the molecule file; it must exist.
    title: bool, str or None
        If truthy, each molecule keeps the title stored in the file. Any
        falsy value is passed unchanged to ``normalize_molecule`` for every
        molecule (the earlier documentation states that ``None`` yields an
        IUPAC-name title; confirm against ``normalize_molecule``).
    verbose: bool
        If True, log the file being loaded and, when file titles are used,
        the title of each molecule read.

    Returns
    -------
    mollist: list
        One normalized molecule per entry in the file. A list is always
        returned, even when the file holds a single molecule.

    Raises
    ------
    Exception
        If ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        raise Exception("File {} not found".format(filename))
    if verbose:
        logger().info("Loading molecules from {}".format(filename))

    # Decide once whether file titles are used. Overwriting `title` inside
    # the loop would let one untitled molecule switch this off for all the
    # molecules that follow it.
    use_file_title = bool(title)

    ifs = oechem.oemolistream(filename)
    mollist = []
    try:
        molecule = oechem.OECreateOEGraphMol()
        while oechem.OEReadMolecule(ifs, molecule):
            # `molecule` is reused by the next read, so work on a copy.
            molecule_copy = oechem.OEMol(molecule)
            mol_title = title
            if use_file_title:
                mol_title = molecule_copy.GetTitle()
                if verbose:
                    logger().info("Reading molecule {}".format(mol_title))
            mollist.append(normalize_molecule(molecule_copy, mol_title))
    finally:
        # Close the stream even if normalization raises.
        ifs.close()

    return mollist
ofsTri.SetFlavor(oechem.OEFormat_MOL2, flavor) ofsTri.open(tripos_out) ofsFail = oechem.oemolostream() ofsFail.SetFlavor(oechem.OEFormat_MOL2, flavor) ofsFail.open(failed_out) success = 0 time_out = 0 conf_fail = 0 index = 0 ifs = oechem.oemolistream(in_file) ifs.SetFlavor(oechem.OEFormat_MOL2, flavor) c_mol = oechem.OECreateOEGraphMol() while oechem.OEReadMolecule(ifs, c_mol): index += 1 # process molecules individually, storing less p = multiprocessing.Process(target=genConfs, args=(c_mol,ofsff, ofsTri, index,)) p.start() p.join(24) if p.is_alive(): print(f"TIMED OUT {oechem.OECreateIsoSmiString(c_mol)}") oechem.OEWriteConstMolecule(ofsFail, oechem.OEMol(c_mol)) time_out += 1 p.terminate() p.join() elif p.exitcode: success += 1 p.terminate()
def eMolecules_filtering(input_f, current_smiles=None):
    """
    Filter a molecule file into numbered SDF files and SMILES text files.

    This function was used to filter the eMolecules database and the
    eMolecules_incremental database. Output names are built from the input
    path with the extension of its file name removed:

    * ``<name>_<index>.sdf`` holds the kept molecules, 1000 per file, with
      the index starting at 1000.
    * ``<name>_<index>.txt`` holds one "title, isomeric SMILES" line per
      kept molecule. It is opened in append mode, and a new one is started
      whenever the SDF index is a multiple of 100.

    Each kept molecule is titled ``<name>_<index>_<count>``, where count is
    its position in the input file.

    Parameters
    ----------
    input_f : string
        "path/to/inputfile.sdf"
    current_smiles : iterable of strings, optional
        Isomeric SMILES already in your molecule sets; input molecules with
        one of these SMILES are skipped. Defaults to none.

    Raises
    ------
    Exception
        If the input file or any output file cannot be opened.
    """
    import os

    # A set makes the per-molecule membership test constant time.
    known_smiles = set(current_smiles) if current_smiles is not None else set()

    # Strip the extension from the file name only, so that dots in the
    # directory part (e.g. "./data/in.sdf") do not truncate the prefix.
    directory, base_name = os.path.split(input_f)
    set_name = os.path.join(directory, base_name.split('.')[0])
    output_f = set_name + "_%i.sdf"
    smiles_base = set_name + "_%i.txt"
    molecule_name = set_name + "_%i_%i"

    # Load and check input file
    ifs = oechem.oemolistream(input_f)
    if not ifs.IsValid():
        raise Exception("Error: input_file (%s) was not valid" % input_f)

    # Redirect toolkit messages into a stream for the duration of the run.
    errs = oechem.oeosstream()
    oechem.OEThrow.SetOutputStream(errs)

    molecule = oechem.OECreateOEGraphMol()
    count = 0
    smile_count = 0
    saved = 0
    switch = False

    ofs = None
    add_smiles = None
    try:
        # first output file
        current_letter = 1000
        ofs_file = output_f % current_letter
        ofs = oechem.oemolostream(ofs_file)
        if not ofs.IsValid():
            raise Exception("output file %s is not valid" % ofs_file)
        add_smiles = open(smiles_base % current_letter, 'a')

        while oechem.OEReadMolecule(ifs, molecule):
            # count input file molecules
            count += 1

            if switch:
                # The previous SDF file is full: move on to the next one.
                switch = False
                ofs.close()
                current_letter += 1
                ofs_file = output_f % current_letter
                ofs = oechem.oemolostream(ofs_file)
                if not ofs.IsValid():
                    raise Exception("output file %s is not valid" % ofs_file)
                print("Switching to file %s, currently saved %i molecules"
                      % (ofs_file, saved))
                if current_letter % 100 == 0:
                    add_smiles.close()
                    add_smiles = open(smiles_base % current_letter, 'a')

            # Skip molecules whose SMILES is already in the caller's sets
            smi = oechem.OECreateIsoSmiString(molecule)
            if smi in known_smiles:
                smile_count += 1
                continue

            # Make copy of molecule before making changes
            mol_copy = oechem.OEMol(molecule)
            oechem.OEAddExplicitHydrogens(mol_copy)

            # if the molecule meets our requirements save to current output
            if keep_molecule(mol_copy):
                mol_title = molecule_name % (current_letter, count)
                mol_copy.SetTitle(mol_title)
                add_smiles.write("%s\t\t%s\n" % (mol_title, smi))
                oechem.OEWriteMolecule(ofs, mol_copy)
                saved += 1
                if saved % 1000 == 0:
                    switch = True

        print("%i molecules in input file" % (count))
        print("%i molecules were had repeated isomeric SMILES" % smile_count)
        print("%i molecules saved to output files" % (saved))
    finally:
        # Release every handle, including the SMILES text file, even when
        # an error interrupts the run.
        ifs.close()
        if ofs is not None:
            ofs.close()
        if add_smiles is not None:
            add_smiles.close()
        # `errs` is local to this call; point the toolkit back at its
        # standard error stream instead of leaving it attached.
        oechem.OEThrow.SetOutputStream(oechem.oeerr)
# get letters to diferentiate output letters = string.ascii_letters letters = [l for l in letters] # get current smiles smiles_f = "smiles_to_ID_off-compare.txt" current_smiles = parse_smile(smiles_f) add_smiles = open(smiles_f, 'a') # Load and check input file ifs = oechem.oemolistream(input_f) ifs.SetFormat(oechem.OEFormat_SDF) if not ifs.IsValid(): raise Exception("Error: input_file (%s) was not valid" % input_f) molecule = oechem.OECreateOEGraphMol() count = 0 smile_count = 0 saved = 0 switch = False # first output file current_letter = letters.pop(0) ofs_file = output_f%current_letter ofs = oechem.oemolostream(ofs_file) ifs.SetFormat(oechem.OEFormat_SDF) if not ofs.IsValid(): raise Exception("output file %s is not valid" % ofs_file) while oechem.OEReadMolecule(ifs, molecule): # count input file