Пример #1
0
def filter_molecules(input_molstream,
                     output_molstream,
                     allow_repeats=False,
                     allow_warnings=False,
                     max_heavy_atoms=100,
                     remove_smirks=None,
                     max_metals=0,
                     explicitHs=True,
                     elements=None,
                     check_type=None):
    """
    Copy molecules from an input stream to an output stream, dropping the
    ones that fail the given criteria, and print a summary of what happened.

    Parameters
    ----------
    input_molstream : stream readable by ``oechem.OEReadMolecule``
    output_molstream : stream writable by ``oechem.OEWriteMolecule``
    allow_repeats : bool
        When False, a molecule whose isomeric SMILES matches a molecule
        already written is skipped.
    allow_warnings : bool
        When False, a molecule is skipped if the OpenEye error stream
        contains the text "warning" after reading it.
    max_heavy_atoms, remove_smirks, max_metals, elements, check_type
        Forwarded unchanged to ``keep_molecule``, which makes the actual
        keep/drop decision. ``remove_smirks=None`` is forwarded as an
        empty list.
    explicitHs : bool
        When True, explicit hydrogens are added to the copy that is
        filtered and written.

    Returns
    -------
    None
    """
    # Avoid a shared mutable default; keep_molecule still receives a list.
    if remove_smirks is None:
        remove_smirks = []

    # Capture toolkit messages so parse warnings can be detected per molecule.
    errs = oechem.oeosstream()
    oechem.OEThrow.SetOutputStream(errs)

    molecule = oechem.OECreateOEGraphMol()
    # A set keeps the duplicate check O(1) per molecule.
    seen_smiles = set()

    count = 0
    warning_count = 0
    smile_count = 0
    saved = 0

    try:
        while oechem.OEReadMolecule(input_molstream, molecule):
            count += 1

            # Depending on the toolkit/Python version str() may hand back
            # bytes; normalise to text before searching it.
            messages = errs.str()
            if isinstance(messages, bytes):
                messages = messages.decode("utf-8", errors="replace")
            if "warning" in messages.lower() and not allow_warnings:
                warning_count += 1
                errs.clear()
                continue

            # The SMILES is taken before hydrogens are added to the copy.
            smi = oechem.OECreateIsoSmiString(molecule)
            mol_copy = oechem.OEMol(molecule)
            if explicitHs:
                oechem.OEAddExplicitHydrogens(mol_copy)

            new_smile = smi not in seen_smiles
            if not new_smile:
                smile_count += 1

            if new_smile or allow_repeats:
                keep = keep_molecule(mol_copy, max_heavy_atoms, remove_smirks,
                                     max_metals, elements, check_type)
                if keep:
                    seen_smiles.add(smi)
                    oechem.OEWriteMolecule(output_molstream, mol_copy)
                    saved += 1
            # Start the next molecule with an empty message buffer.
            errs.clear()
    finally:
        # Do not leave the global error handler pointing at a buffer that
        # goes out of scope when this function returns.
        oechem.OEThrow.SetOutputStream(oechem.oeerr)

    print(f"{count} molecules in input stream")
    print(f"{warning_count} molecules resulted in warnings when parsing")
    print(f"{smile_count} molecules were had repeated isomeric SMILES")
    print(f"{saved} molecules saved")
Пример #2
0
def read_molecules(file_path, verbose=True):
    """
    Load every molecule from an OpenEye-readable file.

    A ``PendingDeprecationWarning`` is issued on every call. If
    ``file_path`` does not exist, the same name is looked up under
    ``molecules/`` via ``get_data_file_path`` before giving up.

    Parameters
    ----------
    file_path : str
        File to read molecules from (e.g. mol2, sdf).
    verbose : bool
        When True, print the file being loaded, the number of molecules
        read and the elapsed time.

    Returns
    -------
    molecules : list of OEMol
        Independent copies of the molecules read from the file.

    Raises
    ------
    Exception
        If the file is found neither at ``file_path`` nor in the packaged
        data location.
    """
    warnings.warn(DEPRECATION_WARNING_TEXT, PendingDeprecationWarning)

    from openeye import oechem
    from openforcefield.utils import get_data_file_path

    if not os.path.exists(file_path):
        # Fall back to the data files shipped with the package.
        packaged_path = get_data_file_path(f"molecules/{file_path}")
        if os.path.exists(packaged_path):
            file_path = packaged_path
        else:
            raise Exception(f"File '{file_path}' not found.")

    if verbose:
        print(f"Loading molecules from '{file_path}'...")
    t_begin = time.time()

    loaded = []
    istream = oechem.oemolistream(file_path)
    # The flavor is registered for the MOL2 format only.
    mol2_flavor = (oechem.OEIFlavor_Generic_Default
                   | oechem.OEIFlavor_MOL2_Default
                   | oechem.OEIFlavor_MOL2_Forcefield)
    istream.SetFlavor(oechem.OEFormat_MOL2, mol2_flavor)

    # One scratch molecule is reused by the reader; copies go in the list.
    scratch = oechem.OECreateOEGraphMol()
    while oechem.OEReadMolecule(istream, scratch):
        if scratch.GetTitle() == '':
            # Untitled molecule: use the SD 'name' tag as its title.
            scratch.SetTitle(oechem.OEGetSDData(scratch, 'name').strip())
        loaded.append(oechem.OEMol(scratch))
    istream.close()

    if verbose:
        print(f"{len(loaded)} molecules read")
    t_elapsed = time.time() - t_begin
    if verbose:
        print(f"{t_elapsed:.3f} s elapsed")

    return loaded
Пример #3
0
def mols_from_file(mol_file):
    """
    Read a molecule file with the OpenEye toolkit and wrap each entry as a
    chemper Mol.

    Parameters
    ----------
    mol_file: str
              path to the molecule file; if it does not exist as given,
              the same name is looked up under 'molecules' through
              chemper's get_data_path

    Returns
    -------
    mols: list of chemper Mols
          one Mol per molecule read from the file

    Raises
    ------
    IOError
        if the file is found neither at the given path nor in the
        chemper data location
    """
    import os

    if not os.path.exists(mol_file):
        from chemper.chemper_utils import get_data_path
        packaged_path = get_data_path(os.path.join('molecules', mol_file))

        if os.path.exists(packaged_path):
            mol_file = packaged_path
        else:
            raise IOError(
                "File '%s' not found locally or in chemper/data/molecules." %
                mol_file)

    stream = oechem.oemolistream(mol_file)
    wrapped = []

    # The reader fills one scratch molecule; each entry is copied before
    # being wrapped so later reads do not overwrite it.
    scratch = oechem.OECreateOEGraphMol()
    while oechem.OEReadMolecule(stream, scratch):
        if scratch.GetTitle() == '':
            # Untitled molecule: use the SD 'name' tag as its title.
            sd_name = oechem.OEGetSDData(scratch, 'name').strip()
            scratch.SetTitle(sd_name)
        wrapped.append(Mol(oechem.OEMol(scratch)))
    stream.close()

    return wrapped
Пример #4
0
def file_to_oemols(filename, title=True, verbose=False):
    """Read every molecule in a file and return them as normalized OEMols.

    Parameters
    ----------
    filename: str
        Path to the molecule file; it must exist.
    title: bool or str or None
        If truthy, each molecule's own title (as read from the file) is
        passed to ``normalize_molecule``. Otherwise the value of ``title``
        itself is passed through unchanged.
        NOTE(review): earlier documentation stated that ``None`` yields an
        IUPAC-name title; that would be decided inside
        ``normalize_molecule`` -- confirm there.
    verbose: bool
        If True, log the file being loaded and, when ``title`` is truthy,
        the title of each molecule read.

    Returns
    -------
    mollist: list
        The results of ``normalize_molecule``, one per molecule, in file
        order. Always a list, even for a single molecule.

    Raises
    ------
    Exception
        If ``filename`` does not exist.
    """

    if not os.path.exists(filename):
        raise Exception("File {} not found".format(filename))
    if verbose:
        logger().info("Loading molecules from {}".format(filename))

    ifs = oechem.oemolistream(filename)
    mollist = []

    try:
        molecule = oechem.OECreateOEGraphMol()
        while oechem.OEReadMolecule(ifs, molecule):
            molecule_copy = oechem.OEMol(molecule)

            # Use a per-molecule variable: rebinding `title` itself would
            # let one empty title switch off title handling for every
            # molecule that follows.
            mol_title = title
            if title:
                mol_title = molecule_copy.GetTitle()
                if verbose:
                    logger().info("Reading molecule {}".format(mol_title))

            mollist.append(normalize_molecule(molecule_copy, mol_title))
    finally:
        # Release the stream even if reading or normalization fails.
        ifs.close()

    return mollist
# Fragment of a driver script: `ofsTri`, `ofsff`, `flavor`, `tripos_out`,
# `failed_out`, `in_file` and `genConfs` are defined outside this excerpt.
ofsTri.SetFlavor(oechem.OEFormat_MOL2, flavor)
ofsTri.open(tripos_out)

# Separate output stream for molecules whose worker timed out.
ofsFail = oechem.oemolostream()
ofsFail.SetFlavor(oechem.OEFormat_MOL2, flavor)
ofsFail.open(failed_out)

# Counters; `conf_fail` is not updated in the visible part of the script.
success = 0
time_out = 0
conf_fail = 0
index = 0

ifs = oechem.oemolistream(in_file)
ifs.SetFlavor(oechem.OEFormat_MOL2, flavor)

c_mol = oechem.OECreateOEGraphMol()
while oechem.OEReadMolecule(ifs, c_mol):
    index += 1
    # process molecules individually, storing less
    p = multiprocessing.Process(target=genConfs, args=(c_mol,ofsff, ofsTri, index,))
    p.start()
    # Wait at most 24 seconds for the worker before treating it as hung.
    p.join(24)
    if p.is_alive():
        # Worker is still running after the timeout: record the molecule
        # in the failure file, then kill and reap the process.
        print(f"TIMED OUT {oechem.OECreateIsoSmiString(c_mol)}")
        oechem.OEWriteConstMolecule(ofsFail, oechem.OEMol(c_mol))
        time_out += 1
        p.terminate()
        p.join()
    # NOTE(review): this branch counts a non-zero exit code as success,
    # while multiprocessing reports 0 for a clean exit -- confirm against
    # how genConfs terminates.
    elif p.exitcode:
        success += 1
        p.terminate()
Пример #6
0
def eMolecules_filtering(input_f, current_smiles=None):
    """
    Filter a molecule database file (used for the eMolecules and
    eMolecules_incremental databases) into numbered output files.

    Molecules whose isomeric SMILES is in ``current_smiles`` are skipped.
    Each remaining molecule gets explicit hydrogens and is tested with
    ``keep_molecule``. Kept molecules are titled
    ``<set_name>_<file index>_<position in input>`` and written to
    ``<set_name>_<file index>.sdf``. A "title, SMILES" line is appended to
    ``<set_name>_<file index>.txt`` for each one.

    The file index starts at 1000. A new SDF file is started after every
    1000 saved molecules. A new text file is started whenever the index is
    a multiple of 100, i.e. every 100 SDF files (100,000 saved molecules).
    A summary is printed when the input is exhausted.

    Parameter
    ---------
    input_f : string "path/to/inputfile.sdf"
        ``set_name`` is everything before the first '.' in this string.
        NOTE(review): a path with a '.' before the extension (e.g.
        "./file.sdf") therefore yields an unexpected ``set_name``.
    current_smiles : iterable of strings, optional
        smiles already in your molecule sets; None means no exclusions

    Raises
    ------
    Exception
        If the input stream or an output stream is not valid.
    """
    # Build the exclusion set once: O(1) lookups instead of scanning a
    # list for every molecule in the database.
    known_smiles = set() if current_smiles is None else set(current_smiles)

    set_name = input_f.split('.')[0]
    output_f = set_name+"_%i.sdf"
    smiles_base = set_name+"_%i.txt"
    molecule_name = set_name+"_%i_%i"

    # Load and check input file
    ifs = oechem.oemolistream(input_f)
    if not ifs.IsValid():
        raise Exception("Error: input_file (%s) was not valid" % input_f)

    # Redirect toolkit messages into a local buffer while filtering.
    errs = oechem.oeosstream()
    oechem.OEThrow.SetOutputStream(errs)

    molecule = oechem.OECreateOEGraphMol()
    count = 0
    smile_count = 0
    saved = 0
    switch = False
    current_letter = 1000
    ofs = None
    add_smiles = None

    try:
        # first output file
        ofs_file = output_f % current_letter
        ofs = oechem.oemolostream(ofs_file)
        if not ofs.IsValid():
            raise Exception("output file %s is not valid" % ofs_file)
        add_smiles = open(smiles_base % current_letter, 'a')

        while oechem.OEReadMolecule(ifs, molecule):
            # count input file molecules
            count += 1
            # Nothing reads the captured messages; drop them so the buffer
            # does not grow with the size of the database.
            errs.clear()

            if switch:  # previous save filled the file; roll over
                switch = False
                ofs.close()
                current_letter += 1
                ofs_file = output_f % current_letter
                # Load and check output file
                ofs = oechem.oemolostream(ofs_file)
                if not ofs.IsValid():
                    raise Exception("output file %s is not valid" % ofs_file)
                print("Switching to file %s, currently saved %i molecules" % (ofs_file, saved))
                if current_letter % 100 == 0:
                    add_smiles.close()
                    add_smiles = open(smiles_base % current_letter, 'a')

            # If smiles is in the exclusion set skip the molecule
            smi = oechem.OECreateIsoSmiString(molecule)
            if smi in known_smiles:
                smile_count += 1
                continue

            # Make copy of molecule before making changes
            mol_copy = oechem.OEMol(molecule)
            oechem.OEAddExplicitHydrogens(mol_copy)
            # if the molecule meets our requirements save to current output
            if keep_molecule(mol_copy):
                mol_title = molecule_name % (current_letter, count)
                mol_copy.SetTitle(mol_title)
                add_smiles.write("%s\t\t%s\n" % (mol_title, smi))
                oechem.OEWriteMolecule(ofs, mol_copy)
                saved += 1
                if saved % 1000 == 0:
                    switch = True

        print("%i molecules in input file" % (count))
        print("%i molecules were had repeated isomeric SMILES" % smile_count)
        print("%i molecules saved to output files" % (saved))
    finally:
        # Release every resource, including the SMILES text file, and stop
        # the global error handler from writing into the local buffer.
        oechem.OEThrow.SetOutputStream(oechem.oeerr)
        ifs.close()
        if ofs is not None:
            ofs.close()
        if add_smiles is not None:
            add_smiles.close()
Пример #7
0
    # get letters to diferentiate output
    letters = string.ascii_letters
    letters = [l for l in letters]

    # get current smiles
    smiles_f = "smiles_to_ID_off-compare.txt"
    current_smiles = parse_smile(smiles_f)
    add_smiles = open(smiles_f, 'a')

    # Load and check input file
    ifs = oechem.oemolistream(input_f)
    ifs.SetFormat(oechem.OEFormat_SDF)
    if not ifs.IsValid():
        raise Exception("Error: input_file (%s) was not valid" % input_f)

    molecule = oechem.OECreateOEGraphMol()
    count = 0
    smile_count = 0
    saved = 0
    switch = False

    # first output file
    current_letter = letters.pop(0)
    ofs_file = output_f%current_letter
    ofs = oechem.oemolostream(ofs_file)
    ifs.SetFormat(oechem.OEFormat_SDF)
    if not ofs.IsValid():
        raise Exception("output file %s is not valid" % ofs_file)

    while oechem.OEReadMolecule(ifs, molecule):
        # count input file