Example No. 1
def clean_smiles(smiles_df):
    """
    Helper function which runs the standardization tool on a list of smiles
    strings.

    Args:
        smiles_df: DataFrame which contains smiles strings in a column named
        "smiles"

    Returns:
        The original DataFrame, but with the smiles strings in the
        "smiles" column standardized and any rows which contained
        problematic smiles removed
    """
    standard = Standardizer(prefer_organic=True)
    for index, row in smiles_df.iterrows():
        try:
            mol = Chem.MolFromSmiles(row['smiles'])
            std_mol = standard.fragment_parent(mol, skip_standardize=False)
            smiles_df.loc[index, 'smiles'] = Chem.MolToSmiles(std_mol)
        except Exception:
            print("Error cleaning " + str(index) + " " +
                  str(row['smiles']))
            print(smiles_df.loc[index])
            smiles_df.drop(index, inplace=True)
    return smiles_df
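
A minimal driver for the helper above, assuming pandas plus the rdkit/molvs imports the snippet relies on; the input SMILES are arbitrary illustrations.

import pandas as pd
from rdkit import Chem
from molvs import Standardizer

df = pd.DataFrame({'smiles': ['CCO', 'c1ccccc1C(=O)[O-].[Na+]', 'not a smiles']})
cleaned = clean_smiles(df)
# Valid rows come back standardized (the benzoate salt is reduced to its
# organic fragment); the unparsable row is reported and dropped.
print(cleaned['smiles'].tolist())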
Example No. 2
def filter_salts(in_lines, Verbose=False):

    # standardize structures and remove salts
    #
    # This should be called before any other structure-based filters, because it
    # affects both the structure and the molecular weight of many compounds coming out of ChEMBL.

    s = Standardizer()
    #salt_file = code_dir / 'Salts.txt'
    salt_file = conf_dir + '/Salts.txt'
    remover = SaltRemover.SaltRemover(defnFilename=salt_file)

    for i in range(len(in_lines)):
        mol_in = Chem.MolFromSmiles(in_lines['canonical_smiles'][i])
        mol_out = s.standardize(mol_in)
        smiles_out = Chem.MolToSmiles(remover(mol_out), isomericSmiles=False)
        if '.' in smiles_out:
            in_lines = in_lines.drop(i)
        else:
            in_lines.loc[i, 'canonical_smiles'] = smiles_out


#   Note: pandas.Series.replace(i, smiles_out) would replace values *equal to* i,
#   not the value at row i, so the .loc assignment above is the right call.

    if Verbose:
        print('Number of compounds after desalting pass: ', len(in_lines))

    return in_lines.reset_index(drop=True)
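
The '.' test above drops any row that is still a multi-fragment mixture after desalting. A rough sketch of that check on a single SMILES, using RDKit's default salt definitions instead of the project's Salts.txt (not shown here):

from rdkit import Chem
from rdkit.Chem import SaltRemover
from molvs import Standardizer

s = Standardizer()
remover = SaltRemover.SaltRemover()          # default salt definitions

mol = s.standardize(Chem.MolFromSmiles('CCN.Cl'))   # an amine hydrochloride, written as two fragments
smiles_out = Chem.MolToSmiles(remover(mol), isomericSmiles=False)
print(smiles_out, '.' in smiles_out)         # the chloride fragment should be stripped, so no '.' remains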
Example No. 3
 def testFragmentLong(self):
     if not doLong:
         raise unittest.SkipTest('long test')
     for data in self.readPCSdata(self.dataPCS_fragment100k):
         try:
             s = Standardizer()
             frag = s.fragment_parent(data.mol)
             ns = Chem.MolToSmiles(frag)
         except Exception:
             raise AssertionError(f'Line {data.lineNo}: MolVS normalization failed for SMILES {data.smiles}')
         self.assertEqual(ns, data.expected)
Example No. 4
 def testMetalLong(self):
     if not doLong:
         raise unittest.SkipTest('long test')
     for data in self.readPCSdata(self.dataPCS_metal100k):
         try:
             n = Standardizer()
             nm = n.disconnect_metals(data.mol)
             ns = Chem.MolToSmiles(nm)
         except Exception:
             raise AssertionError(f'Line {data.lineNo}: MolVS normalization failed for SMILES {data.smiles}')
         self.assertEqual(ns, data.expected)
Example No. 5
 def testMetalLong(self):
     if not doLong:
         raise unittest.SkipTest('long test')
     for data in self.readPCSdata(self.dataPCS_metal100k):
         try:
             n = Standardizer()
             nm = n.disconnect_metals(data.mol)
             ns = Chem.MolToSmiles(nm)
         except Exception:
             raise AssertionError(
                 'Line {0.lineNo}: MolVS normalization failed for SMILES {0.smiles}'.format(data))
         self.assertEqual(ns, data.expected)
Example No. 6
 def testNormalizeLong(self):
     if not doLong:
         raise unittest.SkipTest('long test')
     for data in self.readPCSdata(self.dataPCS_nomralized100k):
         try:
             n = Standardizer()
             nm = n.normalize(data.mol)
             ns = Chem.MolToSmiles(nm)
         except Exception:
             raise AssertionError(
                 'Line {0.lineNo}: MolVS normalization failed for SMILES {0.smiles}'.format(data))
         self.assertEqual(ns, data.expected)
Example No. 7
 def testFragmentLong(self):
     if not doLong:
         raise unittest.SkipTest('long test')
     for data in self.readPCSdata(self.dataPCS_fragment100k):
         try:
             s = Standardizer()
             frag = s.fragment_parent(data.mol)
             ns = Chem.MolToSmiles(frag)
         except Exception:
             raise AssertionError(
                 'Line {0.lineNo}: MolVS normalization failed for SMILES {0.smiles}'.format(data))
         self.assertEqual(ns, data.expected)
Example No. 8
def split_data(mols, acts, test_percent, split):
    mols_train = []
    mols_test = []
    molnames_train = []
    molnames_test = []
    acts_train = []
    acts_test = []
    actnames_train = []
    actnames_test = []

    # Split molecules and activities training set into training and test sets
    m_train, m_test, a_train, a_test = train_test_split(mols,
                                                        acts,
                                                        test_size=test_percent,
                                                        random_state=split)
    # Make a list of the names of all the molecules in the training list
    names_train = []

    for mol in m_train:
        names_train.append(mol[1])

    # Iterate over all the molecules we have read in
    for i in range(len(mols)):
        # assert mols[i][1] == acts[i][1]
        if mols[i][1] in names_train:  # is the molecule in the training set?
            mols_train.append(mols[i][0])
            molnames_train.append(mols[i][1])
            acts_train.append(acts[i][0])
            actnames_train.append(acts[i][1])
        else:  # the molecule is in the test set if it isn't in the training set
            mols_test.append(mols[i][0])
            molnames_test.append(mols[i][1])
            acts_test.append(acts[i][0])
            actnames_test.append(acts[i][1])

    assert molnames_train == actnames_train
    assert molnames_test == actnames_test

    # Standardize structures of the training set and test set
    s = Standardizer()
    standard_mols_train = []

    for mol in mols_train:
        standard_mols_train.append(s.standardize(mol))

    standard_mols_test = []

    for mol in mols_test:
        standard_mols_test.append(s.standardize(mol))

    return standard_mols_train, molnames_train, acts_train, standard_mols_test, molnames_test, acts_test
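
split_data expects mols and acts as parallel lists of (object, name) tuples; a tiny, hypothetical call might look like the sketch below (Standardizer and scikit-learn's train_test_split are assumed to be imported at module level, as the function requires).

from rdkit import Chem
from molvs import Standardizer                          # assumed module-level imports
from sklearn.model_selection import train_test_split

mols = [(Chem.MolFromSmiles('CCO'), 'mol_a'),
        (Chem.MolFromSmiles('c1ccccc1O'), 'mol_b'),
        (Chem.MolFromSmiles('CC(=O)O'), 'mol_c'),
        (Chem.MolFromSmiles('CCN'), 'mol_d')]
acts = [(0.1, 'mol_a'), (1.3, 'mol_b'), (0.7, 'mol_c'), (2.2, 'mol_d')]

train_mols, train_names, train_acts, test_mols, test_names, test_acts = \
    split_data(mols, acts, test_percent=0.25, split=42)
print(train_names, test_names)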
Example No. 9
def standardize_mol(mol_file):

    if Path(mol_file).exists():
        # Chem.MolFromMolFile() only accepts a string path, not a Path object
        mol_file = str(mol_file)
        mol = Chem.MolFromMolFile(mol_file)

        s = Standardizer()
        smol = s.standardize(mol)

        with open(mol_file, 'w') as f:
            f.write(Chem.MolToMolBlock(smol))

    else:
        # print('file does not exist.')
        raise RuntimeError('File does not exist.')
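
Hypothetical usage: the helper rewrites the Mol file in place, so the sketch below first writes a scratch file (the file name is illustrative).

from pathlib import Path
from rdkit import Chem

mol_path = Path('scratch_ligand.mol')        # illustrative scratch file
Chem.MolToMolFile(Chem.MolFromSmiles('c1ccccc1C(=O)O'), str(mol_path))
standardize_mol(mol_path)                    # standardizes and overwrites the file
print(Chem.MolToSmiles(Chem.MolFromMolFile(str(mol_path))))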
Example No. 10
def prepSMI(SMIin, defnFilename, removeMetal=1):

    mol = Chem.MolFromSmiles(SMIin)
    s = Standardizer()

    try:
        molstandardized = s.standardize(mol)
        smilestandadized = Chem.MolToSmiles(molstandardized)
    except Exception:
        return "Error: Standardization Fail"

    # remove salt
    # 1.default
    if defnFilename != "":
        remover = SaltRemover(defnFilename=defnFilename)
    else:
        remover = SaltRemover()
    molclean = remover(molstandardized)
    smilesclean = Chem.MolToSmiles(molclean)

    # 2. SMILES remove other manual salts + fragments -> for fragment take one if exactly same compound
    lelem = smilesclean.split(".")
    # reduce double, case of several salts are included - 255
    lelem = list(set(lelem))
    try:
        lelem.remove("")
    except ValueError:
        pass

    # remove metal
    if removeMetal == 1:
        lnometal = []
        for elem in lelem:
            if is_metalorion(elem) == 0:
                lnometal.append(elem)
        lelem = lnometal

    if len(lelem) == 1:
        smilesclean = str(lelem[0])
        return smilesclean
    elif len(lelem) > 1:
        return "Error: Mixture or fragment ot check: " + smilesclean
    elif smilesclean == "":
        return "Error: SMILES empty after preparation"
    else:
        return "Error: No identified"
Example No. 11
def standardizeSMILES(smiIn):


        # self.mol = loader.ReadMolFromSmile(self.smi)

        s = Standardizer()
        mol = Chem.MolFromSmiles(smiIn)

        try:
            out = timeFunction(normalize, mol)
            if out == "ERROR":
                print "Normalize SMILES: ERROR DURING THE PROCESS"
            else:
                molstandardized = out
        except Exception:
            print("Normalize SMILES: ERROR INPUT SMI")

        if "molstandardized" in locals():

            smilestandadized = Chem.MolToSmiles(molstandardized)

            # remove salt
            # 1.default
            remover = SaltRemover()
            mol = Chem.MolFromSmiles(smilestandadized)
            molcleandefault = remover(mol)
            # 2. Personal remover
            homeremover = SaltRemover(defnData=LSALT)
            molclean = homeremover(molcleandefault)
            smilesclean = Chem.MolToSmiles(molclean)
            # 3. SMILES remove other manual salts + fragments -> for fragment take one if exactly same compound
            lelem = smilesclean.split(".")
            if len(lelem) > 1:
                # reduce double, case of several salts are included - 255
                lelem = list(set(lelem))
                for smilesdel in LSMILESREMOVE:
                    if smilesdel in lelem:
                        lelem.remove(smilesdel)
                try:
                    lelem.remove("")  # case of bad smile
                except ValueError:
                    pass
                if len(lelem) == 1:
                    smilesclean = str(lelem[0])
                else:
                    # 4. Fragments
                    # Case of fragment -> stock in log file, check after to control
                    print "Fragments after standardization: " + smilesclean + "\n"
                    smilesclean = ""

            if smilesclean == "":
                print "SMILES empty after preparation\n"
                return 1

            else:
                print "Prepared SMI :" + str(smilesclean) + "\n"


            return smilesclean
Example No. 12
def clean_smiles(smi):
    """
    Helper function which runs the standardization tool on the input smiles
    string

    Args:
        smi: Input smiles string

    Returns:
        The standardized version of the input smiles string
    """
    s = Standardizer(prefer_organic=True)
    try:
        mol = Chem.MolFromSmiles(smi)
        std_mol = s.fragment_parent(mol, skip_standardize=False)
        std_smi = Chem.MolToSmiles(std_mol)
        return std_smi
    except Exception:
        print("Issue with input smiles string. Unable to clean " + str(smi))
    return None
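
Example calls (the inputs are arbitrary): the salt collapses to its organic fragment via fragment_parent, a clean input is simply canonicalized, and an unparsable string triggers the warning path and returns None.

from rdkit import Chem          # assumed module-level imports of the snippet above
from molvs import Standardizer

print(clean_smiles('c1ccccc1C(=O)[O-].[Na+]'))
print(clean_smiles('OCC'))
print(clean_smiles('definitely not smiles'))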
Example No. 13
def Tautomerize(mol):
    try:
        if mol.GetBoolProp('tautomerized'): return
    except KeyError:
        pass
    smi1 = Chem.MolToSmiles(mol)
    from molvs import Standardizer
    s = Standardizer()
    try:
        s.standardize(mol)
    except ValueError as e:
        MutateFail(mol)
        return False
    #from molvs.tautomer import TautomerCanonicalizer
    #t = TautomerCanonicalizer()
    #t.canonicalize(mol)
    mol.SetBoolProp('tautomerized', True)
    smi2 = Chem.MolToSmiles(mol)

    if smi1 != smi2: print("tautomerized:", smi1, "to:", smi2)
    return True
Example No. 14
def sanitize_smiles_molvs(smiles, largest_fragment=False):
    """Sanitize a SMILES with MolVS

    Parameters
    ----------
    smiles : str
        SMILES string for a molecule.
    largest_fragment : bool
        Whether to select only the largest covalent unit in a molecule with
        multiple fragments. Default to False.

    Returns
    -------
    str
        SMILES string for the sanitized molecule.
    """
    standardizer = Standardizer()
    standardizer.prefer_organic = True

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return smiles
    try:
        # standardize functional group representations
        mol = standardizer.standardize(mol)
        if largest_fragment:
            # remove product counterions/salts/etc.
            mol = standardizer.largest_fragment(mol)
        mol = standardizer.uncharge(mol)  # neutralize, e.g., carboxylic acids
    except Exception:
        pass
    return Chem.MolToSmiles(mol)
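
A couple of hedged example calls; with largest_fragment=True the counterion should be removed before the uncharge step, while the default keeps every fragment.

from rdkit import Chem          # assumed module-level imports of the snippet above
from molvs import Standardizer

print(sanitize_smiles_molvs('CC(=O)[O-].[Na+]'))
print(sanitize_smiles_molvs('CC(=O)[O-].[Na+]', largest_fragment=True))
print(sanitize_smiles_molvs('bad smiles'))   # unparsable input is returned unchanged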
Example No. 15
def Tautomerize(mol, aromatic=aromaticity):
    try:
        if mol.GetBoolProp('tautomerized'): return mol
    except KeyError:
        pass


    Chem.SanitizeMol(mol)
    if not (aromatic or aromaticity):
        Chem.Kekulize(mol, True)

    smi1 = Chem.MolToSmiles(mol)
    from molvs import Standardizer
    s = Standardizer()
    try:
        molnew = s.standardize(mol)
    except ValueError as e:
        raise MutateFail(mol)

    if not aromatic:
        Chem.Kekulize(molnew, True)
    smi2 = Chem.MolToSmiles(molnew)

    if smi1 == smi2:
        # we return mol because it contains some properties
        # tautomerized mols need to get the props again
        mol.SetBoolProp('tautomerized', True)
        return mol
    else:
        if mol.HasProp('failedfilter'):
            ff = mol.GetProp('failedfilter')
            molnew.SetProp('failedfilter', ff)
        #print "tautomerized:", smi1, 'to:', smi2
        with open('tautomerized.smi', 'a') as f:
            f.write("{} {}\n".format(smi1, smi2))
        molnew.SetBoolProp('tautomerized', True)
        return molnew
Example No. 16
def standardizeMolVS(inMol):
    f = fragment.LargestFragmentChooser()
    outMol = f.choose(inMol)
    c = charge.Uncharger()
    outMol = c.uncharge(outMol)
    s = Standardizer()
    outMol = s.standardize(outMol)
    n = normalize.Normalizer()
    outMol = n.normalize(outMol)
    t = tautomer.TautomerCanonicalizer()
    outMol = t.canonicalize(outMol)

    # Transform with Inchi
    #print "inMol"
    #print Chem.MolToSmiles(inMol)
    #inchi = Chem.inchi.MolToInchi(inMol)
    #print inchi
    #print "outMol"
    #print Chem.MolToSmiles(outMol)
    #inchi = Chem.inchi.MolToInchi(outMol)
    #print inchi
    #outMol = Chem.inchi.MolFromInchi(inchi)

    return outMol
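
An illustrative call; the snippet assumes the molvs submodules fragment, charge, normalize and tautomer, plus Standardizer, are imported at module level, as sketched below.

from rdkit import Chem
from molvs import Standardizer, fragment, charge, normalize, tautomer   # assumed imports

mol = Chem.MolFromSmiles('Oc1ccccn1')             # 2-hydroxypyridine
out = standardizeMolVS(mol)
print(Chem.MolToSmiles(out))                      # expect the canonical tautomer (the 2-pyridone form)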
Example No. 17
 def testNormalizeShort(self):
     for data in self.readPCSdata(self.dataPCS_nomralized1k):
         n = Standardizer()
         nm = n.normalize(data.mol)
         ns = Chem.MolToSmiles(nm)
         self.assertEqual(ns, data.expected)
Example No. 18
def standardize_main(args):
    mol = _read_mol(args)
    s = Standardizer()
    mol = s.standardize(mol)
    _write_mol(mol, args)
Example No. 19
from molvs import Standardizer
from pprint import pprint
import psycopg2
import sys
import re
from io import StringIO
from rdkit import Chem
Chem.WrapLogs()
#sio = sys.stderr = StringIO()
import logging
#reload(logging)
logging.basicConfig(filename='logging.log',
                    level=logging.DEBUG,
                    format="[%(asctime)s %(levelname)-8s] %(message)s",
                    datefmt="%Y/%b/%d %H:%M:%S")
from standardiser import standardise
s = Standardizer()

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

dictionary = ["C", "H", "O", "N", "S", "P", "F", "Cl", "Br", "I"]

#CHEMBL preparing
chembl_new = []
chembl_help = []
conn2 = psycopg2.connect('dbname=chembl user=data host=/tmp/')
curs2 = conn2.cursor()
curs2.execute(
    "select standard_inchi, standard_inchi from (((((target_dictionary T INNER JOIN target_components TC on T.tid = TC.tid) INNER JOIN component_sequences CSEQ on TC.component_id = CSEQ.component_id) INNER JOIN assays A on T.tid = A.tid) INNER JOIN activities AC on A.assay_ID = AC.assay_ID) INNER JOIN docs D on AC.doc_id = D.doc_id) INNER JOIN compound_structures CS on AC.molregno = CS.molregno where CSEQ.accession IN ('P10827', 'P10828', 'P10276', 'P10826', 'P13631', 'Q07869', 'Q03181', 'P37231', 'P20393', 'Q14995', 'P35398', 'Q92753', 'P51449', 'P55055', 'Q13133', 'Q96RI1', 'P11473', 'O75469', 'Q14994', 'P41235', 'Q14541', 'P19793', 'P28702', 'P48443', 'P13056', 'P49116', 'Q9Y466', 'Q9Y5X4', 'P10589', 'P24468', 'P10588', 'P03372', 'Q92731', 'P11474', 'O95718', 'P62508', 'P04150', 'P08235', 'P06401', 'P10275', 'P22736', 'P43354', 'Q92570', 'Q13285', 'O00482', 'Q15406', 'P51843', 'Q15466');"
)
Example No. 20
 def testFragmentShort(self):
     for data in self.readPCSdata(self.dataPCS_fragmnet1k):
         s = Standardizer()
         frag = s.fragment_parent(data.mol)
         ns = Chem.MolToSmiles(frag)
         self.assertEqual(ns, data.expected)
Example No. 21
from rdkit import Chem
from copy import copy

from pipelines.utils import utils

from molvs import enumerate_tautomers_smiles, canonicalize_tautomer_smiles, Standardizer
from molvs.charge import Uncharger,Reionizer
from standardiser import standardise

standardizer = Standardizer()

def _spam(n):
    out=[]
    for perm in _getPerms(n):
        elem = [ int(i) for i in list(perm) ]
        out.append(elem)
    return out

def _getPerms(n):
    from itertools import permutations
    for i in _getCandidates(n):
        for perm in set(permutations(i)):
            yield ''.join(perm)

def _getCandidates(n):
    for i in range(0, n+1):
        res = "1" * i + "0" * (n - i)
        yield res

def enumerateTautomers(mol):
    """
Example No. 22
def normalize(mol, lout):
    s = Standardizer()
    molstandardized = s.standardize(mol)
    #print molstandardized
    lout.append(molstandardized)
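
A minimal direct call: the standardized molecule is appended to the caller-supplied lout list instead of being returned.

from rdkit import Chem          # assumed module-level imports of the snippet above
from molvs import Standardizer

lout = []
normalize(Chem.MolFromSmiles('OC(=O)c1ccccc1'), lout)
if lout:
    print(Chem.MolToSmiles(lout[0]))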
Example No. 23
 def testFragmentShort(self):
     for data in self.readPCSdata(self.dataPCS_fragmnet1k):
         s = Standardizer()
         frag = s.fragment_parent(data.mol)
         ns = Chem.MolToSmiles(frag)
         self.assertEqual(ns, data.expected)
Example No. 24
def read_mols(mode, method, basename, datadir='Default', modeldir='Default'):
    currworkdir = os.getcwd()
    if datadir == 'Default':
        datadir = os.path.join(currworkdir, 'data')
    else:
        if not os.path.isdir(datadir):
            print("error: ", datadir, " is not a directory. exiting.")
            exit(2)

    if modeldir == 'Default':
        modeldir = os.path.join(currworkdir, 'models')
    else:
        if not os.path.isdir(modeldir):
            print("error: ", modeldir, " is not a directory. exiting.")
            exit(2)
        else:
            print('setting modeldir to ', modeldir, '.')
            print(
                'Have you set the random splits to be correct for the model?')

    mol_data_filename = basename + '.smi'
    act_data_filename = basename + '.act'
    moldatafile = os.path.join(datadir, mol_data_filename)
    actdatafile = os.path.join(datadir, act_data_filename)

    # output_ext = "%s_%s_%d_%d" % (mode, method, int(rand_split), int(rand_state))
    model_filename = "model_%s.dat" % output_ext
    index_filename = "indices_%s.dat" % output_ext
    appdom_fp_filename = "training-FPs_%s.dat" % output_ext
    appdom_rad_filename = "AD-radius_%s.dat" % output_ext

    if mode.startswith('class'):
        if os.path.isfile(actdatafile):
            actfh = open(actdatafile)

            activities = []  # array of tuples: (activity, molecule name)

            for actline in actfh:
                line = actline.split()
                act = float(line[1])
                actname = line[0]
                activities.append((act, actname))

            actfh.close()

    elif mode.startswith('reg') and method == 'xgb':

        bits_filename = "sigbits_%s.dat" % output_ext
        bits_file = os.path.join(modeldir, bits_filename)
        with open(bits_file, 'rb') as f:
            significant_bits = pickle.load(f)

    model_file = os.path.join(modeldir, model_filename)
    loaded_model = pickle.load(open(model_file, "rb"))

    index_file = os.path.join(modeldir, index_filename)
    with open(index_file, 'rb') as f:
        indexes = pickle.load(f)

    appdom_fp_file = os.path.join(modeldir, appdom_fp_filename)
    with open(appdom_fp_file, 'rb') as f:
        appdom_fps = pickle.load(f)

    appdom_rad_file = os.path.join(modeldir, appdom_rad_filename)
    with open(appdom_rad_file, 'rb') as f:
        appdom_radius = pickle.load(f)

    # Read in molecules from test set
    molfh = open(moldatafile)

    molecules = []  # array of tuples: (molecule, molecule name)

    for molline in molfh:
        line = molline.split()
        mol = Chem.MolFromSmiles(line[0])
        molname = line[1]
        molecules.append((mol, molname))

    molfh.close()

    mols_train = []
    molnames_train = []

    if 'activities' in locals():
        acts_train = []
        actnames_train = []

    for i in range(len(molecules)):
        mols_train.append(molecules[i][0])
        molnames_train.append(molecules[i][1])
        if mode.startswith('class') and 'activities' in locals():
            acts_train.append(activities[i][0])
            actnames_train.append(activities[i][1])

    # Standardize structures
    s = Standardizer()
    standard_mols_train = []
    for mol in mols_train:
        standard_mols_train.append(s.standardize(mol))

    return_dict = {}

    return_dict['molnames'] = molnames_train
    return_dict['molecules'] = standard_mols_train
    return_dict['model'] = loaded_model
    return_dict['inds'] = indexes
    if mode.startswith('reg') and method == 'xgb':
        return_dict['sigbits'] = significant_bits
    elif mode.startswith('class') and 'activities' in locals():
        return_dict['activities'] = acts_train
    return_dict['ad_fps'] = appdom_fps
    return_dict['ad_radius'] = appdom_radius

    return return_dict
Example No. 25
def standardize_main(args):
    mol = _read_mol(args)
    s = Standardizer()
    mol = s.standardize(mol)
    _write_mol(mol, args)
Example No. 26
 def testReionizeShort(self):
     for data in self.readPCSdata(self.dataPCS_reionize1k):
         n = Standardizer()
         nm = n.reionize(data.mol)
         ns = Chem.MolToSmiles(nm)
         self.assertEqual(ns, data.expected)
Example No. 27
 def testMetalShort(self):
     for data in self.readPCSdata(self.dataPCS_metal1k):
         n = Standardizer()
         nm = n.disconnect_metals(data.mol)
         ns = Chem.MolToSmiles(nm)
         self.assertEqual(ns, data.expected)
Example No. 28
 def testNormalizeShort(self):
     for data in self.readPCSdata(self.dataPCS_nomralized1k):
         n = Standardizer()
         nm = n.normalize(data.mol)
         ns = Chem.MolToSmiles(nm)
         self.assertEqual(ns, data.expected)
Example No. 29
    parser.add_option("--bonds_as_doubles", dest="bonds_as_doubles", default=False)
    opts,args = parser.parse_args()

    fpred = open(opts.pred_path)
    fgold = open(opts.gold_path)
    feval = open(opts.pred_path + '.eval_by_smiles', 'w')

    print('## Bond types in output files are doubles? {}'.format(opts.bonds_as_doubles))

    idxfunc = lambda a: a.GetAtomMapNum()
    bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE,
                  Chem.rdchem.BondType.AROMATIC]
    bond_types_as_double = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 1.5: 4}

    from molvs import Standardizer
    standardizer = Standardizer()
    standardizer.prefer_organic = True
    def sanitize_smiles(smi, largest_fragment=False):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            return smi
        try:
            mol = standardizer.standardize(mol) # standardize functional group reps
            if largest_fragment:
                mol = standardizer.largest_fragment(mol) # remove product counterions/salts/etc.
            mol = standardizer.uncharge(mol) # neutralize, e.g., carboxylic acids
        except Exception:
            pass
        return Chem.MolToSmiles(mol)

Example No. 30
from pprint import pprint
import psycopg2
import sys
import re
import csv
from io import StringIO
from rdkit import Chem
from molvs import Standardizer
Chem.WrapLogs()
#sio = sys.stderr = StringIO()
import logging
#reload(logging)
logging.basicConfig(filename='logging.log',
                    level=logging.DEBUG,
                    format="[%(asctime)s %(levelname)-8s] %(message)s",
                    datefmt="%Y/%b/%d %H:%M:%S")
from standardiser import standardise
s = Standardizer()

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

dictionary = ["C", "H", "O", "N", "S", "P", "F", "Cl", "Br", "I"]

with open('receptors.csv', 'r', encoding='utf-8') as receptor:
    reader = csv.reader(
        receptor,
        delimiter=',',
    )
    final = list(reader)

#pprint (final)
Example No. 31
    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to an existing SDF file, or create a new one if it doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If True and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must also be set with `output_format` in __init__
              or with `opsin_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for the plural patterns handled. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        else:
            if opsin_output_format == "stdinchikey":
                output_formats = ["stdinchikey_opsin"]
            elif opsin_output_format == "extendedsmi":
                output_formats = ["smiles_extended_opsin"]
            else:
                output_formats = sorted(list(set(output_formats)))
                possible_output_formats = [
                    "smiles", "inchi", "inchikey", "sdf"
                ]
                output_formats = [
                    x for x in output_formats if x in possible_output_formats
                    or x == opsin_valid_output_formats[opsin_output_format]
                ]

        if normalize_plurals:
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            commands.append(input)
            stdout, stderr, exit_code = common_subprocess(commands)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)
        else:
            raise UserWarning("Input is empty.")

        if dry_run:
            return " ".join(commands)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        if output_file_sdf:
            if sdf_append:
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w", encoding="utf-8").close()
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        stdout = stdout.split("\n")
        del stdout[-1]
        stderr = [
            x.strip() for x in stderr.split("\n")[1:] if x
        ]  # drop the first line of stderr, which is OPSIN's informational banner

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = iter(f.readlines())
        else:
            lines = iter(input.split("\n"))

        mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                                   ["error"])

        e = 0
        for i, line in enumerate(lines):
            line = line.strip()
            converted = stdout[i].strip()
            mol_output = mol_output_template.copy()

            if converted:
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                if opsin_output_format == "smi":
                    mol = MolFromSmiles(
                        converted,
                        sanitize=False if standardize_mols else True)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(
                        converted,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as exc:  # avoid shadowing the stderr counter 'e'
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(exc)))

                    for f in output_formats:
                        if f == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif f == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif f == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif f == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif f == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)

                    if output_file_sdf:
                        writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    self.logger.warning(mol_output["error"])
            else:
                try:
                    error = stderr[e].strip()
                except IndexError:
                    error = ""

                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                e += 1
            compounds.append(mol_output)

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return
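
A hypothetical driver for process(); the class name and constructor arguments below are assumptions (the method clearly belongs to an OPSIN wrapper class that is not shown here), and the IUPAC names are arbitrary.

opsin = OPSIN()                                   # hypothetical constructor
result = opsin.process(input=["2-acetyloxybenzoic acid", "ethanol"],
                       output_formats=["smiles", "inchi"])
print(result["exit_code"])
for entry in result["content"] or []:
    print(entry["iupac"], entry["smiles"], entry["error"])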
Example No. 32
 def testReionizeShort(self):
     for data in self.readPCSdata(self.dataPCS_reionize1k):
         n = Standardizer()
         nm = n.reionize(data.mol)
         ns = Chem.MolToSmiles(nm)
         self.assertEqual(ns, data.expected)
Example No. 33
    def __init__(self,
                 dcompound,
                 logfile,
                 writecheck=1,
                 kSMILES="CANONICAL_SMILES",
                 kID="CMPD_CHEMBLID"):
        self.compound = dcompound
        loader = pydrug.PyDrug()

        # if SMILES, load using SMILES code
        if not kSMILES in dcompound.keys():
            try:
                smile = runExternalSoft.babelConvertSDFtoSMILE(
                    dcompound["sdf"])
                self.compound[kSMILES] = smile
            except Exception:
                print("ERROR INPUT SDF - l33")
                self.log = "ERROR"
                try:
                    logfile.write(self.compound[kID] +
                                  "\t---\tERROR-SDF ORIGINAL INPUT\n")
                except Exception:
                    pass
                return

        #Standardize smile code
        try:
            smilestandadized = standardize_smiles(self.compound[kSMILES])
        except Exception:
            logfile.write(self.compound[kID] + "\t" +
                          str(self.compound[kSMILES]) + "\tERROR-SMILES INPUT"
                          "\n")
            self.log = "ERROR"
            return

        #Standardize using molvs (http://molvs.readthedocs.io/en/latest/api.html#molvs-fragment)
        s = Standardizer()
        mol = Chem.MolFromSmiles(smilestandadized)
        molstandardized = s.standardize(mol)
        smilestandadized = Chem.MolToSmiles(molstandardized)

        # remove salt
        # 1.default
        remover = SaltRemover()
        mol = Chem.MolFromSmiles(smilestandadized)
        molcleandefault = remover(mol)
        # 2. Personal remover
        homeremover = SaltRemover(defnData=LSALT)
        molclean = homeremover(molcleandefault)
        smilesclean = Chem.MolToSmiles(molclean)
        # 3. SMILES remove other manual salts + fragments -> for fragment take one if exactly same compound
        lelem = smilesclean.split(".")
        if len(lelem) > 1:
            # reduce double, case of several salts are included - 255
            lelem = list(set(lelem))
            for smilesdel in LSMILESREMOVE:
                if smilesdel in lelem:
                    lelem.remove(smilesdel)
            try:
                lelem.remove("")  # case of bad smile
            except ValueError:
                pass
            if len(lelem) == 1:
                smilesclean = str(lelem[0])
            else:
                # 4. Fragments
                #Case of fragment -> stock in log file, check after to control
                logfile.write(self.compound[kID] + "\t" +
                              str(self.compound[kSMILES]) +
                              "\tFRAGMENT IN INPUT"
                              "\n")
                print ".".join(lelem), " - FRAGMENTS - l66"
                self.log = "ERROR"
                return
        else:
            pass

        print(self.compound[kSMILES], "SMILES IN - l25 liganddescriptors")
        print(smilesclean, "SMILES without salt and standardized")

        # case where only salt are included
        if smilesclean == "":
            logfile.write(self.compound[kID] + "\t" +
                          str(self.compound[kSMILES]) + "\tEMPTY SMILES AFTER "
                          "STANDARDIZATION\n")
            print "EMPTY SMILES AFTER STANDARDIZATION - l84"
            self.log = "ERROR"
            return

        self.compound[kSMILES] = smilesclean
        self.log = "OK"

        if writecheck == 1:
            # SMILES code
            pfileSMILES = pathFolder.PR_COMPOUNDS + str(
                dcompound[kID]) + ".smi"
            fileSMILES = open(pfileSMILES, "w")
            fileSMILES.write(self.compound[kSMILES])
            fileSMILES.close()

            # SDF input
            if "sdf" in self.compound.keys():
                pfileSDF = pathFolder.PR_COMPOUNDS + str(
                    dcompound[kID]) + ".sdf"
                fileSDF = open(pfileSDF, "w")
                fileSDF.write(self.compound["sdf"])
                fileSDF.close()

        # read mol
        self.mol = loader.ReadMolFromSmile(self.compound[kSMILES])
Example No. 34
    def prepareChem(self, prSMIclean):

        psmiclean = prSMIclean + self.name + ".smi"

        # try if existing
        if path.exists(psmiclean):
            psmiclean = prSMIclean + self.name + ".smi"
            fsmiclean = open(psmiclean, "r")
            smiclean = fsmiclean.readlines()
            fsmiclean.close()

            smiclean = smiclean[0].strip()
            self.smiclean = smiclean
            self.mol = Chem.MolFromSmiles(smiclean)
            self.log = self.log + "Prep SMI :" + str(self.smi) + "\n"
            self.log = self.log + "Prepared SMI :" + str(self.smiclean) + "\n"

        else:
            #self.mol = loader.ReadMolFromSmile(self.smi)

            s = Standardizer()
            mol = Chem.MolFromSmiles(self.smi)

            try:
                out = toolbox.timeFunction(normalize, mol)
                if out == "ERROR":
                    self.log = self.log + "Normalize SMILES: ERROR DURING THE PROCESS\n"
                else:
                    molstandardized = out
            except Exception:
                self.log = self.log + "Normalize SMILES: ERROR INPUT SMI\n"

            if "molstandardized" in locals():

                smilestandadized = Chem.MolToSmiles(molstandardized)

                # remove salt
                # 1.default
                remover = SaltRemover(defnFilename="Salts.txt")
                mol = Chem.MolFromSmiles(smilestandadized)
                molcleandefault = remover(mol)
                # 2. Personal remover
                homeremover = SaltRemover(defnData=LSALT)
                molclean = homeremover(molcleandefault)
                smilesclean = Chem.MolToSmiles(molclean)
                # 3. SMILES remove other manual salts + fragments -> for fragment take one if exactly same compound
                lelem = smilesclean.split(".")
                if len(lelem) > 1:
                    # reduce double, case of several salts are included - 255
                    lelem = list(set(lelem))
                    for smilesdel in LSMILESREMOVE:
                        if smilesdel in lelem:
                            lelem.remove(smilesdel)
                    try:
                        lelem.remove("")  # case of bad smile
                    except ValueError:
                        pass
                    if len(lelem) == 1:
                        smilesclean = str(lelem[0])
                    else:
                        # 4. Fragments
                        # Case of fragment -> stock in log file, check after to control
                        self.log = self.log + "Fragments after standardization: " + smilesclean + "\n"
                        smilesclean = ""

                if smilesclean == "":
                    self.log = self.log + "ERROR SMILES: SMILES empty after preparation\n"

                else:
                    self.log = self.log + "Prepared SMI :" + str(
                        smilesclean) + "\n"

                    fsmiclean = open(psmiclean, "w")
                    fsmiclean.write(smilesclean)
                    fsmiclean.close()

                    self.smiclean = smilesclean
                    self.psmiclean = psmiclean
Example No. 35
    def process(
            self,
            input_file: str,
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            #images_prefix: str = "",
            format_output: bool = True,
            write_header: bool = True,
            osra_output_format: str = "",
            output_formats: list = None,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            use_gm: bool = True,
            gm_dpi: int = 300,
            gm_trim: bool = True,
            n_jobs: int = -1,
            input_type: str = "",
            standardize_mols: bool = True,
            annotate: bool = True,
            chemspider_token: str = "",
            custom_page: int = 0,
            continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OSRA.

        Parameters
        ----------
        input_file : str
            Path to file to be processed by OSRA.
        output_file : str
            File to write output in.
        output_file_sdf : str
            | File to write SDF output in. The "sdf" output format doesn't have to be in `output_formats` for SDF output to be written.
            | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
        sdf_append : bool
            If True, append new molecules to an existing SDF file, or create a new one if it doesn't exist.
        NOT IMPLEMENTED | images_prefix : str
            Prefix for images of extracted compounds which will be written.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        write_header : bool
            If True and `output_file` is set and `format_output` is True, write a CSV header.
        osra_output_format : str
            | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "smi", "can", "sdf"
            | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
        output_formats : list
            | If True and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
              or with `osra_output_format` here.
            | When the output produced by OSRA is unreadable by RDKit, you can at least keep the raw output from OSRA.
            | Default value: ["smiles"]

            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      Value      |    Source    |                                            Note                                            |
            +=================+==============+============================================================================================+
            |      smiles     |     RDKit    |                                          canonical                                         |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |   smiles_osra   | OSRA ("smi") |                                           SMILES                                           |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            | smiles_can_osra | OSRA ("can") |                                      canonical SMILES                                      |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      inchi      |     RDKit    | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     inchikey    |     RDKit    |                              The same applies as for "inchi".                              |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |       sdf       |     RDKit    |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     sdf_osra    | OSRA ("sdf") |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+

        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        use_gm : bool
            | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
            | If False, OSRA will use its own conversion of PDF to image.
            | Using gm is more reliable, since OSRA (v2.1.0) reports wrong information
              when converting directly from PDF (namely coordinates, bond length and possibly more) and also sometimes
              recognises structures incorrectly.
        gm_dpi : int
            How many DPI will temporary PNG images have.
        gm_trim : bool
            If True, gm will trim the temporary PNG images.
        n_jobs : int
            | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
            | If -1 all CPUs are used.
            | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
            | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for SMILES, InChI etc.
            | If an entity already has an InChI key, prefer it when searching.
            | If "*" is present in SMILES, skip annotation.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API. Create an account there to obtain it.
        custom_page : int
            When `use_gm` is False, this page number will be assigned to all extracted compounds.
        continue_on_failure : bool
            | If True, continue running even if OSRA returns a non-zero exit code.
            | If False and an error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OSRA
            - stderr: str ... standard error output from OSRA
            - exit_code: int ... exit code from OSRA
            - content:

                - list of OrderedDicts ... when `format_output` is True.
                - None ... when `format_output` is False

            | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
            | Note that stdout, stderr and exit_code are always lists; when `use_gm` is True they contain one item per
              temporary image extracted by OSRA.

        Notes
        -----
        Only with `format_output` set to True can you use molecule standardization and additional molecule formats. Otherwise
        you will only get the raw stdout from OSRA (which can also be written to a file if `output_file` is set).
        """

        options_internal = self.options_internal.copy()
        osra_smiles_outputs = ["smi", "can"]

        # OSRA output format check
        if osra_output_format:
            options_internal["output_format"] = osra_output_format
        else:
            osra_output_format = options_internal["output_format"]

        osra_valid_output_formats = {
            "can": "smiles_can_osra",
            "smi": "smiles_osra",
            "sdf": "sdf_osra"
        }
        if osra_output_format not in osra_valid_output_formats:
            raise ValueError(
                "Unknown OSRA output format. Possible values: {}".format(
                    list(osra_valid_output_formats.keys())))

        if osra_output_format == "sdf":
            self.logger.warning(
                "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
            )

        # output formats check
        is_output_sdf = False
        is_output_sdf_osra = False
        if not output_formats:
            output_formats = ["smiles"]
        else:
            output_formats = sorted(list(set(output_formats)))
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            output_formats = [
                x for x in output_formats if x in possible_output_formats
                or x == osra_valid_output_formats[osra_output_format]
            ]

            if ("sdf" in output_formats
                    or "sdf_osra" in output_formats) and not output_file_sdf:
                self.logger.warning(
                    "Cannot write SDF output: 'output_file_sdf' is not set.")
            if output_file_sdf:
                is_output_sdf = True
            if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
                is_output_sdf_osra = True
            if ("smiles_osra" in output_formats or "smiles_can_osra"
                    in output_formats) and osra_output_format == "sdf":
                try:
                    output_formats.remove("smiles_osra")
                except ValueError:
                    pass
                try:
                    output_formats.remove("smiles_can_osra")
                except ValueError:
                    pass
                self.logger.warning(
                    "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
                    .format(osra_output_format))

        # input file type check
        possible_input_types = ["pdf", "image"]
        if not input_type:
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                use_gm = False
                self.logger.warning(
                    "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                    .format(input_type, possible_input_types))
        elif input_type not in possible_input_types:
            raise ValueError("Possible 'input_type' values are {}".format(
                possible_input_types))

        #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
        #                   options_internal)

        if annotate:
            if not chemspider_token:
                self.logger.warning(
                    "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
                )
            # annotation searches by SMILES, InChI and InChI key, so make sure
            # these formats are among the requested output formats
            output_formats.extend(
                x for x in ["smiles", "inchi", "inchikey"]
                if x not in output_formats)
            output_formats = sorted(output_formats)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)
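        # always request the extra per-structure metadata that is parsed below:
        # average bond length, resolution guess, confidence estimate, page number
        # and box/atom coordinates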
        commands.extend(
            ["--bond", "--coordinates", "--page", "--guess", "--print"])

        if dry_run:
            return " ".join(commands)

        osra_output_list = []
        if input_type == "image" or not use_gm:
            osra_output_list.append(
                self._process(input_file,
                              commands,
                              page=custom_page if custom_page else 1))
        elif input_type == "pdf":
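            # convert the PDF into one temporary PNG per page with GraphicsMagick,
            # then let joblib run OSRA on those images in parallel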
            with tempfile.TemporaryDirectory() as temp_dir:
                stdout, stderr, exit_code = pdf_to_images(input_file,
                                                          temp_dir,
                                                          dpi=gm_dpi,
                                                          trim=gm_trim)
                osra_output_list = Parallel(n_jobs=n_jobs)(
                    delayed(self._process)(
                        temp_image_file, commands, page=page)
                    for temp_image_file, page in get_temp_images(temp_dir))

        # summarize OSRA results
        to_return = {
            "stdout": [],
            "stderr": [],
            "exit_code": [],
            "content": None,
            "pages": []
        }
        for result in osra_output_list:
            if result["stdout"]:
                to_return["stdout"].append(result["stdout"])
                to_return["stderr"].append(result["stderr"])
                to_return["exit_code"].append(result["exit_code"])
                to_return["pages"].append(result["page"])

        if not continue_on_failure:
            errors = [(page, error)
                      for page, exit_code, error in zip(
                          to_return["pages"], to_return["exit_code"],
                          to_return["stderr"])
                      if exit_code > 0]
            if errors:
                self.logger.warning("OSRA errors:")
                for page, error in errors:
                    eprint("\tError on page {}:".format(page))
                    eprint("\n\t\t".join("\n{}".format(error).splitlines()))
                return to_return

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write("\n".join(to_return["stdout"]))
            return to_return

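        # positions of the metadata columns in OSRA's whitespace-separated output;
        # position 0 (not listed here) is the structure itself (SMILES)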
        output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                                   ("confidence", 3), ("page", 4),
                                   ("coordinates", 5)])

        if osra_output_format in osra_smiles_outputs:
            compound_template_dict = OrderedDict.fromkeys(
                output_formats + list(output_cols.keys()))
        else:
            compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                          output_formats)

        if any(to_return["stdout"]):
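            # one molvs Standardizer instance is reused for all parsed molecules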
            if standardize_mols:
                standardizer = Standardizer()

            compounds = []

            if is_output_sdf:
                if sdf_append:
                    if not os.path.isfile(output_file_sdf):
                        open(output_file_sdf, mode="w",
                             encoding="utf-8").close()
                    writer = SDWriter(
                        open(output_file_sdf, mode="a", encoding="utf-8"))
                else:
                    writer = SDWriter(output_file_sdf)

            for output, page in zip(to_return["stdout"], to_return["pages"]):
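                # OSRA's SMILES output has one structure per line, whereas SDF
                # output is split on the "$$$$" record separator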
                if osra_output_format in osra_smiles_outputs:
                    lines = [x.strip() for x in output.split("\n") if x]
                else:
                    lines = [x for x in output.split("$$$$") if x.strip()]

                for line in lines:
                    """
                    # so much problems with --learn
                    # we can't simply split output by " " when --learn is present, because its output is like "1,2,2,2 1"
                    if "learn" in filtered_cols:
                        learn_start = filtered_cols.index("learn") + 1 #  "smiles" col isn't in output_cols
                        learn_end = filtered_cols.index("learn") + 1 + 3
                        line[learn_start:learn_end] = [" ".join(line[learn_start:learn_end])]
                    """

                    if not line:
                        continue

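                    # when standardize_mols is set, RDKit sanitization is deferred
                    # to the molvs standardization step below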
                    if osra_output_format in osra_smiles_outputs:
                        line = [x.strip() for x in line.split()]
                        if custom_page:
                            line[output_cols["page"]] = custom_page
                        elif use_gm:
                            line[output_cols["page"]] = page
                        mol = MolFromSmiles(line[0],
                                            sanitize=not standardize_mols)
                    elif osra_output_format == "sdf":
                        line = "\n" + line.strip()
                        mol = MolFromMolBlock(line,
                                              strictParsing=False,
                                              sanitize=not standardize_mols,
                                              removeHs=not standardize_mols)

                    if mol:
                        compound = compound_template_dict.copy()

                        if standardize_mols:
                            try:
                                mol = standardizer.standardize(mol)
                            except ValueError as e:
                                self.logger.warning(
                                    "Cannot standardize '{}': {}".format(
                                        MolToSmiles(mol), str(e)))

                        for f in output_formats:
                            if f == "smiles":
                                compound["smiles"] = MolToSmiles(
                                    mol, isomericSmiles=True)
                            elif f == "smiles_osra" and osra_output_format == "smi":
                                compound["smiles_osra"] = line[0]
                            elif f == "smiles_can_osra" and osra_output_format == "can":
                                compound["smiles_can_osra"] = line[0]
                            elif f == "inchi":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchi"] = inchi
                                else:
                                    compound["inchi"] = ""
                                    self.logger.warning(
                                        "Cannot convert to InChI: {}".format(
                                            MolToSmiles(mol)))
                            elif f == "inchikey":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchikey"] = InchiToInchiKey(
                                        inchi)
                                else:
                                    compound["inchikey"] = ""
                                    self.logger.warning(
                                        "Cannot create InChI-key from InChI: {}"
                                        .format(MolToSmiles(mol)))
                            elif f == "sdf":
                                compound["sdf"] = MolToMolBlock(
                                    mol, includeStereo=True)
                            elif f == "sdf_osra":
                                compound["sdf_osra"] = line

                        if is_output_sdf:
                            writer.write(mol)

                        if osra_output_format in osra_smiles_outputs:
                            compound.update(
                                zip(output_cols.keys(), line[1:]))
                        else:
                            compound[
                                "page"] = page if use_gm else custom_page if custom_page else 1

                        compounds.append(compound)
                    else:
                        self.logger.warning(
                            "Cannot convert to RDKit mol: " +
                            (line[0] if osra_output_format in osra_smiles_outputs
                             else "MOL block on page {}".format(page)))

            if is_output_sdf_osra:
                with open(output_file_sdf + "-osra.sdf",
                          mode="w",
                          encoding="utf-8") as f:
                    f.write("".join(to_return["stdout"]))

            to_return["content"] = sorted(compounds, key=lambda x: x["page"])

            if annotate:
                chemspider = ChemSpider(
                    chemspider_token) if chemspider_token else None

                for i, ent in enumerate(to_return["content"]):
                    self.logger.info("Annotating entity {}/{}...".format(
                        i + 1, len(to_return["content"])))
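                    # pre-fill every annotation column so the CSV header stays
                    # consistent even when a lookup returns no results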
                    ent.update(
                        OrderedDict([("pch_cids_by_inchikey", ""),
                                     ("chs_cids_by_inchikey", ""),
                                     ("pch_cids_by_smiles", ""),
                                     ("chs_cids_by_smiles", ""),
                                     ("pch_cids_by_inchi", ""),
                                     ("chs_cids_by_inchi", ""),
                                     ("pch_iupac_name", ""),
                                     ("chs_common_name", ""),
                                     ("pch_synonyms", "")]))

                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"],
                                                    "inchikey")
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format(
                                            "\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                    ",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError,
                                PubChemHTTPError, ResponseParseError,
                                ServerError, TimeoutError, PubChemPyError):
                            pass

                        results = chemspider.search(
                            ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.csid) for c in results]))
                    else:
                        for search_field, col_pch, col_chs in [
                            ("smiles", "pch_cids_by_smiles",
                             "chs_cids_by_smiles"),
                            ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                        ]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent[
                                    "smiles"] and "*" not in ent["smiles"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["smiles"], "smiles")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent[
                                    "inchi"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["inchi"], "inchi")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["inchi"]) if chemspider_token else []

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join(
                                    [str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join(
                                    [str(c.csid) for c in results_chs]))

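                            # small delay between identifier lookups to avoid
                            # hammering the PubChem/ChemSpider services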
                            sleep(0.5)

            if output_file:
                dict_to_csv(to_return["content"],
                            output_file=output_file,
                            csv_delimiter=csv_delimiter,
                            write_header=write_header)

            if is_output_sdf:
                writer.close()
        elif not any(to_return["stdout"]) and output_file:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(compound_template_dict.keys()),
                             write_header=write_header)

        return to_return
Exemplo n.º 36
0
 def testMetalShort(self):
     for data in self.readPCSdata(self.dataPCS_metal1k):
         n = Standardizer()
         nm = n.disconnect_metals(data.mol)
         ns = Chem.MolToSmiles(nm)
         self.assertEqual(ns, data.expected)
Exemplo n.º 37
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
unit testing for MolVS steps for PubChem Substances
tests include
molvs.standardize_smiles
Standardizer().normalize
Standardizer().disconnect_metals
Standardizer().reionize
molvs.standardize.canonicalize_tautomer_smiles
molvs.validate.Validator()
Standardizer().fragment_parent
"""
import gzip
import os.path
import unittest
from collections import namedtuple

import molvs
from molvs import Standardizer, validate
from rdkit import Chem, RDConfig

doLong = False
TestData = namedtuple('TestData', 'lineNo,smiles,mol,expected')

class TestCase(unittest.TestCase):
    dataPCS_standardize_smiles100k = os.path.join(RDConfig.RDBaseDir, 'rdkit', 'Chem', 'MolStandardize', 'test_data', '100kPCS_standardize_sm.csv.gz')
    dataPCS_standardize_smiles1k = os.path.join(RDConfig.RDBaseDir,'rdkit', 'Chem', 'MolStandardize', 'test_data', '1kPCS_standardize_sm.csv.gz')
    dataPCS_nomralized1k = os.path.join(RDConfig.RDBaseDir,'rdkit', 'Chem', 'MolStandardize', 'test_data', '1kPCS_normalized.csv.gz')
    dataPCS_nomralized100k = os.path.join(RDConfig.RDBaseDir, 'rdkit', 'Chem', 'MolStandardize', 'test_data', '100kPCS_normalized.csv.gz')
    dataPCS_metal100k = os.path.join(RDConfig.RDBaseDir,'rdkit', 'Chem', 'MolStandardize', 'test_data', '100kPCS_metals.csv.gz')