예제 #1
0
 def _obtain_entry_api(self, search_text, name,
                       output_format) -> Optional[str]:
     """Fetch the first PubChem name match for ``search_text``.

     What happens with the hit depends on ``output_format``
     (case-insensitive):

     * ``"smiles"`` - the canonical SMILES is printed, nothing is written.
     * ``"pdb"`` - the 3D SDF is downloaded and converted with
       ``sdf_to_pdb``.
     * anything else - the 3D record is downloaded directly in that format.

     Files are written into ``self.write_dir`` as ``<name>_<cid>.<ext>``.

     :param search_text: Compound name passed to the PubChem name search.
     :param name: Prefix used for the output file names.
     :param output_format: Requested output format.
     :return: The first matching CID as a string, or ``None`` when the
         search returned no CIDs.
     """
     matches = pcp.get_cids(search_text, "name", record_type="3d")
     if len(matches) == 0:
         print("No exact match found, please try the web search")
         return None

     cid = str(matches[0])
     fmt = output_format.lower()

     if fmt == "smiles":
         compound = pcp.Compound.from_cid(int(cid))
         print("SMILES code:", compound.canonical_smiles)
         return cid

     # Every file produced below shares the same "<name>_<cid>" stem.
     stem = name + "_" + cid
     if fmt == "pdb":
         sdf_file = os.path.join(self.write_dir, stem + ".sdf")
         pdb_file = os.path.join(self.write_dir, stem + ".pdb")
         pcp.download("SDF", sdf_file, cid, record_type="3d", overwrite=True)
         sdf_to_pdb(sdf_file, pdb_file)
         return cid

     target = os.path.join(self.write_dir, stem + "." + fmt)
     pcp.download(output_format.upper(),
                  target,
                  cid,
                  record_type="3d",
                  overwrite=True)
     return cid
예제 #2
0
def get_pubchem_id(name):
    """Return ``(name, cid)`` for the first PubChem CID matching ``name``.

    The name is printed before the lookup. When the lookup raises or yields
    no CID, the second element of the tuple is an empty string.
    """
    print(name)
    try:
        cpd_id = pcp.get_cids(name, "name")
        return name, cpd_id[0]
    except Exception:
        # Still best-effort, but unlike the previous bare ``except:`` this no
        # longer swallows KeyboardInterrupt / SystemExit.
        return name, ""
예제 #3
0
    def scrape_super_rest(self,
                          cids,
                          match_isotopes=False,
                          match_charges=False,
                          match_tautomers=False,
                          rings_not_embedded=False,
                          single_double_bonds_match=True,
                          chains_match_rings=True,
                          strip_hydrogen=False,
                          stereo="ignore",
                          max_records=10000):
        """
        Run a superstructure search (molecules that contain the query
        molecule within them) for every CID supplied.

        The keyword arguments are forwarded unchanged to ``pcp.get_cids``.
        Parameter descriptions are largely taken from
        http://pubchemdocs.ncbi.nlm.nih.gov/pug-rest

        :param cids: A dict {"category": [ids]}, where each category is a
        molecular type of interest.
        :param match_isotopes: Atoms must be of the same specified isotope.
        :param match_charges: Atoms must match the specified charge.
        :param match_tautomers: Allows matching with tautomers.
        :param rings_not_embedded: Rings may not be embedded in a larger system.
        :param single_double_bonds_match: In an aromatic compound, either single
        or double bonds may match the aromatic bonds.
        :param chains_match_rings: Chain bonds in the query may match rings in hits.
        :param strip_hydrogen: Remove explicit hydrogens before searching.
        :param stereo: How to handle stereoisomers: either "ignore", "exact",
        "relative", or "nonconflicting".
        :param max_records: Maximum number of hits.
        :return: A dict {"category": {id: [matches]}}, where each id in each
        category is stored along with its matches (which take the form of
        CIDs).
        """
        # Options are identical for every query, so build them once.
        search_options = dict(
            namespace="cid",
            domain="compound",
            searchtype="superstructure",
            match_isotopes=match_isotopes,
            match_charges=match_charges,
            match_tautomers=match_tautomers,
            rings_not_embedded=rings_not_embedded,
            single_double_bonds_match=single_double_bonds_match,
            chains_match_rings=chains_match_rings,
            strip_hydrogen=strip_hydrogen,
            stereo=stereo,
            max_records=max_records,
        )

        output = {}
        queries_run = 0
        for cat, members in cids.items():
            hits = output[cat] = {}
            for cid in members:
                # check_queries receives the running counter and returns the
                # value to carry into the next query.
                queries_run = self.check_queries(queries_run)
                hits[cid] = pcp.get_cids(cid, **search_options)

        return output
예제 #4
0
def form2():
    """Extract candidate chemical names from an uploaded PDF and render them.

    On POST the file in ``request.files['rawtext']`` is read with PyPDF2.
    Tokens found in the NLTK English word list are dropped, the remaining
    tokens that start with an upper-case letter are deduplicated (first-seen
    order) and the first 50 are looked up on PubChem by name. Every
    successful lookup contributes a ``(token, first_synonym)`` tuple.

    :return: The rendered ``index.html`` template with the collected tuples
        passed as ``final_summary`` (an empty list for non-POST requests).
    """
    # Bound before the method check: previously a non-POST request reached
    # the return statement with ``freq`` undefined and raised NameError.
    freq = []

    if request.method == 'POST':
        rawtext = request.files['rawtext']
        words = set(nltk.corpus.words.words())
        fr = PyPDF2.PdfFileReader(rawtext)

        text = ""
        for page_number in range(fr.numPages):
            text += fr.getPage(page_number).extractText()
            if text == "":
                # NOTE(review): ``input`` is the Python builtin in this scope,
                # not a file path, so this OCR fallback most likely cannot
                # work as written - confirm what should be passed here.
                text = textract.process(input,
                                        method='tesseract',
                                        language='eng')

        # Keep only tokens that are not ordinary English words.
        s = " ".join(w for w in nltk.wordpunct_tokenize(text)
                     if (w.lower() not in words and w.upper() not in words))
        candidates = list(filter(lambda x: x[0].isupper(), s.split()))
        candidates = list(dict.fromkeys(candidates))

        # Slicing replaces ``range(50)`` + swallowed IndexError.
        for token in candidates[:50]:
            try:
                results = pc.get_cids(token, 'name')
                c = pc.Compound.from_cid(results)
                freq.append((token, c.synonyms[0]))
            except Exception:
                # Best-effort: tokens PubChem cannot resolve are skipped.
                continue

    return render_template('index.html', final_summary=freq)
예제 #5
0
    def scrape_similar_rest(self, cids, threshold=90, max_records=10000):
        """
        Run a PubChem similarity search for every CID supplied.

        :param cids: A dict {"category": [ids]}.
        :param threshold: Forwarded to ``pcp.get_cids`` as ``threshold``.
        :param max_records: Forwarded to ``pcp.get_cids`` as ``max_records``.
        :return: A dict {"category": {id: [matches]}}, where each id in each
        category is stored along with its matches (which take the form of
        CIDs).
        """
        queries_run = 0
        output = {}

        for cat, members in cids.items():
            matches = {}
            for cid in members:
                # check_queries receives the running counter and returns the
                # value to carry into the next query.
                queries_run = self.check_queries(queries_run)
                matches[cid] = pcp.get_cids(cid,
                                            namespace="cid",
                                            domain="compound",
                                            searchtype="similarity",
                                            threshold=threshold,
                                            max_records=max_records)
            output[cat] = matches

        return output
예제 #6
0
def get_structure(code, cutoff, dir_search, BRENDA_PARSER):
    """Download the PubChem JSON record of the chosen substrate for ``code``.

    The proteins returned by ``BRENDA_PARSER.get_proteins(code)`` are passed
    to ``choose_substrate``; the selected substrate is resolved to a CID via
    a PubChem name search in the ``substance`` domain and the record is
    saved as ``<dir_search>/<CID>.json``.

    :param code: Identifier handed to ``BRENDA_PARSER.get_proteins``.
    :param cutoff: When truthy, the substrate must pass
        ``check_size_of_substrate(CID, cutoff)``.
    :param dir_search: Directory the JSON file is written to.
    :param BRENDA_PARSER: Object providing ``get_proteins``.
    :return: ``True`` if the file was downloaded, ``False`` on any failure
        (no substrate, no CID, size check failed, download error).
    """
    proteins = BRENDA_PARSER.get_proteins(code)
    substrate, _counted = choose_substrate(proteins)

    if substrate is None:
        print("No suitable substrate found, skip..")
        return False
    print("\nmost common: ", substrate)

    try:
        CID = pcp.get_cids(substrate, 'name', 'substance',
                           list_return='flat')[0]
    except Exception:
        # Covers both lookup errors and an empty result list; no longer
        # swallows KeyboardInterrupt like the previous bare ``except:``.
        print("CID not found..")
        return False

    if cutoff and not check_size_of_substrate(CID, cutoff):
        return False

    file = f'{dir_search}/{CID}.json'
    try:
        pcp.download('JSON', file, CID, 'cid')
    except Exception:
        return False
    return True
예제 #7
0
파일: PyCFMID.py 프로젝트: hcji/PyCFMID
def search_pubchem(formula, output_file=None, timeout=999):
    """Collect the unique structures PubChem lists for a molecular formula.

    CIDs matching ``formula`` are fetched, then their InChIKey and canonical
    SMILES are requested from PUG-REST in batches of 100 CIDs. CIDs that
    share a canonical SMILES are merged into one row whose ``PubChem`` entry
    is a ``', '``-separated list.

    :param formula: Molecular formula to search for.
    :param output_file: Passed through ``check_output_file``; an
        ``index SMILES`` table (space separated, no header) is written there.
    :param timeout: Timeout in seconds for each HTTP request.
    :return: DataFrame with columns ``InChIKey``, ``SMILES`` and ``PubChem``.
    """
    output_file = check_output_file(output_file)
    cids = pc.get_cids(formula, 'formula', list_return='flat')

    smiles = []
    inchikey = []
    all_cids = []
    # SMILES -> row index, so duplicates are found in O(1) instead of
    # scanning the list and calling np.where for every property record.
    row_of_smiles = {}

    batch_size = 100
    for start in range(0, len(cids), batch_size):
        idstring = ','.join(str(cid) for cid in cids[start:start + batch_size])
        url_i = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + idstring + "/property/InChIKey,CanonicalSMILES/JSON"
        res_i = requests.get(url_i, timeout=timeout)
        # The endpoint returns JSON; decode it directly rather than routing
        # the payload through an HTML parser first.
        properties_i = res_i.json()['PropertyTable']['Properties']
        for properties_ij in properties_i:
            smiles_ij = properties_ij['CanonicalSMILES']
            row = row_of_smiles.get(smiles_ij)
            if row is None:
                row_of_smiles[smiles_ij] = len(smiles)
                smiles.append(smiles_ij)
                inchikey.append(properties_ij['InChIKey'])
                all_cids.append(str(properties_ij['CID']))
            else:
                all_cids[row] = all_cids[row] + ', ' + str(properties_ij['CID'])

    result = pd.DataFrame({'InChIKey': inchikey, 'SMILES': smiles, 'PubChem': all_cids})
    output = pd.DataFrame({'ID': result.index, 'Smiles': result['SMILES']})
    output.to_csv(output_file, header=False, index=False, sep=' ')
    return result
예제 #8
0
def get_cid(dataframe, source_column, new_column):
    """Append a column holding the first PubChem CID for each row's name.

    Every value in ``source_column`` is looked up on PubChem by chemical
    ``'name'``. The first CID returned is stored in ``new_column``; rows
    without any hit get the string ``"no cid found"``. The dataframe is
    modified in place and also returned.
    """
    # One lookup per row, in row order.
    lookups = [
        pcp.get_cids(row[source_column], 'name', list_return='flat')
        for _, row in dataframe.iterrows()
    ]

    # Keep only the first CID of each result list.
    dataframe[new_column] = [
        hits[0] if len(hits) >= 1 else "no cid found" for hits in lookups
    ]
    return dataframe
예제 #9
0
파일: silimlar.py 프로젝트: LXander/dock
def query_similar(ligand_path):
    """Convert a PDB ligand to SMILES with ``obabel`` and query similar CIDs.

    The first tab-separated field of obabel's output is used as the SMILES
    string for a PubChem similarity search.

    :param ligand_path: Path of the PDB file handed to ``obabel``.
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being split or interpreted, and the pipe is closed
    # once the process exits.
    completed = subprocess.run(
        ['obabel', '-ipdb', ligand_path, '-osmi'],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    smiles = completed.stdout.split('\t')[0]
    # NOTE(review): the result is not returned or stored in the visible
    # code - confirm whether the function is meant to return ``cids``.
    cids = pcp.get_cids(smiles,
                        'smiles',
                        searchtype='similarity',
                        list_return='flat')
예제 #10
0
def get_molecular_weights(model, medium):
    """Look up PubChem CIDs and molecular weights for medium components.

    Alternative to using the xialab api (it did not work for the original
    author on OSX).

    :param model: Metabolic model (read in from Cobrapy); the reaction name
        for each medium entry is taken from ``model.reactions``.
    :param medium: List of exchange reaction ids, formatted like
        ``['EX_Lcyst(e)', 'EX_Lcystin(e)', 'EX_ade(e)', ...]``.
    :return: DataFrame with the columns ``query`` (substance name),
        ``BiGG_ID``, ``hit`` (first CID) and ``mol_weight``; ``hit`` and
        ``mol_weight`` are ``"NA"`` when nothing could be retrieved.
    """
    import pubchempy as pcp
    import ssl

    suffix = "exchange"
    prefix = "EX_"

    # str.rstrip / str.lstrip strip *character sets*, which mangled names
    # such as "L-Cysteine" or ids such as "EX_Xan(e)"; remove the literal
    # suffix / prefix instead.
    names = []
    for compound_id in medium:
        reaction_name = model.reactions.get_by_id(compound_id).name
        if reaction_name.endswith(suffix):
            reaction_name = reaction_name[:-len(suffix)]
        names.append(reaction_name)

    # NOTE(review): this disables HTTPS certificate verification for the
    # whole process, not just for the requests made here.
    ssl._create_default_https_context = ssl._create_unverified_context

    rows = []
    for compound_id, query in zip(medium, names):
        cid = pcp.get_cids(query)
        row = {'query': query}
        if compound_id.startswith(prefix):
            row['BiGG_ID'] = compound_id[len(prefix):]
        else:
            row['BiGG_ID'] = compound_id
        try:
            row['hit'] = cid[0]
            row['mol_weight'] = pcp.Compound.from_cid(cid[0]).molecular_weight
        except Exception:
            # No CID, or the compound record could not be fetched.
            row['hit'] = "NA"
            row['mol_weight'] = "NA"
        rows.append(row)

    # DataFrame.append was removed in pandas 2.0; build the frame once.
    return pd.DataFrame(rows, columns=["query", "BiGG_ID", "hit", "mol_weight"])
예제 #11
0
def change_smiles_pubchem_fingerprint(smiles_list):
    """Return the PubChem CACTVS fingerprint of each SMILES as a list of ints.

    smiles_list: list of SMILES strings. Each one is resolved to CIDs through
    pubchempy, and the ``cactvs_fingerprint`` of the resulting compound is
    split into one integer per character.
    """
    import pubchempy as pcp

    def _fingerprint_bits(smiles):
        cid_hits = pcp.get_cids(identifier=smiles, namespace="smiles")
        compound = pcp.Compound.from_cid(cid_hits)
        return [int(ch) for ch in str(compound.cactvs_fingerprint)]

    return [_fingerprint_bits(smiles) for smiles in smiles_list]
예제 #12
0
def change_smiles_pubchem_fingerprint(smiles_list):
	"""Convert SMILES strings into PubChem CACTVS fingerprint digit lists.

	smiles_list: list of smiles strings. Uses pubchempy to resolve each
	string to CIDs and to read the compound's ``cactvs_fingerprint``.
	"""
	import pubchempy as pcp

	fingerprints = []
	for smiles in smiles_list:
		compound = pcp.Compound.from_cid(
			pcp.get_cids(identifier=smiles, namespace="smiles"))
		digits = str(compound.cactvs_fingerprint)
		fingerprints.append(list(map(int, digits)))

	return fingerprints
예제 #13
0
def searchFormulaInfo(formula):
    """Build a LaTeX-flavoured description of the compounds matching ``formula``.

    Only CIDs returned by both the ``'name'`` and the ``'formula'`` search are
    described. For each one the text mentions the name, molecular formula, up
    to a few alternative names, complexity and the mass values, as far as
    PubChem provides them.

    Returns the concatenated sentences (an empty string if nothing matched).
    """
    by_name = pcp.get_cids(formula, 'name')
    by_formula = pcp.get_cids(formula, 'formula')

    pieces = []
    for cid in set(by_name).intersection(by_formula):
        c = pcp.Compound.from_cid(cid)
        name = c.iupac_name
        altNames = c.synonyms[:4]

        # Fall back to the first synonym when there is no IUPAC name, and
        # never list the chosen name among the alternatives.
        if not name and len(altNames) > 0:
            name = altNames[0]
        if name in altNames:
            altNames.remove(name)

        if name:
            sentence = 'This formula has the IUPAC name \\textbf{' + name + '}'
            if c.molecular_formula:
                sentence += ' and the molecular formula \\textbf{' + c.molecular_formula + '}'
            pieces.append(sentence + '. ')

        if len(altNames) > 1:
            leading = ', '.join(str(n) for n in altNames[:-1])
            pieces.append('Some alternative names are ' + leading + ' and ' +
                          altNames[-1] + '. ')
        elif len(altNames) == 1:
            pieces.append('An alternative name is ' + altNames[0] + '. ')

        numeric_facts = (
            ('Its complexity has value ', c.complexity),
            ('The exact mass is ', c.exact_mass),
            ('The molecular weight is ', c.molecular_weight),
            ('The monoisotopic mass is ', c.monoisotopic_mass),
        )
        for lead_in, value in numeric_facts:
            if value:
                pieces.append(lead_in + str(value) + '. ')

    return ''.join(pieces)
def convert(input):
    """Extract candidate chemical names from a PDF file.

    The text of every page is read with PyPDF2; if the PDF has pages but
    yields no text at all, ``textract`` (tesseract OCR) is used instead.
    Tokens found in the NLTK English word list are dropped, the remaining
    tokens that start with an upper-case letter are deduplicated (first-seen
    order) and the first 10 are looked up on PubChem by name.

    :param input: Path of the PDF file.
    :return: Dict mapping each resolved token to the first synonym of the
        compound PubChem returned for it.
    """
    words = set(nltk.corpus.words.words())

    # The context manager closes the file even if PDF parsing raises.
    with open(input, 'rb') as f:
        fr = PyPDF2.PdfFileReader(f)
        num_pages = fr.numPages
        text = "".join(
            fr.getPage(page_number).extractText()
            for page_number in range(num_pages))

    # OCR fallback, run once after all pages were tried. Previously it ran
    # inside the page loop and replaced ``text`` with textract's return
    # value, so appending the next page's string could fail.
    if num_pages and text == "":
        text = textract.process(input, method='tesseract', language='eng')
        if isinstance(text, bytes):
            text = text.decode('utf-8', errors='ignore')

    # Keep only tokens that are not ordinary English words.
    s = " ".join(w for w in nltk.wordpunct_tokenize(text)
                 if (w.lower() not in words and w.upper() not in words))
    candidates = list(filter(lambda x: x[0].isupper(), s.split()))
    candidates = list(dict.fromkeys(candidates))

    freq = {}
    # Slicing replaces ``range(10)`` + swallowed IndexError.
    for token in candidates[:10]:
        try:
            results = pc.get_cids(token, 'name')
            c = pc.Compound.from_cid(results)
            freq[token] = c.synonyms[0]
        except Exception:
            # Best-effort: tokens PubChem cannot resolve are skipped.
            continue

    return freq
예제 #15
0
def check_size_of_substrate(name, cutoff):
    """Check that a substrate has at most ``cutoff`` non-hydrogen atoms.

    ``name`` is resolved through a PubChem name search in the ``substance``
    domain and the first CID is inspected.

    :return: ``True`` when the number of atoms whose element is not ``'H'``
        does not exceed ``cutoff``; ``False`` when it does, when the lookup
        fails, or when no CID is found.
    """
    try:
        p = pcp.get_cids(name, 'name', 'substance', list_return='flat')
    except Exception:
        print("substrate not found..")
        return False
    if len(p) == 0:
        return False

    c = pcp.Compound.from_cid(p[0])
    c = c.to_dict(properties=['atoms', 'bonds', 'inchi'])

    counter = 0
    for atom in c['atoms']:
        # Compare by value: ``is not 'H'`` tested object identity, which is
        # implementation-dependent for strings.
        if atom['element'] != 'H':
            counter += 1
            if counter > cutoff:
                return False
    return True
예제 #16
0
파일: odorants.py 프로젝트: pyrfume/pyrfume
def get_cid(identifier: str,
            kind: str = None,
            verbose: bool = True,
            fix_smiles_on_error: bool = True,
            attempt=0) -> int:
    """
    Return the PubChem CID for a molecule given any synonym,
    including a chemical name or a CAS.

    NaN identifiers and identifiers without a match yield 0. A PubChem HTTP
    error is retried once after a 10 second pause; a failed SMILES lookup is
    retried once with the canonicalised SMILES.
    """
    if isinstance(identifier, float) and np.isnan(identifier):
        return 0

    # Spell out Greek letters before querying.
    for greek, latin in (('α', 'alpha'), ('β', 'beta'), ('γ', 'gamma'),
                         ('δ', 'delta')):
        identifier = identifier.replace(greek, latin)

    kind = get_kind(identifier) if kind is None else kind.lower()

    try:
        result = pcp.get_cids(identifier, namespace=kind)
    except pcp.BadRequestError:
        logger.warning('Request Error for "%s"' % identifier)
        result = []
    except pcp.PubChemHTTPError:
        if attempt != 0:
            raise
        import time
        time.sleep(10)
        return get_cid(identifier, kind, verbose, fix_smiles_on_error, 1)

    if len(result) > 1 and verbose:
        logger.warning("Multiple CIDs for %s: %s" % (identifier, result))
    cid = result[0] if len(result) else 0

    if not cid and kind == "smiles" and fix_smiles_on_error:
        # Retry with canonical SMILES
        identifier = canonical_smiles(identifier)
        if identifier:
            cid = get_cid(identifier,
                          kind=kind,
                          verbose=verbose,
                          fix_smiles_on_error=False)
    return cid
예제 #17
0
    def retrieve(
        name: Optional[str] = None,
        smiles: Optional[str] = None,
        inchi: Optional[str] = None,
        inchikey: Optional[str] = None,
    ) -> mtr.Structure:
        """Fetch a structure from PubChem by name, SMILES, InChI or InChIKey.

        The first argument that is not ``None`` (checked in the order name,
        smiles, inchi, inchikey) is used for the lookup. The 3D record of the
        first CID is preferred; when PubChem has none, a structure is
        generated from the isomeric SMILES instead.

        Raises ``ValueError`` when no identifier is given or when the CID
        lookup fails.
        """
        candidates = [
            (name, "name"),
            (smiles, "smiles"),
            (inchi, "inchi"),
            (inchikey, "inchikey"),
        ]
        supplied = [pair for pair in candidates if pair[0] is not None]
        if not supplied:
            raise ValueError(
                "Provide name, SMILES, InChi, or InChiKey to retrieve structure."
            )
        identifier, identifier_type = supplied[0]

        try:
            # Only the first returned compound is used; if there are several
            # it is assumed to be the "most relevant" one in some sense.
            # Unpacking an empty result raises ValueError, handled below.
            cid, *_ = pcp.get_cids(identifier, identifier_type)
            if cid == 0:
                raise ValueError
        except (ValueError, OSError):
            raise ValueError(f"Structure retrieval for {identifier} failed.")

        try:
            compound = pcp.Compound.from_cid(cid, record_type="3d")
            return _structure_from_pubchem_compound(compound=compound)
        except pcp.NotFoundError:
            # No 3d structure on PubChem; a 2d structure must exist since a
            # CID was found.
            [property_dict] = pcp.get_properties(properties="IsomericSMILES",
                                                 identifier=cid,
                                                 namespace="cid")
            return Structure.generate(smiles=property_dict["IsomericSMILES"])
예제 #18
0
def get_cid(
    identifier: str, kind: str = "name", verbose: bool = True, fix_smiles_on_error: bool = True
) -> int:
    """Return the PubChem CID for a molecule given any synonym,
    including a chemical name or a CAS.

    Returns 0 when nothing matches. A failed SMILES lookup is retried once
    with the canonicalised SMILES.
    """
    namespace = kind.lower()
    try:
        hits = pcp.get_cids(identifier, namespace=namespace)
    except pcp.BadRequestError:
        logger.warning('Request Error for "%s"' % identifier)
        hits = []

    if len(hits) > 1 and verbose:
        logger.warning("Multiple CIDs for %s: %s" % (identifier, hits))
    cid = hits[0] if len(hits) else 0

    # Nothing more to do unless a SMILES lookup came back empty.
    if cid or namespace != "smiles" or not fix_smiles_on_error:
        return cid

    # Retry with canonical SMILES
    canonical = canonical_smiles(identifier)
    if canonical:
        cid = get_cid(canonical, kind=namespace, verbose=verbose, fix_smiles_on_error=False)
    return cid
예제 #19
0
파일: forms.py 프로젝트: Chem3/chemcat
    def clean(self):
        """Validate the submitted IUPAC name against PubChem.

        Unless ``override_iupac`` is set, the name must resolve to at least
        one PubChem CID and must equal (case-insensitively) the IUPAC name of
        the first hit. On a mismatch the error message lists the IUPAC names
        PubChem returned plus a suggested trivial name, CAS number and
        SMILES.

        :return: ``self.cleaned_data``.
        :raises forms.ValidationError: if PubChem has no match, or the first
            match carries a different IUPAC name.
        """
        iupac_name = self.cleaned_data.get('iupac_name')
        override = self.cleaned_data.get('override_iupac')

        if override:
            return self.cleaned_data

        cids = pcp.get_cids(iupac_name, 'name')
        if not cids:
            raise forms.ValidationError(
                "No PubChem match on suggested iupac name. "
                "Are you sure this is a chemical? "
                "If so, check CAS, trivial name, "
                "molecular formula and then use override below. "
                "An email will be sent to notify admin.")

        # Fetched once and reused for the comparison and the suggestions.
        first_hit = pcp.Compound.from_cid(cids[0])
        if (first_hit.iupac_name or "").lower() == iupac_name.lower():
            return self.cleaned_data

        # Only a failed validation needs the (network-heavy) suggestion list.
        # Compounds without an IUPAC name are left out rather than crashing
        # the string concatenation.
        suggested_names = [first_hit.iupac_name] + [
            pcp.Compound.from_cid(cid).iupac_name for cid in cids[1:]
        ]
        out = "".join(n + ", " for n in suggested_names if n)

        trivial = first_hit.synonyms[0] if first_hit.synonyms else ""
        try:
            cas = chem.CAS_from_any(iupac_name)
        except ValueError:
            cas = "Non found"
        formula = first_hit.isomeric_smiles or ""

        raise forms.ValidationError(
            "Iupac Name is not registered in Pubchem, try: " + out +
            ", suggested trivial name " + trivial + ", cas " + cas +
            ", suggested formula (SMILES) " + formula)
예제 #20
0
    def searchPubChem(self, searchterm='', filters=None, numresults=10,
                      randomized=True, save=True):
        '''
        Run a PubChem substructure search and collect molecules that pass
        the given filters.

            Parameters:
                searchterm (str): SMILES used for the substructure search
                filters ([Filter]): list of filters to apply; each must
                    provide ``check(mol)``
                numresults (int): stop once this many molecules passed
                randomized (bool): shuffle the search results first
                save (bool): store every fetched molecule via ``self.save``

            Returns:
                list with the ``id`` of each molecule that passed
        '''
        # ``None`` instead of a shared mutable ``[]`` default.
        filters = [] if filters is None else filters

        # Retrieve search results from PubChem
        searchcids = pcp.get_cids(searchterm,
                                  namespace='smiles',
                                  searchtype='substructure',
                                  MaxRecords=10000,
                                  record_type='3d')
        if randomized:
            random.shuffle(searchcids)

        results = []
        for cid in searchcids:
            if len(results) == numresults:
                break
            # CIDs that are already known are skipped.
            if str(cid) in self.molecules:
                continue
            print(f"fetching molecule {cid}")
            mol = Molecule.from_cid(cid)
            if mol is None:
                continue
            # ``save`` was previously accepted but ignored.
            if save:
                print("saving")
                self.save(mol)
            if all(molfilter.check(mol) for molfilter in filters):
                print('passed')
                results.append(mol.id)

        return results
예제 #21
0
import math
import pubchempy as pcp
import matplotlib.pyplot as plt
import pandas as pd

# Run the same SMILES similarity search at three Threshold values
# (95, 80 and 70) and report how many CIDs each one returns.
cids_95 = pcp.get_cids(
    'CCCC1=NN(C2=C1N=C(NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C',
    'smiles',
    searchtype='similarity',
    Threshold=95)
print(f'len cids for 95: {len(cids_95)}')

cids_80 = pcp.get_cids(
    'CCCC1=NN(C2=C1N=C(NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C',
    'smiles',
    searchtype='similarity',
    Threshold=80)
print(f'len cids for 80: {len(cids_80)}')

cids_70 = pcp.get_cids(
    'CCCC1=NN(C2=C1N=C(NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C',
    'smiles',
    searchtype='similarity',
    Threshold=70)
print(f'len cids for 70: {len(cids_70)}')

# Plot of the number of search results against the similarity threshold.
threshholds = [95, 80, 70]
threshholds_lengths = [len(cids_95), len(cids_80), len(cids_70)]

plt.plot(threshholds, threshholds_lengths)
"""

import pubchempy as pcp
import urllib.request
import bs4 as BS

# Maps each product ID to {"SMILES": ..., "CID": ...}, built from the
# whitespace-separated "<productID> <SMILES>" lines of LabNetworkSearch.txt.
products = {}

with open('LabNetworkSearch.txt') as l:
    for line in l:
        information = line.split()
        if len(information) < 2:
            # Blank or incomplete lines used to abort the whole run with an
            # IndexError; skip them instead.
            continue
        productID = information[0]
        smile = information[1]
        products[productID] = {"SMILES": smile}
        try:
            # str() of the CID list with the brackets removed, e.g. "1, 2".
            getCID = str(pcp.get_cids(smile, 'smiles'))
            getCID = getCID.replace("[", "")
            getCID = getCID.replace("]", "")
            products[productID]["CID"] = getCID
        except Exception:
            # Best-effort: a failed lookup is recorded as "NA".
            products[productID]["CID"] = "NA"
        print('Record completed, checking the next record')

print("CID added to the dictionary\n")

partOne = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/'
partTwo = '/XML?heading=LCSS'

for product, product_info in products.items():
예제 #23
0
def get_cid_from_inchikey(inkey):
    """
    Look up an InChIKey on PubChem.

    Returns the last CID of the result list, or ``None`` when the search
    returned nothing.
    """
    matches = pcp.get_cids(inkey, "inchikey")
    if not matches:
        return None
    return matches.pop()
예제 #24
0
#!/usr/bin/env python
import sys
import pubchempy as pcp

# Read the drug table; the first line is skipped, and only lines with more
# than two whitespace-separated fields are used (fields 0, 1 and 2).
with open("drugs.txt") as drug_file:
    f = drug_file.readlines()
drug_names = []
drug_smiles = []
drug_cns = []
for i in f[1:]:
    l = i.split()
    if len(l) > 2:
        drug_names.append(l[0])
        drug_cns.append(l[1])
        drug_smiles.append(l[2].strip())

# Look up every drug name on PubChem and report the number of hits.
drug_cids = []
for i in drug_names:
    cid_list = pcp.get_cids(i, 'name', 'compound', list_return='flat')
    # Function-call form: the Python 2 print statement is a syntax error on
    # Python 3, while this spelling works on both.
    print("%20s %5d hits" % (i, len(cid_list)))
    drug_cids.append(cid_list)

# Write one "name, field 1, CID list, SMILES" line per drug.
with open("drug_pubchem_index.txt", "w") as of:
    for i in range(len(drug_names)):
        of.write("%20s, %6s, %s, %s\n" %
                 (drug_names[i], drug_cns[i], drug_cids[i], drug_smiles[i]))
예제 #25
0
def getResults(query, queryType):
    """Return a ``pcp.Compound`` for every CID matching ``query``.

    ``queryType`` is passed to ``pcp.get_cids`` as the namespace; the search
    runs in the ``substance`` domain with a flat CID list.
    """
    matching_cids = pcp.get_cids(query,
                                 queryType,
                                 'substance',
                                 list_return='flat')
    return list(map(pcp.Compound.from_cid, matching_cids))
예제 #26
0
def pka_lookup_pubchem(identifier, namespace=None, domain='compound') -> Optional[dict]:
    """Look up the pKa of a compound in PubChem.

    The identifier is resolved to a CID (its type is guessed with
    ``classify`` when ``namespace`` is not given), the CAS number is taken
    from the synonym list, and the "Dissociation Constants" section of the
    PUG-View record is parsed for the pKa string and its reference.

    :param identifier: Name, CAS number, SMILES, InChI or InChIKey.
    :param namespace: PubChem namespace of ``identifier``; ``'cas'`` is
        searched as a name. When falsy the type is classified automatically.
    :param domain: Not used by this function.
    :return: Dict with the keys ``source``, ``Pubchem_CID``, ``pKa``,
        ``reference``, ``Substance_CASRN`` plus the structure properties
        returned by PubChem, or ``None`` if anything fails (no compound, no
        exact match, no pKa section, network error, ...).
    """
    global debug

    if len(sys.argv) == 2 and sys.argv[1] in ['--debug=True', '--debug=true', '--debug', '-d']:
        debug = True

    # Identify lookup source (Pubchem in this case)
    lookup_source = 'Pubchem'

    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}

        # Getting the CID via the pubchem api; the result is returned as a list.
        cids = []
        identifier_type = ''

        if not namespace:
            identifier_type = classify(identifier)

            # Structure identifiers (this could be a false smiles) are
            # searched in their own namespace, everything else as a name.
            if identifier_type in ['smiles', 'inchi', 'inchikey']:
                lookup_namespace = identifier_type
            else:
                lookup_namespace = 'name'
            lookup = pcp.get_cids(identifier, namespace=lookup_namespace)
            if lookup:
                cids.append(lookup[0])
        elif namespace == 'cas':
            cids = pcp.get_cids(identifier, namespace='name')
        else:
            cids = pcp.get_cids(identifier, namespace=namespace)

        if not cids:
            # Last resort: plain name search.
            lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])

            # NOTE(review): identifier_type only takes the caller's namespace
            # on this fallback path, so the exact-match checks below are
            # skipped when an explicit namespace succeeds directly - confirm
            # that this is intended.
            identifier_type = namespace

        # The api returns an empty list if it cannot find the compound.
        if len(cids) > 0:
            # if Pubchem found the result, get the first result of the list
            cid = cids[0]

            exact_match = True

            synonyms = pcp.get_synonyms(cid)[0]['Synonym'] or []

            # Extract CAS number from the list of synonyms
            returned_cas = ''
            for synonym in synonyms:
                cas_nr = re.search(r'^\d{2,7}-\d{2}-\d$', synonym)
                if cas_nr:
                    returned_cas = cas_nr.group()
                    break

            lookup_result = pcp.get_properties(['inchi', 'inchikey',
                                                'canonical_smiles', 'isomeric_smiles',
                                                'iupac_name'],
                                               cid)

            if identifier_type == 'cas':
                # Double check the CAS number against the synonym list.
                exact_match = identifier in synonyms
            elif identifier_type == 'inchi':
                exact_match = (identifier == lookup_result[0].get('InChI', False))
            elif identifier_type == 'inchikey':
                exact_match = (identifier == lookup_result[0].get('InChIKey', False))

            if not exact_match:
                if debug:
                    print(f'Exact match between input and Pubchem return value? {identifier in synonyms}')
                raise ValueError('This is not an exact match on Pubchem!')

            # URL of the PUG-View record restricted to the pKa section.
            # 'XML' can be replaced with 'JSON' but it is harder to parse
            # later on. More about Pubchem output types:
            # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865558
            pka_lookup_result_xml = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/XML?heading=Dissociation+Constants'.format(cid)

            # Get the html request info using CID number from pubchem
            r = requests.get(pka_lookup_result_xml, headers=headers, timeout=15)
            # Check to see if give OK status (200) and not redirect
            if r.status_code == 200 and len(r.history) == 0:
                # Use python XML to parse the return result
                tree = ET.fromstring(r.text)

                # Get the XML tree of <Information> only
                info_node = tree.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}Information')

                # Get the pKa reference:
                original_source = info_node.find('{http://pubchem.ncbi.nlm.nih.gov/pug_view}Reference').text
                # Get the pKa result:
                pka_result = info_node.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}String').text
                pka_result = re.sub(r'^pKa = ', '', pka_result)    # remove 'pka = ' part out of the string answer

                core_result = {
                    'source': lookup_source,
                    'Pubchem_CID': str(cid),
                    'pKa': pka_result,
                    'reference': original_source,
                    'Substance_CASRN': returned_cas,
                }
                extra_info = lookup_result[0]
                extra_info.pop('CID', None)    # Remove 'CID': ... from lookup_result[0]

                # Merge 2 dict: https://treyhunner.com/2016/02/how-to-merge-dictionaries-in-python/
                result = {**core_result, **extra_info}
                # Rename some keys in the dict
                s = pd.Series(result)
                s = s.rename({
                    'CanonicalSMILES': 'Canonical_SMILES',
                    'IsomericSMILES': 'Isomeric_SMILES',
                    'IUPACName': 'IUPAC_Name'
                })
                result = s.to_dict()
                return result

            else:
                raise RuntimeError('pKa not found in Pubchem.')

        else:
            raise RuntimeError('Compound not found in Pubchem.')

    except Exception as error:
        if debug:
            # Positional arguments: the ``etype`` keyword is gone on
            # Python 3.10+, which made debug mode fail inside this handler.
            traceback_str = ''.join(traceback.format_exception(type(error), error, error.__traceback__))
            print(traceback_str)

        return None
예제 #27
0
def extract_mol_from_pubchem(cas_nr):
    """Download the structure file for a CAS number from PubChem as ``<cas_nr>.mol``.

    The file is written into the directory named by the module-level
    ``download_path``. The CAS number is first looked up as a compound name;
    when no compound is found, PubChem substances are searched instead. In
    both cases the CAS number must appear in the record's synonym list before
    its SDF is accepted.

    Args:
        cas_nr: CAS registry number as a string, e.g. ``'12259-21-1'``.

    Returns:
        ``-1`` if a non-empty file for this CAS number already exists,
        ``0`` if a usable file was downloaded, or ``cas_nr`` itself when no
        usable structure could be obtained (including when any error occurs;
        the error is printed only if the module-level ``debug`` flag is set).
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }

    try:
        file_name = cas_nr + '.mol'
        download_file = Path(download_path) / file_name

        # Check the local copy first so an already-downloaded file costs no
        # network round trip and cannot be masked by a lookup failure.
        if download_file.exists() and download_file.stat().st_size != 0:
            return -1

        # Name lookup is an exact match by default and returns a list of CIDs
        # (empty when PubChem does not know the name).
        cids = pcp.get_cids(cas_nr, 'name')

        if cids:
            cid = cids[0]

            # Double check the hit: the CAS number must be listed among the
            # compound's synonyms, otherwise the match is not trusted.
            synonyms = pcp.get_synonyms(cid)[0]['Synonym']
            if cas_nr not in synonyms:
                raise ValueError('\tThis is not an exact match!')

            # PubChem PUG REST endpoint that serves the compound's SDF.
            get_sdf_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/sdf'.format(
                cid)
            r = requests.get(get_sdf_url, headers=headers, timeout=15)

            # Accept only a direct 200 answer (no redirects in the history).
            if r.status_code == 200 and len(r.history) == 0:
                download_file.write_text(data=r.text)
                if _mol_file_is_usable(download_file):
                    return 0
                return cas_nr

        else:
            # No compound found: fall back to substances, which pubchempy
            # returns as a list of Substance objects.
            substances = pcp.get_substances(cas_nr, 'name')
            if not substances:
                raise ValueError(
                    'Could not find any compounds or substances with this CAS {} on Pubchem.'
                    .format(cas_nr))

            for substance in substances:
                # Only accept a substance that lists this exact CAS number
                # among its synonyms.
                if cas_nr in substance.synonyms:
                    sdf = pcp.get_sdf(identifier=substance.sid,
                                      namespace='sid',
                                      domain='substance')
                    if sdf:  # pcp.get_sdf returns None when there is no SDF
                        download_file.write_text(data=sdf)
                        if _mol_file_is_usable(download_file):
                            return 0
                        # Otherwise keep trying the remaining substances.

        # Nothing usable was found (bad HTTP status, or no substance with a
        # matching CAS number and a valid SDF): report the CAS number back.
        return cas_nr

    except Exception as error:
        if debug:
            print('Error during search structure in Pubchem:\n\t{}'.format(
                error))
        return cas_nr


def _mol_file_is_usable(mol_path):
    """Return True if the just-written mol file is valid; delete it and return False otherwise."""
    # Read the leading bytes inside a context manager so the handle is closed
    # before the file is possibly removed.
    with open(mol_path, 'rb') as handle:
        head = handle.read(1024)

    # A binary payload (download error) or an empty mol file is useless.
    if is_binary_string(head) or is_empty_mol_file(mol_path):
        os.remove(mol_path)
        return False
    return True
예제 #28
0
def get_pubchem_id(name):
    """Return the first PubChem CID found for *name*, or ``""`` if the lookup fails.

    Args:
        name: Compound name passed to ``pcp.get_cids`` with the ``"name"``
            namespace.

    Returns:
        The first element of the list returned by ``pcp.get_cids``, or an
        empty string when the call raises or yields no result.
    """
    try:
        return pcp.get_cids(name, "name")[0]
    except Exception:
        # Best-effort lookup: any library/network error or an empty result
        # maps to "". Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate so a batch run can still be stopped.
        return ""
예제 #29
0
                                          'isomeric_smiles', 'inchi',
                                          'iupac_name', 'exact_mass'))
    tup_list = [item for item in legend.items()]
    lege = ''
    for tup in tup_list:
        lege += f'{str(tup)}\n'
    print(lege)
    image = MolsToImage([mol_formated_molecule],
                        subImgSize=(1200, 800),
                        fitimage=True,
                        legends=[lege])
    image.show()


# Interactive entry point: ask for a molecule name and resolve it to PubChem CIDs.
request = input('Search: molecule\n')
responce = pcp.get_cids(request, 'name')

if len(responce) < 1:
    # The name lookup returned no CID.
    print("I couldn't find what you were looking for")
elif len(responce) == 1:
    # Exactly one match: load the compound and describe it straight away.
    print('Searching')
    compound = pcp.Compound.from_cid(responce[0])
    describe_me(compound)
else:
    # Several matches: print each one as {first synonym: CID} and let the user pick.
    print('Compound	:	CID')
    for responces in responce:
        print({pcp.Compound.from_cid(responces).synonyms[0]: responces})
    choice = input('Select a CID to search\n')
    try:
        # int() raises ValueError for non-numeric input; the matching handler
        # is not visible in this excerpt.
        int(choice)
        compound = pcp.Compound.from_cid(int(choice))
예제 #30
0
#results = pcp.get_substances('ethanol','name')
#print(results)

#cids = pcp.get_cids('ethanol','name')
#print(cids)

#c = pcp.Compound.from_cid(cids[0])

#structure = c.inchi
#print(structure)

# For every compound name in `cmps` (defined outside this excerpt): resolve it
# on PubChem, save a PNG depiction, and print functional-group checks.
for cmp in cmps:
    print(cmp)
    # We'll just grab the first cid
    # NOTE(review): the [0] index raises IndexError if the lookup returns an
    # empty list — confirm every name in `cmps` resolves.
    cid = pcp.get_cids(cmp, 'name')[0]
    c = pcp.Compound.from_cid(cid)
    print(c.cid)
    # Save the image as images/<name with spaces replaced by underscores>.png,
    # overwriting any existing file.
    pcp.download('PNG',
                 'images/' + cmp.replace(" ", "_") + '.png',
                 c.cid,
                 'cid',
                 overwrite=True)
    # Build a molecule object from the compound's InChI string
    # (`Chem` is presumably RDKit's Chem module — confirm against the imports).
    m = Chem.MolFromInchi(c.inchi)
    #atoms_list = list(m.GetAtoms())
    #atoms = []
    #for i in range(len(atoms_list)):
    #    atoms.append(atoms_list[i])
    #print("Atoms: ", atoms)
    # `id_fg` is a helper defined outside this excerpt; its checks are printed as-is.
    print("Alcohol: ", id_fg.is_alcohol(m))
    print("COOH: ", id_fg.is_cooh(m))