Example No. 1
import pandas as pd
import pubchempy as pcp


def test_parse_pubchem_compounds(filename):
    """
    :param:  filename:      A string denoting the path to the metabolomics file
    :return: all_compounds: A Pandas dataframe containing the PubMed mappings
    """
    fileData = pd.read_excel(filename)
    fileData = fileData.drop('Compound Method', axis=1).join(
        fileData['Compound Method'].str.split(
            '/',
            expand=True).stack().reset_index(level=1,
                                             drop=True).rename('Metabolite'))

    fileData['Metabolite'] = fileData['Metabolite'].str.lower()
    pubChemQuery = fileData['Metabolite'].tolist()

    all_compounds = []
    for metabolite in pubChemQuery:
        try:
            df = pcp.get_substances(identifier=metabolite,
                                    namespace='name',
                                    as_dataframe=True)
            df['Name'] = metabolite
            print(df)
            all_compounds.append(df)
        except (KeyError, TimeoutError, pcp.TimeoutError):
            continue
    all_compounds = pd.concat(all_compounds)
    return all_compounds
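A minimal usage sketch for the helper above; the spreadsheet name and the slash-delimited 'Compound Method' column it assumes are placeholders, not part of the original example.

# Hypothetical call: build the PubChem substance map and persist it.
mappings = test_parse_pubchem_compounds('metabolomics_batch1.xlsx')  # placeholder file name
mappings.to_csv('pubchem_substance_mappings.csv', index=False)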
Example No. 2
import time

import numpy as np
import pandas as pd
import pubchempy as pcp


def addinchikey(excelpathin, excelpathout):
    '''
    The function reads an input Excel file and extracts the relevant columns 'Structure', 'Name' and 'Formula'.
    From 'Structure' it creates three more columns: inchikey, source_id and source_name.
    :param excelpathin: Path to the Excel file produced by Compound Discoverer, given as a raw string, e.g.
                        r'D:/BCDD/Documents/TalCompounds_export_test.xlsx'
    :param excelpathout: Path to the output Excel file written after the merge.
    :return: An Excel file with the added inchikey column.
    '''

    CD = pd.read_excel(excelpathin)
    CD = pd.DataFrame(CD, columns=['Structure', 'Name', 'Formula'])
    sdflist = CD.Structure

    # Loop over all cells in 'Structure'; if the value is NaN, record NaN for the new columns.
    # A short delay is added every 50 rows so that PubChem does not block the requests.
    newlistinchikey = []
    newlistsource_id = []
    newlistsource_name = []
    for idx, sdf in enumerate(sdflist):

        if idx % 50 == 0:
            time.sleep(3.25)

        if pd.isnull(sdf):
            # print(idx)
            newlistinchikey.append(np.nan)
            newlistsource_id.append(np.nan)
            newlistsource_name.append(np.nan)
        else:
            comp = pcp.get_compounds(sdf, 'sdf')
            substance = pcp.get_substances(comp[0].cid, 'sid')
            # print(comp)
            # print(substance)
            # comp[0].inchikey
            newlistinchikey.append(comp[0].inchikey)
            newlistsource_name.append(substance[0].source_name)
            newlistsource_id.append(substance[0].source_id)

    # Convert the lists to DataFrames, name their columns and concatenate them with the original data
    newlistinchikey = pd.DataFrame(newlistinchikey)
    newlistinchikey.columns = ['inchikey']
    newlistsource_name = pd.DataFrame(newlistsource_name)
    newlistsource_name.columns = ['source_name']
    newlistsource_id = pd.DataFrame(newlistsource_id)
    newlistsource_id.columns = ['source_id']

    CD = pd.concat([CD, newlistinchikey, newlistsource_name, newlistsource_id],
                   axis=1,
                   sort=False)

    # Export the merged data to an Excel file
    with pd.ExcelWriter(excelpathout, engine='xlsxwriter') as writer:
        CD.to_excel(writer, header=True)
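A sketch of how addinchikey might be called; the input path is the example from the docstring and the output path is a placeholder.

addinchikey(
    excelpathin=r'D:/BCDD/Documents/TalCompounds_export_test.xlsx',     # example path from the docstring
    excelpathout=r'D:/BCDD/Documents/TalCompounds_with_inchikey.xlsx')  # hypothetical output path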
Example No. 3
def queryPubChem(data):
    """
    queryPubChem maps the metabolite name from a pandas Dataframe in the 'Compound Method' column and extracts
    synoynms from several databases using the PubChem API.
    :param data:           A Pandas Dataframe of the metabolomics dataframe with the common metabolite identifiers
                           under the 'Compound Method' column
    :return all_compounds: A Pandas Dataframe from the PubChem API containing the metabolite map. This dataframe is
                           saved as a .csv file.
    :return queryList:     A string with semicolon delimters to be fed into a REST-API
    """
    import pubchempy as pcp

    # Split 'Compound Method' column by the '/' regex and clean up some data
    data = data.drop('Compound Method',
                     axis=1).join(data['Compound Method'].str.split(
                         '/', expand=True).stack().reset_index(
                             level=1, drop=True).rename('Metabolite'))
    data['Metabolite'] = data['Metabolite'].str.lower()
    pubChemQuery = data['Metabolite'].tolist()

    # Mine the PubChem database for synonyms
    all_compounds = []
    print(
        "Mapping metabolite names to PubChem database for synonym matching and ID retrieval."
    )

    # The data is too large to keep in memory, so each result is appended to a CSV file to be read back in later.
    for metabolite in pubChemQuery:
        try:
            df = pcp.get_substances(identifier=metabolite,
                                    namespace='name',
                                    as_dataframe=True)
            df['Name'] = metabolite
            df = df.applymap(str)
            df = df.drop('synonyms', axis=1).join(df['synonyms'].str.split(
                ',',
                expand=True).stack().reset_index(level=1,
                                                 drop=True).rename('synonyms'))
            df.to_csv('~/Data/Mappings/ME1/pubmed_me1_query.csv',
                      mode='a',
                      header=False,
                      index=False)

        except (KeyError, TimeoutError, pcp.TimeoutError):
            continue

    print("Finished metabolite common name -> identifier synoynm matching!")
Example No. 4
def request_pubchem(cas, name, en_name, trans):
    """Look up a compound on PubChem by CAS number, falling back to the (translated) English name.
    Returns a (data, structure, en_name) tuple.

    Relies on module-level names defined elsewhere in the source project: PC_SEARCH (the PubChem
    search URL), PC_COMPOUND_re (a regex over the resulting compound URL), _translate() and
    _get_structure(), plus the requests and pubchempy (pcp) imports.
    """
    print(cas, name, en_name)
    if en_name:
        en_name = _translate(en_name, trans)
    else:
        en_name = _translate(name.capitalize(), trans)
    cas = cas.strip()
    # print(name, '-->', en_name, '(en), CAS: {}'.format(cas))
    if cas:
        r = requests.get(PC_SEARCH, params={'term': 'CAS-{}'.format(cas)})
    else:
        r = requests.get(PC_SEARCH, params={'term': en_name})
    m = PC_COMPOUND_re.search(r.url)
    data = {}
    structure = ''
    if m is not None:
        cid = m.group(1)
        compound = pcp.Compound.from_cid(int(cid))
        data = compound.to_dict()
        structure = _get_structure(cid)
    else:
        # Try the same with the translated name
        r = requests.get(PC_SEARCH, params={'term': en_name})
        m = PC_COMPOUND_re.search(r.url)
        if m is not None:
            cid = m.group(1)
            compound = pcp.Compound.from_cid(int(cid))
            data = compound.to_dict()
            structure = _get_structure(cid)
        else:
            try:
                compound = pcp.get_compounds(en_name, 'name')[0]
                data = compound.to_dict()
                structure = _get_structure(str(compound.cid))
            except IndexError:
                # Try to find as substance
                try:
                    substance = pcp.get_substances(en_name, 'name')[0]
                    compound = pcp.Compound.from_cid(substance.cids[0])
                    data = compound.to_dict()
                    structure = _get_structure(str(compound.cid))
                except IndexError:
                    pass
    return data, structure, en_name
Example No. 5
import pubchempy as pcp


def search_pubchem_id(generic):
    # Search PubChem for substance and compound records matching a (generic) name.
    sub_results = pcp.get_substances(generic, 'name')
    comp_results = pcp.get_compounds(generic, 'name')
    sub_ids = []
    comp_ids = []
    if sub_results:
        for s_id in sub_results:
            # str(s_id) looks like "Substance(12345)"; slice out the numeric SID
            print("sid:", s_id, str(s_id)[10:-1])
            sub_ids.append(str(s_id)[10:-1])

    if comp_results:
        print(comp_results)
        for c_id in comp_results:
            # str(c_id) looks like "Compound(12345)"; slice out the numeric CID
            print("cid:", c_id, str(c_id)[9:-1])
            comp_ids.append([str(c_id)[9:-1], c_id.inchikey])
    if len(comp_ids) == 0:
        return None, sub_ids
    else:
        return comp_ids, sub_ids
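A short usage sketch; 'aspirin' is an arbitrary example query.

comp_ids, sub_ids = search_pubchem_id('aspirin')  # arbitrary example name
if comp_ids is None:
    print('No compound hits; substance IDs only:', sub_ids)
else:
    print('First compound hit (CID, InChIKey):', comp_ids[0])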
Example No. 6
def pubchem(compound, isCompound=True, report=False):
    # Requires pubchempy as pcp, Biopython's Entrez module and a project-specific
    # import_email() helper that supplies the e-mail address registered with NCBI.
    def drug_form(compound, report=False):
        '''Print a compound's data (pulled from the Entrez pccompound summary) and return it as a dict.'''
        cid = compound.cid
        Entrez.email = import_email()
        x = Entrez.esummary(db="pccompound", id=cid)  #Obtain data
        record = Entrez.read(x)
        #pharmaKeys = record[0].keys()
        #SynonymList, Id, MolecularFormula, MolecularWeight, HydrogenBondDonorCount, HydrogenBondAcceptorCount, PharmActionList,XLogP
        record = record[0]
        action = record['PharmActionList']
        weight = record['MolecularWeight']
        formula = record['MolecularFormula']
        donors = record['HydrogenBondDonorCount']
        acceptors = record['HydrogenBondAcceptorCount']
        XLogP = record['XLogP']
        names = record['SynonymList']
        rotatable = record['RotatableBondCount']
        '''weight = compound.molecular_weight
		formula = compound.molecular_formula
		names = compound.synonyms
		donors = compound.h_bond_donor_count
		acceptors = compound.h_bond_acceptor_count
		rotatable = compound.rotatable_bond_count
		XLogP = compound.xlogp'''
        if report == False:
            if len(names) >= 5: print("Names: " + str(names[0:5]))
            else: print("Names: " + str(names[0:len(names)]))
            print("Weight: " + str(weight))
            print("Formula: " + str(formula))
            print("Donors, acceptors and rotatables: " + str(donors) + ", " +
                  str(acceptors) + ", " + str(rotatable))
            print("XLogP: " + str(XLogP))
            if len(action) >= 10: print("Actions: " + str(action[0:10]))
            else: print("Actions: " + str(action))
        drug_data = {
            "weight": weight,
            "formula": formula,
            "names": names,
            "donors": donors,
            "acceptors": acceptors,
            "rotatable": rotatable,
            "XLogP": XLogP,
            'action': action
        }
        return drug_data

    if report == True:
        print("Importing relevant data from the query {}".format(compound))
    info = ""
    x = pcp.get_compounds(compound, 'name')
    if len(x) > 0:  #Checks if there is a compound with the name "compound"
        info = drug_form(x[0], report)
        return info
    else:
        if report == False:
            print(
                "No results have been found using get_compounds, proceeding with substance search..."
            )
        x = pcp.get_substances(compound, 'name')
        if len(x) > 0:  #If a substance is found, do this...
            if report == False:
                print("Substances found: " + str(x))
                print("\nFinding all synonyms...")
            info = []
            for every in range(len(x)):
                if report == False: print(x[every].synonyms)
                info.append(x[every].synonyms)
            return info
        else:
            if report == False: print("No drug information has been found")
            return info
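A hedged usage sketch, assuming Biopython and the project's import_email() helper are available; 'ibuprofen' is an arbitrary query.

drug_info = pubchem('ibuprofen')  # arbitrary example query
if isinstance(drug_info, dict):   # compound match: dict of properties
    print(drug_info['formula'], drug_info['weight'])
else:                             # substance-only match: list of synonym lists (or '' if nothing found)
    print(drug_info)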
Example No. 7
import json
import time

import fuzzymatcher
import numpy as np
import pandas as pd
import pubchempy as pcp


def addcols_joindata(excelpathin, jsonpathin, excelpathout):
    '''
    The function reads an input Excel file and extracts the relevant columns 'Structure', 'Name' and 'Formula'.
    From 'Structure' it creates three more columns: inchikey, source_id and source_name. It then takes the JSON
    file holding the parsed HMDB data and merges the modified Excel data with the JSON, first by inchikey,
    then by name and finally by chemical formula. This function combines the addinchikey and joindata
    functions into one.
    :param excelpathin: Path to the Excel file produced by Compound Discoverer, given as a raw string, e.g.
                        r'D:/BCDD/Documents/TalCompounds_export_test.xlsx'
    :param jsonpathin: Path to the JSON file (the parsed HMDB XML)
    :param excelpathout: Path to the output Excel file written after the merge.
    :return: The columns of the Excel file with added columns (e.g. disease name) from the JSON
    '''

    start_time = time.time()
    CD = pd.read_excel(excelpathin)
    # CD = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/MOD_REINJ_NEG_ChemSpider Results.xlsx')
    # CD = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Compounds_export_test.xlsx')
    CD = pd.DataFrame(CD[1000:2001], columns=['Structure', 'Name', 'Formula'])
    sdflist = CD.Structure

    # Loop over all cells in 'Structure'; if the value is NaN, record NaN for the new columns.
    # A short delay is added every 50 rows so that PubChem does not block the requests.
    newlistinchikey = []
    newlistsource_id = []
    newlistsource_name = []
    for idx, sdf in enumerate(sdflist):
        print(idx)

        if idx % 50 == 0:
            print("--- %s seconds --f-time to %s rows" %
                  ((time.time() - start_time), idx))

            time.sleep(3.25)

        if pd.isnull(sdf):
            # print(idx)
            newlistinchikey.append(np.nan)
            newlistsource_id.append(np.nan)
            newlistsource_name.append(np.nan)
        else:
            comp = pcp.get_compounds(sdf, 'sdf')

            # If comp[0] is an empty Compound(), then comp[0].cid is None
            if comp[0].cid is None:
                substance = []
                newlistinchikey.append(np.nan)

            else:
                substance = pcp.get_substances(comp[0].cid, 'sid')
                # print(comp)
                # print(substance)
                # comp[0].inchikey
                newlistinchikey.append(comp[0].inchikey)

            # The if statement is in case substance= [] (empty) -> then len(substance)=0
            if len(substance) > 0:
                newlistsource_name.append(substance[0].source_name)
                newlistsource_id.append(substance[0].source_id)
            else:
                newlistsource_name.append(np.nan)
                newlistsource_id.append(np.nan)

    # Convert the lists to DataFrames, name their columns and concatenate them with the original data
    newlistinchikey = pd.DataFrame(newlistinchikey)
    newlistinchikey.columns = ['InChIKey']
    newlistsource_name = pd.DataFrame(newlistsource_name)
    newlistsource_name.columns = ['source_name']
    newlistsource_id = pd.DataFrame(newlistsource_id)
    newlistsource_id.columns = ['source_id']

    CD = pd.concat([CD, newlistinchikey, newlistsource_name, newlistsource_id],
                   axis=1,
                   sort=False)
    print("--- %s seconds --f-add 3 cols" % (time.time() - start_time))

    # From here is the joindata function with modification
    # Load the parse HMDB file
    with open(jsonpathin, 'r') as read_file:
        data = json.load(read_file)

    start_time = time.time()
    # Load the parse HMDB file
    # with open('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Parser_HMDB.py Output/serum_metabolites.json', 'r') as read_file:
    #     data = json.load(read_file)

    # Create a data frame from the list of dictionaries
    # df_hmdb = pd.DataFrame(data,  columns=['accession', 'name', 'chemical_formula', 'inchikey', 'disease_name' ])
    df_hmdb = pd.DataFrame(data)
    df_hmdb = df_hmdb.drop(
        ['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'],
        axis=1)

    df_excel = CD
    # Merge by inchikey
    joindata_by_inchikey = pd.merge(left=df_excel,
                                    right=df_hmdb,
                                    how='inner',
                                    left_on='InChIKey',
                                    right_on='inchikey')

    print("--- %s seconds --f-merge by inchikey " % (time.time() - start_time))

    start_time = time.time()
    # Reduce to the rows for which we did NOT find a match by inchikey in both data sets
    df_hmdb_reduce_byinchik = df_hmdb.loc[~df_hmdb['inchikey'].
                                          isin(df_excel['InChIKey'])]
    df_excel_reduce_byinchik = df_excel.loc[
        ~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]

    # joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
    joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik,
                                                    df_hmdb_reduce_byinchik,
                                                    left_on="Name",
                                                    right_on="name")

    # Keep only matches above the best_match_score threshold (0.55 here; may need adjustment)
    joindata_by_name = joindata_by_name[
        joindata_by_name['best_match_score'] > 0.55]
    # Drop the fuzzymatcher bookkeeping columns
    joindata_by_name.drop(['best_match_score', '__id_left', '__id_right'],
                          axis=1,
                          inplace=True)
    print("--- %s seconds --f-merge by name" % (time.time() - start_time))

    start_time = time.time()
    # Reduce to the rows for which we did NOT find a match, by inchikey or by name, in both data sets
    df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[
        ~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
    df_excel_reduce_byname = df_excel_reduce_byinchik.loc[
        ~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])]
    # Remove spaces inside the 'Formula' strings (this assignment can raise a SettingWithCopyWarning)
    df_excel_reduce_byname.loc[:, 'Formula'] = df_excel_reduce_byname[
        'Formula'].str.replace(' ', '')

    # Merge by chemical_formula
    joindata_by_CF = pd.merge(left=df_excel_reduce_byname,
                              right=df_hmdb_reduce_byname,
                              how='inner',
                              left_on='Formula',
                              right_on='chemical_formula')

    # This data includes the rows from the original Excel file for which no match was found (by inchikey, name or CF)
    df_excel_reduce_byCF = df_excel_reduce_byname.loc[
        ~df_excel_reduce_byname['Formula'].
        isin(joindata_by_CF['chemical_formula'])]

    # Create a list of all columns of the HMDB JSON data
    colnames = joindata_by_inchikey.columns[6:]
    # Add those names as empty columns to df_excel_reduce_byCF: 'reducedata' holds all the rows from the original
    # Excel file that found no match, with the (empty) HMDB columns appended
    reducedata = df_excel_reduce_byCF.reindex(
        columns=[*df_excel_reduce_byCF.columns.tolist(), *colnames])

    # Append all the data sets (DataFrame.append is deprecated/removed; pd.concat is equivalent here)
    out = pd.concat(
        [joindata_by_inchikey, joindata_by_name, joindata_by_CF, reducedata])

    print("--- %s seconds --f-merge by CF" % (time.time() - start_time))
    # Export the merged data to an Excel file
    # writer = pd.ExcelWriter('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/MOD_REINJ_NEG_ChemSpider ResultsW HMDB_0_1000.xlsx', engine='xlsxwriter')
    with pd.ExcelWriter(excelpathout, engine='xlsxwriter') as writer:
        out.to_excel(writer, header=True)

    return out
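A hypothetical call; the input Excel path is the example from the docstring, while the JSON and output paths are placeholders.

merged = addcols_joindata(
    excelpathin=r'D:/BCDD/Documents/TalCompounds_export_test.xlsx',   # example path from the docstring
    jsonpathin=r'D:/BCDD/Documents/serum_metabolites.json',           # placeholder parsed-HMDB JSON
    excelpathout=r'D:/BCDD/Documents/TalCompounds_with_HMDB.xlsx')    # placeholder output path
print(merged.shape)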
Example No. 8
def insert_interaction(interaction_name):
    """Insert an interaction record, trying in turn the existing DB entries, PubChem substance lookup,
    drug-class matching and UMLS codes.

    Relies on project-specific helpers defined elsewhere in the source (translate, get_google_spelling,
    get_interaction, get_composition, insert_interction_db, get_therapeutic_from_names,
    get_drug_class_from_drugs_site, get_therapeutic_arabic_name, get_code, check), plus the
    re, pkg_resources and pubchempy (pcp) imports.
    """
    print(interaction_name)
    translated = translate(interaction_name, 'en', 'ar')
    translated_spelled = get_google_spelling(translated)

    interaction_id = get_interaction(interaction_name)
    if interaction_id:
        return interaction_id[0][0]

    comp_id = get_composition(interaction_name)
    if comp_id:

        interaction_id = insert_interction_db(interaction_name, translated_spelled,  "0")
        return interaction_id

    thera_id = get_therapeutic_from_names(interaction_name, translated_spelled)
    if thera_id:
        thera_id = insert_interction_db(interaction_name, translated_spelled,  "1")
        return thera_id


    try:
        results = pcp.get_substances(translated_spelled, 'name')
        if len(results) > 0:
            print("from pcp")
            interaction_id = insert_interction_db(interaction_name, translated_spelled, "0")
            return interaction_id
        else:
            res = re.sub(" - ", "", translated_spelled)
            res = re.sub("-", "", res)
            res = re.sub(r"/\s+/", " ", res)
            res = re.sub(r"[^a-zA-Z ]", "", res)
            res = res.strip()
            print(res)
            thera = get_drug_class_from_drugs_site(res)
            if len(thera) > 0:
                print("thera now " , thera[0])
                thera_ar = get_therapeutic_arabic_name(thera[0])
                if thera_id:
                    interaction_id = insert_interction_db(thera_ar, translated_spelled, "1")
                    return interaction_id
                else:
                    file_en_dir = "utils/drug_classes_drug.txt"
                    file_en_dir = pkg_resources.resource_filename(__name__, file_en_dir)
                    file_en = open(file_en_dir, 'r', encoding="utf-8")
                    parts_en = file_en.read()

                    file_ar_dir = "utils/drug_classes_ar.txt"
                    file_ar_dir = pkg_resources.resource_filename(__name__, file_ar_dir)
                    parts_en = re.split('[\n]', parts_en)
                    file_ar = open(file_ar_dir, 'r', encoding="utf-8")
                    parts_ar = file_ar.read()
                    # Split the Arabic file into lines so parts_ar[i] lines up with parts_en[i]
                    parts_ar = re.split('[\n]', parts_ar)
                    drug_site_class = re.sub(" agents", " ", thera[0])
                    drug_site_class = re.sub("agents ", " ", drug_site_class)
                    drug_site_class = re.sub(" for ", " ", drug_site_class)
                    drug_site_class = re.sub(" drugs", " ", drug_site_class)
                    drug_site_class = re.sub("drugs ", " ", drug_site_class)
                    drug_site_class = re.sub("-", "", drug_site_class)

                    for i in range(len(parts_en)):
                        parts_en[i] = parts_en[i].split('.')[0]
                        parts_en[i] = re.sub(" agents", " ", parts_en[i])
                        parts_en[i] = re.sub("agents ", " ", parts_en[i])
                        parts_en[i] = re.sub(" for ", " ", parts_en[i])
                        parts_en[i] = re.sub(" drugs", " ", parts_en[i])
                        parts_en[i] = re.sub("drugs ", " ", parts_en[i])
                        parts_en[i] = re.sub("-", "", parts_en[i])

                        if drug_site_class.strip().lower() == parts_en[i].strip().lower():
                            print("matched with thera", parts_ar[i])
                            interaction_id = insert_interction_db(parts_ar[i], translated_spelled, "1")
                            return interaction_id

                    print("couldnt match with thera ")
                    print(interaction_name, translated_spelled)
                    interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")
                    return interaction_id

            else:
                print("no thera came back")
                interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")

            return interaction_id

    except Exception as e:
        results, found = get_code(interaction_name)
        if found and len(results) > 0:
            categories = ["Biologically Active Substance", "Pharmacologic Substance", "Element, Ion, or Isotope",
                          "Organic Chemical", "Antibiotic"]
            found_translation = check(results[0][0], categories)
            if found_translation:
                print("from umls")
                interaction_id = insert_interction_db(interaction_name, translated_spelled, "0")
                return interaction_id

        res = re.sub(" - ", "", translated_spelled)
        res = re.sub("-", "", res)
        res = re.sub(r"/\s+/", " ", res)
        res = re.sub(r"[^a-zA-Z ]", "", res)
        res = res.strip()
        print(res)
        thera = get_drug_class_from_drugs_site(res)
        if len(thera) > 0:
            print("thera now ", thera[0])
            thera_ar = get_therapeutic_arabic_name(thera[0])
            if thera_id:
                print("from thera")
                interaction_id = insert_interction_db(thera_ar, translated_spelled, "1")
                return interaction_id
            else:
                file_en_dir = "utils/drug_classes_drug.txt"
                file_en_dir = pkg_resources.resource_filename(__name__, file_en_dir)
                file_en = open(file_en_dir, 'r', encoding="utf-8")
                parts_en = file_en.read()

                file_ar_dir = "utils/drug_classes_ar.txt"
                file_ar_dir = pkg_resources.resource_filename(__name__, file_ar_dir)
                parts_en = re.split('[\n]', parts_en)
                file_ar = open(file_ar_dir, 'r', encoding="utf-8")
                parts_ar = file_ar.read()
                # Split the Arabic file into lines so parts_ar[i] lines up with parts_en[i]
                parts_ar = re.split('[\n]', parts_ar)

                for i in range(len(parts_en)):
                    drug_site_class = re.sub(" agents", " ", thera[0])
                    drug_site_class = re.sub("agents ", " ", drug_site_class)
                    drug_site_class = re.sub(" for ", " ", drug_site_class)
                    drug_site_class = re.sub(" drugs", " ", drug_site_class)
                    drug_site_class = re.sub("drugs ", " ", drug_site_class)
                    drug_site_class = re.sub("-", "", drug_site_class)

                    if drug_site_class.strip().lower() == parts_en[i].strip().lower():
                        print("matched with thera", parts_ar[i])
                        interaction_id = insert_interction_db(parts_ar[i], translated_spelled, "1")
                        return interaction_id

                print("couldnt match with thera ")
                interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")
                return interaction_id

        else:
            print("no thera came back")
            interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")

        return interaction_id
Example No. 9
def extract_mol_from_pubchem(cas_nr):
    # Relies on module-level names defined elsewhere in the source project: download_path, debug,
    # is_binary_string() and is_empty_mol_file(), plus the requests, os, pathlib.Path and
    # pubchempy (pcp) imports.
    global download_path
    headers = {
        'user-agent':
        'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }

    try:
        # print('\tSearching Pubchem...')

        # Using pubchem api for python
        # Get the CID number; by default the lookup is an exact match, and the result is returned as a list.
        # cid = pcp.get_cids(cas_nr, 'name', 'substance', list_return='flat')
        cid = pcp.get_cids(cas_nr, 'name')

        file_name = cas_nr + '.mol'
        download_file = Path(download_path) / file_name

        # Skip the download if the file already exists and is not empty
        # check file exists: https://stackoverflow.com/questions/82831/how-do-i-check-whether-a-file-exists
        if download_file.exists() and os.stat(download_file).st_size != 0:
            # print('{} already downloaded'.format(file_name))
            return -1

        else:

            # This API returns an empty list if it cannot find cas_nr; check whether PubChem has this chemical.
            if len(cid) > 0:
                # if Pubchem found the result, get the first result of the list
                cid = cid[0]
                # print('Compound ID (CID) from PubChem is: {} and type is: {}'.format(cid, type(cid)))

                # To double check if the CAS number is correct:
                # using pubchem api, get a list of synonym. The result is a list of dict.
                # choose the first result and check first 5 values for 'Synonym' key:
                # synonyms = pcp.get_synonyms(cid)[0]['Synonym'][:7]
                synonyms = pcp.get_synonyms(cid)[0]['Synonym']
                # print('List of synonyms is: {}'.format(synonyms)); exit(0)

                if cas_nr not in synonyms:
                    raise ValueError('\tThis is not an exact match!')

                # Build the PubChem PUG REST URL used to download the SDF file
                get_sdf_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/sdf'.format(
                    cid)

                # # Check if the file not exists and download
                # # check file exists: https://stackoverflow.com/questions/82831/how-do-i-check-whether-a-file-exists
                # if download_file.exists():
                #     # print('{} already downloaded'.format(file_name))
                #     return -1
                # else:

                # # Another way to get sdf, from pubchempy ---------------------------------------
                #     sdf = pcp.get_sdf(cid)
                #     with open('159857-81-5.mol', 'w') as f:
                #         f.write(sdf)
                # # ----------------------------------------------------------------------------------

                # Get the html request info using CID number from pubchem
                r = requests.get(get_sdf_url, headers=headers, timeout=15)
                # print('url is: {}'.format(get_sdf_url))

                # Check to see if give OK status (200) and not redirect
                if r.status_code == 200 and len(r.history) == 0:
                    download_file.write_text(data=r.text)

                    # Check if the mol file is a binary string (some error during downloading) or empty mol file:
                    if is_binary_string(open(download_file, 'rb').read(
                            1024)) or is_empty_mol_file(download_file):
                        os.remove(download_file)  # remove the error mol file
                        return cas_nr
                    else:
                        return 0

            # If not, try to find substances as well
            elif len(cid) == 0:
                '''pcp.get_substances(cas_nr, 'name') returns a list of Substances if found: 
                Ref: https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L328'''
                substances = pcp.get_substances(cas_nr, 'name')
                # print(sid); exit(0)

                if len(substances) == 0:
                    # print('nothing here')
                    raise ValueError(
                        'Could not find any compounds or substances with this CAS {} on Pubchem.'
                        .format(cas_nr))
                else:
                    for substance in substances:
                        # print('Substance ID (SID) from PubChem is: {} and type is: {}'.format(substance, type(substance)))
                        '''Ref: https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L735'''
                        # substance_synonyms = substance.to_dict(properties=['synonyms'])['synonyms']
                        '''
                        substance.to_dict(properties=['synonyms']) return example:
                        {'synonyms': ['12259-21-1', 'Iron oxide (Fe2O3), hydrate', 'Ferric oxide hydrate', 
                                        'Ferrox', 'Hydrated ferric oxide', 'Hydrous ferric oxide', 
                                        'Iron oxide (Fe203), hydrate']}
                        '''

                        substance_synonyms = substance.synonyms  # https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L1095
                        '''
                        substance.synonyms' return example:
                            ['12259-21-1', 'Iron oxide (Fe2O3), hydrate', 'Ferric oxide hydrate', 
                            'Ferrox', 'Hydrated ferric oxide', 'Hydrous ferric oxide', 
                            'Iron oxide (Fe203), hydrate']
                        '''

                        # Check to make sure the substance has the same CAS#
                        if cas_nr in substance_synonyms:
                            sdf = pcp.get_sdf(identifier=substance.sid,
                                              namespace='sid',
                                              domain='substance')
                            # print(sdf)
                            if sdf:  # pcp.get_sdf return None if not found SDF
                                download_file.write_text(data=sdf)

                                # Check if the mol file is a binary string (some error during downloading) or empty mol file:
                                if is_binary_string(
                                        open(download_file, 'rb').read(1024)
                                ) or is_empty_mol_file(download_file):
                                    os.remove(download_file
                                              )  # remove the error mol file
                                else:
                                    return 0

            # If none of the Substances has the same CAS and/or has SDF (mol) file, then return the CAS #
            return cas_nr

    except Exception as error:
        # print('.', end='')
        if debug:
            print('Error during search structure in Pubchem:\n\t{}'.format(
                error))
        return cas_nr
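A hedged sketch of how the function might be driven; download_path and debug are the module-level names it expects, the helper stand-ins exist only so the sketch is self-contained, and the CAS number (aspirin) is an arbitrary example.

import os

download_path = '/tmp/mol_files'   # hypothetical download directory (module-level global)
debug = False
os.makedirs(download_path, exist_ok=True)

# Minimal stand-ins for the project's helpers, only for this sketch:
def is_binary_string(data):
    return b'\x00' in data

def is_empty_mol_file(path):
    return os.stat(path).st_size == 0

result = extract_mol_from_pubchem('50-78-2')   # CAS number for aspirin, as an example
print(result)   # 0 on success, -1 if already downloaded, otherwise the CAS number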