def test_get_cid(self):
    """Check that `get_cid` and `get_cids` return the expected values."""
    expected = (("64-17-5", 702), ("141-78-6", 8857), ("110-01-0", 1127))

    # One identifier at a time.
    for identifier, cid in expected:
        self.assertEqual(get_cid(identifier), cid)

    # The same identifiers submitted as a single batch; the result is
    # indexed by the identifier that was passed in.
    cids = get_cids([identifier for identifier, _ in expected])
    for identifier, cid in expected:
        self.assertEqual(cids[identifier], cid)
# NOTE(review): this fragment starts partway through the notebook; `page` and
# `info` are defined earlier (presumably by an enclosing loop over result
# pages) -- confirm against the full file before relying on this layout.
url = 'https://senselab.med.yale.edu/OdorDB/Browse?db=5&cl=1&page=%d' % page
# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urlopen(url) as f:
    html = f.read()
soup = bs4.BeautifulSoup(html)
table = soup.find('table')
# Collect the text and the first link of every <span> in the first table.
for span in table.find_all('span'):
    name = span.text.strip()
    link = span.find('a').get('href')
    info.append((name, link))

# Make into a dataframe
df = pd.DataFrame.from_records(info, columns=['name', 'url'])
df.head()

# Get CIDs by searching the names
cids = odorants.get_cids(df['name'], kind='name')

# Add these CIDs to the dataframe
df = df.set_index('name').join(pd.Series(cids, name='CID'))

# Get CAS strings for compounds for which no CID was found based on the name
for name, url_suffix in df[df['CID'] == 0]['url'].items():
    url = 'https://senselab.med.yale.edu/OdorDB/%s' % url_suffix
    with urlopen(url) as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html)
    table = soup.find('table')
    # The CAS value is read from the last <span> of the sixth table row;
    # these positions are tied to the page layout.
    cas_row = table.find_all('tr')[5]
    cas_text = cas_row.find_all('span')[-1].text
    cas = cas_text.replace('\r\n', '').strip()
    df.loc[name, 'CAS'] = cas
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import odorants

# Read the tab-separated GRAS table; the file has no header row, so the two
# columns are named here.
file_path = os.path.join(pyrfume.DATA, 'GRAS.smi')
gras_data_raw = pd.read_csv(file_path, sep='\t', header=None, names=['SMILES', 'CAS'])

# Look up a CID for every SMILES string.
results = odorants.get_cids(gras_data_raw['SMILES'], kind='SMILES', verbose=False)

# Turn the lookup results into a one-column frame and join the raw table
# onto it through the raw table's SMILES index.
cid_frame = pd.Series(results, name='CID').to_frame()
gras_data = cid_frame.join(gras_data_raw.set_index('SMILES'))
gras_data.head()

# Write the combined table next to the input file.
file_path = os.path.join(pyrfume.DATA, 'gras.csv')
gras_data.to_csv(file_path)
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Add CIDS to parsed_threshold_data_in_air.csv

import pandas as pd
import pyrfume
from pyrfume.odorants import get_cid, get_cids
from rickpy import ProgressBar

# Load the threshold table and index it by its canonical SMILES column.
df = pyrfume.load_data('thresholds/parsed_threshold_data_in_air.csv')
df = df.set_index('canonical SMILES')

# Look up a CID for every SMILES string in the index.
smiles_cids = get_cids(df.index, kind='SMILES')

# Attach the lookup results as a new 'CID' column.
cid_column = pd.Series(smiles_cids, name='CID')
df = df.join(cid_column)
df.head()

from rdkit.Chem import MolFromSmiles, MolToSmiles

# Keep a copy of the index as a regular column.
df['SMILES'] = df.index

# For every SMILES whose CID came back as 0, re-generate an isomeric SMILES
# string with RDKit; an empty string is used when RDKit cannot parse it.
# NOTE(review): the loop body may continue past the end of this fragment, so
# its control flow is left exactly as written.
p = ProgressBar(len(smiles_cids))
for i, (old, cid) in enumerate(smiles_cids.items()):
    p.animate(i, status=old)
    if cid == 0:
        mol = MolFromSmiles(old)
        if mol is None:
            new = ''
        else:
            new = MolToSmiles(mol, isomericSmiles=True)
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# #### Basically, I just started from the mergedOdorants file that Joel sent me, and converted SMILES strings to CIDs

import os
import pandas as pd
import pyrfume
from pyrfume import odorants

# Load the merged odorant table, using its first column as the index.
file_path = os.path.join(pyrfume.DATA, 'mergedOdorants.csv')
df = pd.read_csv(file_path, index_col=0)

# Look up a CID for each entry of the 'NAME' column (queried as SMILES).
smiles_cids = odorants.get_cids(df['NAME'], kind='smiles')

# Build a frame of the lookup results and join the original table onto it
# through the original table's 'NAME' index.
cid_frame = pd.Series(smiles_cids, name='PubChemID').to_frame()
df = cid_frame.join(df.set_index('NAME'))

# Write one CID-only file per library: (value in the 'lib' column) -> file stem.
library_files = {'goodscent': 'goodscents', 'arc': 'arctander'}
for lib, name in library_files.items():
    file_path = os.path.join(pyrfume.DATA, '%s_cids.txt' % name)
    lib_cids = df.loc[df['lib'] == lib, 'PubChemID']
    # De-duplicate, drop the 0 entries, and sort ascending.
    cids = sorted(set(lib_cids) - {0})
    pd.Series(cids, name='CID').to_csv(file_path, header=True, index=False)

# Write the full table, now indexed by name with the CID column included.
file_path = os.path.join(pyrfume.DATA, 'mergedOdorants_with_cids.csv')
df.to_csv(file_path)
# NOTE(review): this fragment begins partway through the notebook; `name`,
# `new_names` and `old_names` are defined earlier, presumably by an enclosing
# loop over the original names -- confirm against the full file.

# Remove extraneous hyphens: a hyphen is replaced by a space unless it is
# preceded by a digit or '(' or followed by a digit.
# The pattern is a raw string so that `\(` reaches the regex engine unchanged
# instead of triggering Python's invalid-escape-sequence warning.
name = re.sub(r'(?<![0-9\(])-(?![0-9])', ' ', name)

# Add back hyphens after prefixes
for x in [
        'alpha', 'beta', 'gamma', 'delta', 'tert', 'L', 'D', 'm', 'o', 'p',
        'cis', 'trans', 'sec'
]:
    name = name.replace('%s ' % x, '%s-' % x)

# Move isomeric identifiers to the front of the name
for x in ['(-)', '(+)']:
    if x in name:
        name = '%s-%s' % (x, name.replace(x, ''))
new_names.append(name)
#print(name)

# Look up CIDs for the cleaned-up names and tabulate them next to the
# original names.
cids = get_cids(new_names)
df = pd.Series(cids, name='CID').to_frame()
df['Old Name'] = old_names
df.index.name = 'Name'
df = df.reset_index()
df.head()

# Show the rows whose CID is 0.
df[df['CID'] == 0]

# Hand-assigned CIDs, addressed by the integer row labels created by
# reset_index() above (presumably rows from the listing above -- verify).
df.loc[67, 'CID'] = 19309
df.loc[76, 'CID'] = 11160
df.loc[79, 'CID'] = 8092
df.loc[155, 'CID'] = 28500
df.loc[170, 'CID'] = 251531
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import snitz, odorants

# Both the input and the output live in the 'snitz' data directory.
snitz_dir = os.path.join(pyrfume.DATA, 'snitz')

# Load the raw table.
file_path = os.path.join(snitz_dir, 'Snitz144.csv')
snitz_data_raw = pd.read_csv(file_path)

# Look up a CID for every value of the 'CAS' column (queried with kind='name').
results = odorants.get_cids(snitz_data_raw['CAS'], kind='name', verbose=False)

# Turn the lookup results into a one-column frame and join the raw table
# onto it through the raw table's 'CAS' index.
cid_frame = pd.Series(results, name='CID').to_frame()
snitz_data = cid_frame.join(snitz_data_raw.set_index('CAS'))
snitz_data.head()

# Write the combined table.
file_path = os.path.join(snitz_dir, 'snitz.csv')
snitz_data.to_csv(file_path)
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import sigma_ff, odorants

# Load raw Sigma FF data
descriptors, data = sigma_ff.get_data()

# Turn CAS into CIDs (the keys of `data` are queried with kind='name')
cas_list = list(data.keys())
results = odorants.get_cids(cas_list, kind='name', verbose=False)

# Format Sigma FF data into a DataFrame with one row per CAS key, a 'CID'
# column, and one column per descriptor, all initialised to 0.
# Odorants without CIDs will have a CID of 0
columns = ['CID'] + descriptors
sigma = pd.DataFrame(0, index=cas_list, columns=columns)
sigma.index.name = 'CAS'
for cas, desc in data.items():
    # Record the looked-up CID and flag this entry's descriptor column(s).
    sigma.loc[cas, 'CID'] = results[cas]
    sigma.loc[cas, desc] = 1
sigma.head()

# Save the CID-annotated table
file_path = os.path.join(pyrfume.DATA, 'sigma', 'sigma.csv')
sigma.to_csv(file_path)
# Delete those overflow rows
df = df.loc[~df.index.isin(overflow_indices)]

# Fix problematic CAS numbers
# The pattern is a raw string (the `\-` escapes would otherwise trigger
# Python's invalid-escape-sequence warning) and is compiled once, outside
# the loop.
cas_pattern = re.compile(r'[0-9]+\-[0-9]+\-[0-9]+')
for index, cas in df['CAS number'].items():
    if not cas_pattern.match(cas):
        print("Fixing %s" % cas)
        # Entries that fail the check are retried with parentheses removed.
        cas = cas.replace('(', '').replace(')', '')
        assert cas_pattern.match(cas)
        df.loc[index, 'CAS number'] = cas

# + jupyter={"outputs_hidden": true}
# Get CIDs for these CAS numbers
# Many of these CAS numbers are for substances, not compounds, and so have SIDs instead (not yet supported)
cas_cids_dict = get_cids(df['CAS number'])
# -

# Add CIDs to the dataframe
for cas, cid in cas_cids_dict.items():
    df.loc[df['CAS number'] == cas, 'CID'] = cid

# Convert CIDs to integers
# Assign the whole column rather than `df.loc[:, 'CID'] = ...`: the `.loc`
# form writes into the existing column in place, which on recent pandas
# versions keeps its float dtype.
df['CID'] = df['CID'].astype(int)
df.head()

# Use CID as the index and sort
df = df.set_index('CID').sort_index()
df.head()

pyrfume.save_data(df, 'IFRA_FIG/ifra_fig.csv')
# display_name: Python 3 # language: python # name: python3 # --- import os import pandas as pd import pyrfume from pyrfume import odorants from rdkit import Chem file_path = os.path.join(pyrfume.DATA, 'westeros', 'molecules.csv') leffingwell_data_raw = pd.read_csv(file_path, sep='\t') results = odorants.get_cids(leffingwell_data_raw['smiles'], kind='SMILES', verbose=False) leffingwell_data = pd.Series(results, name='CID').to_frame().join( leffingwell_data_raw.set_index('smiles')) leffingwell_data.head() for smiles in leffingwell_data[leffingwell_data['CID'] == 0].index: name = leffingwell_data.loc[smiles, 'chemical_name'] mol = Chem.MolFromSmiles(smiles) if mol is None: print("Bad smiles: %s" % smiles) else: smiles = Chem.MolToSmiles(mol, isomericSmiles=True) cid = odorants.get_cid(smiles, kind='smiles', verbose=True) if cid:
# ---

# +
import bs4
import os
import pandas as pd
from urllib.request import urlopen

import pyrfume
from pyrfume import odorants
# -

# Download the Flavornet CAS listing; the response is used as a context
# manager so the connection is closed once the body has been read.
url = 'http://www.flavornet.org/cas.html'
with urlopen(url) as f:
    html = f.read()
soup = bs4.BeautifulSoup(html)

# Take the text of the first cell of every row in the first table,
# skipping the first row (presumably a header row -- verify).
rows = soup.find('table').find_all('tr')
cas_list = [row.find('td').text for row in rows[1:]]

# Look up a CID for each collected value (queried with kind='name').
cids = odorants.get_cids(cas_list, kind='name')

df = pd.Series(cids, name='CID').to_frame()
df.head()

file_path = os.path.join(pyrfume.DATA, 'flavornet.csv')
df.to_csv(file_path)
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import odorants

# Read the tab-separated file (no header row) and keep only its first
# column, labelled 'SMILES' because it is queried as SMILES below.
file_path = os.path.join(pyrfume.DATA, 'PrestwickChemLib.smi')
prestwick_data = pd.read_csv(file_path, header=None, sep='\t')[0].rename('SMILES')
prestwick_data.head()

# Pass the Series itself: it is a single column with an integer index, so
# the previous `prestwick_data['SMILES']` lookup raised KeyError.
results = odorants.get_cids(prestwick_data, kind='SMILES', verbose=False)

# One-column frame of the lookup results. The earlier join against the raw
# Series followed by `[['CID']]` selected exactly this frame, so it is
# built directly.
prestwick_data = pd.Series(results, name='CID').to_frame()
prestwick_data.head()

# Save the CID table
file_path = os.path.join(pyrfume.DATA, 'prestwick.csv')
prestwick_data.to_csv(file_path)