def load_raw_data(food, load):
    food_data = pd.read_csv(config.mfp(f'data/{food}_data.csv'), encoding='latin1')
    food_scoring = pd.read_csv(config.mfp(f'data/{food}_scoring.csv'), encoding='latin1')

    # Need to remove phenol explorer ids that were manually put into data (for garlic only)
    food_data = food_data[food_data.PMID.isin(food_scoring.PMID.tolist())]
    food_data.chemical = food_data.chemical.str.lower()
    food_data.amount = food_data.amount.str.replace(',', '')
    food_data = food_data.merge(food_scoring[['PMID', 'is_useful']], how='left', on='PMID')

    if report:
        report_stat(
            f'Number of papers in search {food}: ' + str(len(food_scoring)),
            f'num_papers_srch_{food}.txt')
        report_stat(
            f'Number of papers reviewed {food}: ' +
            str(len(food_scoring[food_scoring.is_useful.notnull()])),
            f'num_reviewed_papers_{food}.txt')
        report_stat(
            f'Number of unique papers {food}: ' +
            str(len(food_data['PMID'].drop_duplicates())),
            f'num_unique_papers_{food}.txt')
        report_stat(f'Total number of records {food}: ' + str(len(food_data)),
                    f'num_records_{food}.txt')

    return food_data, food_scoring

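# Illustrative usage (a sketch, not part of the original pipeline): assuming
# data/garlic_data.csv and data/garlic_scoring.csv exist and the module-level
# `report` flag is defined, the raw literature records and their paper-level
# usefulness scores can be loaded with:
#
#     garlic_data, garlic_scoring = load_raw_data('garlic', load=False)
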
def append_keys_raw_data(food_data, food, load):
    if load:
        food_data = pd.read_pickle(config.mfp(f'data/{food}_food_data.pkl'))
    else:
        # id_loader applies config.mfp internally, so only the bare filename is passed
        food_data = id_loader(food_data, 'chemical', load, f'{food}_food_data.pkl')
    return food_data

def load_usda_data(food, load):
    if not load:
        if food == 'garlic':
            # Garlic - 'Garlic, raw', 'Spices, garlic powder'
            NDB_id = [11215, 2020]
        if food == 'cocoa':
            # Cocoa - 'Oil, cocoa butter', 'Cocoa, dry powder, Hershey's European style cocoa',
            # 'Cocoa, dry powder, unsweetened', 'Cocoa, dry powder, unsweetened, processed with alkali',
            # 'Cocoa, dry powder, hi-fat or breakfast, processed with alkali'
            NDB_id = [4501, 19171, 19165, 19166, 19860]

        # Reads in USDA database
        usda = pd.read_csv(config.mfp('data/SR28_plus_flav.csv'), encoding='latin1')

        # Keeps only rows whose NDB number is in NDB_id
        usda = usda[usda.NDB_No.isin(NDB_id)][[
            'NDB_No', 'food_name', 'Nutr_No_new', 'nut_desc', 'Nutr_Val', 'unit'
        ]]
        usda['num_measures'] = 1

        # Average chemicals that appear in multiple USDA food categories
        for nutr in usda.Nutr_No_new.drop_duplicates().tolist():
            temp = usda[usda.Nutr_No_new == nutr]
            if len(temp) > 1:
                if len(temp.unit.drop_duplicates()) > 1:
                    print(nutr, 'has different units for same nutrient')
                new_row = temp.copy().reset_index(drop=True).loc[0, :]
                new_row['Nutr_Val'] = temp.Nutr_Val.mean()
                new_row['num_measures'] = len(temp)
                usda = usda.drop(temp.index)
                usda = usda.append(new_row)
        usda = usda.reset_index(drop=True)

    # Append chemical key matcher to USDA chemicals
    if load:
        usda = pd.read_pickle(config.mfp(f'data/{food}_usda.pkl'))
    else:
        usda = id_loader(usda, 'nut_desc', load, f'{food}_usda.pkl').reset_index(drop=True)
    usda.rename(columns={'foodb_id': 'chem_id_f'}, inplace=True)
    usda = usda[~usda.unit.isin(['IU', 'kcal', 'kJ'])].reset_index(drop=True)

    if report:
        report_stat(f'USDA size {food}: ' + str(len(usda)), f'usda_size_{food}.txt')

    return usda

def id_loader(df, chem_key, load, file, fdb=True, pubchem=True):
    if load:
        df = pd.read_pickle(config.mfp(f'data/{file}'))
    else:
        df = lbr.id_searcher(df, chem_key, fdb=fdb, pubchem=pubchem)
        df.to_pickle(config.mfp(f'misc_save/{file}'))
    df.rename(columns={
        'pubchem_id': 'chem_id_p',
        'foodb_id': 'chem_id_f'
    }, inplace=True)
    return df

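# Note on the caching pattern above (descriptive only): on a fresh run
# (load=False) id_loader calls lbr.id_searcher to match the names in the
# `chem_key` column against FooDB/PubChem and pickles the result to
# misc_save/<file>; on a cached run (load=True) it skips the search and reads
# data/<file> instead. Either way the identifier columns come back renamed to
# chem_id_p (PubChem) and chem_id_f (FooDB). Example (illustrative):
#
#     usda = id_loader(usda, 'nut_desc', load=False, file='garlic_usda.pkl')
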
def report_stat(text, filename, varname=None):
    # Create the stats directory next to where the stat file is written
    if not os.path.exists(config.mfp('stats')):
        os.mkdir(config.mfp('stats'))
    if varname is not None:
        text = text + '\n\tVar: ' + varname
    text = text + '\t' + time.strftime("%m/%d/%Y", time.localtime())
    with open(config.mfp('stats/' + filename), 'w') as f:
        f.write(text)

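# Example (illustrative): a call such as
#
#     report_stat(f'USDA size {food}: ' + str(len(usda)), f'usda_size_{food}.txt')
#
# writes a single line containing the stat text, a tab, and the current date
# to stats/usda_size_<food>.txt, creating the stats directory if needed.
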
def load_foodb_data(food, load):
    if not load:
        # Dataframe with contents of foodb
        foodb = pd.read_csv(config.mfp('data/contentssql.csv'))
        foodb = foodb[(foodb.source_type != 'Nutrient')
                      & (foodb.source_id != 0)
                      & (foodb.standard_content != 0)]
        compounds = pd.read_csv(config.mfp('data/compounds.csv'), encoding='latin1')
        foodb = foodb.merge(compounds[['id', 'name']],
                            how='left',
                            left_on='source_id',
                            right_on='id')

        if food == 'garlic':
            # Garlic - ["Garlic", "Soft-necked Garlic"]
            target_foodb_food_id = [8, 880]
        if food == 'cocoa':
            # Cocoa - ["cocoa bean", "cocoa butter", "Cocoa powder", "Cocoa Liquor"]
            target_foodb_food_id = [182, 706, 707, 708]

        # Gets the subset of the database pertaining to food
        foodb_food = foodb[foodb.food_id.isin(
            target_foodb_food_id)].reset_index(drop=True)

        # Transforms all the chemical names to lowercase for syncing
        foodb_food.name = foodb_food.name.str.lower()
        foodb_food = foodb_food.rename(index=str, columns={"source_id": "foodb_id"})

    if load:
        foodb_food = pd.read_pickle(config.mfp(f'data/{food}_foodb_food.pkl'))
        foodb_food.rename(columns={'orig_source_name': 'name'}, inplace=True)
    foodb_food = id_loader(foodb_food, 'name', load, f'{food}_foodb_food.pkl', fdb=False)

    # Creates a list of the unique chemicals in the target food from foodb
    foodb_food_lower = list(set(foodb_food.chem_id.tolist()))

    # Separate dataframe holding the target food's chemicals in foodb with a real quantification
    quant_foodb_food = foodb_food[foodb_food.standard_content.notnull()][[
        'chem_id', 'chem_id_f', 'orig_source_id', 'name', 'standard_content'
    ]].drop_duplicates()

    # Separate dataframe holding the target food's chemicals in foodb without a real quantification
    unquant_foodb_food = foodb_food[foodb_food.standard_content.isnull()][[
        'chem_id', 'chem_id_f', 'orig_source_id', 'name', 'standard_content'
    ]].reset_index()
    q_ids = list(set(quant_foodb_food.chem_id.tolist()))
    q_names = list(set(quant_foodb_food.chem_id_f.tolist()))
    unquant_foodb_food = unquant_foodb_food[
        (~unquant_foodb_food.chem_id.fillna('-').isin(q_ids))
        & (~unquant_foodb_food.chem_id_f.fillna('-').isin(q_names))]

    if report:
        report_stat(
            f'FDB size {food}: ' + str(len(foodb_food.chem_id.drop_duplicates())),
            f'fdb_size_{food}.txt')
        report_stat(
            f'QFDB size {food}: ' + str(len(quant_foodb_food.chem_id.drop_duplicates())),
            f'qfdb_size_{food}.txt')
        report_stat(
            f'UQFDB size {food}: ' + str(len(unquant_foodb_food.chem_id.drop_duplicates())),
            f'uqfdb_size_{food}.txt')

    return foodb_food, quant_foodb_food, unquant_foodb_food
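

def _example_pipeline(food='garlic', load=True):
    # Illustrative sketch only (not part of the original analysis): shows how
    # the loaders above are typically chained for one food. It assumes the
    # module-level `report` flag and the config/lbr helpers are configured,
    # and that the cached .pkl files exist in data/ when load=True.
    food_data, food_scoring = load_raw_data(food, load)
    food_data = append_keys_raw_data(food_data, food, load)
    usda = load_usda_data(food, load)
    foodb_food, quant_fdb, unquant_fdb = load_foodb_data(food, load)
    return food_data, usda, foodb_food, quant_fdb, unquant_fdb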