예제 #1
0
def load_raw_data(food, load):
    """Load the curated records and the paper scoring sheet for a food.

    Reads ``data/{food}_data.csv`` and ``data/{food}_scoring.csv`` (both
    decoded as latin1, paths resolved through ``config.mfp``), keeps only
    records whose ``PMID`` occurs in the scoring sheet, lower-cases the
    ``chemical`` column, strips commas from the ``amount`` strings and
    left-joins the ``is_useful`` column of the scoring sheet onto the
    records by ``PMID``.

    When the name ``report`` (not defined in this function; it has to be
    provided by the enclosing module) is truthy, four summary counts are
    written through ``report_stat``.

    Args:
        food: Food name used to build the CSV and report file names.
        load: Not used by this function.

    Returns:
        Tuple ``(food_data, food_scoring)`` of DataFrames.
    """
    food_data = pd.read_csv(config.mfp(f'data/{food}_data.csv'),
                            encoding='latin1')

    food_scoring = pd.read_csv(config.mfp(f'data/{food}_scoring.csv'),
                               encoding='latin1')

    # Need to remove phenol explorer ids that were manually put into data (for garlic only).
    # The explicit copy makes the column assignments below act on an
    # independent frame instead of on a slice of the CSV frame.
    in_scoring = food_data.PMID.isin(food_scoring.PMID.tolist())
    food_data = food_data[in_scoring].copy()

    food_data.chemical = food_data.chemical.str.lower()
    # Literal replacement: the comma is a thousands separator, not a pattern.
    food_data.amount = food_data.amount.str.replace(',', '', regex=False)

    food_data = food_data.merge(food_scoring[['PMID', 'is_useful']],
                                how='left',
                                on='PMID')

    if report:
        report_stat(
            f'Number of papers in search {food}: ' + str(len(food_scoring)),
            f'num_papers_srch_{food}.txt')
        # A paper counts as reviewed once its is_useful cell is filled in.
        report_stat(
            f'Number of papers reviewed {food}: ' +
            str(len(food_scoring[food_scoring.is_useful.notnull()])),
            f'num_reviewed_papers_{food}.txt')
        report_stat(
            f'Number of unique papers {food}: ' +
            str(len(food_data['PMID'].drop_duplicates())),
            f'num_unique_papers_{food}.txt')
        report_stat(f'Total number of records {food}: ' + str(len(food_data)),
                    f'num_records_{food}.txt')

    return food_data, food_scoring
예제 #2
0
def append_keys_raw_data(food_data, food, load):
    """Return the curated food records with chemical id keys attached.

    Args:
        food_data: DataFrame whose ``chemical`` column is looked up. Ignored
            when ``load`` is truthy.
        food: Food name used to build the pickle file name.
        load: When truthy, read the previously saved result from
            ``data/{food}_food_data.pkl``; otherwise run the lookup through
            ``id_loader``.

    Returns:
        The DataFrame read from the pickle, or the one returned by
        ``id_loader``.
    """
    if load:
        return pd.read_pickle(config.mfp(f'data/{food}_food_data.pkl'))

    # id_loader resolves its ``file`` argument through config.mfp itself, so
    # it must receive the bare file name (as the USDA and FooDB loaders pass
    # it) rather than an already resolved path.
    return id_loader(food_data, 'chemical', load, f'{food}_food_data.pkl')
예제 #3
0
def _average_duplicate_nutrients(usda):
    """Collapse every nutrient that occurs in several rows into one averaged row.

    For each ``Nutr_No_new`` value found in more than one row, the rows are
    replaced by a copy of the first of them whose ``Nutr_Val`` is the mean of
    the group and whose ``num_measures`` is the group size. Averaged rows are
    placed after the untouched rows, in order of first appearance of the
    nutrient, and the index is renumbered from 0.
    """
    averaged_rows = []
    replaced_labels = []

    for nutr in usda.Nutr_No_new.drop_duplicates().tolist():
        group = usda[usda.Nutr_No_new == nutr]
        if len(group) <= 1:
            continue

        if len(group.unit.drop_duplicates()) > 1:
            print(nutr, 'has different units for same nutrient')

        # Keep the row as a one-row frame: this preserves the column dtypes
        # and a label unique to this group, so earlier averaged rows can never
        # be dropped by accident.
        row = group.iloc[[0]].copy()
        row['Nutr_Val'] = group.Nutr_Val.mean()
        row['num_measures'] = len(group)

        averaged_rows.append(row)
        replaced_labels.extend(group.index)

    if not averaged_rows:
        return usda.reset_index(drop=True)

    untouched = usda.drop(replaced_labels)
    return pd.concat([untouched] + averaged_rows).reset_index(drop=True)


def load_usda_data(food, load):
    """Load the USDA nutrient rows for a food, with chemical id keys attached.

    With ``load`` falsy the rows are built from ``data/SR28_plus_flav.csv``:
    the rows of the food's NDB numbers are selected, nutrients measured in
    several of those foods are averaged, and the chemical ids are looked up
    through ``id_loader`` on the ``nut_desc`` column. With ``load`` truthy the
    previously saved ``data/{food}_usda.pkl`` is read instead.

    In both cases a ``foodb_id`` column is renamed to ``chem_id_f`` and rows
    whose ``unit`` is ``IU``, ``kcal`` or ``kJ`` are removed. When the name
    ``report`` (not defined in this function; it has to be provided by the
    enclosing module) is truthy, the resulting row count is written through
    ``report_stat``.

    Args:
        food: ``'garlic'`` or ``'cocoa'`` when building from the CSV; also
            used to build the pickle and report file names.
        load: Whether to read the saved pickle instead of rebuilding.

    Returns:
        DataFrame of USDA nutrient rows with a fresh 0-based index.

    Raises:
        ValueError: If ``load`` is falsy and ``food`` has no NDB numbers
            configured here.
    """
    if load:
        usda = pd.read_pickle(config.mfp(f'data/{food}_usda.pkl'))
    else:
        ndb_ids_by_food = {
            # Garlic, 'Garlic, raw', 'Spices, garlic powder'
            'garlic': [11215, 2020],
            # Cocoa, 'Oil, cocoa butter', 'Cocoa, dry powder, Hershey's European style cocoa',
            # 'Cocoa, dry powder, unsweetened', 'Cocoa, dry powder, unsweetend, processed with alkali',
            # 'Cocoa, dry powder, hi-fat or breakfast, processed with alkali'
            'cocoa': [4501, 19171, 19165, 19166, 19860],
        }
        if food not in ndb_ids_by_food:
            raise ValueError(
                f'No USDA NDB ids configured for food {food!r}; '
                f'expected one of {sorted(ndb_ids_by_food)}')

        # Reads in USDA database
        usda = pd.read_csv(config.mfp('data/SR28_plus_flav.csv'),
                           encoding='latin1')

        # Keeps only the rows of the selected NDB ids and the columns used downstream
        usda = usda[usda.NDB_No.isin(ndb_ids_by_food[food])][[
            'NDB_No', 'food_name', 'Nutr_No_new', 'nut_desc', 'Nutr_Val',
            'unit'
        ]]
        usda['num_measures'] = 1

        # Average chemicals that appear in multiple USDA food categories
        usda = _average_duplicate_nutrients(usda)

        # Append chemical key matcher to USDA chemicals
        usda = id_loader(usda, 'nut_desc', load,
                         f'{food}_usda.pkl').reset_index(drop=True)

    usda = usda.rename(columns={'foodb_id': 'chem_id_f'})

    usda = usda[~usda.unit.isin(['IU', 'kcal', 'kJ'])].reset_index(drop=True)

    if report:
        report_stat(f'USDA size {food}: ' + str(len(usda)),
                    f'usda_size_{food}.txt')

    return usda
예제 #4
0
def id_loader(df, chem_key, load, file, fdb=True, pubchem=True):
    """Return a DataFrame with chemical id columns, from cache or a fresh lookup.

    Args:
        df: DataFrame to look ids up for. Ignored when ``load`` is truthy.
        chem_key: Name of the column handed to ``lbr.id_searcher`` as the
            chemical key.
        load: When truthy, read ``data/{file}`` instead of running the lookup.
        file: Pickle file name, resolved through ``config.mfp``.
        fdb: Forwarded to ``lbr.id_searcher``.
        pubchem: Forwarded to ``lbr.id_searcher``.

    Returns:
        The DataFrame with ``pubchem_id`` renamed to ``chem_id_p`` and
        ``foodb_id`` renamed to ``chem_id_f`` where those columns exist.
    """
    if not load:
        # Fresh lookup; the result is saved under misc_save/, not data/.
        df = lbr.id_searcher(df, chem_key, fdb=fdb, pubchem=pubchem)
        df.to_pickle(config.mfp(f'misc_save/{file}'))
    else:
        df = pd.read_pickle(config.mfp(f'data/{file}'))

    column_aliases = {'pubchem_id': 'chem_id_p', 'foodb_id': 'chem_id_f'}
    df.rename(columns=column_aliases, inplace=True)

    return df
예제 #5
0
def report_stat(text, filename, varname=None):
    """Write a statistic line to the file ``config.mfp('stats/' + filename)``.

    The written text is ``text``, followed by ``'\\n\\tVar: ' + varname`` when
    ``varname`` is given, followed by a tab and the local date formatted as
    ``MM/DD/YYYY``. An existing file is overwritten.

    Args:
        text: Message to record.
        filename: File name inside the ``stats`` directory.
        varname: Optional variable name appended to the message.
    """
    if varname is not None:
        text = text + '\n\tVar: ' + varname

    text = text + '\t' + time.strftime("%m/%d/%Y", time.localtime())

    target = config.mfp('stats/' + filename)

    # Create the directory that is actually written to (not a 'stats' folder
    # relative to the current working directory); exist_ok avoids the race
    # between checking for the directory and creating it.
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(target, 'w') as f:
        f.write(text)
예제 #6
0
def load_foodb_data(food, load):
    """Load the FooDB content rows for a food and split them by quantification.

    With ``load`` falsy the rows are built from ``data/contentssql.csv``
    (dropping rows whose ``source_type`` is ``'Nutrient'`` or whose
    ``source_id`` or ``standard_content`` is 0), joined with the compound
    names from ``data/compounds.csv``, restricted to the FooDB food ids of
    ``food``, and passed through ``id_loader`` on the lower-cased ``name``
    column. With ``load`` truthy the saved ``data/{food}_foodb_food.pkl`` is
    used instead.

    When the name ``report`` (not defined in this function; it has to be
    provided by the enclosing module) is truthy, the number of distinct
    ``chem_id`` values of each returned frame is written through
    ``report_stat``.

    Args:
        food: ``'garlic'`` or ``'cocoa'`` when building from the CSVs; also
            used to build the pickle and report file names.
        load: Whether to read the saved pickle instead of rebuilding.

    Returns:
        Tuple ``(foodb_food, quant_foodb_food, unquant_foodb_food)``: all
        rows; the de-duplicated rows with a non-null ``standard_content``;
        and the rows with a null ``standard_content`` whose ``chem_id`` and
        ``chem_id_f`` both do not occur among the quantified rows.

    Raises:
        ValueError: If ``load`` is falsy and ``food`` has no FooDB food ids
            configured here.
    """
    if load:
        # NOTE(review): id_loader below is also called with load=True and, as
        # written in this file, re-reads this same pickle, so this frame and
        # its rename appear to be discarded - confirm which one is intended.
        foodb_food = pd.read_pickle(config.mfp(f'data/{food}_foodb_food.pkl'))
        foodb_food.rename(columns={'orig_source_name': 'name'}, inplace=True)
    else:
        foodb_ids_by_food = {
            # Garlic - ["Garlic", "Soft-necked Garlic"]
            'garlic': [8, 880],
            # Cocoa - ["cocoa bean", "cocoa butter", "Cocoa powder", "Cocoa Liquor"]
            'cocoa': [182, 706, 707, 708],
        }
        if food not in foodb_ids_by_food:
            raise ValueError(
                f'No FooDB food ids configured for food {food!r}; '
                f'expected one of {sorted(foodb_ids_by_food)}')

        # Dataframe with contents of foodb
        foodb = pd.read_csv(config.mfp('data/contentssql.csv'))
        foodb = foodb[(foodb.source_type != 'Nutrient')
                      & (foodb.source_id != 0) & (foodb.standard_content != 0)]

        compounds = pd.read_csv(config.mfp('data/compounds.csv'),
                                encoding='latin1')

        # Attach the compound name to every content row
        foodb = foodb.merge(compounds[['id', 'name']],
                            how='left',
                            left_on='source_id',
                            right_on='id')

        # Gets the subset of the database pertaining to food
        foodb_food = foodb[foodb.food_id.isin(
            foodb_ids_by_food[food])].reset_index(drop=True)

        # Transforms all the chemical names to lowercase for syncing
        foodb_food['name'] = foodb_food['name'].str.lower()

        foodb_food = foodb_food.rename(index=str,
                                       columns={"source_id": "foodb_id"})

    foodb_food = id_loader(foodb_food,
                           'name',
                           load,
                           f'{food}_foodb_food.pkl',
                           fdb=False)

    key_columns = [
        'chem_id', 'chem_id_f', 'orig_source_id', 'name', 'standard_content'
    ]
    has_content = foodb_food.standard_content.notnull()

    # Chemicals of the food in foodb with a real quantification
    quant_foodb_food = foodb_food[has_content][key_columns].drop_duplicates()

    # Chemicals of the food in foodb without a real quantification
    unquant_foodb_food = foodb_food[~has_content][key_columns].reset_index()

    # Keep an unquantified row only if neither of its ids is quantified
    # elsewhere; missing ids are filled with '-' so they never match.
    q_ids = list(set(quant_foodb_food.chem_id.tolist()))
    q_f_ids = list(set(quant_foodb_food.chem_id_f.tolist()))
    unquant_foodb_food = unquant_foodb_food[
        (~unquant_foodb_food.chem_id.fillna('-').isin(q_ids))
        & (~unquant_foodb_food.chem_id_f.fillna('-').isin(q_f_ids))]

    if report:
        report_stat(
            f'FDB size {food}: ' +
            str(len(foodb_food.chem_id.drop_duplicates())),
            f'fdb_size_{food}.txt')
        report_stat(
            f'QFDB size {food}: ' +
            str(len(quant_foodb_food.chem_id.drop_duplicates())),
            f'qfdb_size_{food}.txt')
        report_stat(
            f'UQFDB size {food}: ' +
            str(len(unquant_foodb_food.chem_id.drop_duplicates())),
            f'uqfdb_size_{food}.txt')

    return foodb_food, quant_foodb_food, unquant_foodb_food