示例#1
0
def get_df(datadir, fname, key):
    """Load one XPORT file into a DataFrame restricted to the known columns.

    Args:
        datadir: Directory holding the file; must support ``/`` joining
            (e.g. a ``pathlib.Path``).
        fname: File identifier, resolved to an on-disk name by ``get_fname``.
        key: Column (or columns) to use as the index.

    Returns:
        DataFrame indexed by ``key`` that keeps only the columns listed in
        ``all_cols``, renamed through ``input_col_map`` and
        ``output_col_map``.
    """
    fname = get_fname(fname)
    # Close the handle deterministically instead of leaving it to the GC.
    with open(datadir / fname, 'rb') as handle:
        df = xport.to_dataframe(handle)
    df.set_index(key, inplace=True)
    df.drop(df.columns.difference(all_cols), axis=1, inplace=True)
    df.rename(columns={**input_col_map, **output_col_map}, inplace=True)
    return df
示例#2
0
def merge_xpt(fname):
    """Read one or several XPORT files, merging several on ``SEQN``.

    Args:
        fname: Either a single path, or a list of paths.

    Returns:
        For a single path, the DataFrame read from it.
        For a list, a tuple ``(all_files, merged)``: ``all_files`` is
        ``np.array`` applied to the list of individual frames (taken before
        any merging) and ``merged`` is the first frame successively merged
        with each following frame on the ``SEQN`` column.

    Raises:
        IndexError: If ``fname`` is an empty list.
    """
    # isinstance also accepts list subclasses, unlike ``type(x) == list``.
    if not isinstance(fname, list):
        with open(fname, 'rb') as file:
            return xport.to_dataframe(file)

    frames = []
    for path in fname:
        with open(path, 'rb') as file:
            frames.append(xport.to_dataframe(file))
    all_files = np.array(frames)  # store all datasets in a np.array

    # Fold the remaining frames into the first one, one merge at a time.
    merged = frames[0]
    for other in frames[1:]:
        merged = merged.merge(other, on=['SEQN'])
    return all_files, merged
示例#3
0
def get_survey_data(year):
    """Download every XPT data file for one survey cycle and store it as CSV.

    For each survey component, links matching that component and
    ``CycleBeginYear={year}`` are extracted from the module-level ``html``
    page text. Each linked page is fetched, every link on it ending in
    ``XPT``/``xpt`` is downloaded, parsed with ``xport.to_dataframe`` and
    written as CSV under ``DATA_PATH/<year>-<year+1>/<component>/``. The
    intermediate XPT file is removed once the CSV has been written.

    Args:
        year: First year of the survey cycle, as an int (``year + 1`` is
            used to name the output directory).
    """
    components = [
        'Demographics', 'Dietary', 'Examination', 'Laboratory',
        'Questionnaire', 'LimitedAccess'
    ]
    for component in components:
        survey_urls = re.findall(
            rf'href=\"(.*Component={component}.*CycleBeginYear={year})', html)
        filepath = os.path.join(DATA_PATH, f'{year}-{year+1}', component)
        for survey_url in survey_urls:
            # hrefs pulled from raw HTML carry entity-escaped ampersands;
            # the previous replace('&', '&') was a no-op and left them as-is.
            survey_url = survey_url.replace('&amp;', '&')
            survey_url = f'https://wwwn.cdc.gov{survey_url}'
            survey_page = requests.get(survey_url).text
            data_urls = re.findall(rf'href=\"(.*XPT|.*xpt)\"', survey_page)
            for data_url in data_urls:
                data_url = f'https://wwwn.cdc.gov{data_url}'
                r = requests.get(data_url)
                xpt_filename = os.path.basename(data_url)
                pathlib.Path(filepath).mkdir(parents=True, exist_ok=True)
                xpt_filepath = os.path.join(filepath, xpt_filename)
                with open(xpt_filepath, 'wb') as f:
                    f.write(r.content)
                with open(xpt_filepath, 'rb') as f:
                    data = xport.to_dataframe(f)
                # splitext always yields a name, so csv_filename can never be
                # unbound or left over from a previous iteration.
                csv_filename = os.path.splitext(xpt_filename)[0] + '.csv'
                csv_filepath = os.path.join(filepath, csv_filename)
                print(f'Downloading to {csv_filepath}')
                data.to_csv(csv_filepath, index=False)
                os.remove(xpt_filepath)
示例#4
0
def process_DRIFF(day_num, year):
    """Build a per-SEQN indicator table of food codes for one recall day.

    Reads ``DR{day_num}IFF_{letter}.XPT`` (``letter`` is
    ``year2letter[year]``) from ``data_dir.format(year)``, keeps only
    ``SEQN`` and the ``DR{day_num}IFDCD`` food-code column, and produces one
    row per distinct ``SEQN`` with one float column
    ``DR{day_num}IFDCD-<code>`` per distinct food code. A cell is 1 when the
    file has at least one record with that ``SEQN`` and code, else 0.

    TODO: include time of day, quantize into ~3-5 cats

    Args:
        day_num: Recall-day number substituted into file and column names.
        year: Key into ``year2letter``; also formatted into ``data_dir`` and
            passed to ``fillMissingSeqn``.

    Returns:
        Tuple ``(core, core_seqn)``: ``core`` is the indicator table after
        ``fillMissingSeqn(core, year)``; ``core_seqn`` is the list of SEQN
        values found in the file, in order of first appearance.
    """
    food_code_col = 'DR{}IFDCD'.format(day_num)
    file_name = 'DR{}IFF_{}'.format(day_num, year2letter[year])

    file_path = os.path.join(data_dir.format(year),
                             "{}.XPT".format(file_name))
    with open(file_path, 'rb') as f:
        dfm = xport.to_dataframe(f)

    dfm['SEQN'] = dfm['SEQN'].astype(int)
    dfm[food_code_col] = dfm[food_code_col].astype(int)

    # drop redundant data
    df = dropAllBut(dfm, ['SEQN', food_code_col])

    # make core out of just seqn
    core = df['SEQN'].drop_duplicates(keep='first').to_frame()
    num_seqs = core.shape[0]

    # determine food encodings
    # Builtin int/float replace np.int/np.float, which NumPy 1.24 removed.
    food_codes = np.sort(df[food_code_col].unique()).astype(int)
    fc_label_format = food_code_col + '-{}'
    fc_cols = [fc_label_format.format(fc) for fc in food_codes]

    # expand core by one zero-filled column per food code
    dfadd = pd.DataFrame(np.zeros((num_seqs, food_codes.size), dtype=float),
                         columns=fc_cols)
    core = core.reset_index(drop=True)
    core = pd.concat([core, dfadd], axis=1)

    # Flag every (SEQN, food code) pair present in the file. Walking the
    # records directly avoids re-filtering the whole frame once per SEQN.
    for seqn, fc in zip(df['SEQN'], df[food_code_col]):
        core.loc[core['SEQN'] == seqn, fc_label_format.format(fc)] = 1

    # Capture the SEQN list before fillMissingSeqn touches the frame.
    core_seqn = list(core.set_index("SEQN").index)
    core = fillMissingSeqn(core, year)

    return core, core_seqn
示例#5
0
def xpt_to_csv(filename, filepath, save_dir):
    """Convert one XPORT file to a CSV written next to the source file.

    Args:
        filename: Name of the XPORT file inside ``filepath``.
        filepath: Directory containing the file; the CSV is written here
            too, under the same name with a ``.csv`` extension.
        save_dir: Currently unused.
            NOTE(review): the name suggests the CSV was meant to be written
            here - confirm with callers before redirecting the output.
    """
    # Join the components properly: the old filepath + '/' + filename form
    # resolved to the filesystem root when filepath was empty.
    source_path = os.path.join(filepath, filename)
    with open(source_path, 'rb') as f:
        df = xport.to_dataframe(f)

    # splitext strips only the final extension, so names containing extra
    # dots keep their full stem.
    stem = os.path.splitext(filename)[0]
    savepath = os.path.join(filepath, stem + '.csv')

    df.to_csv(savepath)
示例#6
0
def process_DSTOT_H(day_num):
    """Load ``DS{day_num}TOT_H.XPT`` and trim it with the project helpers.

    The frame read from ``data_dir`` is passed through ``dropBetween`` with
    the bounds ``WTDRD1`` and ``DS{day_num}TKCAL``, then through
    ``fillMissingSeqnAndDropSeqn``, whose result is returned.
    """
    xpt_name = "{}.XPT".format('DS{}TOT_H'.format(day_num))
    kcal_col = 'DS{}TKCAL'.format(day_num)

    with open(os.path.join(data_dir, xpt_name), 'rb') as handle:
        frame = xport.to_dataframe(handle)

    trimmed = dropBetween(frame, 'WTDRD1', kcal_col)
    return fillMissingSeqnAndDropSeqn(trimmed)
示例#7
0
def process_DSIDS(day_num, year):
    """Build a per-SEQN indicator table of supplement ids for one day.

    Reads ``DS{day_num}IDS_{letter}.XPT`` (``letter`` is
    ``year2letter[year]``) from ``data_dir.format(year)``, keeps only
    ``SEQN`` and ``DSDSUPID``, and produces one row per distinct ``SEQN``
    with one float column ``DSDSUPID_2D_{day_num}-<id>`` per distinct
    ``DSDSUPID`` value. A cell is 1 when the file has at least one record
    with that ``SEQN`` and id, else 0.

    Args:
        day_num: Day number substituted into file and column names.
        year: Key into ``year2letter``; also formatted into ``data_dir`` and
            passed to ``fillMissingSeqn``.

    Returns:
        Tuple ``(core, core_seqn)``: ``core`` is the indicator table after
        ``fillMissingSeqn(core, year)``; ``core_seqn`` is the list of SEQN
        values found in the file, in order of first appearance.
    """
    file_name = 'DS{}IDS_{}'.format(day_num, year2letter[year])

    file_path = os.path.join(data_dir.format(year),
                             "{}.XPT".format(file_name))
    with open(file_path, 'rb') as f:
        dfm = xport.to_dataframe(f)

    # convert columns of interest to int
    dfm['SEQN'] = dfm['SEQN'].astype(int)
    dfm['DSDSUPID'] = dfm['DSDSUPID'].astype(int)

    # drop redundant data
    df = dropAllBut(dfm, ['SEQN', 'DSDSUPID'])

    # make core out of just seqn
    core = df['SEQN'].drop_duplicates(keep='first').to_frame()
    num_seqs = core.shape[0]

    # determine supplement encodings
    # Builtin int/float replace np.int/np.float, which NumPy 1.24 removed.
    sppl_codes = np.sort(df['DSDSUPID'].unique()).astype(int)
    sc_label_format = 'DSDSUPID_2D_{}-{}'
    sc_cols = [sc_label_format.format(day_num, sc) for sc in sppl_codes]

    # expand core by one zero-filled column per supplement id
    dfadd = pd.DataFrame(np.zeros((num_seqs, sppl_codes.size), dtype=float),
                         columns=sc_cols)
    core = core.reset_index(drop=True)
    core = pd.concat([core, dfadd], axis=1)

    # Flag every (SEQN, supplement id) pair present in the file. Walking the
    # records directly avoids re-filtering the whole frame once per SEQN.
    for seqn, sc in zip(df['SEQN'], df['DSDSUPID']):
        core.loc[core['SEQN'] == seqn,
                 sc_label_format.format(day_num, sc)] = 1

    # Capture the SEQN list before fillMissingSeqn touches the frame.
    core_seqn = list(core.set_index("SEQN").index)
    core = fillMissingSeqn(core, year)

    return core, core_seqn
示例#8
0
def process_DSTOT(day_num, year):
    """Load ``DS{day_num}TOT_{letter}.XPT`` for ``year`` and trim it.

    ``letter`` comes from ``year2letter[year]`` and the directory from
    ``data_dir.format(year)``. The frame is passed through ``dropBetween``
    with the bounds ``WTDRD1`` and ``DS{day_num}TKCAL``.

    Returns:
        Tuple ``(df, df_seqn)``: the trimmed frame after
        ``fillMissingSeqn(df, year)``, and the list of its ``SEQN`` values
        taken before that call.
    """
    stem = 'DS{}TOT_{}'.format(day_num, year2letter[year])
    kcal_col = 'DS{}TKCAL'.format(day_num)

    source = os.path.join(data_dir.format(year), "{}.XPT".format(stem))
    with open(source, 'rb') as handle:
        frame = xport.to_dataframe(handle)

    frame = dropBetween(frame, 'WTDRD1', kcal_col)
    seqn_values = list(frame.set_index("SEQN").index)
    filled = fillMissingSeqn(frame, year)
    return filled, seqn_values
示例#9
0
def process_DSQTOT_H():
    """Load ``DSQTOT_H.XPT`` from ``data_dir`` and trim it.

    ``SEQN`` is cast to int, then the frame is passed through
    ``dropBetween`` with the bounds ``DSDCOUNT`` and ``DSQTKCAL`` and
    finally through ``fillMissingSeqnAndDropSeqn``, whose result is
    returned.
    """
    source = os.path.join(data_dir, "{}.XPT".format('DSQTOT_H'))
    with open(source, 'rb') as handle:
        frame = xport.to_dataframe(handle)

    # convert columns of interest to int
    frame['SEQN'] = frame['SEQN'].astype(int)

    trimmed = dropBetween(frame, 'DSDCOUNT', 'DSQTKCAL')
    return fillMissingSeqnAndDropSeqn(trimmed)
示例#10
0
def process_DSQIDS_H():
    """Build a per-SEQN indicator table of supplement ids from DSQIDS_H.

    Reads ``DSQIDS_H.XPT`` from ``data_dir``, keeps only ``SEQN`` and
    ``DSDSUPID``, and produces one row per distinct ``SEQN`` with one float
    column ``DSDSUPID_30D-<id>`` per distinct ``DSDSUPID`` value. A cell is
    1 when the file has at least one record with that ``SEQN`` and id,
    else 0.

    Returns:
        The indicator table after ``fillMissingSeqnAndDropSeqn``.
    """
    file_path = os.path.join(data_dir, "{}.XPT".format('DSQIDS_H'))
    with open(file_path, 'rb') as f:
        dfm = xport.to_dataframe(f)

    # convert columns of interest to int
    dfm['SEQN'] = dfm['SEQN'].astype(int)
    dfm['DSDSUPID'] = dfm['DSDSUPID'].astype(int)

    # drop redundant data
    df = dropAllBut(dfm, ['SEQN', 'DSDSUPID'])

    # make core out of just seqn
    core = df['SEQN'].drop_duplicates(keep='first').to_frame()
    num_seqs = core.shape[0]

    # determine supplement encodings
    # Builtin int/float replace np.int/np.float, which NumPy 1.24 removed.
    sppl_codes = np.sort(df['DSDSUPID'].unique()).astype(int)
    sc_label_format = 'DSDSUPID_30D-{}'
    sc_cols = [sc_label_format.format(sc) for sc in sppl_codes]

    # expand core by one zero-filled column per supplement id
    dfadd = pd.DataFrame(np.zeros((num_seqs, sppl_codes.size), dtype=float),
                         columns=sc_cols)
    core = core.reset_index(drop=True)
    core = pd.concat([core, dfadd], axis=1)

    # Flag every (SEQN, supplement id) pair present in the file. Walking the
    # records directly avoids re-filtering the whole frame once per SEQN.
    for seqn, sc in zip(df['SEQN'], df['DSDSUPID']):
        core.loc[core['SEQN'] == seqn, sc_label_format.format(sc)] = 1

    core = fillMissingSeqnAndDropSeqn(core)

    return core
示例#11
0
def process_DSQTOT(year):
    """Load ``DSQTOT_{letter}.XPT`` for ``year`` and trim it.

    ``letter`` comes from ``year2letter[year]`` and the directory from
    ``data_dir.format(year)``. ``SEQN`` is cast to int and the frame is
    passed through ``dropBetween`` with the bounds ``DSDCOUNT`` and
    ``DSQTKCAL``.

    Returns:
        Tuple ``(df, df_seqn)``: the trimmed frame after
        ``fillMissingSeqn(df, year)``, and the list of its ``SEQN`` values
        taken before that call.
    """
    stem = 'DSQTOT_{}'.format(year2letter[year])
    source = os.path.join(data_dir.format(year), "{}.XPT".format(stem))
    with open(source, 'rb') as handle:
        frame = xport.to_dataframe(handle)

    # convert columns of interest to int
    frame['SEQN'] = frame['SEQN'].astype(int)

    frame = dropBetween(frame, 'DSDCOUNT', 'DSQTKCAL')
    seqn_values = list(frame.set_index("SEQN").index)
    filled = fillMissingSeqn(frame, year)
    return filled, seqn_values
示例#12
0
def xptodf(i):
    """Read the XPORT file at path ``i`` and return it as a DataFrame."""
    with open(i, 'rb') as handle:
        frame = xp.to_dataframe(handle)
    return frame
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 11 17:03:48 2018

@author: qiang
"""

import numpy as np
import xport
import pandas as pd

# Load the LLCP2017 XPORT file into a DataFrame.
path = r"C:\Users\qiang\Desktop\2018 fall\5825\homework\project\LLCP2017.XPT"
with open(path, 'rb') as f:
    df = xport.to_dataframe(f)

# _BMI5 column: gaps are filled with the column mean truncated to an integer.
data_BMI = df['_BMI5']
bmi_fill_value = int(data_BMI.mean())
newdata_BMI = data_BMI.fillna(bmi_fill_value)

# WTKG3 column: same treatment as _BMI5.
data_WTK = df['WTKG3']
wtk_fill_value = int(data_WTK.mean())
newdata_WTK = data_WTK.fillna(wtk_fill_value)

# Start the working dataset with the filled weight column.
dataset = pd.DataFrame()
dataset['WTKG'] = newdata_WTK

# create class label according BMI
示例#14
0
#80: ---------------------------------------------------------------------------

# Set up: ----------------------------------------------------------------------
import xport
import pandas as pd
import numpy as np
import seaborn as sns
import random
import matplotlib.pyplot as plt
from scipy.stats import t
from statsmodels.formula.api import ols

# Read in the data: ------------------------------------------------------------
## Each XPORT file is opened in binary read mode ('rb') and parsed in full.
with open('HSQ_D.XPT', 'rb') as f:
    df_health = xport.to_dataframe(f)

with open('ALQ_D.XPT', 'rb') as f:
    df_alcohol = xport.to_dataframe(f)

with open('DEMO_D.XPT', 'rb') as f:
    df_demo = xport.to_dataframe(f)

# Data preparation: ------------------------------------------------------------
# Keep only the rows passing each filter, and only the columns needed later.
health_rows = df_health['HSD010'] <= 3
df_health = df_health.loc[health_rows, ['SEQN', 'HSD010']]

alcohol_rows = df_alcohol['ALQ120Q'] <= 365
df_alcohol = df_alcohol.loc[alcohol_rows, ['SEQN', 'ALQ120Q']]

demo_rows = (df_demo['RIDAGEYR'] >= 21) & (df_demo['DMDEDUC2'] <= 5)
demo_columns = ['SEQN', 'RIAGENDR', 'RIDAGEYR', 'INDFMPIR', 'DMDEDUC2']
df_demo = df_demo.loc[demo_rows, demo_columns]

# Merge key columns into one data frame
示例#15
0
 def test_to_dataframe(self, library, library_bytestring):
     """``xport.to_dataframe`` warns as deprecated and matches the first dataset."""
     expected = next(iter(library.values()))
     stream = BytesIO(library_bytestring)
     with pytest.warns(DeprecationWarning):
         actual = xport.to_dataframe(stream)
     matches = actual == expected
     assert matches.all(axis=None)