示例#1
0
def preprocess():
    """Load the admission extract, derive per-row features and return JSON.

    Steps, in order:

    1. Read the selected columns of ``fake_data_admission.csv`` (located via
       the module-level ``path_csv``), skipping the last line of the file.
    2. Load the provider map (``mp``), the lab results (``lc``) and the
       medication results (``md``); the latter two are looked up by
       ``Account`` inside the row loop.
    3. For every row, write the standardized fields (``sf``), the attending
       provider, the Charlson flags (``cc``), the medication flags and the
       lab values into the frame.
    4. Drop the raw source columns, write ``processeddata.csv`` to the
       current working directory and print the elapsed processor time.

    Only the provider lookup has a fallback (``"None"``); a missing account
    in the medication or lab results propagates the lookup error.

    Returns:
        The processed frame serialized with ``DataFrame.to_json()``.
    """
    # read raw files (not reading extraneous columns to avoid unnecessary
    # loading time)
    print("Loading Data from SUF....")
    # time.clock() no longer exists on Python >= 3.8; process_time() is the
    # closest replacement. The ``or`` keeps time.clock for interpreters that
    # predate process_time without evaluating it where it is missing.
    clock = getattr(time, 'process_time', None) or time.clock
    t0 = clock()

    dx_columns = ['DX%d' % number for number in range(1, 17)]
    read_var = [
        'Account', 'Age', 'Gender', 'Zip', 'County', 'Race', 'MDC',
        'Patient Disposition', 'Admission_Source', 'Primary_Payer',
        'Admitting_Service', 'Admission_Date', 'Discharge_Date',
        'Year_of_Discharge', 'ICD_9_Proc_1',
        'Days_from_Admission_for_Proc_1', 'Admission_Type',
    ] + dx_columns + [
        'Count_of_Diagonosis_Codes', 'VALVE', 'HYPOTHY', 'COAG', 'OBESE',
        'WGHTLOSS', 'LYTES', 'alc_drug', 'anemia', 'DEPRESS', 'HTN_C',
        'ace_adm', 'CKD_corr', 'esrd_corr', 'PARA', 'NEURO', 'eGFR_epi_new',
        'ratio_firstCr_mdrd', 'BLDLOSS', 'ANEMDEF', 'ALCOHOL', 'DRUG',
        'mort_status_30d', 'cv_comp_new', 'MV_comp', 'ICU_comp', 'rural',
        'total', 'Med_inc', 'prop_black', 'prop_hisp', 'Prop_pov',
        'zipdist2', 'service1',
    ]

    # skipfooter is only implemented by the python parser; naming the engine
    # explicitly avoids the fallback warning from the default C parser.
    df = pd.read_csv(path_csv + 'fake_data_admission.csv',
                     usecols=read_var,
                     skipfooter=1,
                     engine='python')

    # Getting Provider Information from ip.CSV
    print("Loading Provider Information...")
    provider = mp.get_provider_map()

    # Getting Lab results on admission day for each account
    print("Loading Lab Results for patients...")
    ll = df[['Account', 'Admission_Date']].copy()
    lab_results = lc.load_labs_data(ll)

    # Getting medicine intake for patients on admission day
    print("Loading Medicine Information for patients")
    med_results = md.load_meds_data(ll)

    print("Finished Loading Data.")

    # start processing
    # creating new columns
    # NOTE(review): 'admitting_type', 'race2' and 'max_PROTUR_grp2' are never
    # filled below (the loop writes 'Admitting_type', 'race' and
    # 'max_PROTUR_gr2'), so they reach the output as all-NaN columns. Kept
    # because removing them changes the output schema -- confirm intent.
    for column in ('pay_grp', 'admitting_type', 'race2', 'imi', 'ichf',
                   'ipvd', 'icvd', 'liverd', 'icancer', 'diabetes',
                   'imcancer', 'attend_doc', 'min_HGB', 'max_PROTUR_grp2',
                   'max_HGBUR_gr', 'max_GLUURN_gr', 'count_HGBn',
                   'count_PROTURn'):
        df[column] = np.nan

    # renaming existing columns
    df.rename(columns={
        'Days_from_Admission_for_Proc_1': 'pr1_day',
        'Count_of_Diagonosis_Codes': 'NDX',
        'Zip': 'zip5',
        'Age': 'age',
    }, inplace=True)

    # (output column, position in the sequence returned by CharlsonICD9CM)
    charlson_fields = (
        ('imi', 0), ('ichf', 1), ('ipvd', 2), ('icvd', 3), ('icpd', 5),
        ('icancer', 13), ('imcancer', 15), ('cci', 17), ('cancer', 18),
        ('liverd', 19), ('diabetes', 20),
    )
    # medication flags are stored under the same name they are looked up by
    med_columns = (
        'no_meds_on_adm', 'aminog_adm', 'bicarb_adm', 'diuret_adm',
        'steroi_adm', 'vanco_adm', 'ace_adm', 'nsaids_adm', 'asa_adm',
        'antiemetic_adm', 'betablockers_adm', 'statin_adm', 'inot_pres_adm',
    )
    # (output column, key in the per-account lab result)
    lab_fields = (
        ('min_HGB', 'min_HGB'),
        ('max_PROTUR_gr2', 'max_PROTUR_grp2'),
        ('max_HGBUR_gr', 'max_HGBUR_gr'),
        ('max_GLUURN_gr', 'max_GLUURN_gr'),
        ('count_HGBn', 'count_HGBn'),
        ('count_PROTURn', 'count_PROTURn'),
    )

    # Iterating over rows to standardize parameters
    print("Processing Data")

    for index, row in df.iterrows():
        account = row['Account']
        admission_date = row['Admission_Date']

        # processing suf data
        df.loc[index, 'race'] = sf.standardize_race(row['Race'])
        df.loc[index, 'pay_grp'] = sf.standardize_pay_group(
            row['Primary_Payer'])
        df.loc[index, 'Admitting_type'] = sf.standardize_admitting_type(
            row['Admitting_Service'])
        df.loc[index, 'admission_source'] = sf.standardize_admission_source(
            row['Admission_Source'])
        df.loc[index, 'admit_day1'] = sf.day(admission_date)
        df.loc[index, 'admit_mth'] = sf.month(admission_date)
        df.loc[index, 'Year_of_admission'] = sf.year(admission_date)
        # read the stored value back so isWeekend sees exactly what the
        # frame holds for this row
        df.loc[index, 'weekend_adm'] = sf.isWeekend(
            df.loc[index, 'admit_day1'])
        # NOTE(review): 'emergent' is derived by passing Admission_Type to
        # sf.year -- looks like a copy/paste slip; confirm the intended
        # helper before changing it.
        df.loc[index, 'emergent'] = sf.year(row['Admission_Type'])
        proc_code = sf.standardize_proc_code(row['ICD_9_Proc_1'])
        df.loc[index, 'pr1c'] = proc_code
        # NOTE(review): the original count column was renamed to 'NDX'
        # above, so this creates a new 'Count_of_Diagonosis_Codes' column
        # holding the procedure code -- confirm this is wanted.
        df.loc[index, 'Count_of_Diagonosis_Codes'] = proc_code

        # getting info from maps
        try:
            df.loc[index, 'attend_doc'] = provider[account]
        except (ValueError, KeyError):
            df.loc[index, 'attend_doc'] = "None"

        # cci calculation
        dx_codes = [str(row[column]) for column in dx_columns]
        conditions = cc.CharlsonICD9CM(list(dx_codes))
        for column, position in charlson_fields:
            df.loc[index, column] = conditions[position]

        # comorbidity Macro2 related information
        # NOTE(review): the result of this call is not used anywhere in
        # this function; the call itself is kept in case it has effects
        # that are not visible here.
        cc2.translate_condition(list(dx_codes))

        # adding information about medications
        med = med_results[account]
        for column in med_columns:
            df.loc[index, column] = med[column]

        # adding lab information
        lab = lab_results[account]
        for column, key in lab_fields:
            df.loc[index, column] = lab[key]

    # Dropping extraneous Columns
    df.drop(['Race', 'Primary_Payer', 'Admission_Date', 'ICD_9_Proc_1'] +
            dx_columns,
            axis=1,
            inplace=True)

    print("Processing of data complete.Writing to csv...")
    df.to_csv("processeddata.csv", sep=',', index=False)

    print("Writing to csv complete.Final File generated")
    print("%s seconds process time" % (clock() - t0))

    return df.to_json()
示例#2
0
def preprocess():
    """Build the processed, non-pediatric admissions table and return JSON.

    Steps, in order:

    1. Read the selected columns of ``admission_data.csv`` from the
       module-level ``path`` and parse ``Admission_Date`` as datetimes.
    2. Load the provider map (``mp``), lab results (``lc``), medication
       results (``md``) and the two zip lookup tables (``zip.csv`` and
       ``zip_to_zcta_2015.csv``).
    3. For every row, write standardized fields (``sf``), zip-level values
       (``zz``), the attending provider, Charlson flags (``cc``),
       comorbidity flags (``cc2``), medication flags and lab values.
    4. Keep only rows whose ``PEDS`` flag is 0, reduce the frame to the
       fixed output column list, write ``processeddata.csv`` to the current
       working directory and print the elapsed processor time.

    Only the provider lookup has a fallback (``"None"``); a missing account
    in the medication or lab results propagates the lookup error.

    Returns:
        The processed frame serialized with ``DataFrame.to_json()``.
    """
    # read raw files (not reading extraneous columns to avoid unnecessary
    # loading time)
    print("Loading Data from SUF....")
    # time.clock() no longer exists on Python >= 3.8; process_time() is the
    # closest replacement. The ``or`` keeps time.clock for interpreters that
    # predate process_time without evaluating it where it is missing.
    clock = getattr(time, 'process_time', None) or time.clock
    t0 = clock()

    dx_columns = ['DX%d' % number for number in range(1, 17)]
    # NOTE(review): DX16 is read and later dropped, but only DX1..DX15 are
    # passed to the two comorbidity helpers -- confirm this is deliberate.
    scored_dx_columns = dx_columns[:15]

    read_var = [
        'Account', 'Age', 'Gender', 'Zip', 'County', 'Race', 'MDC',
        'Admission_Source', 'Primary_Payer', 'Admitting_Service',
        'Discharge_Service', 'Admission_Date', 'Discharge_Date',
        'Year_of_Discharge', 'Icd_9_Proc_1', 'Icd_9_Proc_2',
        'Days_from_Admission_for_Proc_1', 'Admission_Type',
    ] + dx_columns + [
        'Count_of_Diagnosis_Codes', 'DRG', 'CKD_corr', 'esrd_corr',
        'eGFR_epi_new', 'ratio_firstCr_mdrd',
    ]

    df = pd.read_csv(path + 'admission_data.csv', usecols=read_var)
    df['Admission_Date'] = pd.to_datetime(df['Admission_Date'])

    # Getting Provider Information from ip.CSV
    print("Loading Provider Information...")
    provider = mp.get_provider_map(path)

    # Getting Lab results on admission day for each account
    print("Loading Lab Results for patients...")
    ll = df[['Account', 'Admission_Date']].copy()
    lab_results = lc.load_labs_data(ll, path)

    # Getting medicine intake for patients on admission day
    print("Loading Medicine Information for patients")
    med_results = md.load_meds_data(ll, path)

    # Loading zip information
    print("Loading demographic data")
    zipdf = pd.read_csv(path + 'zip.csv')
    zipmap = pd.read_csv(path + 'zip_to_zcta_2015.csv')
    print(zipmap)

    print("Finished Loading Data.")

    # start processing
    # creating new columns
    for column in ('pay_grp', 'race2', 'imi', 'ichf', 'ipvd', 'icvd',
                   'liverd', 'icancer', 'diabetes', 'imcancer', 'attend_doc',
                   'min_HGB', 'max_PROTUR_gr2', 'max_HGBUR_gr',
                   'max_GLUURN_gr', 'count_HGBn', 'count_PROTURn'):
        df[column] = np.nan

    # renaming existing columns
    rename_map = {
        'Account': 'acc',
        'Days_from_Admission_for_Proc_1': 'pr1_day',
        'Count_of_Diagnosis_Codes': 'NDX',
        'Zip': 'zip5',
        'Age': 'age',
    }
    # temporary change: accept lower-case dx column names as well (names
    # that are absent from the frame are ignored by rename)
    rename_map.update(
        (name.lower(), name) for name in dx_columns)
    df.rename(columns=rename_map, inplace=True)

    # (output column, key in the dict returned by zz.get_zip_Info)
    zip_fields = (
        ('total', 'total'), ('rural', 'rural'), ('Med_inc', 'Med_inc'),
        ('prop_black', 'prop_black'), ('prop_hisp', 'prop_hisp'),
        ('Prop_pov', 'prop_pov'), ('zipdist2', 'zipdist2'),
    )
    # (output column, position in the sequence returned by CharlsonICD9CM)
    charlson_fields = (
        ('imi', 0), ('ichf', 1), ('ipvd', 2), ('icvd', 3), ('icpd', 5),
        ('icancer', 13), ('imcancer', 15), ('cci', 17), ('cancer', 18),
        ('liverd', 19), ('diabetes', 20),
    )
    # the remaining groups are stored under the key they are looked up by
    comorbidity_columns = (
        'VALVE', 'HYPOTHY', 'COAG', 'OBESE', 'WGHTLOSS', 'LYTES', 'anemia',
        'alc_drug', 'DEPRESS', 'HTN_C', 'PARA', 'NEURO',
    )
    med_columns = (
        'no_meds_on_adm', 'aminog_adm', 'bicarb_adm', 'diuret_adm',
        'steroi_adm', 'vanco_adm', 'ace_adm', 'nsaids_adm', 'asa_adm',
        'antiemetic_adm', 'betablockers_adm', 'statin_adm', 'inot_pres_adm',
    )
    # (output column, key in the per-account lab result)
    lab_fields = (
        ('min_HGB', 'min_HGB'),
        ('max_PROTUR_gr2', 'max_PROTUR_grp2'),
        ('max_HGBUR_gr', 'max_HGBUR_gr'),
        ('max_GLUURN_gr', 'max_GLUURN_gr'),
        ('count_HGBn', 'count_HGBn'),
        ('count_PROTURn', 'count_PROTURn'),
    )

    # Iterating over rows to standardize parameters
    print("Processing Data")

    for index, row in df.iterrows():
        print('Processing row ' + str(index) + '...')
        account = row['acc']
        admission_date = row['Admission_Date']
        admitting_service = str(row['Admitting_Service'])
        discharge_service = str(row['Discharge_Service'])

        # processing suf data
        df.loc[index, 'race2'] = sf.standardize_race(row['Race'])
        df.loc[index, 'pay_grp'] = sf.standardize_pay_group(
            row['Primary_Payer'])
        df.loc[index, 'Admitting_type'] = sf.standardize_admitting_type(
            row['Admitting_Service'])
        df.loc[index, 'Admission_Source'] = sf.standardize_admission_source(
            row['Admission_Source'])
        df.loc[index, 'admit_day1'] = sf.day(admission_date)
        df.loc[index, 'admit_mth'] = sf.month(admission_date)
        df.loc[index, 'Year_of_admission'] = sf.year(admission_date)
        # read the stored value back so isWeekend sees exactly what the
        # frame holds for this row
        df.loc[index, 'weekend_adm'] = sf.isWeekend(
            df.loc[index, 'admit_day1'])
        df.loc[index, 'emergent'] = sf.standardize_admission_type(
            row['Admission_Type'])
        df.loc[index, 'pr1c'] = sf.standardize_proc_code(row['Icd_9_Proc_1'])
        df.loc[index, 'PEDS'] = sf.mark_as_ped(discharge_service,
                                               admitting_service)
        df.loc[index, 'service1'] = sf.standardize_service(
            admitting_service, discharge_service,
            str(row['Icd_9_Proc_1']), str(row['Icd_9_Proc_2']))

        # Getting income and zip level information with race and zip
        zipinfo = zz.get_zip_Info(str(row['zip5']), str(row['Race']), zipdf,
                                  zipmap)
        for column, key in zip_fields:
            df.loc[index, column] = zipinfo[key]

        # getting info from maps
        try:
            df.loc[index, 'attend_doc'] = provider[account]
        except (ValueError, KeyError):
            df.loc[index, 'attend_doc'] = "None"

        # cci calculation
        dx_codes = [str(row[column]) for column in scored_dx_columns]
        conditions = cc.CharlsonICD9CM(list(dx_codes))
        for column, position in charlson_fields:
            df.loc[index, column] = conditions[position]

        # comorbidity Macro2 related information
        conditions2 = cc2.comorb2_condition(list(dx_codes), str(row['DRG']))
        for column in comorbidity_columns:
            df.loc[index, column] = conditions2[column]

        # adding information about medications
        med = med_results[account]
        for column in med_columns:
            df.loc[index, column] = med[column]

        # adding lab information
        lab = lab_results[account]
        for column, key in lab_fields:
            df.loc[index, column] = lab[key]

    print(len(df))
    print(df[df['PEDS'] == 1])
    df = df[df['PEDS'] == 0]
    print(len(df))

    # Dropping extraneous Columns. Not done in place: df is now a filtered
    # selection, and in-place edits on it trigger pandas' chained-assignment
    # warning.
    df = df.drop([
        'Race', 'Primary_Payer', 'Admission_Date', 'Icd_9_Proc_1',
        'Icd_9_Proc_2', 'Discharge_Service', 'DRG', 'PEDS'
    ] + dx_columns,
                 axis=1)

    # Fixed output schema and column order
    df = df[[
        'acc', 'age', 'Gender', 'race2', 'zip5', 'pay_grp', 'County', 'rural',
        'total', 'Med_inc', 'prop_black', 'prop_hisp', 'Prop_pov', 'zipdist2',
        'admit_day1', 'admit_mth', 'Year_of_admission', 'weekend_adm',
        'attend_doc', 'Admission_Source', 'Admitting_Service',
        'Admitting_type', 'emergent', 'pr1_day', 'service1', 'pr1c', 'cci',
        'NDX', 'MDC', 'imi', 'ichf', 'ipvd', 'icvd', 'icpd', 'liverd',
        'diabetes', 'icancer', 'imcancer', 'cancer', 'VALVE', 'HYPOTHY',
        'COAG', 'OBESE', 'WGHTLOSS', 'LYTES', 'alc_drug', 'anemia', 'DEPRESS',
        'HTN_C', 'PARA', 'NEURO', 'no_meds_on_adm', 'aminog_adm', 'bicarb_adm',
        'diuret_adm', 'steroi_adm', 'vanco_adm', 'ace_adm', 'nsaids_adm',
        'asa_adm', 'antiemetic_adm', 'betablockers_adm', 'statin_adm',
        'inot_pres_adm', 'min_HGB', 'max_PROTUR_gr2', 'max_HGBUR_gr',
        'max_GLUURN_gr', 'count_HGBn', 'count_PROTURn', 'CKD_corr',
        'esrd_corr', 'eGFR_epi_new', 'ratio_firstCr_mdrd'
    ]]

    print("Processing of data complete.Writing to csv...")
    df.to_csv("processeddata.csv", sep=',', index=False)

    print("Writing to csv complete.Final File generated")
    print("%s seconds process time" % (clock() - t0))

    return df.to_json()
示例#3
0
def preprocess():
    """Build the processed, non-pediatric admissions table and save it.

    Steps, in order:

    1. Read the selected columns of ``admission_data.csv`` from the
       module-level ``path`` and parse ``Admission_Date`` as datetimes.
    2. Load the provider map (``mp``), lab results (``lc``), medication
       results (``md``) and the two zip lookup tables (``zip.csv`` and
       ``zip_to_zcta_2015.csv``).
    3. For every row, write standardized fields (``sf``), zip-level values
       (``zz``), the attending provider, Charlson flags (``cc``),
       comorbidity flags (``cc2``), medication flags and lab values.
    4. Keep only rows whose ``PEDS`` flag is 0, reduce the frame to the
       fixed output column list, write ``processeddata.csv`` to the current
       working directory and print the elapsed processor time.

    Only the provider lookup has a fallback (``"None"``); a missing account
    in the medication or lab results propagates the lookup error.

    Returns:
        None. The result is only available through ``processeddata.csv``.
    """
    # read raw files (not reading extraneous columns to avoid unnecessary
    # loading time)
    print("Loading Data from SUF....")
    # time.clock() no longer exists on Python >= 3.8; process_time() is the
    # closest replacement. The ``or`` keeps time.clock for interpreters that
    # predate process_time without evaluating it where it is missing.
    timer = getattr(time, 'process_time', None) or time.clock
    t0 = timer()

    dx_names = ['DX%d' % n for n in range(1, 17)]
    # NOTE(review): DX16 is read and later dropped, but only DX1..DX15 are
    # passed to the two comorbidity helpers -- confirm this is deliberate.
    scored_dx_names = dx_names[:15]

    read_var = [
        'Account',
        'Age',
        'Gender',
        'Zip',
        'County',
        'Race',
        'MDC',
        'Admission_Source',
        'Primary_Payer',
        'Admitting_Service',
        'Discharge_Service',
        'Admission_Date',
        'Discharge_Date',
        'Year_of_Discharge',
        'Icd_9_Proc_1',
        'Icd_9_Proc_2',
        'Days_from_Admission_for_Proc_1',
        'Admission_Type',
    ]
    read_var.extend(dx_names)
    read_var.extend([
        'Count_of_Diagnosis_Codes', 'DRG', 'CKD_corr', 'esrd_corr',
        'eGFR_epi_new', 'ratio_firstCr_mdrd',
    ])

    df = pd.read_csv(path + 'admission_data.csv', usecols=read_var)
    df['Admission_Date'] = pd.to_datetime(df['Admission_Date'])

    # Getting Provider Information from ip.CSV
    print("Loading Provider Information...")
    provider = mp.get_provider_map(path)

    # Getting Lab results on admission day for each account
    print("Loading Lab Results for patients...")
    ll = df[['Account', 'Admission_Date']].copy()
    lab_results = lc.load_labs_data(ll, path)

    # Getting medicine intake for patients on admission day
    print("Loading Medicine Information for patients")
    med_results = md.load_meds_data(ll, path)

    # Loading zip information
    print("Loading demographic data")
    zipdf = pd.read_csv(path + 'zip.csv')
    zipmap = pd.read_csv(path + 'zip_to_zcta_2015.csv')
    print(zipmap)

    print("Finished Loading Data.")

    # start processing
    # creating new columns
    for name in ('pay_grp', 'race2', 'imi', 'ichf', 'ipvd', 'icvd', 'liverd',
                 'icancer', 'diabetes', 'imcancer', 'attend_doc', 'min_HGB',
                 'max_PROTUR_gr2', 'max_HGBUR_gr', 'max_GLUURN_gr',
                 'count_HGBn', 'count_PROTURn'):
        df[name] = np.nan

    # renaming existing columns
    renames = {
        'Account': 'acc',
        'Days_from_Admission_for_Proc_1': 'pr1_day',
        'Count_of_Diagnosis_Codes': 'NDX',
        'Zip': 'zip5',
        'Age': 'age',
    }
    # temporary change: accept lower-case dx column names as well (names
    # that are absent from the frame are ignored by rename)
    for name in dx_names:
        renames[name.lower()] = name
    df.rename(columns=renames, inplace=True)

    # (output column, key in the dict returned by zz.get_zip_Info)
    zip_fields = (
        ('total', 'total'), ('rural', 'rural'), ('Med_inc', 'Med_inc'),
        ('prop_black', 'prop_black'), ('prop_hisp', 'prop_hisp'),
        ('Prop_pov', 'prop_pov'), ('zipdist2', 'zipdist2'),
    )
    # (output column, position in the sequence returned by CharlsonICD9CM)
    charlson_fields = (
        ('imi', 0), ('ichf', 1), ('ipvd', 2), ('icvd', 3), ('icpd', 5),
        ('icancer', 13), ('imcancer', 15), ('cci', 17), ('cancer', 18),
        ('liverd', 19), ('diabetes', 20),
    )
    # the remaining groups are stored under the key they are looked up by
    comorbidity_names = (
        'VALVE', 'HYPOTHY', 'COAG', 'OBESE', 'WGHTLOSS', 'LYTES', 'anemia',
        'alc_drug', 'DEPRESS', 'HTN_C', 'PARA', 'NEURO',
    )
    med_names = (
        'no_meds_on_adm', 'aminog_adm', 'bicarb_adm', 'diuret_adm',
        'steroi_adm', 'vanco_adm', 'ace_adm', 'nsaids_adm', 'asa_adm',
        'antiemetic_adm', 'betablockers_adm', 'statin_adm', 'inot_pres_adm',
    )
    # (output column, key in the per-account lab result)
    lab_fields = (
        ('min_HGB', 'min_HGB'),
        ('max_PROTUR_gr2', 'max_PROTUR_grp2'),
        ('max_HGBUR_gr', 'max_HGBUR_gr'),
        ('max_GLUURN_gr', 'max_GLUURN_gr'),
        ('count_HGBn', 'count_HGBn'),
        ('count_PROTURn', 'count_PROTURn'),
    )

    # Iterating over rows to standardize parameters
    print("Processing Data")

    for index, row in df.iterrows():
        print('Processing row ' + str(index) + '...')
        account = row['acc']
        admitted_on = row['Admission_Date']
        admit_service = str(row['Admitting_Service'])
        discharge_service = str(row['Discharge_Service'])

        # processing suf data
        df.loc[index, 'race2'] = sf.standardize_race(row['Race'])
        df.loc[index, 'pay_grp'] = sf.standardize_pay_group(
            row['Primary_Payer'])
        df.loc[index, 'Admitting_type'] = sf.standardize_admitting_type(
            row['Admitting_Service'])
        df.loc[index, 'Admission_Source'] = sf.standardize_admission_source(
            row['Admission_Source'])
        df.loc[index, 'admit_day1'] = sf.day(admitted_on)
        df.loc[index, 'admit_mth'] = sf.month(admitted_on)
        df.loc[index, 'Year_of_admission'] = sf.year(admitted_on)
        # read the stored value back so isWeekend sees exactly what the
        # frame holds for this row
        df.loc[index, 'weekend_adm'] = sf.isWeekend(
            df.loc[index, 'admit_day1'])
        df.loc[index, 'emergent'] = sf.standardize_admission_type(
            row['Admission_Type'])
        df.loc[index, 'pr1c'] = sf.standardize_proc_code(row['Icd_9_Proc_1'])
        df.loc[index, 'PEDS'] = sf.mark_as_ped(discharge_service,
                                               admit_service)
        df.loc[index, 'service1'] = sf.standardize_service(
            admit_service, discharge_service,
            str(row['Icd_9_Proc_1']), str(row['Icd_9_Proc_2']))

        # Getting income and zip level information with race and zip
        zipinfo = zz.get_zip_Info(str(row['zip5']), str(row['Race']), zipdf,
                                  zipmap)
        for name, key in zip_fields:
            df.loc[index, name] = zipinfo[key]

        # getting info from maps
        try:
            df.loc[index, 'attend_doc'] = provider[account]
        except (ValueError, KeyError):
            df.loc[index, 'attend_doc'] = "None"

        # cci calculation
        dx_codes = [str(row[name]) for name in scored_dx_names]
        conditions = cc.CharlsonICD9CM(list(dx_codes))
        for name, position in charlson_fields:
            df.loc[index, name] = conditions[position]

        # comorbidity Macro2 related information
        conditions2 = cc2.comorb2_condition(list(dx_codes), str(row['DRG']))
        for name in comorbidity_names:
            df.loc[index, name] = conditions2[name]

        # adding information about medications
        med = med_results[account]
        for name in med_names:
            df.loc[index, name] = med[name]

        # adding lab information
        lab = lab_results[account]
        for name, key in lab_fields:
            df.loc[index, name] = lab[key]

    print(len(df))
    print(df[df['PEDS'] == 1])
    df = df[df['PEDS'] == 0]
    print(len(df))

    # Dropping extraneous Columns. Not done in place: df is now a filtered
    # selection, and in-place edits on it trigger pandas' chained-assignment
    # warning.
    df = df.drop(['Race', 'Primary_Payer', 'Admission_Date', 'Icd_9_Proc_1',
                  'Icd_9_Proc_2', 'Discharge_Service', 'DRG', 'PEDS'] +
                 dx_names,
                 axis=1)

    # Fixed output schema and column order
    df = df[[
        'acc', 'age', 'Gender', 'race2', 'zip5', 'pay_grp', 'County', 'rural',
        'total', 'Med_inc', 'prop_black', 'prop_hisp', 'Prop_pov', 'zipdist2',
        'admit_day1', 'admit_mth', 'Year_of_admission', 'weekend_adm',
        'attend_doc', 'Admission_Source', 'Admitting_Service',
        'Admitting_type', 'emergent', 'pr1_day', 'service1', 'pr1c', 'cci',
        'NDX', 'MDC', 'imi', 'ichf', 'ipvd', 'icvd', 'icpd', 'liverd',
        'diabetes', 'icancer', 'imcancer', 'cancer', 'VALVE', 'HYPOTHY',
        'COAG', 'OBESE', 'WGHTLOSS', 'LYTES', 'alc_drug', 'anemia', 'DEPRESS',
        'HTN_C', 'PARA', 'NEURO', 'no_meds_on_adm', 'aminog_adm', 'bicarb_adm',
        'diuret_adm', 'steroi_adm', 'vanco_adm', 'ace_adm', 'nsaids_adm',
        'asa_adm', 'antiemetic_adm', 'betablockers_adm', 'statin_adm',
        'inot_pres_adm', 'min_HGB', 'max_PROTUR_gr2', 'max_HGBUR_gr',
        'max_GLUURN_gr', 'count_HGBn', 'count_PROTURn', 'CKD_corr',
        'esrd_corr', 'eGFR_epi_new', 'ratio_firstCr_mdrd'
    ]]

    print("Processing of data complete.Writing to csv...")
    df.to_csv("processeddata.csv", sep=',', index=False)

    print("Writing to csv complete.Final File generated")
    print("%s seconds process time" % (timer() - t0))
示例#4
0
def preprocess():
    #read raw files ( not reading extraneous columns to avoid unneccesary loading time)
    print("Loading Data from SUF....")
    t0 = time.clock()
    read_var = ['Account',
            'Age',
            'Gender',
            'Zip',
            'County',
            'Race',
            'MDC',
            'Patient Disposition',
            'Admission_Source',
            'Primary_Payer',
            'Admitting_Service',
            'Admission_Source',
            'Admission_Date',
            'Discharge_Date',
            'Year_of_Discharge',
            'ICD_9_Proc_1',
            'Days_from_Admission_for_Proc_1',
            'Admission_Type',
            'DX1','DX2','DX3','DX4','DX5','DX6','DX7','DX8','DX9','DX10','DX11','DX12','DX13','DX14','DX15','DX16',
            'Count_of_Diagonosis_Codes',
            'VALVE' ,'HYPOTHY','COAG','OBESE','WGHTLOSS','LYTES','alc_drug','anemia' ,'DEPRESS' ,'HTN_C','ace_adm',
            'CKD_corr','esrd_corr','PARA','NEURO','eGFR_epi_new','ratio_firstCr_mdrd','BLDLOSS','ANEMDEF',
            'ALCOHOL','DRUG','mort_status_30d','cv_comp_new','MV_comp','ICU_comp','rural','total','Med_inc','prop_black','prop_hisp','Prop_pov','zipdist2','service1'
            ]

    df = pd.read_csv(path_csv+'fake_data_admission.csv',usecols=read_var,skipfooter=1)


    #Getting Provider Information from ip.CSV
    print("Loading Provider Information...")
    provider = mp.get_provider_map()


    #Getting Lab results on admission day for each account
    print("Loading Lab Results for patients...")
    ll = pd.DataFrame(columns=['Account','Admission_Date'])
    ll['Account'] = df['Account']
    ll['Admission_Date'] = df['Admission_Date']
    lab_results= lc.load_labs_data(ll)

    #Getting medicine intake for patients on admisision day
    print("Loading Medicine Information for patients")
    med_results= md.load_meds_data(ll)

    print("Finished Loading Data.")

    #start processing
    #creating new columns
    df['pay_grp'] = np.nan
    df['admitting_type'] = np.nan
    df['race2'] = np.nan
    df['imi'] = np.nan
    df['ichf'] = np.nan
    df['ipvd'] = np.nan
    df['icvd'] = np.nan
    df['liverd'] = np.nan
    df['icancer'] = np.nan
    df['diabetes'] = np.nan
    df['imcancer'] = np.nan
    df['attend_doc'] = np.nan
    df['min_HGB'] = np.nan
    df['max_PROTUR_grp2']=np.nan
    df['max_HGBUR_gr']=np.nan
    df['max_GLUURN_gr']=np.nan
    df['count_HGBn']=np.nan
    df['count_PROTURn']=np.nan

#renaming existing columns
    df.rename(columns={'Days_from_Admission_for_Proc_1': 'pr1_day'}, inplace=True)
    df.rename(columns={'Count_of_Diagonosis_Codes': 'NDX'}, inplace=True)
    df.rename(columns={'Zip': 'zip5'}, inplace=True)
    df.rename(columns={'Age': 'age'}, inplace=True)


#Iterating over rows to standardize parameters
    print("Processing Data")

    for index, row in df.iterrows():
    #processing suf data
        df.ix[index, 'race']                      = sf.standardize_race(row['Race'])
        df.ix[index, 'pay_grp']                   = sf.standardize_pay_group(row['Primary_Payer'])
        df.ix[index, 'Admitting_type']            = sf.standardize_admitting_type(row['Admitting_Service'])
        df.ix[index, 'admission_source']          = sf.standardize_admission_source(row['Admission_Source'])
        df.ix[index, 'admit_day1']                = sf.day(row['Admission_Date'])
        df.ix[index, 'admit_mth']                 = sf.month(row['Admission_Date'])
        df.ix[index, 'Year_of_admission']         = sf.year(row['Admission_Date'])
        df.ix[index, 'weekend_adm']               = sf.isWeekend(df.ix[index, 'admit_day1'])
        df.ix[index, 'emergent']                  = sf.year(row['Admission_Type'])
        df.ix[index, 'pr1c']                      = sf.standardize_proc_code(row['ICD_9_Proc_1'])
        df.ix[index, 'Count_of_Diagonosis_Codes'] = sf.standardize_proc_code(row['ICD_9_Proc_1'])

    #getting info from maps
        try:
            df.ix[index, 'attend_doc'] = provider[row['Account']]
        except ValueError:
            df.ix[index, 'attend_doc'] = "None"
        except KeyError:
            df.ix[index, 'attend_doc'] = "None"

    # cci calculation
        conditions = cc.CharlsonICD9CM([str(row['DX1']),  str(row['DX2']),  str(row['DX3']),  str(row['DX4']),  str(row['DX5']),  str(row['DX6']),
                                    str(row['DX7']),  str(row['DX8']),  str(row['DX9']),  str(row['DX10']), str(row['DX11']), str(row['DX12']),
                                    str(row['DX13']), str(row['DX14']), str(row['DX15']), str(row['DX16'])])


        df.ix[index, 'imi']      = conditions[0]
        df.ix[index, 'ichf']     = conditions[1]
        df.ix[index, 'ipvd']     = conditions[2]
        df.ix[index, 'icvd']     = conditions[3]
        df.ix[index, 'icpd']     = conditions[5]
        df.ix[index, 'icancer']  = conditions[13]
        df.ix[index, 'imcancer'] = conditions[15]
        df.ix[index, 'cci']      = conditions[17]
        df.ix[index, 'cancer']   = conditions[18]
        df.ix[index, 'liverd']   = conditions[19]
        df.ix[index, 'diabetes'] = conditions[20]

    # comorbidity Macro2  related information
        conditions = cc2.translate_condition([str(row['DX1']), str(row['DX2']), str(row['DX3']), str(row['DX4']), str(row['DX5']), str(row['DX6']),
                                          str(row['DX7']), str(row['DX8']), str(row['DX9']), str(row['DX10']), str(row['DX11']), str(row['DX12']),
                                          str(row['DX13']), str(row['DX14']), str(row['DX15']), str(row['DX16'])])




    # adding information about medicaitons
        med = med_results[row['Account']]
        df.ix[index, 'no_meds_on_adm']   = med['no_meds_on_adm']
        df.ix[index, 'aminog_adm']       = med['aminog_adm']
        df.ix[index, 'bicarb_adm']       = med['bicarb_adm']
        df.ix[index, 'diuret_adm']       = med['diuret_adm']
        df.ix[index, 'steroi_adm']       = med['steroi_adm']
        df.ix[index, 'vanco_adm']        = med['vanco_adm']
        df.ix[index, 'ace_adm']          = med['ace_adm']
        df.ix[index, 'nsaids_adm']       = med['nsaids_adm']
        df.ix[index, 'asa_adm']          = med['asa_adm']
        df.ix[index, 'antiemetic_adm']   = med['antiemetic_adm']
        df.ix[index, 'betablockers_adm'] = med['betablockers_adm']
        df.ix[index, 'statin_adm']       = med['statin_adm']
        df.ix[index, 'inot_pres_adm']    = med['inot_pres_adm']

    #adding lab information
        lab = lab_results[row['Account']]
        df.ix[index,'min_HGB']         = lab['min_HGB']
        df.ix[index,'max_PROTUR_gr2']  = lab['max_PROTUR_grp2']
        df.ix[index,'max_HGBUR_gr']    = lab['max_HGBUR_gr']
        df.ix[index,'max_GLUURN_gr']   = lab['max_GLUURN_gr']
        df.ix[index,'count_HGBn']      = lab['count_HGBn']
        df.ix[index,'count_PROTURn']   = lab['count_PROTURn']


#Dropping extraneous Columns
    df.drop(['Race','Primary_Payer','Admission_Date','ICD_9_Proc_1'],axis=1,inplace=True)
    df.drop(['DX1','DX2','DX3','DX4','DX5','DX6','DX7','DX8','DX9','DX10','DX11','DX12','DX13','DX14','DX15','DX16'],axis=1,inplace=True)

    print("Processing of data complete.Writing to csv...")
    df.to_csv("processeddata.csv" ,sep=',',index = False)

    print("Writing to csv complete.Final File generated")
    print time.clock() - t0, "seconds process time"
    
    #print df.to_json()
    return df.to_json()
示例#5
0
def preprocess():
    """Build the processed admission table, write it to CSV and return JSON.

    Reads ``admission_data.csv`` (plus ``zip.csv`` and
    ``zip_to_zcta_2015.csv``) from the module-level ``path``, loads the
    provider map, the admission-day lab results and the admission-day
    medication data, and then derives the standardized, demographic,
    comorbidity, medication and lab columns one row at a time.  Rows whose
    ``PEDS`` flag is not 0 are removed, the remaining rows are reduced to a
    fixed, ordered set of columns and written to ``processeddata.csv`` in
    the current working directory.

    Returns:
        str: the final table serialized with ``DataFrame.to_json()``.

    Raises:
        KeyError: if an account has no entry in the medication or lab
            results, or if a helper result lacks an expected key.
    """
    # time.clock() was removed in Python 3.8; use process_time() when the
    # interpreter has it and fall back to clock() on Python 2.
    try:
        timer = time.process_time
    except AttributeError:
        timer = time.clock

    # read raw files (extraneous columns are skipped to keep loading fast)
    print("Loading Data from SUF....")
    t0 = timer()

    dx_cols = ["DX%d" % i for i in range(1, 17)]
    read_var = (
        [
            "Account",
            "Age",
            "Gender",
            "Zip",
            "County",
            "Race",
            "MDC",
            "Admission_Source",
            "Primary_Payer",
            "Admitting_Service",
            "Discharge_Service",
            "Admission_Date",
            "Discharge_Date",
            "Year_of_Discharge",
            "Icd_9_Proc_1",
            "Icd_9_Proc_2",
            "Days_from_Admission_for_Proc_1",
            "Admission_Type",
        ]
        + dx_cols
        + [
            "Count_of_Diagnosis_Codes",
            "DRG",
            "CKD_corr",
            "esrd_corr",
            "eGFR_epi_new",
            "ratio_firstCr_mdrd",
        ]
    )

    df = pd.read_csv(path + "admission_data.csv", usecols=read_var)
    df["Admission_Date"] = pd.to_datetime(df["Admission_Date"])

    # Getting Provider Information from ip.CSV
    print("Loading Provider Information...")
    provider = mp.get_provider_map(path)

    # Getting Lab results on admission day for each account
    print("Loading Lab Results for patients...")
    ll = pd.DataFrame(columns=["Account", "Admission_Date"])
    ll["Account"] = df["Account"]
    ll["Admission_Date"] = df["Admission_Date"]
    lab_results = lc.load_labs_data(ll, path)

    # Getting medicine intake for patients on admission day
    print("Loading Medicine Information for patients")
    med_results = md.load_meds_data(ll, path)

    # Loading zip information
    print("Loading demographic data")
    zipdf = pd.read_csv(path + "zip.csv")
    zipmap = pd.read_csv(path + "zip_to_zcta_2015.csv")
    print(zipmap)

    print("Finished Loading Data.")

    # Placeholder columns, pre-filled with NaN before the per-row pass.
    # The former "admitting_type" placeholder is gone: nothing ever wrote to
    # it (the loop fills "Admitting_type") and it was not part of the output.
    for placeholder in (
        "pay_grp",
        "race2",
        "imi",
        "ichf",
        "ipvd",
        "icvd",
        "liverd",
        "icancer",
        "diabetes",
        "imcancer",
        "attend_doc",
        "min_HGB",
        "max_PROTUR_gr2",
        "max_HGBUR_gr",
        "max_GLUURN_gr",
        "count_HGBn",
        "count_PROTURn",
    ):
        df[placeholder] = np.nan

    # renaming existing columns (single pass instead of one call per column)
    renames = {
        "Account": "acc",
        "Days_from_Admission_for_Proc_1": "pr1_day",
        "Count_of_Diagnosis_Codes": "NDX",
        "Zip": "zip5",
        "Age": "age",
    }
    # "temporary change" kept from the original: upper-case any dx1..dx16.
    renames.update(("dx%d" % i, "DX%d" % i) for i in range(1, 17))
    df.rename(columns=renames, inplace=True)

    # (output column, source key/slot) tables, hoisted out of the row loop.
    zip_fields = (
        ("total", "total"),
        ("rural", "rural"),
        ("Med_inc", "Med_inc"),
        ("prop_black", "prop_black"),
        ("prop_hisp", "prop_hisp"),
        ("Prop_pov", "prop_pov"),
        ("zipdist2", "zipdist2"),
    )
    charlson_slots = (
        ("imi", 0),
        ("ichf", 1),
        ("ipvd", 2),
        ("icvd", 3),
        ("icpd", 5),
        ("icancer", 13),
        ("imcancer", 15),
        ("cci", 17),
        ("cancer", 18),
        ("liverd", 19),
        ("diabetes", 20),
    )
    comorb2_fields = (
        "VALVE",
        "HYPOTHY",
        "COAG",
        "OBESE",
        "WGHTLOSS",
        "LYTES",
        "anemia",
        "alc_drug",
        "DEPRESS",
        "HTN_C",
        "PARA",
        "NEURO",
    )
    med_fields = (
        "no_meds_on_adm",
        "aminog_adm",
        "bicarb_adm",
        "diuret_adm",
        "steroi_adm",
        "vanco_adm",
        "ace_adm",
        "nsaids_adm",
        "asa_adm",
        "antiemetic_adm",
        "betablockers_adm",
        "statin_adm",
        "inot_pres_adm",
    )
    lab_fields = (
        ("min_HGB", "min_HGB"),
        ("max_PROTUR_gr2", "max_PROTUR_grp2"),
        ("max_HGBUR_gr", "max_HGBUR_gr"),
        ("max_GLUURN_gr", "max_GLUURN_gr"),
        ("count_HGBn", "count_HGBn"),
        ("count_PROTURn", "count_PROTURn"),
    )
    # NOTE(review): DX16 is loaded but only DX1-DX15 are handed to both
    # comorbidity helpers, exactly as before -- confirm whether DX16 should
    # be included.
    comorbidity_dx_cols = dx_cols[:15]

    # Iterating over rows to standardize parameters
    print("Processing Data")

    # DataFrame.ix was removed in pandas 1.0.  The frame keeps the default
    # integer index from read_csv, so label-based .loc addresses the same
    # cells and likewise creates columns that do not exist yet.
    for index, row in df.iterrows():
        # processing suf data
        print("Processing row " + str(index) + "...")
        admission_date = row["Admission_Date"]
        admitting_service = row["Admitting_Service"]

        df.loc[index, "race2"] = sf.standardize_race(row["Race"])
        df.loc[index, "pay_grp"] = sf.standardize_pay_group(row["Primary_Payer"])
        df.loc[index, "Admitting_type"] = sf.standardize_admitting_type(admitting_service)
        df.loc[index, "Admission_Source"] = sf.standardize_admission_source(row["Admission_Source"])
        df.loc[index, "admit_day1"] = sf.day(admission_date)
        df.loc[index, "admit_mth"] = sf.month(admission_date)
        df.loc[index, "Year_of_admission"] = sf.year(admission_date)
        # Read the day back from the frame (not from sf.day directly) so
        # isWeekend receives the value exactly as stored, as before.
        df.loc[index, "weekend_adm"] = sf.isWeekend(df.loc[index, "admit_day1"])
        df.loc[index, "emergent"] = sf.standardize_admission_type(row["Admission_Type"])
        df.loc[index, "pr1c"] = sf.standardize_proc_code(row["Icd_9_Proc_1"])
        # The former write of the procedure code into a misspelled
        # "Count_of_Diagonosis_Codes" column was a copy/paste leftover that
        # never reached the output; it has been removed.
        df.loc[index, "PEDS"] = sf.mark_as_ped(str(row["Discharge_Service"]), str(admitting_service))
        df.loc[index, "service1"] = sf.standardize_service(
            str(admitting_service),
            str(row["Discharge_Service"]),
            str(row["Icd_9_Proc_1"]),
            str(row["Icd_9_Proc_2"]),
        )

        # Getting income and zip level information with race and zip
        zipinfo = zz.get_zip_Info(str(row["zip5"]), str(row["Race"]), zipdf, zipmap)
        for column, key in zip_fields:
            df.loc[index, column] = zipinfo[key]

        # getting info from maps; accounts without a provider get "None"
        try:
            df.loc[index, "attend_doc"] = provider[row["acc"]]
        except (ValueError, KeyError):
            df.loc[index, "attend_doc"] = "None"

        dx_codes = [str(row[column]) for column in comorbidity_dx_cols]

        # cci calculation
        conditions = cc.CharlsonICD9CM(dx_codes)
        for column, slot in charlson_slots:
            df.loc[index, column] = conditions[slot]

        # comorbidity Macro2 related information (fresh list per call, as
        # the original passed two separate lists)
        conditions2 = cc2.comorb2_condition(list(dx_codes), str(row["DRG"]))
        for column in comorb2_fields:
            df.loc[index, column] = conditions2[column]

        # adding information about medications
        med = med_results[row["acc"]]
        for column in med_fields:
            df.loc[index, column] = med[column]

        # adding lab information
        lab = lab_results[row["acc"]]
        for column, key in lab_fields:
            df.loc[index, column] = lab[key]

    print(len(df))
    print(df[df["PEDS"] == 1])
    df = df[df["PEDS"] == 0]
    print(len(df))

    # Keep only the output columns, in their final order.  This also discards
    # the raw helper columns (Race, Primary_Payer, Admission_Date, the
    # procedure and DX codes, Discharge_Service, DRG, PEDS), so the earlier
    # in-place drops on the filtered slice are no longer needed.
    df = df[
        [
            "acc",
            "age",
            "Gender",
            "race2",
            "zip5",
            "pay_grp",
            "County",
            "rural",
            "total",
            "Med_inc",
            "prop_black",
            "prop_hisp",
            "Prop_pov",
            "zipdist2",
            "admit_day1",
            "admit_mth",
            "Year_of_admission",
            "weekend_adm",
            "attend_doc",
            "Admission_Source",
            "Admitting_Service",
            "Admitting_type",
            "emergent",
            "pr1_day",
            "service1",
            "pr1c",
            "cci",
            "NDX",
            "MDC",
            "imi",
            "ichf",
            "ipvd",
            "icvd",
            "icpd",
            "liverd",
            "diabetes",
            "icancer",
            "imcancer",
            "cancer",
            "VALVE",
            "HYPOTHY",
            "COAG",
            "OBESE",
            "WGHTLOSS",
            "LYTES",
            "alc_drug",
            "anemia",
            "DEPRESS",
            "HTN_C",
            "PARA",
            "NEURO",
            "no_meds_on_adm",
            "aminog_adm",
            "bicarb_adm",
            "diuret_adm",
            "steroi_adm",
            "vanco_adm",
            "ace_adm",
            "nsaids_adm",
            "asa_adm",
            "antiemetic_adm",
            "betablockers_adm",
            "statin_adm",
            "inot_pres_adm",
            "min_HGB",
            "max_PROTUR_gr2",
            "max_HGBUR_gr",
            "max_GLUURN_gr",
            "count_HGBn",
            "count_PROTURn",
            "CKD_corr",
            "esrd_corr",
            "eGFR_epi_new",
            "ratio_firstCr_mdrd",
        ]
    ]

    print("Processing of data complete.Writing to csv...")
    df.to_csv("processeddata.csv", sep=",", index=False)

    print("Writing to csv complete.Final File generated")
    # Single pre-formatted argument prints the same text on Python 2 and 3.
    print("%s seconds process time" % (timer() - t0))

    return df.to_json()