def mergeCancer_County():
    """Return county-level cancer figures expressed per 100,000 residents.

    The frame from ``readIndivCancer_County()`` is joined (``countyCode`` ==
    ``countyFIPS``) with the ``countyFIPS`` and ``totPop`` columns of
    ``popData('county')``.  Every column of the joined frame except the first
    one and the last three is replaced by a ``<name>_Per100k`` column computed
    as ``value * 100000 / totPop``.

    Returns:
        A DataFrame whose leading columns are ``geoid5``, ``countyFIPS``,
        ``countyName`` and ``totPop``, followed by every remaining column
        other than ``countyCode`` in their existing order.
    """
    population = popData('county')
    cancer_counts = readIndivCancer_County()
    merged = pd.merge(
        cancer_counts,
        population[['countyFIPS', 'totPop']],
        left_on='countyCode',
        right_on='countyFIPS',
    )

    # Snapshot the columns to convert before any new column is appended.
    # assumes the first column and the last three are identifiers/population
    # rather than counts -- TODO confirm against readIndivCancer_County().
    count_cols = list(merged.columns.values[1:-3])
    total_pop = merged['totPop']
    for count_col in count_cols:
        merged[count_col + '_Per100k'] = merged[count_col] * 100000 / total_pop
    merged = merged.drop(count_cols, axis=1)

    # '36' is prepended to the county code to build a five-character id
    # (presumably the state FIPS prefix -- verify).
    merged['geoid5'] = '36' + merged['countyFIPS']

    leading = ['geoid5', 'countyFIPS', 'countyName', 'totPop']
    skipped = tuple(leading) + ('countyCode',)
    trailing = [name for name in merged.columns.values if name not in skipped]
    return merged[leading + trailing]
def _fit_tract_model(outcome, exposure, data):
    """Fit ``outcome ~ exposure + covariates`` by OLS with ``cov_type='HC0'``."""
    formula = (outcome + ' ~ ' + exposure +
               ' + pctSmoking + pctElderly + income + higherEd + unemploy')
    return smf.ols(formula=formula, data=data).fit(cov_type='HC0')


def _round3(values):
    """Round every element of a Series to three decimal places."""
    return values.apply(lambda x: round(x, 3))


def main_CensusTract():
    """Build the census-tract data set and run the emission/cancer regressions.

    Steps, in order:
      1. Load air emissions, cancer rates, smoking, tract population and
         cancer-risk tables through the file's reader helpers.
      2. Merge them into one frame (cancer rates left-joined to emissions,
         then inner-joined to the remaining tables).
      3. Write the correlation table of the merged frame to
         ``data/CorrelationTable/censusTract_correlationTable.csv``.
      4. For every chemical column, regress each cancer rate on it plus the
         fixed covariates and write the per-model statistics to
         ``data/Regression/<chemical>.csv`` (one table per cancer, appended
         to the same file).
      5. Print the summary of one fixed test model and pass the correlation
         table to ``hm``.

    Returns:
        The merged DataFrame used for the regressions.
    """
    # Load inputs.
    airEmissions = read_airEmissions_CensusTract()
    allCancer = mergeCancer_Tract()
    smoking = readSmoking()
    acsTract = popData('tract')
    cancerRisk = read_cancerRisk_CensusTract()

    # Join air emission data with cancer rates data.
    data_merged = pd.merge(allCancer, airEmissions, how='left',
                           left_on='geoid11', right_on='geoid')
    # To avoid losing those tracts in model with smoking and demographic data
    # but no chemical releases.
    data_merged.fillna(0, inplace=True)
    # Keyword ``axis`` instead of a positional 1: newer pandas rejects the
    # positional form.
    data_merged = data_merged.drop('geoid', axis=1)
    data_merged['countyCode'] = data_merged['tractFIPS'].str[:3]
    data_merged = pd.merge(data_merged, smoking,
                           left_on='countyCode', right_on='cCode')
    data_merged = pd.merge(data_merged, acsTract,
                           left_on='geoid11', right_on='Geo_FIPS')
    data_merged = pd.merge(data_merged, cancerRisk,
                           left_on='Geo_FIPS', right_on='GEOID')

    # Produce correlation table.
    correlation_table = data_merged.corr()
    correlation_table.to_csv(
        'data/CorrelationTable/censusTract_correlationTable.csv')

    cancer_list = [
        'observed_Bladder_Per100k', 'observed_Bone_Per100k',
        'observed_Brain_Per100k', 'observed_Breast_Per100k',
        'observed_Colorectal_Per100k', 'observed_Esophagus_Per100k',
        'observed_Kidney_Per100k', 'observed_Larynx_Per100k',
        'observed_Leukemia_Per100k', 'observed_Liver_Per100k',
        'observed_Lung_Per100k', 'observed_Mesothelioma_Per100k',
        'observed_NHL_Per100k', 'observed_Nasal_Per100k',
        'observed_Oral_Per100k', 'observed_Other_Per100k',
        'observed_Ovary_Per100k', 'observed_Pancreas_Per100k',
        'observed_Prostate_Per100k', 'observed_Soft_Tissue_Per100k',
        'observed_Stomach_Per100k', 'observed_Testis_Per100k',
        'observed_Thyroid_Per100k', 'observed_Uterus_Per100k',
        'observed_Total_Per100k',
    ]
    chemical_list = [
        'n_5_1_fugitive_air', 'n_5_2_stack_air', 'airTotal',
        'n_5_1_fugitive_air_benzene', 'n_5_2_stack_air_benzene',
        'benzeneTotal',
        'n_5_1_fugitive_air_toluene', 'n_5_2_stack_air_toluene',
        'tolueneTotal',
        'n_5_1_fugitive_air_ethylbenzene', 'n_5_2_stack_air_ethylbenzene',
        'ethylbenzeneTotal',
        'n_5_1_fugitive_air_xylene', 'n_5_2_stack_air_xylene', 'xyleneTotal',
        'n_5_1_fugitive_air_formaldehyde', 'n_5_2_stack_air_formaldehyde',
        'formaldehydeTotal',
        'BTEX_fugitive', 'BTEX_stack', 'BTEX_total',
        'n_5_1_fugitive_air_dioxin', 'n_5_2_stack_air_dioxin', 'dioxinTotal',
    ]

    for chemical in chemical_list:
        with open('data/Regression/' + chemical + '.csv', 'w') as f:
            for cancer in cancer_list:
                mod = _fit_tract_model(cancer, chemical, data_merged)
                result_df = pd.DataFrame({
                    'Cancer': cancer,
                    'Coefficient': _round3(mod.params),
                    'p-Value': _round3(mod.pvalues),
                    # Rounded like the other statistics; an integer cast
                    # would report every standard error below 1 as 0.
                    'Std. Error': _round3(mod.bse),
                    'Adj. R': round(mod.rsquared_adj, 3)})

                # The air emission term is the first regressor in the formula,
                # so it sits at position 1 (position 0 is expected to be the
                # intercept).  ``.iloc`` makes the positional lookup explicit.
                emission_coef = mod.params.iloc[1]
                if emission_coef > 1.0:
                    print('Cancer and pollutant combination with Coefficient > 1:')
                    print('%s %s %s %s' % (emission_coef, mod.pvalues.iloc[1],
                                           cancer, chemical))
                result_df.to_csv(f)

    # Test model.
    mod = _fit_tract_model('observed_Esophagus_Per100k',
                           'n_5_1_fugitive_air_dioxin', data_merged)
    print(mod.summary())

    # Correlation table heat map.
    hm(correlation_table)
    return data_merged