def get_person_joint_dist_for_geography(self, ind):
    """Return the person PUMS records and joint distribution for ``ind``.

    ``ind`` must expose ``state``, ``county`` and ``tract``.  These are mapped
    to a PUMA with ``self.c.tract_to_puma`` and the person PUMS for that PUMA
    is fetched with ``self.c.download_population_pums``.

    Returns the ``(p_pums, jd_persons)`` pair unpacked from
    ``cat.joint_distribution``.
    """
    census = self.c
    puma_id = census.tract_to_puma(ind.state, ind.county, ind.tract)
    # the download is cached, so repeated calls do not fetch again
    person_records = census.download_population_pums(ind.state, puma_id)

    # HS: the stock age/race/sex categorisers (AGEP, RAC1P, SEX) are disabled
    # in this variant; only an AgeGrp-based "age" category is produced.
    age_groups = ((1, "category 1"), (2, "category 2"))

    def age_cat(r):
        for code, label in age_groups:
            if r.AgeGrp == code:
                return label
        # no branch matches any other AgeGrp value
        return None

    combos = cat.category_combinations(self.p_acs_cat.columns)
    person_records, joint_dist = cat.joint_distribution(
        person_records, combos, {"age": age_cat})
    return person_records, joint_dist
def get_person_joint_dist_for_geography(self, ind):
    """Return group-quarters person PUMS records and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.p_pums`` and
    ``self.jd_persons``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(p_pums, jd_persons)`` as unpacked from ``cat.joint_distribution``.

    Raises:
        AssertionError: if any retained person record has ``RELP < 16``.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.p_pums:
        return self.p_pums[puma], self.jd_persons[puma]

    # this is cached so won't download more than once
    p_pums = self.c.download_population_pums(self.state, puma)
    h_pums = self.c.download_household_pums(self.state, puma)
    # add some household fields
    h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

    orig_len = len(p_pums)
    # no ``on=`` given, so pandas joins on every column name the two frames
    # share (expected to be just 'serialno' -- TODO confirm)
    p_pums = p_pums.merge(h_pums, how='left')
    # Only Group Quarters
    p_pums = p_pums.loc[p_pums['TYPE'] > 2]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d GQ persons from %d originally"
          % (len(p_pums), orig_len))
    # sanity check: no retained record may have RELP below 16
    assert len(p_pums.loc[p_pums.RELP < 16]) == 0

    def gqage_cat(r):
        if r.AGEP <= 64:
            return "0-64"
        return "65+"

    def gqworker_cat(r):
        # NOTE(review): ``employ`` is not created in this function --
        # presumably supplied by the PUMS download; confirm
        if r.employ == 5:
            return "0"
        return "1"

    p_pums, jd_persons = cat.joint_distribution(
        p_pums,
        cat.category_combinations(self.person_controls.columns),
        {"gqage_cat": gqage_cat, "gqworker_cat": gqworker_cat},
    )

    # cache them
    self.p_pums[puma] = p_pums
    self.jd_persons[puma] = jd_persons
    return p_pums, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Return group-quarters person PUMS records and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.p_pums`` and
    ``self.jd_persons``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(p_pums, jd_persons)`` as unpacked from ``cat.joint_distribution``.

    Raises:
        AssertionError: if any retained person record has ``RELP < 16``.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.p_pums:
        return self.p_pums[puma], self.jd_persons[puma]

    # this is cached so won't download more than once
    p_pums = self.c.download_population_pums(self.state, puma)
    h_pums = self.c.download_household_pums(self.state, puma)
    # add some household fields
    h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

    orig_len = len(p_pums)
    # no ``on=`` given, so pandas joins on every column name the two frames
    # share (expected to be just 'serialno' -- TODO confirm)
    p_pums = p_pums.merge(h_pums, how='left')
    # Only Group Quarters
    p_pums = p_pums.loc[p_pums['TYPE'] > 2]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d GQ persons from %d originally"
          % (len(p_pums), orig_len))
    # sanity check: no retained record may have RELP below 16
    assert len(p_pums.loc[p_pums.RELP < 16]) == 0

    def gqage_cat(r):
        if r.AGEP <= 64:
            return "0-64"
        return "65+"

    def gqworker_cat(r):
        # NOTE(review): ``employ`` is not created in this function --
        # presumably supplied by the PUMS download; confirm
        if r.employ == 5:
            return "0"
        return "1"

    p_pums, jd_persons = cat.joint_distribution(
        p_pums,
        cat.category_combinations(self.person_controls.columns),
        {"gqage_cat": gqage_cat, "gqworker_cat": gqworker_cat},
    )

    # cache them
    self.p_pums[puma] = p_pums
    self.jd_persons[puma] = jd_persons
    return p_pums, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Return person PUMS records with ``TYPE == 1`` and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.p_pums`` and
    ``self.jd_persons``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(p_pums, jd_persons)`` as unpacked from ``cat.joint_distribution``.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.p_pums:
        return self.p_pums[puma], self.jd_persons[puma]

    # this is cached so won't download more than once
    p_pums = self.c.download_population_pums(self.state, puma)
    h_pums = self.c.download_household_pums(self.state, puma)
    # add some household fields
    h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

    orig_len = len(p_pums)
    # no ``on=`` given, so pandas joins on every column name the two frames
    # share (expected to be just 'serialno' -- TODO confirm)
    p_pums = p_pums.merge(h_pums, how='left')
    # keep only persons whose household record has TYPE 1
    p_pums = p_pums.loc[p_pums['TYPE'] == 1]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d persons from %d originally"
          % (len(p_pums), orig_len))

    def age_cat(r):
        if r.AGEP <= 4:
            return "0-4"
        elif r.AGEP <= 19:
            return "5-19"
        elif r.AGEP <= 44:
            return "20-44"
        elif r.AGEP <= 64:
            return "45-64"
        return "65+"

    p_pums, jd_persons = cat.joint_distribution(
        p_pums,
        cat.category_combinations(self.person_controls.columns),
        {"age_cat": age_cat},
    )

    # cache them
    self.p_pums[puma] = p_pums
    self.jd_persons[puma] = jd_persons
    return p_pums, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Return person PUMS records with ``TYPE == 1`` and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.p_pums`` and
    ``self.jd_persons``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(p_pums, jd_persons)`` as unpacked from ``cat.joint_distribution``.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.p_pums:
        return self.p_pums[puma], self.jd_persons[puma]

    # this is cached so won't download more than once
    p_pums = self.c.download_population_pums(self.state, puma)
    h_pums = self.c.download_household_pums(self.state, puma)
    # add some household fields
    h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

    orig_len = len(p_pums)
    # no ``on=`` given, so pandas joins on every column name the two frames
    # share (expected to be just 'serialno' -- TODO confirm)
    p_pums = p_pums.merge(h_pums, how='left')
    # keep only persons whose household record has TYPE 1
    p_pums = p_pums.loc[p_pums['TYPE'] == 1]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d persons from %d originally"
          % (len(p_pums), orig_len))

    def age_cat(r):
        if r.AGEP <= 4:
            return "0-4"
        elif r.AGEP <= 19:
            return "5-19"
        elif r.AGEP <= 44:
            return "20-44"
        elif r.AGEP <= 64:
            return "45-64"
        return "65+"

    p_pums, jd_persons = cat.joint_distribution(
        p_pums,
        cat.category_combinations(self.person_controls.columns),
        {"age_cat": age_cat},
    )

    # cache them
    self.p_pums[puma] = p_pums
    self.jd_persons[puma] = jd_persons
    return p_pums, jd_persons
def get_household_joint_dist_for_geography(self, ind):
    """Return housing-unit household PUMS records and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.h_pums`` and
    ``self.jd_households``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(h_pums, jd_households)`` as unpacked from
        ``cat.joint_distribution``.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person frame it also returns is not needed here)
    h_pums, _ = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # filter to housing unit only with number of persons > 0
    h_pums = h_pums[h_pums['NP'] > 0]
    # Only Housing units
    h_pums = h_pums[h_pums['TYPE'] == 1]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d households from %d originally"
          % (len(h_pums), orig_len))

    # Household income
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (0.000001 * h_pums['ADJINC'])
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    # in thousands of dollars
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0
    h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0      # no negatives
    h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will
    # be available
    def hhsize_cat(r):
        # NP = number of persons
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        # NP == 1 and every other value fall through to "1"
        return "1"

    def income_cat(r):
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family.  What about non-family households?
        # NOTE(review): ``workers`` is not created in this function --
        # presumably set by SFCTAStarter.get_pums; confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): ``hhage`` is not created in this function --
        # presumably set by SFCTAStarter.get_pums; confirm
        if r.hhage < 65 and r.NOC == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.NOC > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        cat.category_combinations(self.hh_controls.columns),
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households
def get_household_joint_dist_for_geography(self, ind):
    """Return housing-unit household PUMS records and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.h_pums`` and
    ``self.jd_households``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(h_pums, jd_households)`` as unpacked from
        ``cat.joint_distribution``.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person frame it also returns is not needed here)
    h_pums, _ = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # filter to housing unit only with number of persons > 0
    h_pums = h_pums[h_pums['NP'] > 0]
    # Only Housing units
    h_pums = h_pums[h_pums['TYPE'] == 1]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d households from %d originally"
          % (len(h_pums), orig_len))

    # Household income
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (0.000001 * h_pums['ADJINC'])
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    # in thousands of dollars
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0
    h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0      # no negatives
    h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will
    # be available
    def hhsize_cat(r):
        # NP = number of persons
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        # NP == 1 and every other value fall through to "1"
        return "1"

    def income_cat(r):
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family.  What about non-family households?
        # NOTE(review): ``workers`` is not created in this function --
        # presumably set by SFCTAStarter.get_pums; confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): ``hhage`` is not created in this function --
        # presumably set by SFCTAStarter.get_pums; confirm
        if r.hhage < 65 and r.NOC == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.NOC > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        cat.category_combinations(self.hh_controls.columns),
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households
def test_categorize(acs_data, pums_data):
    """Check ``cat.categorize`` on the ACS fixture, then build a joint distribution.

    ``acs_data`` and ``pums_data`` are supplied by the caller (fixtures).
    """
    # (category, value) -> expression over ACS column names
    acs_expressions = {
        ("population", "total"): "B01001_001E",
        ("age", "19 and under"): (
            "B01001_003E + B01001_004E + B01001_005E + "
            "B01001_006E + B01001_007E + B01001_027E + "
            "B01001_028E + B01001_029E + B01001_030E + "
            "B01001_031E"
        ),
        ("age", "20 to 35"): (
            "B01001_008E + B01001_009E + B01001_010E + "
            "B01001_011E + B01001_012E + B01001_032E + "
            "B01001_033E + B01001_034E + B01001_035E + "
            "B01001_036E"
        ),
        ("age", "35 to 60"): (
            "B01001_013E + B01001_014E + B01001_015E + "
            "B01001_016E + B01001_017E + B01001_037E + "
            "B01001_038E + B01001_039E + B01001_040E + "
            "B01001_041E"
        ),
        ("age", "above 60"): (
            "B01001_018E + B01001_019E + B01001_020E + "
            "B01001_021E + B01001_022E + B01001_023E + "
            "B01001_024E + B01001_025E + B01001_042E + "
            "B01001_043E + B01001_044E + B01001_045E + "
            "B01001_046E + B01001_047E + B01001_048E + "
            "B01001_049E"
        ),
        ("race", "white"): "B02001_002E",
        ("race", "black"): "B02001_003E",
        ("race", "asian"): "B02001_005E",
        ("race", "other"): (
            "B02001_004E + B02001_006E + B02001_007E + "
            "B02001_008E"
        ),
        ("sex", "male"): "B01001_002E",
        ("sex", "female"): "B01001_026E",
    }
    p_acs_cat = cat.categorize(acs_data, acs_expressions, index_cols=['NAME'])

    # shape and labelling of the categorised table
    assert len(p_acs_cat) == 3
    cat_columns = p_acs_cat.columns
    assert len(cat_columns) == 11
    assert len(cat_columns.names) == 2
    assert cat_columns[0][0] == "age"

    assert np.all(cat.sum_accross_category(p_acs_cat) < 2)

    # row-wise categorisers; ``r`` is one PUMS record
    age_bands = ((19, "19 and under"), (35, "20 to 35"), (60, "35 to 60"))
    race_codes = ((1, "white"), (2, "black"), (6, "asian"))

    def age_cat(r):
        for upper, label in age_bands:
            if r.AGEP <= upper:
                return label
        return "above 60"

    def race_cat(r):
        for code, label in race_codes:
            if r.RAC1P == code:
                return label
        return "other"

    def sex_cat(r):
        return "male" if r.SEX == 1 else "female"

    categorisers = {"age": age_cat, "race": race_cat, "sex": sex_cat}
    pums_data, jd_persons = cat.joint_distribution(
        pums_data,
        cat.category_combinations(p_acs_cat.columns),
        categorisers,
    )
def get_household_joint_dist_for_geography(self, ind):
    """Return the household PUMS records and joint distribution for ``ind``.

    ``ind`` must expose ``state``, ``county`` and ``tract``.  These are mapped
    to a PUMA with ``self.c.tract_to_puma`` and the household PUMS for that
    PUMA is fetched with ``self.c.download_household_pums``.

    Returns the ``(h_pums, jd_households)`` pair unpacked from
    ``cat.joint_distribution``.
    """
    census = self.c
    puma_id = census.tract_to_puma(ind.state, ind.county, ind.tract)
    # the download is cached, so repeated calls do not fetch again
    household_records = census.download_household_pums(ind.state, puma_id)

    # HS: the stock cars/children/income/workers categorisers (VEH, NOC,
    # FINCP, WIF) are disabled in this variant; only household size (HHSz)
    # is categorised.
    # functions defining how category values are computed from the PUMA data
    exact_sizes = (
        (1, "one"),
        (2, "two"),
        (3, "three"),
        (4, "four"),
        (5, "five"),
        (6, "six"),
    )

    def HHsize_cat(r):
        for size, label in exact_sizes:
            if r.HHSz == size:
                return label
        if r.HHSz > 6:
            return "seven+"
        # no branch matches any other HHSz value
        return None

    combos = cat.category_combinations(self.h_acs_cat.columns)
    household_records, joint_dist = cat.joint_distribution(
        household_records, combos, {"HHsize": HHsize_cat})
    return household_records, joint_dist
def get_household_joint_dist_for_geography(self, ind):
    """Return group-quarters "household" PUMS records and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.h_pums`` and
    ``self.jd_households``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(h_pums, jd_households)`` as unpacked from
        ``cat.joint_distribution``.

    Raises:
        AssertionError: if any retained record has ``NP != 1``.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person frame it also returns is not needed here)
    h_pums, _ = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # Don't bother filter number of persons -- this should happen with
    # TYPE filter
    # Only Non-Institutional Group Quarters
    h_pums = h_pums[h_pums['TYPE'] > 2]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d GQ 'households' from %d originally"
          % (len(h_pums), orig_len))
    # every retained record must describe exactly one person
    np_bad = (h_pums.NP != 1)
    assert np_bad.sum() == 0

    # Group quarters income -- use PINCP
    h_pums.loc[pd.isnull(h_pums.loc[:, 'PINCP']), 'PINCP'] = 0.0  # no null
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (0.000001 * h_pums['ADJINC'])
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    # in thousands of dollars
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0
    h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0      # no negatives
    h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will
    # be available
    def hhsize_cat(r):
        # NP = number of persons
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        # NP == 1 and every other value fall through to "1"
        return "1"

    def income_cat(r):
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family.  What about non-family households?
        # NOTE(review): ``workers`` is not created in this function --
        # presumably set by SFCTAStarter.get_pums; confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): ``hhage`` and ``gqchild`` are not created in this
        # function -- presumably set by SFCTAStarter.get_pums; confirm
        if r.hhage < 65 and r.gqchild == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.gqchild > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    # a single category: hhsize_cat "1" with cat_id 0
    category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
    category_df.set_index(['hhsize_cat'], inplace=True)
    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        category_df,
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households
# do the synthesis one PUMA at a time all_households=pd.DataFrame() all_persons=pd.DataFrame() for puma in all_pumas: print(puma) # get the block groups in this puma this_puma_ind=[i for i in range(len(h_acs)) if h_acs.iloc[i]['puma']==puma] #download the pums data p_pums=c.download_population_pums(state, puma10=puma, usecols=p_pums_cols) h_pums=c.download_household_pums(state, puma10=puma, usecols=h_pums_cols) #get the joint distribution of pums data h_pums, jd_households = cat.joint_distribution(h_pums, cat.category_combinations(h_acs_cat.columns), {"cars": cars_cat, "children": children_cat, "income": income_cat, "workers": workers_cat, "tenure": tenure_cat}) p_pums, jd_persons = cat.joint_distribution( p_pums, cat.category_combinations(p_acs_cat.columns), {"age": age_cat, "sex": sex_cat, "race": race_cat} ) # simulate households and persons for each person in each block-group of this PUMA for bg_ind in this_puma_ind: zone_name=h_acs_cat.index[bg_ind] print(zone_name) geoid=state+ h_acs.loc[zone_name,'county']+h_acs.loc[zone_name,'tract']+h_acs.loc[zone_name,'block group'] print(geoid) best_households, best_people, people_chisq, people_p= synthesizer.synthesize(h_acs_cat.iloc[bg_ind].transpose(), p_acs_cat.iloc[bg_ind].transpose(), jd_households, jd_persons, h_pums, p_pums, marginal_zero_sub=.01, jd_zero_sub=.001, hh_index_start=0) # add the puma and bg id to each HH
def get_household_joint_dist_for_geography(self, ind):
    """Return group-quarters "household" PUMS records and their joint distribution.

    The PUMA is read from ``self.tazToPUMA2010`` at row ``ind.SFTAZ``, column
    ``'PUMA2010'``.  Results are memoised per PUMA in ``self.h_pums`` and
    ``self.jd_households``; a cached PUMA returns immediately.

    Args:
        ind: object with an ``SFTAZ`` attribute.

    Returns:
        ``(h_pums, jd_households)`` as unpacked from
        ``cat.joint_distribution``.

    Raises:
        AssertionError: if any retained record has ``NP != 1``.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    # test the dict directly: ``.keys()`` builds a throwaway list on Python 2
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person frame it also returns is not needed here)
    h_pums, _ = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # Don't bother filter number of persons -- this should happen with
    # TYPE filter
    # Only Non-Institutional Group Quarters
    h_pums = h_pums[h_pums['TYPE'] > 2]
    # parenthesised single-argument form prints identically on Python 2 and 3
    print("Filtered to %d GQ 'households' from %d originally"
          % (len(h_pums), orig_len))
    # every retained record must describe exactly one person
    np_bad = (h_pums.NP != 1)
    assert np_bad.sum() == 0

    # Group quarters income -- use PINCP
    h_pums.loc[pd.isnull(h_pums.loc[:, 'PINCP']), 'PINCP'] = 0.0  # no null
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (0.000001 * h_pums['ADJINC'])
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    # in thousands of dollars
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0
    h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0      # no negatives
    h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will
    # be available
    def hhsize_cat(r):
        # NP = number of persons
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        # NP == 1 and every other value fall through to "1"
        return "1"

    def income_cat(r):
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family.  What about non-family households?
        # NOTE(review): ``workers`` is not created in this function --
        # presumably set by SFCTAStarter.get_pums; confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): ``hhage`` and ``gqchild`` are not created in this
        # function -- presumably set by SFCTAStarter.get_pums; confirm
        if r.hhage < 65 and r.gqchild == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.gqchild > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    # a single category: hhsize_cat "1" with cat_id 0
    category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
    category_df.set_index(['hhsize_cat'], inplace=True)
    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        category_df,
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households