def get_person_joint_dist_for_geography(self, ind):
    """Build the person PUMS sample and joint distribution for ``ind``.

    ``ind`` must expose ``state``, ``county`` and ``tract``; they are used
    to look up the PUMA whose person PUMS records are downloaded.

    Returns the ``(p_pums, jd_persons)`` pair produced by
    ``cat.joint_distribution``.
    """
    census = self.c
    puma = census.tract_to_puma(ind.state, ind.county, ind.tract)

    # Original note: the download is cached, so it is not fetched twice.
    persons = census.download_population_pums(ind.state, puma)

    # HS: the earlier AGEP / RAC1P / SEX based categorizers (age, race and
    # sex) were retired; only the AgeGrp based "age" category is computed.
    def age_cat(r):
        for group in (1, 2):
            if r.AgeGrp == group:
                return "category %d" % group
        # Any other AgeGrp value is left uncategorized.
        return None

    combinations = cat.category_combinations(self.p_acs_cat.columns)
    persons, jd_persons = cat.joint_distribution(
        persons, combinations, {"age": age_cat})
    return persons, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Return the group-quarters person sample and joint distribution.

    The PUMA is looked up from ``ind.SFTAZ`` in ``self.tazToPUMA2010``.
    Results are memoized per PUMA in ``self.p_pums`` / ``self.jd_persons``.

    Returns the ``(p_pums, jd_persons)`` pair for that PUMA.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.p_pums:
        return self.p_pums[puma], self.jd_persons[puma]

    # Original note: the download is cached, so it is not fetched twice.
    persons = self.c.download_population_pums(self.state, puma)
    households = self.c.download_household_pums(self.state, puma)

    # Add some household fields to every person record.
    hh_fields = households.loc[:, ['serialno', 'TYPE', 'NP']]
    n_downloaded = len(persons)
    persons = persons.merge(hh_fields, how='left')

    # Only Group Quarters
    persons = persons.loc[persons['TYPE'] > 2]
    print("Filtered to %d GQ persons from %d originally"
          % (len(persons), n_downloaded))
    # No remaining record may carry a RELP code below 16.
    assert not (persons.RELP < 16).any()

    def gqage_cat(r):
        return "0-64" if r.AGEP <= 64 else "65+"

    def gqworker_cat(r):
        return "0" if r.employ == 5 else "1"

    mappers = {"gqage_cat": gqage_cat, "gqworker_cat": gqworker_cat}
    persons, jd_persons = cat.joint_distribution(
        persons,
        cat.category_combinations(self.person_controls.columns),
        mappers)

    # cache them
    self.p_pums[puma] = persons
    self.jd_persons[puma] = jd_persons
    return persons, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Group-quarters person PUMS plus joint distribution for a TAZ.

    ``ind.SFTAZ`` selects the PUMA through ``self.tazToPUMA2010``. A PUMA
    that was already processed is served from ``self.p_pums`` and
    ``self.jd_persons``; otherwise both are computed and stored there.

    Returns a ``(p_pums, jd_persons)`` tuple.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    already_done = puma in self.p_pums
    if already_done:
        return self.p_pums[puma], self.jd_persons[puma]

    # Original note: the download is cached, so it is not fetched twice.
    person_sample = self.c.download_population_pums(self.state, puma)
    household_sample = self.c.download_household_pums(self.state, puma)
    household_sample = household_sample.loc[:, ['serialno', 'TYPE', 'NP']]

    # Add some household fields, then keep group quarters only (TYPE > 2).
    original_count = len(person_sample)
    person_sample = person_sample.merge(household_sample, how='left')
    in_group_quarters = person_sample['TYPE'] > 2
    person_sample = person_sample.loc[in_group_quarters]
    message = "Filtered to %d GQ persons from %d originally"
    print(message % (len(person_sample), original_count))

    # Sanity check: no RELP code below 16 may survive the filter.
    low_relp = person_sample.loc[person_sample.RELP < 16]
    assert len(low_relp) == 0

    def gqage_cat(r):
        if r.AGEP > 64:
            return "65+"
        if r.AGEP <= 64:
            return "0-64"
        # Not comparable (e.g. missing): same fall-through as before.
        return "65+"

    def gqworker_cat(r):
        if r.employ == 5:
            return "0"
        return "1"

    category_table = cat.category_combinations(
        self.person_controls.columns)
    person_sample, jd_persons = cat.joint_distribution(
        person_sample,
        category_table,
        {"gqage_cat": gqage_cat, "gqworker_cat": gqworker_cat})

    # cache them
    self.p_pums[puma] = person_sample
    self.jd_persons[puma] = jd_persons
    return person_sample, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Return the person PUMS sample and joint distribution for a TAZ.

    The PUMA comes from ``self.tazToPUMA2010`` keyed by ``ind.SFTAZ``.
    Only persons whose household record has ``TYPE == 1`` are kept, and
    each one is binned into an ``age_cat`` band. Results are memoized per
    PUMA in ``self.p_pums`` / ``self.jd_persons``.

    Returns the ``(p_pums, jd_persons)`` pair.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.p_pums:
        return self.p_pums[puma], self.jd_persons[puma]

    # Original note: the download is cached, so it is not fetched twice.
    persons = self.c.download_population_pums(self.state, puma)
    households = self.c.download_household_pums(self.state, puma)

    # Add some household fields to every person record.
    n_downloaded = len(persons)
    persons = persons.merge(
        households.loc[:, ['serialno', 'TYPE', 'NP']], how='left')
    persons = persons.loc[persons['TYPE'] == 1]
    print("Filtered to %d persons from %d originally"
          % (len(persons), n_downloaded))

    # (inclusive upper age, label); anything beyond the last band is "65+".
    age_bands = ((4, "0-4"), (19, "5-19"), (44, "20-44"), (64, "45-64"))

    def age_cat(r):
        for upper, label in age_bands:
            if r.AGEP <= upper:
                return label
        return "65+"

    persons, jd_persons = cat.joint_distribution(
        persons,
        cat.category_combinations(self.person_controls.columns),
        {"age_cat": age_cat})

    # cache them
    self.p_pums[puma] = persons
    self.jd_persons[puma] = jd_persons
    return persons, jd_persons
def get_person_joint_dist_for_geography(self, ind):
    """Person PUMS sample and joint distribution for the PUMA of a TAZ.

    Looks the PUMA up via ``ind.SFTAZ``, serves a memoized answer when
    there is one, and otherwise downloads the person and household PUMS,
    keeps persons with ``TYPE == 1``, assigns ``age_cat`` and memoizes the
    outcome in ``self.p_pums`` / ``self.jd_persons``.

    Returns a ``(p_pums, jd_persons)`` tuple.
    """
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    cache_hit = puma in self.p_pums
    if cache_hit:
        return self.p_pums[puma], self.jd_persons[puma]

    # Original note: the download is cached, so it is not fetched twice.
    person_sample = self.c.download_population_pums(self.state, puma)
    household_sample = self.c.download_household_pums(self.state, puma)
    household_sample = household_sample.loc[:, ['serialno', 'TYPE', 'NP']]

    # Add some household fields, then filter on the household TYPE.
    original_count = len(person_sample)
    person_sample = person_sample.merge(household_sample, how='left')
    keep = person_sample['TYPE'] == 1
    person_sample = person_sample.loc[keep]
    message = "Filtered to %d persons from %d originally"
    print(message % (len(person_sample), original_count))

    def age_cat(r):
        age = r.AGEP
        if age <= 4:
            return "0-4"
        if age <= 19:
            return "5-19"
        if age <= 44:
            return "20-44"
        if age <= 64:
            return "45-64"
        return "65+"

    category_table = cat.category_combinations(
        self.person_controls.columns)
    person_sample, jd_persons = cat.joint_distribution(
        person_sample, category_table, {"age_cat": age_cat})

    # cache them
    self.p_pums[puma] = person_sample
    self.jd_persons[puma] = jd_persons
    return person_sample, jd_persons
def synth_all_mp(zones, h_pums, p_pums, h_marg, p_marg,
                 h_cat_config, p_cat_config, cores=None,
                 pums_geo_col='PUMA', marginal_zero_sub=1e-5,
                 jd_zero_sub=1e-5, ipf_tolerance=1e-3,
                 ipf_max_iterations=10000, ipu_tolerance=1e-4,
                 ipu_max_iterations=50000, num_draws=100):
    """
    Synthesize households and persons for every zone with a process pool.

    Parameters
    ----------
    zones : pandas.Series
        Must have a unique index. Every ``(index, value)`` pair is handed
        to a worker as ``(zone, sample geography)``.
    h_pums, p_pums : pandas.DataFrame
        Household / person PUMS samples. Both need a ``'serialno'`` column
        and ``h_pums`` also needs the ``pums_geo_col`` column.
    h_marg, p_marg : pandas.DataFrame
        Household / person marginals. Their indexes must be element-wise
        equal and they must not contain nulls. Zeros are replaced by
        ``marginal_zero_sub``.
    h_cat_config, p_cat_config : dict
        ``{category: {marginal: {'pums': <DataFrame.query expression>}}}``
        describing how PUMS rows are binned into each marginal.
    cores : int, optional
        Number of worker processes. When falsy, one fewer than the CPU
        count is used, but never fewer than one.
    pums_geo_col : str
        Column of ``h_pums`` copied into the ``'puma'`` column handed to
        the workers.
    marginal_zero_sub : float
        Replacement for zero cells in both marginal tables.
    jd_zero_sub, ipf_tolerance, ipf_max_iterations, ipu_tolerance,
    ipu_max_iterations, num_draws :
        Forwarded unchanged to ``_synth_a_zone``.

    Returns
    -------
    tuple
        ``(final_hh, final_pers, final_diag)`` data frames on success.
        If compiling the worker output fails, the exception is printed and
        ``(results, str(e), ':(')`` is returned so the raw worker output
        is not lost.

    Raises
    ------
    ValueError
        If the marginals or the classified PUMS tables contain nulls.
    """
    # the list of zones to process
    # this will be a list of tuples in the form (zone, sample geography)
    assert (h_marg.index.values == p_marg.index.values).all()
    assert zones.index.is_unique
    # Materialized so it is a real list on Python 3 as well.
    zone_ids = list(zip(zones.index, zones))

    # classify agents into bins that match the marginal defs
    def classify(cats, agents):
        s_all = {}
        for cat_name, marg_defs in cats.items():
            # Explicit dtype: the labels are strings, and the implicit
            # dtype of a data-less Series differs between pandas versions.
            s = pd.Series(index=agents.index, dtype=object)
            for marg_name, marg_config in marg_defs.items():
                in_marg = agents.query(marg_config['pums']).index.values
                s[in_marg] = marg_name
            s_all[cat_name] = s
        return pd.concat(s_all, axis=1)

    h_pums2 = classify(h_cat_config, h_pums)
    h_cls_cols = h_pums2.columns
    h_pums2['serialno'] = h_pums['serialno']
    h_pums2['puma'] = h_pums[pums_geo_col]

    p_pums2 = classify(p_cat_config, p_pums)
    p_cls_cols = p_pums2.columns
    p_pums2['serialno'] = p_pums['serialno']

    # get a unique ID for each marginal grouping, the IPU will work off of
    # this; person IDs start right after the last household ID
    h_cat_ids = synth_cat.category_combinations(h_marg.columns)
    p_starting_cat_id = h_cat_ids['cat_id'].max() + 1
    p_cat_ids = synth_cat.category_combinations(p_marg.columns)
    p_cat_ids['cat_id'] += p_starting_cat_id
    h_pums2['cat_id'] = broadcast(
        h_cat_ids, h_pums2, left_fk=h_cat_ids.index.names)
    p_pums2['cat_id'] = broadcast(
        p_cat_ids, p_pums2, left_fk=p_cat_ids.index.names)

    # if we have nulls, we have problems
    if h_marg.isnull().any().any():
        raise ValueError('household marginals have nulls')
    if p_marg.isnull().any().any():
        raise ValueError('person marginals have nulls')
    if h_pums2.isnull().any().any():
        raise ValueError('problem formmating household pums')
    if p_pums2.isnull().any().any():
        raise ValueError('problem formmating persons pums')

    # handle zero-cells on marginals
    h_marg = h_marg.replace(0, marginal_zero_sub)
    p_marg = p_marg.replace(0, marginal_zero_sub)

    # set up multiprocessing
    print('handing off to mp...')
    worker = partial(_synth_a_zone,
                     h_pums=h_pums2,
                     p_pums=p_pums2,
                     h_marg=h_marg,
                     p_marg=p_marg,
                     h_cat_ids=h_cat_ids,
                     p_cat_ids=p_cat_ids,
                     jd_zero_sub=jd_zero_sub,
                     ipf_tolerance=ipf_tolerance,
                     ipf_max_iterations=ipf_max_iterations,
                     ipu_tolerance=ipu_tolerance,
                     ipu_max_iterations=ipu_max_iterations,
                     num_draws=num_draws)

    if not cores:
        # cpu_count() - 1 is 0 on a single-core host, and Pool() rejects a
        # process count below 1.
        cores = max(1, multiprocessing.cpu_count() - 1)
    pool = multiprocessing.Pool(cores)
    try:
        results = pool.map(worker, zone_ids)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a zone fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    # results are a list of tuples in the form zone_id, [hh_ids]
    # turn these to a data frame and link to households and persons
    print('compiling...')
    sampled_zone_ids = []
    sampled_hh_ids = []
    diagnostics = []

    try:
        for row in results:
            curr_z = row[0]
            curr_hh_ids = row[1]
            d = row[2]
            d['pbg_id'] = curr_z
            diagnostics.append(d)

            for hh_id in curr_hh_ids:
                sampled_zone_ids.append(curr_z)
                sampled_hh_ids.append(hh_id)

        sampled_df = pd.DataFrame({
            'pbg_id': sampled_zone_ids,
            'serialno': sampled_hh_ids
        })
        sampled_df['household_id'] = sampled_df.index

        # households
        final_hh = pd.merge(
            sampled_df,
            pd.concat([h_pums, h_pums2[h_cls_cols]], axis=1),
            on='serialno')
        final_hh.set_index('household_id', inplace=True)

        # persons
        final_pers = pd.merge(
            sampled_df,
            pd.concat([p_pums, p_pums2[p_cls_cols]], axis=1),
            on='serialno')
        final_pers.index.name = 'person_id'

        # diagnostics
        final_diag = pd.DataFrame(diagnostics).set_index('pbg_id')

        return final_hh, final_pers, final_diag

    except Exception as e:
        # Deliberate best effort: hand back the raw worker output together
        # with the error text instead of discarding finished work.
        print(e)
        return results, str(e), ':('
def get_household_joint_dist_for_geography(self, ind):
    """Return the household PUMS sample and joint distribution for a TAZ.

    The PUMA is read from ``self.tazToPUMA2010`` using ``ind.SFTAZ``.
    Results are memoized per PUMA in ``self.h_pums`` and
    ``self.jd_households``.

    Returns the ``(h_pums, jd_households)`` pair.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    households, _persons = SFCTAStarter.get_pums(self, puma)
    n_original = len(households)

    # filter to housing unit only with number of persons > 0
    households = households[households['NP'] > 0]
    # Only Housing units
    households = households[households['TYPE'] == 1]
    print("Filtered to %d households from %d originally"
          % (len(households), n_original))

    # Household income; ADJINC has 6 implied decimal places
    adjustment = 0.000001 * households['ADJINC']
    households['hhinc_2012dollars'] = households['HINCP'] * adjustment
    households['hhinc_1989dollars'] = (
        0.54 * households['hhinc_2012dollars'])
    # in thousands of dollars
    households['hhinc'] = households['hhinc_1989dollars'] / 1000.0
    negative = households.loc[:, 'hhinc'] < 0
    households.loc[negative, 'hhinc'] = 0.0      # no negatives
    too_high = households.loc[:, 'hhinc'] > 255
    households.loc[too_high, 'hhinc'] = 255.0    # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from the household table, so any variables
    # defined above will be available
    def hhsize_cat(r):
        # NP = number of persons
        if r.NP >= 5:
            return "5+"
        for label in ("4", "3", "2"):
            if r.NP == int(label):
                return label
        # One person, and anything unmatched, lands in "1".
        return "1"

    # (exclusive upper bound in thousands, label)
    income_bands = ((25.0, "0-25k"), (45.0, "25-45k"), (75.0, "45-75k"))

    def income_cat(r):
        for upper, label in income_bands:
            if r.hhinc < upper:
                return label
        return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family. What about non-family households?
        if r.workers >= 3:
            return "3+"
        for label in ("2", "1"):
            if r.workers == int(label):
                return label
        return "0"

    def htype_cat(r):
        if r.hhage < 65:
            if r.NOC == 0:
                return "HAGE1K0"
            if r.NOC > 0:
                return "HAGE1K1"
        return "HAGE65KALL"

    mappers = {
        "hhsize_cat": hhsize_cat,
        "income_cat": income_cat,
        "workers_cat": workers_cat,
        "htype_cat": htype_cat,
    }
    households, jd_households = cat.joint_distribution(
        households,
        cat.category_combinations(self.hh_controls.columns),
        mappers)

    # cache them
    self.h_pums[puma] = households
    self.jd_households[puma] = jd_households
    return households, jd_households
def get_household_joint_dist_for_geography(self, ind):
    """Household PUMS sample and joint distribution for the PUMA of a TAZ.

    ``ind.SFTAZ`` selects the PUMA through ``self.tazToPUMA2010``. A PUMA
    seen before is answered from ``self.h_pums`` / ``self.jd_households``;
    otherwise the sample is filtered, income and category columns are
    derived, and the result is stored in those two caches.

    Returns a ``(h_pums, jd_households)`` tuple.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    cache_hit = puma in self.h_pums
    if cache_hit:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    hh_sample, _unused_persons = SFCTAStarter.get_pums(self, puma)
    original_count = len(hh_sample)

    # filter to housing unit only with number of persons > 0
    occupied = hh_sample['NP'] > 0
    hh_sample = hh_sample[occupied]
    # Only Housing units
    is_housing_unit = hh_sample['TYPE'] == 1
    hh_sample = hh_sample[is_housing_unit]
    message = "Filtered to %d households from %d originally"
    print(message % (len(hh_sample), original_count))

    # Household income
    # ADJINC has 6 implied decimal places
    hh_sample['hhinc_2012dollars'] = (
        hh_sample['HINCP'] * (0.000001 * hh_sample['ADJINC']))
    hh_sample['hhinc_1989dollars'] = 0.54 * hh_sample['hhinc_2012dollars']
    # in thousands of dollars
    hh_sample['hhinc'] = hh_sample['hhinc_1989dollars'] / 1000.0
    # no negatives
    hh_sample.loc[hh_sample.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0
    # max = 255
    hh_sample.loc[hh_sample.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0

    # For the following, r is a pandas.Series
    # It's basically a row from the household sample, so any variables
    # defined above will be available
    def hhsize_cat(r):
        # NP = number of persons
        persons = r.NP
        if persons >= 5:
            return "5+"
        if persons == 4:
            return "4"
        if persons == 3:
            return "3"
        if persons == 2:
            return "2"
        # A single person and every unmatched value share the "1" bin.
        return "1"

    def income_cat(r):
        income = r.hhinc
        if income < 25.0:
            return "0-25k"
        if income < 45.0:
            return "25-45k"
        if income < 75.0:
            return "45-75k"
        return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family. What about non-family households?
        workers = r.workers
        if workers >= 3:
            return "3+"
        if workers == 2:
            return "2"
        if workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        under_65 = r.hhage < 65
        if under_65 and r.NOC == 0:
            return "HAGE1K0"
        if under_65 and r.NOC > 0:
            return "HAGE1K1"
        return "HAGE65KALL"

    category_table = cat.category_combinations(self.hh_controls.columns)
    hh_sample, jd_households = cat.joint_distribution(
        hh_sample,
        category_table,
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        })

    # cache them
    self.h_pums[puma] = hh_sample
    self.jd_households[puma] = jd_households
    return hh_sample, jd_households
def test_categorize(acs_data, pums_data):
    """Check ``cat.categorize`` on the ACS fixture, then run
    ``cat.joint_distribution`` on the PUMS fixture with matching
    age / race / sex categorizers."""

    def cells(table, *spans):
        # "<table>_NNNE" column names for every number in the spans,
        # joined into one " + " sum expression.
        numbers = [n for span in spans for n in span]
        return " + ".join("%s_%03dE" % (table, n) for n in numbers)

    category_spec = {
        ("population", "total"): "B01001_001E",
        ("age", "19 and under"):
            cells("B01001", range(3, 8), range(27, 32)),
        ("age", "20 to 35"):
            cells("B01001", range(8, 13), range(32, 37)),
        ("age", "35 to 60"):
            cells("B01001", range(13, 18), range(37, 42)),
        ("age", "above 60"):
            cells("B01001", range(18, 26), range(42, 50)),
        ("race", "white"): "B02001_002E",
        ("race", "black"): "B02001_003E",
        ("race", "asian"): "B02001_005E",
        ("race", "other"): cells("B02001", (4, 6, 7, 8)),
        ("sex", "male"): "B01001_002E",
        ("sex", "female"): "B01001_026E",
    }
    p_acs_cat = cat.categorize(acs_data, category_spec, index_cols=['NAME'])

    assert len(p_acs_cat) == 3
    assert len(p_acs_cat.columns) == 11
    assert len(p_acs_cat.columns.names) == 2
    assert p_acs_cat.columns[0][0] == "age"

    assert np.all(cat.sum_accross_category(p_acs_cat) < 2)

    # (inclusive upper age, label); older ages fall into "above 60".
    age_bands = ((19, "19 and under"), (35, "20 to 35"), (60, "35 to 60"))
    # (RAC1P code, label); every other code is "other".
    race_codes = ((1, "white"), (2, "black"), (6, "asian"))

    def age_cat(r):
        for upper, label in age_bands:
            if r.AGEP <= upper:
                return label
        return "above 60"

    def race_cat(r):
        for code, label in race_codes:
            if r.RAC1P == code:
                return label
        return "other"

    def sex_cat(r):
        return "male" if r.SEX == 1 else "female"

    combinations = cat.category_combinations(p_acs_cat.columns)
    pums_data, jd_persons = cat.joint_distribution(
        pums_data,
        combinations,
        {"age": age_cat, "race": race_cat, "sex": sex_cat})
def get_household_joint_dist_for_geography(self, ind):
    """Build the household PUMS sample and joint distribution for ``ind``.

    ``ind`` must expose ``state``, ``county`` and ``tract``; they are used
    to look up the PUMA whose household PUMS records are downloaded.

    Returns the ``(h_pums, jd_households)`` pair produced by
    ``cat.joint_distribution``.
    """
    census = self.c
    puma = census.tract_to_puma(ind.state, ind.county, ind.tract)

    # Original note: the download is cached, so it is not fetched twice.
    households = census.download_household_pums(ind.state, puma)

    # HS: the earlier VEH / NOC / FINCP / WIF based categorizers (cars,
    # children, income, workers) were retired; only household size is used.

    # functions defining how category values are computed from the PUMA data
    size_labels = ("one", "two", "three", "four", "five", "six")

    def HHsize_cat(r):
        for size, label in enumerate(size_labels, 1):
            if r.HHSz == size:
                return label
        if r.HHSz > 6:
            return "seven+"
        # Anything else is left uncategorized.
        return None

    combinations = cat.category_combinations(self.h_acs_cat.columns)
    households, jd_households = cat.joint_distribution(
        households, combinations, {"HHsize": HHsize_cat})
    return households, jd_households
# Run the synthesis PUMA by PUMA.
all_households = pd.DataFrame()
all_persons = pd.DataFrame()
for puma in all_pumas:
    print(puma)

    # Row positions of the block groups that belong to this PUMA.
    this_puma_ind = [
        i for i, bg_puma in enumerate(h_acs['puma']) if bg_puma == puma
    ]

    # Download the PUMS records for this PUMA.
    p_pums = c.download_population_pums(
        state, puma10=puma, usecols=p_pums_cols)
    h_pums = c.download_household_pums(
        state, puma10=puma, usecols=h_pums_cols)

    # Joint distributions of the PUMS data over the ACS categories.
    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        cat.category_combinations(h_acs_cat.columns),
        {
            "cars": cars_cat,
            "children": children_cat,
            "income": income_cat,
            "workers": workers_cat,
            "tenure": tenure_cat,
        })
    p_pums, jd_persons = cat.joint_distribution(
        p_pums,
        cat.category_combinations(p_acs_cat.columns),
        {"age": age_cat, "sex": sex_cat, "race": race_cat})

    # Simulate households and persons for each block group of this PUMA.
    for bg_ind in this_puma_ind:
        zone_name = h_acs_cat.index[bg_ind]
        print(zone_name)

        # geoid = state followed by county, tract and block group.
        geoid = state
        for geo_col in ('county', 'tract', 'block group'):
            geoid = geoid + h_acs.loc[zone_name, geo_col]
        print(geoid)

        hh_marginals = h_acs_cat.iloc[bg_ind].transpose()
        person_marginals = p_acs_cat.iloc[bg_ind].transpose()
        best_households, best_people, people_chisq, people_p = (
            synthesizer.synthesize(
                hh_marginals,
                person_marginals,
                jd_households,
                jd_persons,
                h_pums,
                p_pums,
                marginal_zero_sub=.01,
                jd_zero_sub=.001,
                hh_index_start=0))