def get_household_joint_dist_for_geography(self, ind):
    """Return the household PUMS records and joint distribution for a zone.

    The PUMA is looked up in ``self.tazToPUMA2010`` from ``ind.SFTAZ``.
    Results are cached per PUMA in ``self.h_pums`` and
    ``self.jd_households``, so each PUMA is only processed once.

    Args:
        ind: Record exposing an ``SFTAZ`` attribute that identifies the zone.

    Returns:
        Tuple ``(h_pums, jd_households)`` as produced by
        ``cat.joint_distribution`` for the zone's PUMA.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person records it also returns are not used here)
    h_pums, _p_pums = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # filter to housing units (TYPE == 1) with number of persons > 0.
    # Take an explicit copy so the column assignments below work on an
    # independent frame and do not raise pandas' SettingWithCopy warning.
    h_pums = h_pums[h_pums['NP'] > 0]
    h_pums = h_pums[h_pums['TYPE'] == 1].copy()
    print("Filtered to %d households from %d originally" % (len(h_pums), orig_len))

    # Household income
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (0.000001 * h_pums['ADJINC'])
    # 0.54 rescales 2012 dollars to 1989 dollars (per the column names)
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0  # in thousands of dollars

    # clamp income to the range [0, 255]
    h_pums.loc[h_pums['hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
    h_pums.loc[h_pums['hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will be available
    def hhsize_cat(r):
        # NP = number of persons; anything that is not exactly 2, 3, 4
        # or >= 5 falls through to the one-person category
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        return "1"

    def income_cat(r):
        # hhinc is in thousands of dollars (see above)
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family. What about non-family households?
        # NOTE(review): 'workers' is presumably set by SFCTAStarter.get_pums -- confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): 'hhage' is presumably set by SFCTAStarter.get_pums -- confirm
        if r.hhage < 65 and r.NOC == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.NOC > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        cat.category_combinations(self.hh_controls.columns),
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households
def get_household_joint_dist_for_geography(self, ind):
    """Return the group-quarters PUMS records and joint distribution for a zone.

    The PUMA is looked up in ``self.tazToPUMA2010`` from ``ind.SFTAZ``.
    Results are cached per PUMA in ``self.h_pums`` and
    ``self.jd_households``, so each PUMA is only processed once.  Only
    records with ``TYPE > 2`` are kept, and the category table passed to
    ``cat.joint_distribution`` holds the single category
    ``hhsize_cat == "1"``.

    Args:
        ind: Record exposing an ``SFTAZ`` attribute that identifies the zone.

    Returns:
        Tuple ``(h_pums, jd_households)`` as produced by
        ``cat.joint_distribution`` for the zone's PUMA.

    Raises:
        AssertionError: If any retained record has ``NP != 1``.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person records it also returns are not used here)
    h_pums, _p_pums = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # Don't bother filter number of persons -- this should happen with TYPE filter
    # Only Non-Institutional Group Quarters.
    # Take an explicit copy so the column assignments below work on an
    # independent frame and do not raise pandas' SettingWithCopy warning.
    h_pums = h_pums[h_pums['TYPE'] > 2].copy()
    print("Filtered to %d GQ 'households' from %d originally" % (len(h_pums), orig_len))

    # sanity check: the TYPE filter is expected to leave one-person records only
    np_bad = (h_pums.NP != 1)
    assert np_bad.sum() == 0, "group quarters records must each have NP == 1"

    # Group quarters income -- use PINCP
    h_pums.loc[h_pums['PINCP'].isnull(), 'PINCP'] = 0.0  # no null
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (0.000001 * h_pums['ADJINC'])
    # 0.54 rescales 2012 dollars to 1989 dollars (per the column names)
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0  # in thousands of dollars

    # clamp income to the range [0, 255]
    h_pums.loc[h_pums['hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
    h_pums.loc[h_pums['hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will be available
    def hhsize_cat(r):
        # NP = number of persons; anything that is not exactly 2, 3, 4
        # or >= 5 falls through to the one-person category
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        return "1"

    def income_cat(r):
        # hhinc is in thousands of dollars (see above)
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family. What about non-family households?
        # NOTE(review): 'workers' is presumably set by SFCTAStarter.get_pums -- confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): 'hhage' and 'gqchild' are presumably set by
        # SFCTAStarter.get_pums -- confirm
        if r.hhage < 65 and r.gqchild == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.gqchild > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    # single-category table: every record is a one-person "household"
    category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
    category_df.set_index(['hhsize_cat'], inplace=True)

    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        category_df,
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households
def get_household_joint_dist_for_geography(self, ind):
    """Return the household PUMS records and joint distribution for a zone.

    The PUMA is looked up in ``self.tazToPUMA2010`` from ``ind.SFTAZ``.
    Results are cached per PUMA in ``self.h_pums`` and
    ``self.jd_households``, so each PUMA is only processed once.

    Args:
        ind: Record exposing an ``SFTAZ`` attribute that identifies the zone.

    Returns:
        Tuple ``(h_pums, jd_households)`` as produced by
        ``cat.joint_distribution`` for the zone's PUMA.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person records it also returns are not used here)
    h_pums, _p_pums = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # filter to housing units (TYPE == 1) with number of persons > 0.
    # Take an explicit copy so the column assignments below work on an
    # independent frame and do not raise pandas' SettingWithCopy warning.
    h_pums = h_pums[h_pums['NP'] > 0]
    h_pums = h_pums[h_pums['TYPE'] == 1].copy()
    print("Filtered to %d households from %d originally" % (len(h_pums), orig_len))

    # Household income
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (0.000001 * h_pums['ADJINC'])
    # 0.54 rescales 2012 dollars to 1989 dollars (per the column names)
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0  # in thousands of dollars

    # clamp income to the range [0, 255]
    h_pums.loc[h_pums['hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
    h_pums.loc[h_pums['hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will be available
    def hhsize_cat(r):
        # NP = number of persons; anything that is not exactly 2, 3, 4
        # or >= 5 falls through to the one-person category
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        return "1"

    def income_cat(r):
        # hhinc is in thousands of dollars (see above)
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family. What about non-family households?
        # NOTE(review): 'workers' is presumably set by SFCTAStarter.get_pums -- confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): 'hhage' is presumably set by SFCTAStarter.get_pums -- confirm
        if r.hhage < 65 and r.NOC == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.NOC > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        cat.category_combinations(self.hh_controls.columns),
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households
def get_household_joint_dist_for_geography(self, ind):
    """Return the group-quarters PUMS records and joint distribution for a zone.

    The PUMA is looked up in ``self.tazToPUMA2010`` from ``ind.SFTAZ``.
    Results are cached per PUMA in ``self.h_pums`` and
    ``self.jd_households``, so each PUMA is only processed once.  Only
    records with ``TYPE > 2`` are kept, and the category table passed to
    ``cat.joint_distribution`` holds the single category
    ``hhsize_cat == "1"``.

    Args:
        ind: Record exposing an ``SFTAZ`` attribute that identifies the zone.

    Returns:
        Tuple ``(h_pums, jd_households)`` as produced by
        ``cat.joint_distribution`` for the zone's PUMA.

    Raises:
        AssertionError: If any retained record has ``NP != 1``.
    """
    # check the cache to see if we've done it already
    puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
    if puma in self.h_pums:
        return self.h_pums[puma], self.jd_households[puma]

    # if not, get the superclass to do a bunch of variable setting
    # (the person records it also returns are not used here)
    h_pums, _p_pums = SFCTAStarter.get_pums(self, puma)
    orig_len = len(h_pums)

    # Don't bother filter number of persons -- this should happen with TYPE filter
    # Only Non-Institutional Group Quarters.
    # Take an explicit copy so the column assignments below work on an
    # independent frame and do not raise pandas' SettingWithCopy warning.
    h_pums = h_pums[h_pums['TYPE'] > 2].copy()
    print("Filtered to %d GQ 'households' from %d originally" % (len(h_pums), orig_len))

    # sanity check: the TYPE filter is expected to leave one-person records only
    np_bad = (h_pums.NP != 1)
    assert np_bad.sum() == 0, "group quarters records must each have NP == 1"

    # Group quarters income -- use PINCP
    h_pums.loc[h_pums['PINCP'].isnull(), 'PINCP'] = 0.0  # no null
    # ADJINC has 6 implied decimal places
    h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (0.000001 * h_pums['ADJINC'])
    # 0.54 rescales 2012 dollars to 1989 dollars (per the column names)
    h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']
    h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0  # in thousands of dollars

    # clamp income to the range [0, 255]
    h_pums.loc[h_pums['hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
    h_pums.loc[h_pums['hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

    # For the following, r is a pandas.Series
    # It's basically a row from h_pums, so any variables defined above will be available
    def hhsize_cat(r):
        # NP = number of persons; anything that is not exactly 2, 3, 4
        # or >= 5 falls through to the one-person category
        if r.NP >= 5:
            return "5+"
        elif r.NP == 4:
            return "4"
        elif r.NP == 3:
            return "3"
        elif r.NP == 2:
            return "2"
        return "1"

    def income_cat(r):
        # hhinc is in thousands of dollars (see above)
        if r.hhinc < 25.0:
            return "0-25k"
        elif r.hhinc < 45.0:
            return "25-45k"
        elif r.hhinc < 75.0:
            return "45-75k"
        else:
            return "75k+"

    def workers_cat(r):
        # hmm... WIF = Workers in Family. What about non-family households?
        # NOTE(review): 'workers' is presumably set by SFCTAStarter.get_pums -- confirm
        if r.workers >= 3:
            return "3+"
        elif r.workers == 2:
            return "2"
        elif r.workers == 1:
            return "1"
        return "0"

    def htype_cat(r):
        # NOTE(review): 'hhage' and 'gqchild' are presumably set by
        # SFCTAStarter.get_pums -- confirm
        if r.hhage < 65 and r.gqchild == 0:
            return "HAGE1K0"
        elif r.hhage < 65 and r.gqchild > 0:
            return "HAGE1K1"
        else:
            return "HAGE65KALL"

    # single-category table: every record is a one-person "household"
    category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
    category_df.set_index(['hhsize_cat'], inplace=True)

    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        category_df,
        {
            "hhsize_cat": hhsize_cat,
            "income_cat": income_cat,
            "workers_cat": workers_cat,
            "htype_cat": htype_cat,
        },
    )

    # cache them
    self.h_pums[puma] = h_pums
    self.jd_households[puma] = jd_households
    return h_pums, jd_households