Python joint_distribution примеры, synthpop.categorizer.joint_distribution Python примеры использования

Пример #1

0

Показать файл

Файл: starter.py Проект: psrc/psrc_synthpop

    def get_person_joint_dist_for_geography(self, ind):
        """Return the person PUMS sample and its joint distribution.

        The PUMA is resolved from ``ind.state``, ``ind.county`` and
        ``ind.tract``; the person PUMS records downloaded for it are then
        categorised on a single "age" dimension.

        Returns the ``(p_pums, jd_persons)`` pair produced by
        ``cat.joint_distribution``.
        """
        puma = self.c.tract_to_puma(ind.state, ind.county, ind.tract)
        # Per the original author's note the download is cached, so repeated
        # calls should not fetch the data again.
        person_sample = self.c.download_population_pums(ind.state, puma)

        # NOTE: earlier AGEP / RAC1P / SEX based categorizers (age, race, sex)
        # were disabled in favour of the single AgeGrp-based one below.
        def age_group_label(row):
            # Only AgeGrp values 1 and 2 are labelled; any other value falls
            # through and yields None.
            for code, label in ((1, "category 1"), (2, "category 2")):
                if row.AgeGrp == code:
                    return label
            return None

        combinations = cat.category_combinations(self.p_acs_cat.columns)
        person_sample, jd_persons = cat.joint_distribution(
            person_sample,
            combinations,
            {"age": age_group_label},
        )
        return person_sample, jd_persons

Пример #2

0

Показать файл

Файл: sfcta_starter_gq.py Проект: bhargavasana/synthpop

    def get_person_joint_dist_for_geography(self, ind):
        """Return group-quarters person records and their joint distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.p_pums`` and
        ``self.jd_persons``. On a cache miss the person PUMS records are
        downloaded, joined to a few household columns, restricted to rows
        with ``TYPE > 2`` and categorised with ``cat.joint_distribution``.

        Returns:
            The ``(p_pums, jd_persons)`` pair for the PUMA.

        Raises:
            AssertionError: if any remaining record has ``RELP < 16``.
        """
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']

        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.p_pums:
            return self.p_pums[puma], self.jd_persons[puma]

        # this is cached so won't download more than once
        p_pums = self.c.download_population_pums(self.state, puma)
        h_pums = self.c.download_household_pums(self.state, puma)
        h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

        # add some household fields; with no explicit key pandas joins on the
        # columns the two frames have in common
        orig_len = len(p_pums)
        p_pums = p_pums.merge(h_pums, how='left')
        # Only Group Quarters
        p_pums = p_pums.loc[p_pums['TYPE'] > 2]
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d GQ persons from %d originally"
              % (len(p_pums), orig_len))

        # Sanity check: no record with RELP below 16 may survive the filter.
        assert len(p_pums.loc[p_pums.RELP < 16]) == 0

        def gqage_cat(r):
            if r.AGEP <= 64:
                return "0-64"
            return "65+"

        def gqworker_cat(r):
            # employ == 5 maps to "0", every other value to "1"
            if r.employ == 5:
                return "0"
            return "1"

        p_pums, jd_persons = cat.joint_distribution(
            p_pums,
            cat.category_combinations(self.person_controls.columns),
            {"gqage_cat": gqage_cat,
             "gqworker_cat": gqworker_cat}
        )
        # cache them
        self.p_pums[puma] = p_pums
        self.jd_persons[puma] = jd_persons
        return p_pums, jd_persons

Пример #3

0

Показать файл

Файл: sfcta_starter_gq.py Проект: bhargavasana/synthpop

    def get_person_joint_dist_for_geography(self, ind):
        """Return group-quarters person records and their joint distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.p_pums`` and
        ``self.jd_persons``. On a cache miss the person PUMS records are
        downloaded, joined to a few household columns, restricted to rows
        with ``TYPE > 2`` and categorised with ``cat.joint_distribution``.

        Returns:
            The ``(p_pums, jd_persons)`` pair for the PUMA.

        Raises:
            AssertionError: if any remaining record has ``RELP < 16``.
        """
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']

        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.p_pums:
            return self.p_pums[puma], self.jd_persons[puma]

        # this is cached so won't download more than once
        p_pums = self.c.download_population_pums(self.state, puma)
        h_pums = self.c.download_household_pums(self.state, puma)
        h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

        # add some household fields; with no explicit key pandas joins on the
        # columns the two frames have in common
        orig_len = len(p_pums)
        p_pums = p_pums.merge(h_pums, how='left')
        # Only Group Quarters
        p_pums = p_pums.loc[p_pums['TYPE'] > 2]
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d GQ persons from %d originally"
              % (len(p_pums), orig_len))

        # Sanity check: no record with RELP below 16 may survive the filter.
        assert len(p_pums.loc[p_pums.RELP < 16]) == 0

        def gqage_cat(r):
            if r.AGEP <= 64:
                return "0-64"
            return "65+"

        def gqworker_cat(r):
            # employ == 5 maps to "0", every other value to "1"
            if r.employ == 5:
                return "0"
            return "1"

        p_pums, jd_persons = cat.joint_distribution(
            p_pums, cat.category_combinations(self.person_controls.columns), {
                "gqage_cat": gqage_cat,
                "gqworker_cat": gqworker_cat
            })
        # cache them
        self.p_pums[puma] = p_pums
        self.jd_persons[puma] = jd_persons
        return p_pums, jd_persons

Пример #4

0

Показать файл

Файл: sfcta_starter_hh.py Проект: bhargavasana/synthpop

    def get_person_joint_dist_for_geography(self, ind):
        """Return housing-unit person records and their joint distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.p_pums`` and
        ``self.jd_persons``. On a cache miss the person PUMS records are
        downloaded, joined to a few household columns, restricted to rows
        with ``TYPE == 1`` and categorised by age bracket.

        Returns:
            The ``(p_pums, jd_persons)`` pair for the PUMA.
        """
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']

        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.p_pums:
            return self.p_pums[puma], self.jd_persons[puma]

        # this is cached so won't download more than once
        p_pums = self.c.download_population_pums(self.state, puma)
        h_pums = self.c.download_household_pums(self.state, puma)
        h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

        # add some household fields; with no explicit key pandas joins on the
        # columns the two frames have in common
        orig_len = len(p_pums)
        p_pums = p_pums.merge(h_pums, how='left')
        p_pums = p_pums.loc[p_pums['TYPE'] == 1]
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d persons from %d originally"
              % (len(p_pums), orig_len))

        def age_cat(r):
            # Upper bounds are inclusive; anything above 64 is "65+".
            if r.AGEP <= 4:
                return "0-4"
            elif r.AGEP <= 19:
                return "5-19"
            elif r.AGEP <= 44:
                return "20-44"
            elif r.AGEP <= 64:
                return "45-64"
            return "65+"

        p_pums, jd_persons = cat.joint_distribution(
            p_pums,
            cat.category_combinations(self.person_controls.columns),
            {"age_cat": age_cat}
        )
        # cache them
        self.p_pums[puma] = p_pums
        self.jd_persons[puma] = jd_persons
        return p_pums, jd_persons

Пример #5

0

Показать файл

    def get_person_joint_dist_for_geography(self, ind):
        """Return housing-unit person records and their joint distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.p_pums`` and
        ``self.jd_persons``. On a cache miss the person PUMS records are
        downloaded, joined to a few household columns, restricted to rows
        with ``TYPE == 1`` and categorised by age bracket.

        Returns:
            The ``(p_pums, jd_persons)`` pair for the PUMA.
        """
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']

        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.p_pums:
            return self.p_pums[puma], self.jd_persons[puma]

        # this is cached so won't download more than once
        p_pums = self.c.download_population_pums(self.state, puma)
        h_pums = self.c.download_household_pums(self.state, puma)
        h_pums = h_pums.loc[:, ['serialno', 'TYPE', 'NP']]

        # add some household fields; with no explicit key pandas joins on the
        # columns the two frames have in common
        orig_len = len(p_pums)
        p_pums = p_pums.merge(h_pums, how='left')
        p_pums = p_pums.loc[p_pums['TYPE'] == 1]
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d persons from %d originally"
              % (len(p_pums), orig_len))

        def age_cat(r):
            # Upper bounds are inclusive; anything above 64 is "65+".
            if r.AGEP <= 4:
                return "0-4"
            elif r.AGEP <= 19:
                return "5-19"
            elif r.AGEP <= 44:
                return "20-44"
            elif r.AGEP <= 64:
                return "45-64"
            return "65+"

        p_pums, jd_persons = cat.joint_distribution(
            p_pums, cat.category_combinations(self.person_controls.columns),
            {"age_cat": age_cat})
        # cache them
        self.p_pums[puma] = p_pums
        self.jd_persons[puma] = jd_persons
        return p_pums, jd_persons

Пример #6

0

Показать файл

Файл: sfcta_starter_hh.py Проект: bhargavasana/synthpop

    def get_household_joint_dist_for_geography(self, ind):
        """Return housing-unit household records and their joint distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.h_pums`` and
        ``self.jd_households``. On a cache miss the household records from
        ``SFCTAStarter.get_pums`` are restricted to ``NP > 0`` and
        ``TYPE == 1``, an ``hhinc`` column (thousands of dollars, clamped to
        0..255) is derived, and the rows are categorised by size, income,
        workers and household type.

        Returns:
            The ``(h_pums, jd_households)`` pair for the PUMA.
        """
        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.h_pums:
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        # (the person records returned alongside are not needed here)
        h_pums, _ = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # filter to housing unit only with number of persons > 0
        h_pums = h_pums[h_pums['NP'] > 0]
        # Only Housing units.  Take an explicit copy: the columns added below
        # would otherwise be assigned on a filtered frame and trigger pandas'
        # SettingWithCopyWarning.
        h_pums = h_pums[h_pums['TYPE'] == 1].copy()
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d households from %d originally"
              % (len(h_pums), orig_len))

        # Household income
        # ADJINC has 6 implied decimal places
        h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (0.000001 * h_pums['ADJINC'])
        h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']

        h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0  # in thousands of dollars
        h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0      # no negatives
        h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available

        def hhsize_cat(r):
            # NP = number of persons; anything not matched above falls back
            # to "1" (NP == 1 takes the same fallback).
            if r.NP >= 5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"

        def htype_cat(r):
            # NOTE(review): hhage / workers are presumably set by get_pums --
            # confirm against the superclass.
            if r.hhage < 65 and r.NOC == 0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.NOC > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        h_pums, jd_households = cat.joint_distribution(
            h_pums,
            cat.category_combinations(self.hh_controls.columns),
            {"hhsize_cat": hhsize_cat,
             "income_cat": income_cat,
             "workers_cat": workers_cat,
             "htype_cat": htype_cat}
        )
        # cache them
        self.h_pums[puma] = h_pums
        self.jd_households[puma] = jd_households

        return h_pums, jd_households

Пример #7

0

Показать файл

    def get_household_joint_dist_for_geography(self, ind):
        """Return housing-unit household records and their joint distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.h_pums`` and
        ``self.jd_households``. On a cache miss the household records from
        ``SFCTAStarter.get_pums`` are restricted to ``NP > 0`` and
        ``TYPE == 1``, an ``hhinc`` column (thousands of dollars, clamped to
        0..255) is derived, and the rows are categorised by size, income,
        workers and household type.

        Returns:
            The ``(h_pums, jd_households)`` pair for the PUMA.
        """
        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.h_pums:
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        # (the person records returned alongside are not needed here)
        h_pums, _ = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # filter to housing unit only with number of persons > 0
        h_pums = h_pums[h_pums['NP'] > 0]
        # Only Housing units.  Take an explicit copy: the columns added below
        # would otherwise be assigned on a filtered frame and trigger pandas'
        # SettingWithCopyWarning.
        h_pums = h_pums[h_pums['TYPE'] == 1].copy()
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d households from %d originally"
              % (len(h_pums), orig_len))

        # Household income
        h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (
            0.000001 * h_pums['ADJINC'])  # ADJINC has 6 implied decimal places
        h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']

        h_pums['hhinc'] = h_pums[
            'hhinc_1989dollars'] / 1000.0  # in thousands of dollars
        h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
        h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available

        def hhsize_cat(r):
            # NP = number of persons; anything not matched above falls back
            # to "1" (NP == 1 takes the same fallback).
            if r.NP >= 5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"

        def htype_cat(r):
            # NOTE(review): hhage / workers are presumably set by get_pums --
            # confirm against the superclass.
            if r.hhage < 65 and r.NOC == 0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.NOC > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        h_pums, jd_households = cat.joint_distribution(
            h_pums, cat.category_combinations(self.hh_controls.columns), {
                "hhsize_cat": hhsize_cat,
                "income_cat": income_cat,
                "workers_cat": workers_cat,
                "htype_cat": htype_cat
            })
        # cache them
        self.h_pums[puma] = h_pums
        self.jd_households[puma] = jd_households

        return h_pums, jd_households

Пример #8

0

Показать файл

def test_categorize(acs_data, pums_data):
    """Check ``cat.categorize`` on the ACS fixture, then build a joint
    distribution from the PUMS fixture using age, race and sex categories."""
    # (category, value) -> expression over ACS variable names
    acs_expressions = {
        ("population", "total"): "B01001_001E",
        ("age", "19 and under"): "B01001_003E + B01001_004E + B01001_005E + "
                                 "B01001_006E + B01001_007E + B01001_027E + "
                                 "B01001_028E + B01001_029E + B01001_030E + "
                                 "B01001_031E",
        ("age", "20 to 35"): "B01001_008E + B01001_009E + B01001_010E + "
                             "B01001_011E + B01001_012E + B01001_032E + "
                             "B01001_033E + B01001_034E + B01001_035E + "
                             "B01001_036E",
        ("age", "35 to 60"): "B01001_013E + B01001_014E + B01001_015E + "
                             "B01001_016E + B01001_017E + B01001_037E + "
                             "B01001_038E + B01001_039E + B01001_040E + "
                             "B01001_041E",
        ("age", "above 60"): "B01001_018E + B01001_019E + B01001_020E + "
                             "B01001_021E + B01001_022E + B01001_023E + "
                             "B01001_024E + B01001_025E + B01001_042E + "
                             "B01001_043E + B01001_044E + B01001_045E + "
                             "B01001_046E + B01001_047E + B01001_048E + "
                             "B01001_049E",
        ("race", "white"):   "B02001_002E",
        ("race", "black"):   "B02001_003E",
        ("race", "asian"):   "B02001_005E",
        ("race", "other"):   "B02001_004E + B02001_006E + B02001_007E + "
                             "B02001_008E",
        ("sex", "male"):     "B01001_002E",
        ("sex", "female"):   "B01001_026E"
    }
    p_acs_cat = cat.categorize(acs_data, acs_expressions, index_cols=['NAME'])

    # Shape of the categorised table: 3 rows, 11 two-level columns.
    assert len(p_acs_cat) == 3
    assert len(p_acs_cat.columns) == 11
    assert len(p_acs_cat.columns.names) == 2
    assert p_acs_cat.columns[0][0] == "age"

    assert np.all(cat.sum_accross_category(p_acs_cat) < 2)

    def age_bracket(person):
        # Inclusive upper bounds, checked in ascending order.
        for upper, label in ((19, "19 and under"),
                             (35, "20 to 35"),
                             (60, "35 to 60")):
            if person.AGEP <= upper:
                return label
        return "above 60"

    def race_group(person):
        # RAC1P codes 1, 2 and 6 are named; everything else is "other".
        for code, label in ((1, "white"), (2, "black"), (6, "asian")):
            if person.RAC1P == code:
                return label
        return "other"

    def sex_label(person):
        return "male" if person.SEX == 1 else "female"

    category_funcs = {"age": age_bracket, "race": race_group, "sex": sex_label}
    pums_data, jd_persons = cat.joint_distribution(
        pums_data,
        cat.category_combinations(p_acs_cat.columns),
        category_funcs
    )

Пример #9

0

Показать файл

Файл: starter.py Проект: psrc/psrc_synthpop

    def get_household_joint_dist_for_geography(self, ind):
        """Return the household PUMS sample and its joint distribution.

        The PUMA is resolved from ``ind.state``, ``ind.county`` and
        ``ind.tract``; the household PUMS records downloaded for it are then
        categorised on a single "HHsize" dimension.

        Returns the ``(h_pums, jd_households)`` pair produced by
        ``cat.joint_distribution``.
        """
        puma = self.c.tract_to_puma(ind.state, ind.county, ind.tract)
        # Per the original author's note the download is cached, so repeated
        # calls should not fetch the data again.
        household_sample = self.c.download_household_pums(ind.state, puma)

        # NOTE: earlier VEH / NOC / FINCP / WIF based categorizers (cars,
        # children, income, workers) were disabled in favour of the single
        # HHSz-based one below.

        # Exact household sizes that get their own label.
        size_labels = (
            (1, "one"),
            (2, "two"),
            (3, "three"),
            (4, "four"),
            (5, "five"),
            (6, "six"),
        )

        # function defining how the category value is computed from the PUMA data
        def household_size_label(row):
            for size, label in size_labels:
                if row.HHSz == size:
                    return label
            if row.HHSz > 6:
                return "seven+"
            # No match (e.g. a size below 1): no category is assigned.
            return None

        combinations = cat.category_combinations(self.h_acs_cat.columns)
        household_sample, jd_households = cat.joint_distribution(
            household_sample,
            combinations,
            {"HHsize": household_size_label},
        )
        return household_sample, jd_households

Пример #10

0

Показать файл

Файл: sfcta_starter_gq.py Проект: bhargavasana/synthpop

    def get_household_joint_dist_for_geography(self, ind):
        """Return group-quarters 'household' records and their joint
        distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.h_pums`` and
        ``self.jd_households``. On a cache miss the household records from
        ``SFCTAStarter.get_pums`` are restricted to ``TYPE > 2``, an
        ``hhinc`` column (thousands of dollars, clamped to 0..255) is derived
        from ``PINCP``, and the rows are categorised.

        Returns:
            The ``(h_pums, jd_households)`` pair for the PUMA.

        Raises:
            AssertionError: if any remaining record has ``NP != 1``.
        """
        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.h_pums:
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        # (the person records returned alongside are not needed here)
        h_pums, _ = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # Don't bother filter number of persons -- this should happen with
        # TYPE filter (checked by the assertion below).

        # Only Non-Institutional Group Quarters.  Take an explicit copy: the
        # assignments below would otherwise write to a filtered frame and
        # trigger pandas' SettingWithCopyWarning.
        h_pums = h_pums[h_pums['TYPE'] > 2].copy()
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d GQ 'households' from %d originally"
              % (len(h_pums), orig_len))
        np_bad = (h_pums.NP != 1)
        assert np_bad.sum() == 0

        # Group quarters income -- use PINCP
        h_pums.loc[pd.isnull(h_pums.loc[:, 'PINCP']), 'PINCP'] = 0.0  # no null
        # ADJINC has 6 implied decimal places
        h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (0.000001 * h_pums['ADJINC'])
        h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']

        h_pums['hhinc'] = h_pums['hhinc_1989dollars'] / 1000.0  # in thousands of dollars
        h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0      # no negatives
        h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available

        def hhsize_cat(r):
            # NP = number of persons; anything not matched above falls back
            # to "1" (NP == 1 takes the same fallback).
            if r.NP >= 5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"

        def htype_cat(r):
            if r.hhage < 65 and r.gqchild == 0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.gqchild > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        # Hand-built category table with a single hhsize_cat == "1" entry.
        # NOTE(review): four category functions are passed but the table is
        # indexed by hhsize_cat only -- confirm this is intended.
        category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
        category_df.set_index(['hhsize_cat'], inplace=True)
        h_pums, jd_households = cat.joint_distribution(
            h_pums,
            category_df,
            {"hhsize_cat": hhsize_cat,
             "income_cat": income_cat,
             "workers_cat": workers_cat,
             "htype_cat": htype_cat}
        )

        # cache them
        self.h_pums[puma] = h_pums
        self.jd_households[puma] = jd_households

        return h_pums, jd_households

Пример #11

0

Показать файл

# do the synthesis one PUMA at a time

# Output accumulators; nothing in this excerpt appends to them.
all_households = pd.DataFrame()
all_persons = pd.DataFrame()

for puma in all_pumas:
    print(puma)
    # Row positions of the block groups in this PUMA.  One pass over the
    # 'puma' column replaces a per-position h_acs.iloc[i] row lookup.
    this_puma_ind = [i for i, row_puma in enumerate(h_acs['puma'])
                     if row_puma == puma]
    # download the pums data
    p_pums = c.download_population_pums(state, puma10=puma, usecols=p_pums_cols)
    h_pums = c.download_household_pums(state, puma10=puma, usecols=h_pums_cols)
    # get the joint distribution of pums data
    h_pums, jd_households = cat.joint_distribution(
        h_pums,
        cat.category_combinations(h_acs_cat.columns),
        {"cars": cars_cat, "children": children_cat,
         "income": income_cat, "workers": workers_cat, "tenure": tenure_cat})
    p_pums, jd_persons = cat.joint_distribution(
        p_pums,
        cat.category_combinations(p_acs_cat.columns),
        {"age": age_cat, "sex": sex_cat, "race": race_cat}
    )
    # simulate households and persons for each person in each block-group of this PUMA
    for bg_ind in this_puma_ind:
        zone_name = h_acs_cat.index[bg_ind]
        print(zone_name)
        # geoid = state + county + tract + block group of this zone
        geoid = (state + h_acs.loc[zone_name, 'county']
                 + h_acs.loc[zone_name, 'tract']
                 + h_acs.loc[zone_name, 'block group'])
        print(geoid)
        best_households, best_people, people_chisq, people_p = synthesizer.synthesize(
            h_acs_cat.iloc[bg_ind].transpose(),
            p_acs_cat.iloc[bg_ind].transpose(),
            jd_households, jd_persons, h_pums, p_pums,
            marginal_zero_sub=.01, jd_zero_sub=.001, hh_index_start=0)
    #     add the puma and bg id to each HH

Пример #12

0

Показать файл

Файл: sfcta_starter_gq.py Проект: bhargavasana/synthpop

    def get_household_joint_dist_for_geography(self, ind):
        """Return group-quarters 'household' records and their joint
        distribution.

        The PUMA for ``ind.SFTAZ`` is read from ``self.tazToPUMA2010``.
        Results are memoised per PUMA in ``self.h_pums`` and
        ``self.jd_households``. On a cache miss the household records from
        ``SFCTAStarter.get_pums`` are restricted to ``TYPE > 2``, an
        ``hhinc`` column (thousands of dollars, clamped to 0..255) is derived
        from ``PINCP``, and the rows are categorised.

        Returns:
            The ``(h_pums, jd_households)`` pair for the PUMA.

        Raises:
            AssertionError: if any remaining record has ``NP != 1``.
        """
        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
        # Direct membership test; ``.keys()`` would build a list on Python 2.
        if puma in self.h_pums:
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        # (the person records returned alongside are not needed here)
        h_pums, _ = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # Don't bother filter number of persons -- this should happen with
        # TYPE filter (checked by the assertion below).

        # Only Non-Institutional Group Quarters.  Take an explicit copy: the
        # assignments below would otherwise write to a filtered frame and
        # trigger pandas' SettingWithCopyWarning.
        h_pums = h_pums[h_pums['TYPE'] > 2].copy()
        # Single parenthesised argument: prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("Filtered to %d GQ 'households' from %d originally"
              % (len(h_pums), orig_len))
        np_bad = (h_pums.NP != 1)
        assert np_bad.sum() == 0

        # Group quarters income -- use PINCP
        h_pums.loc[pd.isnull(h_pums.loc[:, 'PINCP']), 'PINCP'] = 0.0  # no null
        h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (
            0.000001 * h_pums['ADJINC'])  # ADJINC has 6 implied decimal places
        h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']

        h_pums['hhinc'] = h_pums[
            'hhinc_1989dollars'] / 1000.0  # in thousands of dollars
        h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
        h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available

        def hhsize_cat(r):
            # NP = number of persons; anything not matched above falls back
            # to "1" (NP == 1 takes the same fallback).
            if r.NP >= 5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"

        def htype_cat(r):
            if r.hhage < 65 and r.gqchild == 0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.gqchild > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        # Hand-built category table with a single hhsize_cat == "1" entry.
        # NOTE(review): four category functions are passed but the table is
        # indexed by hhsize_cat only -- confirm this is intended.
        category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
        category_df.set_index(['hhsize_cat'], inplace=True)
        h_pums, jd_households = cat.joint_distribution(
            h_pums, category_df, {
                "hhsize_cat": hhsize_cat,
                "income_cat": income_cat,
                "workers_cat": workers_cat,
                "htype_cat": htype_cat
            })

        # cache them
        self.h_pums[puma] = h_pums
        self.jd_households[puma] = jd_households

        return h_pums, jd_households

Python joint_distribution примеры использования