Python categorize示例，synthpop.categorizer.categorize Python示例

示例#1

0

显示文件

文件： sfcta_starter_gq.py 项目： bhargavasana/synthpop

    def __init__(self, key, controls_csv, tazset=None, puma_data_dir=None, fips_file=None,
                 write_households_csv=None, write_persons_csv=None, write_append=False,
                 start_hhid=1, start_persid=1):
        """Initialise the starter and derive group-quarters controls.

        Every argument is forwarded positionally, unchanged, to
        ``SFCTAStarter.__init__``.  Afterwards ``self.controls`` is reduced
        to the rows with a positive ``GQPOP`` and categorized into
        ``self.hh_controls`` and ``self.person_controls`` (both indexed by
        ``SFTAZ``).
        """
        SFCTAStarter.__init__(
            self, key, controls_csv, tazset, puma_data_dir, fips_file,
            write_households_csv, write_persons_csv, write_append,
            start_hhid, start_persid)

        # Drop rows that have no group-quarters population.
        has_gq_pop = self.controls['GQPOP'] > 0
        self.controls = self.controls[has_gq_pop]

        # Household controls: GQPOP is reported as the single
        # ("hhsize_cat", "1") category.
        hh_spec = {("hhsize_cat", "1"): "GQPOP"}
        self.hh_controls = cat.categorize(
            self.controls, hh_spec, index_cols=['SFTAZ'])

        # Sample of hh_controls left by the original author:
        #
        # cat_name  hhsize_cat
        # cat_value          1
        # SFTAZ
        # 1                  5
        # 2                  5
        # 3                  4
        # 4                  7
        # 5                  6
        # 6                 12
        # 7                  3
        # 8                  2
        # 9                  6
        # 10                29

        # Person controls: worker status and age group.
        person_spec = {
            ("gqworker_cat", "1"): "GQWKRS",
            ("gqworker_cat", "0"): "GQNONWKRS",
            ("gqage_cat", "0-64"): "GQAGE064",
            ("gqage_cat", "65+"): "GQAGE65P",
        }
        self.person_controls = cat.categorize(
            self.controls, person_spec, index_cols=['SFTAZ'])

示例#2

0

显示文件

文件： sfcta_starter_hh.py 项目： bhargavasana/synthpop

    def __init__(self, key, controls_csv, tazset=None, puma_data_dir=None, fips_file=None,
                 write_households_csv=None, write_persons_csv=None, write_append=False,
                 start_hhid=1, start_persid=1):
        """Initialise the starter and derive household/person controls.

        Every argument is forwarded positionally, unchanged, to
        ``SFCTAStarter.__init__``.  Afterwards ``self.controls`` is reduced
        to the rows with a positive ``HHLDS`` and categorized into
        ``self.hh_controls`` and ``self.person_controls`` (both indexed by
        ``SFTAZ``).
        """
        SFCTAStarter.__init__(self, key, controls_csv, tazset, puma_data_dir, fips_file,
                              write_households_csv, write_persons_csv, write_append,
                              start_hhid, start_persid)

        # Remove 0-household controls
        self.controls = self.controls[self.controls['HHLDS'] > 0]

        # Debug aid: uncomment to restrict the run to the first two rows.
        # self.controls = self.controls.iloc[:2,]

        # Parenthesised single-argument form prints the same text under both
        # Python 2 and Python 3 (the bare print statement is a SyntaxError
        # on Python 3).
        print("Household controls has length %d" % len(self.controls))

        # {(category name, category value): column in self.controls}
        self.hh_controls = cat.categorize(self.controls,
            {("income_cat", "0-25k"  ): "HHINCQ1",
             ("income_cat", "25-45k" ): "HHINCQ2",
             ("income_cat", "45-75k" ): "HHINCQ3",
             ("income_cat", "75k+"   ): "HHINCQ4",
             ("hhsize_cat", "1"      ): "SZ1_HHLDS",
             ("hhsize_cat", "2"      ): "SZ2_HHLDS",
             ("hhsize_cat", "3"      ): "SZ3_HHLDS",
             ("hhsize_cat", "4"      ): "SZ4_HHLDS",
             ("hhsize_cat", "5+"     ): "SZ5_HHLDS",
             ("workers_cat", "0"     ): "WKR0_HHLDS",
             ("workers_cat", "1"     ): "WKR1_HHLDS",
             ("workers_cat", "2"     ): "WKR2_HHLDS",
             ("workers_cat", "3+"    ): "WKR3_HHLDS",
             ("htype_cat", "HAGE1K0"    ): "HAGE1KIDS0",
             ("htype_cat", "HAGE1K1"    ): "HAGE1KIDS1",
             ("htype_cat", "HAGE65KALL" ): "HAGE65KIDSWHATEV"},
                                          index_cols=['SFTAZ'])

        # Debug aid:
        # print(self.hh_controls.loc[1:10,:])

        # Sample of hh_controls left by the original author.
        # NOTE(review): the income labels shown here (0-30k, 30-60k, 60-100k,
        # 100k+) differ from the income_cat labels used above and htype_cat is
        # missing, so this sample presumably predates the current spec.
        #
        # cat_name  hhsize_cat                               income_cat                        workers_cat
        # cat_value          1       2      3      4      5+      0-30k  100k+  30-60k 60-100k           0       1       2      3+
        # SFTAZ
        # 1             28.365  46.970 40.260 49.410 139.995     51.540 61.848 130.700  59.912      52.155  93.025  96.990  62.830
        # 2             36.663  54.540 45.147 51.207 115.443     54.976 73.970  83.514  90.540      48.783  81.507  93.627  79.083
        # 3             73.610 104.353 77.074 71.012 106.951    261.136 28.188 106.849  37.827     158.911 196.149  67.115  10.825
        # 4             45.133  67.140 55.577 63.037 142.113     67.002 90.663 102.678 111.657      60.053 100.337 115.257  97.353
        # 5             34.038  56.364 48.312 59.292 167.994     62.707 75.211 157.363  71.719      62.586 111.630 116.388  75.396
        # 6             65.403  94.302 78.078 85.176 184.041    144.312 50.742 138.288 173.658      84.669 148.551 159.705 114.075
        # 7             46.920  66.516 49.128 45.264  68.172    166.646 17.759  68.419  24.176     101.292 125.028  42.780   6.900
        # 8             54.282  74.883 56.898 53.301  87.636     67.002 74.937  95.038  90.023      61.476 126.222 104.967  34.335
        # 9             42.471  63.180 52.299 59.319 133.731     63.566 85.465  97.339 105.630      56.511  94.419 108.459  91.611
        # 10            55.480  77.140 60.800 62.700 123.880     90.195 64.092 101.710 124.003      68.780 134.520 122.740  53.960

        # TODO: add HAGE1KIDS0, HAGE1KIDS1, HAGE1KIDSWHATEV
        # NOTE(review): the household spec above already maps HAGE1KIDS0,
        # HAGE1KIDS1 and HAGE65KIDSWHATEV; confirm whether this TODO is still
        # relevant for the person controls.
        self.person_controls = cat.categorize(self.controls,
            {("age_cat", "0-4"  ): "AGE0004",
             ("age_cat", "5-19" ): "AGE0519",
             ("age_cat", "20-44"): "AGE2044",
             ("age_cat", "45-64"): "AGE4564",
             ("age_cat", "65+"  ): "AGE65P"},
                                          index_cols=['SFTAZ'])

示例#3

0

显示文件

文件： sfcta_starter_gq.py 项目： bhargavasana/synthpop

    def __init__(self,
                 key,
                 controls_csv,
                 tazset=None,
                 puma_data_dir=None,
                 fips_file=None,
                 write_households_csv=None,
                 write_persons_csv=None,
                 write_append=False,
                 start_hhid=1,
                 start_persid=1):
        """Run the base initialiser, then build group-quarters controls.

        All arguments go straight through (positionally, unchanged) to
        ``SFCTAStarter.__init__``.  ``self.controls`` is then filtered to
        rows where ``GQPOP`` is positive, and ``self.hh_controls`` /
        ``self.person_controls`` are produced by ``cat.categorize`` with
        ``SFTAZ`` as the index column.
        """
        SFCTAStarter.__init__(
            self, key, controls_csv, tazset, puma_data_dir, fips_file,
            write_households_csv, write_persons_csv, write_append,
            start_hhid, start_persid)

        # Rows with no group-quarters population are discarded.
        gq_present = self.controls['GQPOP'] > 0
        self.controls = self.controls[gq_present]

        # GQPOP is exposed as the one and only household-size category.
        household_categories = {("hhsize_cat", "1"): "GQPOP"}
        self.hh_controls = cat.categorize(
            self.controls, household_categories, index_cols=['SFTAZ'])

        # Sample of hh_controls left by the original author:
        #
        # cat_name  hhsize_cat
        # cat_value          1
        # SFTAZ
        # 1                  5
        # 2                  5
        # 3                  4
        # 4                  7
        # 5                  6
        # 6                 12
        # 7                  3
        # 8                  2
        # 9                  6
        # 10                29

        # Person-level categories: worker flag and age band.
        person_categories = {
            ("gqworker_cat", "1"): "GQWKRS",
            ("gqworker_cat", "0"): "GQNONWKRS",
            ("gqage_cat", "0-64"): "GQAGE064",
            ("gqage_cat", "65+"): "GQAGE65P",
        }
        self.person_controls = cat.categorize(
            self.controls, person_categories, index_cols=['SFTAZ'])

示例#4

0

显示文件

    def __init__(self,
                 key,
                 controls_csv,
                 tazset=None,
                 puma_data_dir=None,
                 fips_file=None,
                 write_households_csv=None,
                 write_persons_csv=None,
                 write_append=False,
                 start_hhid=1,
                 start_persid=1):
        """Run the base initialiser, then build household/person controls.

        All arguments go straight through (positionally, unchanged) to
        ``SFCTAStarter.__init__``.  ``self.controls`` is then filtered to
        rows where ``HHLDS`` is positive, and ``self.hh_controls`` /
        ``self.person_controls`` are produced by ``cat.categorize`` with
        ``SFTAZ`` as the index column.
        """
        SFCTAStarter.__init__(self, key, controls_csv, tazset, puma_data_dir,
                              fips_file, write_households_csv,
                              write_persons_csv, write_append, start_hhid,
                              start_persid)

        # Remove 0-household controls
        self.controls = self.controls[self.controls['HHLDS'] > 0]

        # Debug aid: uncomment to restrict the run to the first two rows.
        # self.controls = self.controls.iloc[:2,]

        # A bare ``print`` statement is a SyntaxError on Python 3; the
        # parenthesised single-argument form emits identical output on
        # Python 2 and Python 3.
        print("Household controls has length %d" % len(self.controls))

        # {(category name, category value): column in self.controls}
        self.hh_controls = cat.categorize(
            self.controls, {
                ("income_cat", "0-25k"): "HHINCQ1",
                ("income_cat", "25-45k"): "HHINCQ2",
                ("income_cat", "45-75k"): "HHINCQ3",
                ("income_cat", "75k+"): "HHINCQ4",
                ("hhsize_cat", "1"): "SZ1_HHLDS",
                ("hhsize_cat", "2"): "SZ2_HHLDS",
                ("hhsize_cat", "3"): "SZ3_HHLDS",
                ("hhsize_cat", "4"): "SZ4_HHLDS",
                ("hhsize_cat", "5+"): "SZ5_HHLDS",
                ("workers_cat", "0"): "WKR0_HHLDS",
                ("workers_cat", "1"): "WKR1_HHLDS",
                ("workers_cat", "2"): "WKR2_HHLDS",
                ("workers_cat", "3+"): "WKR3_HHLDS",
                ("htype_cat", "HAGE1K0"): "HAGE1KIDS0",
                ("htype_cat", "HAGE1K1"): "HAGE1KIDS1",
                ("htype_cat", "HAGE65KALL"): "HAGE65KIDSWHATEV"
            },
            index_cols=['SFTAZ'])

        # Debug aid:
        # print(self.hh_controls.loc[1:10,:])

        # Sample of hh_controls left by the original author.
        # NOTE(review): the income labels shown here (0-30k, 30-60k, 60-100k,
        # 100k+) differ from the income_cat labels used above and htype_cat is
        # missing, so this sample presumably predates the current spec.
        #
        # cat_name  hhsize_cat                               income_cat                        workers_cat
        # cat_value          1       2      3      4      5+      0-30k  100k+  30-60k 60-100k           0       1       2      3+
        # SFTAZ
        # 1             28.365  46.970 40.260 49.410 139.995     51.540 61.848 130.700  59.912      52.155  93.025  96.990  62.830
        # 2             36.663  54.540 45.147 51.207 115.443     54.976 73.970  83.514  90.540      48.783  81.507  93.627  79.083
        # 3             73.610 104.353 77.074 71.012 106.951    261.136 28.188 106.849  37.827     158.911 196.149  67.115  10.825
        # 4             45.133  67.140 55.577 63.037 142.113     67.002 90.663 102.678 111.657      60.053 100.337 115.257  97.353
        # 5             34.038  56.364 48.312 59.292 167.994     62.707 75.211 157.363  71.719      62.586 111.630 116.388  75.396
        # 6             65.403  94.302 78.078 85.176 184.041    144.312 50.742 138.288 173.658      84.669 148.551 159.705 114.075
        # 7             46.920  66.516 49.128 45.264  68.172    166.646 17.759  68.419  24.176     101.292 125.028  42.780   6.900
        # 8             54.282  74.883 56.898 53.301  87.636     67.002 74.937  95.038  90.023      61.476 126.222 104.967  34.335
        # 9             42.471  63.180 52.299 59.319 133.731     63.566 85.465  97.339 105.630      56.511  94.419 108.459  91.611
        # 10            55.480  77.140 60.800 62.700 123.880     90.195 64.092 101.710 124.003      68.780 134.520 122.740  53.960

        # TODO: add HAGE1KIDS0, HAGE1KIDS1, HAGE1KIDSWHATEV
        # NOTE(review): the household spec above already maps HAGE1KIDS0,
        # HAGE1KIDS1 and HAGE65KIDSWHATEV; confirm whether this TODO is still
        # relevant for the person controls.
        self.person_controls = cat.categorize(self.controls, {
            ("age_cat", "0-4"): "AGE0004",
            ("age_cat", "5-19"): "AGE0519",
            ("age_cat", "20-44"): "AGE2044",
            ("age_cat", "45-64"): "AGE4564",
            ("age_cat", "65+"): "AGE65P"
        },
                                              index_cols=['SFTAZ'])

示例#5

0

显示文件

def test_categorize(acs_data, pums_data):
    """Categorize ACS person columns, check the result, then build the
    joint distribution from the PUMS data."""

    def joined(*columns):
        # Join the given column names with " + ".
        return " + ".join(columns)

    person_spec = {
        ("population", "total"): "B01001_001E",
        ("age", "19 and under"): joined(
            "B01001_003E", "B01001_004E", "B01001_005E", "B01001_006E",
            "B01001_007E", "B01001_027E", "B01001_028E", "B01001_029E",
            "B01001_030E", "B01001_031E"),
        ("age", "20 to 35"): joined(
            "B01001_008E", "B01001_009E", "B01001_010E", "B01001_011E",
            "B01001_012E", "B01001_032E", "B01001_033E", "B01001_034E",
            "B01001_035E", "B01001_036E"),
        ("age", "35 to 60"): joined(
            "B01001_013E", "B01001_014E", "B01001_015E", "B01001_016E",
            "B01001_017E", "B01001_037E", "B01001_038E", "B01001_039E",
            "B01001_040E", "B01001_041E"),
        ("age", "above 60"): joined(
            "B01001_018E", "B01001_019E", "B01001_020E", "B01001_021E",
            "B01001_022E", "B01001_023E", "B01001_024E", "B01001_025E",
            "B01001_042E", "B01001_043E", "B01001_044E", "B01001_045E",
            "B01001_046E", "B01001_047E", "B01001_048E", "B01001_049E"),
        ("race", "white"): "B02001_002E",
        ("race", "black"): "B02001_003E",
        ("race", "asian"): "B02001_005E",
        ("race", "other"): joined(
            "B02001_004E", "B02001_006E", "B02001_007E", "B02001_008E"),
        ("sex", "male"): "B01001_002E",
        ("sex", "female"): "B01001_026E",
    }
    p_acs_cat = cat.categorize(acs_data, person_spec, index_cols=['NAME'])

    # Shape of the categorized frame: 3 rows, 11 two-level columns, with
    # "age" as the first top-level label.
    assert len(p_acs_cat) == 3
    assert len(p_acs_cat.columns) == 11
    assert len(p_acs_cat.columns.names) == 2
    assert p_acs_cat.columns[0][0] == "age"

    assert np.all(cat.sum_accross_category(p_acs_cat) < 2)

    def age_cat(r):
        # The first upper bound that AGEP does not exceed decides the label.
        for upper_bound, label in ((19, "19 and under"),
                                   (35, "20 to 35"),
                                   (60, "35 to 60")):
            if r.AGEP <= upper_bound:
                return label
        return "above 60"

    def race_cat(r):
        # RAC1P codes 1, 2 and 6 get their own label; the rest is "other".
        for code, label in ((1, "white"), (2, "black"), (6, "asian")):
            if r.RAC1P == code:
                return label
        return "other"

    def sex_cat(r):
        return "male" if r.SEX == 1 else "female"

    combinations = cat.category_combinations(p_acs_cat.columns)
    classifiers = {"age": age_cat, "race": race_cat, "sex": sex_cat}
    pums_data, jd_persons = cat.joint_distribution(
        pums_data, combinations, classifiers)

示例#6

0

显示文件

文件： starter.py 项目： psrc/psrc_synthpop

    def __init__(self, key, state=None, county=None, tract=None):
        """Load marginals from CSV and categorize them.

        Reads ``data/HHmarginals.csv`` and ``data/Personmarginals.csv``,
        optionally narrows them to the requested state / county / tract, and
        stores the categorized results in ``self.h_acs_cat`` and
        ``self.p_acs_cat``.

        Parameters
        ----------
        key
            Passed to ``Census(key)``.
        state, county, tract
            Optional geography filters.  Filtering only happens when
            ``state`` is given; ``county`` and ``tract`` further narrow the
            selection when they are not None.
        """
        self.c = c = Census(key)
        self.state = state
        self.county = county
        self.tract = tract

        # Disabled Census API query, kept for reference; the marginals are
        # read from CSV files instead.
        #income_columns = ['B19001_0%02dE' % i for i in range(1, 18)]
        #vehicle_columns = ['B08201_0%02dE' % i for i in range(1, 7)]
        #workers_columns = ['B08202_0%02dE' % i for i in range(1, 6)]
        #families_columns = ['B11001_001E', 'B11001_002E']
        #block_group_columns = income_columns + families_columns
        #tract_columns = vehicle_columns + workers_columns
        #h_acs = c.block_group_and_tract_query(block_group_columns,
                                              #tract_columns, state, county,
                                              #merge_columns=['tract', 'county',
                                                             #'state'],
                                              #block_group_size_attr="B11001_001E",
                                              #tract_size_attr="B08201_001E",
                                              #tract=tract)
        # HS
        # read the marginals
        h_acs = pd.read_csv('data/HHmarginals.csv', dtype={
            "state": "int32",
            "county": "int32",
            "tract": "int32",
            "block group": "object"
        })
        p_acs = pd.read_csv('data/Personmarginals.csv', dtype={
            "state": "int32",
            "county": "int32",
            "tract": "int32",
            "block group": "object"
        })

        # reduce the datasets if state (and optionally county, tract) are
        # specified
        if state is not None:
            state_id, county_id = c.try_fips_lookup(state, county)
            h_ind = h_acs['state'] == int(state_id)
            p_ind = p_acs['state'] == int(state_id)
            if county is not None:
                h_ind = logical_and(h_ind, h_acs['county'] == int(county_id))
                p_ind = logical_and(p_ind, p_acs['county'] == int(county_id))
            if tract is not None:
                # Bug fix: the original compared against ``tract_id``, a name
                # that is never assigned in this method, so supplying a tract
                # raised NameError.  Use the ``tract`` argument itself.
                # NOTE(review): assumes ``tract`` is an int-convertible tract
                # code matching the CSV's int32 "tract" column; confirm.
                h_ind = logical_and(h_ind, h_acs['tract'] == int(tract))
                p_ind = logical_and(p_ind, p_acs['tract'] == int(tract))
            h_acs = h_acs[h_ind]
            p_acs = p_acs[p_ind]

        # define the categories as {(category_name, category_value): corresponding_column_in_marginals}
        self.h_acs_cat = cat.categorize(h_acs, {
            ("HHsize", "one"):    "HH1p",
            ("HHsize", "two"):    "HH2p",
            ("HHsize", "three"):  "HH3p",
            ("HHsize", "four"):   "HH4p",
            ("HHsize", "five"):   "HH5p",
            ("HHsize", "six"):    "HH6p",
            ("HHsize", "seven+"): "HH7p",
            }, index_cols=['state', 'county', 'tract', 'block group'])

        self.p_acs_cat = cat.categorize(p_acs, {
            ("age", "category 1"):  "Age1",
            ("age", "category 2"):  "Age2",
            }, index_cols=['state', 'county', 'tract', 'block group'])

示例#7

0

显示文件

# Keep only the rows whose block_group_id is in all_block_group_ids.
h_acs = h_acs.loc[h_acs['block_group_id'].isin(all_block_group_ids)]
p_acs = p_acs.loc[p_acs['block_group_id'].isin(all_block_group_ids)]
# =============================================================================
# # create categorised versions
# =============================================================================
# Households
# Spec format: {(category name, category value): column expression}.
h_acs_cat = cat.categorize(h_acs, {
#     ("households", "total"): "B11001_001E",
    ("children", "yes"): "B11001_002E",
    ("children", "no"): "B11001_001E - B11001_002E",
    ("income", "lt35"): "B19001_002E + B19001_003E + B19001_004E + "
                        "B19001_005E + B19001_006E + B19001_007E",
    ("income", "gt35-lt100"): "B19001_008E + B19001_009E + "
                        "B19001_010E + B19001_011E + B19001_012E"
                        "+ B19001_013E",
    ("income", "gt100"): "B19001_014E + B19001_015E + B19001_016E"
                        "+ B19001_017E",
    ("cars", "none"): "B08201_002E",
    ("cars", "one"): "B08201_003E",
    ("cars", "two or more"): "B08201_004E + B08201_005E + B08201_006E",
    ("workers", "none"): "B08202_002E",
    ("workers", "one"): "B08202_003E",
    ("workers", "two or more"): "B08202_004E + B08202_005E",
    # Bug fix: these two columns were swapped.  ACS table B25075 (Value)
    # covers owner-occupied housing units, while B25063 (Gross Rent) covers
    # renter-occupied housing units.
    # NOTE(review): verify against the ACS variable definitions for the
    # survey year in use.
    ("tenure", "owned"): "B25075_001E",
    ("tenure", "rented"): "B25063_001E"
}, index_cols=['NAME'])
h_acs_cat.head()



# Persons
p_acs_cat = cat.categorize(p_acs, {