-
Notifications
You must be signed in to change notification settings - Fork 9
/
census-features.py
191 lines (155 loc) · 5.58 KB
/
census-features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
'''create derived features for the year 2000 census
INPUT FILES
INPUT/.../census.csv
OUTPUT FILES
WORKING/census-features-derived.csv
WORKING/census-features[-test]/0log.txt
The fields in the output csv file are
 index: sequential row number (pandas default index written by to_csv)
 census_tract: 6-digit census tract code
 avg_commute: weighted average commute time in minutes
 fraction_owner_occupied: owner-occupied units / total occupied units
 median_household_income
'''
import argparse
import collections
import numpy as np
import pandas as pd
import pdb
from pprint import pprint
import random
import sys
from Bunch import Bunch
import dirutility
import layout_census as census
from Logger import Logger
from Path import Path
def usage(msg=None):
if msg is not None:
print msg
print 'usage : python census-features.py [--test]'
print ' --test: run in test mode'
sys.exit(1)
def make_control(argv):
# return a Bunch
print argv
if len(argv) not in (1, 2):
usage('invalid number of arguments')
parser = argparse.ArgumentParser()
parser.add_argument('invocation')
parser.add_argument('--test', action='store_true')
arg = parser.parse_args(argv)
arg.base_name = arg.invocation.split('.')[0]
path = Path()
dir_out = dirutility.assure_exists(path._dir_working + arg.base_name + ('-test' if arg.test else '')) + '/'
file_out = path._dir_working + arg.base_name + '-derived' + ('-test' if arg.test else '') + '.csv'
path_in = path._dir_input + 'neighborhood-data/census.csv'
random_seed = 123456
random.seed(random_seed)
debug = False
return Bunch(
arg=arg,
debug=debug,
path_in=path_in,
path_out=file_out,
path_out_log=dir_out + '0log.txt',
random_seed=random_seed,
test=arg.test,
)
# record type holding the three derived per-tract features
CensusFeatures = collections.namedtuple(
    'CensusFeatures',
    ['avg_commute', 'median_hh_income', 'fraction_owner_occupied'],
)
def reduce_census(census_df):
    '''Return dictionary[census_tract] --> CensusFeatures.

    census_df: pd.DataFrame read from the raw census file; row 0 holds
    explanations of the column labels, the data rows start at index 1.
    A data row is kept only when all three features can be computed.
    '''
    def get_census_tract(row):
        # keep the low 6 digits of the FIPS code as the tract identifier
        # (the float->int round trip assumes the field parses numerically)
        fips_census_tract = float(row[census.fips_census_tract])
        census_tract = int(fips_census_tract % 1000000)
        return census_tract

    def get_avg_commute(row):
        'return weighted average commute time'
        def mul(factor):
            # dot product of factor with the 12 commute-time bucket counts
            return (factor[0] * float(row[census.commute_less_5]) +
                    factor[1] * float(row[census.commute_5_to_9]) +
                    factor[2] * float(row[census.commute_10_to_14]) +
                    factor[3] * float(row[census.commute_15_to_19]) +
                    factor[4] * float(row[census.commute_20_to_24]) +
                    factor[5] * float(row[census.commute_25_to_29]) +
                    factor[6] * float(row[census.commute_30_to_34]) +
                    factor[7] * float(row[census.commute_35_to_39]) +
                    factor[8] * float(row[census.commute_40_to_44]) +
                    factor[9] * float(row[census.commute_45_to_59]) +
                    factor[10] * float(row[census.commute_60_to_89]) +
                    factor[11] * float(row[census.commute_90_or_more]))
        n_samples = mul((1., ) * 12)  # total commuters across all buckets
        # bucket midpoints in minutes; 120.0 approximates the open-ended 90+ bucket
        wsum = mul((2.5, 7.5, 12.5, 17.5, 22.5, 27.5, 32.5, 37.5, 42.5, 52.5, 75.0, 120.0))
        return None if n_samples == 0 else wsum / n_samples

    def get_median_household_income(row):
        # NOTE(review): float() of a missing/NaN field yields NaN, which the
        # `mhi is not None` filter below does NOT reject -- confirm the input
        # has no missing income values
        mhi = float(row[census.median_household_income])
        return mhi

    def get_fraction_owner_occupied(row):
        total = float(row[census.occupied_total])
        owner = float(row[census.occupied_owner])
        return None if total == 0 else owner / total

    d = {}
    # first row has explanations for column names
    verbose = False
    labels = census_df.loc[0]
    if verbose and False:  # dead debugging branch: dumps the label row
        print 'labels'
        for i in xrange(len(labels)):
            print ' ', labels.index[i], labels[i]
    for row_index in xrange(1, len(census_df)):  # start at 1: skip the label row
        if verbose:
            print row_index
        row = census_df.loc[row_index]  # row is a pd.Series
        if verbose:
            print row
        ct = get_census_tract(row)
        if ct in d:
            # duplicate tracts are unexpected; drop into the debugger
            # (leftover debugging aid -- would hang a non-interactive run)
            print 'duplicate census tract', ct
            pdb.set_trace()
        ac = get_avg_commute(row)
        mhi = get_median_household_income(row)
        foo = get_fraction_owner_occupied(row)
        # keep only rows where every feature could be computed
        if ac is not None and mhi is not None and foo is not None:
            d[ct] = CensusFeatures(avg_commute=ac,
                                   median_hh_income=mhi,
                                   fraction_owner_occupied=foo,
                                   )
    return d
def make_census_reduced_df(d):
    '''convert d[census_tract] = (avg commute, med hh inc, fraction owner occ)
    to a DataFrame with one row per census tract

    d: dict mapping census_tract -> CensusFeatures (or any 3-tuple in the
       order avg_commute, median_hh_income, fraction_owner_occupied)
    returns: pd.DataFrame with columns census_tract, avg_commute,
       fraction_owner_occupied, median_household_income
    '''
    # materialize the keys once so every column is built from the same
    # ordering (the original iterated d.keys() four times)
    keys = list(d.keys())
    df = pd.DataFrame({
        'census_tract': keys,
        'avg_commute': [d[k][0] for k in keys],
        'fraction_owner_occupied': [d[k][2] for k in keys],
        'median_household_income': [d[k][1] for k in keys],
    })
    return df
def main(argv):
    '''read the raw census file, derive per-tract features, write them as csv

    argv: the process command line (see usage())
    side effects: replaces sys.stdout with a Logger that tees to the log
    file, and writes control.path_out
    '''
    control = make_control(argv)
    sys.stdout = Logger(logfile_path=control.path_out_log)  # tee all prints to the log
    print control
    # read the census
    print 'reading input file', control.path_in
    census_df = pd.read_csv(
        control.path_in,
        sep='\t',  # input is tab-separated despite the .csv extension
    )
    derived_df = make_census_reduced_df(reduce_census(census_df))
    derived_df.to_csv(control.path_out)
    # print control twice so the settings appear at both ends of the log
    print control
    if control.test:
        # NOTE(review): the output file has already been written above; in
        # test mode it goes to the '-test' path, not truly discarded -- confirm
        print 'DISCARD OUTPUT: test'
    print 'done'
    return
if __name__ == '__main__':
    if False:
        # never executed: these calls reference modules that are used only
        # indirectly, so that pyflakes does not flag the imports as unused
        pdb.set_trace()
        pprint()
        pd.DataFrame()
        np.array()
    main(sys.argv)