-
Notifications
You must be signed in to change notification settings - Fork 9
/
parcels-features.py
199 lines (162 loc) · 6.13 KB
/
parcels-features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
'''create census tract and zip5 features of each parcel
INPUT FILES
INPUT/corelogic-deeds-*/CAC*.txt
OUTPUT FILES
WORKING/parcels-features-GEO.csv
WORKING/parcels-features-GEO-occurs.pickle how often each feature occurs in the GEO partitioning
WORKING/parcels-features-zip5.csv
Each parcel was classified as single family retail.
The fields in the output csv files are
index: the code for either the census_tract (6 digits) or zip5 (5 digits)
GEO: same as index (the column is named census_tract or zip5)
GEO_has_X where X is in layout_parcels.propn.keys()
'''
import cPickle as pickle
import numpy as np
import pandas as pd
import pdb
from pprint import pprint
import random
import sys
from Bunch import Bunch
import layout_parcels as parcels
from Logger import Logger
from Path import Path
from ParseCommandLine import ParseCommandLine
def usage(msg=None):
    '''Print an optional message and the command-line synopsis, then exit.

    msg: text printed ahead of the synopsis; skipped when it is None.

    Never returns: the process is terminated with sys.exit(1).
    '''
    synopsis = (
        'usage : python parcels-features.py --geo GEO [--test]',
        ' GEO : either census_tract or zip5',
        ' --test: run in test mode',
    )
    if msg is not None:
        print(msg)
    for line in synopsis:
        print(line)
    sys.exit(1)
def make_control(argv):
    '''Build the Bunch of run-time settings from the command line.

    argv: the full argument vector (sys.argv); it must hold 3 or 4 entries,
          i.e. the script name, --geo, its value, and optionally --test.

    Returns a Bunch with fields arg (the parsed command line), debug,
    max_sale_price, path, path_out_csv, path_out_occurs, random_seed, test.

    Invalid command lines are reported through usage(), which exits the
    process. The random module is seeded as a side effect.
    '''
    print(argv)
    if len(argv) not in (3, 4):
        usage('invalid number of arguments')

    pcl = ParseCommandLine(argv)
    arg = Bunch(
        # text of argv[0] up to its first '.', so this assumes the script is
        # invoked by bare file name (no directory part containing a dot)
        base_name=argv[0].split('.')[0],
        geo=pcl.get_arg('--geo'),
        test=pcl.has_arg('--test'),
    )
    if arg.geo is None:
        # the option that is missing is --geo; name it correctly
        usage('missing --geo')
    if arg.geo not in ('census_tract', 'zip5'):
        # build ONE message string; the former ", + arg.geo" applied unary
        # plus to a string and raised TypeError instead of reporting the error
        usage('invalid GEO value: ' + str(arg.geo))

    random_seed = 123456
    random.seed(random_seed)

    path = Path()  # use the default dir_input
    out_stem = path.dir_working() + arg.base_name + '-' + arg.geo

    return Bunch(
        arg=arg,
        debug=False,
        max_sale_price=85e6,  # according to Wall Street Journal
        path=path,
        path_out_csv=out_stem + '.csv',
        path_out_occurs=out_stem + '-occurs.pickle',
        random_seed=random_seed,
        test=arg.test,
    )
def just_used(geo, df):
    '''Return a new DataFrame holding only the columns needed downstream.

    geo: 'census_tract' selects the census tract column of df as the 'geo'
         column; any other value selects the zip5 column.
    df:  DataFrame of parcels containing the columns named by the
         layout_parcels constants used below.

    The result has the column 'geo' plus the land use and property
    indicator columns of df.
    '''
    if geo == 'census_tract':
        geo_column = df[parcels.census_tract]
    else:
        geo_column = df[parcels.zip5]

    kept = {'geo': geo_column}
    for column_name in (parcels.land_use, parcels.property_indicator):
        kept[column_name] = df[column_name]
    return pd.DataFrame(kept)
def make_has_indicatorsOLD(df, name_masks):
    '''Return a new DataFrame with one row for each distinct index value of df.

    df:         DataFrame whose index holds the geo code of each row.
    name_masks: iterable of (name, mask) pairs; mask(df) must return a
                boolean row selector usable as df[mask(df)].

    The result has one boolean column per name, True for the geo codes that
    label at least one row selected by that name's mask, plus a column 'geo'
    that repeats the index. Rows appear in order of first appearance in df.
    Every (name, geo) pair that is True is also printed.
    '''
    # An Index of the distinct geo codes instead of a set: the row order is
    # then well defined, and pandas does not accept an unordered set as index.
    geos = pd.Index(pd.unique(df.index.values))

    columns = {}
    for name, mask in name_masks:
        geos_with_feature = pd.unique(df[mask(df)].index.values)
        for geo in geos_with_feature:
            print('%s %s' % (name, geo))
        # one vectorised membership test replaces per-label assignments
        columns[name] = pd.Series(geos.isin(geos_with_feature), index=geos)

    r = pd.DataFrame(data=columns, index=geos)
    r['geo'] = r.index
    return r
def make_has_indicators(df, geo_name):
    '''Return (result, occurs) describing which features occur in each geo.

    df:       DataFrame whose index holds the geo code of each row.
    geo_name: name of the geographic partitioning; it prefixes every feature
              name and names the column that repeats the index.

    result: DataFrame with one row per distinct index value of df (in order
            of first appearance) and one boolean column
            geo_name + '_has_' + X for each X in parcels.propn.keys(); a
            cell is True when at least one row of df with that geo code is
            selected by parcels.mask_property_indicator_is(X, df). A final
            column named geo_name repeats the index.
    occurs: dict mapping each feature name to sum(mask), i.e. the number of
            rows of df selected for that feature.

    Raises ValueError when the occurrence counts do not add up to len(df).
    '''
    verbose = False
    # An Index of the distinct geo codes instead of a set: the row order is
    # then well defined, and pandas does not accept an unordered set as index.
    geos = pd.Index(pd.unique(df.index.values))
    columns = {}  # built up to be the data frame
    occurs = {}  # used for reporting
    report_format = '%30s occurs in %7d geos'

    for property_indicator_description in parcels.propn.keys():
        feature_name = geo_name + '_has_' + property_indicator_description
        mask = parcels.mask_property_indicator_is(property_indicator_description, df)
        occurs[feature_name] = sum(mask)
        print(report_format % (feature_name, occurs[feature_name]))

        geos_with_feature = pd.unique(df[mask].index.values)
        if verbose:
            for geo in geos_with_feature:
                print('%s %s' % (feature_name, geo))
        # one vectorised membership test replaces per-label assignments
        columns[feature_name] = pd.Series(geos.isin(geos_with_feature), index=geos)

    total_occurs = sum(occurs.values())
    print(report_format % ('** any feature **', total_occurs))
    if total_occurs != len(df):
        # Fail loudly instead of dropping into the debugger, which would hang
        # an unattended run.
        raise ValueError(
            'feature occurrences (%s) differ from the number of rows (%s)'
            % (total_occurs, len(df)))

    result = pd.DataFrame(data=columns, index=geos)
    result[geo_name] = result.index
    return result, occurs
def main(argv):
    '''Read the parcels, derive the per-geo features, and write the outputs.

    argv: the full argument vector (sys.argv), parsed by make_control().

    Side effects: sys.stdout is replaced by a Logger; the feature table is
    written to control.path_out_csv and the tuple (occurs, control) is
    pickled to control.path_out_occurs. Both files are written in test mode
    as well; the run then ends with a reminder to discard them.

    Raises ValueError if control.arg.geo is neither 'zip5' nor
    'census_tract'.
    '''
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print(control)

    # create dataframes
    # NOTE(review): the second argument looks like a row limit used only in
    # test mode -- confirm against layout_parcels.read
    parcels_df = parcels.read(control.path,
                              10000 if control.test else None)
    print('parcels df shape %s' % (parcels_df.shape,))

    # drop samples without the geographic indicator we will use
    if control.arg.geo == 'zip5':
        parcels_df = parcels_df[parcels.mask_parcel_has_zipcode(parcels_df)]
        # add zip5 field: the zipcode divided by 10000, truncated to int32
        parcels_df[parcels.zip5] = pd.Series(data=parcels_df[parcels.zipcode] / 10000.0,
                                             dtype=np.int32,
                                             index=parcels_df.index)
    elif control.arg.geo == 'census_tract':
        # drop if no census tract
        parcels_df = parcels_df[parcels.mask_parcel_has_census_tract(parcels_df)]
    else:
        # make_control() already validates the value; fail loudly rather than
        # dropping into the debugger and then continuing with bad input
        raise ValueError('bad control.arg.geo %s' % (control.arg.geo,))

    # the computation runs out of memory on 64GB if all columns are retained
    # so drop all but the columns needed
    parcels_df = just_used(control.arg.geo, parcels_df)

    # reported for information only; the features use every parcel
    parcels_sfr_df = parcels_df[parcels.mask_is_sfr(parcels_df)]
    print('parcels sfr df shape %s' % (parcels_sfr_df.shape,))

    parcels_df.index = parcels_df.geo  # the index must be the geo field
    n_unique_indices = parcels_df.index.nunique()
    has_indicators, occurs = make_has_indicators(parcels_df, control.arg.geo)
    print('has_indicators shape %s' % (has_indicators.shape,))
    print('# of unique geo codes %s' % (n_unique_indices,))
    assert has_indicators.shape[0] == n_unique_indices
    if control.test:
        print(has_indicators)

    # write the results
    has_indicators.to_csv(control.path_out_csv)
    # the context manager closes the file even if pickling fails
    with open(control.path_out_occurs, 'wb') as f:
        pickle.dump((occurs, control), f)

    print(control)
    if control.test:
        print('DISCARD OUTPUT: test')
    print('done')
if __name__ == '__main__':
    if False:
        # Never executed: these references only keep pyflakes from reporting
        # the corresponding imports as unused.
        pd.DataFrame()
        np.array()
        pprint()
        pdb.set_trace()

    # script entry point: hand the full argument vector to main
    main(sys.argv)