This repository has been archived by the owner on Nov 15, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
new_hampshire.py
186 lines (147 loc) · 6.58 KB
/
new_hampshire.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import urllib
import re
import sys
import json
import time
import dogcatcher
import os
# State code and data-source label that are written into every output row.
voter_state = "NH"
source = "State"

# Directory containing this script, with a trailing slash so that file names
# can be appended to it directly.
script_dir = os.path.dirname(os.path.abspath(__file__))
cdir = script_dir + "/"

# The clerk CSV is currently downloaded by hand and saved next to this script.
# At some point it should be fetched automatically (e.g. with mechanize).
file_path = os.path.join(cdir, "new_hampshire-clerks.csv")

# Output rows. The first entry is the header row naming every column; one list
# per town is appended after it.
result = [(
    # Local election official and jurisdiction.
    "authority_name",
    "first_name",
    "last_name",
    "town_name",
    "county_name",
    "fips",
    # Physical address.
    "street",
    "city",
    "address_state",
    "zip_code",
    # Mailing (PO box) address.
    "po_street",
    "po_city",
    "po_state",
    "po_zip_code",
    # Registration authority.
    "reg_authority_name",
    "reg_first",
    "reg_last",
    "reg_street",
    "reg_city",
    "reg_state",
    "reg_zip_code",
    "reg_po_street",
    "reg_po_city",
    "reg_po_state",
    "reg_po_zip_code",
    "reg_phone",
    "reg_fax",
    "reg_email",
    "reg_website",
    "reg_hours",
    # Contact details and bookkeeping columns.
    "phone",
    "fax",
    "email",
    "website",
    "hours",
    "voter_state",
    "source",
    "review",
)]
# Read the whole clerk CSV into memory. The context manager guarantees the
# file handle is closed even if the read fails.
with open(file_path) as clerk_file:
    data = clerk_file.read()
# One record per line: everything from the first pair of capital letters up to
# and including the next newline.
town_data_re = re.compile("[A-Z][A-Z].+?\n", re.DOTALL)
# A single comma-terminated CSV field (the field must contain at least one character).
town_item_re = re.compile("(.+?),")
# A one-letter middle initial, optionally followed by periods, surrounded by spaces.
# The class is [a-zA-Z]; the former [a-zA-z] also matched the punctuation between
# "Z" and "a" in ASCII ([, \, ], ^, _ and `).
middle_name_re = re.compile(r" ([a-zA-Z]\.* )")
# PO box written as "PO BOX 123" / "P O BOX 123", or reversed as "123 PO BOX".
po_re = re.compile(r"P *O BOX \d+")
po_2_re = re.compile(r"\d+ P *O BOX")
# A city name immediately followed by its ZIP code (the match includes the ZIP).
po_city_re = re.compile(r"[A-Za-z \.]+? \d{5}[\d-]*")
# A 5-digit ZIP code, optionally followed by further digits and hyphens (ZIP+4).
zip_re = re.compile(r"\d{5}[\d-]*")

#When there isn't a mailing address, we have to be able to distinguish the street address from the city name.
#So, through trial, error, and guesswork, the script checks this entire list. Each pattern matches everything
#up to and including a street-type keyword and the space after it, so matches end with a trailing space.
street_1_re = re.compile(r".+? [DR][DR]\.* ")
street_2_re = re.compile(".+? ROAD ")
street_3_re = re.compile(r".+? S[QT]\.* ")
street_4_re = re.compile(".+? STREET ")
street_5_re = re.compile(r".+? RO*U*TE* *\d+[A-Z]* ")
street_6_re = re.compile(".+? AVEN*U*E* ")
street_7_re = re.compile(".+? HI*G*H*WA*Y ")
street_8_re = re.compile(".+? WAY ")
street_9_re = re.compile(".+? PLAZA ")
street_10_re = re.compile(".+? SQUARE ")
street_11_re = re.compile(".+? TPKE ")
street_12_re = re.compile(".+? VILLAGE GREEN ")
# Streets named without a type keyword (e.g. "... MAIN", "... WASHINGTON").
street_24_re = re.compile(".+? MAIN ")
street_25_re = re.compile(".+? WASHINGTON ")
# Split the raw CSV text into one string per town record.
town_data = town_data_re.findall(data)

# Street-address patterns in priority order; the first one that matches wins.
street_patterns = (
    street_1_re, street_2_re, street_3_re, street_4_re, street_5_re,
    street_6_re, street_7_re, street_8_re, street_9_re, street_10_re,
    street_11_re, street_12_re, street_24_re, street_25_re,
)

for town in town_data:

    # Reset every output field for this record.
    # NOTE(review): dogcatcher.begin presumably returns the per-field defaults in exactly this order -- confirm.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    # town_item_re needs at least one character per field, so empty fields are padded with a single space.
    # A single replace() pass leaves ",," behind when three or more fields in a row are empty, so repeat
    # until none remain.
    while ",," in town:
        town = town.replace(",,", ", ,")
    town = town.replace("'S", "'s")

    #Since the data is in a CSV format, we can easily split into items. This is helpful, since usually each item is a distinct type of data.
    #Columns used below: 0 town name, 1 official's name, 2 full address, 3 phone, 4 fax, 5 email, 6 website, 7 fallback source for the zip code.
    town_item = town_item_re.findall(town)

    # Drop the "Ward" label and any ward number so that every ward of a town gets the same town name.
    town_name = town_item[0].title().replace("Ward", "").strip(" 0123456789").replace("'S", "'s")

    official_name = town_item[1].title()
    first_name, last_name, review = dogcatcher.split_name(official_name, middle_name_re, review)

    # The data holds only the local number, so the 603 area code is prepended.
    phone = "(603)" + town_item[3]

    if town_item[4] != " ":
        fax = "(603)" + town_item[4]
    else:
        fax = ""

    # Hard-coded correction for one specific fax number (apparently a typo in the source data).
    if fax == "(603)155-9128":
        fax = "(603)755-9128"

    email = town_item[5].lower()

    # Treat a blank field or the "none available" placeholder as no website. The check has to look at the
    # raw CSV field; the website variable still holds its default at this point.
    raw_website = town_item[6]
    if raw_website.strip().lower() in ("", "none available"):
        website = ""
    else:
        website = dogcatcher.clean_website(raw_website)

    #The full address is its own item, and there's either a PO Box or nothing. This block of code:
    #1. Checks for a PO Box. If so, extracts it. It then checks for a city and zip code the same way. If there's no zip code, it grabs it from the later column.
    #2. If no PO Box, it extracts a street address (using the ordered street patterns), and then a zip code and city.
    address = town_item[2]

    if "O BOX" in address:

        # "PO BOX 123" form first, then the reversed "123 PO BOX" form.
        try:
            po_street = po_re.findall(address)[0].strip()
        except IndexError:
            po_street = po_2_re.findall(address)[0].strip()

        # po_city_re matches the city together with its zip code; strip the zip so that po_city holds
        # only the city, consistent with how city is built in the street branch below.
        try:
            po_city = zip_re.sub("", po_city_re.findall(address)[0]).strip()
        except IndexError:
            po_city = town_name

        # Store the zip in po_zip_code, the name that is passed to map_fips and written to the output row.
        try:
            po_zip_code = zip_re.findall(address)[0]
        except IndexError:
            po_zip_code = zip_re.findall(town_item[7])[0]

    else:

        # Take the first street pattern that matches; if none does, street keeps its default value.
        for street_pattern in street_patterns:
            street_matches = street_pattern.findall(address)
            if street_matches:
                street = street_matches[0]
                break

        zip_code = zip_re.findall(address)[0]
        # Whatever is left after removing the street and the zip code is the city.
        city = address.replace(street, "").replace(zip_code, "").strip()

    # Look up the county from whichever address (physical or mailing) was found.
    if street:
        fips, county_name = dogcatcher.map_fips(city, address_state, zip_code)
    else:
        fips, county_name = dogcatcher.map_fips(po_city, po_state, po_zip_code)

    #Both of these towns aren't found well in Google's data, so their county is set by hand.
    if town_name in ("Pinkham's Grant", "Sargent's Purchase"):
        county_name = "Coos"
        fips = dogcatcher.find_fips(county_name, voter_state)

    #The authority name is consistent from county to county and not included in the data.
    authority_name = "Clerk"

    result.append([authority_name, first_name, last_name, town_name, county_name, fips,
                   street, city, address_state, zip_code,
                   po_street, po_city, po_state, po_zip_code,
                   reg_authority_name, reg_first, reg_last,
                   reg_street, reg_city, reg_state, reg_zip_code,
                   reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
                   reg_phone, reg_fax, reg_email, reg_website, reg_hours,
                   phone, fax, email, website, hours, voter_state, source, review])
#In several larger towns, there are multiple entries. (One entry per ward.) These become identical in the data.
#This pass keeps the first occurrence of every row, prints each duplicate it drops, and preserves row order.
seen_rows = set()
unique_rows = []
for row in result:
    # Rows are lists (the header is a tuple); the type is part of the key so that a list and a tuple with
    # the same contents are still treated as different rows, exactly as == would treat them.
    row_key = (type(row), tuple(row))
    if row_key in seen_rows:
        # Parenthesised single-argument print works as both the Python 2 statement and the Python 3 function.
        print(row)
        continue
    seen_rows.add(row_key)
    unique_rows.append(row)

# Replace the contents in place so that result remains the same list object.
result[:] = unique_rows
# Write every row (header first) as one tab-separated line. The context manager closes the file even if
# a write fails part-way through.
with open(cdir + "new_hampshire.txt", "w") as output:
    for r in result:
        output.write("\t".join(r))
        output.write("\n")