This repository has been archived by the owner on Nov 15, 2017. It is now read-only.
/
hawaii.py
143 lines (106 loc) · 5.7 KB
/
hawaii.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import urllib
import re
import sys
import dogcatcher
import HTMLParser
import os
import urllib2
# HTML parser instance; nothing in the visible part of this script uses it.
h = HTMLParser.HTMLParser()

# Directory holding this script (with trailing slash) and its scratch folder.
cdir = os.path.dirname(os.path.abspath(__file__)) + "/"
tmpdir = cdir + "tmp/"

# Constants written into every output row.
voter_state = "HI"
source = "State"

# The first entry of `result` is the header row; one row per county clerk is
# appended later, in exactly this column order.
result = [("authority_name", "first_name", "last_name", "county_name", "fips",
           "street", "city", "address_state", "zip_code",
           "po_street", "po_city", "po_state", "po_zip_code",
           "reg_authority_name", "reg_first", "reg_last",
           "reg_street", "reg_city", "reg_state", "reg_zip_code",
           "reg_po_street", "reg_po_city", "reg_po_state", "reg_po_zip_code",
           "reg_phone", "reg_fax", "reg_email", "reg_website", "reg_hours",
           "phone", "fax", "email", "website", "hours", "voter_state", "source", "review")]

# The following section grabs the clerk fact-sheet PDF, converts it to text and
# writes that text to a file. (Writing it to a file isn't strictly necessary,
# but saves some time down the line.)
# Despite the ".pdf" extension, the file holds the extracted text.
file_path = tmpdir + "hawaii-clerks.pdf"
url = "http://hawaii.gov/elections/factsheets/fsvs514.pdf"
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent' : user_agent}

# NOTE(review): passing "" (rather than None) as the data argument makes
# urllib2 send a POST with an empty body; kept as-is to preserve behaviour.
req = urllib2.Request(url, "", headers)
response = urllib2.urlopen(req)
try:
    pdf = response.read()
finally:
    # Release the connection even if the read fails.
    response.close()
data = dogcatcher.pdf_to_text(pdf)

# Create the scratch directory on first run so the write below cannot fail
# merely because it is missing.
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)

# Context managers guarantee both handles are closed, including on error.
with open(file_path, "w") as output:
    output.write(data)
with open(file_path) as cached_text:
    data = cached_text.read()
# Regular expressions used to carve the extracted PDF text into fields.
# All patterns are raw strings so that regex escapes such as \d and \( reach
# the regex engine untouched instead of relying on Python passing unknown
# string escapes through.

# One whole clerk entry: a mixed-case line (the official's name), then a line
# starting with "C" followed by capitals/spaces, through to the fax number.
county_re = re.compile(r"\n[A-Z][a-z][^\n]+?\nC[A-Z ]+.+?FAX: *\(\d{3}\) \d{3}-\d{4}", re.DOTALL)
# Captures whatever follows "CLERK OF " on the title line.
county_name_re = re.compile(r"CLERK OF (.+)")
# The title up to and including the last " CLERK" on its line.
authority_name_re = re.compile(r".+ CLERK")
# A capitalised word followed by further letters and spaces (a person's name).
name_re = re.compile(r"[A-Z][a-z]+? [A-Za-z ]+")
# A single capital letter, optional dots, then a space (a middle initial).
middle_re = re.compile(r"[A-Z]\.* ")
# Captures the number that follows the "FAX:" label.
fax_re = re.compile(r"FAX: *(\(\d{3}\) \d{3}-\d{4})")
# Anything shaped like "(ddd) ddd-dddd".
phone_re = re.compile(r"\(\d{3}\) \d{3}-\d{4}")
# From a line starting with a digit through the first five-digit ZIP code.
address_re = re.compile(r"\n\d.+?\d{5}[\d-]*", re.DOTALL)
# The "City, State ZIP" line of an address.
csz_re = re.compile(r" *([^,\n]+?, .+? *\d{5}[\d-]*)")
# Pieces of a "City, State ZIP" string.
city_re = re.compile(r"(.+?),")
state_re = re.compile(r" (.+?) ")
zip_re = re.compile(r" (\d{5}[\d-]*)")
# A PO Box line and everything after it.
po_re = re.compile(r"(P\.* *O\.* Box .+)", re.DOTALL)
def _state_abbreviation(state_name):
    """Return "HI" for "Hawaii"; stop the run for any other state name.

    The state is written as "Hawaii" in the source text. If it's ever
    something else, we break the program (with a message saying why) so the
    change can be examined.
    """
    if state_name == "Hawaii":
        return "HI"
    sys.exit("hawaii.py: unexpected state %r in clerk address" % state_name)


# Parse every clerk entry found in the extracted text into one output row.
for county in county_re.findall(data):
    print(county)

    # Start each entry from the blank field values supplied by dogcatcher.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    # All of the county officers are formatted as "Clerk of [County]". There is
    # one statewide election officer whose name follows a different format, and
    # we don't want his data. So county_name_re both checks that we're working
    # on a county level and grabs the county's name; entries without a match
    # are skipped.
    try:
        county_name = county_name_re.findall(county)[0].title().strip()
    except IndexError:
        continue

    authority_name = authority_name_re.findall(county)[0].title().strip()
    print([authority_name])

    official_name = name_re.findall(county)[0]
    first_name, last_name, review = dogcatcher.split_name(official_name, review)

    fax = fax_re.findall(county)[0]
    # Drop the fax number from the entry so it cannot be mistaken for the
    # office phone number below.
    county = county.replace(fax, "")
    fax = dogcatcher.clean_phone(fax)

    # Nothing distinctive starts the line with the phone number, but by some
    # quirk of the data the elections office number is always the one just
    # before the fax. With the fax already removed, that is the last thing
    # left that looks like a phone number.
    phone_all = phone_re.findall(county)
    phone = dogcatcher.clean_phone(phone_all[-1])

    # This section finds the full address. After finding the address, it
    # identifies a city/state/zip (csz) combination and a PO Box number if that
    # exists. It removes both the CSZ and the PO address (if it exists) from
    # the full address, leaving behind a street address with some garbage,
    # which is then cleaned up.
    address = address_re.findall(county)[0]
    csz = csz_re.findall(address)[0]
    try:
        po_street = po_re.findall(address)[0].replace(csz, "").strip(", \n")
    except IndexError:
        # No PO Box in this address.
        po_street = ""

    street = address.replace(po_street, "").replace(csz, "")
    street = street.replace("\n", ", ").replace(" ,", ",").strip(" \n/,")

    if po_street:
        # NOTE(review): this rebuilds the mailing address from the whole
        # address minus the csz, so it also includes any street line that
        # precedes the PO Box. Kept as in the original; confirm it is intended.
        po_street = address.replace(csz, "").strip(" \n/,")
        po_city = city_re.findall(csz)[0].strip()
        # Normalise the PO state itself (previously address_state was tested
        # here by mistake, leaving po_state unconverted).
        po_state = _state_abbreviation(state_re.findall(csz)[0].strip())
        po_zip_code = zip_re.findall(csz)[0].strip()

    if street:
        city = city_re.findall(csz)[0].strip()
        address_state = _state_abbreviation(state_re.findall(csz)[0].strip())
        zip_code = zip_re.findall(csz)[0].strip()

    fips = dogcatcher.find_fips(county_name, voter_state)

    # Column order must match the header row at the start of `result`.
    result.append([authority_name, first_name, last_name, county_name, fips,
                   street, city, address_state, zip_code,
                   po_street, po_city, po_state, po_zip_code,
                   reg_authority_name, reg_first, reg_last,
                   reg_street, reg_city, reg_state, reg_zip_code,
                   reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
                   reg_phone, reg_fax, reg_email, reg_website, reg_hours,
                   phone, fax, email, website, hours, voter_state, source, review])

# This outputs the results to a separate text file.
dogcatcher.output(result, voter_state, cdir)