#hawaii.py

import urllib
import re
import sys
import dogcatcher
import HTMLParser
import os
import urllib2

#An HTMLParser instance. Nothing in the visible part of this script uses it.
h = HTMLParser.HTMLParser()

#Directory containing this script (with a trailing slash) and its scratch subdirectory.
cdir = "%s/" % os.path.dirname(os.path.abspath(__file__))
tmpdir = "%stmp/" % cdir

#The download step further down grabs the source document and writes its text to a file.
#(Writing it to a file isn't strictly necessary, but saves some time down the line.)

#Values copied into every output row.
voter_state = "HI"
source = "State"

#Output rows. The first entry is the header tuple; it is built group by group so the
#column families (office address, PO address, registration office, contact details) stay readable.
result = [
    ("authority_name", "first_name", "last_name", "county_name", "fips")
    + ("street", "city", "address_state", "zip_code")
    + ("po_street", "po_city", "po_state", "po_zip_code")
    + ("reg_authority_name", "reg_first", "reg_last")
    + ("reg_street", "reg_city", "reg_state", "reg_zip_code")
    + ("reg_po_street", "reg_po_city", "reg_po_state", "reg_po_zip_code")
    + ("reg_phone", "reg_fax", "reg_email", "reg_website", "reg_hours")
    + ("phone", "fax", "email", "website", "hours")
    + ("voter_state", "source", "review")
]

#Fetch the source PDF, convert it with dogcatcher.pdf_to_text, and cache the converted output on disk.
#(Caching isn't strictly necessary, but saves some time down the line.)
#Note that despite its ".pdf" name, the cache file holds the output of pdf_to_text, not the raw download.
file_path = tmpdir + "hawaii-clerks.pdf"
url = "http://hawaii.gov/elections/factsheets/fsvs514.pdf"
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent' : user_agent}

#Make sure the scratch directory exists before anything is written into it.
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)

#The request body must be None, not "": urllib2 sends a POST whenever data is not None,
#and this is a plain document fetch that should go out as a GET.
req = urllib2.Request(url, None, headers)
response = urllib2.urlopen(req)
try:
    pdf = response.read()
finally:
    #Release the connection even if the read fails.
    response.close()

data = dogcatcher.pdf_to_text(pdf)
with open(file_path, "w") as output:
    output.write(data)

#Read the cached copy back so the parsing below always works from what is on disk.
with open(file_path) as cached:
    data = cached.read()

#All patterns are raw strings so that regex escapes such as \d, \( and \. reach the re module untouched.

#One official's record: a line starting with a capital then a lowercase letter, followed by a line
#starting with "C" and more capitals/spaces, then everything up to and including the first "FAX: (ddd) ddd-dddd".
county_re = re.compile(r"\n[A-Z][a-z][^\n]+?\nC[A-Z ]+.+?FAX: *\(\d{3}\) \d{3}-\d{4}", re.DOTALL)
#Captures whatever follows "CLERK OF " on the same line.
county_name_re = re.compile(r"CLERK OF (.+)")
#Matches from the start of the text on a line through the last " CLERK" on that line.
authority_name_re = re.compile(r".+ CLERK")

#A capitalised word followed by a space and further letters/spaces.
name_re = re.compile(r"[A-Z][a-z]+? [A-Za-z ]+")
#A single capital, optionally followed by periods, then a space (not used in the visible part of this script).
middle_re = re.compile(r"[A-Z]\.* ")

#The "(ddd) ddd-dddd" number following a "FAX:" label; the group captures only the number.
fax_re = re.compile(r"FAX: *(\(\d{3}\) \d{3}-\d{4})")
#Anything shaped like "(ddd) ddd-dddd".
phone_re = re.compile(r"\(\d{3}\) \d{3}-\d{4}")

#From the first line that starts with a digit through the first five-digit run (plus any trailing digits/hyphens).
address_re = re.compile(r"\n\d.+?\d{5}[\d-]*", re.DOTALL)

#"City, State Zip" on a single line, and the patterns that pull the three parts back out of it.
csz_re = re.compile(r" *([^,\n]+?, .+? *\d{5}[\d-]*)")
city_re = re.compile(r"(.+?),")
state_re = re.compile(r" (.+?) ")
zip_re = re.compile(r" (\d{5}[\d-]*)")
#A "PO Box" / "P.O. Box" label together with everything after it.
po_re = re.compile(r"(P\.* *O\.* Box .+)", re.DOTALL)


#Walk every official's record found in the text and turn each county-level one into an output row.
for county in county_re.findall(data):

    #Debug output: the raw record being processed.
    print(county)

    #Start every field from the defaults dogcatcher.begin supplies for this state.
    (authority_name, first_name, last_name, county_name, town_name, fips,
        street, city, address_state, zip_code,
        po_street, po_city, po_state, po_zip_code,
        reg_authority_name, reg_first, reg_last,
        reg_street, reg_city, reg_state, reg_zip_code,
        reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
        reg_phone, reg_fax, reg_email, reg_website, reg_hours,
        phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    #All of the county officers are formatted as "Clerk of [County.]" There is one statewide election officer whose name follows a different format.
    #We don't want his data. So the county_name_re is designed to both check that we're working on a county level and grab the county's name.
    #If we aren't working on a county level, we skip that official.

    county_names = county_name_re.findall(county)
    if not county_names:
        continue
    county_name = county_names[0].title().strip()

    authority_name = authority_name_re.findall(county)[0].title().strip()

    #Debug output: the authority name, wrapped in a list so stray whitespace is visible.
    print([authority_name])

    official_name = name_re.findall(county)[0]
    first_name, last_name, review = dogcatcher.split_name(official_name, review)

    #The fax number is removed from the record once found, so it can't be mistaken for the phone number below.
    fax = fax_re.findall(county)[0]
    county = county.replace(fax,"")
    fax = dogcatcher.clean_phone(fax)

    #Nothing distinctive starts the line with the phone number, but the elections office phone number, by some quirk of the data,
    #is always the penultimate thing that looks like a phone number in the original record.
    #The fax number (the final one) was stripped out above, so the number we want is now the last match.

    phone_all = phone_re.findall(county)
    phone = dogcatcher.clean_phone(phone_all[-1])

    #This section finds the full address. After finding the address, it identifies a city/state/zip (csz) combination and a PO Box number if that exists.
    #It removes both the CSZ and the PO Address (if it exists) from the full address, leaving behind a street address with some garbage.
    #It then cleans up the street address and pulls the city, state, and zip out of the csz, and assigns them as appropriate to the street address and state.
    #The state is written as "Hawaii", so we replace it with "HI." If it's ever something else, we break the program to examine the change.

    address = address_re.findall(county)[0]

    csz = csz_re.findall(address)[0]

    po_matches = po_re.findall(address)
    if po_matches:
        po_street = po_matches[0].replace(csz,"").strip(", \n")
    else:
        po_street = ""

    street = address.replace(po_street,"").replace(csz,"")
    street = street.replace("\n",", ").replace(" ,",",").strip(" \n/,")

    if po_street:
        #NOTE(review): this replaces the PO Box text found above with the whole address minus the csz,
        #which would also include any street lines. Kept as-is; confirm whether that is intended.
        po_street = address.replace(csz,"").strip(" \n/,")
        po_city = city_re.findall(csz)[0].strip()
        po_state = state_re.findall(csz)[0].strip()
        #Check and normalise the PO state itself (not address_state, which is still at its default here).
        if po_state == "Hawaii":
            po_state = "HI"
        else:
            sys.exit("Unexpected PO state %r for %s; the source format may have changed." % (po_state, county_name))
        po_zip_code = zip_re.findall(csz)[0].strip()
    if street:
        city = city_re.findall(csz)[0]
        address_state = state_re.findall(csz)[0]
        if address_state == "Hawaii":
            address_state = "HI"
        else:
            sys.exit("Unexpected address state %r for %s; the source format may have changed." % (address_state, county_name))
        zip_code = zip_re.findall(csz)[0]

    fips = dogcatcher.find_fips(county_name, voter_state)

    #Field order must match the header tuple at the start of result.
    result.append([authority_name, first_name, last_name, county_name, fips,
        street, city, address_state, zip_code,
        po_street, po_city, po_state, po_zip_code,
        reg_authority_name, reg_first, reg_last,
        reg_street, reg_city, reg_state, reg_zip_code,
        reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
        reg_phone, reg_fax, reg_email, reg_website, reg_hours,
        phone, fax, email, website, hours, voter_state, source, review])

#This outputs the results to a separate text file.

dogcatcher.output(result, voter_state, cdir)