"po_state", "po_zip_code", "reg_authority_name", "reg_first", "reg_last", "reg_street", "reg_city", "reg_state", "reg_zip_code", "reg_po_street", "reg_po_city", "reg_po_state", "reg_po_zip_code", "reg_phone", "reg_fax", "reg_email", "reg_website", "reg_hours", "phone", "fax", "email", "website", "hours", "voter_state", "source", "review") ] file_path = tmpdir + "hawaii-clerks.pdf" url = "http://hawaii.gov/elections/factsheets/fsvs514.pdf" user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent': user_agent} req = urllib2.Request(url, "", headers) pdf = urllib2.urlopen(req).read() data = dogcatcher.pdf_to_text(pdf) output = open(file_path, "w") output.write(data) output.close() data = open(file_path).read() county_re = re.compile( "\n[A-Z][a-z][^\n]+?\nC[A-Z ]+.+?FAX: *\(\d{3}\) \d{3}-\d{4}", re.DOTALL) county_name_re = re.compile("CLERK OF (.+)") authority_name_re = re.compile(".+ CLERK") name_re = re.compile("[A-Z][a-z]+? [A-Za-z ]+") middle_re = re.compile("[A-Z]\.* ") fax_re = re.compile("FAX: *(\(\d{3}\) \d{3}-\d{4})")
"reg_street", "reg_city", "reg_state", "reg_zip_code", "reg_po_street", "reg_po_city", "reg_po_state", "reg_po_zip_code", "reg_phone", "reg_fax", "reg_email", "reg_website", "reg_hours", "phone", "fax", "email", "website", "hours", "voter_state", "source", "review")] #The following section grabs the website and writes it to a file. (Writing it to a file isn't strictly necessary, but saves some time down the line.) file_path = tmpdir + "arkansas-clerks.pdf" url = "http://www.sos.arkansas.gov/elections/Documents/county_clerks_for_website.pdf" user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent' : user_agent} req = urllib2.Request(url, "", headers) pdf = urllib2.urlopen(req).read() data = dogcatcher.pdf_to_text(pdf) output = open(file_path, "w") output.write(data) output.close() data = open(file_path).read() county_data_re = re.compile("\n.+?[@@][^\s]+? *\n", re.DOTALL) name_re = re.compile("\n([^\d\n\.,]+? [^\d\n\.,]+)") county_name_re = re.compile("\n([^\n][^\n]+?)\n") email_re = re.compile("\n([^\s]+?[@@][^\s]+?) *\n") middle_re = re.compile("[A-Z].* ") phone_re = re.compile("Phone: (\d{3}.+?\d{3}.+?\d{4})") fax_re = re.compile("Fax: (\d{3}.+?\d{3}.+?\d{4})") hyphen_re = re.compile(" \d{3}([^\s]+?)\d{3}")
"reg_po_street", "reg_po_city", "reg_po_state", "reg_po_zip_code", "reg_phone", "reg_fax", "reg_email", "reg_website", "reg_hours", "phone", "fax", "email", "website", "hours", "voter_state", "source", "review")] file_path = tmpdir + "montana-clerks.pdf" file_path_1 = tmpdir + "montana-clerks-1.pdf" url_1 = "http://sos.mt.gov/elections/forms/elections/electionadministrators.pdf" user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent' : user_agent} req_1 = urllib2.Request(url_1, headers=headers) pdf_1 = urllib2.urlopen(req_1).read() data_1 = dogcatcher.pdf_to_text(pdf_1) output = open(file_path_1, "w") output.write(data_1) output.close() data_1 = open(file_path_1).read() data_1 = data_1.replace("Dulcie Bear Don't Walk PO Box 908", "Dulcie Bear Don't Walk\nPO Box 908") data_1 = data_1.replace("Golden Valley Mary Lu Berry","Golden Valley\nMary Lu Berry") data_1 = data_1.replace("Lewis & Clark Paulette DeHart", "Lewis & Clark\nPaulette DeHart") data_1 = data_1.replace("Powder River Karen D Amende", "Powder River\nKaren D Amende") data_1 = data_1.replace("W i","Wi") streets = [] counties = [] names = []
"phone", "fax", "email", "website", "hours", "voter_state", "source", "review")] #There are two election offices in CT; each one is in a different PDF. The following section grabs the website and writes it to a file. (Writing it to a file isn't strictly necessary, but saves some time down the line.) file_path_1 = tmpdir + "connecticut-clerks-1.pdf" file_path_2 = tmpdir + "connecticut-clerks-2.pdf" url_1 = "http://www.ct.gov/sots/LIB/sots/ElectionServices/lists/TownClerkList.pdf" url_2 = "http://www.sots.ct.gov/sots/lib/sots/electionservices/lists/rovofficeaddresses.pdf" user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent' : user_agent} req_1 = urllib2.Request(url_1, headers=headers) pdf_1 = urllib2.urlopen(req_1).read() data_1 = dogcatcher.pdf_to_text(pdf_1) output = open(file_path_1, "w") output.write(data_1) output.close() req_2 = urllib2.Request(url_2, headers=headers) pdf_2 = urllib2.urlopen(req_2).read() data_2 = dogcatcher.pdf_to_text(pdf_2) output = open(file_path_2, "w") output.write(data_2) output.close() absdata = open(file_path_1).read() regdata = open(file_path_2).read()