output.close()

req_2 = urllib2.Request(url_2, headers=headers)
pdf_2 = urllib2.urlopen(req_2).read()
data_2 = dogcatcher.pdf_to_text(pdf_2)

output = open(file_path_2, "w")
output.write(data_2)
output.close()

absdata = open(file_path_1).read()
regdata = open(file_path_2).read()

#Check to make sure that "W I" doesn't appear in the source documents before running.
absdata = dogcatcher.po_standardize(absdata.replace("W I","WI").replace("","").replace("ONE FIRST","1 FIRST"))
regdata = dogcatcher.po_standardize(regdata.replace("W I","WI").replace("","").replace("ONE FIRST","1 FIRST"))

absdata = absdata.replace("\nN. ","\nNorth ")
regdata = regdata.replace("N. S","North S")

#Remove everything up to and including each HH:MM:SS AM/PM timestamp.
header_re = re.compile(".+?\d{2}:\d{2}:\d{2} [AP]M", re.DOTALL)
for item in header_re.findall(absdata):
    absdata = absdata.replace(item,"")

abstown_re = re.compile("([A-Z][A-Z].+?TOWN CLERK.+?)\n\n", re.DOTALL)
regtown_re = re.compile("REGISTRAR[S]* OF .+?CT\s*\d{5}[-\d]*\n\n", re.DOTALL)
regtown_name_re = re.compile("REGIS.+?, (.+)")
abstown_name_re = re.compile("(.+) TOWN CLERK")
party_re = re.compile(" [\[\(].+?[\)\]]")
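#dogcatcher.po_standardize() is defined elsewhere in the project. A minimal,
#hypothetical sketch of what it might do, assuming it only normalizes the various
#"P.O. Box" spellings so later address parsing sees one consistent form; the real
#helper may do more than this.
def po_standardize(text):
    for variant in ("P.O. Box", "P. O. Box", "P O Box", "PO BOX", "P.O. BOX"):
        text = text.replace(variant, "PO Box")
    return text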
state_re = re.compile(" ([A-Z][A-Z]) ")
csz_re = re.compile("[^,\t\n]+?, [A-Z][A-Z] \d{5}[\d-]*")
city_re = re.compile("(.+?),")
zip_re = re.compile("\d{5}[\d-]*")
is_street_re = re.compile("[^,\. \n\t]")
street_break_re = re.compile(" *,* *\n")
multi_comma_re = re.compile(", *, *")
multi_space_re = re.compile(" +")

data = data.replace("- ","-")
data = data.replace(" and<br>\n",", ") #fixing an edge case in Morris County
data = data.replace("-4:30pm","-4:30pm<br") #fixing an edge case in Mercer County
data = data.replace("(FAX) 609-989-6888<br>\nOffice Hours: 8:00am-4:00pm","(FAX) 609-989-6888<br>\nOffice Hours: 8:00am-4:00pm<br>")
data = dogcatcher.po_standardize(data)

county_data = county_data_re.findall(data)

#In each county, there are separate offices for registration and absentee ballots.
#This separates those offices and then applies essentially identical procedures to both.
for county in county_data:
    authority_name, first_name, last_name, county_name, town_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code, reg_phone, reg_fax, reg_email, reg_website, reg_hours, phone, fax, email, website, hours, review = dogcatcher.begin(voter_state)

    county_name = county_name_re.findall(county)[0]

    #This isolates the county clerk data from the complete county.
    clerk = clerk_re.findall(county)[0]
    clerk_name = name_re.findall(clerk)[0]
    first_name, last_name, review = dogcatcher.split_name(clerk_name, review)
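#dogcatcher.begin() is unpacked into 36 fields above. A rough sketch of what such a
#helper could look like, assuming it simply blanks every output column (the final
#value being the running "review" note); the real function may seed some fields,
#such as the state, from its voter_state argument.
def begin(voter_state):
    return tuple([""] * 36)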
csz_re = re.compile("(<p>[^\d]+?, *[A-Z][A-Z] *\d{5}[\d-]*</p>)")
city_re = re.compile("<p>(.+?),")
state_re = re.compile(" [A-Z][A-Z] ")
zip_re = re.compile("\d{5}[-\d]*")
address_re = re.compile("</p>.+?<p>(.+? \d{5}[\d-]*</p>)", re.DOTALL)
po_re = re.compile("(P.* *O.* .+?)</p>")
phone_re = re.compile("Phone: (.+?)")
fax_re = re.compile("Fax: (.+?)")
town_name_re = re.compile("(.+?)</h2>")

data = open(file_path).read()
data = data.replace("<p style=\"clear:both;\">Last Updated:","</div><div style=\"clear:left;\">")
data = dogcatcher.po_standardize(data)

county_data = county_re.findall(data)

for county in county_data:
    authority_name, first_name, last_name, county_name, town_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code, reg_phone, reg_fax, reg_email, reg_website, reg_hours, phone, fax, email, website, hours, review = dogcatcher.begin(voter_state)

    authority_name = "Municipal Clerk"
    print authority_name

    #Town names need many edits so that the data comes back cleanly from the Google Maps API.
    town_name = town_name_re.findall(county)[0].replace("Plt","Plantation").strip(".")
    if town_name == "Rockwood Strip":
        town_name = town_name.replace(" Strip","")
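#The town-name cleanup above only matters because the cleaned name is later handed to
#a geocoder. A hypothetical illustration of that step, assuming the standard Google
#Maps Geocoding endpoint is queried with a "<town>, ME" string (recent versions of the
#API also require a key); the project's real geocoding code is not shown here.
import json
import urllib
import urllib2

def geocode_town(town_name, state_abbrev="ME"):
    query = urllib.urlencode({"address": town_name + ", " + state_abbrev})
    url = "https://maps.googleapis.com/maps/api/geocode/json?" + query
    return json.loads(urllib2.urlopen(url).read())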
output.write(data)
output.close()

reg_url = "http://www.sos.state.tx.us/elections/voter/votregduties.shtml"
reg_data = urllib.urlopen(reg_url).read()
output = open(reg_file_path,"w")
output.write(reg_data)
output.close()

data = open(file_path).read()
reg_data = open(reg_file_path).read()

# fix some issues in the data
# data = data.replace("550 E. 2nd Ave Belton 76513", "550 E. 2nd Ave, Belton 76513")
data = dogcatcher.po_standardize(data.replace("&quot;","'").replace("&amp;","&").replace(", TX",""))
reg_data = dogcatcher.po_standardize(reg_data.replace("&quot;","'").replace(", TX",""))

#Add a space after any comma that is missing one.
no_space_re = re.compile(",[^\s]")
for item in no_space_re.findall(data):
    data = data.replace(item, dogcatcher.insert(item, " ", 1))
for item in no_space_re.findall(reg_data):
    reg_data = reg_data.replace(item, dogcatcher.insert(item, " ", 1))

county_re = re.compile("<dl>\s*(<dt>.+?</dd>)\s*</dl>", re.DOTALL)
county_data_item_re = re.compile("dd>([^\n\r]+?\s*[^\n<]*?)\s*<",re.DOTALL)
reg_county_data_item_re = re.compile("dd>(.+?)\s*<", re.DOTALL)
county_name_re = re.compile("<..>([^<>]+?)</dt>")
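#dogcatcher.insert() is used above to turn matches like ",B" into ", B". A plausible
#sketch, assuming it simply splices new text into a string at the given index; the
#real helper may differ.
def insert(original, new, position):
    return original[:position] + new + original[position:]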