try: county_name = county_name_re.findall(county)[0].title().strip() except: continue authority_name = authority_name_re.findall(county)[0].title().strip() print[authority_name] official_name = name_re.findall(county)[0] first_name, last_name, review = dogcatcher.split_name( official_name, review) fax = fax_re.findall(county)[0] county = county.replace(fax, "") fax = dogcatcher.clean_phone(fax) #Nothing distinctive starts the line with the phone number, but the elections office phone number, by some quirk of the data, is always the penultimate phone number of everything that looks like a phone number. #So we grab everything that looks like a phone number, and use the penultimate, phone_all = phone_re.findall(county) phone = dogcatcher.clean_phone(phone_all[len(phone_all) - 1]) #This section finds the full address. After finding the address, it identifies a city/state/zip (csz) combination and a PO Box number if that exists. #It removes both the CSZ and the PO Address (if it exists) from the full address, leaving behind a street address with some garbage. #It then cleans up the street address and pulls the city, state, and zip out of the csz, and assigns them as appropriate to the street address and state. #The state is written as "Hawaii", so we replace it with "HI." If it's ever something else, we break the program to examine the change. address = address_re.findall(county)[0] csz = csz_re.findall(address)[0]
street = " ".join(data.cell(row, 9).value.split()) po_street = " ".join(data.cell(row, 5).value.split()) if po_street != street: po_city = " ".join(data.cell(row, 6).value.split()) po_state = " ".join(data.cell(row, 7).value.split()) po_zip_code = " ".join(data.cell(row, 8).value.split()) else: po_street = "" city = " ".join(data.cell(row, 10).value.split()) address_state = " ".join(data.cell(row, 11).value.split()) zip_code = " ".join(data.cell(row, 12).value.split()) if zip_code == "0542": zip_code = "05342" phone = dogcatcher.clean_phone(data.cell(row, 13).value, "802") fax = dogcatcher.clean_phone(" ".join(data.cell(row, 14).value.split())) if fax == "N/A": fax = "" email = " ".join(data.cell(row, 15).value.lower().split()) hours = data.cell(row, 16).value.rstrip().replace("//", "+++++") fips = dogcatcher.find_fips(county_name, voter_state) result.append([ authority_name, first_name, last_name, town_name, county_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city,
po_city = " ".join(data.cell(row,10).value.split()) po_state = " ".join(data.cell(row,11).value.split()) #Occasionally, there's a separate mailing zip code. This checks that, and grabs it if so. Otherwise, it assigns the main zip code to be the mailing zip code. if data.cell(row,13).value: po_zip_code = " ".join(data.cell(row,13).value.split()) else: po_zip_code = " ".join(data.cell(row,12).value.split()) else: po_city = " ".join(data.cell(row,10).value.split()) po_state = " ".join(data.cell(row,11).value.split()) po_zip_code = " ".join(data.cell(row,12).value.split()) phone = dogcatcher.clean_phone(" ".join(data.cell(row,5).value.split())) fax = dogcatcher.clean_phone(" ".join(data.cell(row,6).value.split())) email = " ".join(data.cell(row,7).value.lower().split()) website = dogcatcher.clean_website(data.cell(row,14).value.rstrip().replace("//","+++++")) fips = dogcatcher.find_fips(county_name, voter_state) result.append([authority_name, first_name, last_name, county_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code, reg_phone, reg_fax, reg_email, reg_website, reg_hours,
# Regexes used while walking the table rows.
email_re = re.compile(r"[^<>]+?@[^<>]+")
digit_re = re.compile(r"\d")
html_re = re.compile(r"<.+?>")

county_data = county_data_re.findall(data)

for county in county_data:

    # Start every record from the values dogcatcher.begin() returns.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    # The data is arranged in a well-ordered table, so each row is split into a
    # list of items and each item is used the same way from row to row.
    county_item = county_item_re.findall(county)

    # The fax cell is read before the HTML is stripped, as in the original order.
    fax = dogcatcher.clean_phone(county_item[7])
    email = dogcatcher.find_emails(email_re, county)

    # Strip every HTML tag from every item. The previous in-place loop rebuilt
    # each replacement from the unmodified item, so a cell with several
    # different tags only lost the last one; it also modified the list while
    # iterating over it.
    county_item = [html_re.sub("", item) for item in county_item]

    county_name = county_item[0].title()
    first_name = " ".join(county_item[1].split())
    last_name = county_item[2]
"<br />", "").strip(" \n\r,") else: city = city_re.findall(csz)[0] zip_code = zip_re.findall(csz)[0] street = " ".join(address.replace(csz, "").split()).replace( "<br />", "").strip(" \n\r,") print "++++++++++++++++++++++++++++++++++++++++++++" print[address] print[po_street + street] phone_fax = phone_fax_re.findall(county)[0] phone = dogcatcher.clean_phone(number_re.findall(phone_fax)[0]) fax = dogcatcher.clean_phone(number_re.findall(phone_fax)[1]) fips = dogcatcher.find_fips(county_name, voter_state) result.append([ authority_name, first_name, last_name, county_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code, reg_phone, reg_fax, reg_email, reg_website, reg_hours, phone, fax, email, website, hours, voter_state, source, review ]) #This outputs the results to a separate text file. dogcatcher.output(result, voter_state, cdir)
# Replace each substring matched by many_space_re with a single space.
for space in many_space_re.findall(data):
    data = data.replace(space, " ")

county_data = county_re.findall(data)

for county in county_data:

    # Start every record from the values dogcatcher.begin() returns.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    authority_name = "Department of Elections"
    county_name = county_name_re.findall(county)[0].strip()

    # The first phone-like match is the phone number and the second is the fax.
    phone_numbers = phone_re.findall(county)
    phone = dogcatcher.clean_phone(phone_numbers[0])
    fax = dogcatcher.clean_phone(phone_numbers[1])

    website = dogcatcher.find_website(website_re, county)

    # Find the full address, then the city/state/zip (csz) combination inside it
    # and a PO Box line if one exists. The code that removes both from the
    # address and splits the csz follows this fragment.
    address = address_re.findall(county)[0]
    csz = csz_re.findall(address)[0]

    try:
        po_street = po_re.findall(address)[0]
    except IndexError:
        # No PO Box match in this address.
        po_street = ""
po_zip_code = zip_re.findall(csz)[0] po_street = address.replace(csz,"").replace("\n\r",", ").replace("<br />","").strip(" \n\r,") else: city = city_re.findall(csz)[0] zip_code = zip_re.findall(csz)[0] street = " ".join(address.replace(csz,"").split()).replace("<br />","").strip(" \n\r,") print "++++++++++++++++++++++++++++++++++++++++++++" print [address] print [po_street + street] phone_fax = phone_fax_re.findall(county)[0] phone = dogcatcher.clean_phone(number_re.findall(phone_fax)[0]) fax = dogcatcher.clean_phone(number_re.findall(phone_fax)[1]) fips = dogcatcher.find_fips(county_name, voter_state) result.append([authority_name, first_name, last_name, county_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code, reg_phone, reg_fax, reg_email, reg_website, reg_hours, phone, fax, email, website, hours, voter_state, source, review])
# Fragment of a per-town loop body (the enclosing loop header is outside this view).

# Whichever registrar address was found gets its city, state and zip from reg_csz.
if reg_po_street:
    matched_city = city_re.findall(reg_csz)[0]
    reg_po_city = matched_city.strip().title()
    matched_state = state_re.findall(reg_csz)[0]
    reg_po_state = matched_state.strip()
    matched_zip = zip_re.findall(reg_csz)[0]
    reg_po_zip_code = matched_zip.strip().title()

if reg_street:
    matched_city = city_re.findall(reg_csz)[0]
    reg_city = matched_city.strip().title()
    matched_state = state_re.findall(reg_csz)[0]
    reg_state = matched_state.strip()
    matched_zip = zip_re.findall(reg_csz)[0]
    reg_zip_code = matched_zip.strip().title()

phone = dogcatcher.find_phone(phone_re, town, areacode="203")

# When the result contains "(203) 203-", everything up to the first space is
# dropped and the remainder is cleaned again.
if "(203) 203-" in phone:
    _head, _sep, remainder = phone.partition(" ")
    phone = dogcatcher.clean_phone(remainder)
    print(phone)

email = dogcatcher.find_emails(email_re, town)
fax = dogcatcher.find_phone(fax_re, town)

official_name = name_re.findall(town)[0].title()
first_name, last_name, review = dogcatcher.split_name(official_name, review)

# Find the full address for the town clerk. The code that identifies the
# city/state/zip (csz) and any PO Box, and cleans the street address, follows
# this fragment.
address = abs_address_re.findall(town)[0]
# Fragment of a per-county loop body (the enclosing loop header is outside this view).
# The data is a well-ordered table, so each row is split into a list of cells
# and every cell is read by position. Because the layout is predictable, most
# cells are used directly rather than through the dogcatcher finder helpers.
county_item = county_item_re.findall(county)

name_cell = county_item[0]
county_name = name_cell.title().strip()

official_name = county_item[1]
first_name, last_name, review = dogcatcher.split_name(official_name, review)

email = county_item[2].strip().lower()
hours = county_item[3].strip()
phone = dogcatcher.clean_phone(county_item[4])
fax = dogcatcher.clean_phone(county_item[5])

# Address line 1, line 2 (if present), city, state and zip are separate cells.
# A PO Box, when there is one, is always in line 1. In that case the mailing
# city/state are read here; the rest of the PO Box / street handling continues
# past this fragment.
address_1 = county_item[6].strip()
address_2 = county_item[7].strip()

has_po_box = "PO Box" in address_1
if has_po_box:
    po_city = county_item[8].strip()
    po_state = county_item[9].strip()
# Regexes used while walking the table rows.
county_item_re = re.compile(
    r"<td borderColor=.+?>\s*<.+?>(.+?)</font> *</td>", re.DOTALL)
email_re = re.compile(r"[^<>]+?@[^<>]+")
digit_re = re.compile(r"\d")
html_re = re.compile(r"<.+?>")

county_data = county_data_re.findall(data)

for county in county_data:

    # Start every record from the values dogcatcher.begin() returns.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    # The data is arranged in a well-ordered table, so each row is split into a
    # list of items and each item is used the same way from row to row.
    county_item = county_item_re.findall(county)

    # The fax cell is read before the HTML is stripped, as in the original order.
    fax = dogcatcher.clean_phone(county_item[7])
    email = dogcatcher.find_emails(email_re, county)

    # Strip every HTML tag from every item. The previous in-place loop rebuilt
    # each replacement from the unmodified item, so a cell with several
    # different tags only lost the last one; it also modified the list while
    # iterating over it.
    county_item = [html_re.sub("", item) for item in county_item]

    county_name = county_item[0].title()
    first_name = " ".join(county_item[1].split())
    last_name = county_item[2]
# Fragment of a per-county loop body (the enclosing loop header is outside this view).
# The data is a well-ordered table, so each row is split into a list of cells
# and every cell is read by position. Because the layout is predictable, most
# cells are used directly rather than through the dogcatcher finder helpers.
county_item = county_item_re.findall(county)

name_cell = county_item[0]
county_name = name_cell.title().strip()

official_name = county_item[1]
first_name, last_name, review = dogcatcher.split_name(official_name, review)

email = county_item[2].strip().lower()
hours = county_item[3].strip()
phone = dogcatcher.clean_phone(county_item[4])
fax = dogcatcher.clean_phone(county_item[5])

# Address line 1, line 2 (if present), city, state and zip are separate cells.
# A PO Box, when there is one, is always in line 1. In that case the mailing
# city is read here; the rest of the PO Box / street handling continues past
# this fragment.
address_1 = county_item[6].strip()
address_2 = county_item[7].strip()

has_po_box = "PO Box" in address_1
if has_po_box:
    po_city = county_item[8].strip()
# Regexes for the address block: text up to and including a zip code, a
# ", <something>" piece, and the zip code itself.
mailing_re = re.compile(".+?\d{5}[\d-]*")
cz_re = re.compile(", [^,]+")
zip_re = re.compile("\d{5}[\d-]*")

for i in range(77):

    # Start every record from the values dogcatcher.begin() returns.
    (authority_name, first_name, last_name, county_name, town_name, fips,
     street, city, address_state, zip_code,
     po_street, po_city, po_state, po_zip_code,
     reg_authority_name, reg_first, reg_last,
     reg_street, reg_city, reg_state, reg_zip_code,
     reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code,
     reg_phone, reg_fax, reg_email, reg_website, reg_hours,
     phone, fax, email, website, hours, review) = dogcatcher.begin(voter_state)

    authority_name = "County Election Board"

    # The i-th entry of each parallel list belongs to the same county.
    phone_name = phone_names[i]
    hours = hours_all[i]
    county_name = county_names[i].replace("LeFlore", "Le Flore")
    address_block = address_blocks[i]

    # phone_name is split on its first two spaces: the first token is the
    # phone, the second is the fax, and the remainder is the official's name.
    raw_phone, _sep, after_phone = phone_name.partition(" ")
    raw_fax, _sep, official_name = after_phone.partition(" ")
    phone = dogcatcher.clean_phone(raw_phone)
    fax = dogcatcher.clean_phone(raw_fax)
    first_name, last_name, review = dogcatcher.split_name(
        official_name, review, "ignore")

    # Every county name is attached to a number (i + 1); remove it.
    county_name = county_name.replace(str(i + 1), "").strip()

    # One of the items in the address block is a mailing address (which may be
    # the same as the physical address). It is extracted first; the comparison
    # against the physical address follows this fragment.
    mailing = mailing_re.findall(address_block)[0]
street = " ".join(data.cell(row,9).value.split()) po_street = " ".join(data.cell(row,5).value.split()) if po_street != street: po_city = " ".join(data.cell(row,6).value.split()) po_state = " ".join(data.cell(row,7).value.split()) po_zip_code = " ".join(data.cell(row,8).value.split()) else: po_street = "" city = " ".join(data.cell(row,10).value.split()) address_state = " ".join(data.cell(row,11).value.split()) zip_code = " ".join(data.cell(row,12).value.split()) if zip_code == "0542": zip_code = "05342" phone = dogcatcher.clean_phone(data.cell(row,13).value,"802") fax = dogcatcher.clean_phone(" ".join(data.cell(row,14).value.split())) if fax == "N/A": fax = "" email = " ".join(data.cell(row,15).value.lower().split()) hours = data.cell(row,16).value.rstrip().replace("//","+++++") fips = dogcatcher.find_fips(county_name, voter_state) result.append([authority_name, first_name, last_name, town_name, county_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code,
if po_street: po_city = " ".join(data.cell(row, 10).value.split()) po_state = " ".join(data.cell(row, 11).value.split()) #Occasionally, there's a separate mailing zip code. This checks that, and grabs it if so. Otherwise, it assigns the main zip code to be the mailing zip code. if data.cell(row, 13).value: po_zip_code = " ".join(data.cell(row, 13).value.split()) else: po_zip_code = " ".join(data.cell(row, 12).value.split()) else: po_city = " ".join(data.cell(row, 10).value.split()) po_state = " ".join(data.cell(row, 11).value.split()) po_zip_code = " ".join(data.cell(row, 12).value.split()) phone = dogcatcher.clean_phone(" ".join(data.cell(row, 5).value.split())) fax = dogcatcher.clean_phone(" ".join(data.cell(row, 6).value.split())) email = " ".join(data.cell(row, 7).value.lower().split()) website = dogcatcher.clean_website( data.cell(row, 14).value.rstrip().replace("//", "+++++")) fips = dogcatcher.find_fips(county_name, voter_state) result.append([ authority_name, first_name, last_name, county_name, fips, street, city, address_state, zip_code, po_street, po_city, po_state, po_zip_code, reg_authority_name, reg_first, reg_last, reg_street, reg_city, reg_state, reg_zip_code, reg_po_street, reg_po_city, reg_po_state, reg_po_zip_code, reg_phone, reg_fax, reg_email, reg_website, reg_hours,
try: county_name = county_name_re.findall(county)[0].title().strip() except: continue authority_name = authority_name_re.findall(county)[0].title().strip() print [authority_name] official_name = name_re.findall(county)[0] first_name, last_name, review = dogcatcher.split_name(official_name, review) fax = fax_re.findall(county)[0] county = county.replace(fax,"") fax = dogcatcher.clean_phone(fax) #Nothing distinctive starts the line with the phone number, but the elections office phone number, by some quirk of the data, is always the penultimate phone number of everything that looks like a phone number. #So we grab everything that looks like a phone number, and use the penultimate, phone_all = phone_re.findall(county) phone = dogcatcher.clean_phone(phone_all[len(phone_all)-1]) #This section finds the full address. After finding the address, it identifies a city/state/zip (csz) combination and a PO Box number if that exists. #It removes both the CSZ and the PO Address (if it exists) from the full address, leaving behind a street address with some garbage. #It then cleans up the street address and pulls the city, state, and zip out of the csz, and assigns them as appropriate to the street address and state. #The state is written as "Hawaii", so we replace it with "HI." If it's ever something else, we break the program to examine the change. address = address_re.findall(county)[0] csz = csz_re.findall(address)[0]