def fetch_data():
    """Walk the county dropdown, then each county's clerk links, and parse
    every jurisdiction page into a record."""
    records = []
    soup = BeautifulSoup(cache_request(BASE_URL, verify=SSL_CERT), 'html.parser')
    for county in tqdm(soup.find('select', id='Counties')('option')):
        if not county.get('value'):  # skip the placeholder option
            continue
        county_page = cache_request(f'{BASE_URL}/SearchByCounty', method='POST',
                                    data={'CountyID': county.get('value')},
                                    wait=random_wait(), verify=SSL_CERT)
        county_soup = BeautifulSoup(county_page, 'html.parser')
        for clerk_link in county_soup('a', class_='local-clerk-link'):
            query = parse_qs(urlparse(clerk_link.get('href')).query)
            form_data = {key: vals[0] for key, vals in query.items() if key != 'dummy'}
            clerk_page = cache_request(f'{BASE_URL}/LocalClerk', method='POST',
                                       data=form_data, wait=random_wait(),
                                       verify=SSL_CERT)
            clerk_soup = BeautifulSoup(clerk_page, 'html.parser')
            records.append(parse_jurisdiction(
                clerk_soup,
                form_data['jurisdictionName'],
                county.text,
                fipscode=form_data['jurisdictionCode']))
    return records
def fetch_data():
    """Follow every countyInfo link from the index page and parse each county."""
    index_soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    results = []
    for link in tqdm(index_soup.select('a[href^=countyInfo]')):
        county_html = cache_request(BASE_URL + link['href'])
        results.append(parse_county(BeautifulSoup(county_html, 'html.parser')))
    return results
def fetch_data():
    """POST each town id from the list page to the detail page and parse it."""
    list_soup = BeautifulSoup(cache_request(LIST_URL), 'html.parser')
    results = []
    for town_option in tqdm(list_soup.find(id='idTown')('option')):
        detail_html = cache_request(
            DETAIL_URL,
            method='POST',
            data={'idTown': town_option['value'], 'contactType': 'R'},
            wait=random.uniform(1, 3))
        results.append(parse_county(BeautifulSoup(detail_html, 'html.parser')))
    return results
def fetch_data():
    """Follow each image-map <area> link on the county boards page."""
    soup = BeautifulSoup(
        cache_request('https://www.elections.ny.gov/CountyBoards.html'),
        'html.parser')
    results = []
    for area in soup.find_all('area'):
        county_html = cache_request(area['href'])
        results.append(parse_county(BeautifulSoup(county_html, 'html.parser')))
    return results
def fetch_data(verbose=True):
    """Follow each image-map <area> link and parse the linked county page.

    Args:
      verbose: show a tqdm progress bar when True.
    """
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    results = []
    for area in tqdm(soup.find_all('area'), disable=not verbose):
        county_html = cache_request(area['href'])
        results.append(parse_county(BeautifulSoup(county_html, 'html.parser')))
    return results
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Replay the .aspx form (including __VIEWSTATE) to request the CSV export."""
    # The .aspx page requires all hidden form inputs to be echoed back.
    landing_page = cache_request(BASE_URL)
    form = BeautifulSoup(landing_page, 'lxml').find('form')
    form_data = {}
    for form_input in form('input'):
        form_data[form_input['name']] = form_input['value']
    # Ask the report endpoint for CSV output instead of the HTML report.
    form_data['ctl00$MainContentPlaceHolder$PPReport'] = 'rdoCsv'
    csv_text = cache_request(BASE_URL, method='POST', data=form_data)
    return parse_csv(csv_text)
def parse_pdf():
    """Download the 'WI Municipal Clerks' PDF and parse one record per city chunk."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    pdf_url = soup.find('a', text=re.compile('^WI Municipal Clerks'))['href']
    pdf_bytes = cache_request(pdf_url, is_binary=True)
    records = []
    with BytesIO(pdf_bytes) as stream:
        reader = PyPDF2.PdfFileReader(stream)
        for page_idx in tqdm(range(reader.numPages)):
            page_text = reader.getPage(page_idx).extractText()
            records.extend(parse_city(chunk)
                           for chunk in re_city_chunk.findall(page_text))
    return records
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Scrape the Kansas SoS table of county election officers."""

    def _clean_number(raw):
        # Same normalization the site's phone/fax cells need: strip newlines,
        # drop '(' and turn ')' into '-'.
        return raw.replace('\n', '').replace('(', '').replace(')', '-')

    text = cache_request('https://www.sos.ks.gov/elections/county_election_officers_all.aspx')
    soup = BeautifulSoup(text, 'html.parser')
    # Remove first labels row.
    counties = []
    for row in soup.body.find_all('tr')[1:]:
        cells = row.find_all('font')
        name = cells[0].text + ' County'
        # Index 3 is hours of operation, which we don't care about.
        # Only some counties have a second address line.
        line2 = cells[7].text
        if len(line2) > 1:
            line2 = ' ' + line2 + ' '
        full_addr = (cells[6].text + line2 + cells[8].text + ' '
                     + cells[9].text + cells[10].text)
        counties.append({
            'county': name,
            'locale': name,
            'officer': cells[1].text,
            'emails': [cells[2].text],
            'phones': [_clean_number(cells[4].text)],
            'faxes': [_clean_number(cells[5].text)],
            # Trailing character dropped as in the upstream format.
            'address': full_addr.replace('\n', ' ').replace('\xa0', ' ')[:-1],
        })
    return counties
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Collect emails from mailto links and phones/faxes/address from <p> text."""
    data = defaultdict(list)
    soup = BeautifulSoup(cache_request(URL), 'html.parser')
    for anchor in soup.find_all('a', href=re.compile(r'^mailto:')):
        # Prefer the visible text when it is itself an address.
        if '@' in anchor.text:
            data['emails'].append(anchor.text)
        else:
            data['emails'].append(anchor['href'].replace('mailto:', ''))
    for paragraph in soup.find_all('p'):
        para_text = paragraph.text
        if 'Fax' in para_text:
            data['faxes'].append(extract_phone_number(para_text))
        elif 'Phone' in para_text or 'Toll-Free' in para_text:
            data['phones'].append(extract_phone_number(para_text))
        elif 'Absentee and Petition Office' in para_text:
            addr_lines = para_text.split('\n')
            data['address'] = (addr_lines[1] + addr_lines[2]).replace('\r', ' ')
    data['locale'] = 'All'
    return [dict(data)]
def query_clerk_data(pdf_data, verbose=True):
    """Look up clerk contact info for each PDF record via the address API.

    Args:
      pdf_data: iterable of dicts with optional 'municipal_address' /
        'mailing_address' fields.
      verbose: show a tqdm progress bar when True.

    Returns:
      List of clerk dicts (one per address that resolved to a clerk), with
      fax/phone1/phone2 normalized via ``re_to_e164`` or set to None.
    """
    clerk_data = []
    for pdf_datum in tqdm(pdf_data, disable=not verbose):
        for field in ('municipal_address', 'mailing_address'):
            addr = pdf_datum.get(field)
            if not addr:
                continue
            match = re_addr.search(addr)
            if match is None:
                # Fix: previously an unparseable address raised AttributeError
                # on .groups(); skip it instead of crashing the whole run.
                continue
            street, city, _, zipcode = match.groups()
            post_data = {
                'addressLine': street,
                'unit': "",
                'city': city,
                'zip': zipcode
            }
            result = cache_request(POST_URL, method='POST', data=post_data,
                                   wait=random.uniform(.1, .3))
            json_data = json.loads(result).get('Data') or {}
            clerk = json_data.get('clerk')
            if not clerk:
                continue
            # Remove invalid phones/faxes.
            for phfax_field in ('fax', 'phone1', 'phone2'):
                phfax_match = re_to_e164.findall(clerk.get(phfax_field, ''))
                clerk[phfax_field] = ''.join(phfax_match[0]) if phfax_match else None
            clerk_data.append(clerk)
    return clerk_data
def main():
    """Extract Wyoming county clerk data from the SoS PDF and write JSON."""
    req = cache_request(
        "https://sos.wyo.gov/Elections/Docs/WYCountyClerks_AbsRequest_VRChange.pdf",
        is_binary=True)
    with BytesIO(req) as fh:
        pdf_reader = PyPDF2.PdfFileReader(fh)
        text = ''
        for page_num in tqdm(range(pdf_reader.numPages)):
            text += pdf_reader.getPage(page_num).extractText()
    # Remove "Page # of #" footers. Fix: use \d+ so documents with 10 or more
    # pages (two-digit page numbers) are also stripped.
    text = re.sub(r"\sPage\s\n\d+\sof\s\n\d+\s", "", text)
    lines = make_lines(text)
    # Remove first couple of lines, which are the PDF header.
    lines = lines[9:]
    rows = group_rows(lines)
    grouped_rows = group_counties(rows)
    counties = generate_county_dict_list(grouped_rows)
    with open('public/wyoming.json', 'w') as f:
        json.dump(counties, f)
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Parse every row of the county auditors table."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    row_selector = 'table#ctl00_ContentPlaceHolder1_rgCountyAuditors_ctl00 tbody tr'
    rows = soup.select(row_selector)
    return [parse_row(row) for row in rows]
def get_locality_datum(id_):
    """Fetch and parse the public-contact record for one Virginia locality."""
    page = cache_request(
        'https://vote.elections.virginia.gov/VoterInformation/PublicContactLookup',
        method='POST',
        data={'LocalityUid': id_},
        wait=2,
    )
    soup = BeautifulSoup(page, 'lxml')
    wrapper = soup.select('.resultsWrapper')[0]
    labels = wrapper.select('h5.display-lable')
    fields = wrapper.select('p.display-field')
    results = {label.text.strip(): field.text.strip()
               for label, field in zip(labels, fields)}
    locale = soup.select('select > option[selected="selected"]')[0].text.title()
    is_county = locale.endswith('County')
    datum = {
        'locale': locale,
        'county': locale if is_county else None,
        'city': None if is_county else locale,
        'official': results['Registrar'],
        'emails': [results['Email']],
        'faxes': [results['Fax']],
        'url': results.get('URL'),
        'address': results.get('Mailing Address') or results.get('Address'),
        'physicalAddress': results.get('Physical Address'),
    }
    # Drop empty/missing fields.
    return {key: val for key, val in datum.items() if val}
def get_locality_ids():
    """Return the non-empty option values from the locality dropdown."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    ids = []
    for option in soup.select('select>option'):
        if option['value']:
            ids.append(option['value'])
    return ids
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Parse each county div from the listing page."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    county_divs = soup.select('div.field-items>div.field-item div.col-sm-6')
    parsed = [parse_county(div) for div in county_divs]
    # Sanity check: expect exactly 93 entries from this page.
    assert len(parsed) == 93
    return parsed
def fetch_emails():
    """Download the county e-mail spreadsheet and group emails by locale."""
    listing_html = cache_selenium(EMAIL_LINK_URL)
    listing_soup = BeautifulSoup(listing_html, 'html.parser')
    xlsx_url = listing_soup('a', text=re.compile(r'county.*e-?mail',
                                                 re.IGNORECASE))[0]['href']
    xlsx_bytes = cache_request(xlsx_url, is_binary=True)
    # Forward-fill merged cells, then strip stray whitespace from every column.
    table = pd.read_excel(xlsx_bytes).fillna(method='ffill')
    table = table.apply(lambda col: col.str.strip())
    table = table.rename(columns={'Email': 'emails'})
    table['locale'] = table['County'].str.title() + ' County'
    return table.groupby('locale')['emails'].apply(list)
def fetch_data():
    """Parse each county div from the listing page."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    county_divs = soup.select('div.field-items>div.field-item div.col-sm-6')
    parsed = [parse_county(div) for div in county_divs]
    # Sanity check: expect exactly 93 entries from this page.
    assert len(parsed) == 93
    return parsed
def fetch_data():
    """Parse each populated county_info_* div from the AZ SoS page."""
    soup = BeautifulSoup(cache_request('https://azsos.gov/county-election-info'),
                         'html.parser')
    results = []
    for county_div in soup('div', id=re.compile('^county_info_')):
        # There are extra blank divs; a real county div contains an <h2>.
        if county_div.find('h2'):
            results.append(parse_county(county_div))
    return results
def fetch_data():
    """Fetch the pipe-delimited feed and convert each non-empty row to a record."""
    text = cache_request(BASE_URL)
    # The server wraps the payload in a literal '<plaintext>' tag; strip it.
    prefix = '<plaintext>'
    if text.startswith(prefix):
        text = text[len(prefix):]
    rows = [row for row in csv.reader(StringIO(text), delimiter='|') if row]
    return [record(row) for row in rows]
def get_locality_ids():
    """Return the non-empty option values from the locality dropdown."""
    page = cache_request(
        'https://vote.elections.virginia.gov/VoterInformation/PublicContactLookup'
    )
    soup = BeautifulSoup(page, 'lxml')
    ids = []
    for option in soup.select('select>option'):
        if option['value']:
            ids.append(option['value'])
    return ids
def parse_pdf():
    """Locate the 'WI Municipal Clerks' PDF and parse one record per city chunk."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    pdf_url = soup.find('a', text=re.compile('^WI Municipal Clerks'))['href']
    pdf_text = fetch_pdf_text(pdf_url)
    records = []
    for city_chunk in tqdm(re_city_chunk.findall(pdf_text)):
        records.append(parse_city(city_chunk))
    return records
def parse_email_list(email_list_url):
    """Map county name -> contact email from the email-list table."""
    soup = BeautifulSoup(cache_request(email_list_url), 'html.parser')
    email_by_county = {}
    for row in soup.find('tbody')('tr'):
        county_name = row.find('td').text
        anchor = row.find('a')
        # Some rows have no email link; skip them.
        if anchor is not None:
            email_by_county[county_name] = anchor.text
    return email_by_county
def fetch_data(verbose=True):
    """Fetch every county listed in the page's dropdown.

    Args:
      verbose: show a tqdm progress bar when True.
    """
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    county_names = []
    for option in soup.select('select>option'):
        if option['value']:
            county_names.append(option['value'])
    return [fetch_county(name)
            for name in tqdm(county_names, disable=not verbose)]
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Pull emails, phones and faxes out of the dcboeContent block."""
    soup = BeautifulSoup(cache_request(URL), 'html.parser')
    content_text = soup.find(class_='dcboeContent').get_text('\n')
    fax_numbers = re_fax.findall(content_text)
    # Anything matching the phone regex that is also a fax is excluded.
    phone_numbers = [num for num in re_phone.findall(content_text)
                     if num not in fax_numbers]
    return [{
        'locale': 'All',
        'emails': re_email.findall(content_text),
        'phones': phone_numbers,
        'faxes': fax_numbers,
    }]
def get_pdf_text(url):
    """Download a PDF and return its full extracted text via pdfminer."""
    pdf_bytes = cache_request(url, is_binary=True)
    with BytesIO(pdf_bytes) as pdf_file:
        with StringIO() as output:
            manager = PDFResourceManager()
            converter = TextConverter(manager, output, laparams=LAParams())
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(pdf_file, set()):
                interpreter.process_page(page)
            converter.close()
            extracted = output.getvalue()
    return extracted
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    """Pair each county heading link with its accordion body and parse both."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'lxml')
    results = []
    for county_link in soup.select('h2.contentpage-h2 a'):
        # data-target holds '#<element id>' of the matching detail panel.
        panel_id = county_link['data-target'].split('#')[1]
        panel = soup.find(id=panel_id)
        results.append(parse_county(county_link, panel))
    return results
def fetch_data():
    """Merge the county board PDF data with the email list and return records."""
    soup = BeautifulSoup(cache_request(BASE_URL), 'html.parser')
    pdf_link = soup.find('a', text=re.compile('^List of County Election Boards'))
    email_link = soup.find('a', text=re.compile('^County Election Board Email Addresses'))
    pdf_data = parse_pdf(urljoin(BASE_URL, pdf_link['href']))
    email_data = parse_email_list(urljoin(BASE_URL, email_link['href']))
    # Both sources must cover the same set of counties.
    assert len(pdf_data) == len(email_data)
    for county_name, email in email_data.items():
        pdf_data[county_name]['emails'] = [email]
    return list(pdf_data.values())
def fetch_county(county_name):
    """Scrape one county's contact table into a record dict."""
    locale = f"{county_name} County"
    html = cache_request(f"{BASE_URL}?County={county_name}",
                         wait=random.uniform(.5, 1.5))
    soup = BeautifulSoup(html, 'lxml')
    table = soup.select_one('table#data')
    # Collapse runs of blank lines so the line-oriented regexes line up.
    table_text = re_dense_lines.sub('\n', table.get_text('\n'))
    urls = re_url.findall(table_text)
    return {
        'locale': locale,
        'county': locale,
        'official': re_official.findall(table_text)[0],
        'emails': re_email.findall(table_text),
        'faxes': re_fax.findall(table_text),
        'phones': re_phone.findall(table_text),
        'url': urls[0] if urls else None,
        'address': parse_addr(re_mailing_addr.findall(table_text)),
        'physicalAddress': parse_addr(re_physical_addr.findall(table_text)),
    }
def main():
    """Scrape Maryland county/city election board contacts and write JSON."""
    text = cache_request(
        'https://elections.maryland.gov/about/county_boards.html')
    soup = BeautifulSoup(text, 'lxml')
    # Each board is one <p> inside the main content wrapper.
    counties = soup.select('div.mdgov_contentWrapper > p')
    # lines = [line for line in line_gen(counties[1].children)]
    data = []
    for county in counties:
        lines = list(county.children)
        # find_hrefs pulls urls/emails out of the anchor tags in this block.
        href_datum = find_hrefs(lines)
        url_datum = {
            'url': href_datum['urls'][0]
        } if href_datum['urls'] else {}
        # First line of the block is the geography name ("X County" or a city).
        geo = lines[0].text.strip()
        if geo.endswith('City'):
            # NOTE(review): locale uses a 'city:' / ':county' colon convention —
            # presumably consumed downstream; confirm before changing.
            geo_datum = {
                'locale': geo + ':',
                'city': geo,
            }
        else:
            # for Baltimore
            county = geo if geo.endswith(' County') else geo + ' County'
            geo_datum = {
                'locale': ':' + county,
                'county': county,
            }
        datum = {
            **geo_datum,
            'official': find_re(election_director_re, lines),
            'phones': find_re(phone_re, lines, find_all=True),
            'faxes': [find_re(fax_re, lines)],
            **url_datum,
            **href_datum,
        }
        # Every board is expected to have at least one email and a fax;
        # fail loudly if the page layout changes.
        assert datum['emails']
        assert find_re(fax_re, lines)
        data += [datum]
    with open('public/maryland.json', 'w') as fh:
        json.dump(data, fh)
def get_locality_datum(id_, attempt=0):
    """Fetch one Virginia locality's contact record, retrying on bad responses.

    Args:
      id_: locality uid POSTed as 'LocalityUid'.
      attempt: internal retry counter; gives up after 5 retries.

    Returns:
      Dict of non-empty contact fields, or {} if retries are exhausted.
    """
    if attempt > 5:
        return {}
    page = cache_request(
        BASE_URL,
        method='POST',
        data={'LocalityUid': id_},
        wait=random.uniform(2, 3),
    )
    soup = BeautifulSoup(page, 'lxml')
    results_wrappers = soup.select('.resultsWrapper')
    if results_wrappers:
        # Labels and values appear as parallel h5/p lists inside the wrapper.
        keys = results_wrappers[0].select('h5.display-lable')
        vals = results_wrappers[0].select('p.display-field')
        results = {
            key.text.strip(): val.text.strip()
            for key, val in zip(keys, vals)
        }
        # The selected dropdown option names the locality.
        locale = soup.select(
            'select > option[selected="selected"]')[0].text.title()
        final = {
            'locale': locale,
            'county': locale if locale.endswith('County') else None,
            'city': locale if not locale.endswith('County') else None,
            'official': results['Registrar'],
            'emails': to_list(results['Email']),
            'faxes': to_list(results['Fax']),
            'url': results.get('URL'),
            'address': results.get('Mailing Address') or results.get('Address'),
            'physicalAddress': results.get('Physical Address'),
        }
        # Drop empty/missing fields.
        return {k: v for k, v in final.items() if v}
    # No results wrapper: the cached response is bad. Evict the most recently
    # written cache file so the retry refetches, then wait and recurse.
    cached_files = glob.glob(os.path.join(work_dir, '*.*'))
    latest_file = max(cached_files, key=os.path.getctime)
    logging.warning(
        "error in Virginia file; deleting cached file %s; retry after wait",
        latest_file)
    os.remove(latest_file)
    sleep(30)
    return get_locality_datum(id_, attempt + 1)