import re

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# cache_selenium, cache_request, init_selenium_driver, parse_html,
# normalize_state, and diff_and_save are project-local helpers; BASE_URL and
# EMAIL_LINK_URL are per-module constants.


def fetch_emails():
    html = cache_selenium(EMAIL_LINK_URL)
    soup = BeautifulSoup(html, 'html.parser')
    # Locate the county e-mail spreadsheet by its link text.
    xlsx_url = soup('a', text=re.compile(r'county.*e-?mail', re.IGNORECASE))[0]['href']
    xlsx = cache_request(xlsx_url, is_binary=True)
    # Forward-fill merged cells, then strip stray whitespace from every column.
    emails = pd.read_excel(xlsx).ffill().apply(lambda x: x.str.strip())
    emails = emails.rename(columns={'Email': 'emails'})
    emails['locale'] = emails['County'].str.title() + ' County'
    # One list of addresses per county, keyed by locale.
    return emails.groupby('locale')['emails'].apply(list)
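# fetch_emails relies on cache_request, a project helper not shown in this
# section. A minimal sketch of the assumed behavior follows: an on-disk cache
# keyed by a hash of the URL. The function name and the is_binary flag come
# from the call site above; CACHE_DIR and everything else here is an
# assumption, not the project's actual implementation.
import hashlib
from pathlib import Path

import requests

CACHE_DIR = Path('.cache')  # assumed location


def cache_request(url, is_binary=False):
    """Fetch url once, caching the response body on disk."""
    CACHE_DIR.mkdir(exist_ok=True)
    cache_file = CACHE_DIR / hashlib.md5(url.encode()).hexdigest()
    if not cache_file.exists():
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        cache_file.write_bytes(response.content)
    data = cache_file.read_bytes()
    return data if is_binary else data.decode('utf-8')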
def fetch_data(verbose=True):
    # Reuse one driver across every county page rather than restarting Chrome.
    driver = init_selenium_driver()
    html = cache_selenium(BASE_URL, driver=driver)
    soup = BeautifulSoup(html, 'html.parser')
    county_links = soup.select('a[href^=countyInfo]')
    assert len(county_links) > 0, (
        'No county links found in the following HTML:\n'
        + '#' * 30 + '\n' + html + '\n' + '#' * 30)
    data = [
        fetch_and_parse_county(BASE_URL + county_link['href'], driver)
        for county_link in tqdm(county_links, disable=not verbose)
    ]
    driver.quit()  # quit() shuts the browser down fully; close() only closes the window
    return data
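# Both fetch paths go through cache_selenium and init_selenium_driver, which
# are also project helpers defined elsewhere. A rough sketch of what they
# presumably do, assuming headless Chrome; the wait and driver parameters are
# taken from the call sites, and the on-disk caching layer the "cache_" name
# implies is omitted here for brevity.
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def init_selenium_driver():
    """Start one headless Chrome instance that callers can reuse."""
    options = Options()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)


def cache_selenium(url, wait=0, driver=None):
    """Render url with Selenium and return its page source."""
    own_driver = driver is None
    if own_driver:
        driver = init_selenium_driver()
    try:
        driver.get(url)
        if wait:
            time.sleep(wait)  # give client-side rendering time to settle
        return driver.page_source
    finally:
        if own_driver:
            driver.quit()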
def fetch_and_parse_county(county_url, driver):
    html = cache_selenium(county_url, wait=1, driver=driver)
    soup = BeautifulSoup(html, 'html.parser')
    # Split the page title before 'Supervisor' to leave just the county name.
    county = soup.find('p', class_='title').text.split('Supervisor')[0].strip()
    # Right-hand column links: first the mailto:, then the county website.
    links = soup.find(id='rightContent')('a')
    return {
        'locale': county,
        # Replace non-breaking spaces and drop any title after the comma.
        'official': soup.find('span', class_='bigRed').text.replace('\xa0', ' ').split(',')[0].strip(),
        'emails': [links[0]['href'].replace('mailto:', '').strip()],
        'url': links[1]['href'].strip(),
        'county': county,
    }
def fetch_data():
    # Single-page variant: one cached page holds everything to parse.
    html = cache_selenium(BASE_URL)
    return parse_html(html)
def main():
    html = cache_selenium(BASE_URL)
    data = parse_html(html)
    data = normalize_state(data)
    diff_and_save(data, 'public/massachusetts.json')
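# normalize_state and diff_and_save are shared helpers as well. The name
# suggests diff_and_save reports what changed before overwriting the committed
# JSON; a plausible sketch, assuming data is a list of dicts with a 'locale'
# key (as returned by fetch_and_parse_county above) and with the diff output
# format purely illustrative.
import json
from pathlib import Path


def diff_and_save(data, path):
    """Write data as JSON, printing which locales were added or removed."""
    path = Path(path)
    old = json.loads(path.read_text()) if path.exists() else []
    old_locales = {record['locale'] for record in old}
    new_locales = {record['locale'] for record in data}
    for locale in sorted(new_locales - old_locales):
        print(f'+ {locale}')
    for locale in sorted(old_locales - new_locales):
        print(f'- {locale}')
    path.write_text(json.dumps(data, indent=2))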
def fetch_data(verbose=True):  # pylint: disable=unused-argument
    # verbose is accepted only to match the shared fetch_data interface.
    html = cache_selenium(BASE_URL)
    data = parse_html(html)
    return data