import os
from unittest import TestCase

# Import path assumed to follow the openelex.us.<state>.datasource convention.
from openelex.us.ar.datasource import Datasource

# fixture_dir is expected to be defined at module level as the path to the
# directory holding the HTML test fixtures.


class TestDatasource(TestCase):
    def setUp(self):
        self.datasource = Datasource()

    def test_clarity_election_base_url(self):
        # (raw URL, expected base URL) pairs
        urls = [
            ("http://results.enr.clarityelections.com/AR/39376/83979/en/reports.html",
             "http://results.enr.clarityelections.com/AR/39376/83979/"),
            ("http://results.enr.clarityelections.com/AR/Arkansas/42845/index.html",
             "http://results.enr.clarityelections.com/AR/Arkansas/42845/"),
        ]
        for url, expected in urls:
            base_url = self.datasource._clarity_election_base_url(url)
            self.assertEqual(base_url, expected)

    def test_scrape_county_paths(self):
        fixture_path = os.path.join(fixture_dir, 'ar_results_clarity_select_county.html')
        with open(fixture_path, 'r') as f:
            html = f.read()
        paths = self.datasource._scrape_county_paths(html)
        self.assertEqual(len(paths), 75)

    def test_scrape_county_redirect(self):
        fixture_path = os.path.join(fixture_dir, 'ar_results_clarity_county_redirect.html')
        with open(fixture_path, 'r') as f:
            html = f.read()
        path = self.datasource._scrape_county_redirect_path(html)
        self.assertEqual(path, '112821/summary.html')

    def test_clarity_county_url(self):
        pass
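# Illustrative sketch only: one way a helper like _clarity_election_base_url
# could derive the base URL exercised by test_clarity_election_base_url above.
# This is not necessarily the project's actual implementation; it simply
# satisfies the two fixture URLs in the test by keeping everything up to and
# including the last numeric path segment.
import re


def _clarity_election_base_url_sketch(url):
    """Return the Clarity base URL, e.g. .../AR/39376/83979/ from a report URL."""
    match = re.match(r'(.+/\d+/)', url)
    return match.group(1) if match else url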
import os
import urllib.parse
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup

# Import paths assumed to follow the openelex base fetcher and state
# datasource conventions.
from openelex.base.fetch import BaseFetcher
from openelex.us.ar.datasource import Datasource


class FetchResults(BaseFetcher):
    def __init__(self):
        super(FetchResults, self).__init__()
        self._datasource = Datasource()
        self._fetched = set()
        self._results_portal_url = self._datasource.RESULTS_PORTAL_URL

    def fetch(self, url, fname=None, overwrite=False):
        # We keep track of URLs we've already fetched in this run since
        # there will be multiple output files mapped to a single zip
        # file. If we've already fetched this URL, exit early.
        if url in self._fetched:
            return

        if url.startswith(self._results_portal_url):
            self._fetch_portal(url, fname, overwrite)
        elif url.endswith('.zip'):
            # Fetch the zip file, using the automatically generated filename
            zip_fname = self._local_zip_file_name(url)
            super(FetchResults, self).fetch(url, zip_fname, overwrite)
            self._extract_zip(url, zip_fname, overwrite)
        else:
            super(FetchResults, self).fetch(url, fname, overwrite)

        self._fetched.add(url)

    def _fetch_portal(self, url, fname, overwrite=False):
        """
        Fetch a results file from the reporting portal.
        """
        local_file_name = os.path.join(self.cache.abspath, fname)
        # The call to the parent class' fetch() method will duplicate the
        # check for the local file, but that's less expensive than building
        # the report URL, since that requires making an HTTP request to
        # fetch the form HTML and scraping it.
        if overwrite or not os.path.exists(local_file_name):
            report_url = self._get_report_url(url)
            # Now that we have the URL, delegate to the parent class' fetch
            # to grab the file.
            super(FetchResults, self).fetch(report_url, fname, overwrite)
        else:
            print("File is cached: %s" % local_file_name)

    def _get_report_url(self, url):
        """
        Build the download URL for a results file from the election portal.
        """
        query_params = self._get_report_query_params(url)
        qs = urllib.parse.urlencode(query_params)
        return 'http://www.sos.arkansas.gov/electionresults/index.php?' + qs

    def _get_report_query_params(self, url):
        """
        Build the query string parameters to retrieve a results file from
        the election portal.

        Return a list of key, value pairs.
        """
        params = []

        resp = requests.get(url)
        resp.raise_for_status()
        contests = self._scrape_contests(resp.text)

        # County IDs are consecutive integers from 1 through 75
        for county_id in range(1, 76):
            params.append(('counties[]', county_id))

        for contest_id, contest_name in contests:
            params.append(('contests[]', contest_id))

        # Show vote counts in report rather than percentages
        params.append(('votes', 'counts'))
        # Include unopposed contests in report
        params.append(('show_unopp', '1'))
        # Show results by polling location
        params.append(('group', 'poll'))
        # Download the file as delimited text rather than output in HTML
        params.append(('DOWNLOAD', '1'))
        params.append(('elecid', self._elec_id(url)))
        params.append(('ac:show:reports:extra:makereport:1', "Create Report"))

        return params

    def _scrape_contests(self, html):
        """
        Scrape the contests from the results portal form.

        Return a list of contest id, office name tuples.
        """
        soup = BeautifulSoup(html, 'html.parser')
        return [(o['value'], o.get_text())
                for o in soup.select("select#contests option")]

    def _elec_id(self, url):
        """
        Parse the reporting portal election ID from the URL.
        """
        parsed = urllib.parse.urlsplit(url)
        query_params = urllib.parse.parse_qs(parsed.query)
        return int(query_params['elecid'][0])

    def _local_zip_file_name(self, url):
        """
        Return a normalized local file name for a results zip file.

        We don't care too much about the format because we can delete the
        zip file later.
        """
        parsed = urllib.parse.urlsplit(url)
        fname = parsed.path.split('/')[-1]
        return os.path.join(self.cache.abspath, fname)

    def _extract_zip(self, url, zip_fname=None, overwrite=False, remove=True):
        if zip_fname is None:
            zip_fname = self._local_zip_file_name(url)

        with ZipFile(zip_fname, 'r') as zipf:
            for mapping in self._datasource.mappings_for_url(url):
                local_file_name = os.path.join(self.cache.abspath,
                    mapping['generated_filename'])
                if overwrite or not os.path.exists(local_file_name):
                    zipf.extract(mapping['raw_extracted_filename'],
                        self.cache.abspath)
                    extracted_file_name = os.path.join(self.cache.abspath,
                        mapping['raw_extracted_filename'])
                    os.rename(extracted_file_name, local_file_name)
                    print("Added to cache: %s" % local_file_name)
                else:
                    print("File is cached: %s" % local_file_name)

        if remove:
            os.remove(zip_fname)