class TestDatasource(TestCase):
    def setUp(self):
        # Create the Datasource instance that the test methods call into.
        self.datasource = Datasource()

    def test_clarity_election_base_url(self):
        # raw, base
        urls = [

        for url, expected in urls:
            base_url = self.datasource._clarity_election_base_url(url)
            self.assertEqual(base_url, expected)

    def test_scrape_county_paths(self):
        """Scraping the county-selection fixture page yields 75 paths."""
        fixture_path = os.path.join(
            fixture_dir, 'ar_results_clarity_select_county.html')
        # Read the whole fixture; the file only has to stay open for the read.
        with open(fixture_path, 'r') as f:
            html = f.read()
        paths = self.datasource._scrape_county_paths(html)
        self.assertEqual(len(paths), 75)

    def test_scrape_county_redirect(self):
        fixture_path = os.path.join(fixture_dir, 
        with open(fixture_path, 'r') as f:
            html =
            path = self.datasource._scrape_county_redirect_path(html)
            self.assertEqual(path, '112821/summary.html')

    def test_clariy_county_url(self):
# Example #2
class TestDatasource(TestCase):
    def setUp(self):
        # Create the Datasource instance that the test methods call into.
        self.datasource = Datasource()

    def test_clarity_election_base_url(self):
        # raw, base
        urls = [

        for url, expected in urls:
            base_url = self.datasource._clarity_election_base_url(url)
            self.assertEqual(base_url, expected)

    def test_scrape_county_paths(self):
        """Scraping the county-selection fixture page yields 75 paths."""
        # NOTE(review): fixture name taken from the other copy of this test
        # in the file -- confirm it is the intended fixture.
        fixture_path = os.path.join(
            fixture_dir, 'ar_results_clarity_select_county.html')
        # Read the whole fixture; the file only has to stay open for the read.
        with open(fixture_path, 'r') as f:
            html = f.read()
        paths = self.datasource._scrape_county_paths(html)
        self.assertEqual(len(paths), 75)

    def test_scrape_county_redirect(self):
        fixture_path = os.path.join(fixture_dir,
        with open(fixture_path, 'r') as f:
            html =
            path = self.datasource._scrape_county_redirect_path(html)
            self.assertEqual(path, '112821/summary.html')

    def test_clariy_county_url(self):
 def setUp(self):
     # Store a new Datasource on the test instance.
     self.datasource = Datasource()
# Example #4
 def __init__(self):
     """Run the parent initialiser, then set up this fetcher's own state."""
     super(FetchResults, self).__init__()
     datasource = Datasource()
     # URLs already handled by this instance.
     self._fetched = set()
     self._datasource = datasource
     self._results_portal_url = datasource.RESULTS_PORTAL_URL
# Example #5
class FetchResults(BaseFetcher):
    def __init__(self):
        """Run the parent initialiser, then set up this fetcher's own state."""
        super(FetchResults, self).__init__()
        datasource = Datasource()
        # URLs already handled by this instance; consulted by fetch().
        self._fetched = set()
        self._datasource = datasource
        self._results_portal_url = datasource.RESULTS_PORTAL_URL

    def fetch(self, url, fname=None, overwrite=False):
        """
        Fetch a results file, choosing the strategy from the URL.

        URLs that start with the results portal URL go through
        ``_fetch_portal()``.  URLs ending in ``.zip`` are downloaded under an
        automatically generated file name and handed to ``_extract_zip()``.
        Any other URL is passed straight to the parent class' ``fetch()``.

        ``fname`` and ``overwrite`` are forwarded to whichever of those
        strategies runs; the zip download itself ignores ``fname``.

        Returns None.
        """
        # We keep track of URLs we've already fetched in this run since
        # there will be multiple output files mapped to a single zip
        # file.  If we've already fetched this URL, exit early.
        if url in self._fetched:
            return

        if url.startswith(self._results_portal_url):
            self._fetch_portal(url, fname, overwrite)
        elif url.endswith('.zip'):
            # Fetch the zip file, using the automatically generated filename
            zip_fname = self._local_zip_file_name(url)
            super(FetchResults, self).fetch(url, zip_fname, overwrite)
            self._extract_zip(url, zip_fname, overwrite)
        else:
            super(FetchResults, self).fetch(url, fname, overwrite)

        # Remember the URL so a later call for the same URL takes the early
        # exit above instead of fetching again.
        self._fetched.add(url)


    def _fetch_portal(self, url, fname, overwrite=False):
        """
        Fetch a results file from the reporting portal.

        The file is downloaded when ``overwrite`` is true or when no file
        named ``fname`` exists in the cache directory; otherwise a
        "File is cached" message is printed and nothing is downloaded.
        """
        local_file_name = os.path.join(self.cache.abspath, fname)
        # The call to the parent class' fetch() method will duplicate the
        # check for the local file, but that's less expensive than building
        # the report URL, since that requires scraping an HTTP request to
        # fetch the form HTML and scraping it.
        if overwrite or not os.path.exists(local_file_name):
            report_url = self._get_report_url(url)
            # Now that we have the URL, delegate to the parent class' fetch
            # to grab the file.
            super(FetchResults, self).fetch(report_url, fname, overwrite)
        else:
            print("File is cached: %s" % local_file_name)

    def _get_report_url(self, url):
        """
        Build the download URL for a results file from the election portal.

        The parameters from ``_get_report_query_params()`` are URL-encoded
        and appended to a base string.
        """
        query_params = self._get_report_query_params(url)
        qs = urllib.parse.urlencode(query_params)
        # NOTE(review): the base is an empty string literal, so only the
        # encoded query string is returned.  The portal's report endpoint
        # (including the trailing "?") presumably belongs here -- confirm
        # and restore it.
        return '' + qs

    def _get_report_query_params(self, url):
        """
        Build the query string parameters to retrieve a results file from the
        election portal.

        Requests ``url`` and scrapes the contest ids out of the response
        body; every scraped contest is included in the parameters.

        Return a list of key, value pairs.
        """
        resp = requests.get(url)
        contests = self._scrape_contests(resp.text)
        # County ids are consecutive integer values
        params = [('counties[]', county_id) for county_id in range(1, 76)]
        # Only the contest id is submitted; the contest name is not needed.
        params.extend(('contests[]', contest_id) for contest_id, _ in contests)
        params.extend([
            # Show vote counts in report rather than percentages
            ('votes', 'counts'),
            # Include unopposed contests in report
            ('show_unopp', '1'),
            # Show results by polling location
            ('group', 'poll'),
            # Download the file as delimited text rather than output in HTML
            ('DOWNLOAD', '1'),
            ('elecid', self._elec_id(url)),
            ('ac:show:reports:extra:makereport:1', "Create Report"),
        ])
        return params

    def _scrape_contests(self, html):
        """
        Scrape the contests from the results portal form.

        Return a list of contest id, office name tuples.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Each <option> under the element with id "contests" supplies the id
        # in its value attribute and the name as its text.
        return [(o['value'], o.get_text())
                for o in soup.select("select#contests option")]

    def _elec_id(self, url):
        Parse reporting portal election ID from the url
        parsed = urllib.parse.urlsplit(url)
        query_params = urllib.parse.parse_qs(parsed.query)
        return int(query_params['elecid'][0])

    def _local_zip_file_name(self, url):
        Return a normalized local file name for a results zip file.

        We don't care too much about the format because we can delete the
        zip file later.
        parsed = urllib.parse.urlsplit(url)
        fname = parsed.path.split('/')[-1]
        return os.path.join(self.cache.abspath, fname)

    def _extract_zip(self, url, zip_fname=None, overwrite=False, remove=True):
        if zip_fname is None:
            zip_fname =  self._local_zip_file_name(url)

        with ZipFile(zip_fname, 'r') as zipf:
            for mapping in self._datasource.mappings_for_url(url):
                local_file_name = os.path.join(self.cache.abspath,
                if overwrite or not os.path.exists(local_file_name):
                    extracted_file_name = os.path.join(self.cache.abspath,
                    os.rename(extracted_file_name, local_file_name)
                    print("Added to cache: %s" % local_file_name)
                    print("File is cached: %s" % local_file_name)

        if remove:
# Example #6
 def __init__(self):
     """Run the parent initialiser, then set up this fetcher's own state."""
     super(FetchResults, self).__init__()
     datasource = Datasource()
     # URLs already handled by this instance.
     self._fetched = set()
     self._datasource = datasource
     self._results_portal_url = datasource.RESULTS_PORTAL_URL
# Example #7
class FetchResults(BaseFetcher):
    def __init__(self):
        """Run the parent initialiser, then set up this fetcher's own state."""
        super(FetchResults, self).__init__()
        datasource = Datasource()
        # URLs already handled by this instance; consulted by fetch().
        self._fetched = set()
        self._datasource = datasource
        self._results_portal_url = datasource.RESULTS_PORTAL_URL

    def fetch(self, url, fname=None, overwrite=False):
        """
        Fetch a results file, choosing the strategy from the URL.

        URLs that start with the results portal URL go through
        ``_fetch_portal()``.  URLs ending in ``.zip`` are downloaded under an
        automatically generated file name and handed to ``_extract_zip()``.
        Any other URL is passed straight to the parent class' ``fetch()``.

        ``fname`` and ``overwrite`` are forwarded to whichever of those
        strategies runs; the zip download itself ignores ``fname``.

        Returns None.
        """
        # We keep track of URLs we've already fetched in this run since
        # there will be multiple output files mapped to a single zip
        # file.  If we've already fetched this URL, exit early.
        if url in self._fetched:
            return

        if url.startswith(self._results_portal_url):
            self._fetch_portal(url, fname, overwrite)
        elif url.endswith('.zip'):
            # Fetch the zip file, using the automatically generated filename
            zip_fname = self._local_zip_file_name(url)
            super(FetchResults, self).fetch(url, zip_fname, overwrite)
            self._extract_zip(url, zip_fname, overwrite)
        else:
            super(FetchResults, self).fetch(url, fname, overwrite)

        # Remember the URL so a later call for the same URL takes the early
        # exit above instead of fetching again.
        self._fetched.add(url)


    def _fetch_portal(self, url, fname, overwrite=False):
        """
        Fetch a results file from the reporting portal.

        The file is downloaded when ``overwrite`` is true or when no file
        named ``fname`` exists in the cache directory; otherwise a
        "File is cached" message is printed and nothing is downloaded.
        """
        local_file_name = os.path.join(self.cache.abspath, fname)
        # The call to the parent class' fetch() method will duplicate the
        # check for the local file, but that's less expensive than building
        # the report URL, since that requires scraping an HTTP request to
        # fetch the form HTML and scraping it.
        if overwrite or not os.path.exists(local_file_name):
            report_url = self._get_report_url(url)
            # Now that we have the URL, delegate to the parent class' fetch
            # to grab the file.
            super(FetchResults, self).fetch(report_url, fname, overwrite)
        else:
            print("File is cached: %s" % local_file_name)

    def _get_report_url(self, url):
        """
        Build the download URL for a results file from the election portal.

        The parameters from ``_get_report_query_params()`` are URL-encoded
        and appended to a base string.
        """
        query_params = self._get_report_query_params(url)
        qs = urllib.parse.urlencode(query_params)
        # NOTE(review): the base is an empty string literal, so only the
        # encoded query string is returned.  The portal's report endpoint
        # (including the trailing "?") presumably belongs here -- confirm
        # and restore it.
        return '' + qs

    def _get_report_query_params(self, url):
        """
        Build the query string parameters to retrieve a results file from the
        election portal.

        Requests ``url`` and scrapes the contest ids out of the response
        body; every scraped contest is included in the parameters.

        Return a list of key, value pairs.
        """
        resp = requests.get(url)
        contests = self._scrape_contests(resp.text)
        # County ids are consecutive integer values
        params = [('counties[]', county_id) for county_id in range(1, 76)]
        # Only the contest id is submitted; the contest name is not needed.
        params.extend(('contests[]', contest_id) for contest_id, _ in contests)
        params.extend([
            # Show vote counts in report rather than percentages
            ('votes', 'counts'),
            # Include unopposed contests in report
            ('show_unopp', '1'),
            # Show results by polling location
            ('group', 'poll'),
            # Download the file as delimited text rather than output in HTML
            ('DOWNLOAD', '1'),
            ('elecid', self._elec_id(url)),
            ('ac:show:reports:extra:makereport:1', "Create Report"),
        ])
        return params

    def _scrape_contests(self, html):
        """
        Scrape the contests from the results portal form.

        Return a list of contest id, office name tuples.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Each <option> under the element with id "contests" supplies the id
        # in its value attribute and the name as its text.
        return [(o['value'], o.get_text())
                for o in soup.select("select#contests option")]

    def _elec_id(self, url):
        Parse reporting portal election ID from the url
        parsed = urllib.parse.urlsplit(url)
        query_params = urllib.parse.parse_qs(parsed.query)
        return int(query_params['elecid'][0])

    def _local_zip_file_name(self, url):
        Return a normalized local file name for a results zip file.

        We don't care too much about the format because we can delete the
        zip file later.
        parsed = urllib.parse.urlsplit(url)
        fname = parsed.path.split('/')[-1]
        return os.path.join(self.cache.abspath, fname)

    def _extract_zip(self, url, zip_fname=None, overwrite=False, remove=True):
        if zip_fname is None:
            zip_fname = self._local_zip_file_name(url)

        with ZipFile(zip_fname, 'r') as zipf:
            for mapping in self._datasource.mappings_for_url(url):
                local_file_name = os.path.join(self.cache.abspath,
                if overwrite or not os.path.exists(local_file_name):
                    extracted_file_name = os.path.join(
                        self.cache.abspath, mapping['raw_extracted_filename'])
                    os.rename(extracted_file_name, local_file_name)
                    print("Added to cache: %s" % local_file_name)
                    print("File is cached: %s" % local_file_name)

        if remove:
 def setUp(self):
     # Store a new Datasource on the test instance.
     self.datasource = Datasource()