def testGetAllUrls(self, mockget: Mock) -> None:
        REPORTS_LANDING_HTML = fixtures.as_string("aggregate/regions/ga",
                                                  "reports_landing.html")
        REPORTS_YEAR_2015 = fixtures.as_string("aggregate/regions/ga",
                                               "reports_year_2015.html")
        REPORTS_YEAR_2019 = fixtures.as_string("aggregate/regions/ga",
                                               "reports_year_2019.html")

        def _MockGet(url: str) -> Mock:
            response = Mock()
            if "node/5617" in url:
                response.text = REPORTS_YEAR_2019
            elif "node/4036" in url:
                response.text = REPORTS_YEAR_2015
            else:
                response.text = REPORTS_LANDING_HTML
            return response

        mockget.side_effect = _MockGet
        url1 = "https://www.dca.ga.gov/sites/default/files/jail_report_jan19.pdf"
        url2 = "https://www.dca.ga.gov/sites/default/files/mar15_jail_report.pdf"
        expected_urls = {url1, url2}

        urls = ga_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
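
This example is a test method body in isolation; the mockget: Mock parameter implies a @patch.object(requests, "get") decorator, which the fuller GA excerpt in Example #13 below confirms. A minimal sketch of the surrounding class, reconstructed from that excerpt:

from unittest import TestCase

import requests
from mock import Mock, patch

from recidiviz.ingest.aggregate.regions.ga import ga_aggregate_site_scraper
from recidiviz.tests.ingest import fixtures


class TestGaAggregateSiteScraper(TestCase):
    """Test that ga_aggregate_site_scraper correctly scrapes urls."""

    @patch.object(requests, "get")
    def testGetAllUrls(self, mockget: Mock) -> None:
        # Body as shown above: mockget.side_effect serves a fixture page per
        # requested URL, and the scraper's result set is compared against the
        # expected PDF links.
        ...
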
Example #2
    def testGetAllUrls(self, mockget: Mock) -> None:
        mock_landing = Mock()
        mock_landing.text = fixtures.as_string("aggregate/regions/ma",
                                               "reports_landing.html")

        mock_year = Mock()
        mock_year.text = fixtures.as_string("aggregate/regions/ma",
                                            "year_page.html")

        mockget.side_effect = [mock_landing, mock_year]

        url1 = "https://www.mass.gov/doc/weekly-inmate-count-12252017/download"
        expected_urls = {url1}

        urls = ma_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
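
Note the two side_effect styles used across these scraper tests: the GA example above assigns a callable so the fixture served depends on the requested URL, while this example assigns a list so successive requests.get calls return the mocks in order. A small illustration of both forms (standard unittest.mock behavior, not code from the source):

from unittest.mock import Mock

mockget = Mock()

# Callable form: invoked with the call's arguments; its return value is used.
mockget.side_effect = lambda url: f"response for {url}"
assert mockget("https://example.com/a") == "response for https://example.com/a"

# Iterable form: each successive call returns the next item.
landing, year_page = Mock(name="landing"), Mock(name="year_page")
mockget.side_effect = [landing, year_page]
assert mockget("landing-url") is landing
assert mockget("year-url") is year_page
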
Example #3
    def extract(self, html_filename, yaml_filename):
        yaml_path = os.path.join(os.path.dirname(__file__),
                                 '../testdata/data_extractor/yaml',
                                 yaml_filename)
        extractor = HtmlDataExtractor(yaml_path)
        contents = html.fromstring(
            fixtures.as_string('testdata/data_extractor/html', html_filename))
        return extractor.extract_and_populate_data(contents)
Example #4
    def extract(self, html_filename: str, yaml_filename: str) -> IngestInfo:
        yaml_path = os.path.join(os.path.dirname(__file__),
                                 "../testdata/data_extractor/yaml",
                                 yaml_filename)
        extractor = HtmlDataExtractor(yaml_path)
        contents = html.fromstring(
            fixtures.as_string("testdata/data_extractor/html", html_filename))
        return extractor.extract_and_populate_data(contents)
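
A test built on this helper would typically compare the returned IngestInfo against an expected object assembled with its builder methods; a rough sketch, where the fixture names, field values, and the exact builder calls are assumptions rather than code from the source:

    def test_simple_page(self) -> None:
        expected = IngestInfo()
        person = expected.create_person(full_name="JANE DOE")   # hypothetical data
        person.create_booking(booking_id="123")                 # hypothetical data

        result = self.extract("simple_page.html", "simple_page.yaml")  # hypothetical fixtures
        self.assertEqual(expected, result)
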
Example #5
    def test_parse_file_headers_only_iterator_input(self):
        extractor = _instantiate_extractor('header_cols_only_csv.yaml')
        content = fixtures.as_string('testdata/data_extractor/csv',
                                     'header_cols_only.csv')
        ingest_info = \
            extractor.extract_and_populate_data(iter(content.splitlines()))

        self.assertIsNotNone(ingest_info)
        self.assertFalse(ingest_info)
Example #6
    def test_parse_file_empty(self):
        """Tests that we don't crash on a completely empty CSV and return an
        empty IngestInfoObject"""
        extractor = _instantiate_extractor('header_cols_only_csv.yaml')
        content = fixtures.as_string('testdata/data_extractor/csv',
                                     'empty.csv')
        ingest_info = extractor.extract_and_populate_data(content)

        self.assertIsNotNone(ingest_info)
        self.assertFalse(ingest_info)
Example #7
    def test_parse_file_headers_only(self) -> None:
        """Tests that we don't crash on a CSV with only a header row and return
        an empty IngestInfoObject.
        """
        extractor = _instantiate_extractor("header_cols_only_csv.yaml")
        content = fixtures.as_string("testdata/data_extractor/csv",
                                     "header_cols_only.csv")
        ingest_info = extractor.extract_and_populate_data(content)

        self.assertIsNotNone(ingest_info)
        self.assertFalse(ingest_info)
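
Examples #5 through #7 all end with the same pair of assertions: the extractor must return an IngestInfo instance rather than None, but one that holds no parsed data, so it evaluates as falsy. Restated as a small helper (the same calls as above, just factored out):

def assert_returns_empty_ingest_info(extractor, content) -> None:
    ingest_info = extractor.extract_and_populate_data(content)
    # An object always comes back...
    assert ingest_info is not None
    # ...but an empty or header-only CSV yields no people/bookings, so the
    # IngestInfo is falsy.
    assert not ingest_info
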
Example #8
    def testGetAllUrls(self, mockget: Mock) -> None:
        mockresponse = Mock()
        mockget.return_value = mockresponse
        mockresponse.text = fixtures.as_string("aggregate/regions/hi",
                                               "report.html")
        url1 = ("https://dps.hawaii.gov/wp-content/uploads/2019/01/"
                "Pop-Reports-EOM-2018-12-31.pdf")
        expected_urls = {url1}

        urls = hi_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
Example #9
    def testGetAllUrls(self, mockget: Mock) -> None:
        mockresponse = Mock()
        mockget.return_value = mockresponse
        mockresponse.text = fixtures.as_string("aggregate/regions/fl",
                                               "reports.html")
        url1 = "http://www.dc.state.fl.us/pub/jails/2019/2019_06 June FCDF.pdf"
        url2 = "http://www.dc.state.fl.us/pub/jails/2016/jails-2016-03.pdf"
        expected_urls = {url1, url2}

        urls = fl_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
Example #10
    def testGetAllUrls(self, mockget: Mock) -> None:
        mockresponse = Mock()
        mockget.return_value = mockresponse
        mockresponse.text = fixtures.as_string("aggregate/regions/ky", "report.html")
        url1 = (
            "https://corrections.ky.gov/About/researchandstats/Documents/"
            "Weekly Jail/2018/08-18-18.pdf"
        )
        expected_urls = {url1}

        urls = ky_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
Example #11
    def testGetAllUrls(self, mockget: Mock) -> None:
        mockresponse = Mock()
        mockget.return_value = mockresponse
        mockresponse.text = fixtures.as_string("aggregate/regions/tn",
                                               "reports.html")
        url1 = ("https://www.tn.gov/content/dam/tn/correction/documents/"
                "JailAugust2018.pdf")
        url2 = ("https://www.tn.gov/content/dam/tn/correction/documents/"
                "JailFemaleOctober2017.pdf")
        expected_urls = {url1, url2}

        urls = tn_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
Example #12
    def testGetAllUrls(self, mock_date: Mock, mockget: Mock) -> None:
        mockresponse = Mock()
        mockget.return_value = mockresponse
        mockresponse.text = fixtures.as_string("aggregate/regions/tx",
                                               "reports.html")
        mock_date.today.return_value = _TODAY
        url1 = ("https://www.tcjs.state.tx.us/docs/AbbreviatedPopReports/"
                "Abbreviated Pop Rpt June 2020.pdf")
        url2 = ("https://www.tcjs.state.tx.us/docs/AbbreviatedPopReports/"
                "Abbreviated Pop Rpt Jan 2021.pdf")
        url3 = "https://www.tcjs.state.tx.us/wp-content/uploads/2021/04/AbbreRptCurrent.pdf"
        expected_urls = {url1, url2, url3}

        urls = tx_aggregate_site_scraper.get_urls_to_download()
        self.assertEqual(expected_urls, urls)
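
_TODAY and the extra mock_date parameter come from context the excerpt omits: the test evidently patches a date lookup in addition to requests.get, with _TODAY defined at module level. A sketch of what that setup could look like; the patch target and the date value are assumptions, not from the source:

import datetime
from unittest import TestCase

import requests
from mock import Mock, patch

_TODAY = datetime.date(2021, 4, 15)  # hypothetical value


class TestTxAggregateSiteScraper(TestCase):
    @patch.object(requests, "get")
    @patch("recidiviz.ingest.aggregate.regions.tx.tx_aggregate_site_scraper.date")  # assumed target
    def testGetAllUrls(self, mock_date: Mock, mockget: Mock) -> None:
        # Patch decorators apply bottom-up, so mock_date (the innermost patch)
        # is the first injected argument, matching the signature in Example #12.
        ...
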
Example #13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# =============================================================================
"""Tests for ga_aggregate_ingest.py."""
from unittest import TestCase
from mock import patch, Mock
import requests

from recidiviz.ingest.aggregate.regions.ga import ga_aggregate_site_scraper
from recidiviz.tests.ingest import fixtures

REPORTS_LANDING_HTML = fixtures.as_string('aggregate/regions/ga',
                                          'reports_landing.html')
REPORTS_YEAR_2015 = fixtures.as_string('aggregate/regions/ga',
                                       'reports_year_2015.html')
REPORTS_YEAR_2019 = fixtures.as_string('aggregate/regions/ga',
                                       'reports_year_2019.html')


class TestGaAggregateSiteScraper(TestCase):
    """Test that ga_aggregate_site_scraper correctly scrapes urls."""
    @patch.object(requests, 'get')
    def testGetAllUrls(self, mockget):
        def _MockGet(url):
            response = Mock()
            if 'node/5617' in url:
                response.text = REPORTS_YEAR_2019
            elif 'node/4036' in url:
Example #14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# =============================================================================
"""Tests for tn_aggregate_ingest.py."""
from unittest import TestCase
from mock import patch, Mock
import requests

from recidiviz.ingest.aggregate.regions.tn import tn_aggregate_site_scraper
from recidiviz.tests.ingest import fixtures

REPORTS_HTML = fixtures.as_string("aggregate/regions/tn", "reports.html")


class TestTnAggregateSiteScraper(TestCase):
    """Test that tn_aggregate_site_scraper correctly scrapes urls."""

    @patch.object(requests, "get")
    def testGetAllUrls(self, mockget):
        mockresponse = Mock()
        mockget.return_value = mockresponse
        mockresponse.text = REPORTS_HTML
        url1 = (
            "https://www.tn.gov/content/dam/tn/correction/documents/"
            "JailAugust2018.pdf"
        )
        url2 = (
Example #15
import datetime
from unittest import TestCase

from mock import patch, Mock

from recidiviz.common.ingest_metadata import IngestMetadata
from recidiviz.ingest.direct.regions.us_tx_brazos.us_tx_brazos_controller \
    import UsTxBrazosController
from recidiviz.ingest.models.ingest_info import Arrest, Bond, Booking, Charge, \
    Hold, Person, IngestInfo
from recidiviz.tests.ingest import fixtures
from recidiviz.tests.utils.individual_ingest_test import IndividualIngestTest
from recidiviz.tests.ingest.direct.direct_ingest_util import \
    build_controller_for_tests, ingest_args_for_fixture_file
from recidiviz.utils import regions


FIXTURE_PATH_PREFIX = 'direct/regions/us_tx_brazos'
_ROSTER_PATH_CONTENTS = fixtures.as_string(FIXTURE_PATH_PREFIX, 'daily_data.csv')
_FAKE_START_TIME = datetime.datetime(year=2019, month=1, day=2)


@patch('recidiviz.utils.metadata.project_id',
       Mock(return_value='recidiviz-staging'))
class UsTxBrazosControllerTest(IndividualIngestTest, TestCase):
    """Test Brazos direct ingest.
    """

    def testParse(self):
        controller = build_controller_for_tests(UsTxBrazosController,
                                                FIXTURE_PATH_PREFIX,
                                                run_async=False)

        args = ingest_args_for_fixture_file(controller, 'daily_data.csv')
Example #16
import datetime
import tempfile
from flask import Flask
from mock import patch, Mock, call
import requests
import gcsfs
import pytz

from recidiviz.cloud_functions.cloud_function_utils import GCSFS_NO_CACHING
from recidiviz.ingest.aggregate import scrape_aggregate_reports
from recidiviz.ingest.aggregate.regions.ca import ca_aggregate_site_scraper
from recidiviz.ingest.aggregate.regions.ny import ny_aggregate_site_scraper
from recidiviz.ingest.aggregate.regions.tx import tx_aggregate_site_scraper
from recidiviz.tests.ingest import fixtures
from recidiviz.utils import metadata

REPORTS_HTML = fixtures.as_string('aggregate/regions/tx', 'reports.html')

APP_ID = "recidiviz-scraper-aggregate-report-test"

app = Flask(__name__)
app.register_blueprint(
    scrape_aggregate_reports.scrape_aggregate_reports_blueprint)
app.config['TESTING'] = True

SERVER_MODIFIED_TIME = datetime.datetime(
    year=2019, month=1, day=1, tzinfo=pytz.UTC)
EXISTING_TEST_URL = 'http://test.com/url_test/Existing.pdf'
EXISTING_TEST_URL2 = 'http://test.com/url_test/Existing2.pdf'
EXISTING_TEST_URL_CA = 'http://test.com'
CA_POST_DATA = {'year': 1996, 'testing': '1'}
NONEXISTING_TEST_URL = 'url_test/nonexisting.pdf'
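
The module-level setup above registers the blueprint on a Flask app with TESTING enabled, so the tests presumably drive it through Flask's test client. A minimal sketch; the route, query parameter, and header are hypothetical, not taken from the source:

def _call_scrape_endpoint():
    client = app.test_client()
    return client.get(
        "/scrape_aggregate_reports?state=tx",          # hypothetical route
        headers={"X-Appengine-Cron": "test-cron"},     # hypothetical header
    )
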
Example #17
def _get_content_as_csv(content_filename: str) -> csv.DictReader:
    content = fixtures.as_string('testdata/data_extractor/csv',
                                 content_filename)
    return csv.DictReader(content.splitlines())
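
For completeness, a quick illustration of consuming the reader this helper returns; the fixture name and column name are hypothetical:

reader = _get_content_as_csv("some_report.csv")      # hypothetical fixture
for row in reader:
    # Each row is a mapping keyed by the CSV's header columns.
    print(row["Facility"])                            # hypothetical column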