def parsed_pdf(request):
    request.cls.parsed_pdf = tn_aggregate_ingest.parse(
        '', fixtures.as_filepath('_jailjanuary2019.pdf'))
    request.cls.parsed_female_pdf = tn_aggregate_ingest.parse(
        '', fixtures.as_filepath('_jailfemalejanuary2019.pdf'))
    request.cls.parsed_newer_pdf = tn_aggregate_ingest.parse(
        '', fixtures.as_filepath('_jailmarch2020.pdf'))
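The fixture above relies on `request.cls`, which is only populated when the fixture is requested from a test class; the `@pytest.fixture` registration itself is not shown in the snippet. A minimal, self-contained sketch of the pattern with a placeholder payload (the fixture and class names below are illustrative, not from the Recidiviz suite):

import pytest


@pytest.fixture(scope="class")
def parsed_payload(request):
    # Parse once per test class and attach the result to the class object.
    request.cls.parsed_payload = {"facility": "A", "count": 1}  # placeholder for an expensive parse


@pytest.mark.usefixtures("parsed_payload")
class TestParsedPayload:
    def test_payload_is_cached_on_class(self):
        # The fixture stored the result on the class, so it is reachable via self.
        assert self.parsed_payload["count"] == 1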
Example #2
 @classmethod
 def setUpClass(cls) -> None:
     # Cache the parsed pdf between tests since it's expensive to compute
     cls.parsed_pdf = fl_aggregate_ingest.parse(
         fixtures.as_filepath("jails-2018-01.pdf"))
     cls.parsed_pdf_2 = fl_aggregate_ingest.parse(
         fixtures.as_filepath(
             "florida__pub_jails_2019_2019_06 june fcdf.pdf"))
def parsed_pdf(request):
    request.cls.parsed_pdf = fl_aggregate_ingest.parse(
        "", fixtures.as_filepath("jails-2018-01.pdf")
    )
    request.cls.parsed_pdf_2 = fl_aggregate_ingest.parse(
        "", fixtures.as_filepath("florida__pub_jails_2019_2019_06 june fcdf.pdf")
    )
def parsed_pdf(request):
    request.cls.parsed_pdf = ky_aggregate_ingest.parse(
        "", fixtures.as_filepath("12-20-18.pdf"))
    request.cls.parsed_pdf_2 = ky_aggregate_ingest.parse(
        "", fixtures.as_filepath("08-23-18.pdf"))
    request.cls.parsed_pdf_3 = ky_aggregate_ingest.parse(
        "", fixtures.as_filepath("08-22-19.pdf"))
Example #5
 @classmethod
 def setUpClass(cls) -> None:
     # Cache the parsed pdf between tests since it's expensive to compute
     cls.parsed_pdf = ny_aggregate_ingest.parse(
         fixtures.as_filepath("jail_population.pdf")
     )
     cls.parsed_pdf_3_pages = ny_aggregate_ingest.parse(
         fixtures.as_filepath("jail_population_2019.pdf")
     )
Example #6
 @classmethod
 def setUpClass(cls) -> None:
     # Cache the parsed pdfs between tests since it's expensive to compute
     cls.parsed_pdf = tn_aggregate_ingest.parse(
         fixtures.as_filepath("_jailjanuary2019.pdf"))
     cls.parsed_female_pdf = tn_aggregate_ingest.parse(
         fixtures.as_filepath("_jailfemalejanuary2019.pdf"))
     cls.parsed_newer_pdf = tn_aggregate_ingest.parse(
         fixtures.as_filepath("_jailmarch2020.pdf"))
Example #7
 @classmethod
 def setUpClass(cls) -> None:
     # Cache the parsed pdf between tests since it's expensive to compute
     cls.parsed_pdf = ky_aggregate_ingest.parse(
         fixtures.as_filepath("12-20-18.pdf"))
     cls.parsed_pdf_2 = ky_aggregate_ingest.parse(
         fixtures.as_filepath("08-23-18.pdf"))
     cls.parsed_pdf_3 = ky_aggregate_ingest.parse(
         fixtures.as_filepath("08-22-19.pdf"))
 @classmethod
 def setUpClass(cls) -> None:
     # Cache the parsed pdf between tests since it's expensive to compute
     cls.parsed_pdf_before_1996 = tx_aggregate_ingest.parse(
         fixtures.as_filepath("abbreviated pop rpt march 1994.pdf"))
     cls.parsed_pdf_1996 = tx_aggregate_ingest.parse(
         fixtures.as_filepath(
             "texas_url_abbreviated pop rpt June 1996.pdf"))
     cls.parsed_pdf_after_1996 = tx_aggregate_ingest.parse(
         fixtures.as_filepath("Abbreviated Pop Rpt Dec 2017.pdf"))
     cls.parsed_pdf_concat = tx_aggregate_ingest.parse(
         fixtures.as_filepath(
             "docs_abbreviatedpopreports_abbreviated pop rpt oct 2003.pdf"))
Example #9
 def test_get_export_config_valid(self) -> None:
     product_configs = ProductConfigs.from_file(
         path=fixtures.as_filepath("fixture_products.yaml"))
     _export_config = product_configs.get_export_config(
         export_job_name="EXPORT",
         state_code="US_XX",
     )
Example #10
    def test_read_with_exception(self) -> None:
        class _TestException(ValueError):
            pass

        class _ExceptionDelegate(_TestGcsfsCsvReaderDelegate):
            def on_dataframe(self, encoding: str, chunk_num: int,
                             df: pd.DataFrame) -> bool:
                should_continue = super().on_dataframe(encoding, chunk_num, df)
                if chunk_num > 0:
                    raise _TestException("We crashed processing!")
                return should_continue

        file_path = fixtures.as_filepath("encoded_utf_8.csv")
        delegate = _ExceptionDelegate()

        with self.assertRaises(_TestException):
            self.reader.streaming_read(
                GcsfsFilePath.from_absolute_path(file_path),
                delegate=delegate,
                chunk_size=1,
            )

        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual("UTF-8", delegate.encodings_attempted[0])
        self.assertIsNone(delegate.successful_encoding)
        self.assertEqual(2, len(delegate.dataframes))
        self.assertEqual({"UTF-8"},
                         {encoding
                          for encoding, df in delegate.dataframes})
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(1, delegate.exceptions)
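The `_TestGcsfsCsvReaderDelegate` base class is not shown in these examples. Judging only from the attributes the assertions read (`encodings_attempted`, `successful_encoding`, `dataframes`, `decode_errors`, `exceptions`) and the `on_dataframe` hook overridden above, it is presumably a recording delegate along the following lines; this is an inferred sketch, not the actual helper from the repository.

from typing import List, Optional, Tuple

import pandas as pd


class RecordingCsvReaderDelegate:
    """Hypothetical stand-in that records what the streaming reader reports."""

    def __init__(self) -> None:
        self.encodings_attempted: List[str] = []        # every encoding the reader tried
        self.successful_encoding: Optional[str] = None  # set once a full read succeeds
        self.dataframes: List[Tuple[str, pd.DataFrame]] = []  # (encoding, chunk) pairs
        self.decode_errors = 0                          # count of decode failures
        self.exceptions = 0                             # count of other exceptions

    def on_dataframe(self, encoding: str, chunk_num: int, df: pd.DataFrame) -> bool:
        # Record each streamed chunk; returning True tells the reader to continue.
        self.dataframes.append((encoding, df))
        return True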
    def test_direct_ingest_preprocessed_view_detect_row_deletion_unknown_pk_table_specified(
        self,
    ) -> None:
        region_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
        LEFT OUTER JOIN {tagFullHistoricalExport}
        USING (col1);"""

        with self.assertRaises(ValueError) as e:
            DirectIngestPreProcessedIngestView(
                ingest_view_name="ingest_view_tag",
                view_query_template=view_query_template,
                region_raw_table_config=region_config,
                order_by_cols="col1, col2",
                is_detect_row_deletion_view=True,
                primary_key_tables_for_entity_deletion=[
                    "tagFullHistoricalExport",
                    "unknown",
                ],
            )

        self.assertTrue(
            str(e.exception).startswith(
                "Ingest view ingest_view_tag has specified unknown in "
                "`primary_key_tables_for_entity_deletion`, but that "
                "raw file tag was not found as a dependency."))
 def test_parse_empty_yaml_throws(self):
     with self.assertRaises(ValueError):
         _ = DirectIngestRegionRawFileConfig(
             region_code='us_xx',
             yaml_config_file_path=fixtures.as_filepath(
                 'empty_raw_data_files.yaml'),
         )
Example #13
    def test_happy_path(self) -> None:
        yaml_path = fixtures.as_filepath("schema_config.yaml")
        validation_schema_config = DatasetSchemaInfo.from_yaml(yaml_path)

        expected = DatasetSchemaInfo(
            dataset="fixture_schema",
            tables=[
                TableSchemaInfo(
                    table_name="incarceration_population_by_facility",
                    columns=[
                        "date_of_stay",
                        "facility",
                        "month",
                        "population_count",
                        "region_code",
                        "year",
                    ],
                ),
                TableSchemaInfo(
                    table_name="incarceration_population_person_level",
                    columns=[
                        "date_of_stay",
                        "facility",
                        "person_external_id",
                        "region_code",
                    ],
                ),
            ],
        )

        self.assertEqual(expected, validation_schema_config)
Example #14
    def _build(
        self,
        *,
        dataset_overrides: Optional[Dict[str, str]] = None
    ) -> DirectIngestPreProcessedIngestView:
        region_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        query = "select * from {file_tag_first} JOIN {tagFullHistoricalExport} USING (COL_1)"
        primary_key_tables_for_entity_deletion = (
            ["tagFullHistoricalExport"] if self.is_detect_row_deletion_view else []
        )
        return DirectIngestPreProcessedIngestView(
            ingest_view_name=self.tag,
            view_query_template=query,
            region_raw_table_config=region_config,
            order_by_cols="colA, colC",
            is_detect_row_deletion_view=self.is_detect_row_deletion_view,
            primary_key_tables_for_entity_deletion=primary_key_tables_for_entity_deletion,
            materialize_raw_data_table_views=self.materialize_raw_data_table_views,
        )
Example #15
    def test_read_completely_empty_file(self) -> None:
        empty_file_path = fixtures.as_filepath("tagA.csv")

        delegate = _TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=1,
        )
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = _TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=10,
        )
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def test_direct_ingest_preprocessed_view_same_table_multiple_places(self):
        region_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {file_tag_first}
USING (col1);"""

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config)

        self.assertEqual(
            ['file_tag_first'],
            [c.file_tag for c in view.raw_table_dependency_configs])

        expected_view_query = """WITH
file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN file_tag_first_generated_view
USING (col1);"""

        self.assertEqual(expected_view_query, view.view_query)
    def test_direct_ingest_preprocessed_view_detect_row_deletion_no_historical_table(
        self,
    ) -> None:
        region_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {file_tag_second}
USING (col1);"""

        with self.assertRaises(ValueError) as e:
            DirectIngestPreProcessedIngestView(
                ingest_view_name="ingest_view_tag",
                view_query_template=view_query_template,
                region_raw_table_config=region_config,
                order_by_cols="col1, col2",
                is_detect_row_deletion_view=True,
                primary_key_tables_for_entity_deletion=["file_tag_second"],
            )
        self.assertTrue(
            str(e.exception).startswith(
                "Ingest view ingest_view_tag is marked as `is_detect_row_deletion_view` and has table file_tag_second "
                "specified in `primary_key_tables_for_entity_deletion`; however the raw data file is not marked as always "
                "being exported as historically."))
Example #18
    def test_read_file_with_columns_no_contents(self):
        empty_file_path = fixtures.as_filepath('tagB.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def test_direct_ingest_preprocessed_view_detect_row_deletion_no_pk_tables_specified(
        self,
    ) -> None:
        region_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
        LEFT OUTER JOIN {tagFullHistoricalExport}
        USING (col1);"""

        with self.assertRaises(ValueError) as e:
            DirectIngestPreProcessedIngestView(
                ingest_view_name="ingest_view_tag",
                view_query_template=view_query_template,
                region_raw_table_config=region_config,
                order_by_cols="col1, col2",
                is_detect_row_deletion_view=True,
                primary_key_tables_for_entity_deletion=[],
            )

        self.assertTrue(
            str(e.exception).startswith(
                "Ingest view ingest_view_tag was marked as `is_detect_row_deletion_view`; however no "
                "`primary_key_tables_for_entity_deletion` were defined."))
    def test_direct_ingest_preprocessed_view_other_materialized_subquery_fails(
        self,
    ) -> None:
        region_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        view_query_template = """
CREATE TEMP TABLE my_subquery AS (SELECT * FROM {file_tag_first});
SELECT * FROM my_subquery;"""

        with self.assertRaises(ValueError) as e:
            _ = DirectIngestPreProcessedIngestView(
                ingest_view_name="ingest_view_tag",
                view_query_template=view_query_template,
                region_raw_table_config=region_config,
                order_by_cols="col1, col2",
                is_detect_row_deletion_view=False,
                primary_key_tables_for_entity_deletion=[],
            )

        self.assertEqual(
            str(e.exception),
            "Found CREATE TEMP TABLE clause in this query - ingest views cannot contain CREATE clauses.",
        )
def _parsed_result() -> Dict[DeclarativeMeta, pd.DataFrame]:
    global _PARSED_RESULT
    if not _PARSED_RESULT:
        _PARSED_RESULT = pa_aggregate_ingest.parse(
            fixtures.as_filepath(
                "2018 County Statistics _ General Information - 2017 Data.xlsx"
            )
        )
    return _PARSED_RESULT
Example #22
 def test_get_export_config_missing_state_code(self) -> None:
     product_configs = ProductConfigs.from_file(
         path=fixtures.as_filepath("fixture_products.yaml"))
     with self.assertRaisesRegex(
             BadProductExportSpecificationError,
             "Missing required state_code parameter for export_job_name EXPORT",
     ):
         product_configs.get_export_config(export_job_name="EXPORT")
Example #23
File: jid.py  Project: Recidiviz/pulse-data
def _get_JID() -> pd.DataFrame:
    global _JID

    if _JID is None:
        _JID = pd.read_csv(as_filepath("jid.csv", subdir="data_sets"),
                           dtype={"fips": str})

    return _JID
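The loader above assumes a module-level cache that the snippet does not show; the `global _JID` statement and the `is None` check imply a declaration roughly like the one below (the type annotation is an assumption).

from typing import Optional

import pandas as pd

# Module-level cache: None until _get_JID() runs once, then the loaded DataFrame.
# Subsequent calls reuse this object instead of re-reading the CSV from disk.
_JID: Optional[pd.DataFrame] = None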
Example #24
def _get_FIPS() -> pd.DataFrame:
    global _FIPS

    if _FIPS is None:
        _FIPS = pd.read_csv(as_filepath("fips.csv", subdir="data_sets"),
                            dtype={"fips": str})

    return _FIPS
Example #25
File: fid.py  Project: Recidiviz/pulse-data
def _get_FID() -> pd.DataFrame:
    global _FID

    if _FID is None:
        _FID = pd.read_csv(as_filepath("fid.csv", subdir="data_sets"),
                           dtype={"vera_jid": str})

    return _FID
 def test_parse_no_defaults_throws(self) -> None:
     with self.assertRaises(ValueError) as e:
         _ = DirectIngestRegionRawFileConfig(
             region_code="us_yy",
             yaml_config_file_dir=fixtures.as_filepath("us_yy"),
         )
     self.assertEqual(str(e.exception),
                      "Missing default raw data configs for region: us_yy")
def _parsed_result() -> Dict[DeclarativeMeta, pd.DataFrame]:
    global _PARSED_RESULT

    if not _PARSED_RESULT:
        _PARSED_RESULT = ca_aggregate_ingest.parse(
            fixtures.as_filepath("QueryResult.xls")
        )

    return _PARSED_RESULT
Example #28
    def test_direct_ingest_preprocessed_view_with_reference_table(self):
        region_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN `{{project_id}}.reference_tables.my_table`
USING (col1);"""

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols='col1, col2')

        self.assertEqual(
            ['file_tag_first'],
            [c.file_tag for c in view.raw_table_dependency_configs])

        expected_view_query = """WITH
file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN `recidiviz-456.reference_tables.my_table`
USING (col1) 
ORDER BY col1, col2;"""

        self.assertEqual(expected_view_query, view.view_query)

        expected_date_parametrized_view_query = """WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE 
            update_datetime <= @my_param
    )

    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN `recidiviz-456.reference_tables.my_table`
USING (col1) 
ORDER BY col1, col2;"""

        self.assertEqual(expected_date_parametrized_view_query,
                         view.date_parametrized_view_query('my_param'))
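The `@my_param` placeholder produced by `date_parametrized_view_query` is a BigQuery named query parameter. If the generated SQL were run directly, the parameter could be bound through the BigQuery client roughly as sketched below; the project ID, timestamp, and surrounding setup are placeholders, and the repository's own execution path may differ.

import datetime

from google.cloud import bigquery

# `view` stands for the DirectIngestPreProcessedIngestView built in the test above.
parametrized_query = view.date_parametrized_view_query("my_param")

client = bigquery.Client(project="recidiviz-456")  # placeholder project id
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        # Supplies a DATETIME value for the @my_param placeholder in the query text.
        bigquery.ScalarQueryParameter("my_param", "DATETIME", datetime.datetime(2020, 1, 1))
    ]
)
rows = client.query(parametrized_query, job_config=job_config).result()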
 def test_missing_configs_for_region(self) -> None:
     with self.assertRaises(ValueError) as e:
         region_config = DirectIngestRegionRawFileConfig(
             region_code="us_xy",
             yaml_config_file_dir=fixtures.as_filepath("us_xy"),
         )
         _configs = region_config.raw_file_configs
     self.assertEqual(str(e.exception),
                      "Missing raw data configs for region: us_xy")
Example #30
 def test_get_export_config_too_many_exports(self) -> None:
     product_configs = ProductConfigs.from_file(
         path=fixtures.as_filepath("fixture_products.yaml"))
     product_configs.products.append(product_configs.products[0])
     with self.assertRaisesRegex(
             BadProductExportSpecificationError,
             "Wrong number of products returned for export for export_job_name EXPORT",
     ):
         product_configs.get_export_config(export_job_name="EXPORT")