def test_for_schema_type_returns_instance(self) -> None:
     for schema_type in self.schema_types:
         if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
             with self.assertRaises(ValueError):
                 _ = CloudSqlToBQConfig.for_schema_type(schema_type)
         else:
             config = CloudSqlToBQConfig.for_schema_type(schema_type)
             self.assertIsInstance(config, CloudSqlToBQConfig)
def create_all_bq_refresh_tasks_for_schema(schema_arg: str) -> None:
    """Creates an export task for each table to be exported.

    A task is created for each table defined in the schema.

    Re-creates all tasks if any task fails to be created.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return

    logging.info("Beginning BQ export for %s schema tables.",
                 schema_type.value)

    task_manager = BQRefreshCloudTaskManager()

    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type)
    if cloud_sql_to_bq_config is None:
        logging.info("Cloud SQL to BQ is disabled for: %s", schema_type)
        return

    for table in cloud_sql_to_bq_config.get_tables_to_export():
        task_manager.create_refresh_bq_table_task(table.name, schema_type)

    if schema_type is SchemaType.STATE:
        pub_sub_topic = "v1.calculator.trigger_daily_pipelines"
        pub_sub_message = "State export to BQ complete"
    else:
        pub_sub_topic = ""
        pub_sub_message = ""

    task_manager.create_bq_refresh_monitor_task(schema_type.value,
                                                pub_sub_topic, pub_sub_message)
Example #3
    def test_get_tables_to_export(self) -> None:
        """Assertions for the method get_tables_to_export
        1. Assert that it returns a list of type sqlalchemy.Table
        2. For the StateBase schema, assert that included history tables are included
        3. For the StateBase schema, assert that other history tables are excluded
        """
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            assert config is not None
            tables_to_export = config.get_tables_to_export()

            self.assertIsInstance(tables_to_export, List)

            for table in tables_to_export:
                self.assertIsInstance(table, sqlalchemy.Table)

            if schema_type == SchemaType.STATE:
                history_tables_to_include = config.history_tables_to_include
                for history_table in history_tables_to_include:
                    table_names = list(map(lambda t: t.name, tables_to_export))
                    self.assertIn(history_table, table_names)

                for table in config.sorted_tables:
                    if ("history" in table.name and table.name
                            not in config.history_tables_to_include):
                        self.assertNotIn(table, tables_to_export)
 def test_incorrect_direct_ingest_instance_raises(self) -> None:
     for schema_type in self.enabled_schema_types:
         if schema_type != SchemaType.STATE:
             with self.assertRaises(ValueError):
                 _ = CloudSqlToBQConfig.for_schema_type(
                     schema_type, DirectIngestInstance.PRIMARY
                 )
Example #5
 def test_for_schema_type_returns_instance(self) -> None:
     for schema_type in self.schema_types:
         config = CloudSqlToBQConfig.for_schema_type(schema_type)
         if schema_type in self.disabled_schema_types:
             self.assertIsNone(config)
         else:
             self.assertIsInstance(config, CloudSqlToBQConfig)
Example #6
    def test_yaml_config_reads_correctly_JAILS(
        self,
        schema: SchemaType,
        regions_to_exclude: List[str],
        columns_to_exclude: Dict[str, List[str]],
        history_tables_to_include: List[str],
    ) -> None:
        config = CloudSqlToBQConfig.for_schema_type(schema)
        assert config is not None

        self.assertListsDistinctAndEqual(
            regions_to_exclude,
            config.region_codes_to_exclude,
            msg_prefix="Region codes",
        )
        self.assertListsDistinctAndEqual(
            history_tables_to_include,
            config.history_tables_to_include,
            msg_prefix="History tables",
        )

        self.assertListsDistinctAndEqual(
            list(columns_to_exclude.keys()),
            list(config.columns_to_exclude.keys()),
            msg_prefix="Excluded columns keys",
        )
        for k in columns_to_exclude.keys():
            self.assertListsDistinctAndEqual(
                columns_to_exclude[k],
                config.columns_to_exclude[k],
                msg_prefix=f"Excluded columsn for {k}",
            )
def main(
    sandbox_dataset_prefix: str,
    schema_type: SchemaType,
    direct_ingest_instance: Optional[DirectIngestInstance],
) -> None:
    """Defines the main function responsible for moving data from Postgres to BQ."""
    logging.info("Prefixing all output datasets with [%s_].",
                 known_args.sandbox_dataset_prefix)
    fake_gcs = FakeGCSFileSystem()

    # We mock the export config to a version that does not have any paused regions.
    with mock.patch(
            f"{cloud_sql_to_bq_refresh_config.__name__}.GcsfsFactory.build",
            return_value=fake_gcs,
    ):
        fake_gcs.upload_from_string(
            path=CloudSqlToBQConfig.default_config_path(),
            contents=STANDARD_YAML_CONTENTS,
            content_type="text/yaml",
        )
        federated_bq_schema_refresh(
            schema_type=schema_type,
            direct_ingest_instance=direct_ingest_instance,
            dataset_override_prefix=sandbox_dataset_prefix,
        )
        config = CloudSqlToBQConfig.for_schema_type(schema_type)
        final_destination_dataset = config.unioned_multi_region_dataset(
            dataset_override_prefix=sandbox_dataset_prefix)

    logging.info("Load complete. Data loaded to dataset [%s].",
                 final_destination_dataset)
 def test_build_unioned_segments_view_with_dataset_overrides(self) -> None:
     config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
     view_builder = UnionedStateSegmentsViewBuilder(
         config=config,
         table=StateBase.metadata.tables["state_person_external_id"],
         state_codes=[StateCode.US_XX, StateCode.US_WW],
     )
     view = view_builder.build(
         dataset_overrides={
             "state_regional": "my_prefix_state_regional",
             "us_xx_state_regional": "my_prefix_us_xx_state_regional",
             "us_ww_state_regional": "my_prefix_us_ww_state_regional",
         }
     )
     expected_query = (
         "SELECT state_person_external_id.external_id,state_person_external_id.state_code,"
         "state_person_external_id.id_type,state_person_external_id.person_external_id_id,"
         "state_person_external_id.person_id FROM "
         "`recidiviz-staging.my_prefix_us_xx_state_regional.state_person_external_id` state_person_external_id\n"
         "UNION ALL\n"
         "SELECT state_person_external_id.external_id,state_person_external_id.state_code,"
         "state_person_external_id.id_type,state_person_external_id.person_external_id_id,"
         "state_person_external_id.person_id FROM "
         "`recidiviz-staging.my_prefix_us_ww_state_regional.state_person_external_id` state_person_external_id"
     )
     self.assertEqual(expected_query, view.view_query)
     self.assertEqual(
         BigQueryAddress(
             dataset_id="my_prefix_state_regional",
             table_id="state_person_external_id",
         ),
         view.materialized_address,
     )
Example #9
def refresh_bq_table() -> Tuple[str, int]:
    """Worker function to handle BQ export task requests.

    Form data must be a bytes-encoded JSON object with the parameters listed
    below.

    JSON Parameters:
        table_name: Table to export then import. Table must be defined
            in one of the base schema types.
        schema_type: The schema type that the table belongs to.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    table_name = data['table_name']
    schema_type_str = data['schema_type']

    try:
        schema_type = SchemaType(schema_type_str)
    except ValueError:
        return (f'Unknown schema type [{schema_type_str}]',
                HTTPStatus.BAD_REQUEST)

    bq_client = BigQueryClientImpl()
    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type)

    logging.info("Starting BQ export task for table: %s", table_name)

    export_table_then_load_table(bq_client, table_name, cloud_sql_to_bq_config)
    return ('', HTTPStatus.OK)
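
For reference, a sketch of the request body this handler parses; the table and schema names below are hypothetical, and an unknown schema_type yields HTTPStatus.BAD_REQUEST:

import json

payload = json.dumps({"table_name": "person", "schema_type": "JAILS"}).encode()
# POSTing `payload` to the endpoint would trigger export_table_then_load_table
# for the `person` table in the JAILS schema.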
Example #10
    def update_data_freshness_results(self) -> None:
        """Refreshes information in the metadata store about freshness of ingested data
        for all states."""
        bq_export_config = CloudSqlToBQConfig.for_schema_type(
            SchemaType.STATE,
            yaml_path=GcsfsFilePath.from_absolute_path(
                f"gs://{self.project_id}-configs/cloud_sql_to_bq_config.yaml"
            ),
        )
        if bq_export_config is None:
            raise ValueError("STATE CloudSqlToBQConfig unexpectedly None.")

        regions_paused = bq_export_config.region_codes_to_exclude

        latest_upper_bounds_path = GcsfsFilePath.from_absolute_path(
            f"gs://{self.project_id}-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json"
        )
        latest_upper_bounds_json = self.gcs_fs.download_as_string(
            latest_upper_bounds_path
        )
        latest_upper_bounds = []

        for line in latest_upper_bounds_json.splitlines():
            line = line.strip()
            if not line:
                continue
            struct = json.loads(line)
            latest_upper_bounds.append(
                {
                    "state": struct["state_code"],
                    "date": struct.get("processed_date"),
                    "ingestPaused": struct["state_code"] in regions_paused,
                }
            )
        self.data_freshness_results = latest_upper_bounds
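
To make the parsing loop concrete, here is a hypothetical two-line sample of the newline-delimited JSON file it reads; "state_code" is required, while "processed_date" may be absent and falls back to None via .get():

{"state_code": "US_XX", "processed_date": "2021-01-01"}
{"state_code": "US_WW"}

Given that input (and no paused regions), data_freshness_results would contain {"state": "US_XX", "date": "2021-01-01", "ingestPaused": False} and {"state": "US_WW", "date": None, "ingestPaused": False}.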
 def test_unioned_segments_view_unsegmented_config_crashes(self) -> None:
     config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
     with self.assertRaises(ValueError) as e:
         _ = UnionedStateSegmentsViewBuilder(
             config=config,
             table=JailsBase.metadata.sorted_tables[0],
             state_codes=[StateCode.US_XX],
         )
     self.assertEqual(str(e.exception), "Unexpected schema type [JAILS]")
Example #12
    def test_get_bq_schema_for_table_region_code_in_schema(self) -> None:
        """Assert that the region code is included in the schema for association tables in the State schema."""
        association_table_name = (
            "state_supervision_period_program_assignment_association")
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
        assert config is not None
        region_code_col = "state_code"
        schema = config.get_bq_schema_for_table(association_table_name)

        self.assertIn(region_code_col,
                      [schema_field.name for schema_field in schema])
Example #13
 def test_get_gcs_export_uri_for_table(self) -> None:
     """Test that get_gcs_export_uri_for_table generates a GCS URI
     with the correct project ID and table name.
     """
     config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
     assert config is not None
     fake_table = "my_fake_table"
     bucket = "{}-dbexport".format(self.mock_project_id)
     gcs_export_uri = "gs://{bucket}/{table_name}.csv".format(
         bucket=bucket, table_name=fake_table)
     self.assertEqual(gcs_export_uri,
                      config.get_gcs_export_uri_for_table(fake_table))
 def test_excluded_columns(self) -> None:
     for schema_type in self.enabled_schema_types:
         config = CloudSqlToBQConfig.for_schema_type(schema_type)
         for table in config.sorted_tables:
             # pylint: disable=protected-access
             columns = config._get_table_columns_to_export(table)
             for column in columns:
                 self.assertIsInstance(column, str)
                 self.assertTrue(
                     column not in config.columns_to_exclude.get(table.name, []),
                     msg=f"Column {column} should not be included. It is found in "
                     f"COUNTY_COLUMNS_TO_EXCLUDE` for this table {table.name}.",
                 )
    def test_unioned_multi_region_dataset(self) -> None:
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            dataset = config.unioned_multi_region_dataset(dataset_override_prefix=None)
            self.assertFalse(dataset.endswith("regional"))
            self.assertTrue(dataset in VIEW_SOURCE_TABLE_DATASETS)

            dataset_with_prefix = config.unioned_multi_region_dataset(
                dataset_override_prefix="prefix"
            )
            self.assertTrue(dataset_with_prefix.startswith("prefix_"))
            self.assertFalse(dataset_with_prefix.endswith("regional"))
            self.assertTrue(dataset_with_prefix not in VIEW_SOURCE_TABLE_DATASETS)
Example #16
 def test_get_stale_bq_rows_for_excluded_regions_query_builder_jails_schema(
     self) -> None:
     """Given a JAILS schema, a table name and None for region_codes_to_exclude, assert that it returns a
     query builder that returns no rows"""
     filter_clause = "WHERE FALSE"
     config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
     assert config is not None
     for table in config.get_tables_to_export():
         query_builder = config.get_stale_bq_rows_for_excluded_regions_query_builder(
             table.name)
         self.assertIsInstance(
             query_builder, BigQuerySchemaTableRegionFilteredQueryBuilder)
         self.assertEqual(filter_clause, query_builder.filter_clause())
Example #17
    def test_unsegmented_collector_jails(self) -> None:
        self.fake_fs.upload_from_string(
            path=self.fake_config_path,
            contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
            content_type="text/yaml",
        )
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
        collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config)
        builders = collector.collect_view_builders()
        self.assertEqual(
            len(JailsBase.metadata.sorted_tables),
            len(builders),
        )
        view_addresses = set()
        materialized_addresses = set()
        for builder in builders:
            view = builder.build()
            view_addresses.add(view.address)
            if not view.materialized_address:
                raise ValueError(
                    f"Materialized address None for view [{view.address}]")
            materialized_addresses.add(view.materialized_address)

            if view.view_id == "person":
                # Check that we explicitly select columns
                self.assertTrue("person.birthdate" in view.view_query)
                # ... but not excluded ones
                self.assertTrue("full_name" not in view.view_query)
                self.assertTrue(
                    "birthdate_inferred_from_age" not in view.view_query)

        self.assertEqual({"jails_cloudsql_connection"},
                         {a.dataset_id
                          for a in view_addresses})
        self.assertEqual({"census_regional"},
                         {a.dataset_id
                          for a in materialized_addresses})
        self.assertEqual(
            {t.name
             for t in JailsBase.metadata.sorted_tables},
            {a.table_id
             for a in materialized_addresses},
        )

        # No addresses should clobber each other
        self.assertEqual(len(view_addresses), len(builders))
        self.assertEqual(len(materialized_addresses), len(builders))
        self.assertEqual(set(),
                         view_addresses.intersection(materialized_addresses))
Example #18
def create_all_jails_bq_refresh_tasks() -> Tuple[str, int]:
    """Creates an export task for each table to be exported.

    A task is created for each table defined in the JailsBase schema.

    Re-creates all tasks if any task fails to be created.
    """
    logging.info("Beginning BQ export for jails schema tables.")

    task_manager = BQRefreshCloudTaskManager()

    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(
        SchemaType.JAILS)
    for table in cloud_sql_to_bq_config.get_tables_to_export():
        task_manager.create_refresh_bq_table_task(table.name, SchemaType.JAILS)
    return ('', HTTPStatus.OK)
Example #19
    def test_dataset_id(self) -> None:
        """Make sure dataset_id is defined correctly.

        Checks that it is a string, checks that it has characters,
        and checks that those characters are letters, numbers, or _.
        """
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            assert config is not None
            allowed_characters = set(string.ascii_letters + string.digits +
                                     "_")

            self.assertIsInstance(config.dataset_id, str)

            for char in config.dataset_id:
                self.assertIn(char, allowed_characters)
Example #20
    def test_collect_do_not_crash(self) -> None:
        self.fake_fs.upload_from_string(
            path=self.fake_config_path,
            contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
            content_type="text/yaml",
        )
        for schema_type in SchemaType:
            if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
                continue
            config = CloudSqlToBQConfig.for_schema_type(schema_type)

            if config.is_state_segmented_refresh_schema():
                _ = StateSegmentedSchemaFederatedBigQueryViewCollector(
                    config).collect_view_builders()
            else:
                _ = UnsegmentedSchemaFederatedBigQueryViewCollector(
                    config).collect_view_builders()
Example #21
    def test_get_stale_bq_rows_for_excluded_regions_query_builder(
            self) -> None:
        """Given a table name, assert that it returns a query builder that filters for rows
        excluded from the export query"""
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
        assert config is not None
        config.region_codes_to_exclude = ["US_VA", "us_id", "US_hi"]
        for table in config.get_tables_to_export():
            if is_association_table(table.name):
                # This is tested in the CloudSqlSchemaTableRegionFilteredQueryBuilder class
                continue
            filter_clause = "WHERE state_code IN ('US_VA','US_ID','US_HI')"
            query_builder = config.get_stale_bq_rows_for_excluded_regions_query_builder(
                table.name)

            self.assertIsInstance(
                query_builder, BigQuerySchemaTableRegionFilteredQueryBuilder)
            self.assertIn(filter_clause, query_builder.full_query())
Example #22
    def test_state_segmented_collector_paused_regions(self) -> None:
        self.fake_fs.upload_from_string(
            path=self.fake_config_path,
            contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
            content_type="text/yaml",
        )
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)
        collector = StateSegmentedSchemaFederatedBigQueryViewCollector(config)
        builders = collector.collect_view_builders()
        direct_ingest_states = get_existing_direct_ingest_states()
        num_schema_tables = len(OperationsBase.metadata.sorted_tables)
        num_paused_regions = 1
        self.assertEqual(
            len(direct_ingest_states) * num_schema_tables -
            num_paused_regions * num_schema_tables,
            len(builders),
        )
        view_addresses = set()
        materialized_addresses = set()
        for builder in builders:
            view = builder.build()
            view_addresses.add(view.address)
            if not view.materialized_address:
                raise ValueError(
                    f"Materialized address None for view [{view.address}]")
            materialized_addresses.add(view.materialized_address)

        self.assertEqual({"operations_cloudsql_connection"},
                         {a.dataset_id
                          for a in view_addresses})
        self.assertNotIn("us_nd_operations_regional",
                         {a.dataset_id
                          for a in materialized_addresses})
        self.assertEqual(
            {t.name
             for t in OperationsBase.metadata.sorted_tables},
            {a.table_id
             for a in materialized_addresses},
        )
        # No addresses should clobber each other
        self.assertEqual(len(view_addresses), len(builders))
        self.assertEqual(len(materialized_addresses), len(builders))
        self.assertEqual(set(),
                         view_addresses.intersection(materialized_addresses))
    def test_column_to_exclude(self) -> None:
        """Make sure columns_to_exclude are defined correctly in case of typos.

        1) Check that all tables are defined in tables to export.

        2) Check that all columns are defined in their respective tables.
        """
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
        assert config is not None
        tables = config.get_tables_to_export()
        table_names = list(map(lambda t: t.name, tables))

        for table, _ in config.columns_to_exclude.items():
            self.assertTrue(
                table in table_names,
                msg='Table "{}" in `cloud_sql_to_bq_export_config.COUNTY_COLUMNS_TO_EXCLUDE`'
                " not found in in the JailsBase schema."
                " Did you spell it correctly?".format(table),
            )
Example #24
def create_all_state_bq_refresh_tasks() -> Tuple[str, int]:
    """Creates an export task for each table to be exported.

    A task is created for each table defined in the StateBase schema.

    Re-creates all tasks if any task fails to be created.
    """
    logging.info("Beginning BQ export for state schema tables.")

    task_manager = BQRefreshCloudTaskManager()

    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(
        SchemaType.STATE)
    for table in cloud_sql_to_bq_config.get_tables_to_export():
        task_manager.create_refresh_bq_table_task(table.name, SchemaType.STATE)

    pub_sub_topic = 'v1.calculator.recidivism'
    pub_sub_message = 'State export to BQ complete'
    task_manager.create_bq_refresh_monitor_task(pub_sub_topic, pub_sub_message)
    return ('', HTTPStatus.OK)
Example #25
    def test_get_bq_schema_for_table(self) -> None:
        """Test that get_bq_schema_for_table returns a list
        of SchemaField objects when given a valid table_name

        Assert that excluded columns are not in the schema.
        """
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            assert config is not None
            table_name = config.sorted_tables[0].name
            schema = config.get_bq_schema_for_table(table_name)
            for schema_field in schema:
                self.assertIsInstance(schema_field, bigquery.SchemaField)

            for table, columns in config.columns_to_exclude.items():
                schema = config.get_bq_schema_for_table(table)
                schema_field_names = [s.name for s in schema]
                for column in columns:
                    self.assertTrue(
                        column not in schema_field_names,
                        msg='Column "{}" should not be included. It is found in '
                        '`COUNTY_COLUMNS_TO_EXCLUDE` for table "{}".'.format(
                            column, table),
                    )
Example #26
def federated_bq_schema_refresh(
    schema_type: SchemaType,
    direct_ingest_instance: Optional[DirectIngestInstance] = None,
    dataset_override_prefix: Optional[str] = None,
) -> None:
    """Performs a full refresh of BigQuery data for a given schema, pulling data from
    the appropriate CloudSQL Postgres instance.
    """
    if (direct_ingest_instance == DirectIngestInstance.SECONDARY
            and not dataset_override_prefix):
        raise ValueError(
            "Federated refresh can only proceed for secondary databases into a sandbox."
        )

    config = CloudSqlToBQConfig.for_schema_type(schema_type,
                                                direct_ingest_instance)
    # Query CloudSQL and export data into datasets with regions that match the instance
    # region (e.g. us-east1)
    _federated_bq_regional_dataset_refresh(config, dataset_override_prefix)

    # Copy the regional datasets to their final resting place in multi-region datasets
    _copy_regional_dataset_to_multi_region(config, dataset_override_prefix)
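
A brief sketch of the SECONDARY-instance guard above (hypothetical calls, assuming the STATE schema): a refresh of a secondary ingest instance is only permitted into a sandbox, so the first call raises and the second proceeds.

federated_bq_schema_refresh(
    schema_type=SchemaType.STATE,
    direct_ingest_instance=DirectIngestInstance.SECONDARY,
)  # raises ValueError: no dataset_override_prefix provided

federated_bq_schema_refresh(
    schema_type=SchemaType.STATE,
    direct_ingest_instance=DirectIngestInstance.SECONDARY,
    dataset_override_prefix="my_sandbox",
)  # refreshes into datasets prefixed with "my_sandbox_"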
Example #27
    def test_get_table_export_query(self) -> None:
        """For each SchemaType and for each table:
        1. Assert that each export query is of type string
        2. Assert that excluded columns are not in the query
        """
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            assert config is not None
            config.region_codes_to_exclude = []

            for table in config.sorted_tables:
                query = config.get_table_export_query(table.name)
                self.assertIsInstance(query, str)

                for column in config.columns_to_exclude.get(table.name, []):
                    search_pattern = "[ ]*{}[, ]+".format(
                        f"{table.name}.{column}")
                    self.assertNotRegex(
                        query,
                        search_pattern,
                        msg='Column "{}" not excluded properly from table "{}".'
                        " Check cloud_sql_to_bq_export_config.COUNTY_TABLE_COLUMNS_TO_EXPORT"
                        .format(column, table.name),
                    )
 def test_is_state_segmented_refresh_schema(self) -> None:
     for schema_type in self.enabled_schema_types:
         config = CloudSqlToBQConfig.for_schema_type(schema_type)
         is_state_segmented = config.is_state_segmented_refresh_schema()
         if schema_type == SchemaType.STATE:
             self.assertTrue(is_state_segmented)
Example #29
 def test_for_schema_type_raises_error(self) -> None:
     with self.assertRaises(ValueError):
         CloudSqlToBQConfig.for_schema_type(
             "random-schema-type")  # type: ignore[arg-type]
Example #30
 def test_incorrect_environment(self) -> None:
     with self.assertRaises(ValueError):
         config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
         assert config is not None
         self.assertEqual(config.region_codes_to_exclude, [])