def setUp(self) -> None:
    """Install fakes/patches shared by every test: a fake project id,
    a patched metadata module, a fake GCS filesystem, and a config YAML."""
    self.mock_project_id = "fake-recidiviz-project"

    # All schema types, plus the subset the refresh actually supports.
    self.schema_types: List[SchemaType] = list(SchemaType)
    self.enabled_schema_types = [
        candidate
        for candidate in self.schema_types
        if CloudSqlToBQConfig.is_valid_schema_type(candidate)
    ]

    # Patch project metadata so project_id() resolves to the fake project.
    self.metadata_patcher = mock.patch(
        "recidiviz.persistence.database.bq_refresh.cloud_sql_to_bq_refresh_config.metadata"
    )
    self.mock_metadata = self.metadata_patcher.start()
    self.mock_metadata.project_id.return_value = self.mock_project_id

    # Route all GCS access through an in-memory fake filesystem.
    self.fake_gcs = FakeGCSFileSystem()
    self.gcs_factory_patcher = mock.patch(
        "recidiviz.persistence.database.bq_refresh.cloud_sql_to_bq_refresh_config.GcsfsFactory.build"
    )
    self.gcs_factory_patcher.start().return_value = self.fake_gcs

    # Seed the refresh config the code under test will read.
    self.set_config_yaml(
        """
region_codes_to_exclude:
  - US_ND
state_history_tables_to_include:
  - state_person_history
county_columns_to_exclude:
  person:
    - full_name
    - birthdate_inferred_from_age
"""
    )
def refresh_bq_schema(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Run a full BigQuery refresh for one schema from its CloudSQL instance.

    Validates the schema argument, confirms the refresh lock permits us to
    proceed, performs the federated refresh, notifies downstream Dataflow
    pipelines for the STATE schema, releases the lock, and kicks the ingest
    schedulers back into motion.

    Returns:
        A (message, HTTP status) pair suitable for a Flask response.
    """
    # Guard: the argument must name a real schema type...
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return (
            f"Unexpected value for schema_arg: [{schema_arg}]",
            HTTPStatus.BAD_REQUEST,
        )
    # ...and one that the refresh pipeline supports.
    if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
        return (
            f"Unsupported schema type: [{schema_type}]",
            HTTPStatus.BAD_REQUEST,
        )

    # The lock must already exist (acquired by the task that scheduled us)
    # and must currently allow the refresh to run.
    refresh_lock_manager = CloudSqlToBQLockManager()
    try:
        can_proceed = refresh_lock_manager.can_proceed(schema_type)
    except GCSPseudoLockDoesNotExist as e:
        logging.exception(e)
        return (
            f"Expected lock for [{schema_arg}] BQ refresh to already exist.",
            HTTPStatus.EXPECTATION_FAILED,
        )
    if not can_proceed:
        return (
            f"Expected to be able to proceed with refresh before this endpoint was "
            f"called for [{schema_arg}].",
            HTTPStatus.EXPECTATION_FAILED,
        )

    federated_bq_schema_refresh(schema_type=schema_type)

    # STATE refreshes additionally trigger the daily calculation pipelines.
    if schema_type is SchemaType.STATE:
        pubsub_helper.publish_message_to_topic(
            message="State export to BQ complete",
            topic="v1.calculator.trigger_daily_pipelines",
        )

    # Release the refresh lock now that the export has finished.
    refresh_lock_manager = CloudSqlToBQLockManager()
    refresh_lock_manager.release_lock(schema_type)
    logging.info(
        "Done running refresh for [%s], unlocking Postgres to BigQuery export",
        schema_type.value,
    )

    # Restart any ingest work that was paused while we held the lock.
    kick_all_schedulers()

    return "", HTTPStatus.OK
def test_for_schema_type_returns_instance(self) -> None:
    """for_schema_type returns a config for valid schemas and raises
    ValueError for unsupported ones."""
    for candidate in self.schema_types:
        if CloudSqlToBQConfig.is_valid_schema_type(candidate):
            self.assertIsInstance(
                CloudSqlToBQConfig.for_schema_type(candidate),
                CloudSqlToBQConfig,
            )
        else:
            with self.assertRaises(ValueError):
                _ = CloudSqlToBQConfig.for_schema_type(candidate)
def test_collect_do_not_crash(self) -> None:
    """Smoke test: view collection succeeds for every supported schema,
    using either the state-segmented or unsegmented collector."""
    self.fake_fs.upload_from_string(
        path=self.fake_config_path,
        contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
        content_type="text/yaml",
    )
    supported = (
        candidate
        for candidate in SchemaType
        if CloudSqlToBQConfig.is_valid_schema_type(candidate)
    )
    for schema_type in supported:
        config = CloudSqlToBQConfig.for_schema_type(schema_type)
        collector_cls = (
            StateSegmentedSchemaFederatedBigQueryViewCollector
            if config.is_state_segmented_refresh_schema()
            else UnsegmentedSchemaFederatedBigQueryViewCollector
        )
        _ = collector_cls(config).collect_view_builders()
def wait_for_ingest_to_create_tasks(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Worker function to wait until ingest is not running to queue a task to
    run /refresh_bq_schema.

    Before doing anything, grabs the refresh lock to indicate that a refresh
    wants to start and ingest should yield ASAP. Then:
    * When ingest is not running/locked, creates task to run /refresh_bq_schema.
    * When ingest is running/locked, re-enqueues this task to run again in 60
      seconds.

    Returns:
        A (message, HTTP status) pair suitable for a Flask response.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return (
            f"Unexpected value for schema_arg: [{schema_arg}]",
            HTTPStatus.BAD_REQUEST,
        )
    if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
        # Fixed typo ("Unsuppported") — now matches the identical check in
        # refresh_bq_schema.
        return (
            f"Unsupported schema type: [{schema_type}]",
            HTTPStatus.BAD_REQUEST,
        )

    # Acquire the refresh lock up front so ingest knows to yield ASAP.
    lock_id = get_or_create_lock_id()
    logging.info("Request lock id: %s", lock_id)
    lock_manager = CloudSqlToBQLockManager()
    lock_manager.acquire_lock(schema_type=schema_type, lock_id=lock_id)

    task_manager = BQRefreshCloudTaskManager()
    if not lock_manager.can_proceed(schema_type):
        # Ingest still holds its locks — retry this task later.
        # Fixed typo ("renqueuing") in the log message.
        logging.info("Regions running, re-enqueuing this task.")
        task_manager.create_reattempt_create_refresh_tasks_task(
            lock_id=lock_id, schema=schema_arg
        )
        return "", HTTPStatus.OK

    logging.info("No regions running, triggering BQ refresh.")
    task_manager.create_refresh_bq_schema_task(schema_type=schema_type)
    return "", HTTPStatus.OK