def primary_ingest_bucket_for_region(self, region: Region) -> GcsfsBucketPath:
    return gcsfs_direct_ingest_bucket_for_region(
        region_code=region.region_code,
        system_level=SystemLevel.for_region(region),
        ingest_instance=DirectIngestInstance.PRIMARY,
    )
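# A minimal usage sketch (hypothetical caller, not from the repository). It assumes
# `cloud_task_manager` is an instance of the class that defines the method above and
# `region` is a Region for a direct ingest state; both names are illustrative only.
#
#     bucket_path = cloud_task_manager.primary_ingest_bucket_for_region(region)
#     # `bucket_path` is a GcsfsBucketPath pointing at the region's PRIMARY ingest bucket.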
def kick_all_schedulers() -> None:
    """Kicks all ingest schedulers to restart ingest"""
    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        region = _region_for_region_code(region_code=region_code)
        if not region.is_ingest_launched_in_env():
            continue
        system_level = SystemLevel.for_region(region)
        for ingest_instance in DirectIngestInstance:
            with monitoring.push_region_tag(
                region_code, ingest_instance=ingest_instance.value
            ):
                try:
                    ingest_instance.check_is_valid_system_level(system_level)
                except DirectIngestInstanceError:
                    continue
                ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=system_level,
                    ingest_instance=ingest_instance,
                )
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_bucket,
                    allow_unlaunched=False,
                )
                controller.kick_scheduler(just_finished_job=False)
def test_build_throws_in_prod_region_only_launched_in_staging(
    self,
) -> None:
    mock_region = fake_region(
        region_code="us_xx",
        environment="staging",
        is_direct_ingest=True,
        region_module=templates,
    )
    with patch(
        "recidiviz.utils.regions.get_region",
        Mock(return_value=mock_region),
    ):
        ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code=mock_region.region_code,
            system_level=SystemLevel.for_region(mock_region),
            ingest_instance=DirectIngestInstance.PRIMARY,
        )
        with self.assertRaises(DirectIngestError) as e:
            _ = DirectIngestControllerFactory.build(
                ingest_bucket_path=ingest_bucket_path, allow_unlaunched=False
            )
        self.assertEqual(
            str(e.exception),
            "Bad environment [production] for region [us_xx].",
        )
def _get_process_job_queue_manager(
    self, region: Region, ingest_instance: DirectIngestInstance
) -> CloudTaskQueueManager[ProcessIngestJobCloudTaskQueueInfo]:
    queue_name = _queue_name_for_queue_type(
        DirectIngestQueueType.PROCESS_JOB_QUEUE,
        region.region_code,
        SystemLevel.for_region(region),
        ingest_instance,
    )
    return CloudTaskQueueManager(
        queue_info_cls=ProcessIngestJobCloudTaskQueueInfo,
        queue_name=queue_name,
        cloud_tasks_client=self.cloud_tasks_client,
    )
def _get_bq_import_export_queue_manager(
    self, region: Region, ingest_instance: DirectIngestInstance
) -> CloudTaskQueueManager[BQImportExportCloudTaskQueueInfo]:
    queue_name = _queue_name_for_queue_type(
        DirectIngestQueueType.BQ_IMPORT_EXPORT,
        region.region_code,
        SystemLevel.for_region(region),
        ingest_instance,
    )
    return CloudTaskQueueManager(
        queue_info_cls=BQImportExportCloudTaskQueueInfo,
        queue_name=queue_name,
        cloud_tasks_client=self.cloud_tasks_client,
    )
def _get_scheduler_queue_manager(
    self, region: Region, ingest_instance: DirectIngestInstance
) -> CloudTaskQueueManager[SchedulerCloudTaskQueueInfo]:
    queue_name = _queue_name_for_queue_type(
        DirectIngestQueueType.SCHEDULER,
        region.region_code,
        SystemLevel.for_region(region),
        ingest_instance,
    )
    return CloudTaskQueueManager(
        queue_info_cls=SchedulerCloudTaskQueueInfo,
        queue_name=queue_name,
        cloud_tasks_client=self.cloud_tasks_client,
    )
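# The three _get_*_queue_manager methods above differ only in the queue type and the
# queue-info class. A hedged sketch of a shared helper they could delegate to
# (hypothetical, not part of the existing API; `QueueInfoT` is an illustrative TypeVar):
#
#     def _get_queue_manager(
#         self,
#         queue_type: DirectIngestQueueType,
#         queue_info_cls: Type[QueueInfoT],
#         region: Region,
#         ingest_instance: DirectIngestInstance,
#     ) -> CloudTaskQueueManager[QueueInfoT]:
#         queue_name = _queue_name_for_queue_type(
#             queue_type,
#             region.region_code,
#             SystemLevel.for_region(region),
#             ingest_instance,
#         )
#         return CloudTaskQueueManager(
#             queue_info_cls=queue_info_cls,
#             queue_name=queue_name,
#             cloud_tasks_client=self.cloud_tasks_client,
#         )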
def test_build_gcsfs_ingest_controller_all_regions(self) -> None:
    for region_code in get_existing_region_dir_names():
        region = get_region(region_code, is_direct_ingest=True)
        for ingest_instance in DirectIngestInstance:
            ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.for_region(region),
                ingest_instance=ingest_instance,
            )
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=ingest_bucket_path, allow_unlaunched=False
            )
            self.assertIsNotNone(controller)
            self.assertIsInstance(controller, BaseDirectIngestController)
            self.assertEqual(ingest_bucket_path, controller.ingest_bucket_path)
def ensure_all_raw_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all files in the ingest buckets for all direct ingest states
    have properly normalized file names, so that repeat uploads of files into
    those buckets don't fail or overwrite data. This provides a layer of
    protection against cloud function failures.
    """
    logging.info(
        "Received request for direct ingest ensure_all_raw_file_paths_normalized: "
        "%s",
        request.values,
    )
    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        # The only type of file that wouldn't be normalized is a raw file, which
        # should only ever be in the PRIMARY bucket.
        ingest_instance = DirectIngestInstance.PRIMARY
        with monitoring.push_region_tag(
            region_code, ingest_instance=ingest_instance.value
        ):
            ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.for_region(
                    _region_for_region_code(region_code)
                ),
                ingest_instance=ingest_instance,
            )
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_bucket,
                    allow_unlaunched=True,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region,
                ingest_instance=controller.ingest_instance,
                ingest_bucket=controller.ingest_bucket_path,
                can_start_ingest=can_start_ingest,
            )
    return "", HTTPStatus.OK
def _commit_person(
    person: SchemaPersonType,
    system_level: SystemLevel,
    ingest_time: datetime.datetime,
):
    db_key = SQLAlchemyDatabaseKey.canonical_for_schema(system_level.schema_type())
    with SessionFactory.using_database(db_key) as act_session:
        merged_person = act_session.merge(person)
        metadata = IngestMetadata(
            region="somewhere",
            jurisdiction_id="12345",
            ingest_time=ingest_time,
            system_level=system_level,
            database_key=db_key,
        )
        update_historical_snapshots(act_session, [merged_person], [], metadata)
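# A minimal usage sketch (hypothetical test code, not from the repository). It assumes
# `schema_person` is an already-constructed schema person object for the STATE schema:
#
#     _commit_person(
#         person=schema_person,
#         system_level=SystemLevel.STATE,
#         ingest_time=datetime.datetime(2021, 1, 1),
#     )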
def test_build_gcsfs_ingest_controller_all_regions_do_not_allow_launched(
    self,
) -> None:
    for region_code in get_existing_region_dir_names():
        region = get_region(region_code, is_direct_ingest=True)
        for ingest_instance in DirectIngestInstance:
            ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.for_region(region),
                ingest_instance=ingest_instance,
            )
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=ingest_bucket_path, allow_unlaunched=True
            )
            # Should still succeed for all controllers in the test environment
            self.assertIsNotNone(controller)
            self.assertIsInstance(controller, BaseDirectIngestController)
            self.assertEqual(ingest_bucket_path, controller.ingest_bucket_path)
def test_build_succeeds_in_staging_region_launched_in_prod(self) -> None:
    mock_region = fake_region(
        region_code="us_xx",
        environment="production",
        is_direct_ingest=True,
        region_module=templates,
    )
    with patch(
        "recidiviz.utils.regions.get_region",
        Mock(return_value=mock_region),
    ):
        ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code=mock_region.region_code,
            system_level=SystemLevel.for_region(mock_region),
            ingest_instance=DirectIngestInstance.PRIMARY,
        )
        controller = DirectIngestControllerFactory.build(
            ingest_bucket_path=ingest_bucket_path, allow_unlaunched=False
        )
        self.assertIsNotNone(controller)
        self.assertIsInstance(controller, BaseDirectIngestController)
        self.assertEqual(ingest_bucket_path, controller.ingest_bucket_path)
def start_ingest_run(self, state_code: StateCode, instance_str: str) -> None:
    """Called from the Ingest Operations UI in the admin panel; starts a direct
    ingest run for the given state_code in the given instance.

    Requires:
    - state_code: (required) State code to start ingest for (e.g. "US_ID")
    - instance: (required) Which instance to start ingest for (either PRIMARY or SECONDARY)
    """
    try:
        instance = DirectIngestInstance[instance_str]
    except KeyError as e:
        logging.error("Received an invalid instance: %s.", instance_str)
        raise ValueError(
            f"Invalid instance [{instance_str}] received",
        ) from e

    can_start_ingest = state_code in self.state_codes_launched_in_env
    formatted_state_code = state_code.value.lower()
    region = get_region(formatted_state_code, is_direct_ingest=True)

    # Get the ingest bucket for this region and instance
    ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
        region_code=formatted_state_code,
        system_level=SystemLevel.for_region(region),
        ingest_instance=instance,
        project_id=self.project_id,
    )

    logging.info(
        "Creating cloud task to schedule next job and kick ingest for %s instance in %s.",
        instance,
        formatted_state_code,
    )
    self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
        region=region,
        ingest_instance=instance,
        ingest_bucket=ingest_bucket_path,
        can_start_ingest=can_start_ingest,
    )
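# A minimal usage sketch (hypothetical caller, not from the repository). It assumes
# `ingest_ops` is an instance of the admin panel class that defines start_ingest_run:
#
#     ingest_ops.start_ingest_run(StateCode.US_ID, "PRIMARY")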
def build_gcsfs_controller_for_tests(
    controller_cls: Type[CsvGcsfsDirectIngestController],
    ingest_instance: DirectIngestInstance,
    run_async: bool,
    can_start_ingest: bool = True,
    regions_module: ModuleType = fake_regions_module,
) -> BaseDirectIngestController:
    """Builds an instance of |controller_cls| for use in tests with several
    internal classes mocked properly.
    """
    fake_fs = FakeGCSFileSystem()

    def mock_build_fs() -> FakeGCSFileSystem:
        return fake_fs

    if "TestGcsfsDirectIngestController" in controller_cls.__name__:
        view_collector_cls: Type[
            BigQueryViewCollector
        ] = FakeDirectIngestPreProcessedIngestViewCollector
    else:
        view_collector_cls = DirectIngestPreProcessedIngestViewCollector

    with patch(
        f"{BaseDirectIngestController.__module__}.DirectIngestCloudTaskManagerImpl"
    ) as mock_task_factory_cls:
        with patch(
            f"{BaseDirectIngestController.__module__}.BigQueryClientImpl"
        ) as mock_big_query_client_cls:
            with patch(
                f"{BaseDirectIngestController.__module__}.DirectIngestRawFileImportManager",
                FakeDirectIngestRawFileImportManager,
            ):
                with patch(
                    f"{BaseDirectIngestController.__module__}.DirectIngestPreProcessedIngestViewCollector",
                    view_collector_cls,
                ):
                    task_manager = (
                        FakeAsyncDirectIngestCloudTaskManager()
                        if run_async
                        else FakeSynchronousDirectIngestCloudTaskManager()
                    )
                    mock_task_factory_cls.return_value = task_manager
                    mock_big_query_client_cls.return_value = (
                        FakeDirectIngestBigQueryClient(
                            project_id=metadata.project_id(),
                            fs=fake_fs,
                            region_code=controller_cls.region_code(),
                        )
                    )
                    with patch.object(GcsfsFactory, "build", new=mock_build_fs):
                        with patch.object(
                            direct_ingest_raw_table_migration_collector,
                            "regions",
                            new=regions_module,
                        ):
                            controller = controller_cls(
                                ingest_bucket_path=gcsfs_direct_ingest_bucket_for_region(
                                    region_code=controller_cls.region_code(),
                                    system_level=SystemLevel.for_region_code(
                                        controller_cls.region_code(),
                                        is_direct_ingest=True,
                                    ),
                                    ingest_instance=ingest_instance,
                                    project_id="recidiviz-xxx",
                                )
                            )
                            controller.csv_reader = GcsfsCsvReader(fake_fs)
                            controller.raw_file_import_manager.csv_reader = (
                                controller.csv_reader
                            )
                            task_manager.set_controller(controller)
                            fake_fs.test_set_delegate(
                                DirectIngestFakeGCSFileSystemDelegate(
                                    controller, can_start_ingest=can_start_ingest
                                )
                            )
                            return controller
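# A minimal usage sketch (hypothetical test code, not from the repository). It assumes
# `TestGcsfsDirectIngestController` is a test-only CsvGcsfsDirectIngestController
# subclass defined alongside the fake regions module:
#
#     controller = build_gcsfs_controller_for_tests(
#         controller_cls=TestGcsfsDirectIngestController,
#         ingest_instance=DirectIngestInstance.PRIMARY,
#         run_async=False,
#     )
#     controller.kick_scheduler(just_finished_job=False)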
def system_level(self) -> SystemLevel:
    return SystemLevel.for_region(self.region)