def primary_ingest_bucket_for_region(self,
                                     region: Region) -> GcsfsBucketPath:
    """Returns the PRIMARY-instance direct ingest bucket path for |region|."""
    return gcsfs_direct_ingest_bucket_for_region(
        region_code=region.region_code,
        ingest_instance=DirectIngestInstance.PRIMARY,
        system_level=SystemLevel.for_region(region),
    )
Пример #2
0
def kick_all_schedulers() -> None:
    """Restarts ingest by kicking the scheduler for every launched region and
    every ingest instance that is valid for that region's system level."""
    for region_code in get_supported_direct_ingest_region_codes():
        region = _region_for_region_code(region_code=region_code)
        if not region.is_ingest_launched_in_env():
            # Region is not launched in this environment -- nothing to kick.
            continue

        system_level = SystemLevel.for_region(region)
        for instance in DirectIngestInstance:
            with monitoring.push_region_tag(
                    region_code, ingest_instance=instance.value):
                try:
                    instance.check_is_valid_system_level(system_level)
                except DirectIngestInstanceError:
                    # This instance type is not supported at this system
                    # level -- skip it.
                    continue
                bucket = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=system_level,
                    ingest_instance=instance,
                )
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket,
                    allow_unlaunched=False,
                )
                controller.kick_scheduler(just_finished_job=False)
 def test_build_throws_in_prod_region_only_launched_in_staging(
         self) -> None:
     """Building with allow_unlaunched=False for a region whose environment is
     'staging' raises a DirectIngestError with a bad-environment message."""
     mock_region = fake_region(
         region_code="us_xx",
         environment="staging",
         is_direct_ingest=True,
         region_module=templates,
     )
     region_patcher = patch(
         "recidiviz.utils.regions.get_region",
         Mock(return_value=mock_region),
     )
     with region_patcher:
         bucket_path = gcsfs_direct_ingest_bucket_for_region(
             region_code=mock_region.region_code,
             system_level=SystemLevel.for_region(mock_region),
             ingest_instance=DirectIngestInstance.PRIMARY,
         )
         with self.assertRaises(DirectIngestError) as e:
             DirectIngestControllerFactory.build(
                 ingest_bucket_path=bucket_path,
                 allow_unlaunched=False,
             )
         self.assertEqual(
             str(e.exception),
             "Bad environment [production] for region [us_xx].",
         )
    def _get_process_job_queue_manager(
        self, region: Region, ingest_instance: DirectIngestInstance
    ) -> CloudTaskQueueManager[ProcessIngestJobCloudTaskQueueInfo]:
        """Builds the Cloud Tasks queue manager for |region|'s process-job
        queue in the given ingest instance."""
        return CloudTaskQueueManager(
            queue_info_cls=ProcessIngestJobCloudTaskQueueInfo,
            queue_name=_queue_name_for_queue_type(
                DirectIngestQueueType.PROCESS_JOB_QUEUE,
                region.region_code,
                SystemLevel.for_region(region),
                ingest_instance,
            ),
            cloud_tasks_client=self.cloud_tasks_client,
        )
    def _get_bq_import_export_queue_manager(
        self, region: Region, ingest_instance: DirectIngestInstance
    ) -> CloudTaskQueueManager[BQImportExportCloudTaskQueueInfo]:
        """Builds the Cloud Tasks queue manager for |region|'s BQ
        import/export queue in the given ingest instance."""
        return CloudTaskQueueManager(
            queue_info_cls=BQImportExportCloudTaskQueueInfo,
            queue_name=_queue_name_for_queue_type(
                DirectIngestQueueType.BQ_IMPORT_EXPORT,
                region.region_code,
                SystemLevel.for_region(region),
                ingest_instance,
            ),
            cloud_tasks_client=self.cloud_tasks_client,
        )
    def _get_scheduler_queue_manager(
        self, region: Region, ingest_instance: DirectIngestInstance
    ) -> CloudTaskQueueManager[SchedulerCloudTaskQueueInfo]:
        """Builds the Cloud Tasks queue manager for |region|'s scheduler
        queue in the given ingest instance."""
        return CloudTaskQueueManager(
            queue_info_cls=SchedulerCloudTaskQueueInfo,
            queue_name=_queue_name_for_queue_type(
                DirectIngestQueueType.SCHEDULER,
                region.region_code,
                SystemLevel.for_region(region),
                ingest_instance,
            ),
            cloud_tasks_client=self.cloud_tasks_client,
        )
    def test_build_gcsfs_ingest_controller_all_regions(self) -> None:
        """The factory builds a controller (allow_unlaunched=False) for every
        existing region directory and every ingest instance."""
        for region_code in get_existing_region_dir_names():
            region = get_region(region_code, is_direct_ingest=True)
            for instance in DirectIngestInstance:
                bucket_path = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=SystemLevel.for_region(region),
                    ingest_instance=instance,
                )
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=False,
                )
                self.assertIsNotNone(controller)
                self.assertIsInstance(controller, BaseDirectIngestController)
                self.assertEqual(bucket_path, controller.ingest_bucket_path)
Пример #8
0
def ensure_all_raw_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that every file in each direct ingest state's ingest bucket has a
    properly normalized file name, so that repeat uploads into those buckets
    don't fail or overwrite data. This provides a layer of protection against
    cloud function failures.

    Returns an (error message, status) pair; empty message with OK on success.
    """
    logging.info(
        "Received request for direct ingest ensure_all_raw_file_paths_normalized: "
        "%s",
        request.values,
    )

    for region_code in get_supported_direct_ingest_region_codes():
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        # The only type of file that wouldn't be normalized is a raw file,
        # which should only ever be in the PRIMARY bucket.
        instance = DirectIngestInstance.PRIMARY
        with monitoring.push_region_tag(region_code,
                                        ingest_instance=instance.value):
            region = _region_for_region_code(region_code)
            bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.for_region(region),
                ingest_instance=instance,
            )
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket,
                    allow_unlaunched=True,
                )
            except DirectIngestError as e:
                if not e.is_bad_request():
                    raise
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST

            # Only launched regions may actually start ingest from this task.
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region,
                ingest_instance=controller.ingest_instance,
                ingest_bucket=controller.ingest_bucket_path,
                can_start_ingest=controller.region.is_ingest_launched_in_env(),
            )
    return "", HTTPStatus.OK
    def _commit_person(
        person: SchemaPersonType,
        system_level: SystemLevel,
        ingest_time: datetime.datetime,
    ):
        """Merges |person| into the canonical database for |system_level| and
        runs update_historical_snapshots for the merged entity."""
        database_key = SQLAlchemyDatabaseKey.canonical_for_schema(
            system_level.schema_type())
        with SessionFactory.using_database(database_key) as session:
            persisted_person = session.merge(person)
            ingest_metadata = IngestMetadata(
                region="somewhere",
                jurisdiction_id="12345",
                ingest_time=ingest_time,
                system_level=system_level,
                database_key=database_key,
            )
            update_historical_snapshots(
                session, [persisted_person], [], ingest_metadata)
    def test_build_gcsfs_ingest_controller_all_regions_do_not_allow_launched(
            self) -> None:
        """With allow_unlaunched=True the factory builds a controller for every
        existing region directory and every ingest instance."""
        for region_code in get_existing_region_dir_names():
            region = get_region(region_code, is_direct_ingest=True)
            for instance in DirectIngestInstance:
                bucket_path = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=SystemLevel.for_region(region),
                    ingest_instance=instance,
                )
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=True,
                )
                # Should still succeed for all controllers in the test
                # environment.
                self.assertIsNotNone(controller)
                self.assertIsInstance(controller, BaseDirectIngestController)
                self.assertEqual(bucket_path, controller.ingest_bucket_path)
 def test_build_succeeds_in_staging_region_launched_in_prod(self) -> None:
     """Building with allow_unlaunched=False succeeds for a region whose
     environment is 'production'."""
     mock_region = fake_region(
         region_code="us_xx",
         environment="production",
         is_direct_ingest=True,
         region_module=templates,
     )
     region_patcher = patch(
         "recidiviz.utils.regions.get_region",
         Mock(return_value=mock_region),
     )
     with region_patcher:
         bucket_path = gcsfs_direct_ingest_bucket_for_region(
             region_code=mock_region.region_code,
             system_level=SystemLevel.for_region(mock_region),
             ingest_instance=DirectIngestInstance.PRIMARY,
         )
         controller = DirectIngestControllerFactory.build(
             ingest_bucket_path=bucket_path,
             allow_unlaunched=False,
         )
         self.assertIsNotNone(controller)
         self.assertIsInstance(controller, BaseDirectIngestController)
         self.assertEqual(bucket_path, controller.ingest_bucket_path)
    def start_ingest_run(self, state_code: StateCode,
                         instance_str: str) -> None:
        """Starts a direct ingest run; called through the Ingest Operations UI
        in the admin panel.

        Args:
            state_code: (required) State code to start ingest for (i.e. "US_ID")
            instance_str: (required) Which instance to start ingest for (either
                PRIMARY or SECONDARY)

        Raises:
            ValueError: if |instance_str| is not a DirectIngestInstance name.
        """
        try:
            instance = DirectIngestInstance[instance_str]
        except KeyError as e:
            logging.error("Received an invalid instance: %s.", instance_str)
            raise ValueError(
                f"Invalid instance [{instance_str}] received") from e

        formatted_state_code = state_code.value.lower()
        region = get_region(formatted_state_code, is_direct_ingest=True)

        # Ingest may only actually start for states launched in this env.
        can_start_ingest = state_code in self.state_codes_launched_in_env

        # Get the ingest bucket for this region and instance.
        bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code=formatted_state_code,
            system_level=SystemLevel.for_region(region),
            ingest_instance=instance,
            project_id=self.project_id,
        )

        logging.info(
            "Creating cloud task to schedule next job and kick ingest for %s instance in %s.",
            instance,
            formatted_state_code,
        )
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=region,
            ingest_instance=instance,
            ingest_bucket=bucket_path,
            can_start_ingest=can_start_ingest,
        )
Пример #13
0
def build_gcsfs_controller_for_tests(
    controller_cls: Type[CsvGcsfsDirectIngestController],
    ingest_instance: DirectIngestInstance,
    run_async: bool,
    can_start_ingest: bool = True,
    regions_module: ModuleType = fake_regions_module,
) -> BaseDirectIngestController:
    """Builds an instance of |controller_cls| for use in tests with several internal classes mocked properly. """
    fake_fs = FakeGCSFileSystem()

    def mock_build_fs() -> FakeGCSFileSystem:
        return fake_fs

    # The Test* controller exercises the fake view collector; all other
    # controllers use the real one.
    if "TestGcsfsDirectIngestController" in controller_cls.__name__:
        view_collector_cls: Type[
            BigQueryViewCollector] = FakeDirectIngestPreProcessedIngestViewCollector
    else:
        view_collector_cls = DirectIngestPreProcessedIngestViewCollector

    controller_module = BaseDirectIngestController.__module__
    # All patches below are entered in the same order the original nesting
    # used and stay active while the controller is constructed.
    with patch(f"{controller_module}.DirectIngestCloudTaskManagerImpl") \
            as mock_task_factory_cls, \
            patch(f"{controller_module}.BigQueryClientImpl") \
            as mock_big_query_client_cls, \
            patch(f"{controller_module}.DirectIngestRawFileImportManager",
                  FakeDirectIngestRawFileImportManager), \
            patch(f"{controller_module}.DirectIngestPreProcessedIngestViewCollector",
                  view_collector_cls), \
            patch.object(GcsfsFactory, "build", new=mock_build_fs), \
            patch.object(direct_ingest_raw_table_migration_collector,
                         "regions", new=regions_module):
        task_manager = (FakeAsyncDirectIngestCloudTaskManager()
                        if run_async else
                        FakeSynchronousDirectIngestCloudTaskManager())
        mock_task_factory_cls.return_value = task_manager
        mock_big_query_client_cls.return_value = FakeDirectIngestBigQueryClient(
            project_id=metadata.project_id(),
            fs=fake_fs,
            region_code=controller_cls.region_code(),
        )

        controller = controller_cls(
            ingest_bucket_path=gcsfs_direct_ingest_bucket_for_region(
                region_code=controller_cls.region_code(),
                system_level=SystemLevel.for_region_code(
                    controller_cls.region_code(),
                    is_direct_ingest=True,
                ),
                ingest_instance=ingest_instance,
                project_id="recidiviz-xxx",
            ))
        controller.csv_reader = GcsfsCsvReader(fake_fs)
        controller.raw_file_import_manager.csv_reader = controller.csv_reader

        task_manager.set_controller(controller)
        fake_fs.test_set_delegate(
            DirectIngestFakeGCSFileSystemDelegate(
                controller, can_start_ingest=can_start_ingest))
        return controller
Пример #14
0
 def system_level(self) -> SystemLevel:
     """Returns the SystemLevel derived from this object's region."""
     level = SystemLevel.for_region(self.region)
     return level