Example #1
def scheduler() -> Tuple[str, HTTPStatus]:
    """Checks the state of the ingest instance and schedules any tasks to be run."""
    logging.info("Received request for direct ingest scheduler: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    just_finished_job = get_bool_param_value("just_finished_job",
                                             request.values,
                                             default=False)

    # The name of the bucket that this ingest instance schedules work out of
    bucket = get_str_param_value("bucket", request.args)

    if not region_code or just_finished_job is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path, allow_unlaunched=False)
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        controller.schedule_next_ingest_job(just_finished_job)
    return "", HTTPStatus.OK
Example #2
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller."""
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
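A hedged usage sketch for a controller like the one above, reusing only the factory call and scheduling method that already appear in Example #1; the bucket name itself is hypothetical.

bucket_path = GcsfsBucketPath("recidiviz-456-direct-ingest-state-us-xx")  # hypothetical name
controller = DirectIngestControllerFactory.build(
    ingest_bucket_path=bucket_path, allow_unlaunched=False)
# Schedule next work without claiming that a job just finished.
controller.schedule_next_ingest_job(False)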
Example #3
    def test_from_county_ingest_bucket(self) -> None:
        ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code="us_xx_yyyyy",
            system_level=SystemLevel.COUNTY,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id="recidiviz-456",
        )

        self.assertEqual(
            DirectIngestInstance.PRIMARY,
            DirectIngestInstance.for_ingest_bucket(ingest_bucket_path),
        )
Example #4
    def is_task_queued(self, region: Region,
                       ingest_args: GcsfsIngestArgs) -> bool:
        """Returns true if the ingest_args correspond to a task currently in
        the queue.
        """

        task_id_prefix = _build_task_id(
            region.region_code,
            DirectIngestInstance.for_ingest_bucket(
                ingest_args.file_path.bucket_path),
            ingest_args.task_id_tag(),
            prefix_only=True,
        )

        return bool(next(self._tasks_for_prefix(task_id_prefix), None))
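A minimal illustration of the prefix check above; the task ids and the prefix format are hypothetical stand-ins for whatever _build_task_id actually produces.

queued_task_ids = ["us_xx-primary-tag_a-123", "us_yy-primary-tag_b-456"]  # hypothetical ids
task_id_prefix = "us_xx-primary-tag_a"  # hypothetical prefix_only=True output
# Same idiom as above: next(...) yields the first match, or None if absent.
assert bool(next((t for t in queued_task_ids if t.startswith(task_id_prefix)), None))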
Example #5
    def test_from_state_ingest_bucket(self) -> None:
        ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code="us_xx",
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id="recidiviz-456",
        )

        self.assertEqual(
            DirectIngestInstance.PRIMARY,
            DirectIngestInstance.for_ingest_bucket(ingest_bucket_path),
        )

        ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code="us_xx",
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.SECONDARY,
            project_id="recidiviz-456",
        )

        self.assertEqual(
            DirectIngestInstance.SECONDARY,
            DirectIngestInstance.for_ingest_bucket(ingest_bucket_path),
        )
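The two tests above imply that the ingest instance is recoverable from the bucket name alone. A minimal sketch of that kind of mapping, under the assumption that SECONDARY buckets carry a "-secondary" suffix; the real convention is owned by gcsfs_direct_ingest_bucket_for_region and DirectIngestInstance.for_ingest_bucket.

def instance_for_bucket_name(bucket_name: str) -> DirectIngestInstance:
    # Assumption: SECONDARY ingest buckets end in "-secondary"; everything
    # else maps to PRIMARY. Illustrative only, not the library's parsing code.
    if bucket_name.endswith("-secondary"):
        return DirectIngestInstance.SECONDARY
    return DirectIngestInstance.PRIMARY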
Example #6
def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct ingest
    bucket. Will trigger a job that deals with normalizing and splitting the
    file as is appropriate, then start the scheduler if allowed.
    """
    region_code = get_str_param_value("region", request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value("relative_file_path",
                                             request.args,
                                             preserve_case=True)
    start_ingest = get_bool_param_value("start_ingest",
                                        request.args,
                                        default=False)

    if not region_code or not bucket or not relative_file_path or start_ingest is None:
        response = f"Bad parameters [{request.args}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path)

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return "", HTTPStatus.OK
Example #7
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.
    """
    logging.info("Received request for direct ingest handle_new_files: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    can_start_ingest = get_bool_param_value("can_start_ingest",
                                            request.values,
                                            default=False)
    bucket = get_str_param_value("bucket", request.values)

    if not region_code or can_start_ingest is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
Example #8
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=
                f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != ingest_args.file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                logging.warning(str(e))
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example #9
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    output_bucket_name = get_str_param_value("output_bucket",
                                             request.values,
                                             preserve_case=True)

    if not region_code or not output_bucket_name:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                GcsfsBucketPath(output_bucket_name)).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg=
                "ingest_view_export was called with no GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=
                f"ingest_view_export was called with incorrect args type [{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if output_bucket_name != ingest_view_export_args.output_bucket_name:
            raise DirectIngestError(
                msg=
                f"Different buckets were passed in the url and request body\n"
                f"url: {output_bucket_name}\n"
                f"body: {ingest_view_export_args.output_bucket_name}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=GcsfsBucketPath(
                        ingest_view_export_args.output_bucket_name),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example #10
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg=
                "raw_data_import was called with no GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != data_import_args.raw_data_file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {data_import_args.raw_data_file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=data_import_args.raw_data_file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK