Example #1
def controller_for_region_code(
        region_code: str,
        allow_unlaunched: bool = False) -> BaseDirectIngestController:
    """Returns an instance of the region's controller, if one exists."""
    if region_code not in get_supported_direct_ingest_region_codes():
        raise DirectIngestError(
            msg=f"Unsupported direct ingest region [{region_code}] in project [{metadata.project_id()}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    try:
        region = regions.get_region(region_code, is_direct_ingest=True)
    except FileNotFoundError:
        raise DirectIngestError(
            msg=f"Region [{region_code}] has no registered manifest",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    if not allow_unlaunched and not region.is_ingest_launched_in_env():
        check_is_region_launched_in_env(region)

    controller = region.get_ingestor()

    if not isinstance(controller, BaseDirectIngestController):
        raise DirectIngestError(
            msg=f"Controller for direct ingest region [{region_code}] has unexpected type [{type(controller)}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    return controller
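
Usage sketch (hedged): the region code value and the downstream call below are illustrative assumptions, not taken from the example above; only the controller_for_region_code call shape is.

# Hypothetical caller: look up the controller for a supported region, then drive it.
# kick_scheduler(just_finished_job=...) mirrors how controllers are used in the later
# examples, but calling it directly here is an assumption.
controller = controller_for_region_code("us_nd", allow_unlaunched=False)
controller.kick_scheduler(just_finished_job=False)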
Example #2
    def build(
        cls, *, ingest_bucket_path: GcsfsBucketPath, allow_unlaunched: bool
    ) -> BaseDirectIngestController:
        """Retrieve a direct ingest GcsfsDirectIngestController associated with a
        particular ingest bucket.

        Returns:
            An instance of the region's direct ingest controller class (e.g.,
             UsNdController) that can run ingest operations for the ingest instance
             associated with the input bucket.
        """
        region_code = get_region_code_from_direct_ingest_bucket(
            ingest_bucket_path.bucket_name
        )

        if (
            region_code is None
            or region_code not in get_supported_direct_ingest_region_codes()
        ):
            raise DirectIngestError(
                msg=f"Unsupported direct ingest region [{region_code}] in "
                f"project [{metadata.project_id()}]",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        region = cls._region_for_bucket(ingest_bucket_path)
        if not allow_unlaunched and not region.is_ingest_launched_in_env():
            check_is_region_launched_in_env(region)

        controller_class = cls.get_controller_class(region)
        controller = controller_class(ingest_bucket_path=ingest_bucket_path)
        if not isinstance(controller, BaseDirectIngestController):
            raise ValueError(f"Unexpected controller class type [{type(controller)}]")

        return controller
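
Usage sketch (hedged): the factory class name (DirectIngestControllerFactory) and the bucket path constructor below are assumptions made for illustration; only the build(...) signature comes from the example above.

# Hypothetical: build a controller for the ingest bucket that triggered a cloud function.
ingest_bucket = GcsfsBucketPath(bucket_name="recidiviz-staging-direct-ingest-state-us-xx")
controller = DirectIngestControllerFactory.build(
    ingest_bucket_path=ingest_bucket,
    allow_unlaunched=False,
)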
Example #3
    def schedule_next_ingest_job_or_wait_if_necessary(self,
                                                      just_finished_job: bool):
        """Creates a cloud task to run the next ingest job. Depending on the
        next job's IngestArgs, we either post a task to direct/scheduler/ if
        a wait_time is specified or direct/process_job/ if we can run the next
        job immediately."""
        check_is_region_launched_in_env(self.region)

        process_job_queue_info = \
            self.cloud_task_manager.get_process_job_queue_info(self.region)
        if process_job_queue_info.size() and not just_finished_job:
            logging.info(
                "Already running job [%s] - will not schedule another job for "
                "region [%s]", process_job_queue_info.task_names[0],
                self.region.region_code)
            return

        next_job_args = self._get_next_job_args()

        if not next_job_args:
            logging.info("No more jobs to run for region [%s] - returning",
                         self.region.region_code)
            return

        if process_job_queue_info.is_task_queued(self.region, next_job_args):
            logging.info(
                "Already have task queued for next job [%s] - returning.",
                self._job_tag(next_job_args))
            return

        # TODO(#3020): Add similar logic between the raw data BQ import and ingest view export tasks
        # TODO(#3162): Delete this wait logic from here once all regions have been transitioned to a SQL
        #  preprocessing model.
        wait_time_sec = self._wait_time_sec_for_next_args(next_job_args)
        logging.info("Found next ingest job to run [%s] with wait time [%s].",
                     self._job_tag(next_job_args), wait_time_sec)

        if wait_time_sec:
            scheduler_queue_info = \
                self.cloud_task_manager.get_scheduler_queue_info(self.region)
            if scheduler_queue_info.size() <= 1:
                logging.info(
                    "Creating cloud task to fire timer in [%s] seconds",
                    wait_time_sec)
                self.cloud_task_manager. \
                    create_direct_ingest_scheduler_queue_task(
                        region=self.region,
                        just_finished_job=False,
                        delay_sec=wait_time_sec)
            else:
                logging.info(
                    "[%s] tasks already in the scheduler queue for region "
                    "[%s] - not queueing another task.",
                    str(scheduler_queue_info.size()), self.region.region_code)
        else:
            logging.info("Creating cloud task to run job [%s]",
                         self._job_tag(next_job_args))
            self.cloud_task_manager.create_direct_ingest_process_job_task(
                region=self.region, ingest_args=next_job_args)
            self._on_job_scheduled(next_job_args)
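
Condensed sketch (hedged) of the routing decision described in the docstring above: a job with a wait time goes back onto the scheduler queue, otherwise a process_job task is created immediately. The standalone helper name below is illustrative only; the attributes and task manager calls mirror the example, minus the scheduler-queue size check.

    def _route_next_job(self, next_job_args) -> None:
        wait_time_sec = self._wait_time_sec_for_next_args(next_job_args)
        if wait_time_sec:
            # Re-check later via the scheduler queue (direct/scheduler/).
            self.cloud_task_manager.create_direct_ingest_scheduler_queue_task(
                region=self.region, just_finished_job=False, delay_sec=wait_time_sec)
        else:
            # Run the job now via the process queue (direct/process_job/).
            self.cloud_task_manager.create_direct_ingest_process_job_task(
                region=self.region, ingest_args=next_job_args)
            self._on_job_scheduled(next_job_args)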
Example #4
    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f"Raw data imports not enabled for region [{self.region.region_code}]"
            )

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted",
                data_import_args.raw_data_file_path,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                "File [%s] is already marked as processed. Skipping file processing.",
                data_import_args.raw_data_file_path.file_name,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        if not self.region.are_ingest_view_exports_enabled_in_env():
            # TODO(#3162) This is a stopgap measure for regions that have only partially launched. Delete once SQL
            #  pre-processing is enabled for all direct ingest regions.
            parts = filename_parts_from_path(
                data_import_args.raw_data_file_path)
            ingest_file_tags = self.get_file_tag_rank_list()

            if parts.file_tag in ingest_file_tags:
                self.fs.copy(
                    data_import_args.raw_data_file_path,
                    GcsfsFilePath.from_absolute_path(
                        to_normalized_unprocessed_file_path_from_normalized_path(
                            data_import_args.raw_data_file_path.abs_path(),
                            file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW,
                        )),
                )

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)
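
Usage sketch (hedged): constructing the import args directly like this is an assumption for illustration (these tasks are normally created by the scheduler), and the constructor keyword is inferred from the attribute name used above; only do_raw_data_import itself comes from the example.

# Hypothetical: import a single normalized raw file already sitting in the ingest bucket.
import_args = GcsfsRawDataBQImportArgs(raw_data_file_path=normalized_raw_file_path)
controller.do_raw_data_import(import_args)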
Example #5
    def run_ingest_job_and_kick_scheduler_on_completion(
            self, args: IngestArgsType):
        check_is_region_launched_in_env(self.region)

        should_schedule = self._run_ingest_job(args)
        if should_schedule:
            self.kick_scheduler(just_finished_job=True)
            logging.info("Done running task. Returning.")
Example #6
    def _run_ingest_job(self, args: IngestArgsType) -> bool:
        """
        Runs the full ingest process for this controller - reading and parsing
        raw input data, transforming it to our schema, then writing to the
        database.

        Returns:
            True if we should try to schedule the next job on completion, False
            otherwise.
        """
        check_is_region_launched_in_env(self.region)

        start_time = datetime.datetime.now()
        logging.info("Starting ingest for ingest run [%s]",
                     self._job_tag(args))

        contents_handle = self._get_contents_handle(args)

        if contents_handle is None:
            logging.warning(
                "Failed to get contents handle for ingest run [%s] - "
                "returning.", self._job_tag(args))
            # If the file no longer exists, we do want to kick the scheduler
            # again to pick up the next file to run. We expect this to happen
            # occasionally as a race when the scheduler picks up a file before
            # it has been properly moved.
            return True

        if not self._can_proceed_with_ingest_for_contents(
                args, contents_handle):
            logging.warning(
                "Cannot proceed with contents for ingest run [%s] - returning.",
                self._job_tag(args))
            # If we get here, we've failed to properly split a file picked up
            # by the scheduler. We don't want to schedule a new job after
            # returning here, otherwise we'll get ourselves in a loop where we
            # continually try to schedule this file.
            return False

        logging.info("Successfully read contents for ingest run [%s]",
                     self._job_tag(args))

        if not self._are_contents_empty(args, contents_handle):
            self._parse_and_persist_contents(args, contents_handle)
        else:
            logging.warning(
                "Contents are empty for ingest run [%s] - skipping parse and "
                "persist steps.", self._job_tag(args))

        self._do_cleanup(args)

        duration_sec = (datetime.datetime.now() - start_time).total_seconds()
        logging.info("Finished ingest in [%s] sec for ingest run [%s].",
                     str(duration_sec), self._job_tag(args))

        return True
Example #7
    def do_ingest_view_export(self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        check_is_region_launched_in_env(self.region)
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(f'Ingest view exports not enabled for region [{self.region.region_code}]. Passed args: '
                             f'{ingest_view_export_args}')

        did_export = self.ingest_view_export_manager.export_view_for_args(ingest_view_export_args)
        if not did_export or not self.file_metadata_manager.get_ingest_view_metadata_pending_export():
            logging.info("Creating cloud task to schedule next job.")
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(region=self.region,
                                                                               can_start_ingest=True)
Example #8
    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        if self.ingest_instance == DirectIngestInstance.SECONDARY:
            raise ValueError(
                f"Raw data import not supported from SECONDARY ingest bucket "
                f"[{self.ingest_bucket_path}]. Raw data task for "
                f"[{data_import_args.raw_data_file_path}] should never have been "
                f"scheduled.")

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted",
                data_import_args.raw_data_file_path,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_raw_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                "File [%s] is already marked as processed. Skipping file processing.",
                data_import_args.raw_data_file_path.file_name,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_raw_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)
Example #9
    def do_ingest_view_export(
            self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        did_export = self.ingest_view_export_manager.export_view_for_args(
            ingest_view_export_args)
        if (
            not did_export
            or not self.file_metadata_manager.get_ingest_view_metadata_pending_export()
        ):
            logging.info("Creating cloud task to schedule next job.")
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                region=self.region,
                ingest_instance=self.ingest_instance,
                ingest_bucket=self.ingest_bucket_path,
                can_start_ingest=True,
            )
Example #10
    def run_ingest_job_and_kick_scheduler_on_completion(
        self, args: IngestArgsType
    ) -> None:
        check_is_region_launched_in_env(self.region)

        if self.lock_manager.is_locked(
            postgres_to_bq_lock_name_for_schema(
                schema_type_for_system_level(self.system_level)
            )
        ) or self.lock_manager.is_locked(
            postgres_to_bq_lock_name_for_schema(SchemaType.OPERATIONS)
        ):
            raise GCSPseudoLockAlreadyExists(
                "Postgres to BigQuery export is running, can not run ingest"
            )

        with self.lock_manager.using_lock(self.ingest_process_lock_for_region()):
            should_schedule = self._run_ingest_job(args)

        if should_schedule:
            self.kick_scheduler(just_finished_job=True)
            logging.info("Done running task. Returning.")
Example #11
    def run_ingest_job_and_kick_scheduler_on_completion(
            self, args: GcsfsIngestArgs) -> None:
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        if not self.region_lock_manager.can_proceed():
            logging.warning(
                "Postgres to BigQuery export is running, can not run ingest")
            raise GCSPseudoLockAlreadyExists(
                "Postgres to BigQuery export is running, can not run ingest")

        with self.region_lock_manager.using_region_lock(
                expiration_in_seconds=self.default_job_lock_timeout_in_seconds(
                ), ):
            should_schedule = self._run_ingest_job(args)

        if should_schedule:
            self.kick_scheduler(just_finished_job=True)
            logging.info("Done running task. Returning.")
Example #12
    def handle_new_files(self, can_start_ingest: bool) -> None:
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, then schedules the next ingest
        job if allowed.


        Should only be called from the scheduler queue.
        """
        if not can_start_ingest and self.region.is_ingest_launched_in_env():
            raise ValueError(
                "The can_start_ingest flag should only be used for regions where ingest is not yet launched in a "
                "particular environment. If we want to be able to selectively pause ingest processing for a state, we "
                "will first have to build a config that is respected by both the /ensure_all_file_paths_normalized "
                "endpoint and any cloud functions that trigger ingest.")

        unnormalized_paths = self.fs.get_unnormalized_file_paths(
            self.ingest_directory_path)

        unnormalized_path_file_type = (
            GcsfsDirectIngestFileType.RAW_DATA
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            GcsfsDirectIngestFileType.UNSPECIFIED)

        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(
                path, file_type=unnormalized_path_file_type)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if not can_start_ingest:
            logging.warning(
                "Ingest not configured to start post-file normalization - returning."
            )
            return

        check_is_region_launched_in_env(self.region)

        unprocessed_raw_paths = []

        ingest_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            None)
        unprocessed_ingest_view_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_directory_path,
            file_type_filter=ingest_file_type_filter)
        if self.region.is_raw_vs_ingest_file_name_detection_enabled():
            unprocessed_raw_paths = self.fs.get_unprocessed_file_paths(
                self.ingest_directory_path,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
            )
            self._register_all_new_paths_in_metadata(unprocessed_raw_paths)

            if self.region.are_ingest_view_exports_enabled_in_env():
                self._register_all_new_paths_in_metadata(
                    unprocessed_ingest_view_paths)

        unprocessed_paths = unprocessed_raw_paths + unprocessed_ingest_view_paths
        did_split = False
        for path in unprocessed_ingest_view_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            if self.region.are_ingest_view_exports_enabled_in_env():
                post_split_unprocessed_ingest_view_paths = (
                    self.fs.get_unprocessed_file_paths(
                        self.ingest_directory_path,
                        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
                    ))
                self._register_all_new_paths_in_metadata(
                    post_split_unprocessed_ingest_view_paths)

            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if unprocessed_paths:
            self.schedule_next_ingest_job_or_wait_if_necessary(
                just_finished_job=False)
Example #13
    def handle_new_files(self, can_start_ingest: bool):
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, then schedules the next ingest
        job if allowed.


        Should only be called from the scheduler queue.
        """
        unnormalized_paths = self.fs.get_unnormalized_file_paths(
            self.ingest_directory_path)

        unnormalized_path_file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else GcsfsDirectIngestFileType.UNSPECIFIED

        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(
                path, file_type=unnormalized_path_file_type)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if not can_start_ingest:
            logging.warning(
                "Ingest not configured to start post-file normalization - returning."
            )
            return

        check_is_region_launched_in_env(self.region)

        unprocessed_raw_paths = []

        ingest_file_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        unprocessed_ingest_view_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_directory_path,
            file_type_filter=ingest_file_type_filter)
        if self.region.is_raw_vs_ingest_file_name_detection_enabled():
            unprocessed_raw_paths = self.fs.get_unprocessed_file_paths(
                self.ingest_directory_path,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self._register_all_new_paths_in_metadata(unprocessed_raw_paths)

            if self.region.are_ingest_view_exports_enabled_in_env():
                self._register_all_new_paths_in_metadata(
                    unprocessed_ingest_view_paths)

        unprocessed_paths = unprocessed_raw_paths + unprocessed_ingest_view_paths
        did_split = False
        for path in unprocessed_ingest_view_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            if self.region.are_ingest_view_exports_enabled_in_env():
                post_split_unprocessed_ingest_view_paths = \
                    self.fs.get_unprocessed_file_paths(self.ingest_directory_path,
                                                       file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
                self._register_all_new_paths_in_metadata(
                    post_split_unprocessed_ingest_view_paths)

            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if unprocessed_paths:
            self.schedule_next_ingest_job_or_wait_if_necessary(
                just_finished_job=False)
Example #14
    def schedule_next_ingest_job_or_wait_if_necessary(
        self, just_finished_job: bool
    ) -> None:
        """Creates a cloud task to run the next ingest job. Depending on the
        next job's IngestArgs, we either post a task to direct/scheduler/ if
        a wait_time is specified or direct/process_job/ if we can run the next
        job immediately."""
        check_is_region_launched_in_env(self.region)

        if self._schedule_any_pre_ingest_tasks():
            logging.info("Found pre-ingest tasks to schedule - returning.")
            return

        if self.lock_manager.is_locked(self.ingest_process_lock_for_region()):
            logging.info("Direct ingest is already locked on region [%s]", self.region)
            return

        process_job_queue_info = self.cloud_task_manager.get_process_job_queue_info(
            self.region
        )
        if process_job_queue_info.size() and not just_finished_job:
            logging.info(
                "Already running job [%s] - will not schedule another job for "
                "region [%s]",
                process_job_queue_info.task_names[0],
                self.region.region_code,
            )
            return

        next_job_args = self._get_next_job_args()

        if not next_job_args:
            logging.info(
                "No more jobs to run for region [%s] - returning",
                self.region.region_code,
            )
            return

        if process_job_queue_info.is_task_queued(self.region, next_job_args):
            logging.info(
                "Already have task queued for next job [%s] - returning.",
                self._job_tag(next_job_args),
            )
            return

        if self.lock_manager.is_locked(
            postgres_to_bq_lock_name_for_schema(
                schema_type_for_system_level(self.system_level)
            )
        ) or self.lock_manager.is_locked(
            postgres_to_bq_lock_name_for_schema(SchemaType.OPERATIONS)
        ):
            logging.info(
                "Postgres to BigQuery export is running, cannot run ingest - returning"
            )
            return

        # TODO(#3020): Add similar logic between the raw data BQ import and ingest view export tasks
        # TODO(#3162): Delete this wait logic from here once all regions have been transitioned to a SQL
        #  preprocessing model.
        wait_time_sec = self._wait_time_sec_for_next_args(next_job_args)
        logging.info(
            "Found next ingest job to run [%s] with wait time [%s].",
            self._job_tag(next_job_args),
            wait_time_sec,
        )

        if wait_time_sec:
            scheduler_queue_info = self.cloud_task_manager.get_scheduler_queue_info(
                self.region
            )
            if scheduler_queue_info.size() <= 1:
                logging.info(
                    "Creating cloud task to fire timer in [%s] seconds", wait_time_sec
                )
                self.cloud_task_manager.create_direct_ingest_scheduler_queue_task(
                    region=self.region, just_finished_job=False, delay_sec=wait_time_sec
                )
            else:
                logging.info(
                    "[%s] tasks already in the scheduler queue for region "
                    "[%s] - not queueing another task.",
                    str(scheduler_queue_info.size()),
                    self.region.region_code,
                )
        else:
            logging.info(
                "Creating cloud task to run job [%s]", self._job_tag(next_job_args)
            )
            self.cloud_task_manager.create_direct_ingest_process_job_task(
                region=self.region, ingest_args=next_job_args
            )
            self._on_job_scheduled(next_job_args)
Example #15
    def handle_new_files(self, can_start_ingest: bool) -> None:
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, then schedules the next ingest
        job if allowed.


        Should only be called from the scheduler queue.
        """
        if not can_start_ingest and self.region.is_ingest_launched_in_env():
            raise ValueError(
                "The can_start_ingest flag should only be used for regions where ingest is not yet launched in a "
                "particular environment. If we want to be able to selectively pause ingest processing for a state, we "
                "will first have to build a config that is respected by both the /ensure_all_raw_file_paths_normalized "
                "endpoint and any cloud functions that trigger ingest.")

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        unnormalized_paths = self.fs.get_unnormalized_file_paths(
            self.ingest_bucket_path)

        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(
                path, file_type=GcsfsDirectIngestFileType.RAW_DATA)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if not can_start_ingest:
            logging.warning(
                "Ingest not configured to start post-file normalization - returning."
            )
            return

        check_is_region_launched_in_env(self.region)

        unprocessed_ingest_view_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_bucket_path,
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        )
        unprocessed_raw_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_bucket_path,
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
        )
        if (unprocessed_raw_paths
                and self.ingest_instance == DirectIngestInstance.SECONDARY):
            raise ValueError(
                f"Raw data import not supported from SECONDARY ingest bucket "
                f"[{self.ingest_bucket_path}], but found {len(unprocessed_raw_paths)} "
                f"raw files. All raw files should be removed from this bucket and "
                f"uploaded to the primary ingest bucket, if appropriate.")

        self._register_all_new_paths_in_metadata(unprocessed_raw_paths)

        self._register_all_new_paths_in_metadata(unprocessed_ingest_view_paths)

        unprocessed_paths = unprocessed_raw_paths + unprocessed_ingest_view_paths
        did_split = False
        for path in unprocessed_ingest_view_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            post_split_unprocessed_ingest_view_paths = (
                self.fs.get_unprocessed_file_paths(
                    self.ingest_bucket_path,
                    file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
                ))
            self._register_all_new_paths_in_metadata(
                post_split_unprocessed_ingest_view_paths)

            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if unprocessed_paths:
            self.schedule_next_ingest_job(just_finished_job=False)
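
Usage sketch (hedged): the wrapper below is an assumption; only the handle_new_files(can_start_ingest=...) call comes from the example above. It shows the intended call shape from the scheduler queue, where the flag typically arrives as a string request parameter.

# Hypothetical scheduler-queue handler.
def handle_new_files_task(controller, can_start_ingest_param: str) -> None:
    can_start_ingest = can_start_ingest_param.lower() == "true"
    controller.handle_new_files(can_start_ingest=can_start_ingest)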
Example #16
    def schedule_next_ingest_job(self, just_finished_job: bool) -> None:
        """Creates a cloud task to run a /process_job request for the file, which will
        process and commit the contents to Postgres."""
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        if self._schedule_any_pre_ingest_tasks():
            logging.info("Found pre-ingest tasks to schedule - returning.")
            return

        if self.region_lock_manager.is_locked():
            logging.info("Direct ingest is already locked on region [%s]",
                         self.region)
            return

        process_job_queue_info = self.cloud_task_manager.get_process_job_queue_info(
            self.region,
            self.ingest_instance,
        )
        if (process_job_queue_info.tasks_for_instance(
                region_code=self.region_code(),
                ingest_instance=self.ingest_instance)
                and not just_finished_job):
            logging.info(
                "Already running job [%s] - will not schedule another job for "
                "region [%s]",
                process_job_queue_info.task_names[0],
                self.region.region_code,
            )
            return

        next_job_args = self._get_next_job_args()

        if not next_job_args:
            logging.info(
                "No more jobs to run for region [%s] - returning",
                self.region.region_code,
            )
            return

        if process_job_queue_info.is_task_queued(self.region, next_job_args):
            logging.info(
                "Already have task queued for next job [%s] - returning.",
                self._job_tag(next_job_args),
            )
            return

        if not self.region_lock_manager.can_proceed():
            logging.info(
                "Postgres to BigQuery export is running, cannot run ingest - returning"
            )
            return

        logging.info("Creating cloud task to run job [%s]",
                     self._job_tag(next_job_args))
        self.cloud_task_manager.create_direct_ingest_process_job_task(
            region=self.region,
            ingest_instance=self.ingest_instance,
            ingest_args=next_job_args,
        )
        self._on_job_scheduled(next_job_args)