def check_all_paths_processed(test_case: unittest.TestCase,
                              controller: GcsfsDirectIngestController,
                              file_tags: List[str],
                              unexpected_tags: List[str]):
    """Checks that all non-directory paths with expected tags have been
    processed and moved to storage.
    """

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    file_tags_processed = set()
    for path in controller.fs.all_paths:
        if isinstance(path, GcsfsDirectoryPath):
            continue

        file_tag = filename_parts_from_path(path).file_tag

        if file_tag not in unexpected_tags:
            # Test all expected files have been moved to storage
            test_case.assertTrue(
                path.abs_path().startswith(
                    controller.storage_directory_path.abs_path()),
                f'{path} has not been moved to correct storage directory')

            file_tags_processed.add(filename_parts_from_path(path).file_tag)
        else:
            test_case.assertTrue(path.file_name.startswith('unprocessed'))

    # Test that each expected file tag has been processed
    test_case.assertEqual(file_tags_processed,
                          set(file_tags).difference(set(unexpected_tags)))
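
All of the examples on this page revolve around filename_parts_from_path, whose implementation is not shown. As orientation only, the sketch below is a hypothetical, simplified stand-in that parses the handful of fields the examples actually read (file_tag, date_str, utc_upload_datetime, filename_suffix, extension). The filename layout is inferred from the fixtures above; the real helper in pulse-data is more involved (for instance, it distinguishes multi-word file tags from suffixes, which this sketch does not).

import datetime
import re
from typing import NamedTuple, Optional


class SketchFilenameParts(NamedTuple):
    """Subset of the fields these examples read off filename_parts_from_path."""
    file_tag: str
    utc_upload_datetime: datetime.datetime
    date_str: str
    filename_suffix: Optional[str]
    extension: str


# Assumed layout: {unprocessed|processed}_{timestamp}_{file type?}_{tag}{_suffix?}.{ext}
_SKETCH_NORMALIZED_RE = re.compile(
    r"(?:un)?processed_"
    r"(?P<ts>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}:\d{6})_"
    r"(?:(?:raw|ingest_view)_)?"
    r"(?P<tag>[A-Za-z0-9]+)"
    r"(?:_(?P<suffix>.+))?"
    r"\.(?P<ext>[A-Za-z]+)$"
)


def sketch_filename_parts(file_name: str) -> SketchFilenameParts:
    match = _SKETCH_NORMALIZED_RE.match(file_name)
    if not match:
        raise ValueError(f"Unexpected file name [{file_name}]")
    upload_dt = datetime.datetime.strptime(match.group("ts"),
                                           "%Y-%m-%dT%H:%M:%S:%f")
    return SketchFilenameParts(
        file_tag=match.group("tag"),
        utc_upload_datetime=upload_dt,
        date_str=upload_dt.date().isoformat(),
        filename_suffix=match.group("suffix"),
        extension=match.group("ext"),
    )


parts = sketch_filename_parts(
    "unprocessed_2019-08-12T00:00:00:000000_ingest_view_tagC_00001_file_split_size1.csv")
assert parts.file_tag == "tagC"
assert parts.date_str == "2019-08-12"
assert parts.filename_suffix == "00001_file_split_size1"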
Example #2
    def test_process_file_that_needs_splitting(self):
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=True)

        # Set line limit to 1
        controller.file_split_line_limit = 1

        # pylint:disable=protected-access
        file_tags = list(sorted(controller._get_file_tag_rank_list()))

        add_paths_with_tags_and_process(self,
                                        controller,
                                        file_tags,
                                        pre_normalize_filename=True)

        processed_split_file_paths = defaultdict(list)
        for path in controller.fs.all_paths:
            if self._path_in_split_file_storage_subdir(path, controller):
                file_tag = filename_parts_from_path(path).file_tag
                processed_split_file_paths[file_tag].append(path)

        self.assertEqual(1, len(processed_split_file_paths.keys()))
        self.assertEqual(2, len(processed_split_file_paths['tagC']))

        found_suffixes = {
            filename_parts_from_path(p).filename_suffix
            for p in processed_split_file_paths['tagC']
        }
        self.assertEqual(found_suffixes,
                         {'00001_file_split_size1', '00002_file_split_size1'})
Example #3
def add_paths_with_tags_and_process(test_case: unittest.TestCase,
                                    controller: GcsfsDirectIngestController,
                                    file_tags: List[str],
                                    unexpected_tags: Optional[List[str]] = None):
    """Runs a test that queues files for all the provided file tags, waits
    for the controller to finish processing everything, then makes sure that
    all files not in |unexpected_tags| have been moved to storage.
    """
    if unexpected_tags is None:
        unexpected_tags = []

    for file_tag in file_tags:
        args = ingest_args_for_fixture_file(controller, f'{file_tag}.csv')
        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        controller.fs.test_add_path(args.file_path)

        controller.kick_scheduler(just_finished_job=False)
        time.sleep(.05)

    if isinstance(controller.cloud_task_manager,
                  FakeAsyncDirectIngestCloudTaskManager):
        controller.cloud_task_manager.wait_for_all_tasks_to_run()
    elif isinstance(controller.cloud_task_manager,
                    FakeSynchronousDirectIngestCloudTaskManager):
        tm = controller.cloud_task_manager
        while tm.get_scheduler_queue_info(controller.region).size() \
                or tm.get_process_job_queue_info(controller.region).size():
            if tm.get_scheduler_queue_info(controller.region).size():
                tm.test_run_next_scheduler_task()
                tm.test_pop_finished_scheduler_task()
            if tm.get_process_job_queue_info(controller.region).size():
                tm.test_run_next_process_job_task()
                tm.test_pop_finished_process_job_task()
    else:
        raise ValueError(f"Unexpected type for cloud task manager: "
                         f"[{type(controller.cloud_task_manager)}]")

    file_tags_processed = set()
    for path in controller.fs.all_paths:
        file_tag = filename_parts_from_path(path).file_tag

        if file_tag not in unexpected_tags:
            # Test all expected files have been moved to storage
            test_case.assertTrue(
                path.startswith(controller.storage_directory_path),
                f'{path} does not start with expected prefix')

            file_tags_processed.add(filename_parts_from_path(path).file_tag)
        else:
            _, file_name = os.path.split(path)
            test_case.assertTrue(file_name.startswith('unprocessed'))

    # Test that each expected file tag has been processed
    test_case.assertEqual(file_tags_processed,
                          set(file_tags).difference(set(unexpected_tags)))
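
For completeness, a hypothetical test sketch that exercises the |unexpected_tags| parameter of this helper (the controller fixture and the 'tagWeDoNotIngest' tag are assumptions, mirroring the pattern of Example #2):

    def test_unexpected_tag_not_moved_to_storage(self):
        # Hypothetical sketch: 'tagWeDoNotIngest' is assumed to be a tag the
        # controller does not process, so its file should stay unprocessed.
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=True)

        add_paths_with_tags_and_process(
            self,
            controller,
            file_tags=['tagA', 'tagWeDoNotIngest'],
            unexpected_tags=['tagWeDoNotIngest'])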
    def _is_last_job_for_day(self, args: GcsfsIngestArgs) -> bool:
        """Returns True if the file handled in |args| is the last file for that
        upload date."""
        parts = filename_parts_from_path(args.file_path)
        upload_date, date_str = parts.utc_upload_datetime, parts.date_str
        more_jobs_expected = \
            self.file_prioritizer.are_more_jobs_expected_for_day(date_str)
        if more_jobs_expected:
            return False
        next_job_args = self.file_prioritizer.get_next_job_args(date_str)
        if next_job_args:
            next_job_date = filename_parts_from_path(
                next_job_args.file_path).utc_upload_datetime
            return next_job_date > upload_date
        return True
    def has_file_been_discovered(self, path: GcsfsFilePath) -> bool:
        parts = filename_parts_from_path(path)

        try:
            metadata = self.get_file_metadata(path)
        except ValueError as e:
            if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
                raise e
            return False

        if not metadata:
            raise ValueError(f"Metadata unexpectedly None for path [{path.abs_path()}]")

        # TODO(#3020): Design/handle/write tests for case where this is a file we've moved from storage for a
        #  rerun. How do we accurately detect when this is happening?
        if isinstance(metadata, DirectIngestRawFileMetadata):
            return True

        if isinstance(metadata, DirectIngestIngestFileMetadata):
            if metadata.discovery_time is None:
                return False
            return True

        raise ValueError(
            f"Unexpected metadata type [{type(metadata)}] for path [{path.abs_path()}]"
        )
Example #6
    def _ls_with_file_prefix(
        self,
        directory_path: GcsfsDirectoryPath,
        file_prefix: str,
        file_type_filter: Optional[GcsfsDirectIngestFileType],
    ) -> List[GcsfsFilePath]:
        """Returns absolute paths of files in the directory with the given |file_prefix|."""
        blob_prefix = os.path.join(
            *[directory_path.relative_path, file_prefix])
        blob_paths = self.gcs_file_system.ls_with_blob_prefix(
            directory_path.bucket_name, blob_prefix)

        result = []
        for path in blob_paths:
            if not isinstance(path, GcsfsFilePath):
                continue

            if not file_type_filter:
                result.append(path)
                continue

            file_type = filename_parts_from_path(path).file_type
            if file_type == GcsfsDirectIngestFileType.UNSPECIFIED:
                raise ValueError(
                    f"Found path {path.abs_path()} with unexpected UNSPECIFIED type."
                )

            if file_type == file_type_filter:
                result.append(path)

        return result
Example #7
def _to_normalized_file_path_from_normalized_path(
    original_normalized_file_path: str,
    build_function: Callable,
    file_type_override: Optional[GcsfsDirectIngestFileType] = None,
) -> str:
    """Moves any normalized path back to a unprocessed/processed path with the same information embedded in the file
    name. If |file_type_override| is provided, we will always overwrite the original path file type with the override
    file type."""

    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime(
        "%Y-%m-%dT%H:%M:%S:%f")

    suffix_str = f"_{parts.filename_suffix}" if parts.filename_suffix else ""
    base_file_name = f"{parts.file_tag}{suffix_str}"

    path_to_return = build_function(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension,
    )

    return os.path.join(directory, path_to_return)
    def import_raw_file_to_big_query(
            self, path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata) -> None:
        """Import a raw data file at the given path to the appropriate raw data table in BigQuery."""

        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f'Cannot import raw files for region [{self.region.region_code}]'
            )

        parts = filename_parts_from_path(path)
        if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
            raise ValueError(
                f'Attempting to import raw file with tag [{parts.file_tag}] unspecified by [{self.region.region_code}] '
                f'config.')

        if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
            raise ValueError(
                f'Unexpected file type [{parts.file_type}] for path [{parts.file_tag}].'
            )

        logging.info('Beginning BigQuery upload of raw file [%s]',
                     path.abs_path())

        temp_output_paths = self._upload_contents_to_temp_gcs_paths(
            path, file_metadata)
        self._load_contents_to_bigquery(path, temp_output_paths)

        logging.info('Completed BigQuery import of [%s]', path.abs_path())
    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        file_type = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime),
        )
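
The suffixes asserted in Example #2 ('00001_file_split_size1', '00002_file_split_size1') follow directly from this naming scheme. A standalone illustration, assuming SPLIT_FILE_SUFFIX has the value 'file_split' (inferred from those assertions) and using 'tagC' as a stand-in for parts.stripped_file_name:

SPLIT_FILE_SUFFIX = 'file_split'  # assumed value, inferred from the test assertions above
split_num = 0
ingest_file_split_line_limit = 1

rank_str = str(split_num + 1).zfill(5)
updated_file_name = (f"tagC_{rank_str}"
                     f"_{SPLIT_FILE_SUFFIX}_size{ingest_file_split_line_limit}"
                     f".csv")
assert updated_file_name == 'tagC_00001_file_split_size1.csv'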
Example #11
    def import_raw_file_to_big_query(
            self, path: GcsfsFilePath,
            file_metadata: DirectIngestRawFileMetadata) -> None:
        """Import a raw data file at the given path to the appropriate raw data table in BigQuery."""
        parts = filename_parts_from_path(path)
        if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
            raise ValueError(
                f"Attempting to import raw file with tag [{parts.file_tag}] unspecified by [{self.region.region_code}] "
                f"config.")

        if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
            raise ValueError(
                f"Unexpected file type [{parts.file_type}] for path [{parts.file_tag}]."
            )

        logging.info("Beginning BigQuery upload of raw file [%s]",
                     path.abs_path())

        temp_output_paths = self._upload_contents_to_temp_gcs_paths(
            path, file_metadata)
        self._load_contents_to_bigquery(path, temp_output_paths)

        migration_queries = self.raw_table_migrations.get(parts.file_tag, [])
        logging.info(
            "Running [%s] migration queries for table [%s]",
            len(migration_queries),
            parts.file_tag,
        )
        for migration_query in migration_queries:
            query_job = self.big_query_client.run_query_async(
                query_str=migration_query)
            # Wait for the migration query to complete before running the next one
            query_job.result()

        logging.info("Completed BigQuery import of [%s]", path.abs_path())
def to_normalized_unprocessed_file_path_from_normalized_path(
        original_normalized_file_path: str,
        file_type_override: Optional[GcsfsDirectIngestFileType] = None
) -> str:
    """Moves any normalized path back to an unprocessed path with the same information embedded in the file name. If
    |file_type_override| is provided, we will always overwrite the original path file type with the override file type.
    """
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime('%Y-%m-%dT%H:%M:%S:%f')

    suffix_str = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    base_file_name = f'{parts.file_tag}{suffix_str}'

    path_as_unprocessed = _build_unprocessed_file_name(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension)

    return os.path.join(directory, path_as_unprocessed)
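
Both normalization helpers format the upload timestamp as '%Y-%m-%dT%H:%M:%S:%f', i.e. with a colon rather than a dot before the microseconds. A minimal round-trip check of that format:

import datetime

dt = datetime.datetime(2020, 1, 1, 12, 30, 45, 123456)
ts = dt.strftime('%Y-%m-%dT%H:%M:%S:%f')
assert ts == '2020-01-01T12:30:45:123456'
assert datetime.datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S:%f') == dt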
Example #13
def collect_file_paths(
    data_discovery_args: DataDiscoveryArgs,
    configs: ConfigsByFileType,
    gcs_files: List[str],
) -> FilesByFileType:
    """ Given a set of configs configs, filter the listed GCS files to only those that match our search filters """
    collected_files = defaultdict(list)

    for found_file in gcs_files:
        try:
            path = GcsfsFilePath.from_absolute_path(found_file)
            file_parts = filename_parts_from_path(path)
        except DirectIngestError as e:
            if e.error_type == DirectIngestErrorType.INPUT_ERROR:
                continue

            logger.exception(e)
            continue

        if (not data_discovery_args.start_date <=
                file_parts.utc_upload_datetime.date() <=
                data_discovery_args.end_date):
            continue

        if file_parts.is_file_split:
            continue

        if file_parts.file_tag in configs[file_parts.file_type]:
            collected_files[file_parts.file_type].append(path)

    return collected_files
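
One detail worth calling out in the date filter above: the 'not' applies to the whole chained comparison, so a file is skipped exactly when its upload date falls outside [start_date, end_date]. A small standalone check of that logic (dates hypothetical):

import datetime

start_date = datetime.date(2021, 1, 1)
end_date = datetime.date(2021, 1, 31)


def outside_window(upload_date: datetime.date) -> bool:
    # Mirrors the `if (not start <= d <= end): continue` filter above.
    return not start_date <= upload_date <= end_date


assert not outside_window(datetime.date(2021, 1, 15))
assert outside_window(datetime.date(2021, 2, 1))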
    def register_ingest_view_export_file_name(
        self,
        metadata_entity: DirectIngestIngestFileMetadata,
        exported_path: GcsfsFilePath,
    ) -> None:
        parts = filename_parts_from_path(exported_path)
        if parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(f"Exported path has unexpected type {parts.file_type}")

        session = SessionFactory.for_schema_base(OperationsBase)

        try:
            metadata = dao.get_file_metadata_row(
                session, GcsfsDirectIngestFileType.INGEST_VIEW, metadata_entity.file_id
            )

            if metadata.normalized_file_name:
                raise ValueError(
                    f"Normalized file name already set to [{metadata.normalized_file_name}] for file id "
                    f"[{metadata.file_id}]"
                )

            metadata.normalized_file_name = exported_path.file_name
            session.commit()
        except Exception as e:
            session.rollback()
            raise e
        finally:
            session.close()
    def mark_file_as_discovered(self, path: GcsfsFilePath) -> None:
        if not path.file_name.startswith(DIRECT_INGEST_UNPROCESSED_PREFIX):
            raise ValueError("Expect only unprocessed paths in this function.")

        parts = filename_parts_from_path(path)
        session = SessionFactory.for_schema_base(OperationsBase)

        try:
            if parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
                metadata = dao.get_file_metadata_row_for_path(
                    session, self.region_code, path
                )
                dt = datetime.datetime.utcnow()
                if not metadata.export_time:
                    metadata.export_time = dt
                metadata.discovery_time = dt
            elif parts.file_type == GcsfsDirectIngestFileType.RAW_DATA:
                session.add(
                    schema.DirectIngestRawFileMetadata(
                        region_code=self.region_code,
                        file_tag=parts.file_tag,
                        normalized_file_name=path.file_name,
                        discovery_time=datetime.datetime.utcnow(),
                        processed_time=None,
                        datetimes_contained_upper_bound_inclusive=parts.utc_upload_datetime,
                    )
                )
            else:
                raise ValueError(f"Unexpected path type: {parts.file_type}")
            session.commit()
        except Exception as e:
            session.rollback()
            raise e
        finally:
            session.close()
    def _can_proceed_with_ingest_for_contents(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle) -> bool:
        parts = filename_parts_from_path(args.file_path)
        return self._are_contents_empty(
            args, contents_handle) or not self._must_split_contents(
                parts.file_type, args.file_path)
    def _should_split_file(self, path: GcsfsFilePath) -> bool:
        """Returns a handle to the contents of this path if this file should be split, None otherwise."""
        parts = filename_parts_from_path(path)

        if (self.region.is_raw_vs_ingest_file_name_detection_enabled()
                and parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW):
            raise ValueError(
                f"Should not be attempting to split files other than ingest view files, found path with "
                f"file type: {parts.file_type}")

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - not splitting.",
                parts.file_tag,
                path.abs_path(),
            )
            return False

        if (parts.is_file_split and parts.file_split_size and
                parts.file_split_size <= self.ingest_file_split_line_limit):
            logging.info(
                "File [%s] already split with size [%s].",
                path.abs_path(),
                parts.file_split_size,
            )
            return False

        return self._must_split_contents(parts.file_type, path)
Example #18
File: dao.py  Project: Leo-Ryu/pulse-data
def get_file_metadata_row_for_path(
    session: Session, region_code: str, path: GcsfsFilePath
) -> Union[schema.DirectIngestRawFileMetadata,
           schema.DirectIngestIngestFileMetadata]:
    """Returns metadata information for the provided path. If the file has not yet been registered in the
    appropriate metadata table, this function will generate a file_id to return with the metadata.
    """

    parts = filename_parts_from_path(path)

    if parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
        results = session.query(
            schema.DirectIngestIngestFileMetadata).filter_by(
                region_code=region_code,
                is_invalidated=False,
                normalized_file_name=path.file_name).all()
    elif parts.file_type == GcsfsDirectIngestFileType.RAW_DATA:
        results = session.query(schema.DirectIngestRawFileMetadata).filter_by(
            region_code=region_code,
            normalized_file_name=path.file_name).all()
    else:
        raise ValueError(f'Unexpected path type: {parts.file_type}')

    if len(results) != 1:
        raise ValueError(
            f'Unexpected number of metadata results for path {path.abs_path()}: [{len(results)}]'
        )

    return one(results)
Example #19
    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str):
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
        if not next_args:
            are_more_jobs_expected = \
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str)
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = \
                filename_parts_from_path(next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info("Found a file [%s] from a date previous to our "
                             "last processed date - not moving anything to "
                             "storage.")
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = \
                next_date_str != last_processed_date_str

        # Note: at this point, we expect RAW file type files to already have been moved once they were imported to BQ.
        file_type_to_move = GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None

        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_directory_path,
            self.storage_directory_path,
            file_type_filter=file_type_to_move,
            date_str_bound=last_processed_date_str,
            include_bound=should_move_last_processed_date)
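
This method (and mv_processed_paths_before_date_to_storage below) compares date_str values as plain strings. That works because the date strings used throughout these examples are ISO-formatted dates (see the isoformat() call in the prioritizer test above), so lexicographic order matches chronological order:

import datetime

# ISO date strings sort lexicographically in chronological order.
assert '2019-08-11' < '2019-08-12' < '2019-09-01'
assert datetime.date(2019, 8, 12).isoformat() == '2019-08-12'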
    def _split_file_if_necessary(self, path: GcsfsFilePath):
        """Checks if the given file needs to be split according to this
        controller's |file_split_line_limit|.
        """
        parts = filename_parts_from_path(path)

        if parts.file_tag not in self._get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - "
                "not splitting.", parts.file_tag, path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        self._split_file(path, file_contents_handle)
        return True
    def test_move_to_storage_with_conflict(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str):
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
        if not next_args:
            are_more_jobs_expected = \
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str)
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = \
                filename_parts_from_path(next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info("Found a file [%s] from a date previous to our "
                             "last processed date - not moving anything to "
                             "storage.")
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = \
                next_date_str != last_processed_date_str

        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_directory_path,
            self.storage_directory_path,
            last_processed_date_str,
            include_bound=should_move_last_processed_date)
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
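
The date reformatting above turns the upload date embedded in the file name into a year/month/day storage subdirectory. A quick standalone check of that conversion (date value hypothetical):

from datetime import date

previous_date_format = '2020-03-05'  # e.g. filename_parts_from_path(...).date_str
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
assert new_date_format == '2020/03/05/'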
Example #24
    def _get_ingest_metadata(self, args: GcsfsIngestArgs) -> IngestMetadata:
        parts = filename_parts_from_path(args.file_path)
        ingest_time = datetime.strptime(cast(str, parts.filename_suffix),
                                        '%m%d%Y_%H%M%S')

        return attr.evolve(super()._get_ingest_metadata(args),
                           ingest_time=ingest_time)
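
The strptime format here implies the fixture filename suffix looks like MMDDYYYY_HHMMSS. A quick check with a hypothetical suffix value:

from datetime import datetime

assert datetime.strptime('11202019_103015', '%m%d%Y_%H%M%S') == \
    datetime(2019, 11, 20, 10, 30, 15)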
    def _upload_contents_to_temp_gcs_paths(
        self, path: GcsfsFilePath, file_metadata: DirectIngestFileMetadata
    ) -> List[Tuple[GcsfsFilePath, List[str]]]:
        """Uploads the contents of the file at the provided path to one or more GCS files, with whitespace stripped and
        additional metadata columns added.

        Returns a list of tuple pairs containing the destination paths and corrected CSV columns for that file.
        """

        logging.info('Starting chunked upload of contents to GCS')

        parts = filename_parts_from_path(path)
        file_config = self.region_raw_file_config.raw_file_configs[
            parts.file_tag]

        columns = self._get_validated_columns(path, file_config)

        delegate = DirectIngestRawDataSplittingGcsfsCsvReaderDelegate(
            path, self.fs, file_metadata, self.temp_output_directory_path)

        self.csv_reader.streaming_read(
            path,
            delegate=delegate,
            chunk_size=self.upload_chunk_size,
            encodings_to_try=file_config.encodings_to_try(),
            index_col=False,
            header=None,
            skiprows=1,
            usecols=columns,
            names=columns,
            keep_default_na=False,
            **self._common_read_csv_kwargs(file_config))

        return delegate.output_paths_with_columns
    def _read_contents_into_dataframes(self,
                                       path: GcsfsFilePath,
                                       contents_handle: GcsfsFileContentsHandle) -> Iterator[pd.DataFrame]:
        parts = filename_parts_from_path(path)
        file_config = self.region_raw_file_config.raw_file_configs[parts.file_tag]

        columns = self._get_validated_columns(file_config, contents_handle)
        try:
            for df in pd.read_csv(
                    contents_handle.local_file_path,
                    sep=file_config.separator,
                    dtype=str,
                    index_col=False,
                    header=None,
                    skiprows=1,
                    encoding=file_config.encoding,
                    quoting=(csv.QUOTE_NONE if file_config.ignore_quotes else csv.QUOTE_MINIMAL),
                    usecols=columns,
                    names=columns,
                    chunksize=self.upload_chunk_size,
                    keep_default_na=False):
                yield df
        except Exception as e:
            logging.error('Failed to parse DataFrame for path [%s] with config [%s]', path.abs_path(), file_config)
            raise e
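
Both CSV-reading helpers above pass header=None, skiprows=1 and names=columns together with dtype=str and keep_default_na=False: skip the file's own header row, force the validated column names, and keep every value as a raw string. A small in-memory sketch of that combination (column names and data hypothetical):

import io

import pandas as pd

csv_text = 'COL_A,COL_B\n1,x\n2,y\n'
reader = pd.read_csv(
    io.StringIO(csv_text),
    dtype=str,
    index_col=False,
    header=None,   # do not treat any row as a header...
    skiprows=1,    # ...and drop the file's own header line instead
    usecols=['col_a', 'col_b'],
    names=['col_a', 'col_b'],
    keep_default_na=False,
    chunksize=10)
df = next(iter(reader))
assert list(df.columns) == ['col_a', 'col_b']
assert df['col_a'].tolist() == ['1', '2']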
Example #27
    def mv_processed_paths_before_date_to_storage(
        self,
        directory_path: GcsfsDirectoryPath,
        storage_directory_path: GcsfsDirectoryPath,
        file_type_filter: Optional[GcsfsDirectIngestFileType],
        date_str_bound: str,
        include_bound: bool,
    ) -> None:
        """Moves all files with timestamps before the provided |date_str_bound| to the appropriate storage location for
        that file. If a |file_type_filter| is provided, only moves files of a certain file type and throws if
        encountering a file of type UNSPECIFIED in the directory path.
        """

        processed_file_paths = self.get_processed_file_paths(
            directory_path, file_type_filter)

        for file_path in processed_file_paths:
            date_str = filename_parts_from_path(file_path).date_str
            if date_str < date_str_bound or (include_bound
                                             and date_str == date_str_bound):
                logging.info(
                    "Found file [%s] from [%s] which abides by provided bound "
                    "[%s]. Moving to storage.",
                    file_path.abs_path(),
                    date_str,
                    date_str_bound,
                )
                self.mv_path_to_storage(file_path, storage_directory_path)
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool) -> None:
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)

            if (parts.is_file_split and parts.file_split_size
                    and parts.file_split_size <=
                    self.ingest_file_split_line_limit):
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split split "
                    "with correct size, kicking scheduler.",
                    path.abs_path(),
                )
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region, can_start_ingest=start_ingest)
Example #29
def get_ingest_file_metadata_row_for_path(
        session: Session, region_code: str, path: GcsfsFilePath,
        ingest_database_name: str) -> schema.DirectIngestIngestFileMetadata:
    """Returns metadata information for the provided path. If the file has not yet been registered in the
    appropriate metadata table, this function will generate a file_id to return with the metadata.
    """

    parts = filename_parts_from_path(path)

    if parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
        raise ValueError(f"Unexpected file type [{parts.file_type}]")

    results = (session.query(schema.DirectIngestIngestFileMetadata).filter_by(
        region_code=region_code.upper(),
        is_invalidated=False,
        normalized_file_name=path.file_name,
        ingest_database_name=ingest_database_name,
    ).all())

    if len(results) != 1:
        raise ValueError(
            f"Unexpected number of metadata results for path {path.abs_path()}: [{len(results)}]"
        )

    return one(results)
Example #30
    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str) -> None:
        """Moves files that have already been ingested/processed, up to and including the given date, into storage,
        if there is nothing more left to ingest/process, i.e. we are not expecting more files."""
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
        if not next_args:
            are_more_jobs_expected = (
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str))
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = filename_parts_from_path(
                next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info("Found a file [%s] from a date previous to our "
                             "last processed date - not moving anything to "
                             "storage.")
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = next_date_str != last_processed_date_str

        # Note: at this point, we expect RAW file type files to already have been moved once they were imported to BQ.
        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_bucket_path,
            self.storage_directory_path,
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
            date_str_bound=last_processed_date_str,
            include_bound=should_move_last_processed_date,
        )