def _split_file(self, path: GcsfsFilePath, file_contents_handle: GcsfsFileContentsHandle) -> None:
    """Splits the CSV behind |file_contents_handle| into chunks of at most
    |file_split_line_limit| rows, uploads each chunk to Cloud Storage, then
    moves the original |path| to the storage directory.
    """
    destination_dir = GcsfsDirectoryPath.from_file_path(path)

    # Phase 1: read the CSV chunk-by-chunk and pair every chunk with the
    # split path it will be written to. All chunks are collected before any
    # upload starts, so a read failure aborts the split with no partial
    # uploads left behind.
    pending_uploads = []
    chunk_reader = pd.read_csv(
        file_contents_handle.local_file_path,
        dtype=str,
        chunksize=self.file_split_line_limit,
        keep_default_na=False)
    for split_num, chunk in enumerate(chunk_reader):
        split_path = self._create_split_file_path(
            path, destination_dir, split_num=split_num)
        pending_uploads.append((split_path, chunk))

    # Phase 2: write every split out to Cloud Storage.
    for split_path, chunk in pending_uploads:
        logging.info("Writing file split [%s] to Cloud Storage.",
                     split_path.abs_path())
        self.fs.upload_from_string(
            split_path, chunk.to_csv(index=False), 'text/csv')

    logging.info("Done splitting file [%s] into [%s] paths, returning.",
                 path.abs_path(), len(pending_uploads))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
    """Files dropped inside a subdirectory of the ingest bucket must be left
    unprocessed, while top-level files are processed and moved to storage as
    usual.
    """
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=False)

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    subdir_path = path_for_fixture_file(
        controller, 'subdir/', should_normalize=False)

    fixture_names = [
        'subdir/Unexpected_Tag.csv',
        'tagA.csv',
        'tagB.csv',
        'tagC.csv',
        'subdir/tagC_2.csv',
    ]
    all_fixture_paths = [subdir_path] + [
        path_for_fixture_file(controller, name, should_normalize=False)
        for name in fixture_names
    ]
    for fixture_path in all_fixture_paths:
        controller.fs.test_add_path(fixture_path)

    run_task_queues_to_empty(controller)

    # Bucket paths after processing, partitioned by where they ended up.
    dir_paths_found = []
    storage_file_paths = []
    ingest_file_paths = []
    for found_path in controller.fs.all_paths:
        if isinstance(found_path, GcsfsDirectoryPath):
            dir_paths_found.append(found_path)
        elif found_path.abs_path().startswith(
                controller.storage_directory_path.abs_path()):
            storage_file_paths.append(found_path)
        else:
            self.assertTrue(found_path.abs_path().startswith(
                controller.ingest_directory_path.abs_path()))
            ingest_file_paths.append(found_path)

    # The subdirectory itself is the only directory left behind.
    self.assertEqual(1, len(dir_paths_found))
    self.assertEqual(subdir_path, dir_paths_found[0])

    # Top-level files were processed and moved to storage.
    self.assertEqual(3, len(storage_file_paths))
    storage_tags = {
        filename_parts_from_path(stored).file_tag
        for stored in storage_file_paths
    }
    self.assertEqual({'tagA', 'tagB', 'tagC'}, storage_tags)
    for stored in storage_file_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(stored))
        self.assertTrue(controller.fs.is_processed_file(stored))

    # Files inside the subdirectory stayed in the ingest bucket, unprocessed.
    self.assertEqual(2, len(ingest_file_paths))
    ingest_tags = {
        filename_parts_from_path(remaining).file_tag
        for remaining in ingest_file_paths
    }
    self.assertEqual({'tagC', 'Unexpected_Tag'}, ingest_tags)
    for remaining in ingest_file_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(remaining))
        self.assertTrue(controller.fs.is_seen_unprocessed_file(remaining))
        self.assertEqual(subdir_path,
                         GcsfsDirectoryPath.from_file_path(remaining))
def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Checks if the given file needs to be split according to this
    controller's |file_split_line_limit|, and performs the split if so.

    Args:
        path: Path to the file in the ingest bucket that may need splitting.

    Returns:
        True if the file was split, False if splitting was not necessary.

    Raises:
        ValueError: if ingest view exports are enabled and the file is not an
            ingest view file.
    """
    should_split = self._should_split_file(path)
    if not should_split:
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    # Look up metadata for the original file up front so the splits can be
    # registered against it once all copies have succeeded.
    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(
            path)

    output_dir = GcsfsDirectoryPath.from_file_path(path)

    split_contents_paths = self._split_file(path)
    upload_paths = []
    for i, split_contents_path in enumerate(split_contents_paths):
        upload_path = self._create_split_file_path(path, output_dir,
                                                   split_num=i)
        logging.info(
            "Copying split [%s] to direct ingest directory at path [%s].",
            i, upload_path.abs_path())

        upload_paths.append(upload_path)
        try:
            self.fs.mv(split_contents_path, upload_path)
        except Exception:
            # logging.exception records the active traceback along with the
            # message, so no detail is lost before we clean up and re-raise.
            logging.exception(
                'Threw error while copying split files from temp bucket - '
                'attempting to clean up before rethrowing.')
            # Best-effort cleanup of splits already copied to the ingest
            # directory so the bucket is not left in a partial state.
            for p in upload_paths:
                self.fs.delete(p)
            # Bare raise preserves the original exception and traceback
            # (unlike `raise e`, which re-raises from this frame).
            raise

    # We wait to register files with metadata manager until all files have
    # been successfully copied to avoid leaving the metadata manager in an
    # inconsistent state.
    if self.region.are_ingest_view_exports_enabled_in_env():
        if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
            raise ValueError(
                'Attempting to split a non-ingest view type file')

        logging.info(
            'Registering [%s] split files with the metadata manager.',
            len(upload_paths))

        for upload_path in upload_paths:
            ingest_file_metadata = \
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)
        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(split_contents_paths))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
    return True
def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Checks if the given file needs to be split according to this
    controller's |file_split_line_limit|.

    Returns True if the file was split, False if splitting was not necessary
    or not possible.
    """
    parts = filename_parts_from_path(path)

    # Only ingest view files may be split once raw/ingest file-type detection
    # is enabled for this region.
    if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
            parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
        raise ValueError(f'Should not be attempting to split files other than ingest view files, found path with '
                         f'file type: {parts.file_type}')

    # Unknown tags are not processed by this controller, so splitting them
    # would be wasted work.
    if parts.file_tag not in self.get_file_tag_rank_list():
        logging.info("File tag [%s] for path [%s] not in rank list - "
                     "not splitting.", parts.file_tag, path.abs_path())
        return False

    # Already-split files at or under the limit need no further splitting.
    if parts.is_file_split and \
            parts.file_split_size and \
            parts.file_split_size <= self.file_split_line_limit:
        logging.info("File [%s] already split with size [%s].",
                     path.abs_path(), parts.file_split_size)
        return False

    file_contents_handle = self._get_contents_handle_from_path(path)
    if not file_contents_handle:
        logging.info("File [%s] has no rows - not splitting.",
                     path.abs_path())
        return False

    # If the contents are small enough to ingest directly, skip splitting.
    if self._can_proceed_with_ingest_for_contents(file_contents_handle):
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    split_contents_handles = self._split_file(path, file_contents_handle)

    # Metadata for the original file, fetched up front so each split can be
    # registered against it below (only when exports are enabled).
    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(path)

    output_dir = GcsfsDirectoryPath.from_file_path(path)

    for i, split_contents_handle in enumerate(split_contents_handles):
        upload_path = self._create_split_file_path(path, output_dir,
                                                   split_num=i)

        # Register the split with the metadata manager BEFORE uploading, then
        # mark it exported after the upload succeeds.
        ingest_file_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')
            ingest_file_metadata = \
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)

        logging.info("Writing file split [%s] to Cloud Storage.",
                     upload_path.abs_path())
        self.fs.upload_from_contents_handle(upload_path,
                                            split_contents_handle,
                                            self._contents_type())

        if self.region.are_ingest_view_exports_enabled_in_env():
            if not ingest_file_metadata:
                raise ValueError(f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]')
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)

    # Only after every split is uploaded is the original file marked
    # processed and moved out of the ingest bucket.
    if self.region.are_ingest_view_exports_enabled_in_env():
        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(split_contents_handles))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
    return True
def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Splits the file at |path| into smaller files when required by this
    controller's |file_split_line_limit|.

    Returns:
        True if the file was split, False if splitting was not necessary.
    """
    should_split = self._should_split_file(path)
    if not should_split:
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    # Metadata for the original file, fetched before splitting so each split
    # can be registered against it (only when exports are enabled).
    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(
            path)

    destination_dir = GcsfsDirectoryPath.from_file_path(path)

    temp_split_paths = self._split_file(path)
    for split_num, temp_split_path in enumerate(temp_split_paths):
        final_split_path = self._create_split_file_path(
            path, destination_dir, split_num=split_num)

        # Register the split BEFORE moving it into the ingest directory, and
        # mark it exported only after the move succeeds.
        split_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')
            split_metadata = \
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, final_split_path)

        logging.info(
            "Copying split [%s] to direct ingest directory at path [%s].",
            split_num, final_split_path.abs_path())

        self.fs.mv(temp_split_path, final_split_path)

        if self.region.are_ingest_view_exports_enabled_in_env():
            if not split_metadata:
                raise ValueError(
                    f'Split file metadata for path unexpectedly none [{final_split_path.abs_path()}]'
                )
            self.file_metadata_manager.mark_ingest_view_exported(
                split_metadata)

    # The original file is only marked processed once every split is in place.
    if self.region.are_ingest_view_exports_enabled_in_env():
        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(temp_split_paths))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
    return True