def test_move_to_storage_with_conflict(self):
    """Process the same file name twice and check both copies are stored.

    Storage is expected to hold exactly two paths afterwards: one ending
    in ``test_file.csv`` and one ending in ``test_file-(1).csv``.
    """
    fake_fs = FakeDirectIngestGCSFileSystem()
    upload_time = datetime.datetime.now()

    # The second pass uploads a file with a duplicate name that has
    # already been moved to storage by the first pass.
    for _ in range(2):
        self.fully_process_file(
            fake_fs,
            upload_time,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

    # pylint: disable=protected-access
    stored = fake_fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
    self.assertEqual(len(stored), 2)

    stored_abs_paths = [entry.abs_path() for entry in stored]
    self.assertTrue(
        any(p.endswith('test_file.csv') for p in stored_abs_paths))
    self.assertTrue(
        any(p.endswith('test_file-(1).csv') for p in stored_abs_paths))
def test_move_to_storage_with_conflict_with_file_types(self):
    """Process a duplicate file name with file type differentiation on.

    Storage is expected to hold exactly four paths afterwards, at least
    one ending in ``test_file.csv`` and at least one ending in
    ``test_file-(1).csv``.
    """
    upload_time = datetime.datetime.now()

    # The second pass uploads a file with a duplicate name that has
    # already been moved to storage by the first pass.
    for _ in range(2):
        self.fully_process_file(
            upload_time,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'),
            file_type_differentiation_on=True)

    # pylint: disable=protected-access
    stored = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, '', file_type_filter=None)
    self.assertEqual(len(stored), 4)

    stored_abs_paths = [entry.abs_path() for entry in stored]
    self.assertTrue(
        any(p.endswith('test_file.csv') for p in stored_abs_paths))
    self.assertTrue(
        any(p.endswith('test_file-(1).csv') for p in stored_abs_paths))
def test_direct_ingest_multiple_file_moves(self):
    """Fully process two differently named files, one after the other."""
    # NOTE(review): a second method with this exact name is visible later
    # in this file view; if both sit in the same class the later
    # definition wins and this test never runs - confirm.
    for blob_name in ('test_file.csv', 'test_file_2.csv'):
        # A fresh timestamp is taken for each file.
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name))
def test_direct_ingest_multiple_file_moves(self):
    """Fully process two differently named files on one fake file system."""
    # NOTE(review): an earlier method with this exact name is visible in
    # this file view; if both sit in the same class this definition
    # replaces the earlier one - confirm.
    fake_fs = FakeDirectIngestGCSFileSystem()
    for blob_name in ('test_file.csv', 'test_file_2.csv'):
        # A fresh timestamp is taken for each file.
        self.fully_process_file(
            fake_fs,
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name))
def test_direct_ingest_multiple_file_moves_with_file_types(self):
    """Fully process two differently named files with file types enabled."""
    for blob_name in ('test_file.csv', 'test_file_2.csv'):
        # A fresh timestamp is taken for each file.
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name),
            file_type_differentiation_on=True)
def _move_files_for_date(self, subdir_path_str: str) -> None:
    """Move every ``*.csv`` file found under one date subdirectory.

    The last component of the subdirectory path is parsed as an ISO date
    and re-expressed as a ``YYYY/MM/DD/`` fragment. Each CSV is then
    moved to a destination built from the region storage bucket, the
    region code, the RAW_DATA file type value, that date fragment and the
    file name, after being passed through
    ``to_normalized_processed_file_path_from_normalized_path``.

    Args:
        subdir_path_str: Absolute path of the date subdirectory to move
            files out of. It is interpolated directly in front of
            ``*.csv``, so it presumably needs a trailing ``/`` - confirm
            against the caller.

    Raises:
        ValueError: From ``date.fromisoformat`` when the last path
            component is not an ISO-format date.
    """
    # Drop any trailing slash before parsing the directory path.
    from_dir_path = GcsfsDirectoryPath.from_absolute_path(
        subdir_path_str.rstrip('/'))

    # The final path component is the date in its current (ISO) form.
    previous_date_format = from_dir_path.relative_path.rstrip('/').split(
        '/')[-1]
    # Re-express the date as a year/month/day directory fragment.
    new_date_format = date.fromisoformat(previous_date_format).strftime(
        "%Y/%m/%d/")

    from_paths = gsutil_ls(f'{subdir_path_str}*.csv')
    for from_path in from_paths:
        # The GcsfsFilePath is built only to read back `file_name`.
        file_name = GcsfsFilePath(
            bucket_name=self.region_storage_dir_path.bucket_name,
            blob_name=from_path).file_name

        to_file_path = os.path.join(
            'gs://',
            self.region_storage_dir_path.bucket_name,
            self.region_code,
            GcsfsDirectIngestFileType.RAW_DATA.value,
            new_date_format,
            file_name)

        normalized_to_file_path = to_normalized_processed_file_path_from_normalized_path(
            to_file_path, file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

        to_path = normalized_to_file_path

        # In dry-run mode nothing is moved, but the planned move is still
        # recorded below.
        if not self.dry_run:
            gsutil_mv(from_path=from_path, to_path=to_path)
        # move_list is only appended to while holding the mutex;
        # presumably this method runs on several threads - confirm.
        with self.mutex:
            self.move_list.append((from_path, to_path))

    # NOTE(review): indentation was lost in the reviewed copy of this
    # method; the progress tick is assumed to run once per call, after the
    # loop and outside the mutex - confirm against the original layout.
    if self.move_progress:
        self.move_progress.next()