def build_gcsfs_controller_for_tests(
        controller_cls,
        fixture_path_prefix: str,
        run_async: bool,
        **kwargs,
) -> GcsfsDirectIngestController:
    """Builds an instance of |controller_cls| for use in tests with several
    internal classes mocked properly.
    """
    fake_fs = FakeDirectIngestGCSFileSystem()

    def mock_build_fs():
        return fake_fs

    if 'TestGcsfsDirectIngestController' in controller_cls.__name__:
        view_collector_cls: Type[BigQueryViewCollector] = \
            FakeDirectIngestPreProcessedIngestViewCollector
    else:
        view_collector_cls = DirectIngestPreProcessedIngestViewCollector

    with patch(
            f'{BaseDirectIngestController.__module__}.DirectIngestCloudTaskManagerImpl') \
            as mock_task_factory_cls:
        with patch(
                f'{GcsfsDirectIngestController.__module__}.BigQueryClientImpl') \
                as mock_big_query_client_cls:
            with patch(
                    f'{GcsfsDirectIngestController.__module__}.DirectIngestRawFileImportManager',
                    FakeDirectIngestRawFileImportManager):
                with patch(
                        f'{GcsfsDirectIngestController.__module__}.DirectIngestPreProcessedIngestViewCollector',
                        view_collector_cls):
                    task_manager = FakeAsyncDirectIngestCloudTaskManager() \
                        if run_async else FakeSynchronousDirectIngestCloudTaskManager()
                    mock_task_factory_cls.return_value = task_manager
                    mock_big_query_client_cls.return_value = \
                        FakeDirectIngestBigQueryClient(
                            project_id=metadata.project_id(), fs=fake_fs)
                    with patch.object(GcsfsFactory, 'build', new=mock_build_fs):
                        controller = controller_cls(
                            ingest_directory_path=f'{fixture_path_prefix}/fixtures',
                            storage_directory_path='storage/path',
                            **kwargs)
                        task_manager.set_controller(controller)
                        fake_fs.test_set_controller(controller)
                        return controller

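# Example usage (a minimal, hypothetical sketch): a region-specific test might
# build a synchronous controller against that region's fixture directory and
# then drive it through the fake task manager. `UsXxController` and the
# fixture prefix below are illustrative placeholders, not names defined here.
#
#     controller = build_gcsfs_controller_for_tests(
#         UsXxController,
#         fixture_path_prefix='direct/regions/us_xx',
#         run_async=False)
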
def create_export_manager(self, region):
    metadata_manager = PostgresDirectIngestFileMetadataManager(
        region.region_code)
    return DirectIngestIngestViewExportManager(
        region=region,
        fs=FakeDirectIngestGCSFileSystem(),
        ingest_directory_path=GcsfsDirectoryPath.from_absolute_path(
            'ingest_bucket'),
        big_query_client=self.mock_client,
        file_metadata_manager=metadata_manager,
        view_collector=_ViewCollector(
            region, controller_file_tags=['ingest_view']))

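# Example usage (hedged sketch): inside a test case that has already set up
# `self.mock_client` and a fake region, an export manager can be built per
# test. `self.test_region` is an assumed attribute of the surrounding test
# class, shown only for illustration.
#
#     export_manager = self.create_export_manager(self.test_region)
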
class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""

    def setUp(self) -> None:
        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code='us_xx',
            are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0
        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = \
            self.mock_import_raw_file_to_big_query

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        self.import_manager.csv_reader = TestSafeGcsCsvReader(self.fs)

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time')
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

    def tearDown(self) -> None:
        self.time_patcher.stop()

    def mock_import_raw_file_to_big_query(
            self,
            *,
            source_uri: str,
            destination_table_schema: List[bigquery.SchemaField],
            **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.uploaded_test_path_to_actual[
            temp_path.abs_path()]

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None)

    def _check_no_temp_files_remain(self):
        for path in self.fs.all_paths:
            if path.abs_path().startswith(self.temp_output_path.abs_path()):
                self.fail(
                    f'Expected temp path {path.abs_path()} to be cleaned up')

    def test_get_unprocessed_raw_files_to_import(self):
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_second.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        self.fs.test_add_path(raw_unprocessed)
        self.fs.test_add_path(ingest_view_unprocessed)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='this_path_tag_not_in_yaml.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_ingest_view_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_unspecified_type_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.UNSPECIFIED)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_feature_not_released_throws(self):
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code='us_xx',
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_raw_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagC.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))
        path = one(self.fs.uploaded_test_path_to_actual.keys())

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=f'gs://{path}',
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagC',
            destination_table_schema=[
                bigquery.SchemaField('COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])

        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
            self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))
        path = one(self.fs.uploaded_test_path_to_actual.keys())

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=f'gs://{path}',
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagPipeSeparatedNonUTF8',
            destination_table_schema=[
                bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])

        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self):
        self.import_manager.upload_chunk_size = 1

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(5, len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                source_uri=f'gs://{uploaded_path}',
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, 'us_xx_raw_data'),
                destination_table_id='tagPipeSeparatedNonUTF8',
                destination_table_schema=[
                    bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                    bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
                ]) for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_uneven_division(self):
        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                source_uri=f'gs://{uploaded_path}',
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, 'us_xx_raw_data'),
                destination_table_id='tagPipeSeparatedNonUTF8',
                destination_table_schema=[
                    bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                    bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
                ]) for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

class TestGcsfsDirectIngestJobPrioritizerNoFilter(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    _DAY_1_TIME_1 = datetime.datetime(
        year=2019, month=1, day=2, hour=3, minute=4, second=5,
        microsecond=6789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(
        year=2019, month=1, day=2, hour=3, minute=4, second=5,
        microsecond=7789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(
        year=2019, month=1, day=2, hour=10, minute=4, second=5,
        microsecond=678, tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(
        year=2019, month=1, day=3, hour=3, minute=4, second=5,
        microsecond=6789, tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'],
            file_type_filter=None)

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def _normalized_path_for_filename(self, filename: str,
                                      file_type: GcsfsDirectIngestFileType,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                original_file_path=os.path.join(
                    self._INGEST_BUCKET_PATH.abs_path(), filename),
                file_type=file_type,
                dt=dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        path = self._normalized_path_for_filename(
            'tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
            self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename(
            'tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
            self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a
        gap in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename(
                'tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_1_TIME_2),
            self._normalized_path_for_filename(
                'tagA_2.csv', GcsfsDirectIngestFileType.RAW_DATA,
                self._DAY_1_TIME_1),
            self._normalized_path_for_filename(
                'tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
                self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

class TestFakeDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(bucket_name='storage_bucket',
                                          relative_path='region_subdir')

    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()

    def fully_process_file(self,
                           dt: datetime.datetime,
                           path: GcsfsFilePath,
                           file_type_differentiation_on: bool = False):
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""

        self.fs.test_add_path(path)

        start_num_total_files = len(self.fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None)
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if file_type_differentiation_on else GcsfsDirectIngestFileType.UNSPECIFIED

        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(
                self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ

            processed_path = self.fs.mv_path_to_processed_path(
                raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(
                self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW)))
            self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        ingest_unprocessed_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, file_type_filter=ingest_unprocessed_filter)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                     None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        ingest_move_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_move_type_filter)

        end_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH,
                                                        '',
                                                        file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                         '',
                                                         file_type_filter=None)
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.all_paths), expected_final_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1 * splitting_factor)
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) +
                len(end_ingest_view_storage_paths), len(end_storage_paths))
            self.assertEqual(len(end_raw_storage_paths),
                             len(start_raw_storage_paths) + 1)
            self.assertEqual(len(end_ingest_view_storage_paths),
                             len(start_ingest_view_storage_paths) + 1)

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {
                    p.abs_path() for p in start_storage_paths
            }:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                dir_path, storage_file_name = os.path.split(sp.abs_path())
                if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                    self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    def test_direct_ingest_multiple_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))

    def test_move_to_storage_with_conflict(self):
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)

    def test_direct_ingest_file_moves_with_file_types(self):
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file_2.csv'),
                                file_type_differentiation_on=True)

    def test_move_to_storage_with_conflict_with_file_types(self):
        dt = datetime.datetime.now()
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)