def test_handle_file_start_ingest_unsupported_region(
        self, mock_region, mock_environment, mock_fs_factory_cls, client):
    """Requesting ingest for a staging-only region while running in
    production must raise DirectIngestError instead of starting ingest."""
    region_code = 'us_nd'

    # Region is configured for 'staging' but the server thinks it is
    # running in 'production' — an environment mismatch.
    mock_environment.return_value = 'production'
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging')

    fake_fs = FakeDirectIngestGCSFileSystem()
    fake_fs.test_add_path('bucket-us-nd/elite_offenders.csv')
    mock_fs_factory_cls.build.return_value = fake_fs

    request_args = {
        'region': region_code,
        'bucket': 'bucket-us-nd',
        'relative_file_path': 'elite_offenders.csv',
        'start_ingest': 'true',
    }
    headers = {'X-Appengine-Cron': "test-cron"}

    # NOTE(review): the request is expected to raise, so no response object
    # is produced — the original test asserted a 400 status code inside this
    # `with` block, which was unreachable dead code and has been removed.
    with pytest.raises(DirectIngestError):
        client.get('/handle_direct_ingest_file',
                   query_string=request_args,
                   headers=headers)

    mock_region.assert_called_with('us_nd', is_direct_ingest=True)
def test_move_to_storage_with_conflict(self):
    """Processing two files with the same name keeps both copies in
    storage, the second one renamed to avoid a collision."""
    test_fs = FakeDirectIngestGCSFileSystem()
    upload_time = datetime.datetime.now()

    # Fully process the same file name twice: the second upload has a
    # duplicate name whose first copy has already been moved to storage.
    for _ in range(2):
        self.fully_process_file(
            test_fs, upload_time,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

    # pylint: disable=protected-access
    storage_paths = test_fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
    self.assertEqual(len(storage_paths), 2)

    abs_paths = [p.abs_path() for p in storage_paths]
    self.assertTrue(
        any(ap.endswith('test_file.csv') for ap in abs_paths))
    self.assertTrue(
        any(ap.endswith('test_file-(1).csv') for ap in abs_paths))
def test_direct_ingest_multiple_file_moves(self):
    """Two distinct files can each be fully processed back to back."""
    test_fs = FakeDirectIngestGCSFileSystem()

    for blob_name in ('test_file.csv', 'test_file_2.csv'):
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name))
def test_handle_file_no_start_ingest(self, mock_fs_factory_cls, client):
    """With start_ingest=false the endpoint returns 200 even when the
    region is not set up for ingest."""
    fake_fs = FakeDirectIngestGCSFileSystem()
    fake_fs.test_add_path('bucket-us-nd/Elite_Offenders.csv')
    mock_fs_factory_cls.build.return_value = fake_fs

    region = 'us_nd'
    query_params = dict(
        region=region,
        bucket='bucket-us-nd',
        relative_file_path='Elite_Offenders.csv',
        start_ingest='false',
    )

    response = client.get('/handle_direct_ingest_file',
                          query_string=query_params,
                          headers={'X-Appengine-Cron': "test-cron"})

    # Even though the region isn't supported, we don't crash
    assert response.status_code == 200
def fully_process_file(self, test_fs: FakeDirectIngestGCSFileSystem,
                       dt: datetime.datetime,
                       path: GcsfsFilePath):
    """Mimics all the file system calls for a single file in the direct
    ingest system, from getting added to the ingest bucket, turning to a
    processed file, then getting moved to storage.

    Asserts at each stage that exactly one file moved and that the total
    number of paths tracked by the fake filesystem is unchanged.
    """

    test_fs.test_add_path(path)

    start_num_total_files = len(test_fs.all_paths)
    # pylint: disable=protected-access
    start_ingest_paths = test_fs._ls_with_file_prefix(
        self.INGEST_DIR_PATH, '')
    start_storage_paths = test_fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, '')

    # File is renamed to normalized path
    test_fs.mv_path_to_normalized_path(path, dt)

    unprocessed = test_fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH)
    self.assertEqual(len(unprocessed), 1)
    self.assertTrue(test_fs.is_seen_unprocessed_file(unprocessed[0]))

    # ... file is processed

    # File is moved to processed path
    test_fs.mv_path_to_processed_path(unprocessed[0])

    processed = test_fs.get_processed_file_paths(self.INGEST_DIR_PATH)
    self.assertEqual(len(processed), 1)
    self.assertTrue(test_fs.is_processed_file(processed[0]))

    unprocessed = test_fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH)
    self.assertEqual(len(unprocessed), 0)

    # File is moved to storage
    test_fs.mv_processed_paths_before_date_to_storage(
        self.INGEST_DIR_PATH, self.STORAGE_DIR_PATH,
        dt.date().isoformat(), include_bound=True)

    end_ingest_paths = test_fs._ls_with_file_prefix(
        self.INGEST_DIR_PATH, '')
    end_storage_paths = test_fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, '')

    # No path was created or lost; exactly one moved ingest -> storage.
    self.assertEqual(len(test_fs.all_paths), start_num_total_files)
    self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
    self.assertEqual(len(end_storage_paths), len(start_storage_paths) + 1)

    start_storage_abs_paths = {p.abs_path() for p in start_storage_paths}
    for sp in end_storage_paths:
        if sp.abs_path() not in start_storage_abs_paths:
            # The newly-stored path must live under the storage directory
            # and still contain the original file's base name.
            self.assertTrue(sp.abs_path().startswith(
                self.STORAGE_DIR_PATH.abs_path()))
            _, storage_file_name = os.path.split(sp.abs_path())
            # Use splitext instead of `split('.')` so file names that
            # contain more than one dot don't break the unpacking.
            name, _ = os.path.splitext(path.file_name)
            self.assertTrue(name in storage_file_name)
def setUp(self) -> None:
    """Creates a fresh fake filesystem and prioritizer for each test."""
    fake_fs = FakeDirectIngestGCSFileSystem()
    self.fs = fake_fs
    self.prioritizer = GcsfsDirectIngestJobPrioritizer(
        fake_fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])
class TestGcsfsDirectIngestJobPrioritizer(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    # Fixed timestamps: three times on day 1 (the first two differ only in
    # the microsecond component) and one time on day 2.
    _DAY_1_TIME_1 = datetime.datetime(year=2019, month=1, day=2, hour=3,
                                      minute=4, second=5, microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(year=2019, month=1, day=2, hour=3,
                                      minute=4, second=5, microsecond=7789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(year=2019, month=1, day=2, hour=10,
                                      minute=4, second=5, microsecond=678,
                                      tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(year=2019, month=1, day=3, hour=3,
                                      minute=4, second=5, microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        # Fresh fake filesystem and prioritizer per test; the prioritizer is
        # configured to expect exactly the file tags 'tagA' and 'tagB'.
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def _normalized_path_for_filename(
            self, filename: str,
            dt: datetime.datetime) -> GcsfsFilePath:
        """Returns the normalized unprocessed-file path for |filename| in
        the ingest bucket, stamped with datetime |dt|."""
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename),
                dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        """Asserts that the prioritizer serves |paths| in exactly the given
        order, marking each path processed as its job 'runs'."""
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...
            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        # With no files at all: jobs are still expected for the day, but
        # there is no job to hand out yet.
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        path = self._normalized_path_for_filename('tagA.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        # Both expected tags for day 1 have been processed.
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename('tagB.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)

        # tagB arriving before tagA: the job is still served but flagged
        # as not the expected next args.
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...
        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        # Day 1 is complete; day 2 still awaits a tagB file.
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a
        gap in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                # Day 1 is missing its tagA file, so its tagB job is not
                # the expected next job.
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...
            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        # Both days still expect more files (day 1's tagA, day 2's tagB).
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                # The final expected file for the day has been handed out.
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...
            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA_2.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
def setup_method(self, _test_method):
    """pytest-style per-test setup: fresh fake GCS filesystem and a
    prioritizer expecting the tags 'tagA' and 'tagB'."""
    fake_fs = FakeDirectIngestGCSFileSystem()
    self.fs = fake_fs
    self.prioritizer = GcsfsDirectIngestJobPrioritizer(
        fake_fs, self._INGEST_DIRECTORY_PATH, ['tagA', 'tagB'])