def test_get_ingest_view_metadata_pending_export_basic(self):
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(
            2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3))

    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(args)

    expected_list = [
        DirectIngestIngestFileMetadata.new_with_defaults(
            region_code='US_XX',
            file_tag='file_tag',
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=datetime.datetime(2015, 1, 2, 3, 6, 6),
            datetimes_contained_lower_bound_exclusive=datetime.datetime(
                2015, 1, 2, 2, 2, 2, 2),
            datetimes_contained_upper_bound_inclusive=datetime.datetime(
                2015, 1, 2, 3, 3, 3, 3))
    ]

    self.assertEqual(
        expected_list,
        self.metadata_manager.get_ingest_view_metadata_pending_export())
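# The expectation above implies that "pending export" simply means a
# registered export job whose export has not yet been recorded. That
# reading is inferred from this test's assertions, not confirmed against
# the manager's implementation, but it amounts to roughly:
#   pending = [m for m in ingest_file_rows
#              if not m.is_invalidated and m.export_time is None]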
def test_getIngestViewExportTaskArgs_happy(self) -> None:
    # Arrange
    region = self.create_fake_region(ingest_view_exports_enabled=True)
    export_manager = self.create_export_manager(region)
    export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
        return_value=DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag="ingest_view",
            normalized_file_name="normalized_file_name",
            processed_time=_DATE_1,
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_1,
            export_time=_DATE_1,
            datetimes_contained_lower_bound_exclusive=_DATE_1,
            datetimes_contained_upper_bound_inclusive=_DATE_1,
            discovery_time=_DATE_1,
        ))
    export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
        return_value=[
            DirectIngestRawFileMetadata(
                file_id=2,
                region_code=region.region_code,
                file_tag="ingest_view",
                discovery_time=_DATE_2,
                normalized_file_name=
                "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                processed_time=None,
                datetimes_contained_upper_bound_inclusive=_DATE_2,
            )
        ])

    # Act
    args = export_manager.get_ingest_view_export_task_args()

    # Assert
    self.assertListEqual(
        args,
        [
            GcsfsIngestViewExportArgs(
                ingest_view_name="ingest_view",
                upper_bound_datetime_prev=_DATE_1,
                upper_bound_datetime_to_export=_DATE_2,
            )
        ],
    )
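# The mocked raw file above uses the normalized-file-name convention that
# recurs throughout these tests: 'unprocessed_<timestamp>_<file_tag>.csv',
# with a ':' (rather than the usual '.') separating seconds from
# microseconds. A minimal sketch of decoding that timestamp; the helper
# name is hypothetical and not part of the code under test:
def _parse_normalized_file_name_timestamp(
        file_name: str) -> datetime.datetime:
    # 'unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv' splits into
    # ['unprocessed', '2015-01-02T03:03:03:000003', 'raw_file_tag.csv'].
    timestamp_str = file_name.split('_', 2)[1]
    return datetime.datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S:%f')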
def test_getIngestViewExportTaskArgs_rawCodeTableOlderThanLastExport(
        self) -> None:
    # Arrange
    CODE_TABLE_TAG = "RECIDIVIZ_REFERENCE_ingest_view"
    region = self.create_fake_region(ingest_view_exports_enabled=True)
    export_manager = self.create_export_manager(
        region, controller_file_tags=[CODE_TABLE_TAG])
    export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
        return_value=DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag=CODE_TABLE_TAG,
            normalized_file_name="normalized_file_name",
            processed_time=_DATE_2,
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_2,
            export_time=_DATE_2,
            datetimes_contained_lower_bound_exclusive=_DATE_2 -
            datetime.timedelta(days=7),
            datetimes_contained_upper_bound_inclusive=_DATE_2,
            discovery_time=_DATE_2,
        ))
    export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
        return_value=[
            DirectIngestRawFileMetadata(
                file_id=2,
                region_code=region.region_code,
                file_tag=CODE_TABLE_TAG,
                discovery_time=_DATE_1,
                normalized_file_name=
                "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                processed_time=None,
                datetimes_contained_upper_bound_inclusive=_DATE_1,
            )
        ])

    # Act
    args = export_manager.get_ingest_view_export_task_args()

    # Assert
    # New code tables are backdated but don't need to be re-ingested, so
    # ignore them.
    self.assertListEqual(args, [])
def test_getIngestViewExportTaskArgs_rawFileOlderThanLastExport(
        self) -> None:
    # Arrange
    region = self.create_fake_region(ingest_view_exports_enabled=True)
    export_manager = self.create_export_manager(region)
    export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
        return_value=DirectIngestIngestFileMetadata(
            file_id=_ID,
            region_code=region.region_code,
            file_tag="ingest_view",
            normalized_file_name="normalized_file_name",
            processed_time=_DATE_2,
            is_invalidated=False,
            is_file_split=False,
            job_creation_time=_DATE_2,
            export_time=_DATE_2,
            datetimes_contained_lower_bound_exclusive=_DATE_2,
            datetimes_contained_upper_bound_inclusive=_DATE_2,
            discovery_time=_DATE_2,
        ))
    export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
        return_value=[
            DirectIngestRawFileMetadata(
                file_id=2,
                region_code=region.region_code,
                file_tag="ingest_view",
                discovery_time=_DATE_1,
                normalized_file_name=
                "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
                processed_time=None,
                datetimes_contained_upper_bound_inclusive=_DATE_1,
            )
        ])

    # Act
    with pytest.raises(
            ValueError,
            match=r"upper bound date.*before the last valid export"):
        export_manager.get_ingest_view_export_task_args()
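# A minimal sketch of the invariant the test above exercises, assuming
# (this is an inference from the expected error message, not the export
# manager's actual code) that task-arg generation compares each raw file's
# upper bound datetime against the most recent valid export. The function
# is hypothetical:
def _check_raw_file_not_older_than_last_export(
        raw_file_upper_bound: datetime.datetime,
        last_export_upper_bound: datetime.datetime) -> None:
    """Raises if a (non-code-table) raw file predates the last valid export."""
    if raw_file_upper_bound < last_export_upper_bound:
        raise ValueError(
            f'Raw file upper bound date [{raw_file_upper_bound}] is '
            f'before the last valid export date [{last_export_upper_bound}].')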
def run_split_ingest_file_progression_pre_processing(
        self,
        metadata_manager: PostgresDirectIngestFileMetadataManager,
        original_file_metadata: DirectIngestIngestFileMetadata,
        split_file_path: GcsfsFilePath,
        discovery_before_export_recorded: bool = False):
    """Runs through the full progression of operations we expect to run on
    a split ingest file, up until processing."""
    expected_metadata = DirectIngestIngestFileMetadata.new_with_defaults(
        region_code=metadata_manager.region_code,
        file_tag=original_file_metadata.file_tag,
        is_invalidated=False,
        is_file_split=True,
        job_creation_time=datetime.datetime(2015, 1, 2, 3, 5, 5),
        datetimes_contained_lower_bound_exclusive=original_file_metadata.
        datetimes_contained_lower_bound_exclusive,
        datetimes_contained_upper_bound_inclusive=original_file_metadata.
        datetimes_contained_upper_bound_inclusive,
        normalized_file_name=split_file_path.file_name,
        export_time=None,
        discovery_time=None,
        processed_time=None,
    )

    with freeze_time('2015-01-02T03:05:05'):
        split_file_metadata = metadata_manager.register_ingest_file_split(
            original_file_metadata, split_file_path)

    self.assertEqual(expected_metadata, split_file_metadata)
    metadata = metadata_manager.get_file_metadata(split_file_path)
    self.assertEqual(expected_metadata, metadata)

    # ... export actually performed in here
    if discovery_before_export_recorded:
        with freeze_time('2015-01-02T03:06:07'):
            metadata_manager.mark_file_as_discovered(split_file_path)

        expected_metadata.discovery_time = datetime.datetime(
            2015, 1, 2, 3, 6, 7)
        expected_metadata.export_time = datetime.datetime(
            2015, 1, 2, 3, 6, 7)

        metadata = metadata_manager.get_file_metadata(split_file_path)
        self.assertEqual(expected_metadata, metadata)

    with freeze_time('2015-01-02T03:06:08'):
        metadata_manager.mark_ingest_view_exported(split_file_metadata)

    expected_metadata.export_time = datetime.datetime(2015, 1, 2, 3, 6, 8)

    metadata = metadata_manager.get_file_metadata(split_file_path)
    self.assertEqual(expected_metadata, metadata)

    if not discovery_before_export_recorded:
        with freeze_time('2015-01-02T03:07:07'):
            metadata_manager.mark_file_as_discovered(split_file_path)

        expected_metadata.discovery_time = datetime.datetime(
            2015, 1, 2, 3, 7, 7)

        metadata = metadata_manager.get_file_metadata(split_file_path)
        self.assertEqual(expected_metadata, metadata)
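# For reference, the frozen-clock timeline the helper above walks through:
#   03:05:05 - split file registered
#   03:06:07 - discovery recorded first (discovery_before_export_recorded=True);
#              export_time is expected to be backfilled to the discovery time
#   03:06:08 - export recorded; export_time updated
#   03:07:07 - discovery recorded second (discovery_before_export_recorded=False)
# Processing the split file is deliberately left to the caller.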
def run_ingest_view_file_progression(
        self,
        export_args: GcsfsIngestViewExportArgs,
        metadata_manager: PostgresDirectIngestFileMetadataManager,
        ingest_view_unprocessed_path: GcsfsFilePath,
        discovery_before_export_recorded: bool = False,
        split_file: bool = False):
    """Runs through the full progression of operations we expect to run on
    an individual ingest view file."""
    with freeze_time('2015-01-02T03:05:05'):
        metadata_manager.register_ingest_file_export_job(export_args)

    ingest_file_metadata = metadata_manager.get_ingest_view_metadata_for_export_job(
        export_args)
    expected_metadata = DirectIngestIngestFileMetadata.new_with_defaults(
        region_code=metadata_manager.region_code,
        file_tag=export_args.ingest_view_name,
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=datetime.datetime(2015, 1, 2, 3, 5, 5),
        datetimes_contained_lower_bound_exclusive=export_args.
        upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.
        upper_bound_datetime_to_export,
        normalized_file_name=None,
        export_time=None,
        discovery_time=None,
        processed_time=None,
    )
    self.assertEqual(expected_metadata, ingest_file_metadata)

    with freeze_time('2015-01-02T03:06:06'):
        metadata_manager.register_ingest_view_export_file_name(
            ingest_file_metadata, ingest_view_unprocessed_path)
    expected_metadata.normalized_file_name = \
        ingest_view_unprocessed_path.file_name

    metadata = metadata_manager.get_file_metadata(
        ingest_view_unprocessed_path)
    self.assertEqual(expected_metadata, metadata)

    # ... export actually performed in here
    if discovery_before_export_recorded:
        with freeze_time('2015-01-02T03:06:07'):
            metadata_manager.mark_file_as_discovered(
                ingest_view_unprocessed_path)

        expected_metadata.discovery_time = datetime.datetime(
            2015, 1, 2, 3, 6, 7)
        expected_metadata.export_time = datetime.datetime(
            2015, 1, 2, 3, 6, 7)

        metadata = metadata_manager.get_file_metadata(
            ingest_view_unprocessed_path)
        self.assertEqual(expected_metadata, metadata)

    with freeze_time('2015-01-02T03:06:08'):
        metadata_manager.mark_ingest_view_exported(ingest_file_metadata)

    expected_metadata.export_time = datetime.datetime(2015, 1, 2, 3, 6, 8)

    metadata = metadata_manager.get_file_metadata(
        ingest_view_unprocessed_path)
    self.assertEqual(expected_metadata, metadata)

    if not discovery_before_export_recorded:
        with freeze_time('2015-01-02T03:07:07'):
            metadata_manager.mark_file_as_discovered(
                ingest_view_unprocessed_path)

        expected_metadata.discovery_time = datetime.datetime(
            2015, 1, 2, 3, 7, 7)

        metadata = metadata_manager.get_file_metadata(
            ingest_view_unprocessed_path)
        self.assertEqual(expected_metadata, metadata)

    split_file_paths_and_metadata: List[Tuple[
        GcsfsFilePath, DirectIngestIngestFileMetadata]] = []
    if split_file:
        metadata = metadata_manager.get_file_metadata(
            ingest_view_unprocessed_path)
        if not isinstance(metadata, DirectIngestIngestFileMetadata):
            self.fail(f'Unexpected metadata type {type(metadata)}')

        for i in range(2):
            split_file_path = self._make_unprocessed_path(
                f'bucket/split{i}.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW)
            self.run_split_ingest_file_progression_pre_processing(
                metadata_manager, metadata, split_file_path,
                discovery_before_export_recorded)

            split_file_metadata = metadata_manager.get_file_metadata(
                split_file_path)

            if not isinstance(split_file_metadata,
                              DirectIngestIngestFileMetadata):
                self.fail(
                    f'Unexpected split_file_metadata type '
                    f'[{split_file_metadata}].')

            split_file_paths_and_metadata.append(
                (split_file_path, split_file_metadata))

    with freeze_time('2015-01-02T03:08:08'):
        metadata_manager.mark_file_as_processed(
            ingest_view_unprocessed_path)

    expected_metadata.processed_time = datetime.datetime(
        2015, 1, 2, 3, 8, 8)

    metadata = metadata_manager.get_file_metadata(
        ingest_view_unprocessed_path)
    self.assertEqual(expected_metadata, metadata)

    for split_file_path, split_file_metadata in split_file_paths_and_metadata:
        expected_metadata = split_file_metadata

        with freeze_time('2015-01-02T03:09:09'):
            metadata_manager.mark_file_as_processed(split_file_path)

        expected_metadata.processed_time = datetime.datetime(
            2015, 1, 2, 3, 9, 9)

        metadata = metadata_manager.get_file_metadata(split_file_path)
        self.assertEqual(expected_metadata, metadata)
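# A hypothetical example of driving the progression helper above from a
# test; the argument values are illustrative, mirroring those used
# elsewhere in this file:
def test_ingest_view_file_progression_with_splits_example(self):
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3))
    ingest_view_path = self._make_unprocessed_path(
        'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(
        args, self.metadata_manager, ingest_view_path, split_file=True)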