def test_ingest_view_export(self, mock_supported, mock_region, mock_environment):
    """Hitting /ingest_view_export as a cron request should return 200 and
    forward the deserialized export args to the region's controller."""
    # Arrange: a supported staging region backed by a mocked controller.
    region_code = 'us_xx'
    mock_supported.return_value = [region_code]
    mock_environment.return_value = 'staging'
    controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging',
                                           ingestor=controller)

    args = GcsfsIngestViewExportArgs(
        ingest_view_name='my_ingest_view',
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))
    payload = {
        'cloud_task_args': args.to_serializable(),
        'args_type': 'GcsfsIngestViewExportArgs',
    }

    # Act: post the serialized args with the cron auth header.
    response = self.client.post('/ingest_view_export',
                                query_string={'region': region_code},
                                headers={'X-Appengine-Cron': 'test-cron'},
                                data=json.dumps(payload).encode())

    # Assert
    self.assertEqual(200, response.status_code)
    controller.do_ingest_view_export.assert_called_with(args)
def test_get_ingest_view_metadata_pending_export_all_exported_in_region(
        self) -> None:
    """Once every export job in this region is marked exported, nothing is
    reported as pending export; jobs registered in other regions must not
    bleed into this region's pending list."""
    lower = datetime.datetime(2015, 1, 2, 2, 2, 2, 2)
    upper = datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=lower,
        upper_bound_datetime_to_export=upper)
    args_other_region = GcsfsIngestViewExportArgs(
        ingest_view_name='other_file_tag',
        upper_bound_datetime_prev=lower,
        upper_bound_datetime_to_export=upper)

    # Register one job in each region at the same frozen time.
    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(args)
        self.metadata_manager_other_region.register_ingest_file_export_job(
            args_other_region)

    # Complete the export for this region's job only.
    with freeze_time('2015-01-02T03:07:07'):
        path = self._make_unprocessed_path(
            'bucket/file_tag.csv',
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime.utcnow())
        metadata = self.metadata_manager.get_ingest_view_metadata_for_export_job(
            args)
        self.metadata_manager.register_ingest_view_export_file_name(
            metadata, path)
        # ... export actually performed in here
        self.metadata_manager.mark_ingest_view_exported(metadata)

    self.assertEqual(
        [], self.metadata_manager.get_ingest_view_metadata_pending_export())
def test_get_ingest_view_metadata_for_most_recent_valid_job(self):
    """The most-recent-valid-job lookup should return the latest job for the
    requested tag, skip invalidated rows, and ignore other tags."""
    # Register three jobs at strictly increasing frozen times: two for
    # 'file_tag' and one for an unrelated tag.
    with freeze_time('2015-01-02T03:05:05'):
        self.metadata_manager.register_ingest_file_export_job(
            GcsfsIngestViewExportArgs(
                ingest_view_name='file_tag',
                upper_bound_datetime_prev=None,
                upper_bound_datetime_to_export=datetime.datetime(
                    2015, 1, 2, 2, 2, 2, 2)))
    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(
            GcsfsIngestViewExportArgs(
                ingest_view_name='file_tag',
                upper_bound_datetime_prev=datetime.datetime(
                    2015, 1, 2, 2, 2, 2, 2),
                upper_bound_datetime_to_export=datetime.datetime(
                    2015, 1, 2, 3, 3, 3, 3)))
    with freeze_time('2015-01-02T03:07:07'):
        self.metadata_manager.register_ingest_file_export_job(
            GcsfsIngestViewExportArgs(
                ingest_view_name='another_tag',
                upper_bound_datetime_prev=datetime.datetime(
                    2015, 1, 2, 3, 3, 3, 3),
                upper_bound_datetime_to_export=datetime.datetime(
                    2015, 1, 2, 3, 4, 4, 4)))

    most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
        'file_tag')

    # The second 'file_tag' job (03:06:06) is the most recent valid one.
    self.assertIsNotNone(most_recent_valid_job)
    self.assertEqual('file_tag', most_recent_valid_job.file_tag)
    self.assertEqual(
        datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
    self.assertEqual(
        datetime.datetime(2015, 1, 2, 3, 3, 3, 3),
        most_recent_valid_job.datetimes_contained_upper_bound_inclusive)

    # Invalidate the row that was just returned
    session = SessionFactory.for_schema_base(OperationsBase)
    results = session.query(
        schema.DirectIngestIngestFileMetadata).filter_by(
            file_id=most_recent_valid_job.file_id).all()
    result = one(results)
    result.is_invalidated = True
    session.commit()

    # The lookup should now fall back to the earlier (03:05:05) job, whose
    # lower bound was None.
    most_recent_valid_job = self.metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job(
        'file_tag')
    self.assertIsNotNone(most_recent_valid_job)
    self.assertEqual('file_tag', most_recent_valid_job.file_tag)
    self.assertEqual(
        None, most_recent_valid_job.datetimes_contained_lower_bound_exclusive)
    self.assertEqual(
        datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        most_recent_valid_job.datetimes_contained_upper_bound_inclusive)
def test_create_direct_ingest_ingest_view_export_task(
        self, mock_client: mock.MagicMock,
        mock_uuid: mock.MagicMock) -> None:
    """The cloud task manager should enqueue an ingest view export task with
    the serialized args in the body and region/output bucket in the URL."""
    # Arrange
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="my_ingest_view",
        output_bucket_name="my_ingest_bucket",
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
    )
    request_body = {
        "cloud_task_args": export_args.to_serializable(),
        "args_type": "GcsfsIngestViewExportArgs",
    }

    fixed_uuid = "random-uuid"
    mock_uuid.uuid4.return_value = fixed_uuid
    date = "2019-07-20"
    queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"
    task_name = (f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}/"
                 f"{_REGION.region_code}-{date}-{fixed_uuid}")
    url_params = {
        "region": _REGION.region_code,
        "output_bucket": "my_ingest_bucket",
    }
    expected_task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri":
                f"/direct/ingest_view_export?{urlencode(url_params)}",
            "body": json.dumps(request_body).encode(),
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
    ).create_direct_ingest_ingest_view_export_task(
        _REGION, DirectIngestInstance.PRIMARY, export_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION,
        DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=expected_task)
def test_ingest_view_file_same_args_after_invalidation(self):
    """After invalidating the metadata row from a completed progression, the
    same export args can be run through the full progression again."""
    args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3))

    first_path = self._make_unprocessed_path(
        'bucket/file_tag.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          first_path)

    # Invalidate the previous row
    session = SessionFactory.for_schema_base(OperationsBase)
    row = one(session.query(schema.DirectIngestIngestFileMetadata).all())
    row.is_invalidated = True
    session.commit()

    # Now we can rerun with the same args
    rerun_path = self._make_unprocessed_path(
        'bucket/file_tag.csv',
        GcsfsDirectIngestFileType.INGEST_VIEW,
        dt=datetime.datetime.now())
    self.run_ingest_view_file_progression(args, self.metadata_manager,
                                          rerun_path)
def test_get_ingest_view_metadata_pending_export_basic(self):
    """A freshly registered export job should appear as pending export with
    its creation time and datetime bounds persisted."""
    lower_bound = datetime.datetime(2015, 1, 2, 2, 2, 2, 2)
    upper_bound = datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=lower_bound,
        upper_bound_datetime_to_export=upper_bound)

    with freeze_time('2015-01-02T03:06:06'):
        self.metadata_manager.register_ingest_file_export_job(export_args)

    expected = DirectIngestIngestFileMetadata.new_with_defaults(
        region_code='US_XX',
        file_tag='file_tag',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=datetime.datetime(2015, 1, 2, 3, 6, 6),
        datetimes_contained_lower_bound_exclusive=lower_bound,
        datetimes_contained_upper_bound_inclusive=upper_bound)

    self.assertEqual(
        [expected],
        self.metadata_manager.get_ingest_view_metadata_pending_export())
def test_ingest_view_file_progression_same_args_twice_throws(self) -> None:
    """Running the full file progression twice with identical args should
    violate the metadata uniqueness constraint and leave only one row."""
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3),
    )

    first_path = self._make_unprocessed_path(
        "bucket/file_tag.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(export_args, self.metadata_manager,
                                          first_path)

    with self.assertRaises(IntegrityError):
        second_path = self._make_unprocessed_path(
            "bucket/file_tag.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=datetime.datetime.now(),
        )
        self.run_ingest_view_file_progression(export_args,
                                              self.metadata_manager,
                                              second_path)

    # Only the original metadata row should remain.
    session = SessionFactory.for_schema_base(OperationsBase)
    rows = session.query(schema.DirectIngestIngestFileMetadata).all()
    self.assertEqual(1, len(rows))
def create_direct_ingest_ingest_view_export_task(
    self,
    region: Region,
    ingest_instance: DirectIngestInstance,
    ingest_view_export_args: GcsfsIngestViewExportArgs,
) -> None:
    """Queues an ingest view export task on the BQ import/export queue for
    the given region and ingest instance.

    The task id embeds the args' tag so duplicate work is identifiable, and
    the request URL carries the region and output bucket as query params.
    """
    task_id = _build_task_id(
        region.region_code,
        ingest_instance,
        task_id_tag=ingest_view_export_args.task_id_tag(),
        prefix_only=False,
    )
    query_params = {
        "region": region.region_code.lower(),
        "output_bucket": ingest_view_export_args.output_bucket_name,
    }
    queue_manager = self._get_bq_import_export_queue_manager(
        region, ingest_instance)
    queue_manager.create_task(
        task_id=task_id,
        relative_uri=f"/direct/ingest_view_export?{urlencode(query_params)}",
        body=self._get_body_from_args(ingest_view_export_args),
    )
def get_ingest_view_export_task_args(
        self) -> List[GcsfsIngestViewExportArgs]:
    """Looks at what files have been exported for a given region and returns args for all the export jobs that
    should be started, given what has updated in the raw data tables since the last time we exported data. Also
    returns any tasks that have not yet completed.

    Raises:
        ValueError: if ingest view exports are not enabled for this region's
            environment.
    """
    if not self.region.are_ingest_view_exports_enabled_in_env():
        raise ValueError(
            f'Ingest view exports not enabled for region [{self.region.region_code}]'
        )

    # Gather the current export state (last export + raw updates since) for
    # every configured ingest view, validating invariants as we go.
    logging.info('Gathering export state for each ingest tag')
    ingest_view_to_export_state = {}
    for ingest_view_tag, ingest_view in self.ingest_views_by_tag.items():
        export_state = self._get_export_state_for_ingest_view(ingest_view)
        self._validate_ascending_raw_file_update_dates(export_state)
        ingest_view_to_export_state[ingest_view_tag] = export_state
    logging.info('Done gathering export state for each ingest tag')

    # At this point we know that we have no new raw data backfills that should invalidate either pending or past
    # completed ingest view exports (checked in _validate_ascending_raw_file_update_dates()). We can now generate
    # any new jobs.

    jobs_to_schedule = []
    # First, re-schedule any previously registered jobs that never completed.
    metadata_pending_export = self.file_metadata_manager.get_ingest_view_metadata_pending_export(
    )
    if metadata_pending_export:
        args_list = self._export_args_from_metadata(
            metadata_pending_export)
        jobs_to_schedule.extend(args_list)

    logging.info('Found [%s] already pending jobs to schedule.',
                 len(jobs_to_schedule))

    logging.info('Generating new ingest jobs.')
    for ingest_view_tag, export_state in ingest_view_to_export_state.items(
    ):
        # Each new job's lower bound starts where the last completed export
        # for this view left off (None if this view was never exported).
        lower_bound_datetime_exclusive = \
            export_state.last_export_metadata.datetimes_contained_upper_bound_inclusive \
            if export_state.last_export_metadata else None

        ingest_args_list = []
        # NOTE(review): assumes max_update_datetime_by_date yields
        # (date, max update datetime) pairs in ascending date order — the
        # chained lower-bound update below relies on that ordering; confirm
        # against _get_export_state_for_ingest_view.
        for _date, upper_bound_datetime_inclusive in export_state.max_update_datetime_by_date:
            args = GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_tag,
                upper_bound_datetime_prev=lower_bound_datetime_exclusive,
                upper_bound_datetime_to_export=upper_bound_datetime_inclusive)

            logging.info('Generating job args for tag [%s]: [%s].',
                         ingest_view_tag, args)

            # Registering here is a side effect: the job is persisted as
            # pending before it is actually scheduled by the caller.
            self.file_metadata_manager.register_ingest_file_export_job(
                args)
            ingest_args_list.append(args)
            # Chain the bounds: the next job starts where this one ends.
            lower_bound_datetime_exclusive = upper_bound_datetime_inclusive

        jobs_to_schedule.extend(ingest_args_list)

    logging.info('Returning [%s] jobs to schedule.', len(jobs_to_schedule))
    return jobs_to_schedule
def _export_args_from_metadata(
        metadata_list: List[DirectIngestIngestFileMetadata]
) -> List[GcsfsIngestViewExportArgs]:
    """Builds export args mirroring each persisted ingest file metadata row."""
    args_list = []
    for metadata in metadata_list:
        args_list.append(
            GcsfsIngestViewExportArgs(
                ingest_view_name=metadata.file_tag,
                upper_bound_datetime_prev=metadata.
                datetimes_contained_lower_bound_exclusive,
                upper_bound_datetime_to_export=metadata.
                datetimes_contained_upper_bound_inclusive,
            ))
    return args_list
def test_ingest_view_file_progression_two_files_same_tag(self) -> None:
    """Two sequential exports of the same tag should each produce a fully
    processed metadata row with its own normalized file name."""
    first_args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 2, 2, 2, 2),
    )
    first_path = self._make_unprocessed_path(
        "bucket/file_tag.csv",
        GcsfsDirectIngestFileType.INGEST_VIEW,
        dt=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
    )
    self.run_ingest_view_file_progression(first_args, self.metadata_manager,
                                          first_path)

    # Second export picks up where the first one's upper bound left off.
    second_args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 3, 3, 3, 3, 3),
    )
    second_path = self._make_unprocessed_path(
        "bucket/file_tag.csv",
        GcsfsDirectIngestFileType.INGEST_VIEW,
        dt=datetime.datetime(2015, 1, 3, 3, 3, 3, 3),
    )
    self.run_ingest_view_file_progression(second_args, self.metadata_manager,
                                          second_path)

    session = SessionFactory.for_schema_base(OperationsBase)
    rows = session.query(schema.DirectIngestIngestFileMetadata).all()

    self.assertEqual(
        {first_path.file_name, second_path.file_name},
        {row.normalized_file_name for row in rows},
    )
    for row in rows:
        self.assertTrue(row.export_time)
        self.assertTrue(row.processed_time)
def test_create_direct_ingest_ingest_view_export_task(
        self, mock_client, mock_uuid):
    """The cloud task manager should enqueue an export task whose body holds
    the serialized args and whose URL carries the region code."""
    # Arrange
    project_id = 'recidiviz-456'
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='my_ingest_view',
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30))
    request_body = {
        'cloud_task_args': export_args.to_serializable(),
        'args_type': 'GcsfsIngestViewExportArgs'
    }

    fixed_uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = fixed_uuid
    date = '2019-07-20'
    queue_path = _REGION.shared_queue + '-path'
    task_name = (f'{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}/'
                 f'{_REGION.region_code}-{date}-{fixed_uuid}')
    expected_task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/ingest_view_export?region={_REGION.region_code}',
            'body': json.dumps(request_body).encode()
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
        project_id=project_id
    ).create_direct_ingest_ingest_view_export_task(_REGION, export_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=expected_task)
def test_ingest_view_export(
    self,
    mock_supported: mock.MagicMock,
    mock_region: mock.MagicMock,
    mock_environment: mock.MagicMock,
) -> None:
    """POSTing serialized export args to /ingest_view_export returns 200 and
    hands the deserialized args to the region's controller."""
    region_code = "us_xx"
    mock_supported.return_value = [region_code]
    mock_environment.return_value = "staging"
    controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment="staging",
                                           ingestor=controller)

    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="my_ingest_view",
        upper_bound_datetime_prev=datetime.datetime(2020, 4, 29),
        upper_bound_datetime_to_export=datetime.datetime(2020, 4, 30),
    )
    payload = {
        "cloud_task_args": export_args.to_serializable(),
        "args_type": "GcsfsIngestViewExportArgs",
    }

    response = self.client.post(
        "/ingest_view_export",
        query_string={"region": region_code},
        headers={"X-Appengine-Cron": "test-cron"},
        data=json.dumps(payload).encode(),
    )

    self.assertEqual(200, response.status_code)
    controller.do_ingest_view_export.assert_called_with(export_args)
def test_ingest_view_file_progression(self):
    """Basic happy-path run of the ingest view file progression."""
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3))
    path = self._make_unprocessed_path('bucket/file_tag.csv',
                                       GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(export_args, self.metadata_manager,
                                          path)
def test_gcsfs_ingest_view_export_args(self) -> None:
    """task_id_tag should encode the view name plus both datetime bounds,
    printing 'None' when there is no lower bound."""
    dt_lower = datetime.datetime(2019, 1, 22, 11, 22, 33, 444444)
    dt_upper = datetime.datetime(2019, 11, 22, 11, 22, 33, 444444)

    no_lower_bound_args = GcsfsIngestViewExportArgs(
        ingest_view_name="my_file_tag",
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=dt_upper,
    )
    self.assertEqual(
        "ingest_view_export_my_file_tag-None-2019_11_22_11_22_33_444444",
        no_lower_bound_args.task_id_tag(),
    )

    bounded_args = GcsfsIngestViewExportArgs(
        ingest_view_name="my_file_tag",
        upper_bound_datetime_prev=dt_lower,
        upper_bound_datetime_to_export=dt_upper,
    )
    self.assertEqual(
        "ingest_view_export_my_file_tag-2019_01_22_11_22_33_444444-2019_11_22_11_22_33_444444",
        bounded_args.task_id_tag(),
    )
def run_parse_file_test(self, expected: IngestInfo,
                        fixture_file_name: str) -> IngestInfo:
    """Runs a test that reads and parses a given fixture file. Returns the
    parsed IngestInfo object for tests to run further validations."""
    args = ingest_args_for_fixture_file(self.controller,
                                        f"{fixture_file_name}.csv")

    if not isinstance(self.controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeGCSFileSystem. Found instead "
                         f"type [{type(self.controller.fs.gcs_file_system)}]")

    if self.controller.region.is_ingest_launched_in_env():
        # Launched regions materialize the fixture via a real export job
        # covering the last day, rather than adding the file directly.
        now = datetime.datetime.now()
        yesterday = now - datetime.timedelta(days=1)
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=fixture_file_name,
            upper_bound_datetime_to_export=now,
            upper_bound_datetime_prev=yesterday,
            output_bucket_name=self.controller.ingest_bucket_path.
            bucket_name,
        )
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        # Otherwise drop the fixture path straight into the fake GCS bucket.
        fixture_util.add_direct_ingest_path(
            self.controller.fs.gcs_file_system,
            args.file_path,
            region_code=self.controller.region_code(),
        )

    # pylint:disable=protected-access
    fixture_contents_handle = self.controller._get_contents_handle(args)
    if fixture_contents_handle is None:
        self.fail("fixture_contents_handle should not be None")
    final_info = self.controller._parse(args, fixture_contents_handle)

    # Print both objects so a mismatch is easy to diff in test output.
    print_visible_header_label("FINAL")
    print(final_info)

    print_visible_header_label("EXPECTED")
    print(expected)

    self.assertEqual(expected, final_info)

    return final_info
def test_exportViewForArgs_detectRowDeletionView_noLowerBound(
        self) -> None:
    """A row-deletion view with no lower bound should skip the BigQuery
    export entirely but still stamp the metadata row with an export time."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(
        region, is_detect_row_deletion_view=True)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=_DATE_2,
    )

    setup_session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name="normalized_file_name",
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.
        upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.
        upper_bound_datetime_to_export,
    )
    # Only the export_time should change once the export "runs".
    expected_metadata = attr.evolve(self.to_entity(metadata),
                                    export_time=_DATE_4)
    setup_session.add(metadata)
    setup_session.commit()
    setup_session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Assert: no BigQuery work was performed.
    self.mock_client.run_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called(
    )
    self.mock_client.delete_table.assert_not_called()

    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(
        one(
            assert_session.query(
                schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def test_exportViewForArgs_noLowerBound(self):
    """Export with no lower bound should materialize only the upper-bound
    table, export it to GCS, delete the intermediate table, and stamp the
    metadata row with the (frozen) export time."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=None,
        upper_bound_datetime_to_export=_DATE_2)

    session = SessionFactory.for_schema_base(OperationsBase)
    metadata = schema.DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag=export_args.ingest_view_name,
        normalized_file_name='normalized_file_name',
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=None,
        datetimes_contained_lower_bound_exclusive=export_args.upper_bound_datetime_prev,
        datetimes_contained_upper_bound_inclusive=export_args.upper_bound_datetime_to_export
    )
    # The only expected change after the export runs is the export_time.
    expected_metadata = attr.evolve(self.to_entity(metadata), export_time=_DATE_4)
    session.add(metadata)
    session.commit()
    session.close()

    # Act
    with freeze_time(_DATE_4.isoformat()):
        export_manager.export_view_for_args(export_args)

    # Assert
    # A single upper-bound table is created — no lower-bound diff table.
    self.mock_client.create_table_from_query_async.assert_has_calls([
        mock.call(
            dataset_id='us_xx_ingest_views',
            overwrite=True,
            query=mock.ANY,
            query_parameters=[self.generate_query_params_for_date(export_args.upper_bound_datetime_to_export)],
            table_id='ingest_view_2020_07_20_00_00_00_upper_bound'),
    ])
    expected_query = \
        'SELECT * FROM `recidiviz-456.us_xx_ingest_views.ingest_view_2020_07_20_00_00_00_upper_bound` ' \
        'ORDER BY colA, colC;'
    self.assert_exported_to_gcs_with_query(expected_query)
    # The intermediate table is cleaned up after the GCS export.
    self.mock_client.delete_table.assert_has_calls([
        mock.call(dataset_id='us_xx_ingest_views',
                  table_id='ingest_view_2020_07_20_00_00_00_upper_bound')])

    assert_session = SessionFactory.for_schema_base(OperationsBase)
    found_metadata = self.to_entity(one(assert_session.query(schema.DirectIngestIngestFileMetadata).all()))
    self.assertEqual(expected_metadata, found_metadata)
    assert_session.close()
def json_to_cloud_task_args(json_data: dict) -> Optional[CloudTaskArgs]:
    """Reconstructs a serialized CloudTaskArgs object from a task body dict.

    Returns None when the expected keys are absent, and logs an error (also
    returning None) when the args_type is not a known serializable type.
    """
    if "cloud_task_args" not in json_data or "args_type" not in json_data:
        return None

    args_type = json_data["args_type"]
    cloud_task_args_dict = json_data["cloud_task_args"]

    # Dispatch on the serialized type name.
    deserializers = {
        GcsfsIngestArgs.__name__:
            GcsfsIngestArgs.from_serializable,
        GcsfsRawDataBQImportArgs.__name__:
            GcsfsRawDataBQImportArgs.from_serializable,
        GcsfsIngestViewExportArgs.__name__:
            GcsfsIngestViewExportArgs.from_serializable,
    }
    deserializer = deserializers.get(args_type)
    if deserializer is not None:
        return deserializer(cloud_task_args_dict)

    logging.error("Unexpected args_type in json_data: %s", args_type)
    return None
def test_ingest_then_split_progression(self) -> None:
    """File progression where the ingest view file gets split mid-flight."""
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name="file_tag",
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3),
    )
    path = self._make_unprocessed_path("bucket/file_tag.csv",
                                       GcsfsDirectIngestFileType.INGEST_VIEW)
    self.run_ingest_view_file_progression(export_args,
                                          self.metadata_manager,
                                          path,
                                          split_file=True)
def json_to_cloud_task_args(json_data: dict):
    """Reconstructs a serialized task-args object from a task body dict.

    Returns None when the expected keys are absent, and logs an error (also
    returning None) when the args_type is not a known serializable type.
    """
    if 'cloud_task_args' not in json_data or 'args_type' not in json_data:
        return None

    args_type = json_data['args_type']
    cloud_task_args_dict = json_data['cloud_task_args']

    # Try each known args class in order; the serialized type name must
    # match the class name exactly.
    for known_type in (IngestArgs, GcsfsIngestArgs, GcsfsRawDataBQImportArgs,
                       GcsfsIngestViewExportArgs):
        if args_type == known_type.__name__:
            return known_type.from_serializable(cloud_task_args_dict)

    logging.error('Unexpected args_type in json_data: %s', args_type)
    return None
def test_getIngestViewExportTaskArgs_happy(self) -> None:
    """When a raw file lands after the last valid export, exactly one new
    export job spanning the gap should be generated."""
    # Arrange
    region = self.create_fake_region(ingest_view_exports_enabled=True)
    export_manager = self.create_export_manager(region)

    last_export = DirectIngestIngestFileMetadata(
        file_id=_ID,
        region_code=region.region_code,
        file_tag="ingest_view",
        normalized_file_name="normalized_file_name",
        processed_time=_DATE_1,
        is_invalidated=False,
        is_file_split=False,
        job_creation_time=_DATE_1,
        export_time=_DATE_1,
        datetimes_contained_lower_bound_exclusive=_DATE_1,
        datetimes_contained_upper_bound_inclusive=_DATE_1,
        discovery_time=_DATE_1,
    )
    export_manager.file_metadata_manager.get_ingest_view_metadata_for_most_recent_valid_job = Mock(  # type: ignore
        return_value=last_export)

    new_raw_file = DirectIngestRawFileMetadata(
        file_id=2,
        region_code=region.region_code,
        file_tag="ingest_view",
        discovery_time=_DATE_2,
        normalized_file_name=
        "unprocessed_2015-01-02T03:03:03:000003_raw_file_tag.csv",
        processed_time=None,
        datetimes_contained_upper_bound_inclusive=_DATE_2,
    )
    export_manager.file_metadata_manager.get_metadata_for_raw_files_discovered_after_datetime = Mock(  # type: ignore
        return_value=[new_raw_file])

    # Act
    args = export_manager.get_ingest_view_export_task_args()

    # Assert: one job from the old export bound up to the new raw data.
    expected_args = [
        GcsfsIngestViewExportArgs(
            ingest_view_name="ingest_view",
            upper_bound_datetime_prev=_DATE_1,
            upper_bound_datetime_to_export=_DATE_2,
        )
    ]
    self.assertListEqual(args, expected_args)
def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized file
    name, then drains all task queues.

    Patches get_region for entity matching and forces local persistence for
    the duration of the run. Fix: the patches are now undone in a finally
    block, so a failing ingest run can no longer leak the patched
    get_region/os.environ state into subsequent tests.
    """
    get_region_patcher = patch(
        "recidiviz.persistence.entity_matching.state."
        "base_state_matching_delegate.get_region")
    mock_get_region = get_region_patcher.start()
    mock_get_region.return_value = self._fake_region()

    environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
    environ_patcher.start()

    try:
        file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                     if self.controller.region.
                     is_raw_vs_ingest_file_name_detection_enabled() else None)

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.are_ingest_view_exports_enabled_in_env():
            # Register and run an export job covering the last day so the
            # controller has an ingest view file to process.
            now = datetime.datetime.utcnow()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            # Legacy path: drop a normalized fixture file directly into the
            # fake GCS bucket.
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        run_task_queues_to_empty(self.controller)
    finally:
        # Always unpatch so a failure above cannot affect other tests.
        get_region_patcher.stop()
        environ_patcher.stop()
def create_direct_ingest_ingest_view_export_task(
        self, region: Region,
        ingest_view_export_args: GcsfsIngestViewExportArgs):
    """Queues an ingest view export task for the region on the BQ
    import/export queue."""
    # The task id embeds the args' tag so duplicate exports are detectable.
    task_id = _build_task_id(
        region.region_code,
        task_id_tag=ingest_view_export_args.task_id_tag(),
        prefix_only=False)
    self.cloud_task_client.create_task(
        task_id=task_id,
        queue_name=DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2,
        relative_uri=f'/direct/ingest_view_export?region={region.region_code}',
        body=self._get_body_from_args(ingest_view_export_args),
    )
def create_direct_ingest_ingest_view_export_task(
        self, region: Region,
        ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
    """Queues a task that will export one ingest view file for the region."""
    # The task id embeds the args' tag so duplicate exports are detectable.
    queue_manager = self.bq_import_export_cloud_task_queue_manager
    queue_manager.create_task(
        task_id=_build_task_id(
            region.region_code,
            task_id_tag=ingest_view_export_args.task_id_tag(),
            prefix_only=False,
        ),
        relative_uri=f"/direct/ingest_view_export?region={region.region_code}",
        body=self._get_body_from_args(ingest_view_export_args),
    )
def test_exportViewForArgs_noExistingMetadata(self):
    """Exporting args that were never registered should raise a ValueError
    and make no BigQuery calls at all."""
    # Arrange
    region = self.create_fake_region()
    export_manager = self.create_export_manager(region)
    unregistered_args = GcsfsIngestViewExportArgs(
        ingest_view_name='ingest_view',
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2)

    # Act
    with pytest.raises(ValueError):
        export_manager.export_view_for_args(unregistered_args)

    # Assert: nothing was queried, exported, or deleted.
    self.mock_client.create_table_from_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()
def test_register_ingest_view_export_file_name_already_exists_raises(self):
    """Registering an export file name twice for the same metadata row must
    raise a ValueError."""
    export_args = GcsfsIngestViewExportArgs(
        ingest_view_name='file_tag',
        upper_bound_datetime_prev=datetime.datetime(2015, 1, 2, 2, 2, 2, 2),
        upper_bound_datetime_to_export=datetime.datetime(
            2015, 1, 2, 3, 3, 3, 3))
    metadata_entity = self.metadata_manager.register_ingest_file_export_job(
        export_args)

    def register_file_name():
        self.metadata_manager.register_ingest_view_export_file_name(
            metadata_entity,
            self._make_unprocessed_path(
                'bucket/file_tag.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW))

    # First registration succeeds; the identical second one raises.
    register_file_name()
    with self.assertRaises(ValueError):
        register_file_name()
def test_exportViewForArgs_ingestViewExportsDisabled(self) -> None:
    """When ingest view exports are disabled for the region, exporting must
    raise and perform no BigQuery work."""
    # Arrange
    region = self.create_fake_region(ingest_view_exports_enabled=False)
    export_manager = self.create_export_manager(region)
    disabled_args = GcsfsIngestViewExportArgs(
        ingest_view_name="ingest_view",
        upper_bound_datetime_prev=_DATE_1,
        upper_bound_datetime_to_export=_DATE_2,
    )

    # Act
    with pytest.raises(ValueError):
        export_manager.export_view_for_args(disabled_args)

    # Assert: no dataset creation, queries, exports, or deletions happened.
    self.mock_client.create_dataset_if_necessary.assert_not_called()
    self.mock_client.run_query_async.assert_not_called()
    self.mock_client.export_query_results_to_cloud_storage.assert_not_called()
    self.mock_client.delete_table.assert_not_called()
def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized file
    name, then drains all task queues.

    Forces local persistence via an os.environ patch. Fix: the patch is now
    undone in a finally block, so a failing ingest run can no longer leak
    PERSIST_LOCALLY into subsequent tests.
    """
    environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
    environ_patcher.start()
    try:
        file_type = GcsfsDirectIngestFileType.INGEST_VIEW

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.is_ingest_launched_in_env():
            # Register and run an export job covering the last day so the
            # controller has an ingest view file to process.
            now = datetime.datetime.now(tz=pytz.UTC)
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
                output_bucket_name=self.controller.ingest_bucket_path.
                bucket_name,
            )

            self.controller.file_metadata_manager.register_ingest_file_export_job(
                ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            # Otherwise drop a normalized fixture file directly into the
            # fake GCS bucket.
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        run_task_queues_to_empty(self.controller)
    finally:
        # Always unpatch so a failure above cannot affect other tests.
        environ_patcher.stop()
def run_parse_file_test(self, expected: IngestInfo,
                        fixture_file_name: str) -> IngestInfo:
    """Runs a test that reads and parses a given fixture file. Returns the
    parsed IngestInfo object for tests to run further validations."""
    args = ingest_args_for_fixture_file(self.controller,
                                        f'{fixture_file_name}.csv')

    if not isinstance(self.controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(self.controller.fs)}]")

    if self.controller.region.are_ingest_view_exports_enabled_in_env():
        # Regions with ingest view exports enabled materialize the fixture
        # via a real export job rather than adding the file directly.
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=fixture_file_name,
            upper_bound_datetime_to_export=datetime.datetime.utcnow(),
            upper_bound_datetime_prev=None)
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        self.controller.fs.test_add_path(args.file_path)

    # pylint:disable=protected-access
    fixture_contents_handle = self.controller._get_contents_handle(args)
    final_info = self.controller._parse(args, fixture_contents_handle)

    # Print both objects so a mismatch is easy to diff in test output.
    print_visible_header_label('FINAL')
    print(final_info)

    print_visible_header_label('EXPECTED')
    print(expected)

    self.assertEqual(expected, final_info)

    return final_info