def test_is_task_queued_has_tasks(self):
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))
    full_task_name = \
        _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
    info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name='queue_name',
        task_names=[
            'projects/path/to/random_task',
            f'projects/path/to/{full_task_name}'
        ])
    # Re-create the args with a fresh ingest_time and the same file path.
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))

    # Act
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertTrue(gcsfs_args_queued)
def path_for_fixture_file_in_test_gcs_directory(
    *,
    bucket_path: GcsfsBucketPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType],
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            raise ValueError(
                "Expected file_type for path normalization but got None")
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    file_path = GcsfsFilePath.from_directory_and_file_name(
        dir_path=bucket_path,
        file_name=file_path_str,
    )
    if not isinstance(file_path, GcsfsFilePath):
        raise ValueError(
            f"Expected type GcsfsFilePath, found {type(file_path)} for path: "
            f"{file_path.abs_path()}")
    return file_path
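
# A minimal usage sketch for the helper above; the bucket name and fixture
# filename are hypothetical, and it assumes the same recidiviz GCS path types
# imported by the surrounding code.
fixture_path = path_for_fixture_file_in_test_gcs_directory(
    bucket_path=GcsfsBucketPath(
        bucket_name='recidiviz-456-direct-ingest-state-us-xx'),
    filename='tagA.csv',
    should_normalize=True,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    dt=datetime.datetime(2020, 6, 1, 12, 0, 0),
)
# fixture_path is the bucket path joined with a normalized file name that
# embeds the timestamp and file type.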
def _create_split_file_path(
    self,
    original_file_path: GcsfsFilePath,
    output_dir: GcsfsDirectoryPath,
    split_num: int,
) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)
    rank_str = str(split_num + 1).zfill(5)
    updated_file_name = (
        f"{parts.stripped_file_name}_{rank_str}"
        f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
        f".{parts.extension}")

    file_type = (
        GcsfsDirectIngestFileType.INGEST_VIEW
        if self.region.is_raw_vs_ingest_file_name_detection_enabled()
        else GcsfsDirectIngestFileType.UNSPECIFIED)

    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            file_type=file_type,
                                            dt=parts.utc_upload_datetime),
    )
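
# A hedged re-derivation of the split name built above; the tag, line limit,
# extension, and SPLIT_FILE_SUFFIX value ('file_split') are assumptions for
# illustration, not values taken from the real module.
SPLIT_FILE_SUFFIX = 'file_split'  # assumed value of the module constant
rank_str = str(0 + 1).zfill(5)  # first split -> '00001'
updated_file_name = f"tagA_{rank_str}_{SPLIT_FILE_SUFFIX}_size300.csv"
assert updated_file_name == 'tagA_00001_file_split_size300.csv'
# That name is then normalized with the parent file's upload datetime, so the
# splits sort alongside the original in the ingest bucket.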
def _do_check_in_for_file(self, path: str) -> None:
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=self.datetime))
    self._copy_to_storage(path, normalized_file_name)
def test_raw_data_import(self, mock_supported, mock_region,
                         mock_environment):
    mock_supported.return_value = ['us_xx']

    region_code = 'us_xx'

    mock_environment.return_value = 'staging'
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging',
                                           ingestor=mock_controller)

    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                'bucket/raw_data_path.csv',
                file_type=GcsfsDirectIngestFileType.RAW_DATA)))
    request_args = {
        'region': region_code,
    }
    body = {
        'cloud_task_args': import_args.to_serializable(),
        'args_type': 'GcsfsRawDataBQImportArgs',
    }
    body_encoded = json.dumps(body).encode()
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.post('/raw_data_import',
                                query_string=request_args,
                                headers=headers,
                                data=body_encoded)
    self.assertEqual(200, response.status_code)
    mock_controller.do_raw_data_import.assert_called_with(import_args)
def path_for_fixture_file_in_test_gcs_directory(
    *,
    directory: GcsfsDirectoryPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            file_type = GcsfsDirectIngestFileType.UNSPECIFIED
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    file_path = GcsfsPath.from_bucket_and_blob_name(
        bucket_name=directory.bucket_name,
        blob_name=os.path.join(directory.relative_path, file_path_str),
    )
    if not isinstance(file_path, GcsfsFilePath):
        raise ValueError(
            f"Expected type GcsfsFilePath, found {type(file_path)} for path: "
            f"{file_path.abs_path()}")
    return file_path
def _normalized_path_for_filename(self, filename: str,
                                  dt: datetime.datetime) -> GcsfsFilePath:
    normalized_path = \
        to_normalized_unprocessed_file_path(
            os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename), dt)
    return GcsfsFilePath.from_absolute_path(normalized_path)
def _upload_file(self, path: str) -> None:
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=self.datetime))
    self._copy_to_ingest_bucket(path, normalized_file_name)
def _make_unprocessed_path(
    path_str: str,
    file_type: GcsfsDirectIngestFileType,
    dt=datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
) -> GcsfsFilePath:
    normalized_path_str = to_normalized_unprocessed_file_path(
        original_file_path=path_str, file_type=file_type, dt=dt)
    return GcsfsFilePath.from_absolute_path(normalized_path_str)
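
# A hedged usage sketch for _make_unprocessed_path; the bucket and file name
# are hypothetical, and the exact normalized format is whatever
# to_normalized_unprocessed_file_path produces in this version of the code.
path = _make_unprocessed_path('bucket/tagA.csv',
                              GcsfsDirectIngestFileType.INGEST_VIEW)
# The returned GcsfsFilePath embeds the fixed 2015-01-02 default timestamp,
# which keeps fixture paths deterministic across test runs.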
def ingest_args_for_fixture_file(controller: GcsfsDirectIngestController,
                                 filename: str) -> GcsfsIngestArgs:
    original_path = os.path.join(controller.ingest_directory_path, filename)
    file_path = to_normalized_unprocessed_file_path(original_path)
    return GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path=file_path,
    )
def _normalized_path_for_filename(self, filename: str,
                                  file_type: GcsfsDirectIngestFileType,
                                  dt: datetime.datetime) -> GcsfsFilePath:
    normalized_path = \
        to_normalized_unprocessed_file_path(
            original_file_path=os.path.join(
                self._INGEST_BUCKET_PATH.abs_path(), filename),
            file_type=file_type,
            dt=dt)
    return GcsfsFilePath.from_absolute_path(normalized_path)
def _upload_file(
        self, path_with_timestamp: Tuple[str, datetime.datetime]) -> None:
    path, timestamp = path_with_timestamp
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=timestamp))
    full_file_upload_path = GcsfsFilePath.from_directory_and_file_name(
        self.gcs_destination_path, normalized_file_name)
    self._copy_to_ingest_bucket(path, full_file_upload_path)
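
# A hedged driver sketch for _upload_file above; the local paths, timestamps,
# and the 'uploader' instance name are hypothetical stand-ins for however the
# enclosing class is actually constructed.
files_with_timestamps = [
    ('/tmp/tagA.csv', datetime.datetime(2021, 3, 1, 8, 30)),
    ('/tmp/tagB.csv', datetime.datetime(2021, 3, 1, 8, 45)),
]
for pair in files_with_timestamps:
    uploader._upload_file(pair)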
def test_is_task_queued_has_tasks(self):
    # Arrange
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)
    full_task_name = \
        _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
    info = CloudTaskQueueInfo(queue_name='queue_name',
                              task_names=[
                                  'projects/path/to/random_task',
                                  f'projects/path/to/{full_task_name}'
                              ])
    # Re-create the args with a fresh ingest_time and the same file path.
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)

    # Act
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertTrue(gcsfs_args_queued)
def test_create_direct_ingest_process_job_task_secondary(
        self, mock_client: mock.MagicMock,
        mock_uuid: mock.MagicMock) -> None:
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        "bucket/ingest_view_name.csv",
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    ingest_args = GcsfsIngestArgs(
        datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(file_path),
    )
    body = {
        "cloud_task_args": ingest_args.to_serializable(),
        "args_type": "GcsfsIngestArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    date = "2019-07-20"
    queue_path = "us-xx-process-queue-path"
    queue_name = "direct-ingest-state-us-xx-process-job-queue"
    task_name = "{}/{}-{}-{}".format(
        DIRECT_INGEST_STATE_PROCESS_JOB_QUEUE_V2, _REGION.region_code, date,
        uuid)
    url_params = {"region": _REGION.region_code, "file_path": file_path}
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": f"/direct/process_job?{urlencode(url_params)}",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().create_direct_ingest_process_job_task(
        _REGION, DirectIngestInstance.SECONDARY, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id,
        QUEUES_REGION,
        queue_name,
    )
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_create_direct_ingest_raw_data_import_task(
        self, mock_client: mock.MagicMock,
        mock_uuid: mock.MagicMock) -> None:
    # Arrange
    raw_data_path = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path(
            "bucket/raw_data_path.csv",
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        ))
    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=raw_data_path)
    body = {
        "cloud_task_args": import_args.to_serializable(),
        "args_type": "GcsfsRawDataBQImportArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    date = "2019-07-20"
    queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"
    task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + "/{}-{}-{}".format(
        _REGION.region_code, date, uuid)
    url_params = {
        "region": _REGION.region_code,
        "file_path": raw_data_path.abs_path(),
    }
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri":
                f"/direct/raw_data_import?{urlencode(url_params)}",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
    ).create_direct_ingest_raw_data_import_task(
        _REGION, DirectIngestInstance.PRIMARY, import_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION,
        DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def _fetch(
    self,
    connection: pysftp.Connection,
    file_path: str,
    file_timestamp: datetime.datetime,
) -> None:
    """Fetches data files from the SFTP, tracking which items downloaded and
    failed to download."""
    normalized_sftp_path = os.path.normpath(file_path)
    normalized_upload_path = GcsfsFilePath.from_directory_and_file_name(
        dir_path=self.download_dir,
        file_name=os.path.basename(
            to_normalized_unprocessed_file_path(
                normalized_sftp_path,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
                dt=file_timestamp,
            )),
    )
    if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
            normalized_upload_path
    ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
            normalized_upload_path):
        logging.info("Downloading %s into %s", normalized_sftp_path,
                     self.download_dir)
        try:
            path = GcsfsFilePath.from_directory_and_file_name(
                dir_path=self.download_dir, file_name=normalized_sftp_path)
            self.gcsfs.upload_from_contents_handle_stream(
                path=path,
                contents_handle=GcsfsSftpFileContentsHandle(
                    sftp_connection=connection, local_file_path=file_path),
                content_type=BYTES_CONTENT_TYPE,
            )
            logging.info("Post processing %s", path.uri())
            self.downloaded_items.append((
                self.delegate.post_process_downloads(path, self.gcsfs),
                file_timestamp,
            ))
        except IOError as e:
            logging.info(
                "Could not download %s into %s: %s",
                normalized_sftp_path,
                self.download_dir,
                e.args,
            )
            self.unable_to_download_items.append(file_path)
    else:
        logging.info(
            "Skipping downloading %s because it has already been previously "
            "downloaded for ingest.",
            normalized_sftp_path,
        )
        self.skipped_files.append(file_path)
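
# A hedged restatement of the skip condition in _fetch as a standalone
# predicate; 'manager' stands in for
# self.postgres_direct_ingest_file_metadata_manager.
def _should_download(manager, normalized_upload_path) -> bool:
    # Only download files that the metadata store has neither discovered nor
    # finished processing.
    return (not manager.has_raw_file_been_discovered(normalized_upload_path)
            and not manager.has_raw_file_been_processed(normalized_upload_path))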
def path_for_fixture_file(
    controller: GcsfsDirectIngestController,
    filename: str,
    should_normalize: bool,
    dt: Optional[datetime.datetime] = None
) -> Union[GcsfsFilePath, GcsfsDirectoryPath]:
    file_path_str = filename

    if should_normalize:
        file_path_str = to_normalized_unprocessed_file_path(file_path_str, dt)

    return GcsfsPath.from_bucket_and_blob_name(
        bucket_name=controller.ingest_directory_path.bucket_name,
        blob_name=os.path.join(controller.ingest_directory_path.relative_path,
                               file_path_str))
def _create_split_file_path(self, original_file_path: GcsfsFilePath,
                            output_dir: GcsfsDirectoryPath,
                            split_num: int) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)
    rank_str = str(split_num + 1).zfill(5)
    existing_suffix = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    updated_file_name = (
        f'{parts.file_tag}{existing_suffix}_{rank_str}'
        f'_{SPLIT_FILE_SUFFIX}_size{self.file_split_line_limit}'
        f'.{parts.extension}')
    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            dt=parts.utc_upload_datetime))
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client: MagicMock, mock_uuid: MagicMock,
        mock_datetime: MagicMock) -> None:
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        "bucket/file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
    ingest_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(file_path),
    )
    body = {
        "cloud_task_args": ingest_args.to_serializable(),
        "args_type": "GcsfsIngestArgs",
    }
    body_encoded = json.dumps(body).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    date = "2019-07-20"
    mock_datetime.date.today.return_value = date
    queue_path = f"{_REGION.shared_queue}-path"
    task_name = _REGION.get_queue_name() + "/{}-{}-{}".format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri":
                f"/direct/process_job?region={_REGION.region_code}",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().create_direct_ingest_process_job_task(
        _REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_is_task_queued_no_tasks(self):
    # Arrange
    info = CloudTaskQueueInfo(queue_name='queue_name', task_names=[])
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    args = IngestArgs(ingest_time=datetime.datetime.now())
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)

    # Act
    basic_args_queued = info.is_task_queued(_REGION, args)
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertFalse(basic_args_queued)
    self.assertFalse(gcsfs_args_queued)
    self.assertFalse(info.is_task_queued(_REGION, gcsfs_args))
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client, mock_uuid, mock_datetime):
    # Arrange
    project_id = 'recidiviz-456'
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    ingest_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime(year=2019, month=7, day=20),
            file_path=GcsfsFilePath.from_absolute_path(file_path))
    body = {
        'cloud_task_args': ingest_args.to_serializable(),
        'args_type': 'GcsfsIngestArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    date = '2019-07-20'
    mock_datetime.date.today.return_value = date
    queue_path = _REGION.shared_queue + '-path'
    task_name = _REGION.shared_queue + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
            'body': body_encoded
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(project_id=project_id).\
        create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_create_direct_ingest_raw_data_import_task(self, mock_client,
                                                   mock_uuid):
    # Arrange
    project_id = 'recidiviz-456'
    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                'bucket/raw_data_path.csv',
                file_type=GcsfsDirectIngestFileType.RAW_DATA)))
    body = {
        'cloud_task_args': import_args.to_serializable(),
        'args_type': 'GcsfsRawDataBQImportArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    date = '2019-07-20'
    queue_path = _REGION.shared_queue + '-path'
    task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/raw_data_import?region={_REGION.region_code}',
            'body': body_encoded
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl(
        project_id=project_id).create_direct_ingest_raw_data_import_task(
            _REGION, import_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def path_for_fixture_file_in_test_gcs_directory(
    *,
    directory: GcsfsDirectoryPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    dt: Optional[datetime.datetime] = None
) -> Union[GcsfsFilePath, GcsfsDirectoryPath]:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            file_type = GcsfsDirectIngestFileType.UNSPECIFIED
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    return GcsfsPath.from_bucket_and_blob_name(
        bucket_name=directory.bucket_name,
        blob_name=os.path.join(directory.relative_path, file_path_str))
def test_raw_data_import(
    self,
    mock_supported: mock.MagicMock,
    mock_region: mock.MagicMock,
    mock_environment: mock.MagicMock,
) -> None:
    mock_supported.return_value = ["us_xx"]

    region_code = "us_xx"

    mock_environment.return_value = "staging"
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment="staging",
                                           ingestor=mock_controller)

    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                "bucket/raw_data_path.csv",
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
            )))
    request_args = {
        "region": region_code,
    }
    body = {
        "cloud_task_args": import_args.to_serializable(),
        "args_type": "GcsfsRawDataBQImportArgs",
    }
    body_encoded = json.dumps(body).encode()
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.post(
        "/raw_data_import",
        query_string=request_args,
        headers=headers,
        data=body_encoded,
    )
    self.assertEqual(200, response.status_code)
    mock_controller.do_raw_data_import.assert_called_with(import_args)
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client, mock_uuid, mock_datetime):
    # Arrange
    ingest_args = GcsfsIngestArgs(
        datetime.datetime(year=2019, month=7, day=20),
        file_path=to_normalized_unprocessed_file_path('file_path.csv'))
    body = {
        'ingest_args': ingest_args.to_serializable(),
        'args_type': 'GcsfsIngestArgs'
    }
    body_encoded = json.dumps(body).encode()
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid
    date = '2019-07-20'
    mock_datetime.date.today.return_value = date
    queue_path = _REGION.shared_queue + '-path'
    task_name = _REGION.shared_queue + '/{}-{}-{}'.format(
        _REGION.region_code, date, uuid)
    task = tasks.types.Task(
        name=task_name,
        app_engine_http_request={
            'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
            'body': body_encoded
        })
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().\
        create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        metadata.project_id(), metadata.region(), _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        queue_path, task)
def _create_split_file_path(
    self,
    original_file_path: GcsfsFilePath,
    output_dir: GcsfsDirectoryPath,
    split_num: int,
) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)
    rank_str = str(split_num + 1).zfill(5)
    updated_file_name = (
        f"{parts.stripped_file_name}_{rank_str}"
        f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
        f".{parts.extension}")
    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(
            updated_file_name,
            file_type=parts.file_type,
            dt=parts.utc_upload_datetime,
        ),
    )
def test_is_task_queued_no_tasks(self):
    # Arrange
    info = ProcessIngestJobCloudTaskQueueInfo(queue_name='queue_name',
                                              task_names=[])
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    args = IngestArgs(ingest_time=datetime.datetime.now())
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))

    # Act
    basic_args_queued = info.is_task_queued(_REGION, args)
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertFalse(basic_args_queued)
    self.assertFalse(gcsfs_args_queued)
    self.assertFalse(info.is_task_queued(_REGION, gcsfs_args))
def _normalized_path_for_filename(self, filename: str,
                                  dt: datetime.datetime):
    normalized_path = \
        to_normalized_unprocessed_file_path(
            os.path.join(self._INGEST_DIRECTORY_PATH, filename), dt)
    return normalized_path
def _do_check_in_for_file(self, path: str) -> None:
    normalized_file_name = os.path.basename(
        to_normalized_unprocessed_file_path(
            path, dt=datetime.datetime.fromisoformat(self.date)))
    self._copy_to_storage(path, normalized_file_name)