def test_copy_to_processed_ok(
    mock_s3_service_ok, mock_dataset_create_distribution_ok, mock_status, mocker
):
    mocker.spy(S3Service, "copy")
    mocker.spy(Dataset, "create_distribution")

    lambda_event = test_data.copy_event("processed")
    response = handlers.write_s3(lambda_event, {})

    assert response == asdict(
        StepData(
            status="OK",
            errors=[],
            s3_input_prefixes={
                test_data.dataset_id: test_data.s3_output_prefix_processed
            },
        )
    )
    S3Service.copy.assert_called_once_with(
        ANY, test_data.s3_sources, test_data.s3_output_prefix_processed
    )
    Dataset.create_distribution.assert_called_once_with(
        ANY,
        test_data.dataset_id,
        test_data.version,
        test_data.edition,
        {
            "distribution_type": "file",
            "content_type": "application/json",
            "filenames": test_data.filenames,
        },
    )
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data

    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)

    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            )
        )

    input_data = resolve_input_data(step_data)
    validation_errors = JsonSchemaValidator(step_config.schema).validate_list(input_data)

    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            )
        )

    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )
def test_config_immutable():
    config = Config.from_lambda_event(event_pipeline_lambda_event)

    with pytest.raises(FrozenInstanceError):
        config.execution_name = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.output_dataset.version = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.step_data = StepData("", [], {"foo": "bar"})

    # StepData itself is not frozen, so mutating its fields does not raise.
    config.payload.step_data.s3_input_prefixes = {"Mutable": "ok"}
def test_no_schema_succeeds():
    lambda_event_no_schema = deepcopy(lambda_event)
    lambda_event_no_schema["payload"]["pipeline"]["task_config"][task_name] = None

    result = validate_json(lambda_event_no_schema, {})

    assert result == asdict(
        StepData(
            input_events=input_events,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )
def test_validation_failed(validation_failure):
    result = validate_json(lambda_event, {})

    JsonSchemaValidator.validate_list.assert_called_once_with(
        self=ANY, data=input_events
    )
    assert result == asdict(
        StepData(
            input_events=input_events,
            status="VALIDATION_FAILED",
            errors=validation_errors,
        )
    )
def test_validation_success(validation_success):
    result = validate_json(lambda_event, {})

    JsonSchemaValidator.validate_list.assert_called_once_with(
        self=ANY, data=input_events
    )
    assert result == asdict(
        StepData(
            input_events=input_events,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )
def export(self):
    inputs = self.read_csv()
    s3_prefix = self.s3_prefix()
    outputs = []
    schema = self.task_config.schema
    errors = []

    try:
        for filename, source in inputs:
            out_prefix = f"s3://{BUCKET}/{s3_prefix}{filename}"
            if self.task_config.chunksize:
                outputs.extend(
                    self._parallel_export(filename, source, schema, out_prefix)
                )
            else:
                outputs.append(self._export(source, schema, out_prefix))
    except OutOfBoundsDatetime as e:
        errors.append({"error": "OutOfBoundsDatetime", "message": str(e)})
    except ValueError as e:
        errors.append({"error": "ValueError", "message": str(e)})

    if errors:
        log_add(errors=errors)
        return asdict(
            StepData(
                status="CONVERSION_FAILED",
                errors=errors,
                s3_input_prefixes={
                    self.config.payload.output_dataset.id: s3_prefix
                },
            )
        )

    log_add(parquetfiles=outputs)
    return asdict(
        StepData(
            status="CONVERSION_SUCCESS",
            errors=[],
            s3_input_prefixes={
                self.config.payload.output_dataset.id: s3_prefix
            },
        )
    )
def test_config_from_event_pipeline_lambda_event():
    config = Config.from_lambda_event(event_pipeline_lambda_event)

    assert config.execution_name == "test_execution"
    assert config.task == "kinesis_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"kinesis_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(id="some-id", version="1")
    assert config.payload.step_data == StepData(
        input_events=[{"foo": "bar"}, {"foo": "car"}],
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 2
def test_copy_to_processed_latest_edition_not_latest(
    mock_s3_service_ok,
    mock_dataset_create_distribution_ok,
    mock_status,
    mock_get_latest_edition,
    mocker,
):
    mocker.spy(S3Service, "copy")
    mocker.spy(S3Service, "delete_from_prefix")
    mocker.spy(Dataset, "create_distribution")

    not_latest_edition = "20190120T133701"
    expected_s3_output_prefix = re.sub(
        "edition=.*/",
        f"edition={not_latest_edition}/",
        test_data.s3_output_prefix_processed,
    )
    lambda_event = test_data.copy_event(
        "processed", write_to_latest=True, edition=not_latest_edition
    )
    response = handlers.write_s3(lambda_event, {})

    assert response == asdict(
        StepData(
            status="OK",
            errors=[],
            s3_input_prefixes={test_data.dataset_id: expected_s3_output_prefix},
        )
    )
    assert S3Service.delete_from_prefix.call_count == 0
    S3Service.copy.assert_called_once_with(
        ANY, test_data.s3_sources, expected_s3_output_prefix
    )
    Dataset.create_distribution.assert_called_once_with(
        ANY,
        test_data.dataset_id,
        test_data.version,
        not_latest_edition,
        {
            "distribution_type": "file",
            "content_type": "application/json",
            "filenames": test_data.filenames,
        },
    )
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)
    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)

    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]

    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)

    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)

    return asdict(StepData(input_events=input_events, status="OK", errors=[]))
def test_config_from_s3_pipeline_lambda_event():
    config = Config.from_lambda_event(s3_pipeline_lambda_event)

    assert config.execution_name == "test_execution"
    assert config.task == "s3_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"s3_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(
        id="some-id", version="1", edition="some-edition", s3_prefix="some-s3-prefix"
    )
    assert config.payload.step_data == StepData(
        s3_input_prefixes={
            "input1": "some-s3-prefix",
            "input2": "some-s3-prefix",
            "input3": "some-s3-prefix",
        },
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 3
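# The config tests above pin down the shape of StepData without showing its
# definition. Below is a minimal sketch, assuming a plain (non-frozen) dataclass
# whose input_count is derived from whichever of input_events /
# s3_input_prefixes is populated. Field order and defaults are inferred from
# the constructor calls in these tests, not taken from the library's actual
# code, so the real definition may differ.
from dataclasses import dataclass
from typing import Optional


@dataclass
class StepData:
    status: str
    errors: list
    s3_input_prefixes: Optional[dict] = None
    input_events: Optional[list] = None

    @property
    def input_count(self) -> int:
        # Count whichever kind of input this step received.
        if self.input_events is not None:
            return len(self.input_events)
        if self.s3_input_prefixes is not None:
            return len(self.s3_input_prefixes)
        return 0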
def test_copy_to_cleaned_ok(
    mock_s3_service_ok, mock_dataset_create_distribution_ok, mock_status, mocker
):
    mocker.spy(S3Service, "copy")
    mocker.spy(Dataset, "create_distribution")

    lambda_event = test_data.copy_event("cleaned")
    response = handlers.write_s3(lambda_event, {})

    assert response == asdict(
        StepData(
            status="OK",
            errors=[],
            s3_input_prefixes={
                test_data.dataset_id: test_data.s3_output_prefix_cleaned
            },
        )
    )
    S3Service.copy.assert_called_once_with(
        ANY, test_data.s3_sources, test_data.s3_output_prefix_cleaned
    )
    # No distribution should be created for the "cleaned" stage.
    assert Dataset.create_distribution.call_count == 0
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage
    )

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files, content_type)
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
        output_dataset.id, output_dataset.version, output_dataset.edition
    ):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes, status="OK", errors=[])

    # TODO: This is just to verify that we have a correct implementation of the
    # status API. Temporary: if we are writing to /latest, mark the run as
    # complete. Once this is live and we see what the status API can return to
    # the CLI, we will extend the status body with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)

    return asdict(response)
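# write_s3 above only touches three attributes of TaskConfig: output_stage,
# write_to_latest and content_type. Below is a minimal sketch of a compatible
# class, assuming the task_config dict uses the same key names as the
# attributes; the real definition and its key names may differ.
from dataclasses import dataclass
from typing import Optional


@dataclass
class TaskConfig:
    output_stage: str
    write_to_latest: bool = False
    content_type: Optional[str] = None

    @classmethod
    def from_dict(cls, config: dict) -> "TaskConfig":
        # Hypothetical key names, mirroring the attribute names used in write_s3.
        return cls(
            output_stage=config["output_stage"],
            write_to_latest=config.get("write_to_latest", False),
            content_type=config.get("content_type"),
        )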