Example #1
def test_copy_to_processed_ok(
    mock_s3_service_ok, mock_dataset_create_distribution_ok, mock_status, mocker
):
    mocker.spy(S3Service, "copy")
    mocker.spy(Dataset, "create_distribution")
    lambda_event = test_data.copy_event("processed")
    response = handlers.write_s3(lambda_event, {})

    assert response == asdict(
        StepData(
            status="OK",
            errors=[],
            s3_input_prefixes={
                test_data.dataset_id: test_data.s3_output_prefix_processed
            },
        )
    )

    S3Service.copy.assert_called_once_with(
        ANY, test_data.s3_sources, test_data.s3_output_prefix_processed
    )
    Dataset.create_distribution.assert_called_once_with(
        ANY,
        test_data.dataset_id,
        test_data.version,
        test_data.edition,
        {
            "distribution_type": "file",
            "content_type": "application/json",
            "filenames": test_data.filenames,
        },
    )
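
Note the leading ANY in the assertions above: mocker.spy wraps the method on the class itself, so the bound instance (self) is recorded as the first positional argument of every call. A minimal standalone illustration of that behaviour (the Greeter class below is made up for the example and is not part of the project):

from unittest.mock import ANY


class Greeter:
    def greet(self, name):
        return f"hello {name}"


def test_spy_records_self(mocker):
    mocker.spy(Greeter, "greet")
    Greeter().greet("world")
    # The instance is captured as the first positional argument, hence ANY above.
    Greeter.greet.assert_called_once_with(ANY, "world")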
Example #2
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data

    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            ))

    input_data = resolve_input_data(step_data)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate_list(input_data)

    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            ))

    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        ))
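
Both this handler and the writers below serialize a StepData instance with dataclasses.asdict. The StepData definition itself is not part of this listing; judging from how it is used in these examples, a rough sketch could look like the following (field order, defaults and the input_count property are inferred from usage, not confirmed against the real library):

from dataclasses import dataclass


@dataclass
class StepData:
    # Deliberately not frozen: Example #3 shows that step_data attributes
    # remain assignable even though the surrounding config objects are frozen.
    status: str
    errors: list
    s3_input_prefixes: dict = None
    input_events: list = None

    @property
    def input_count(self):
        # Number of inputs, whichever input representation is in use.
        if self.input_events is not None:
            return len(self.input_events)
        return len(self.s3_input_prefixes or {})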
Example #3
def test_config_immutable():
    config = Config.from_lambda_event(event_pipeline_lambda_event)
    with pytest.raises(FrozenInstanceError):
        config.execution_name = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.output_dataset.version = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.step_data = StepData("", [], {"foo": "bar"})
    config.payload.step_data.s3_input_prefixes = {"Mutable": "ok"}
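
The last line passes because frozen=True only guards attribute assignment on the frozen dataclass itself; objects it references, such as a non-frozen StepData or a plain dict, remain mutable. A minimal standalone illustration:

from dataclasses import FrozenInstanceError, dataclass


@dataclass(frozen=True)
class Outer:
    inner: dict


outer = Outer(inner={"a": 1})

try:
    outer.inner = {}  # Reassigning a field on a frozen instance raises.
except FrozenInstanceError:
    pass

outer.inner["a"] = 2  # Mutating the referenced object is still allowed.
assert outer.inner == {"a": 2}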
Example #4
def test_no_schema_succeeds():
    lambda_event_no_schema = deepcopy(lambda_event)
    lambda_event_no_schema["payload"]["pipeline"]["task_config"][task_name] = None
    result = validate_json(lambda_event_no_schema, {})
    assert result == asdict(
        StepData(
            input_events=input_events,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )
Example #5
def test_validation_failed(validation_failure):
    result = validate_json(lambda_event, {})
    JsonSchemaValidator.validate_list.assert_called_once_with(
        self=ANY, data=input_events
    )
    assert result == asdict(
        StepData(
            input_events=input_events,
            status="VALIDATION_FAILED",
            errors=validation_errors,
        )
    )
Example #6
def test_validation_success(validation_success):
    result = validate_json(lambda_event, {})
    JsonSchemaValidator.validate_list.assert_called_once_with(
        self=ANY, data=input_events
    )
    assert result == asdict(
        StepData(
            input_events=input_events,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )
Example #7
    def export(self):
        inputs = self.read_csv()
        s3_prefix = self.s3_prefix()
        outputs = []
        schema = self.task_config.schema
        errors = []
        try:
            for filename, source in inputs:
                out_prefix = f"s3://{BUCKET}/{s3_prefix}{filename}"
                if self.task_config.chunksize:
                    outputs.extend(
                        self._parallel_export(filename, source, schema,
                                              out_prefix))
                else:
                    outputs.append(self._export(source, schema, out_prefix))
        except OutOfBoundsDatetime as e:
            errors.append({"error": "OutOfBoundsDatetime", "message": str(e)})
        except ValueError as e:
            errors.append({"error": "ValueError", "message": str(e)})

        if len(errors) > 0:
            log_add(errors=errors)
            return asdict(
                StepData(
                    status="CONVERSION_FAILED",
                    errors=errors,
                    s3_input_prefixes={
                        self.config.payload.output_dataset.id: s3_prefix
                    },
                ))

        log_add(parquetfiles=outputs)
        return asdict(
            StepData(
                status="CONVERSION_SUCCESS",
                errors=[],
                s3_input_prefixes={
                    self.config.payload.output_dataset.id: s3_prefix
                },
            ))
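
The _parallel_export branch above kicks in when task_config.chunksize is set; its implementation is not shown in this listing. A simplified sketch of the underlying idea, assuming pandas' read_csv(chunksize=...) iterator with one Parquet file written per chunk (function and parameter names are illustrative, and writing directly to s3:// paths would additionally require s3fs):

import pandas as pd


def export_csv_in_chunks(source, out_prefix, chunksize):
    """Write one Parquet file per CSV chunk and return the output paths."""
    outputs = []
    for index, chunk in enumerate(pd.read_csv(source, chunksize=chunksize)):
        out_path = f"{out_prefix}.part{index}.parquet"
        chunk.to_parquet(out_path)  # Needs pyarrow or fastparquet installed.
        outputs.append(out_path)
    return outputs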
Example #8
def test_config_from_event_pipeline_lambda_event():
    config = Config.from_lambda_event(event_pipeline_lambda_event)

    assert config.execution_name == "test_execution"
    assert config.task == "kinesis_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"kinesis_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(id="some-id", version="1")
    assert config.payload.step_data == StepData(
        input_events=[{"foo": "bar"}, {"foo": "car"}],
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 2
Example #9
def test_copy_to_processed_latest_edition_not_latest(
    mock_s3_service_ok,
    mock_dataset_create_distribution_ok,
    mock_status,
    mock_get_latest_edition,
    mocker,
):
    mocker.spy(S3Service, "copy")
    mocker.spy(S3Service, "delete_from_prefix")
    mocker.spy(Dataset, "create_distribution")

    not_latest_edition = "20190120T133701"
    expected_s3_output_prefix = re.sub(
        "edition=.*/",
        f"edition={not_latest_edition}/",
        test_data.s3_output_prefix_processed,
    )
    lambda_event = test_data.copy_event(
        "processed", write_to_latest=True, edition=not_latest_edition
    )
    response = handlers.write_s3(lambda_event, {})

    assert response == asdict(
        StepData(
            status="OK",
            errors=[],
            s3_input_prefixes={test_data.dataset_id: expected_s3_output_prefix},
        )
    )

    assert S3Service.delete_from_prefix.call_count == 0

    S3Service.copy.assert_called_once_with(
        ANY, test_data.s3_sources, expected_s3_output_prefix
    )
    Dataset.create_distribution.assert_called_once_with(
        ANY,
        test_data.dataset_id,
        test_data.version,
        not_latest_edition,
        {
            "distribution_type": "file",
            "content_type": "application/json",
            "filenames": test_data.filenames,
        },
    )
Example #10
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)

    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)

    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]

    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)

    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)

    return asdict(StepData(input_events=input_events, status="OK", errors=[]))
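
The target stream name follows a fixed pattern built from the dataset's confidentiality, id and version. With purely hypothetical values it would look like this:

confidentiality, dataset_id, version = "green", "my-dataset", "1"  # hypothetical values
output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
assert output_stream_name == "dp.green.my-dataset.processed.1.json"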
Example #11
def test_config_from_s3_pipeline_lambda_event():
    config = Config.from_lambda_event(s3_pipeline_lambda_event)

    assert config.execution_name == "test_execution"
    assert config.task == "s3_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"s3_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(
        id="some-id", version="1", edition="some-edition", s3_prefix="some-s3-prefix"
    )
    assert config.payload.step_data == StepData(
        s3_input_prefixes={
            "input1": "some-s3-prefix",
            "input2": "some-s3-prefix",
            "input3": "some-s3-prefix",
        },
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 3
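
The s3_pipeline_lambda_event fixture itself is not reproduced here, but the assertions above pin down its shape fairly precisely. An event along the following lines would satisfy them (the dictionary layout is reconstructed from the assertions; the exact key names in the real fixture are not confirmed):

s3_pipeline_lambda_event = {
    "execution_name": "test_execution",
    "task": "s3_writer",
    "payload": {
        "pipeline": {
            "id": "some-id",
            "task_config": {"s3_writer": {"some_config": "some_value"}},
        },
        "output_dataset": {
            "id": "some-id",
            "version": "1",
            "edition": "some-edition",
            "s3_prefix": "some-s3-prefix",
        },
        "step_data": {
            "s3_input_prefixes": {
                "input1": "some-s3-prefix",
                "input2": "some-s3-prefix",
                "input3": "some-s3-prefix",
            },
            "status": "PENDING",
            "errors": [],
        },
    },
}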
Example #12
def test_copy_to_cleaned_ok(
    mock_s3_service_ok, mock_dataset_create_distribution_ok, mock_status, mocker
):
    mocker.spy(S3Service, "copy")
    mocker.spy(Dataset, "create_distribution")

    lambda_event = test_data.copy_event("cleaned")
    response = handlers.write_s3(lambda_event, {})

    assert response == asdict(
        StepData(
            status="OK",
            errors=[],
            s3_input_prefixes={
                test_data.dataset_id: test_data.s3_output_prefix_cleaned
            },
        )
    )

    S3Service.copy.assert_called_once_with(
        ANY, test_data.s3_sources, test_data.s3_output_prefix_cleaned
    )
    assert Dataset.create_distribution.call_count == 0
Example #13
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage)

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files,
                                             content_type)
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
            output_dataset.id, output_dataset.version, output_dataset.edition):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes,
                        status="OK",
                        errors=[])

    # TODO: Temporary, just to verify that the status API integration is correct:
    # when we write to /latest, mark the run as complete. Once we see what the
    # status API can return to the CLI, this will be extended with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)
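
The output prefix is derived from the dataset's s3_prefix template by substituting the %stage% placeholder with the configured output stage, as in this small illustration (the prefix value is hypothetical):

template = "%stage%/green/my-dataset/version=1/edition=20200101T000000/"
output_prefix = template.replace("%stage%", "processed")
assert output_prefix == "processed/green/my-dataset/version=1/edition=20200101T000000/"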