Exemplo n.º 1
0
def _columns_to_metadata(
        columns: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:
    return ({
        "schema":
        MetadataValue.table_schema(
            TableSchema(columns=[
                TableColumn(
                    name=name,
                    type=metadata.get("data_type") or "?",
                    description=metadata.get("description"),
                ) for name, metadata in columns.items()
            ]))
    } if len(columns) > 0 else None)
Exemplo n.º 2
0
 def materialize(_):
     # Yields one "all_types" AssetMaterialization carrying a metadata entry
     # for each value type listed below, then an Output of None.
     # The entries are built group by group; concatenating the groups keeps
     # the same order in which they are emitted.
     text_like_entries = [
         MetadataEntry("text", value="text is cool"),
         MetadataEntry("url",
                       value=MetadataValue.url("https://bigty.pe/neato")),
         MetadataEntry("path", value=MetadataValue.path("/tmp/awesome")),
         MetadataEntry("json", value={"is_dope": True}),
     ]
     artifact_entries = [
         MetadataEntry("python class",
                       value=MetadataValue.python_artifact(MetadataEntry)),
         MetadataEntry(
             "python function",
             value=MetadataValue.python_artifact(file_relative_path)),
     ]
     numeric_entries = [
         MetadataEntry("float", value=1.2),
         MetadataEntry("int", value=1),
         MetadataEntry("float NaN", value=float("nan")),
         MetadataEntry("long int", value=LONG_INT),
     ]
     reference_entries = [
         MetadataEntry("pipeline run",
                       value=MetadataValue.pipeline_run("fake_run_id")),
         MetadataEntry("my asset", value=AssetKey("my_asset")),
     ]
     tabular_entries = [
         MetadataEntry(
             "table",
             value=MetadataValue.table(
                 records=[
                     TableRecord(foo=1, bar=2),
                     TableRecord(foo=3, bar=4),
                 ]),
         ),
         MetadataEntry(
             "table_schema",
             value=TableSchema(
                 columns=[
                     TableColumn(
                         name="foo",
                         type="integer",
                         constraints=TableColumnConstraints(unique=True),
                     ),
                     TableColumn(name="bar", type="string"),
                 ],
                 constraints=TableConstraints(other=["some constraint"]),
             ),
         ),
     ]

     all_entries = (text_like_entries + artifact_entries + numeric_entries +
                    reference_entries + tabular_entries)
     yield AssetMaterialization(
         asset_key="all_types",
         description="a materialization with all metadata types",
         metadata_entries=all_entries,
     )
     yield Output(None)
Exemplo n.º 3
0
 def materialize(_):
     # Yields one "all_types" AssetMaterialization whose entries are created
     # through the EventMetadataEntry factory methods, then an Output of None.
     # The entries are built group by group; concatenating the groups keeps
     # the same order in which they are emitted.
     text_like_entries = [
         EventMetadataEntry.text("text is cool", "text"),
         EventMetadataEntry.url("https://bigty.pe/neato", "url"),
         EventMetadataEntry.fspath("/tmp/awesome", "path"),
         EventMetadataEntry.json({"is_dope": True}, "json"),
     ]
     artifact_entries = [
         EventMetadataEntry.python_artifact(EventMetadataEntry,
                                            "python class"),
         EventMetadataEntry.python_artifact(file_relative_path,
                                            "python function"),
     ]
     numeric_entries = [
         EventMetadataEntry.float(1.2, "float"),
         EventMetadataEntry.int(1, "int"),
         EventMetadataEntry.float(float("nan"), "float NaN"),
         EventMetadataEntry.int(LONG_INT, "long int"),
     ]
     reference_entries = [
         EventMetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
         EventMetadataEntry.asset(AssetKey("my_asset"), "my asset"),
     ]
     tabular_entries = [
         EventMetadataEntry.table(
             label="table",
             records=[
                 TableRecord(foo=1, bar=2),
                 TableRecord(foo=3, bar=4),
             ],
         ),
         EventMetadataEntry.table_schema(
             label="table_schema",
             schema=TableSchema(
                 columns=[
                     TableColumn(
                         name="foo",
                         type="integer",
                         constraints=TableColumnConstraints(unique=True),
                     ),
                     TableColumn(name="bar", type="string"),
                 ],
                 constraints=TableConstraints(other=["some constraint"]),
             ),
         ),
     ]

     all_entries = (text_like_entries + artifact_entries + numeric_entries +
                    reference_entries + tabular_entries)
     yield AssetMaterialization(
         asset_key="all_types",
         description="a materialization with all metadata types",
         metadata_entries=all_entries,
     )
     yield Output(None)
Exemplo n.º 4
0
def test_assets(schema_prefix):
    """Run an Airbyte assets job against registered HTTP responses.

    Builds assets for connection "12345" with destination tables "foo" and
    "bar" (each prefixed with ``schema_prefix`` when it is truthy), registers
    POST responses for the connection, sync and job endpoints, executes the
    job in process, and checks the resulting asset materializations and
    their metadata entries.
    """
    resource = airbyte_resource(
        build_init_resource_context(
            config={
                "host": "some_host",
                "port": "8000",
            }))

    table_names = ["foo", "bar"]
    if schema_prefix:
        table_names = [schema_prefix + name for name in table_names]

    assets = build_airbyte_assets(
        "12345",
        destination_tables=table_names,
        asset_key_prefix=["some", "prefix"],
    )

    # Only the first element of the returned assets is inspected.
    first_asset = assets[0]
    declared_keys = {
        AssetKey(["some", "prefix", name])
        for name in table_names
    }
    assert first_asset.asset_keys == declared_keys
    assert len(first_asset.op.output_defs) == 2

    # Register the three POST responses the sync is expected to request.
    responses.add(
        method=responses.POST,
        url=resource.api_base_url + "/connections/get",
        json=get_sample_connection_json(prefix=schema_prefix),
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=resource.api_base_url + "/connections/sync",
        json={"job": {"id": 1}},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=resource.api_base_url + "/jobs/get",
        json=get_sample_job_json(schema_prefix=schema_prefix),
        status=200,
    )

    configured_airbyte = airbyte_resource.configured({
        "host": "some_host",
        "port": "8000",
    })
    job = build_assets_job(
        "ab_job",
        assets,
        resource_defs={"airbyte": configured_airbyte},
    )
    result = job.execute_in_process()

    materializations = []
    for event in result.events_for_node("airbyte_sync_12345"):
        if event.event_type_value == "ASSET_MATERIALIZATION":
            materializations.append(
                event.event_specific_data.materialization)

    # Three materializations are expected even though only "foo" and "bar"
    # were declared as destination tables; "baz" presumably comes from the
    # sample job payload (verify against get_sample_job_json).
    assert len(materializations) == 3
    materialized_keys = {m.asset_key for m in materializations}
    expected_keys = {
        AssetKey(["some", "prefix", schema_prefix + table])
        for table in ("foo", "bar", "baz")
    }
    assert materialized_keys == expected_keys

    bytes_entry = MetadataEntry("bytesEmitted", value=1234)
    assert bytes_entry in materializations[0].metadata_entries

    records_entry = MetadataEntry("recordsCommitted", value=4321)
    assert records_entry in materializations[0].metadata_entries

    schema_entry = MetadataEntry(
        "schema",
        value=TableSchema(columns=[
            TableColumn(name="a", type="str"),
            TableColumn(name="b", type="int"),
        ]),
    )
    assert schema_entry in materializations[0].metadata_entries
Exemplo n.º 5
0
def _pandera_schema_to_table_schema(schema: pa.DataFrameSchema) -> TableSchema:
    """Convert a pandera ``DataFrameSchema`` into a ``TableSchema``.

    Args:
        schema: The pandera schema whose ``checks`` and ``columns`` are read.

    Returns:
        A ``TableSchema`` whose columns are the converted entries of
        ``schema.columns`` and whose constraints are derived from
        ``schema.checks``.
    """
    df_constraints = _pandera_schema_wide_checks_to_table_constraints(schema.checks)
    # Only the column objects are converted; the mapping keys are not needed,
    # so iterate the values directly instead of unpacking unused keys.
    columns = [_pandera_column_to_table_column(col) for col in schema.columns.values()]
    return TableSchema(columns=columns, constraints=df_constraints)
Exemplo n.º 6
0
    return type_check_fn


# Table schema with one column per field of a Pandera failure case:
# schema_context, column, check, check_number, failure_case and index.
PANDERA_FAILURE_CASES_SCHEMA = TableSchema(
    columns=[
        TableColumn(
            name="schema_context",
            type="string",
            description="`Column` for column-wise checks, or `DataFrameSchema`",
        ),
        TableColumn(
            name="column",
            type="string",
            description="Column of value that failed the check, or `None` for wide checks.",
        ),
        TableColumn(
            name="check",
            type="string",
            description="Description of the failed Pandera check.",
        ),
        # NOTE(review): no explicit `type` is passed here, so whatever default
        # TableColumn uses applies -- confirm that is intended for an index.
        TableColumn(
            name="check_number",
            description="Index of the failed check.",
        ),
        TableColumn(
            name="failure_case",
            type="number | string",
            description="Value that failed a check.",
        ),
        TableColumn(
            name="index",
            type="number | string",
            description="Index (row) of value that failed a check.",
        ),
    ]
)


def _pandera_errors_to_type_check(