Пример #1
0
def test_preservation_of_underlying_metadata():
    """
    Metadata keeps every key of the dict it was built from (including
    keys it does not know about) and does not share state with that
    dict or with other Metadata instances.
    """
    # Test if additional data is preserved
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{
            "name": "test",
            "type": "null"
        }],
        "primary_key": ["test"],
        "partitions": ["test"],
        "additional-attr": "test",
    }
    meta = Metadata.from_dict(test_dict)
    out_dict = meta.to_dict()
    for k, v in test_dict.items():
        assert out_dict[k] == v

    # make sure data is copied and not just a pointer
    assert test_dict is not meta._data

    # Rebinding "columns" on the source dict must not leak into the
    # Metadata object. Compare the two column lists: comparing the whole
    # dict against a list is always unequal and so could never fail.
    test_dict["columns"] = [{"name": "new_test", "type": "bool_"}]
    assert test_dict["columns"] != meta.columns

    # Assert Metadata instances are different
    m1 = Metadata()
    m2 = Metadata()

    assert m1.columns == m2.columns

    # Mutating one instance's columns must not show up on the other.
    m1.columns.append({"name": "new_test", "type": "bool_"})
    assert m1.columns != m2.columns
Пример #2
0
def test_to_dict():
    """to_dict() returns the constructor values plus the "$schema" key."""

    def make_fields():
        # Build fresh objects on every call so the expected value never
        # aliases the lists handed to the Metadata constructor.
        return {
            "name": "test",
            "description": "test",
            "file_format": "test",
            "sensitive": False,
            "columns": [{"name": "test", "type": "null"}],
            "primary_key": ["test"],
            "partitions": ["test"],
        }

    actual = Metadata(**make_fields()).to_dict()
    expected = {"$schema": _schema_url, **make_fields()}
    assert actual == expected
Пример #3
0
def test_basic_column_functions():
    """Columns can be listed, updated, appended and removed by name."""
    starting_columns = [
        {"name": col_name, "type": col_type}
        for col_name, col_type in (
            ("a", "int8"),
            ("b", "string"),
            ("c", "date32"),
        )
    ]
    md = Metadata(columns=starting_columns)
    assert md.column_names == ["a", "b", "c"]

    # Updating an existing name replaces that column's definition.
    md.update_column({"name": "a", "type": "int64"})
    first_column = md.columns[0]
    assert first_column["type"] == "int64"

    # Updating an unknown name adds the column at the end.
    md.update_column({"name": "d", "type": "string"})
    assert md.column_names == ["a", "b", "c", "d"]

    md.remove_column("d")
    assert md.column_names == ["a", "b", "c"]

    # An unsupported type is rejected by validation.
    with pytest.raises(ValidationError):
        md.update_column({"name": "d", "type": "error"})

    # Removing a column that is not present is an error.
    with pytest.raises(ValueError):
        md.remove_column("e")
Пример #4
0
def cast_pandas_column_to_schema(
    s: pd.Series,
    metacol: dict,
    pd_integer=True,
    pd_string=True,
    pd_boolean=True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    num_errors="raise",
    bool_map=None,
) -> pd.Series:
    """
    Cast a pandas Series to the type described by a column definition.

    Args:
        s: the series to convert.
        metacol: column definition. "name" is always read; "type" is read
            for timestamp columns; "type_category" is derived through
            Metadata when it is missing.
        pd_integer: forwarded to convert_to_integer_series.
        pd_string: forwarded to convert_to_string_series.
        pd_boolean: forwarded to convert_to_bool_series.
        pd_date_type: pd_type handed to the timestamp converter when the
            column "type" starts with "date".
        pd_timestamp_type: pd_type handed to the timestamp converter for
            all other timestamp columns.
        num_errors: forwarded to the integer and float converters.
        bool_map: forwarded to convert_to_bool_series.

    Returns:
        The converted series. Columns whose type_category is "struct" or
        "list" are returned unchanged after a warning.

    Raises:
        PandasCastError: if anything inside the conversion step raises,
            including an unrecognised type_category.
    """
    complex_type_categories = ["struct", "list"]

    # get type_category if not exist
    if "type_category" not in metacol:
        # Metadata columns are a list of column dicts, so wrap the single
        # column definition before handing it over.
        tmp_meta = Metadata(columns=[metacol])
        tmp_meta.set_col_type_category_from_types()
        metacol = tmp_meta.get_column(metacol["name"])

    # Conversions
    try:
        type_category = metacol["type_category"]

        if type_category == "integer":
            s = convert_to_integer_series(s, pd_integer, num_errors)

        elif type_category == "float":
            s = convert_to_float_series(s, num_errors)

        elif type_category == "boolean":
            s = convert_to_bool_series(s, pd_boolean, bool_map)

        elif type_category == "string":
            s = convert_to_string_series(s, pd_string)

        elif type_category == "timestamp":
            # Date and timestamp columns share a category; the type name
            # decides which target pandas type applies.
            is_date = metacol["type"].startswith("date")
            s = convert_str_to_timestamp_series(
                s,
                pd_type=pd_date_type if is_date else pd_timestamp_type,
                is_date=is_date,
                str_datetime_format=metacol.get("datetime_format"),
            )
        elif type_category in complex_type_categories:
            warnings.warn(
                f"complex types ({complex_type_categories}) are not cast "
                f"(column: {metacol['name']})")
        else:
            raise ValueError(
                f"meta type_category must be one of {_allowed_type_categories}. "
                f"Got {type_category} from column {metacol['name']}"
            )

    except Exception as e:
        starter_msg = (f"Failed conversion - name: {metacol['name']} | "
                       f"type_category: {metacol['type_category']} | "
                       f"type: {metacol.get('type')} - see traceback.")
        # Keep the original frames on the new error and record the
        # original exception as its explicit cause.
        raise PandasCastError(starter_msg).with_traceback(
            e.__traceback__) from e

    return s
Пример #5
0
def test_spec_matches_public_schema():
    """The local table schema equals the one served at the "$schema" URL."""
    failure_hint = (
        "You will need to update the public schema here: "
        "https://github.com/moj-analytical-services/metadata_schema/"
    )
    schema_url = Metadata()._data["$schema"]

    # Read the body while the connection is open, parse it afterwards.
    with urllib.request.urlopen(schema_url) as response:
        raw_body = response.read()
    published_schema = json.loads(raw_body.decode())

    assert published_schema == _table_schema, failure_hint
def expected_meta_out_upper():
    """Return a Metadata with five columns whose names are all upper case."""
    names = ("A", "B", "C", "D", "E")
    types = ("int8", "string", "date32", "date32", "date32")
    column_defs = [
        {"name": col_name, "type": col_type}
        for col_name, col_type in zip(names, types)
    ]
    return Metadata(columns=column_defs)
def expected_meta_out_lower():
    """Return a Metadata with five columns whose names are all lower case."""
    names = ("a", "b", "c", "d", "e")
    types = ("int8", "string", "date32", "date32", "date32")
    column_defs = [
        {"name": col_name, "type": col_type}
        for col_name, col_type in zip(names, types)
    ]
    return Metadata(columns=column_defs)
def meta_input():
    """Return a Metadata with five columns whose names mix upper and lower case."""
    names = ("A", "b", "C", "D", "e")
    types = ("int8", "string", "date32", "date32", "date32")
    column_defs = [
        {"name": col_name, "type": col_type}
        for col_name, col_type in zip(names, types)
    ]
    return Metadata(columns=column_defs)
Пример #9
0
def test_basic_attributes(attribute: str, default_value: Any, valid_value: Any,
                          invalid_value: Any):
    """
    An attribute starts at its default, accepts a valid value and
    rejects an invalid one with a ValidationError.
    """
    md = Metadata()

    # A fresh instance exposes the default.
    initial = getattr(md, attribute)
    assert initial == default_value

    # A valid value is stored and read back unchanged.
    setattr(md, attribute, valid_value)
    stored = getattr(md, attribute)
    assert stored == valid_value

    # An invalid value is refused on assignment.
    with pytest.raises(ValidationError):
        setattr(md, attribute, invalid_value)
Пример #10
0
def test_column_and_partition_functionality():
    """
    Partitions interact with column order as force_partition_order
    dictates, and inconsistent columns/partitions are rejected.
    """
    md = Metadata()
    assert md.columns == []

    md.columns = [
        {"name": col_name, "type": col_type}
        for col_name, col_type in (
            ("a", "int8"),
            ("b", "string"),
            ("c", "date32"),
        )
    ]
    assert md.column_names == ["a", "b", "c"]

    # Defaults: no partitions and no forced ordering.
    assert md.partitions == []
    assert md.force_partition_order is None

    # force_partition_order is None so no change to order
    md.partitions = ["b"]
    assert md.column_names == ["a", "b", "c"]

    # "start" puts the partition columns first, in partition order.
    md.force_partition_order = "start"
    md.partitions = ["c", "b"]
    assert md.column_names == ["c", "b", "a"]

    # "end" puts the partition columns last, in partition order.
    md.force_partition_order = "end"
    assert md.column_names == ["a", "c", "b"]

    # Removing a column also drops it from the partitions.
    md.remove_column("c")
    assert md.partitions == ["b"]

    # Each of the following assignments is expected to be refused.
    with pytest.raises(ValueError):
        md.force_partition_order = "error"

    with pytest.raises(ValueError):
        md.partitions = ["c", "d"]

    with pytest.raises(ValueError):
        md.columns = [{"name": "a", "type": "int8"}]
Пример #11
0
def test_set_col_type_category_from_types(col_input: Any, expected_cat: str):
    """The first column gets the expected type_category derived from its type."""
    md = Metadata(columns=col_input)
    md.set_col_type_category_from_types()

    first_column = md.columns[0]
    assert first_column["type_category"] == expected_cat
Пример #12
0
def test_spec_matches_public_schema():
    """The local table schema equals the one served at the "$schema" URL."""
    # Tell whoever hits this failure where the published copy lives.
    msg = ("You will need to update the public schema here: "
           "https://github.com/moj-analytical-services/metadata_schema/")
    m = Metadata()
    with urllib.request.urlopen(m._data["$schema"]) as url:
        public_schema = json.loads(url.read().decode())

    assert public_schema == _table_schema, msg
Пример #13
0
def test_unpack_complex_data_type(data_type, expected):
    """The module-level helper and the Metadata method unpack a type identically."""
    md = Metadata()
    # Same check against both entry points, helper first.
    for unpack in (_unpack_complex_data_type, md.unpack_complex_data_type):
        assert unpack(data_type) == expected
Пример #14
0
def test_columns_default():
    """A Metadata built without arguments has an empty column list."""
    assert Metadata().columns == []
Пример #15
0
def test_columns_pass(col_input: Any):
    """Building a Metadata from the given column input must not raise."""
    _ = Metadata(columns=col_input)
Пример #16
0
def test_columns_validation_error(col_input: Any):
    """Assigning the given column input raises a ValidationError."""
    md = Metadata()
    with pytest.raises(ValidationError):
        md.columns = col_input