def test_preservation_of_underlying_metadata():
    """Extra keys survive a dict round trip and instances do not share state."""
    # Test if additional data is preserved
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{"name": "test", "type": "null"}],
        "primary_key": ["test"],
        "partitions": ["test"],
        "additional-attr": "test",
    }
    meta = Metadata.from_dict(test_dict)
    out_dict = meta.to_dict()
    for k, v in test_dict.items():
        assert out_dict[k] == v

    # make sure data is copied and not just a pointer
    assert test_dict is not meta._data
    test_dict["columns"] = [{"name": "new_test", "type": "bool_"}]
    # Compare the column lists themselves: comparing the whole dict with a
    # list is trivially unequal and would never catch shared state.
    assert test_dict["columns"] != meta.columns

    # Assert Metadata instances are different
    m1 = Metadata()
    m2 = Metadata()
    assert m1.columns == m2.columns
    m1.columns.append({"name": "new_test", "type": "bool_"})
    assert m1.columns != m2.columns
def test_to_dict():
    """``to_dict`` returns the constructor arguments plus the ``$schema`` URL."""
    expected = {
        "$schema": _schema_url,
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{"name": "test", "type": "null"}],
        "primary_key": ["test"],
        "partitions": ["test"],
    }

    meta = Metadata(
        name="test",
        description="test",
        file_format="test",
        sensitive=False,
        columns=[{"name": "test", "type": "null"}],
        primary_key=["test"],
        partitions=["test"],
    )

    assert meta.to_dict() == expected
def test_basic_column_functions():
    """Exercise ``column_names``, ``update_column`` and ``remove_column``."""
    initial_columns = [
        {"name": "a", "type": "int8"},
        {"name": "b", "type": "string"},
        {"name": "c", "type": "date32"},
    ]
    metadata = Metadata(columns=initial_columns)
    assert metadata.column_names == ["a", "b", "c"]

    # Updating an existing name changes that column's type.
    metadata.update_column({"name": "a", "type": "int64"})
    first_column = metadata.columns[0]
    assert first_column["type"] == "int64"

    # Updating an unknown name adds the column at the end.
    metadata.update_column({"name": "d", "type": "string"})
    assert metadata.column_names == ["a", "b", "c", "d"]

    metadata.remove_column("d")
    assert metadata.column_names == ["a", "b", "c"]

    # An invalid type is rejected by validation.
    with pytest.raises(ValidationError):
        metadata.update_column({"name": "d", "type": "error"})

    # Removing a column that does not exist is an error.
    with pytest.raises(ValueError):
        metadata.remove_column("e")
def cast_pandas_column_to_schema(
    s: pd.Series,
    metacol: dict,
    pd_integer=True,
    pd_string=True,
    pd_boolean=True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    num_errors="raise",
    bool_map=None,
) -> pd.Series:
    """Cast a pandas Series according to a single column's metadata.

    The conversion is chosen from ``metacol["type_category"]``. When that key
    is missing it is derived from the column's ``type`` via a temporary
    ``Metadata`` object.

    Args:
        s: The series to convert.
        metacol: Column metadata dict. ``name`` is always read; ``type`` is
            read for timestamp columns and for error messages;
            ``datetime_format`` is optional and forwarded to the timestamp
            converter.
        pd_integer: Forwarded to ``convert_to_integer_series``.
        pd_string: Forwarded to ``convert_to_string_series``.
        pd_boolean: Forwarded to ``convert_to_bool_series``.
        pd_date_type: ``pd_type`` passed to the timestamp converter when the
            column ``type`` starts with ``"date"``.
        pd_timestamp_type: ``pd_type`` passed to the timestamp converter for
            all other timestamp columns.
        num_errors: Forwarded to the integer and float converters.
        bool_map: Forwarded to ``convert_to_bool_series``.

    Returns:
        The converted series. Columns in the ``struct`` / ``list`` categories
        are returned unchanged after emitting a warning.

    Raises:
        PandasCastError: If the conversion step raises for any reason,
            including an unrecognised ``type_category``. The original
            exception is attached as the cause.
    """
    complex_type_categories = ["struct", "list"]

    # get type_category if not exist
    if "type_category" not in metacol:
        # Metadata columns are a list of column dicts, so the single column
        # has to be wrapped before it is handed over.
        tmp_meta = Metadata(columns=[metacol])
        tmp_meta.set_col_type_category_from_types()
        metacol = tmp_meta.get_column(metacol["name"])

    # Conversions
    try:
        category = metacol["type_category"]
        if category == "integer":
            s = convert_to_integer_series(s, pd_integer, num_errors)
        elif category == "float":
            s = convert_to_float_series(s, num_errors)
        elif category == "boolean":
            s = convert_to_bool_series(s, pd_boolean, bool_map)
        elif category == "string":
            s = convert_to_string_series(s, pd_string)
        elif category == "timestamp":
            # date* types share the timestamp category but use their own
            # target pandas type.
            is_date = metacol["type"].startswith("date")
            s = convert_str_to_timestamp_series(
                s,
                pd_type=pd_date_type if is_date else pd_timestamp_type,
                is_date=is_date,
                str_datetime_format=metacol.get("datetime_format"),
            )
        elif category in complex_type_categories:
            warnings.warn(
                f"complex types ({complex_type_categories}) are not cast "
                f"(column: {metacol['name']})")
        else:
            raise ValueError(
                f"meta type_category must be one of {_allowed_type_categories}."
                f"Got {metacol['type_category']} from column {metacol['name']}"
            )
    except Exception as e:
        starter_msg = (f"Failed conversion - name: {metacol['name']} | "
                       f"type_category: {metacol['type_category']} | "
                       f"type: {metacol.get('type')} - see traceback.")
        # Keep the original frames on the new exception and record the
        # original error as its explicit cause.
        raise PandasCastError(starter_msg).with_traceback(
            e.__traceback__) from e

    return s
def test_spec_matches_public_schema():
    """The schema published at the ``$schema`` URL equals ``_table_schema``.

    Fetches the URL over the network, so the test needs connectivity.
    """
    schema_url = Metadata()._data["$schema"]
    with urllib.request.urlopen(schema_url) as response:
        payload = response.read().decode()
    public_schema = json.loads(payload)

    failure_hint = (
        "You will need to update the public schema here: "
        "https://github.com/moj-analytical-services/metadata_schema/"
    )
    assert public_schema == _table_schema, failure_hint
def expected_meta_out_upper():
    """Build a ``Metadata`` whose five column names are all upper case."""
    name_and_type = (
        ("A", "int8"),
        ("B", "string"),
        ("C", "date32"),
        ("D", "date32"),
        ("E", "date32"),
    )
    columns = [{"name": name, "type": dtype} for name, dtype in name_and_type]
    return Metadata(columns=columns)
def expected_meta_out_lower():
    """Build a ``Metadata`` whose five column names are all lower case."""
    name_and_type = (
        ("a", "int8"),
        ("b", "string"),
        ("c", "date32"),
        ("d", "date32"),
        ("e", "date32"),
    )
    columns = [{"name": name, "type": dtype} for name, dtype in name_and_type]
    return Metadata(columns=columns)
def meta_input():
    """Build a ``Metadata`` whose column names mix upper and lower case."""
    name_and_type = (
        ("A", "int8"),
        ("b", "string"),
        ("C", "date32"),
        ("D", "date32"),
        ("e", "date32"),
    )
    columns = [{"name": name, "type": dtype} for name, dtype in name_and_type]
    return Metadata(columns=columns)
def test_basic_attributes(
    attribute: str,
    default_value: Any,
    valid_value: Any,
    invalid_value: Any,
):
    """
    Check one attribute's default, a valid assignment and a rejected one.
    """
    meta = Metadata()

    # A fresh instance exposes the default.
    initial = getattr(meta, attribute)
    assert initial == default_value

    # A valid value is stored and read back unchanged.
    setattr(meta, attribute, valid_value)
    stored = getattr(meta, attribute)
    assert stored == valid_value

    # An invalid value is refused at assignment time.
    with pytest.raises(ValidationError):
        setattr(meta, attribute, invalid_value)
def test_column_and_partition_functionality():
    """Walk through how columns, partitions and ``force_partition_order`` interact."""
    metadata = Metadata()
    assert metadata.columns == []

    column_defs = [
        {"name": "a", "type": "int8"},
        {"name": "b", "type": "string"},
        {"name": "c", "type": "date32"},
    ]
    metadata.columns = column_defs
    assert metadata.column_names == ["a", "b", "c"]
    assert metadata.partitions == []
    assert metadata.force_partition_order is None

    # force_partition_order is None so no change to order
    metadata.partitions = ["b"]
    assert metadata.column_names == ["a", "b", "c"]

    # With "start", partition columns lead, in the order given.
    metadata.force_partition_order = "start"
    metadata.partitions = ["c", "b"]
    assert metadata.column_names == ["c", "b", "a"]

    # Switching to "end" moves the partition columns to the back.
    metadata.force_partition_order = "end"
    assert metadata.column_names == ["a", "c", "b"]

    # Removing a column also drops it from the partitions.
    metadata.remove_column("c")
    assert metadata.partitions == ["b"]

    # Unknown ordering option.
    with pytest.raises(ValueError):
        metadata.force_partition_order = "error"

    # Partitions that are not current columns.
    with pytest.raises(ValueError):
        metadata.partitions = ["c", "d"]

    # NOTE(review): presumably rejected because partition "b" would no
    # longer be a column - confirm against the columns setter.
    with pytest.raises(ValueError):
        metadata.columns = [{"name": "a", "type": "int8"}]
def test_set_col_type_category_from_types(col_input: Any, expected_cat: str):
    """The first column's ``type_category`` is filled in from its type."""
    metadata = Metadata(columns=col_input)
    metadata.set_col_type_category_from_types()

    first_column = metadata.columns[0]
    assert first_column["type_category"] == expected_cat
def test_spec_matches_public_schema():
    """The schema served at the ``$schema`` URL equals ``_table_schema``.

    Fetches the URL over the network, so the test needs connectivity.
    """
    metadata = Metadata()
    schema_url = metadata._data["$schema"]

    with urllib.request.urlopen(schema_url) as response:
        raw_body = response.read()

    assert json.loads(raw_body.decode()) == _table_schema
def test_unpack_complex_data_type(data_type, expected):
    """The module-level helper and the ``Metadata`` method give the same result."""
    metadata = Metadata()

    via_function = _unpack_complex_data_type(data_type)
    assert via_function == expected

    via_method = metadata.unpack_complex_data_type(data_type)
    assert via_method == expected
def test_columns_default():
    """A freshly constructed ``Metadata`` has an empty column list."""
    assert Metadata().columns == []
def test_columns_pass(col_input: Any):
    """Constructing ``Metadata`` with the given columns must not raise."""
    # No assertion needed: the test passes if construction succeeds.
    Metadata(columns=col_input)
def test_columns_validation_error(col_input: Any):
    """Assigning the given columns raises ``ValidationError``."""
    meta = Metadata()

    with pytest.raises(ValidationError):
        meta.columns = col_input