def test_profile_boolean_schema(empty_data_context, boolean_types_schema):
    """Profile ``boolean_types_schema`` and verify the suite built for ``active``.

    The result must be an ``ExpectationSuite`` named ``"bools"`` whose
    expectations are, in order: the column exists, its values are in the
    boolean type list, and its values are in the set ``[True, False]``.
    The suite is then saved through ``empty_data_context``; nothing is asserted
    after the save, so it only has to complete without raising.
    """
    suite_name = "bools"
    suite = JsonSchemaProfiler().profile(boolean_types_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [expectation.to_json_dict() for expectation in suite.expectations]
    expected = [
        {
            "expectation_type": "expect_column_to_exist",
            "kwargs": {"column": "active"},
            "meta": {},
        },
        {
            "expectation_type": "expect_column_values_to_be_in_type_list",
            "kwargs": {
                "column": "active",
                "type_list": list(ProfilerTypeMapping.BOOLEAN_TYPE_NAMES),
            },
            "meta": {},
        },
        {
            "expectation_type": "expect_column_values_to_be_in_set",
            "kwargs": {"column": "active", "value_set": [True, False]},
            "meta": {},
        },
    ]
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_profile_raises_error_on_schema_missing_top_level_type_key():
    """A schema dict without a top-level ``type`` key must raise ``KeyError``.

    The error text must mention that a top-level ``type`` key is required.
    """
    schema_without_type = {"a_schema": "missing_type"}

    with pytest.raises(KeyError) as excinfo:
        JsonSchemaProfiler().profile(schema_without_type, "suite")

    # The message is inspected after the ``with`` block has caught the error.
    expected_fragment = (
        "This profiler requires a json schema with a top level `type` key"
    )
    assert expected_fragment in str(excinfo.value)
def test_profile_raises_error_on_schema_with_top_level_type_other_than_object():
    """A schema whose top-level ``type`` is ``"array"`` must raise ``TypeError``.

    The error text must mention that the top-level ``type`` has to be ``object``.
    """
    array_schema = {"type": "array"}

    with pytest.raises(TypeError) as excinfo:
        JsonSchemaProfiler().profile(array_schema, "suite")

    expected_fragment = (
        "This profiler requires a json schema with a top level `type` of `object`"
    )
    assert expected_fragment in str(excinfo.value)
def test_profile_simple_schema(empty_data_context, simple_schema):
    """Profile ``simple_schema`` and verify the ``first_name`` / ``age`` suite.

    For each of the two columns the suite must contain, in order: a
    column-exists expectation, a type-list expectation (string types for
    ``first_name``, int types for ``age``) and a not-null expectation.
    The suite is then saved through ``empty_data_context``.
    """
    suite_name = "simple_suite"
    suite = JsonSchemaProfiler().profile(simple_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [expectation.to_json_dict() for expectation in suite.expectations]

    # Both columns follow the same exists / type-list / not-null pattern.
    expected = []
    column_types = (
        ("first_name", ProfilerTypeMapping.STRING_TYPE_NAMES),
        ("age", ProfilerTypeMapping.INT_TYPE_NAMES),
    )
    for column, type_names in column_types:
        expected.append(
            {
                "expectation_type": "expect_column_to_exist",
                "kwargs": {"column": column},
                "meta": {},
            }
        )
        expected.append(
            {
                "expectation_type": "expect_column_values_to_be_in_type_list",
                "kwargs": {"column": column, "type_list": list(type_names)},
                "meta": {},
            }
        )
        expected.append(
            {
                "expectation_type": "expect_column_values_to_not_be_null",
                "kwargs": {"column": column},
                "meta": {},
            }
        )
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_profile_enum_schema(empty_data_context, enum_types_schema):
    """Profile ``enum_types_schema`` and verify the ``shirt-size`` expectations.

    Expects exactly two expectations: the column exists and its values are in
    the set ``["XS", "S", "M", "XL", "XXL"]``.

    NOTE(review): a later function in this module is also named
    ``test_profile_enum_schema``. That later definition rebinds the name, so
    this one is shadowed and is not collected by pytest. Its expected list
    also differs from the later one for the same fixture -- confirm whether
    this stale copy should be removed.
    """
    suite_name = "enums"
    suite = JsonSchemaProfiler().profile(enum_types_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [expectation.to_json_dict() for expectation in suite.expectations]
    expected = [
        {
            "expectation_type": "expect_column_to_exist",
            "kwargs": {"column": "shirt-size"},
            "meta": {},
        },
        {
            "expectation_type": "expect_column_values_to_be_in_set",
            "kwargs": {
                "column": "shirt-size",
                "value_set": ["XS", "S", "M", "XL", "XXL"],
            },
            "meta": {},
        },
    ]
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_has_profile_create_expectations_from_complex_schema(
    empty_data_context, complex_flat_schema
):
    """Profile ``complex_flat_schema`` and verify suite notes and expectations.

    The suite-level ``meta["notes"]`` must carry the markdown description
    ``"An address"``. Each of the seven address columns must yield, in order,
    a column-exists, a type-list and a not-null expectation. Only the
    ``street-number`` column-exists expectation carries markdown notes in its
    ``meta``; every other expectation has an empty ``meta``.
    The suite is then saved through ``empty_data_context``.
    """

    def markdown_notes(text):
        # Shape of the notes block used for both suite and expectation meta.
        return {"format": "markdown", "content": [text]}

    suite_name = "complex"
    suite = JsonSchemaProfiler().profile(complex_flat_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name
    assert suite.meta["notes"] == markdown_notes("### Description:\nAn address")

    actual = [expectation.to_json_dict() for expectation in suite.expectations]

    string_types = ProfilerTypeMapping.STRING_TYPE_NAMES
    int_types = ProfilerTypeMapping.INT_TYPE_NAMES
    # (column, type names, description attached to the exists expectation)
    column_specs = (
        ("post-office-box", string_types, None),
        ("street-name", string_types, None),
        ("street-number", int_types, "### Description:\nOnly the address number."),
        ("locality", string_types, None),
        ("region", string_types, None),
        ("postal-code", string_types, None),
        ("country-name", string_types, None),
    )

    expected = []
    for column, type_names, description in column_specs:
        if description is None:
            exists_meta = {}
        else:
            exists_meta = {"notes": markdown_notes(description)}
        expected.append(
            {
                "expectation_type": "expect_column_to_exist",
                "kwargs": {"column": column},
                "meta": exists_meta,
            }
        )
        expected.append(
            {
                "expectation_type": "expect_column_values_to_be_in_type_list",
                "kwargs": {"column": column, "type_list": list(type_names)},
                "meta": {},
            }
        )
        expected.append(
            {
                "expectation_type": "expect_column_values_to_not_be_null",
                "kwargs": {"column": column},
                "meta": {},
            }
        )
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_profile_number_ranges_schema(empty_data_context, number_ranges_schema):
    """Profile ``number_ranges_schema`` and verify the numeric-range expectations.

    Every column must yield a column-exists and a float type-list expectation.
    Columns with bounds additionally yield ``expect_column_values_to_be_between``
    with the listed ``min_value`` / ``max_value`` / ``strict_*`` kwargs, and all
    columns except ``optional-min-half`` end with a not-null expectation.
    The suite is then saved through ``empty_data_context``.
    """
    suite_name = "number_ranges"
    suite = JsonSchemaProfiler().profile(number_ranges_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [expectation.to_json_dict() for expectation in suite.expectations]

    # (column, kwargs for the "between" expectation or None, expect not-null?)
    column_specs = (
        ("favorite-number", None, True),
        ("age-0-130", {"min_value": 0.5, "max_value": 130.5}, True),
        ("wheel-count-0-plus", {"min_value": 0.5}, True),
        ("rpm-max-7000", {"max_value": 7000.5}, True),
        ("lake-depth-max-minus-100", {"max_value": -100.5}, True),
        ("floor-exclusive-min-0", {"min_value": 0.5, "strict_min": True}, True),
        ("floor-exclusive-max-100", {"max_value": 100.5, "strict_max": True}, True),
        (
            "gear-exclusive-0-6",
            {
                "min_value": 0.5,
                "strict_min": True,
                "max_value": 6.5,
                "strict_max": True,
            },
            True,
        ),
        ("optional-min-half", {"min_value": 0.5}, False),
    )

    expected = []
    for column, bounds, expect_not_null in column_specs:
        expected.append(
            {
                "expectation_type": "expect_column_to_exist",
                "kwargs": {"column": column},
                "meta": {},
            }
        )
        expected.append(
            {
                "expectation_type": "expect_column_values_to_be_in_type_list",
                "kwargs": {
                    "column": column,
                    "type_list": list(ProfilerTypeMapping.FLOAT_TYPE_NAMES),
                },
                "meta": {},
            }
        )
        if bounds is not None:
            expected.append(
                {
                    "expectation_type": "expect_column_values_to_be_between",
                    "kwargs": dict(bounds, column=column),
                    "meta": {},
                }
            )
        if expect_not_null:
            expected.append(
                {
                    "expectation_type": "expect_column_values_to_not_be_null",
                    "kwargs": {"column": column},
                    "meta": {},
                }
            )
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_profile_string_lengths_schema(empty_data_context, string_lengths_schema):
    """Profile ``string_lengths_schema`` and verify the string-length expectations.

    Every column must yield a column-exists and a string type-list expectation.
    Columns with a length constraint additionally yield either
    ``expect_column_value_lengths_to_equal`` or
    ``expect_column_value_lengths_to_be_between``, and all columns except
    ``optional-min-1`` end with a not-null expectation.
    The suite is then saved through ``empty_data_context``.
    """
    suite_name = "lengths"
    suite = JsonSchemaProfiler().profile(string_lengths_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [expectation.to_json_dict() for expectation in suite.expectations]

    lengths_equal = "expect_column_value_lengths_to_equal"
    lengths_between = "expect_column_value_lengths_to_be_between"
    # (column, length expectation type or None, its extra kwargs, expect not-null?)
    column_specs = (
        ("comments-no-constraints", None, None, True),
        ("state-abbreviation-equal-min-max", lengths_equal, {"value": 2}, True),
        ("ICD10-code-3-7", lengths_between, {"min_value": 3, "max_value": 7}, True),
        ("name-no-max", lengths_between, {"min_value": 1}, True),
        ("password-max-33", lengths_between, {"max_value": 33}, True),
        ("optional-min-1", lengths_between, {"min_value": 1}, False),
    )

    expected = []
    for column, length_type, length_kwargs, expect_not_null in column_specs:
        expected.append(
            {
                "expectation_type": "expect_column_to_exist",
                "kwargs": {"column": column},
                "meta": {},
            }
        )
        expected.append(
            {
                "expectation_type": "expect_column_values_to_be_in_type_list",
                "kwargs": {
                    "column": column,
                    "type_list": list(ProfilerTypeMapping.STRING_TYPE_NAMES),
                },
                "meta": {},
            }
        )
        if length_type is not None:
            expected.append(
                {
                    "expectation_type": length_type,
                    "kwargs": dict(length_kwargs, column=column),
                    "meta": {},
                }
            )
        if expect_not_null:
            expected.append(
                {
                    "expectation_type": "expect_column_values_to_not_be_null",
                    "kwargs": {"column": column},
                    "meta": {},
                }
            )
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_profile_enum_schema(empty_data_context, enum_types_schema):
    """Profile ``enum_types_schema`` and verify the enum expectations per column.

    Expected, in order: ``shirt-size`` (exists, in-set, not-null),
    ``optional-color`` (exists, in-set), ``optional-hat`` (exists, string
    type-list, in-set) and ``optional-answer`` (exists, in-set).
    The suite is then saved through ``empty_data_context``.

    NOTE(review): an earlier function in this module has the same name; this
    later definition is the one bound to the name.
    """

    def expectation(expectation_type, column, **extra_kwargs):
        # Every expected entry has an empty ``meta`` and at least a ``column``.
        kwargs = {"column": column}
        kwargs.update(extra_kwargs)
        return {"expectation_type": expectation_type, "kwargs": kwargs, "meta": {}}

    def exists(column):
        return expectation("expect_column_to_exist", column)

    def in_set(column, values):
        return expectation(
            "expect_column_values_to_be_in_set", column, value_set=values
        )

    suite_name = "enums"
    suite = JsonSchemaProfiler().profile(enum_types_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [item.to_json_dict() for item in suite.expectations]
    expected = [
        exists("shirt-size"),
        in_set("shirt-size", ["XS", "S", "M", "XL", "XXL"]),
        expectation("expect_column_values_to_not_be_null", "shirt-size"),
        exists("optional-color"),
        in_set("optional-color", ["red", "green", "blue"]),
        exists("optional-hat"),
        expectation(
            "expect_column_values_to_be_in_type_list",
            "optional-hat",
            type_list=list(ProfilerTypeMapping.STRING_TYPE_NAMES),
        ),
        in_set("optional-hat", ["red", "green", "blue"]),
        exists("optional-answer"),
        in_set("optional-answer", ["yes", "no"]),
    ]
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)
def test_profile_enum_with_bad_input_raises_schema_error(enum_types_schema):
    """Profiling a schema whose ``enum`` is a string must raise ``SchemaError``."""
    # Mangle the enum list: replace it with a plain string.
    shirt_size = enum_types_schema["properties"]["shirt-size"]
    shirt_size["enum"] = "foo"

    with pytest.raises(jsonschema.SchemaError):
        JsonSchemaProfiler().profile(enum_types_schema, "enums")
def test_profile_raises_error_on_missing_suite_name(simple_schema):
    """Passing ``suite_name=None`` must raise ``ValueError`` asking for a name."""
    with pytest.raises(ValueError) as excinfo:
        JsonSchemaProfiler().profile(simple_schema, suite_name=None)

    assert "provide a suite name" in str(excinfo.value)
def test_profile_raises_errors_on_bad_inputs():
    """Each of these schema inputs must make ``profile`` raise ``TypeError``."""
    profiler = JsonSchemaProfiler()
    bad_schemas = (1, 1.1, None, "junk")

    # Checked in sequence with one shared profiler; the first input that does
    # not raise fails the test.
    for bad_schema in bad_schemas:
        with pytest.raises(TypeError):
            profiler.profile(bad_schema, "foo")
def test_validate_returns_true_on_valid_schema(simple_schema):
    """``validate`` must return exactly ``True`` for ``simple_schema``."""
    outcome = JsonSchemaProfiler().validate(simple_schema)

    # Identity check: a merely truthy return value is not enough.
    assert outcome is True
def test_instantiable():
    """``JsonSchemaProfiler`` can be constructed with no arguments."""
    assert isinstance(JsonSchemaProfiler(), JsonSchemaProfiler)
def test_null_fields_schema(empty_data_context, null_fields_schema):
    """Profile ``null_fields_schema`` and verify the expectations per column.

    Expected, in order: ``null`` (exists, values-to-be-null),
    ``string-or-null`` / ``int-or-null`` / ``number-or-null`` (exists plus the
    string / int / float type-list respectively) and ``enum-or-null`` (exists,
    in-set ``["a", "b", "c"]``). No not-null expectation is expected for any
    column. The suite is then saved through ``empty_data_context``.
    """

    def expectation(expectation_type, column, **extra_kwargs):
        # Every expected entry has an empty ``meta`` and at least a ``column``.
        kwargs = {"column": column}
        kwargs.update(extra_kwargs)
        return {"expectation_type": expectation_type, "kwargs": kwargs, "meta": {}}

    suite_name = "null_fields"
    suite = JsonSchemaProfiler().profile(null_fields_schema, suite_name)

    assert isinstance(suite, ExpectationSuite)
    assert suite.expectation_suite_name == suite_name

    actual = [item.to_json_dict() for item in suite.expectations]

    expected = [
        expectation("expect_column_to_exist", "null"),
        expectation("expect_column_values_to_be_null", "null"),
    ]
    typed_columns = (
        ("string-or-null", ProfilerTypeMapping.STRING_TYPE_NAMES),
        ("int-or-null", ProfilerTypeMapping.INT_TYPE_NAMES),
        ("number-or-null", ProfilerTypeMapping.FLOAT_TYPE_NAMES),
    )
    for column, type_names in typed_columns:
        expected.append(expectation("expect_column_to_exist", column))
        expected.append(
            expectation(
                "expect_column_values_to_be_in_type_list",
                column,
                type_list=list(type_names),
            )
        )
    expected.append(expectation("expect_column_to_exist", "enum-or-null"))
    expected.append(
        expectation(
            "expect_column_values_to_be_in_set",
            "enum-or-null",
            value_set=["a", "b", "c"],
        )
    )
    assert actual == expected

    empty_data_context.save_expectation_suite(suite)