def test_set_logical_types(sample_column_names, sample_inferred_logical_types):
    """Check that ``set_types`` changes logical types and resets tags on those columns.

    Columns whose logical type is passed to ``set_types`` are expected to end
    up with only their standard tags, while an untouched column keeps both its
    logical type and its custom tag.
    """
    initial_tags = {
        "full_name": "tag1",
        "email": ["tag2"],
        "phone_number": ["tag3", "tag2"],
        "signup_date": {"secondary_time_index"},
    }
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags=initial_tags,
        use_standard_tags=True,
    )

    new_types = {
        "full_name": Categorical,
        "email": EmailAddress,
        "phone_number": PhoneNumber,
        "age": Double,
    }
    schema.set_types(logical_types=new_types)

    # Each column passed to set_types reports the requested logical type.
    assert schema.logical_types["full_name"] == Categorical
    assert schema.logical_types["email"] == EmailAddress
    assert schema.logical_types["phone_number"] == PhoneNumber
    assert schema.logical_types["age"] == Double

    # Custom tags on the changed columns are gone; only standard tags remain.
    expected_tags = {
        "full_name": {"category"},
        "email": set(),
        "phone_number": set(),
        "age": {"numeric"},
    }
    for column, tags in expected_tags.items():
        assert schema.semantic_tags[column] == tags

    # The signup date column was not part of the update and is unchanged.
    assert schema.logical_types["signup_date"] == Datetime
    assert schema.semantic_tags["signup_date"] == {"secondary_time_index"}
def test_schema_with_numeric_time_index(sample_column_names, sample_inferred_logical_types):
    """Check that a column typed as Integer or Double can be the time index at init."""
    # First pass uses Integer, second pass uses Double for the time index column.
    for numeric_type in (Integer, Double):
        merged_types = {**sample_inferred_logical_types, "signup_date": numeric_type}
        schema = TableSchema(
            sample_column_names,
            logical_types=merged_types,
            time_index="signup_date",
            use_standard_tags=True,
        )
        date_col = schema.columns["signup_date"]

        assert schema.time_index == "signup_date"
        assert date_col.logical_type == numeric_type
        assert date_col.semantic_tags == {"time_index", "numeric"}
def test_schema_rename(sample_column_names, sample_inferred_logical_types):
    """Check that ``rename`` returns a renamed schema and leaves the original intact.

    Also verifies that the table metadata and the ``id`` column description are
    present on the renamed schema, that the renamed column keeps its logical
    type and semantic tags, and that swapping two column names twice produces
    a schema equal to the starting one.
    """
    table_metadata = {'table_info': 'this is text'}
    id_description = 'the id of the row'
    schema = TableSchema(sample_column_names,
                         sample_inferred_logical_types,
                         index='id',
                         time_index='signup_date',
                         table_metadata=table_metadata,
                         column_descriptions={'id': id_description})
    # Snapshot of the schema taken before renaming, used to detect mutation.
    original_schema = schema._get_subset_schema(list(schema.columns.keys()))

    renamed_schema = schema.rename({'age': 'birthday'})

    # Confirm original schema hasn't changed
    assert schema == original_schema
    assert schema.columns['id'].description == id_description

    assert 'age' not in renamed_schema.columns
    assert 'birthday' in renamed_schema.columns

    # Confirm that metadata and descriptions are present on the renamed schema
    # (previously only the original schema's description was checked here).
    assert renamed_schema.metadata == table_metadata
    assert renamed_schema.columns['id'].description == id_description

    old_col = schema.columns['age']
    new_col = renamed_schema.columns['birthday']
    assert old_col.logical_type == new_col.logical_type
    assert old_col.semantic_tags == new_col.semantic_tags

    # Swapping two names and swapping them back must round-trip.
    swap_mapping = {'age': 'full_name', 'full_name': 'age'}
    swapped_schema = schema.rename(dict(swap_mapping))
    swapped_back_schema = swapped_schema.rename(dict(swap_mapping))
    assert swapped_back_schema == schema
def test_validation_methods_called(
    mock_validate_params,
    mock_check_index,
    mock_check_time_index,
    mock_validate_not_setting_index,
    sample_column_names,
    sample_inferred_logical_types,
):
    """Check that the validation mocks are called only when ``validate=True``.

    The two schemas, built with and without validation, are also expected to
    compare equal.
    """
    validation_mocks = (
        mock_validate_params,
        mock_check_index,
        mock_check_time_index,
        mock_validate_not_setting_index,
    )

    # Nothing has run yet, so no mock may have been called.
    for mock in validation_mocks:
        assert not mock.called

    not_validated_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        index="id",
        time_index="signup_date",
        validate=False,
    )

    # Building with validate=False must leave every mock untouched.
    for mock in validation_mocks:
        assert not mock.called

    validated_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        index="id",
        time_index="signup_date",
        validate=True,
    )

    # Building with validate=True must call every mock.
    for mock in validation_mocks:
        assert mock.called

    assert validated_schema == not_validated_schema
def test_set_index_errors(sample_column_names, sample_inferred_logical_types):
    """Check that ``set_index`` raises ``LookupError`` for an unknown column name."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    # The message contains backticks and a period, so escape it for regex matching.
    expected_pattern = re.escape("Specified index column `testing` not found in TableSchema.")
    with pytest.raises(LookupError, match=expected_pattern):
        schema.set_index("testing")
def test_semantic_tag_errors(sample_column_names, sample_inferred_logical_types):
    """Check that invalid ``semantic_tags`` values raise ``TypeError`` at init."""
    # Each entry pairs an invalid semantic_tags argument with the expected message.
    invalid_cases = [
        ({"id": int}, "semantic_tags for id must be a string, set or list"),
        (
            {"id": {"index": {}, "time_index": {}}},
            "semantic_tags for id must be a string, set or list",
        ),
        ({"id": ["index", 1]}, "semantic_tags for id must contain only strings"),
    ]

    for bad_tags, error_message in invalid_cases:
        with pytest.raises(TypeError, match=error_message):
            TableSchema(
                sample_column_names,
                sample_inferred_logical_types,
                semantic_tags=bad_tags,
            )
def test_schema_adds_standard_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Check that standard tags appear only when ``use_standard_tags`` is True."""
    # With standard tags enabled, the columns pick up their standard tags.
    with_tags = TableSchema(
        sample_column_names,
        logical_types={**sample_inferred_logical_types, "id": Categorical},
        use_standard_tags=True,
        name="schema",
    )
    id_tags = with_tags.semantic_tags["id"]
    assert id_tags == {"category"}
    age_tags = with_tags.semantic_tags["age"]
    assert age_tags == {"numeric"}

    # With standard tags disabled, the same columns carry no tags.
    without_tags = TableSchema(
        sample_column_names,
        logical_types={**sample_inferred_logical_types, "id": Categorical},
        name="schema",
        use_standard_tags=False,
    )
    id_tags = without_tags.semantic_tags["id"]
    assert id_tags == set()
    age_tags = without_tags.semantic_tags["age"]
    assert age_tags == set()
def test_schema_with_numeric_time_index(sample_column_names, sample_inferred_logical_types):
    """Check that Integer and Double columns are accepted as the time index at init."""

    def _schema_with_signup_type(ltype):
        # Override only the signup_date logical type and make it the time index.
        return TableSchema(
            sample_column_names,
            logical_types={**sample_inferred_logical_types, "signup_date": ltype},
            time_index="signup_date",
            use_standard_tags=True,
        )

    # Integer-typed time index
    int_schema = _schema_with_signup_type(Integer)
    int_col = int_schema.columns["signup_date"]
    assert int_schema.time_index == "signup_date"
    assert isinstance(int_col.logical_type, Integer)
    assert int_col.semantic_tags == {"time_index", "numeric"}

    # Double-typed time index
    double_schema = _schema_with_signup_type(Double)
    double_col = double_schema.columns["signup_date"]
    assert double_schema.time_index == "signup_date"
    assert isinstance(double_col.logical_type, Double)
    assert double_col.semantic_tags == {"time_index", "numeric"}
def test_set_logical_types_invalid_data(sample_column_names, sample_inferred_logical_types):
    """Check the errors ``set_types`` raises for bad columns, types and tags."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    # Column that is not part of the schema.
    missing_column_pattern = re.escape(
        "logical_types contains columns that are not present in TableSchema: ['birthday']"
    )
    with pytest.raises(ColumnNotPresentError, match=missing_column_pattern):
        schema.set_types(logical_types={"birthday": Double})

    # Shared opening of the message used for both invalid logical type inputs.
    invalid_type_prefix = (
        "Logical Types must be of the LogicalType class "
        "and registered in Woodwork's type system. "
    )

    # A string is passed where a logical type is expected.
    string_type_message = invalid_type_prefix + "Double does not meet that criteria."
    with pytest.raises(TypeError, match=string_type_message):
        schema.set_types(logical_types={"id": "Double"})

    # A builtin Python type is passed where a logical type is expected.
    builtin_type_message = invalid_type_prefix + "<class 'int'> does not meet that criteria."
    with pytest.raises(TypeError, match=builtin_type_message):
        schema.set_types(logical_types={"age": int})

    # None is passed as a semantic tag value.
    bad_tags_message = "semantic_tags for full_name must be a string, set or list"
    with pytest.raises(TypeError, match=bad_tags_message):
        schema.set_types(semantic_tags={"full_name": None})
def test_schema_adds_standard_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Check the tags on ``id`` and ``age`` with standard tags switched on and off."""
    # (use_standard_tags, expected tags for 'id', expected tags for 'age')
    scenarios = (
        (True, {"category"}, {"numeric"}),
        (False, set(), set()),
    )

    for use_tags, expected_id_tags, expected_age_tags in scenarios:
        schema = TableSchema(
            sample_column_names,
            logical_types={**sample_inferred_logical_types, "id": Categorical},
            use_standard_tags=use_tags,
            name="schema",
        )
        assert schema.semantic_tags["id"] == expected_id_tags
        assert schema.semantic_tags["age"] == expected_age_tags
def test_schema_repr_empty():
    """Check the text and HTML representations of a schema with no columns."""
    empty_schema = TableSchema([], {})

    text_repr = repr(empty_schema)
    assert text_repr == 'Empty DataFrame\nColumns: [Logical Type, Semantic Tag(s)]\nIndex: []'

    html_repr = empty_schema._repr_html_()
    assert html_repr == '<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>Logical Type</th>\n <th>Semantic Tag(s)</th>\n </tr>\n <tr>\n <th>Column</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>'
def test_reset_semantic_tags_invalid_column(sample_column_names, sample_inferred_logical_types):
    """Check that resetting tags on an unknown column raises ``LookupError``."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    expected_message = (
        "Input contains columns that are not present in dataframe: 'invalid_column'"
    )
    with pytest.raises(LookupError, match=expected_message):
        schema.reset_semantic_tags("invalid_column")
def test_reset_semantic_tags_invalid_column(sample_column_names, sample_inferred_logical_types):
    """Check that resetting tags on an unknown column raises ``ColumnNotPresentError``."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    # Parentheses and brackets in the message are regex metacharacters, so escape them.
    expected_pattern = re.escape("Column(s) '['invalid_column']' not found in DataFrame")
    with pytest.raises(ColumnNotPresentError, match=expected_pattern):
        schema.reset_semantic_tags("invalid_column")
def test_column_schema_metadata(sample_column_names, sample_inferred_logical_types):
    """Check that column metadata is empty by default and stored when supplied."""
    id_metadata = {"metadata_field": [1, 2, 3], "created_by": "user0"}

    # No column_metadata argument: the column's metadata is an empty dict.
    plain_schema = TableSchema(sample_column_names, sample_inferred_logical_types)
    assert plain_schema.columns["id"].metadata == {}

    # Metadata supplied for 'id' is available on that column.
    schema_with_metadata = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        column_metadata={"id": id_metadata},
    )
    assert schema_with_metadata.columns["id"].metadata == id_metadata
def test_index_replacing_standard_tags(sample_column_names, sample_inferred_logical_types):
    """Check that the ``id`` column is tagged ``index`` instead of ``numeric`` when it is the index."""
    # Without an index, 'id' carries its standard tag.
    standard_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags=True,
    )
    id_column = standard_schema.columns["id"]
    assert id_column.semantic_tags == {"numeric"}

    # As the index, 'id' carries only the index tag.
    indexed_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        index="id",
    )
    id_column = indexed_schema.columns["id"]
    assert id_column.semantic_tags == {"index"}
def test_index_replacing_standard_tags(sample_column_names, sample_inferred_logical_types):
    """Check the tags on ``id`` with standard tags only, then with ``id`` as the index."""
    base_args = (sample_column_names, sample_inferred_logical_types)

    # Standard tags requested and no index set.
    schema = TableSchema(*base_args, use_standard_tags=True)
    assert schema.columns["id"].semantic_tags == {"numeric"}

    # 'id' set as the index.
    schema = TableSchema(*base_args, index="id")
    assert schema.columns["id"].semantic_tags == {"index"}
def test_schema_repr(small_df):
    """Check the text and HTML representations of a single-column schema."""
    column_names = list(small_df.columns)
    schema = TableSchema(column_names, logical_types={"sample_datetime_series": Datetime})

    # Plain-text representation
    expected_text = " Logical Type Semantic Tag(s)\nColumn \nsample_datetime_series Datetime []"
    actual_text = repr(schema)
    assert actual_text == expected_text

    # HTML representation
    expected_html = '<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>Logical Type</th>\n <th>Semantic Tag(s)</th>\n </tr>\n <tr>\n <th>Column</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>sample_datetime_series</th>\n <td>Datetime</td>\n <td>[]</td>\n </tr>\n </tbody>\n</table>'
    actual_html = schema._repr_html_()
    assert actual_html == expected_html
def test_reset_all_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Check that ``reset_semantic_tags`` with no arguments drops custom tags on all columns."""
    custom_tags = {"full_name": "tag1", "age": "age"}
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags=custom_tags,
        use_standard_tags=True,
    )

    schema.reset_semantic_tags()

    # Custom tags are removed; 'age' is left with its standard tag.
    tags_after_reset = schema.semantic_tags
    assert tags_after_reset["full_name"] == set()
    assert tags_after_reset["age"] == {"numeric"}
def test_reset_all_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Check the tags left on ``full_name`` and ``age`` after a full tag reset."""
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags={"full_name": "tag1", "age": "age"},
        use_standard_tags=True,
    )
    schema.reset_semantic_tags()

    # Expected tags per column once every custom tag has been reset.
    expected_after_reset = {"full_name": set(), "age": {"numeric"}}
    for column, expected_tags in expected_after_reset.items():
        assert schema.semantic_tags[column] == expected_tags
def test_column_schema_metadata(sample_column_names, sample_inferred_logical_types):
    """Check the ``id`` column's metadata with and without ``column_metadata`` at init."""
    column_metadata = {"metadata_field": [1, 2, 3], "created_by": "user0"}

    # Default: the column's metadata is an empty dict.
    default_schema = TableSchema(sample_column_names, sample_inferred_logical_types)
    default_id_metadata = default_schema.columns["id"].metadata
    assert default_id_metadata == {}

    # Explicit metadata for 'id' is stored on that column.
    explicit_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        column_metadata={"id": column_metadata},
    )
    explicit_id_metadata = explicit_schema.columns["id"].metadata
    assert explicit_id_metadata == column_metadata
def test_add_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Check that ``add_semantic_tags`` merges list, string and set inputs into existing tags."""
    starting_tags = {"full_name": "tag1", "age": ["numeric", "age"]}
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags=starting_tags,
        use_standard_tags=False,
        index="id",
    )

    # One tag value of each accepted container kind: list, str and set.
    additions = {"full_name": ["list_tag"], "age": "str_tag", "id": {"set_tag"}}
    schema.add_semantic_tags(additions)

    assert schema.semantic_tags["full_name"] == {"tag1", "list_tag"}
    assert schema.semantic_tags["age"] == {"numeric", "age", "str_tag"}
    # The index tag on 'id' is kept alongside the new tag.
    assert schema.semantic_tags["id"] == {"set_tag", "index"}
def test_filter_schema_overlap_name_and_type(sample_column_names, sample_inferred_logical_types):
    """Check ``_filter_cols`` when one string matches a column name, a tag and a logical type."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    # "full_name" is only treated as a column name when col_names=True.
    without_names = schema._filter_cols(include="full_name")
    assert without_names == []

    with_names = schema._filter_cols(include="full_name", col_names=True)
    assert with_names == ["full_name"]

    # Rebuild so that "person_full_name" is both a semantic tag (on 'id') and
    # the logical type of 'age'.
    overlapping_types = {
        **sample_inferred_logical_types,
        "full_name": Categorical,
        "age": PersonFullName,
    }
    schema = TableSchema(
        sample_column_names,
        overlapping_types,
        semantic_tags={"id": "person_full_name"},
    )

    tag_and_type_matches = schema._filter_cols(include="person_full_name")
    assert set(tag_and_type_matches) == {"id", "age"}

    all_three_matches = schema._filter_cols(
        include=["person_full_name", "full_name"],
        col_names=True,
    )
    assert set(all_three_matches) == {"id", "age", "full_name"}
def test_filter_schema_cols_no_matches(sample_column_names, sample_inferred_logical_types):
    """Check ``_filter_cols`` results when the filter value matches nothing.

    ``include`` is expected to return an empty list and ``exclude`` to return
    every column, for an unknown string, an empty list and a non-string value.
    """
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
    )

    # Filter values that match no column: unknown string, empty list, non-string.
    unmatched_filters = ("nothing", [], 1)

    for filter_value in unmatched_filters:
        included = schema._filter_cols(include=filter_value)
        assert included == []

    for filter_value in unmatched_filters:
        remaining = schema._filter_cols(exclude=filter_value)
        assert set(remaining) == set(sample_column_names)
def test_filter_schema_cols_include(sample_column_names, sample_inferred_logical_types):
    """Check ``_filter_cols(include=...)`` by logical type, type name, tag and column name."""
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
        use_standard_tags=True,
    )

    # Filter by logical type class.
    datetime_cols = schema._filter_cols(include=Datetime)
    assert set(datetime_cols) == {"signup_date", "datetime_with_NaT"}

    # Filter by column name.
    email_cols = schema._filter_cols(include="email", col_names=True)
    assert email_cols == ["email"]

    # A logical type given by name or by class must select the same columns.
    unknown_by_name = schema._filter_cols(include="Unknown")
    unknown_by_class = schema._filter_cols(include=Unknown)
    assert unknown_by_class == unknown_by_name
    assert set(unknown_by_class) == {"full_name"}

    # Filter by semantic tag.
    numeric_columns = {
        "integer",
        "double",
        "double_with_nan",
        "age",
        "nullable_integer",
    }
    numeric_cols = schema._filter_cols(include="numeric")
    assert set(numeric_cols) == numeric_columns

    # Mixed logical type name and column name; results must fall in the allowed list.
    mixed_cols = schema._filter_cols(include=["Unknown", "email"], col_names=True)
    allowed = ["full_name", "phone_number", "email"]
    for col in mixed_cols:
        assert col in allowed
def test_schema_init_with_col_origins(sample_column_names, sample_inferred_logical_types):
    """Check that ``column_origins`` accepts a per-column dict or a single string."""
    # Dict form: listed columns get their origin, the rest compare equal to None.
    per_column_origins = {"age": "base", "signup_date": "engineered"}
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        column_origins=per_column_origins,
    )
    for col_name, col_schema in schema.columns.items():
        assert col_schema.origin == per_column_origins.get(col_name)

    # String form: every column gets the same origin.
    schema_single_origin = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        column_origins="base",
    )
    for col_schema in schema_single_origin.columns.values():
        assert col_schema.origin == "base"
def test_filter_schema_cols_exclude(sample_column_names, sample_inferred_logical_types):
    """Check ``_filter_cols(exclude=...)`` by logical type, type name, tag and column name."""
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        time_index="signup_date",
        index="id",
        name="df_name",
        use_standard_tags=True,
    )

    # Exclude by logical type class.
    without_datetime = schema._filter_cols(exclude=Datetime)
    assert "signup_date" not in without_datetime

    # Exclude by column name.
    without_email = schema._filter_cols(exclude="email", col_names=True)
    assert "email" not in without_email

    # A logical type given by name or by class must exclude the same columns.
    excluded_by_name = schema._filter_cols(exclude="NaturalLanguage")
    excluded_by_class = schema._filter_cols(exclude=NaturalLanguage)
    assert excluded_by_class == excluded_by_name
    assert set(excluded_by_class) == {"id", "age", "signup_date", "is_registered"}

    # Exclude by semantic tag.
    without_numeric = schema._filter_cols(exclude="numeric")
    assert "age" not in without_numeric

    # Mixed logical type name and column name; results must fall in the allowed list.
    mixed_exclusion = schema._filter_cols(
        exclude=["NaturalLanguage", "email"],
        col_names=True,
    )
    allowed = ["id", "age", "signup_date", "is_registered"]
    for col in mixed_exclusion:
        assert col in allowed
def test_filter_schema_non_string_cols():
    """Check that ``_filter_cols`` works when column names are integers."""
    # Columns 0..3 are typed Integer, Categorical, NaturalLanguage and Double in that order.
    types_in_order = [Integer, Categorical, NaturalLanguage, Double]
    schema = TableSchema(
        column_names=list(range(4)),
        logical_types=dict(enumerate(types_in_order)),
        use_standard_tags=True,
    )

    # Mix of a logical type class and a semantic tag.
    by_type_and_tag = schema._filter_cols(include=[Integer, "category"])
    assert by_type_and_tag == [0, 1]

    # Integer column names, matched by name.
    by_name = schema._filter_cols(include=[0, 1], col_names=True)
    assert by_name == [0, 1]
def test_use_standard_tags_from_dict(sample_column_names, sample_inferred_logical_types):
    """Check that ``use_standard_tags`` accepts full and partial per-column dicts.

    A partial dict is expected to behave like a full dict in which the omitted
    columns are set to False.
    """
    # Every column explicitly set to False.
    default_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags=dict.fromkeys(sample_column_names, False),
    )
    assert default_schema.use_standard_tags == dict.fromkeys(sample_column_names, False)

    # Every column listed, with mixed True/False values.
    use_standard_tags = {
        "id": True,
        "full_name": False,
        "email": True,
        "phone_number": True,
        "age": False,
        "signup_date": True,
        "is_registered": False,
    }
    full_dict_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags=use_standard_tags,
    )
    assert full_dict_schema.use_standard_tags == use_standard_tags

    # Only the True columns listed; must match the full dict above.
    listed_columns = ["id", "email", "phone_number", "signup_date"]
    partial_dict_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags=dict.fromkeys(listed_columns, True),
    )
    assert full_dict_schema.use_standard_tags == partial_dict_schema.use_standard_tags
    assert full_dict_schema == partial_dict_schema

    # Only some columns listed, all False; must match the all-False schema.
    partial_dict_default_schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        use_standard_tags=dict.fromkeys(listed_columns, False),
    )
    assert default_schema.use_standard_tags == partial_dict_default_schema.use_standard_tags
    assert default_schema == partial_dict_default_schema
def test_add_semantic_tags(sample_column_names, sample_inferred_logical_types):
    """Check the per-column tags after ``add_semantic_tags`` with list, str and set values."""
    schema = TableSchema(
        sample_column_names,
        sample_inferred_logical_types,
        semantic_tags={"full_name": "tag1", "age": ["numeric", "age"]},
        use_standard_tags=False,
        index="id",
    )

    schema.add_semantic_tags(
        {"full_name": ["list_tag"], "age": "str_tag", "id": {"set_tag"}}
    )

    # New tags are added to the tags each column already had, including 'index' on 'id'.
    expected_tags = {
        "full_name": {"tag1", "list_tag"},
        "age": {"numeric", "age", "str_tag"},
        "id": {"set_tag", "index"},
    }
    for column, tags in expected_tags.items():
        assert schema.semantic_tags[column] == tags
def test_schema_logical_types(sample_column_names, sample_inferred_logical_types):
    """Check that ``logical_types`` is a dict keyed by column name that mirrors ``columns``."""
    schema = TableSchema(sample_column_names, sample_inferred_logical_types)

    assert isinstance(schema.logical_types, dict)

    # One entry per column name.
    reported_columns = set(schema.logical_types.keys())
    assert reported_columns == set(sample_column_names)

    # Each entry equals the logical type stored on the matching column.
    for col_name, ltype in schema.logical_types.items():
        assert ltype == schema.columns[col_name].logical_type