def test_invalid_logical_type(sample_series):
    """Check the errors raised for unsupported ``logical_type`` arguments.

    Passing the builtin ``int`` must raise ``TypeError``; passing an
    unrecognized type-name string must raise ``ValueError``.
    """
    type_error_text = "Invalid logical type specified for 'sample_series'"
    value_error_text = "String naturalllanguage is not a valid logical type"

    with pytest.raises(TypeError, match=type_error_text):
        DataColumn(sample_series, int)

    with pytest.raises(ValueError, match=value_error_text):
        DataColumn(sample_series, 'naturalllanguage')
def test_datacolumn_with_alternate_semantic_tags_input(sample_series):
    """Semantic tags may be given as a single string or as a set."""
    # A bare string ends up as a one-element set.
    from_string = DataColumn(sample_series,
                             semantic_tags='custom_tag',
                             use_standard_tags=False)
    assert from_string.semantic_tags == {'custom_tag'}

    # A set is stored unchanged.
    tag_set = {'custom_tag', 'numeric'}
    from_set = DataColumn(sample_series,
                          semantic_tags=tag_set,
                          use_standard_tags=False)
    assert from_set.semantic_tags == tag_set
def test_datacolumn_init(sample_series):
    """Check series, name, logical type and tags of a default DataColumn."""
    column = DataColumn(sample_series, use_standard_tags=False)

    is_koalas = ks and isinstance(sample_series, ks.Series)
    if not is_koalas:
        # Koalas has no category dtype, so only cast for other series types.
        sample_series = sample_series.astype('category')

    actual = to_pandas(column.to_series())
    expected = to_pandas(sample_series)
    pd.testing.assert_series_equal(actual, expected)

    assert column.name == sample_series.name
    assert column.logical_type == Categorical
    assert column.semantic_tags == set()
def test_reset_semantic_tags_without_standard_tags(sample_series):
    """Resetting tags with standard tags disabled returns a new, untagged column."""
    original = DataColumn(sample_series,
                          semantic_tags='initial_tag',
                          use_standard_tags=False)

    reset_col = original.reset_semantic_tags()

    # The reset result is a different object with no tags left.
    assert reset_col is not original
    assert reset_col.semantic_tags == set()
def test_set_semantic_tags(sample_series):
    """``set_semantic_tags`` returns a new column holding only the new tags."""
    initial_tags = {'tag1', 'tag2'}
    original = DataColumn(sample_series,
                          semantic_tags=initial_tags,
                          use_standard_tags=False)
    assert original.semantic_tags == initial_tags

    replacement_tags = ['new_tag']
    updated = original.set_semantic_tags(replacement_tags)

    assert updated is not original
    assert updated.semantic_tags == set(replacement_tags)
def test_datacolumn_repr(sample_series):
    """The repr reports the physical type, logical type and semantic tags."""
    column = DataColumn(sample_series, use_standard_tags=False)

    # Koalas doesn't support categorical, so its physical type is 'object'.
    is_koalas = ks and isinstance(sample_series, ks.Series)
    dtype = 'object' if is_koalas else 'category'

    expected_repr = (
        f'<DataColumn: sample_series (Physical Type = {dtype}) '
        '(Logical Type = Categorical) (Semantic Tags = set())>'
    )
    assert repr(column) == expected_repr
def test_shape(sample_series):
    """``DataColumn.shape`` is ``(4,)`` and matches the underlying series."""
    column = DataColumn(sample_series)
    shapes = [column.shape, column.to_series().shape]

    if dd and isinstance(sample_series, dd.Series):
        # For Dask input the first shape entry needs .compute() before comparing.
        shapes = [(shape[0].compute(),) for shape in shapes]

    column_shape, underlying_shape = shapes
    assert column_shape == (4,)
    assert column_shape == underlying_shape
def test_warns_on_setting_duplicate_tag(sample_series):
    """Adding tags already on the column emits one DuplicateTagsWarning."""
    existing_tags = ['first_tag', 'second_tag']
    column = DataColumn(sample_series,
                        semantic_tags=existing_tags,
                        use_standard_tags=False)

    with pytest.warns(DuplicateTagsWarning) as captured:
        column.add_semantic_tags(['first_tag', 'second_tag'])

    # Exactly one warning, naming both duplicated tags.
    assert len(captured) == 1
    warning_text = captured[0].message.args[0]
    assert warning_text == "Semantic tag(s) 'first_tag, second_tag' already present on column 'sample_series'"
def test_set_semantic_tags_with_time_index(sample_datetime_series):
    """``set_semantic_tags`` keeps 'time_index' unless ``retain_index_tags=False``."""
    column = DataColumn(sample_datetime_series,
                        semantic_tags={'tag1', 'tag2'},
                        use_standard_tags=False)
    column._set_as_time_index()
    assert column.semantic_tags == {'tag1', 'tag2', 'time_index'}

    replacement = ['new_tag']

    # With the default arguments the index tag is kept.
    kept_index = column.set_semantic_tags(replacement)
    assert kept_index.semantic_tags == {'time_index', 'new_tag'}

    # Opting out drops the index tag too.
    dropped_index = kept_index.set_semantic_tags(replacement, retain_index_tags=False)
    assert dropped_index.semantic_tags == {'new_tag'}
def test_reset_semantic_tags_with_standard_tags(sample_series):
    """Resetting tags with standard tags enabled leaves the standard tags."""
    original = DataColumn(sample_series,
                          semantic_tags='initial_tag',
                          logical_type=Categorical,
                          use_standard_tags=True)

    reset_col = original.reset_semantic_tags()

    assert reset_col is not original
    assert reset_col.semantic_tags == Categorical.standard_tags
def test_datacolumn_init_with_extension_array():
    """Build DataColumns directly from pandas extension arrays."""
    # Unnamed categorical array.
    categorical_array = pd.Categorical([1, 2, 3])
    expected_categories = pd.Series([1, 2, 3], dtype='category')

    categorical_col = DataColumn(categorical_array)
    result = categorical_col.to_series()
    assert result.equals(expected_categories)
    assert result.name is None
    assert categorical_col.name is None
    assert categorical_col.dtype == 'category'
    assert categorical_col.logical_type == Categorical

    # Masked integer array with an explicit name; the masked slot reads as missing.
    integer_array = pd.arrays.IntegerArray(
        np.array([1, 2, 3, 4], dtype="int64"),
        mask=np.array([False, False, True, False]),
    )
    expected_ints = pd.Series([1, 2, None, 4], dtype='Int64')

    named_col = DataColumn(integer_array, name='extension')
    result = named_col.to_series()
    assert result.equals(expected_ints)
    assert result.name == 'extension'
    assert named_col.name == 'extension'

    # Same integer array with a NaturalLanguage logical type requested.
    expected_strings = pd.Series([1, 2, None, 4], dtype='string')

    natural_language_col = DataColumn(integer_array, logical_type='NaturalLanguage')
    result = natural_language_col.to_series()
    assert result.equals(expected_strings)
    assert natural_language_col.logical_type == NaturalLanguage
    assert natural_language_col.dtype == 'string'
def test_set_logical_type_without_standard_tags(sample_series):
    """Changing the logical type returns a new column with its tags cleared."""
    original = DataColumn(sample_series,
                          logical_type=NaturalLanguage,
                          semantic_tags='original_tag',
                          use_standard_tags=False)

    converted = original.set_logical_type(Categorical)

    assert isinstance(converted, DataColumn)
    assert converted is not original
    assert converted.logical_type == Categorical
    # The custom tag from the original column is not carried over.
    assert converted.semantic_tags == set()
def test_datacolumn_init_with_logical_type(sample_series):
    """NaturalLanguage can be given as a class, snake_case name or class name."""
    accepted_specs = (NaturalLanguage, "natural_language", "NaturalLanguage")

    for type_spec in accepted_specs:
        column = DataColumn(sample_series, type_spec)
        assert column.logical_type == NaturalLanguage
        assert column.semantic_tags == set()
def test_semantic_tag_errors(sample_series):
    """Invalid ``semantic_tags`` arguments raise ``TypeError`` with a clear message."""
    # (bad semantic_tags value, expected error text), checked in order.
    invalid_cases = [
        (int, "semantic_tags must be a string, set or list"),
        ({'index': {}, 'time_index': {}}, "semantic_tags must be a string, set or list"),
        (['index', 1], "semantic_tags must contain only strings"),
    ]

    for bad_tags, expected_text in invalid_cases:
        with pytest.raises(TypeError, match=expected_text):
            DataColumn(sample_series, semantic_tags=bad_tags)
def test_ordinal_with_order(sample_series):
    """An Ordinal instance keeps its order at init and via ``set_logical_type``."""
    is_koalas = ks and isinstance(sample_series, ks.Series)
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_koalas or is_dask:
        pytest.xfail('Fails with Dask and Koalas - ordinal data validation not compatible')

    ordered_type = Ordinal(order=['a', 'b', 'c'])

    # Ordinal supplied when the column is created.
    created = DataColumn(sample_series, logical_type=ordered_type)
    assert isinstance(created.logical_type, Ordinal)
    assert created.logical_type.order == ['a', 'b', 'c']

    # Ordinal applied to an existing column.
    natural_language_col = DataColumn(sample_series, logical_type="NaturalLanguage")
    converted = natural_language_col.set_logical_type(ordered_type)
    assert isinstance(converted.logical_type, Ordinal)
    assert converted.logical_type.order == ['a', 'b', 'c']
def test_add_custom_tags(sample_series):
    """``add_semantic_tags`` accepts a string, a list or a set and accumulates tags."""
    original = DataColumn(sample_series,
                          semantic_tags='initial_tag',
                          use_standard_tags=False)

    # String input; the result is a new column object.
    with_string = original.add_semantic_tags('string_tag')
    assert with_string is not original
    assert with_string.semantic_tags == {'initial_tag', 'string_tag'}

    # List input.
    with_list = with_string.add_semantic_tags(['list_tag'])
    assert with_list.semantic_tags == {'initial_tag', 'string_tag', 'list_tag'}

    # Set input.
    with_set = with_list.add_semantic_tags({'set_tag'})
    assert with_set.semantic_tags == {'initial_tag', 'string_tag', 'list_tag', 'set_tag'}
def test_latlong_formatting(latlongs):
    """Every series in ``latlongs`` yields the same LatLong DataColumn."""
    first_input = latlongs[0]
    expected_values = pd.Series([(1, 2), (3, 4)])
    if ks and isinstance(first_input, ks.Series):
        # The Koalas expectation is built from lists rather than tuples.
        expected_values = ks.Series([[1, 2], [3, 4]])
    elif dd and isinstance(first_input, dd.Series):
        expected_values = dd.from_pandas(expected_values, npartitions=2)

    reference_col = DataColumn(expected_values, logical_type='LatLong', name='test_series')

    for candidate in latlongs:
        column = DataColumn(candidate, logical_type='LatLong', name='test_series')
        pd.testing.assert_series_equal(to_pandas(column.to_series()),
                                       to_pandas(expected_values))
        assert column == reference_col
def test_remove_semantic_tags(sample_series):
    """``remove_semantic_tags`` accepts a string, a list or a set."""
    original = DataColumn(sample_series,
                          semantic_tags=['tag1', 'tag2'],
                          use_standard_tags=False)

    # The same tag expressed in each supported input form.
    for removal_spec in ('tag1', ['tag1'], {'tag1'}):
        trimmed = original.remove_semantic_tags(removal_spec)
        assert trimmed is not original
        assert trimmed.semantic_tags == {'tag2'}
def test_does_not_add_standard_tags():
    """With ``use_standard_tags=False`` only the custom tag is present."""
    values = pd.Series([1, 2, 3])
    column = DataColumn(values,
                        logical_type=Double,
                        semantic_tags='custom_tag',
                        use_standard_tags=False)
    assert column.semantic_tags == {'custom_tag'}
def test_adds_numeric_standard_tag():
    """Integer and Double columns get the 'numeric' tag next to custom tags."""
    values = pd.Series([1, 2, 3])

    for numeric_type in (Integer, Double):
        column = DataColumn(values,
                            logical_type=numeric_type,
                            semantic_tags='custom_tag')
        assert column.semantic_tags == {'custom_tag', 'numeric'}
def test_adds_category_standard_tag():
    """The listed logical types all get the 'category' tag next to custom tags."""
    values = pd.Series([1, 2, 3])
    category_like_types = (
        Categorical,
        CountryCode,
        Ordinal(order=(1, 2, 3)),
        SubRegionCode,
        ZIPCode,
    )

    for category_type in category_like_types:
        column = DataColumn(values,
                            logical_type=category_type,
                            semantic_tags='custom_tag')
        assert column.semantic_tags == {'custom_tag', 'category'}
def test_ordinal_with_incomplete_ranking(sample_series):
    """An Ordinal order that omits a value in the data raises ``ValueError``."""
    is_koalas = ks and isinstance(sample_series, ks.Series)
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_koalas or is_dask:
        pytest.xfail('Fails with Dask and Koalas - ordinal data validation not supported')

    # The order below leaves out 'c', which the expected message reports as missing.
    partial_order = Ordinal(order=['a', 'b'])
    expected_pattern = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']"
    )

    with pytest.raises(ValueError, match=expected_pattern):
        DataColumn(sample_series, logical_type=partial_order)
def test_datacolumn_init_with_numpy_array():
    """Build DataColumns from a numpy array, with and without overrides."""
    raw_values = np.array([1, 2, 3, 4])
    reference = pd.Series([1, 2, 3, 4], dtype='Int64')

    # No overrides: unnamed Integer column with the 'numeric' tag.
    inferred = DataColumn(raw_values)
    assert inferred.name is None
    assert inferred.logical_type == Integer
    assert inferred.semantic_tags == {'numeric'}
    assert inferred.dtype == 'Int64'
    assert inferred._series.equals(reference)

    # Explicit logical type and name.
    overridden = DataColumn(raw_values, logical_type='NaturalLanguage', name='test_col')
    reference.name = 'test_col'
    assert overridden.name == 'test_col'
    assert overridden.logical_type == NaturalLanguage
    assert overridden.semantic_tags == set()
    assert overridden.dtype == 'string'
    assert overridden._series.equals(reference.astype('string'))
def test_raises_error_setting_time_index_tag_directly(sample_series):
    """'time_index' cannot be set through the constructor or the tag methods."""
    expected_pattern = re.escape(
        "Cannot add 'time_index' tag directly. To set a column as the time index, "
        "use DataTable.set_time_index() instead."
    )

    # Rejected at construction time.
    with pytest.raises(ValueError, match=expected_pattern):
        DataColumn(sample_series, semantic_tags='time_index')

    column = DataColumn(sample_series)

    # Rejected when adding to an existing column.
    with pytest.raises(ValueError, match=expected_pattern):
        column.add_semantic_tags('time_index')

    # Rejected when replacing the tags of an existing column.
    with pytest.raises(ValueError, match=expected_pattern):
        column.set_semantic_tags('time_index')
def test_ordinal_requires_instance_on_update(sample_series):
    """``set_logical_type`` rejects the bare Ordinal class and the 'Ordinal' name."""
    column = DataColumn(sample_series, logical_type="NaturalLanguage")
    expected_text = 'Must use an Ordinal instance with order values defined'

    # Neither form carries order values, so both must raise.
    for uninstantiated in (Ordinal, "Ordinal"):
        with pytest.raises(TypeError, match=expected_text):
            column.set_logical_type(uninstantiated)
def test_datacolumn_init_with_name(sample_series, sample_datetime_series):
    """Check how the ``name`` argument interacts with the series' own name.

    Without ``name`` the column takes the series name. With a different
    ``name`` a ``ColumnNameMismatchWarning`` is emitted and both the column
    and its underlying series carry the new name.
    """
    name = 'sample_series'
    changed_name = 'changed_name'

    dc_use_series_name = DataColumn(sample_series)
    assert dc_use_series_name.name == name
    assert dc_use_series_name.to_series().name == name

    # ``match`` is a regular expression, so the literal message is escaped;
    # otherwise the '.' in it would match any character.
    warning = re.escape(
        'Name mismatch between sample_series and changed_name. '
        'DataColumn and underlying series name are now changed_name'
    )
    with pytest.warns(ColumnNameMismatchWarning, match=warning):
        dc_use_input_name = DataColumn(sample_series, name=changed_name)
    assert dc_use_input_name.name == changed_name
    assert dc_use_input_name.to_series().name == changed_name

    # Same check against the datetime fixture.
    warning = re.escape(
        'Name mismatch between sample_datetime_series and changed_name. '
        'DataColumn and underlying series name are now changed_name'
    )
    with pytest.warns(ColumnNameMismatchWarning, match=warning):
        dc_with_ltype_change = DataColumn(sample_datetime_series, name=changed_name)
    assert dc_with_ltype_change.name == changed_name
    assert dc_with_ltype_change.to_series().name == changed_name
def test_reset_semantic_tags_with_time_index(sample_datetime_series):
    """``reset_semantic_tags`` keeps 'time_index' only when asked to."""
    column = DataColumn(sample_datetime_series,
                        semantic_tags='initial_tag',
                        use_standard_tags=False)
    column._set_as_time_index()

    # Explicitly retaining index tags keeps 'time_index'.
    retained = column.reset_semantic_tags(retain_index_tags=True)
    assert retained.semantic_tags == {'time_index'}

    # The default call removes every tag.
    cleared = column.reset_semantic_tags()
    assert cleared.semantic_tags == set()
def test_set_logical_type_retains_time_index_tag(sample_datetime_series):
    """``set_logical_type`` keeps 'time_index' unless ``retain_index_tags=False``."""
    column = DataColumn(sample_datetime_series,
                        logical_type=Datetime,
                        semantic_tags='original_tag',
                        use_standard_tags=False)
    column._set_as_time_index()
    assert column.semantic_tags == {'time_index', 'original_tag'}

    # Default call: the custom tag goes, the index tag stays.
    kept_index = column.set_logical_type(Categorical)
    assert kept_index.semantic_tags == {'time_index'}

    # Opting out removes the index tag as well.
    dropped_index = column.set_logical_type(Categorical, retain_index_tags=False)
    assert dropped_index.semantic_tags == set()
def test_datacolumn_metadata(sample_series):
    """Metadata defaults to ``{}``, can be passed in, reassigned and mutated."""
    initial_metadata = {'metadata_field': [1, 2, 3], 'created_by': 'user0'}

    # No metadata supplied.
    bare_col = DataColumn(sample_series)
    assert bare_col.metadata == {}

    # Metadata supplied at construction.
    column = DataColumn(sample_series, metadata=initial_metadata)
    assert column.metadata == initial_metadata

    # Reassign with a merged dict; the overlapping key takes the newer value.
    additions = {'date_created': '1/1/19', 'created_by': 'user1'}
    column.metadata = {**column.metadata, **additions}
    assert column.metadata == {
        'date_created': '1/1/19',
        'metadata_field': [1, 2, 3],
        'created_by': 'user1',
    }

    # Remove a key in place.
    column.metadata.pop('created_by')
    assert column.metadata == {
        'date_created': '1/1/19',
        'metadata_field': [1, 2, 3],
    }

    # Add a key in place.
    column.metadata['number'] = 1012034
    assert column.metadata == {
        'date_created': '1/1/19',
        'metadata_field': [1, 2, 3],
        'number': 1012034,
    }
def test_remove_standard_semantic_tag(sample_series):
    """Check warning behavior when a standard tag is removed from a column.

    With ``use_standard_tags=True`` removing 'category' emits exactly one
    ``UserWarning``; with ``use_standard_tags=False`` no warning is emitted.
    In both cases the tag is removed.
    """
    # Local import keeps this test self-contained.
    import warnings

    # Check that warning is raised if use_standard_tags is True - tag should be removed
    data_col = DataColumn(sample_series,
                          logical_type=Categorical,
                          semantic_tags='tag1',
                          use_standard_tags=True)
    expected_message = "Removing standard semantic tag(s) 'category' from column 'sample_series'"
    with pytest.warns(UserWarning) as record:
        new_col = data_col.remove_semantic_tags(['tag1', 'category'])
    assert len(record) == 1
    assert record[0].message.args[0] == expected_message
    assert new_col.semantic_tags == set()

    # Check that warning is not raised if use_standard_tags is False - tag should be removed
    data_col = DataColumn(sample_series,
                          logical_type=Categorical,
                          semantic_tags=['category', 'tag1'],
                          use_standard_tags=False)
    # ``pytest.warns(None)`` is deprecated in pytest 7 and rejected by pytest 8,
    # so the warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # Record every warning, regardless of the active filters.
        warnings.simplefilter('always')
        new_col = data_col.remove_semantic_tags(['tag1', 'category'])
    assert len(record) == 0
    assert new_col.semantic_tags == set()