예제 #1
0
def test_invalid_logical_type(sample_series):
    """Check that unusable ``logical_type`` arguments are rejected.

    The builtin ``int`` is expected to raise ``TypeError`` and the misspelled
    string 'naturalllanguage' is expected to raise ``ValueError``.
    """
    rejected_inputs = [
        (int, TypeError, "Invalid logical type specified for 'sample_series'"),
        ('naturalllanguage', ValueError, "String naturalllanguage is not a valid logical type"),
    ]
    for bad_type, expected_exception, expected_text in rejected_inputs:
        with pytest.raises(expected_exception, match=expected_text):
            DataColumn(sample_series, bad_type)
예제 #2
0
def test_datacolumn_with_alternate_semantic_tags_input(sample_series):
    """Semantic tags may be supplied as a single string or as a set of strings."""
    # A bare string ends up as a one-element tag set.
    string_tag_col = DataColumn(sample_series, semantic_tags='custom_tag', use_standard_tags=False)
    assert string_tag_col.semantic_tags == {'custom_tag'}

    # A set of tags is stored as given when standard tags are disabled.
    tag_set = {'custom_tag', 'numeric'}
    set_tag_col = DataColumn(sample_series, semantic_tags=tag_set, use_standard_tags=False)
    assert set_tag_col.semantic_tags == tag_set
예제 #3
0
def test_datacolumn_init(sample_series):
    """A DataColumn built with defaults keeps the series data and name.

    The logical type is expected to be ``Categorical`` and, with standard
    tags disabled, the semantic tag set is expected to be empty.
    """
    column = DataColumn(sample_series, use_standard_tags=False)

    # Koalas doesn't support the category dtype, so the input is compared
    # unchanged in that case; otherwise it is cast to 'category' first.
    is_koalas = ks and isinstance(sample_series, ks.Series)
    expected = sample_series if is_koalas else sample_series.astype('category')

    pd.testing.assert_series_equal(to_pandas(column.to_series()), to_pandas(expected))
    assert column.name == expected.name
    assert column.logical_type == Categorical
    assert column.semantic_tags == set()
예제 #4
0
def test_reset_semantic_tags_without_standard_tags(sample_series):
    """Resetting tags with standard tags disabled returns a new column with no tags."""
    original = DataColumn(sample_series, semantic_tags='initial_tag', use_standard_tags=False)

    reset = original.reset_semantic_tags()

    # The reset must hand back a different object rather than the original.
    assert reset is not original
    assert reset.semantic_tags == set()
예제 #5
0
def test_set_semantic_tags(sample_series):
    """``set_semantic_tags`` returns a new column holding only the replacement tags."""
    starting_tags = {'tag1', 'tag2'}
    original = DataColumn(sample_series, semantic_tags=starting_tags, use_standard_tags=False)
    assert original.semantic_tags == starting_tags

    replacement_tags = ['new_tag']
    updated = original.set_semantic_tags(replacement_tags)

    assert updated is not original
    assert updated.semantic_tags == set(replacement_tags)
예제 #6
0
def test_datacolumn_repr(sample_series):
    """The repr lists the column name, physical type, logical type and semantic tags."""
    column = DataColumn(sample_series, use_standard_tags=False)

    # Koalas doesn't support categorical, so 'object' is expected there.
    is_koalas = ks and isinstance(sample_series, ks.Series)
    physical_type = 'object' if is_koalas else 'category'

    expected_repr = (
        f'<DataColumn: sample_series (Physical Type = {physical_type}) '
        '(Logical Type = Categorical) (Semantic Tags = set())>'
    )
    assert column.__repr__() == expected_repr
예제 #7
0
def test_shape(sample_series):
    """``DataColumn.shape`` equals the shape of the series it returns, here ``(4,)``."""
    column = DataColumn(sample_series)
    actual = column.shape
    expected = column.to_series().shape

    if dd and isinstance(sample_series, dd.Series):
        # For Dask input the length element is computed explicitly before comparing.
        actual, expected = (actual[0].compute(),), (expected[0].compute(),)

    assert actual == (4,)
    assert actual == expected
예제 #8
0
def test_warns_on_setting_duplicate_tag(sample_series):
    """Re-adding tags that are already present records one warning with the expected text."""
    existing_tags = ['first_tag', 'second_tag']
    column = DataColumn(sample_series, semantic_tags=existing_tags, use_standard_tags=False)

    with pytest.warns(DuplicateTagsWarning) as caught:
        column.add_semantic_tags(['first_tag', 'second_tag'])

    # Exactly one warning should have been recorded for both duplicate tags.
    assert len(caught) == 1
    warning_text = caught[0].message.args[0]
    assert warning_text == "Semantic tag(s) 'first_tag, second_tag' already present on column 'sample_series'"
예제 #9
0
def test_set_semantic_tags_with_time_index(sample_datetime_series):
    """Replacing tags keeps 'time_index' by default and drops it on request.

    ``retain_index_tags=False`` is expected to remove the 'time_index' tag.
    """
    column = DataColumn(sample_datetime_series, semantic_tags={'tag1', 'tag2'}, use_standard_tags=False)
    column._set_as_time_index()
    assert column.semantic_tags == {'tag1', 'tag2', 'time_index'}

    replacement_tags = ['new_tag']

    index_kept = column.set_semantic_tags(replacement_tags)
    assert index_kept.semantic_tags == {'time_index', 'new_tag'}

    index_dropped = index_kept.set_semantic_tags(replacement_tags, retain_index_tags=False)
    assert index_dropped.semantic_tags == {'new_tag'}
예제 #10
0
def test_reset_semantic_tags_with_standard_tags(sample_series):
    """Resetting tags with standard tags enabled leaves only the logical type's standard tags."""
    original = DataColumn(
        sample_series,
        semantic_tags='initial_tag',
        logical_type=Categorical,
        use_standard_tags=True,
    )

    reset = original.reset_semantic_tags()

    assert reset is not original
    assert reset.semantic_tags == Categorical.standard_tags
예제 #11
0
def test_datacolumn_init_with_extension_array():
    """DataColumn can be built directly from pandas extension arrays.

    Three cases are covered: a Categorical array with no name, a nullable
    IntegerArray with an explicit name, and the same IntegerArray created
    with the 'NaturalLanguage' logical type.
    """
    # Case 1: categorical extension array, no name supplied.
    expected_categories = pd.Series([1, 2, 3], dtype='category')
    categorical_array = pd.Categorical([1, 2, 3])

    categorical_col = DataColumn(categorical_array)
    result = categorical_col.to_series()
    assert result.equals(expected_categories)
    assert result.name is None
    assert categorical_col.name is None
    assert categorical_col.dtype == 'category'
    assert categorical_col.logical_type == Categorical

    # Case 2: nullable integer extension array (third value masked out) with a name.
    expected_ints = pd.Series([1, 2, None, 4], dtype='Int64')
    raw_values = np.array([1, 2, 3, 4], dtype="int64")
    missing_mask = np.array([False, False, True, False])
    integer_array = pd.arrays.IntegerArray(raw_values, mask=missing_mask)

    named_col = DataColumn(integer_array, name='extension')
    result = named_col.to_series()
    assert result.equals(expected_ints)
    assert result.name == 'extension'
    assert named_col.name == 'extension'

    # Case 3: same integer array, requested as NaturalLanguage.
    expected_strings = pd.Series([1, 2, None, 4], dtype='string')

    natural_language_col = DataColumn(integer_array, logical_type='NaturalLanguage')
    result = natural_language_col.to_series()
    assert result.equals(expected_strings)
    assert natural_language_col.logical_type == NaturalLanguage
    assert natural_language_col.dtype == 'string'
예제 #12
0
def test_set_logical_type_without_standard_tags(sample_series):
    """Changing the logical type returns a new DataColumn with its tags cleared."""
    original = DataColumn(
        sample_series,
        logical_type=NaturalLanguage,
        semantic_tags='original_tag',
        use_standard_tags=False,
    )

    converted = original.set_logical_type(Categorical)

    assert isinstance(converted, DataColumn)
    assert converted is not original
    assert converted.logical_type == Categorical
    # The custom tag is expected to be gone after the type change.
    assert converted.semantic_tags == set()
예제 #13
0
def test_datacolumn_init_with_logical_type(sample_series):
    """The logical type may be given as a class, a snake_case string or a class-name string."""
    type_specifiers = (NaturalLanguage, "natural_language", "NaturalLanguage")
    for specifier in type_specifiers:
        column = DataColumn(sample_series, specifier)
        assert column.logical_type == NaturalLanguage
        assert column.semantic_tags == set()
예제 #14
0
def test_semantic_tag_errors(sample_series):
    """Invalid ``semantic_tags`` arguments raise ``TypeError`` with a descriptive message.

    A non-container value (``int``) and a dict are rejected as the wrong
    container type; a list holding a non-string is rejected for its contents.
    """
    wrong_container = "semantic_tags must be a string, set or list"
    wrong_element = "semantic_tags must contain only strings"

    invalid_inputs = [
        (int, wrong_container),
        ({'index': {}, 'time_index': {}}, wrong_container),
        (['index', 1], wrong_element),
    ]
    for bad_tags, expected_text in invalid_inputs:
        with pytest.raises(TypeError, match=expected_text):
            DataColumn(sample_series, semantic_tags=bad_tags)
예제 #15
0
def test_ordinal_with_order(sample_series):
    """An ``Ordinal`` instance keeps its order at construction and via ``set_logical_type``."""
    is_koalas = ks and isinstance(sample_series, ks.Series)
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_koalas or is_dask:
        pytest.xfail('Fails with Dask and Koalas - ordinal data validation not compatible')

    ranked_type = Ordinal(order=['a', 'b', 'c'])

    # Ordinal supplied when the column is created.
    created = DataColumn(sample_series, logical_type=ranked_type)
    assert isinstance(created.logical_type, Ordinal)
    assert created.logical_type.order == ['a', 'b', 'c']

    # Ordinal applied afterwards to a NaturalLanguage column.
    natural_language_col = DataColumn(sample_series, logical_type="NaturalLanguage")
    converted = natural_language_col.set_logical_type(ranked_type)
    assert isinstance(converted.logical_type, Ordinal)
    assert converted.logical_type.order == ['a', 'b', 'c']
예제 #16
0
def test_add_custom_tags(sample_series):
    """``add_semantic_tags`` accepts a string, a list or a set and accumulates tags."""
    base_col = DataColumn(sample_series, semantic_tags='initial_tag', use_standard_tags=False)

    # Tag given as a plain string; a new column object is returned.
    with_string = base_col.add_semantic_tags('string_tag')
    assert with_string is not base_col
    assert with_string.semantic_tags == {'initial_tag', 'string_tag'}

    # Tag given inside a list.
    with_list = with_string.add_semantic_tags(['list_tag'])
    assert with_list.semantic_tags == {'initial_tag', 'string_tag', 'list_tag'}

    # Tag given inside a set.
    with_set = with_list.add_semantic_tags({'set_tag'})
    assert with_set.semantic_tags == {'initial_tag', 'string_tag', 'list_tag', 'set_tag'}
예제 #17
0
def test_latlong_formatting(latlongs):
    """Every series in ``latlongs`` yields the same LatLong column as the reference series."""
    pandas_reference = pd.Series([(1, 2), (3, 4)])
    if ks and isinstance(latlongs[0], ks.Series):
        # The Koalas reference is built from lists rather than tuples.
        reference = ks.Series([[1, 2], [3, 4]])
    elif dd and isinstance(latlongs[0], dd.Series):
        reference = dd.from_pandas(pandas_reference, npartitions=2)
    else:
        reference = pandas_reference

    reference_col = DataColumn(reference, logical_type='LatLong', name='test_series')

    for candidate in latlongs:
        candidate_col = DataColumn(candidate, logical_type='LatLong', name='test_series')
        pd.testing.assert_series_equal(to_pandas(candidate_col.to_series()), to_pandas(reference))

        assert candidate_col == reference_col
예제 #18
0
def test_remove_semantic_tags(sample_series):
    """``remove_semantic_tags`` accepts a string, list or set and returns a new column."""
    column = DataColumn(sample_series, semantic_tags=['tag1', 'tag2'], use_standard_tags=False)

    # The same tag expressed in each supported input form.
    removal_forms = ['tag1', ['tag1'], {'tag1'}]

    for removal in removal_forms:
        trimmed = column.remove_semantic_tags(removal)
        assert trimmed is not column
        assert trimmed.semantic_tags == {'tag2'}
예제 #19
0
def test_does_not_add_standard_tags():
    """With ``use_standard_tags=False`` a Double column keeps only the custom tag."""
    values = pd.Series([1, 2, 3])
    column = DataColumn(
        values,
        logical_type=Double,
        semantic_tags='custom_tag',
        use_standard_tags=False,
    )
    assert column.semantic_tags == {'custom_tag'}
예제 #20
0
def test_adds_numeric_standard_tag():
    """Integer and Double columns gain the 'numeric' tag alongside a custom tag."""
    values = pd.Series([1, 2, 3])
    custom_tag = 'custom_tag'

    for numeric_type in (Integer, Double):
        column = DataColumn(values, logical_type=numeric_type, semantic_tags=custom_tag)
        assert column.semantic_tags == {'custom_tag', 'numeric'}
예제 #21
0
def test_adds_category_standard_tag():
    """The listed category-like logical types gain the 'category' tag alongside a custom tag."""
    values = pd.Series([1, 2, 3])
    custom_tag = 'custom_tag'

    category_types = [
        Categorical,
        CountryCode,
        Ordinal(order=(1, 2, 3)),
        SubRegionCode,
        ZIPCode,
    ]
    for category_type in category_types:
        column = DataColumn(values, logical_type=category_type, semantic_tags=custom_tag)
        assert column.semantic_tags == {'custom_tag', 'category'}
예제 #22
0
def test_ordinal_with_incomplete_ranking(sample_series):
    """An Ordinal order that omits a value present in the data raises ``ValueError``."""
    is_koalas = ks and isinstance(sample_series, ks.Series)
    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_koalas or is_dask:
        pytest.xfail('Fails with Dask and Koalas - ordinal data validation not supported')

    # 'c' is deliberately left out of the order.
    partial_order = Ordinal(order=['a', 'b'])

    # The message contains brackets, so it is escaped before use as a regex.
    expected_pattern = re.escape("Ordinal column sample_series contains values that are not "
                                 "present in the order values provided: ['c']")
    with pytest.raises(ValueError, match=expected_pattern):
        DataColumn(sample_series, logical_type=partial_order)
예제 #23
0
def test_datacolumn_init_with_numpy_array():
    """DataColumn can be built from a NumPy array, with or without a name and logical type."""
    raw_values = np.array([1, 2, 3, 4])
    expected = pd.Series([1, 2, 3, 4], dtype='Int64')

    # Defaults: no name, Integer logical type with the 'numeric' tag.
    inferred_col = DataColumn(raw_values)
    assert inferred_col.name is None
    assert inferred_col.logical_type == Integer
    assert inferred_col.semantic_tags == {'numeric'}
    assert inferred_col.dtype == 'Int64'
    assert inferred_col._series.equals(expected)

    # Explicit name and NaturalLanguage logical type.
    named_col = DataColumn(raw_values, logical_type='NaturalLanguage', name='test_col')
    expected.name = 'test_col'

    assert named_col.name == 'test_col'
    assert named_col.logical_type == NaturalLanguage
    assert named_col.semantic_tags == set()
    assert named_col.dtype == 'string'
    assert named_col._series.equals(expected.astype('string'))
예제 #24
0
def test_raises_error_setting_time_index_tag_directly(sample_series):
    """The 'time_index' tag cannot be set via the constructor, add or set methods."""
    expected_pattern = re.escape("Cannot add 'time_index' tag directly. To set a column as the time index, "
                                 "use DataTable.set_time_index() instead.")

    # Rejected at construction time.
    with pytest.raises(ValueError, match=expected_pattern):
        DataColumn(sample_series, semantic_tags='time_index')

    # Rejected by both tag-mutating methods on an existing column.
    column = DataColumn(sample_series)
    for tag_method in (column.add_semantic_tags, column.set_semantic_tags):
        with pytest.raises(ValueError, match=expected_pattern):
            tag_method('time_index')
예제 #25
0
def test_ordinal_requires_instance_on_update(sample_series):
    """``set_logical_type`` rejects the bare Ordinal class and the 'Ordinal' string."""
    column = DataColumn(sample_series, logical_type="NaturalLanguage")
    expected_text = 'Must use an Ordinal instance with order values defined'

    for uninstantiated in (Ordinal, "Ordinal"):
        with pytest.raises(TypeError, match=expected_text):
            column.set_logical_type(uninstantiated)
예제 #26
0
def test_datacolumn_init_with_name(sample_series, sample_datetime_series):
    """The series name is used by default; an explicit ``name`` overrides it with a warning."""
    original_name = 'sample_series'
    override_name = 'changed_name'

    # No name argument: the series' own name is used.
    default_named = DataColumn(sample_series)
    assert default_named.name == original_name
    assert default_named.to_series().name == original_name

    # A conflicting name argument warns and renames both column and series.
    expected_warning = 'Name mismatch between sample_series and changed_name. DataColumn and underlying series name are now changed_name'
    with pytest.warns(ColumnNameMismatchWarning, match=expected_warning):
        renamed = DataColumn(sample_series, name=override_name)
    assert renamed.name == override_name
    assert renamed.to_series().name == override_name

    # Same check for the datetime fixture.
    expected_warning = 'Name mismatch between sample_datetime_series and changed_name. DataColumn and underlying series name are now changed_name'
    with pytest.warns(ColumnNameMismatchWarning, match=expected_warning):
        renamed_datetime = DataColumn(sample_datetime_series, name=override_name)
    assert renamed_datetime.name == override_name
    assert renamed_datetime.to_series().name == override_name
예제 #27
0
def test_reset_semantic_tags_with_time_index(sample_datetime_series):
    """Reset keeps 'time_index' only when ``retain_index_tags=True`` is passed."""
    column = DataColumn(sample_datetime_series, semantic_tags='initial_tag', use_standard_tags=False)
    column._set_as_time_index()

    index_kept = column.reset_semantic_tags(retain_index_tags=True)
    assert index_kept.semantic_tags == {'time_index'}

    # The default call is expected to drop the index tag as well.
    fully_reset = column.reset_semantic_tags()
    assert fully_reset.semantic_tags == set()
예제 #28
0
def test_set_logical_type_retains_time_index_tag(sample_datetime_series):
    """Changing the logical type keeps 'time_index' unless ``retain_index_tags=False``."""
    column = DataColumn(
        sample_datetime_series,
        logical_type=Datetime,
        semantic_tags='original_tag',
        use_standard_tags=False,
    )
    column._set_as_time_index()
    assert column.semantic_tags == {'time_index', 'original_tag'}

    # Default: the custom tag is dropped but the index tag survives.
    index_kept = column.set_logical_type(Categorical)
    assert index_kept.semantic_tags == {'time_index'}

    index_dropped = column.set_logical_type(Categorical, retain_index_tags=False)
    assert index_dropped.semantic_tags == set()
예제 #29
0
def test_datacolumn_metadata(sample_series):
    """Metadata defaults to an empty dict and can be set, reassigned and mutated in place."""
    initial_metadata = {'metadata_field': [1, 2, 3], 'created_by': 'user0'}

    # No metadata supplied.
    bare_col = DataColumn(sample_series)
    assert bare_col.metadata == {}

    # Metadata supplied at construction.
    column = DataColumn(sample_series, metadata=initial_metadata)
    assert column.metadata == initial_metadata

    # Reassign the attribute with a merged dict; 'created_by' is overwritten.
    extra_metadata = {'date_created': '1/1/19', 'created_by': 'user1'}
    column.metadata = {**column.metadata, **extra_metadata}
    assert column.metadata == {'date_created': '1/1/19', 'metadata_field': [1, 2, 3], 'created_by': 'user1'}

    # Remove a key in place.
    column.metadata.pop('created_by')
    assert column.metadata == {'date_created': '1/1/19', 'metadata_field': [1, 2, 3]}

    # Add a key in place.
    column.metadata['number'] = 1012034
    assert column.metadata == {'date_created': '1/1/19', 'metadata_field': [1, 2, 3], 'number': 1012034}
예제 #30
0
def test_remove_standard_semantic_tag(sample_series):
    """Removing a standard tag warns only when ``use_standard_tags`` is True.

    In both scenarios the tags 'tag1' and 'category' are removed and the
    resulting column is expected to have no semantic tags.
    """
    # Local import so this test does not depend on a module-level import.
    import warnings

    # Check that warning is raised if use_standard_tags is True - tag should be removed
    data_col = DataColumn(sample_series,
                          logical_type=Categorical,
                          semantic_tags='tag1',
                          use_standard_tags=True)
    expected_message = "Removing standard semantic tag(s) 'category' from column 'sample_series'"
    with pytest.warns(UserWarning) as record:
        new_col = data_col.remove_semantic_tags(['tag1', 'category'])
    assert len(record) == 1
    assert record[0].message.args[0] == expected_message
    assert new_col.semantic_tags == set()

    # Check that warning is not raised if use_standard_tags is False - tag should be removed
    data_col = DataColumn(sample_series,
                          logical_type=Categorical,
                          semantic_tags=['category', 'tag1'],
                          use_standard_tags=False)

    # ``pytest.warns(None)`` is deprecated in pytest 7 and an error in pytest 8,
    # so record warnings with the standard library instead. The "always"
    # filter ensures nothing is suppressed by existing warning filters.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        new_col = data_col.remove_semantic_tags(['tag1', 'category'])
    assert len(record) == 0
    assert new_col.semantic_tags == set()