def test_select_semantic_tags_no_match(sample_df):
    """Check ``select`` when some or all selectors match no column.

    Selectors that match nothing must not raise; only the number of columns
    picked up by the remaining selectors is asserted.
    """
    table = DataTable(
        sample_df,
        time_index='signup_date',
        index='id',
        name='dt_name',
    )

    ltype_map = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'signup_date': Datetime(datetime_format='%Y-%m-%d'),
    }
    table = table.set_types(logical_types=ltype_map)

    tag_map = {
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
        'signup_date': 'date_of_birth',
        'email': 'tag2',
    }
    table = table.set_types(semantic_tags=tag_map)

    # A single selector that matches nothing yields an empty table.
    nothing_selected = table.select(['doesnt_exist'])
    assert len(nothing_selected.columns) == 0

    # Unmatched selectors mixed with semantic tags and a logical type.
    mixed_selectors = ['doesnt_exist', 'boolean', 'category', PhoneNumber]
    dt_multiple_unused = table.select(mixed_selectors)
    assert len(dt_multiple_unused.columns) == 2

    # Unmatched tag and unmatched logical type alongside matching selectors.
    ltype_selectors = ['date_of_birth', 'doesnt_exist', ZIPCode, Integer]
    dt_unused_ltype = table.select(ltype_selectors)
    assert len(dt_unused_ltype.columns) == 3
def test_select_ltypes_table(sample_df):
    """Check table-level metadata of DataTables returned by ``select``.

    Verifies that ``index`` / ``time_index`` are only kept when their columns
    are part of the selection, that the table name is carried over, and that
    a selected column keeps its logical type, data, dtype and semantic tags.
    """
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    # set_types returns the updated DataTable, so the result has to be
    # rebound; without the assignment these semantic tags were discarded.
    dt = dt.set_types(semantic_tags={
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
    })

    # Neither the index nor the time index column is selected here.
    dt_no_indices = dt.select('phone_number')
    assert dt_no_indices.index is None
    assert dt_no_indices.time_index is None

    # Both the index and the time index column are part of this selection.
    dt_with_indices = dt.select(['Datetime', 'Integer'])
    assert dt_with_indices.index == 'id'
    assert dt_with_indices.time_index == 'signup_date'

    dt_values = dt.select(['FullName'])
    assert dt_values.name == dt.name

    # The column in the selection must match the column it was taken from.
    selected_col = dt_values.columns['full_name']
    source_col = dt.columns['full_name']
    assert source_col.logical_type == selected_col.logical_type
    assert to_pandas(source_col.to_series()).equals(
        to_pandas(selected_col.to_series()))
    assert source_col.dtype == selected_col.dtype
    assert source_col.semantic_tags == selected_col.semantic_tags
def test_select_ltypes_no_match_and_all(sample_df):
    """Check ``select`` with no matching logical type and with every type."""
    table = DataTable(sample_df)
    ltype_map = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = table.set_types(logical_types=ltype_map)

    # No column is typed as ZIPCode, so nothing is selected.
    no_match = table.select(ZIPCode)
    assert len(no_match.columns) == 0

    # Of the two selectors only PhoneNumber is expected to match.
    one_match = table.select(['ZIPCode', PhoneNumber])
    assert len(one_match.columns) == 1

    # Selecting every registered logical type keeps every column.
    every_ltype = ww.type_system.registered_types
    dt_all_types = table.select(every_ltype)
    assert len(dt_all_types.columns) == len(table.columns)

    selected_frame = dt_all_types.to_dataframe()
    assert len(selected_frame.columns) == len(table.to_dataframe().columns)
def test_select_ltypes_objects(sample_df):
    """Check ``select`` when logical types are passed as class objects."""
    table = DataTable(sample_df)
    ltype_map = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = table.set_types(logical_types=ltype_map)

    # A list of logical type classes selects the union of their columns.
    wanted_ltypes = [FullName, EmailAddress, Double, Boolean, Datetime]
    dt_multiple_ltypes = table.select(wanted_ltypes)
    assert len(dt_multiple_ltypes.columns) == 5
    for excluded in ('phone_number', 'id'):
        assert excluded not in dt_multiple_ltypes.columns

    # A single logical type class may be passed without wrapping it in a list.
    dt_single_ltype = table.select(FullName)
    assert len(dt_single_ltype.columns) == 1
def test_select_semantic_tags(sample_df):
    """Check ``select`` by semantic tag: one tag, several tags, overlaps."""
    table = DataTable(sample_df, time_index='signup_date', name='dt_name')
    tag_map = {
        'full_name': 'tag1',
        'email': ['tag2'],
        'age': ['numeric', 'tag2'],
        'phone_number': ['tag3', 'tag2'],
        'is_registered': 'category',
    }
    table = table.set_types(semantic_tags=tag_map)

    # Single tag given as a bare string.
    dt_one_match = table.select('numeric')
    assert len(dt_one_match.columns) == 2
    for expected in ('age', 'id'):
        assert expected in dt_one_match.columns

    # Single tag shared by several columns.
    dt_multiple_matches = table.select('tag2')
    assert len(dt_multiple_matches.columns) == 3
    for expected in ('age', 'phone_number', 'email'):
        assert expected in dt_multiple_matches.columns

    # Several tags, including the tag carried by the time index column.
    dt_multiple_tags = table.select(['numeric', 'time_index'])
    assert len(dt_multiple_tags.columns) == 3
    for expected in ('id', 'age', 'signup_date'):
        assert expected in dt_multiple_tags.columns

    # 'age' carries both tags and must only be counted once.
    dt_overlapping_tags = table.select(['numeric', 'tag2'])
    assert len(dt_overlapping_tags.columns) == 4
    for expected in ('id', 'age', 'phone_number', 'email'):
        assert expected in dt_overlapping_tags.columns

    # Two tags that are expected to match different columns.
    dt_common_tags = table.select(['category', 'numeric'])
    assert len(dt_common_tags.columns) == 3
    for expected in ('id', 'is_registered', 'age'):
        assert expected in dt_common_tags.columns
def test_select_ltypes_mixed(sample_df):
    """Check ``select`` with logical types given as a mix of names and classes."""
    table = DataTable(sample_df)
    ltype_map = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    table = table.set_types(logical_types=ltype_map)

    # Two string spellings plus a class object in the same selector list.
    mixed_selectors = ['FullName', 'email_address', Double]
    dt_mixed_ltypes = table.select(mixed_selectors)

    assert len(dt_mixed_ltypes.columns) == 3
    assert 'phone_number' not in dt_mixed_ltypes.columns
def test_underlying_index_on_update(sample_df):
    """Track the underlying pandas index type across DataTable operations.

    After ``update_dataframe`` the test checks which index class backs the
    table, both on ``_dataframe`` and on ``to_dataframe()``, following
    ``iloc``, ``select``, ``__getitem__``, ``drop``, ``reset_semantic_tags``,
    ``set_types`` and ``pop``.
    """
    unsupported_inputs = (
        (dd, 'Setting underlying index is not supported with Dask input'),
        (ks, 'Setting underlying index is not supported with Koalas input'),
    )
    for module, reason in unsupported_inputs:
        # The module may be falsy (presumably None when not installed).
        if module and isinstance(sample_df, module.DataFrame):
            pytest.xfail(reason)

    def check_index_type(table, expected_type):
        # Exact type comparison is deliberate: the test tells pd.Index and
        # pd.Int64Index apart, which isinstance would not.
        assert type(table._dataframe.index) == expected_type
        assert type(table.to_dataframe().index) == expected_type

    dt = DataTable(sample_df.copy(), index='id')
    dt.update_dataframe(sample_df.tail(2))

    underlying_index = dt._dataframe.index
    assert (underlying_index == [2, 3]).all()
    assert underlying_index.name is None
    check_index_type(dt, pd.Int64Index)

    # Positional slicing is expected to produce a plain pd.Index.
    check_index_type(dt.iloc[[0, 1]], pd.Index)

    # Selections that still go through the index column keep Int64Index.
    check_index_type(dt.select(dt.index), pd.Int64Index)
    check_index_type(dt[['age']], pd.Int64Index)

    # Once the table no longer has an index, a RangeIndex is expected.
    check_index_type(dt.drop(dt.index), pd.RangeIndex)
    check_index_type(
        dt.reset_semantic_tags(retain_index_tags=False), pd.RangeIndex)
    check_index_type(
        dt.set_types(retain_index_tags=False,
                     semantic_tags={'id': 'numeric'}),
        pd.RangeIndex)

    # pop mutates dt in place, so it has to come last.
    dt.pop(dt.index)
    check_index_type(dt, pd.RangeIndex)