def test_sets_category_dtype_on_init():
    """Initializing a DataTable with a category-based logical type should
    coerce the column to that type's pandas dtype, for every null
    representation an object column can hold.
    """
    col = 'test_series'
    # One value list per null flavor pandas accepts in an object column.
    raw_values = [
        ['a', 'b', 'c'],
        ['a', None, 'c'],
        ['a', np.nan, 'c'],
        ['a', pd.NA, 'c'],
        ['a', pd.NaT, 'c'],
    ]
    category_ltypes = [
        Categorical,
        CountryCode,
        Ordinal(order=['a', 'b', 'c']),
        SubRegionCode,
        ZIPCode,
    ]
    for values in raw_values:
        series = pd.Series(values, name=col).astype('object')
        for ltype in category_ltypes:
            dt = DataTable(pd.DataFrame(series), logical_types={col: ltype})
            assert dt.columns[col].logical_type == ltype
            assert dt.columns[col].dtype == ltype.pandas_dtype
            assert dt.to_dataframe()[col].dtype == ltype.pandas_dtype
def test_sets_string_dtype_on_init():
    """String-backed logical types supplied at init should yield that type's
    pandas dtype, regardless of which null representation the raw data holds.
    """
    col = 'test_series'
    # One value list per null flavor pandas accepts in an object column.
    raw_values = [
        ['a', 'b', 'c'],
        ['a', None, 'c'],
        ['a', np.nan, 'c'],
        ['a', pd.NA, 'c'],
    ]
    string_ltypes = [
        Filepath,
        FullName,
        IPAddress,
        NaturalLanguage,
        PhoneNumber,
        URL,
    ]
    for values in raw_values:
        series = pd.Series(values, name=col).astype('object')
        for ltype in string_ltypes:
            dt = DataTable(pd.DataFrame(series), logical_types={col: ltype})
            assert dt.columns[col].logical_type == ltype
            assert dt.columns[col].dtype == ltype.pandas_dtype
            assert dt.to_dataframe()[col].dtype == ltype.pandas_dtype
def test_to_csv(sample_df, tmpdir):
    """Round-trip a fully-annotated DataTable through CSV serialization and
    verify both the dataframe contents and the typing information survive.
    """
    path = str(tmpdir)
    dt = DataTable(
        sample_df,
        name='test_data',
        index='id',
        semantic_tags={'id': 'tag1'},
        logical_types={'age': Ordinal(order=[25, 33, 57])},
        column_descriptions={
            'signup_date': 'original signup date',
            'age': 'age of the user',
        },
        column_metadata={
            'id': {'is_sorted': True},
            'age': {'interesting_values': [33, 57]},
        },
    )
    dt.to_csv(path, encoding='utf-8', engine='python')
    round_tripped = deserialize.read_datatable(path)
    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=round_tripped.index, sort_index=True),
        to_pandas(round_tripped.to_dataframe(), index=round_tripped.index,
                  sort_index=True))
    assert dt == round_tripped
def test_deserialize_url_csv_anon(sample_df_pandas):
    """Reading a CSV-serialized DataTable from a URL without an AWS profile
    (anonymous access) should reproduce the expected table.
    """
    expected = DataTable(sample_df_pandas, index='id')
    read_back = deserialize.read_datatable(URL, profile_name=False)
    pd.testing.assert_frame_equal(
        to_pandas(expected.to_dataframe(), index=expected.index),
        to_pandas(read_back.to_dataframe(), index=read_back.index))
    assert expected == read_back
def test_to_parquet(sample_df, tmpdir):
    """Round-trip a DataTable through parquet serialization."""
    path = str(tmpdir)
    dt = DataTable(sample_df, index='id')
    dt.to_parquet(path)
    round_tripped = deserialize.read_datatable(path)
    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index, sort_index=True),
        to_pandas(round_tripped.to_dataframe(), index=round_tripped.index,
                  sort_index=True))
    assert dt == round_tripped
def test_deserialize_s3_csv(sample_df_pandas):
    """Reading a CSV-serialized DataTable from S3 should reproduce the
    expected table.
    """
    expected = DataTable(sample_df_pandas, index='id')
    read_back = deserialize.read_datatable(S3_URL)
    pd.testing.assert_frame_equal(
        to_pandas(expected.to_dataframe(), index=expected.index),
        to_pandas(read_back.to_dataframe(), index=read_back.index))
    assert expected == read_back
def test_filter_cols_errors(sample_df):
    """_filter_cols with a selector matching no columns returns an empty list."""
    dt = DataTable(sample_df, time_index='signup_date', index='id',
                   name='dt_name')
    assert dt._filter_cols(include='nothing') == []
def test_unserializable_table(sample_df, tmpdir):
    """Serializing a table whose metadata json cannot encode should raise a
    TypeError with a descriptive message.
    """
    # A numpy/pandas dtype object is not json serializable.
    dt = DataTable(
        sample_df,
        table_metadata={'not_serializable': sample_df['is_registered'].dtype})
    expected_message = "DataTable is not json serializable. Check table and column metadata for values that may not be serializable."
    with pytest.raises(TypeError, match=expected_message):
        dt.to_csv(str(tmpdir), encoding='utf-8', engine='python')
def test_sets_object_dtype_on_update(latlong_df):
    """Retyping a column to LatLong via set_types should convert it to
    LatLong's (object) pandas dtype.
    """
    for col in latlong_df.columns:
        dt = DataTable(latlong_df.loc[:, [col]],
                       logical_types={col: NaturalLanguage})
        dt = dt.set_types(logical_types={col: LatLong})
        assert dt.columns[col].logical_type == LatLong
        assert dt.columns[col].dtype == LatLong.pandas_dtype
        assert dt.to_dataframe()[col].dtype == LatLong.pandas_dtype
def test_datatable_getitem_list_input(sample_df):
    """Indexing a DataTable with a list of column names returns a new table
    holding only those columns, preserving index/time_index only when the
    corresponding column is part of the selection, and keeping column order.
    """
    dt = DataTable(sample_df, time_index='signup_date', index='id',
                   name='dt_name')
    df = dt.to_dataframe()

    # Regular columns only: neither index nor time_index survives.
    selection = ['age', 'full_name']
    subset = dt[selection]
    assert subset is not dt
    assert subset.to_dataframe() is not df
    pd.testing.assert_frame_equal(
        to_pandas(df[selection]).reset_index(drop=True),
        to_pandas(subset.to_dataframe()))
    assert all(subset.to_dataframe().columns == ['age', 'full_name'])
    assert set(subset.columns.keys()) == {'age', 'full_name'}
    assert subset.index is None
    assert subset.time_index is None

    # Selection includes the index column.
    selection = ['id', 'full_name']
    subset = dt[selection]
    assert subset is not dt
    assert subset.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[selection]),
                                  to_pandas(subset.to_dataframe()))
    assert all(subset.to_dataframe().columns == ['id', 'full_name'])
    assert set(subset.columns.keys()) == {'id', 'full_name'}
    assert subset.index == 'id'
    assert subset.time_index is None

    # Selection includes the time_index column as well.
    selection = ['id', 'signup_date', 'full_name']
    subset = dt[selection]
    assert subset is not dt
    assert subset.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[selection]),
                                  to_pandas(subset.to_dataframe()),
                                  check_index_type=False)
    assert all(subset.to_dataframe().columns ==
               ['id', 'signup_date', 'full_name'])
    assert set(subset.columns.keys()) == {'id', 'signup_date', 'full_name'}
    assert subset.index == 'id'

    # Empty list selector produces an empty table.
    selection = []
    subset = dt[selection]
    assert subset is not dt
    assert subset.to_dataframe() is not df
    assert to_pandas(subset.to_dataframe()).empty
    assert set(subset.columns.keys()) == set()
    assert subset.index is None
    assert subset.time_index is None

    # Reversed selector reverses the resulting column order.
    selection = list(reversed(list(dt.columns.keys())))
    subset = dt[selection]
    assert subset is not dt
    assert subset.to_dataframe() is not df
    assert all(df.columns[::-1] == subset.to_dataframe().columns)
    assert all(dt.types.index[::-1] == subset.types.index)
    assert all(subset.to_dataframe().columns == subset.types.index)
    assert set(subset.columns.keys()) == set(dt.columns.keys())
    assert subset.index == 'id'
    assert subset.time_index == 'signup_date'
def test_sets_object_dtype_on_init(latlong_df):
    """A LatLong logical type supplied at init should set LatLong's (object)
    pandas dtype on the column.
    """
    for col in latlong_df.columns:
        dt = DataTable(latlong_df.loc[:, [col]], logical_types={col: LatLong})
        assert dt.columns[col].logical_type == LatLong
        assert dt.columns[col].dtype == LatLong.pandas_dtype
        assert dt.to_dataframe()[col].dtype == LatLong.pandas_dtype
def test_setitem_new_column(sample_df):
    """Assigning a DataColumn into a DataTable adopts the assignment key as
    the column name and applies inferred or explicit typing information.
    """
    dt = DataTable(sample_df)
    # The expected backing dtype differs between pandas and koalas inputs.
    is_koalas = ks and isinstance(sample_df, ks.DataFrame)

    # Unnamed integer column without standard tags.
    source = pd.Series([1, 2, 3])
    if is_koalas:
        expected_dtype = 'int64'
        source = ks.Series(source)
    else:
        expected_dtype = 'Int64'
    inserted = DataColumn(source, use_standard_tags=False)
    assert inserted.name is None
    dt['test_col2'] = inserted
    updated_df = dt.to_dataframe()
    assert 'test_col2' in dt.columns
    assert dt['test_col2'].logical_type == Integer
    assert dt['test_col2'].semantic_tags == set()
    assert dt['test_col2'].name == 'test_col2'
    assert dt['test_col2']._series.name == 'test_col2'
    assert 'test_col2' in updated_df.columns
    assert updated_df['test_col2'].dtype == expected_dtype

    # Standard tags and no explicit logical type.
    source = pd.Series(['new', 'column', 'inserted'], name='test_col')
    if is_koalas:
        expected_dtype = 'object'
        source = ks.Series(source)
    else:
        expected_dtype = 'category'
    inserted = DataColumn(source, use_standard_tags=True)
    dt['test_col'] = inserted
    updated_df = dt.to_dataframe()
    assert 'test_col' in dt.columns
    assert dt['test_col'].logical_type == Categorical
    assert dt['test_col'].semantic_tags == {'category'}
    assert dt['test_col'].name == 'test_col'
    assert dt['test_col']._series.name == 'test_col'
    assert 'test_col' in updated_df.columns
    assert updated_df['test_col'].dtype == expected_dtype

    # Explicit logical type and semantic tag.
    source = pd.Series([1, 2, 3])
    if is_koalas:
        source = ks.Series(source)
    inserted = DataColumn(source, logical_type=Double,
                          use_standard_tags=False, semantic_tags={'test_tag'})
    dt['test_col3'] = inserted
    updated_df = dt.to_dataframe()
    assert 'test_col3' in dt.columns
    assert dt['test_col3'].logical_type == Double
    assert dt['test_col3'].semantic_tags == {'test_tag'}
    assert dt['test_col3'].name == 'test_col3'
    assert dt['test_col3']._series.name == 'test_col3'
    assert 'test_col3' in updated_df.columns
    assert updated_df['test_col3'].dtype == 'float'
def test_pop_error(sample_df):
    """Popping a nonexistent column raises a KeyError naming that column."""
    dt = DataTable(sample_df,
                   name='datatable',
                   logical_types={'age': Integer},
                   semantic_tags={'age': 'custom_tag'},
                   use_standard_tags=True)
    with pytest.raises(KeyError,
                       match="Column with name missing not found in DataTable"):
        dt.pop("missing")
def test_serialize_s3_pickle_anon(sample_df_pandas, s3_client, s3_bucket):
    """Pickle round-trip through S3 with anonymous (no-profile) access."""
    pandas_dt = DataTable(sample_df_pandas)
    pandas_dt.to_pickle(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    round_tripped = deserialize.read_datatable(TEST_S3_URL, profile_name=False)
    pd.testing.assert_frame_equal(
        to_pandas(pandas_dt.to_dataframe(), index=pandas_dt.index),
        to_pandas(round_tripped.to_dataframe(), index=round_tripped.index))
    assert pandas_dt == round_tripped
def test_iloc_indices(sample_df):
    """iloc column slicing should keep the index / time_index only when the
    corresponding column is included in the slice.
    """
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail('iloc is not supported with Dask inputs')
    dt_with_index = DataTable(sample_df, index='id')
    assert dt_with_index.iloc[:, [0, 5]].index == 'id'
    assert dt_with_index.iloc[:, [1, 2]].index is None
    dt_with_time_index = DataTable(sample_df, time_index='signup_date')
    assert dt_with_time_index.iloc[:, [0, 5]].time_index == 'signup_date'
    # Bug fix: this previously asserted `.index is None`, which is vacuously
    # true because this table never had an index set. Mirroring the line
    # above, the intent is that slicing away the time_index column clears
    # `.time_index`.
    assert dt_with_time_index.iloc[:, [1, 2]].time_index is None
def test_set_semantic_tags_with_index(sample_df):
    """set_types merges new tags with the 'index' tag by default and replaces
    it when retain_index_tags=False.
    """
    dt = DataTable(sample_df, index='id', use_standard_tags=False)
    assert dt.columns['id'].semantic_tags == {'index'}

    new_tags = {'id': 'new_tag'}
    dt = dt.set_types(semantic_tags=new_tags)
    assert dt.columns['id'].semantic_tags == {'index', 'new_tag'}

    dt = dt.set_types(semantic_tags=new_tags, retain_index_tags=False)
    assert dt.columns['id'].semantic_tags == {'new_tag'}
def test_datatable_init(sample_df):
    """A DataTable built with defaults wraps the dataframe as-is: no name,
    no index, no time_index, and no copy of the underlying data.
    """
    dt = DataTable(sample_df)
    underlying = dt.to_dataframe()
    # No optional attributes were supplied.
    assert dt.name is None
    assert dt.index is None
    assert dt.time_index is None
    # Every column is tracked and the dataframe is the same object, not a copy.
    assert set(dt.columns.keys()) == set(sample_df.columns)
    assert underlying is sample_df
    pd.testing.assert_frame_equal(to_pandas(underlying), to_pandas(sample_df))
def test_reset_selected_column_semantic_tags(sample_df):
    """reset_semantic_tags accepts a string, list, or set column selector and
    only resets the selected column's user tags.
    """
    user_tags = {'full_name': 'tag1', 'age': 'age'}
    for selector in ('age', ['age'], {'age'}):
        dt = DataTable(sample_df, semantic_tags=user_tags,
                       use_standard_tags=True)
        dt = dt.reset_semantic_tags(selector)
        # 'full_name' keeps its tag; 'age' drops back to its standard tag.
        assert dt.columns['full_name'].semantic_tags == {'tag1'}
        assert dt.columns['age'].semantic_tags == {'numeric'}
def test_sets_float64_dtype_on_update():
    """Retyping a column from Integer to Double via set_types should convert
    the underlying dtype to Double's pandas dtype.
    """
    col = 'test_series'
    frame = pd.DataFrame(pd.Series([0, 1, 0], name=col).astype('object'))
    dt = DataTable(frame, logical_types={col: Integer})
    dt = dt.set_types(logical_types={col: Double})
    assert dt.columns[col].logical_type == Double
    assert dt.columns[col].dtype == Double.pandas_dtype
    assert dt.to_dataframe()[col].dtype == Double.pandas_dtype
def test_to_parquet_with_latlong(latlong_df, tmpdir):
    """Parquet round-trip should preserve LatLong-typed columns."""
    path = str(tmpdir)
    ltypes = {col: 'LatLong' for col in latlong_df.columns}
    dt = DataTable(latlong_df, logical_types=ltypes)
    dt.to_parquet(path)
    round_tripped = deserialize.read_datatable(path)
    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index, sort_index=True),
        to_pandas(round_tripped.to_dataframe(), index=round_tripped.index,
                  sort_index=True))
    assert dt == round_tripped
def test_serialize_s3_parquet_anon(sample_df, s3_client, s3_bucket):
    """Parquet round-trip through S3 with anonymous (no-profile) access."""
    xfail_tmp_disappears(sample_df)
    dt = DataTable(sample_df)
    dt.to_parquet(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    round_tripped = deserialize.read_datatable(TEST_S3_URL, profile_name=False)
    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index),
        to_pandas(round_tripped.to_dataframe(), index=round_tripped.index))
    assert dt == round_tripped
def test_sets_datetime_dtype_on_update():
    """Retyping a column from NaturalLanguage to Datetime via set_types
    should convert the underlying dtype to Datetime's pandas dtype.
    """
    col = 'test_series'
    dates = pd.Series(['2020-01-01', '2020-01-02', '2020-01-03'],
                      name=col).astype('object')
    dt = DataTable(pd.DataFrame(dates), logical_types={col: NaturalLanguage})
    dt = dt.set_types(logical_types={col: Datetime})
    assert dt.columns[col].logical_type == Datetime
    assert dt.columns[col].dtype == Datetime.pandas_dtype
    assert dt.to_dataframe()[col].dtype == Datetime.pandas_dtype
def test_reset_semantic_tags_with_index(sample_df):
    """reset_semantic_tags clears user tags on the index column, keeping the
    'index' tag only when retain_index_tags=True.
    """
    dt = DataTable(sample_df,
                   index='id',
                   semantic_tags={'id': 'tag1'},
                   use_standard_tags=False)
    assert dt['id'].semantic_tags == {'index', 'tag1'}

    dt = dt.reset_semantic_tags('id', retain_index_tags=True)
    assert dt['id'].semantic_tags == {'index'}

    dt = dt.reset_semantic_tags('id')
    assert dt['id'].semantic_tags == set()
def test_select_ltypes_mixed(sample_df):
    """select accepts a mixed list of type-name strings (in different casings)
    and logical type classes.
    """
    dt = DataTable(sample_df).set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    selected = dt.select(['FullName', 'email_address', Double])
    assert len(selected.columns) == 3
    assert 'phone_number' not in selected.columns
def test_set_semantic_tags_with_time_index(sample_df):
    """set_types merges new tags with the 'time_index' tag by default and
    replaces it when retain_index_tags=False.
    """
    dt = DataTable(sample_df, time_index='signup_date',
                   use_standard_tags=False)
    assert dt.columns['signup_date'].semantic_tags == {'time_index'}

    new_tags = {'signup_date': 'new_tag'}
    dt = dt.set_types(semantic_tags=new_tags)
    assert dt.columns['signup_date'].semantic_tags == {'time_index', 'new_tag'}

    dt = dt.set_types(semantic_tags=new_tags, retain_index_tags=False)
    assert dt.columns['signup_date'].semantic_tags == {'new_tag'}
def test_reset_semantic_tags_with_time_index(sample_df):
    """reset_semantic_tags clears user tags on the time_index column, keeping
    the 'time_index' tag only when retain_index_tags=True.
    """
    dt = DataTable(sample_df,
                   time_index='signup_date',
                   semantic_tags={'signup_date': 'tag1'},
                   use_standard_tags=False)
    assert dt['signup_date'].semantic_tags == {'time_index', 'tag1'}

    dt = dt.reset_semantic_tags('signup_date', retain_index_tags=True)
    assert dt['signup_date'].semantic_tags == {'time_index'}

    dt = dt.reset_semantic_tags('signup_date')
    assert dt['signup_date'].semantic_tags == set()
def test_datatable_drop_indices(sample_df):
    """Dropping the index or time_index column clears the matching attribute
    on the returned table while leaving the other attribute intact.
    """
    dt = DataTable(sample_df, index='id', time_index='signup_date')
    assert dt.index == 'id'
    assert dt.time_index == 'signup_date'

    without_index = dt.drop('id')
    assert 'id' not in without_index.columns
    assert without_index.index is None
    assert without_index.time_index == 'signup_date'

    without_time_index = dt.drop(['signup_date'])
    assert 'signup_date' not in without_time_index.columns
    assert without_time_index.time_index is None
    assert without_time_index.index == 'id'
def test_numeric_index_strings(time_index_df):
    """A string column is only usable as a time index once it is typed as a
    numeric logical type; non-numeric, non-datetime columns are rejected.
    """
    error_msg = 'Time index column must contain datetime or numeric values'
    # Raw string column rejected outright.
    with pytest.raises(TypeError, match=error_msg):
        DataTable(time_index_df, time_index='strs')
    # Numeric column recast to a non-numeric type is rejected too.
    with pytest.raises(TypeError, match=error_msg):
        DataTable(time_index_df, time_index='ints',
                  logical_types={'ints': 'Categorical'})
    # Retyping a different column does not make this one valid.
    with pytest.raises(TypeError, match=error_msg):
        DataTable(time_index_df, time_index='letters',
                  logical_types={'strs': 'Integer'})

    # Supplying a numeric logical type at init makes the column valid.
    dt = DataTable(time_index_df, time_index='strs',
                   logical_types={'strs': 'Double'})
    date_col = dt['strs']
    assert dt.time_index == 'strs'
    assert date_col.logical_type == Double
    assert date_col.semantic_tags == {'time_index', 'numeric'}

    # Setting the time index after init works as well.
    dt = DataTable(time_index_df, logical_types={'strs': 'Double'})
    dt = dt.set_time_index('strs')
    date_col = dt['strs']
    assert dt.time_index == 'strs'
    assert date_col.logical_type == Double
    assert date_col.semantic_tags == {'time_index', 'numeric'}
def test_select_ltypes_table(sample_df):
    """select should drop the index/time_index attributes when those columns
    are not part of the selection, keep them when they are, and preserve
    column contents, dtypes, and tags of the selected columns.
    """
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    # Bug fix: set_types returns a new DataTable (as every other call in this
    # file reassigns); the original discarded this result, so the semantic
    # tags were never actually applied.
    dt = dt.set_types(semantic_tags={
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
    })

    dt_no_indices = dt.select('phone_number')
    assert dt_no_indices.index is None
    assert dt_no_indices.time_index is None

    dt_with_indices = dt.select(['Datetime', 'Integer'])
    assert dt_with_indices.index == 'id'
    assert dt_with_indices.time_index == 'signup_date'

    dt_values = dt.select(['FullName'])
    assert dt_values.name == dt.name
    original_col = dt_values.columns['full_name']
    col = dt.columns['full_name']
    assert col.logical_type == original_col.logical_type
    assert to_pandas(col.to_series()).equals(
        to_pandas(original_col.to_series()))
    assert col.dtype == original_col.dtype
    assert col.semantic_tags == original_col.semantic_tags
def test_reset_all_semantic_tags(sample_df):
    """Calling reset_semantic_tags with no selector resets user tags on every
    column and leaves the original table untouched.
    """
    user_tags = {'full_name': 'tag1', 'age': 'age'}
    dt = DataTable(sample_df, semantic_tags=user_tags, use_standard_tags=True)
    new_dt = dt.reset_semantic_tags()

    # The original table's tags are unchanged.
    assert dt.columns['full_name'].semantic_tags == {'tag1'}
    assert dt.columns['age'].semantic_tags == {'numeric', 'age'}

    # The returned table is a distinct object with only standard tags left.
    assert new_dt is not dt
    assert new_dt.columns['full_name'].semantic_tags == set()
    assert new_dt.columns['age'].semantic_tags == {'numeric'}