Exemplo n.º 1
0
def test_sets_category_dtype_on_init():
    """Check the dtype produced at init for a group of logical types.

    A one-column DataTable is built for every combination of input values
    (fully populated, or containing ``None``/``np.nan``/``pd.NA``/``pd.NaT``)
    and logical type. The column's logical type and dtype, and the dtype of
    the underlying dataframe column, must match ``logical_type.pandas_dtype``.
    """
    column_name = 'test_series'
    value_sets = (
        ['a', 'b', 'c'],
        ['a', None, 'c'],
        ['a', np.nan, 'c'],
        ['a', pd.NA, 'c'],
        ['a', pd.NaT, 'c'],
    )
    candidate_ltypes = (
        Categorical,
        CountryCode,
        Ordinal(order=['a', 'b', 'c']),
        SubRegionCode,
        ZIPCode,
    )

    for raw_values in value_sets:
        # Start from object dtype so the DataTable has to perform the cast
        obj_series = pd.Series(raw_values, name=column_name).astype('object')
        for ltype in candidate_ltypes:
            dt = DataTable(pd.DataFrame(obj_series),
                           logical_types={column_name: ltype})
            expected_dtype = ltype.pandas_dtype
            data_column = dt.columns[column_name]
            assert data_column.logical_type == ltype
            assert data_column.dtype == expected_dtype
            assert dt.to_dataframe()[column_name].dtype == expected_dtype
Exemplo n.º 2
0
def test_sets_string_dtype_on_init():
    """Check the dtype produced at init for a second group of logical types.

    A one-column DataTable is built for every combination of input values
    (fully populated, or containing ``None``/``np.nan``/``pd.NA``) and logical
    type. The column's logical type and dtype, and the dtype of the underlying
    dataframe column, must match ``logical_type.pandas_dtype``.
    """
    column_name = 'test_series'
    value_sets = (
        ['a', 'b', 'c'],
        ['a', None, 'c'],
        ['a', np.nan, 'c'],
        ['a', pd.NA, 'c'],
    )
    candidate_ltypes = (
        Filepath,
        FullName,
        IPAddress,
        NaturalLanguage,
        PhoneNumber,
        URL,
    )

    for raw_values in value_sets:
        # Start from object dtype so the DataTable has to perform the cast
        obj_series = pd.Series(raw_values, name=column_name).astype('object')
        for ltype in candidate_ltypes:
            dt = DataTable(pd.DataFrame(obj_series),
                           logical_types={column_name: ltype})
            expected_dtype = ltype.pandas_dtype
            data_column = dt.columns[column_name]
            assert data_column.logical_type == ltype
            assert data_column.dtype == expected_dtype
            assert dt.to_dataframe()[column_name].dtype == expected_dtype
Exemplo n.º 3
0
def test_to_csv(sample_df, tmpdir):
    """Round-trip an annotated DataTable through CSV serialization.

    The table (with name, index, semantic tags, logical types, column
    descriptions and column metadata) is written to ``tmpdir`` with
    ``to_csv`` and read back with ``deserialize.read_datatable``. Both the
    underlying data and the DataTable objects must compare equal.
    """
    dt = DataTable(sample_df,
                   name='test_data',
                   index='id',
                   semantic_tags={'id': 'tag1'},
                   logical_types={'age': Ordinal(order=[25, 33, 57])},
                   column_descriptions={
                       'signup_date': 'original signup date',
                       'age': 'age of the user'
                   },
                   column_metadata={
                       'id': {
                           'is_sorted': True
                       },
                       'age': {
                           'interesting_values': [33, 57]
                       }
                   })

    dt.to_csv(str(tmpdir), encoding='utf-8', engine='python')
    _dt = deserialize.read_datatable(str(tmpdir))

    # Convert each frame with its OWN table's index: the original frame must
    # not be prepared using state taken from the deserialized table.
    pd.testing.assert_frame_equal(
        to_pandas(dt.to_dataframe(), index=dt.index, sort_index=True),
        to_pandas(_dt.to_dataframe(), index=_dt.index, sort_index=True))
    assert dt == _dt
Exemplo n.º 4
0
def test_deserialize_url_csv_anon(sample_df_pandas):
    """Reading a DataTable from ``URL`` with ``profile_name=False`` matches a
    table built locally from ``sample_df_pandas`` with index ``'id'``."""
    expected = DataTable(sample_df_pandas, index='id')
    loaded = deserialize.read_datatable(URL, profile_name=False)

    expected_df = to_pandas(expected.to_dataframe(), index=expected.index)
    loaded_df = to_pandas(loaded.to_dataframe(), index=loaded.index)
    pd.testing.assert_frame_equal(expected_df, loaded_df)
    assert expected == loaded
Exemplo n.º 5
0
def test_to_parquet(sample_df, tmpdir):
    """Round-trip a DataTable through ``to_parquet`` and ``read_datatable``
    and check that both the data and the table compare equal."""
    target_dir = str(tmpdir)
    original = DataTable(sample_df, index='id')
    original.to_parquet(target_dir)
    restored = deserialize.read_datatable(target_dir)

    original_df = to_pandas(original.to_dataframe(),
                            index=original.index,
                            sort_index=True)
    restored_df = to_pandas(restored.to_dataframe(),
                            index=restored.index,
                            sort_index=True)
    pd.testing.assert_frame_equal(original_df, restored_df)
    assert original == restored
Exemplo n.º 6
0
def test_deserialize_s3_csv(sample_df_pandas):
    """Reading a DataTable from ``S3_URL`` matches a table built locally from
    ``sample_df_pandas`` with index ``'id'``."""
    expected = DataTable(sample_df_pandas, index='id')
    loaded = deserialize.read_datatable(S3_URL)

    expected_df = to_pandas(expected.to_dataframe(), index=expected.index)
    loaded_df = to_pandas(loaded.to_dataframe(), index=loaded.index)
    pd.testing.assert_frame_equal(expected_df, loaded_df)
    assert expected == loaded
Exemplo n.º 7
0
def test_filter_cols_errors(sample_df):
    """``_filter_cols`` returns an empty list when ``include`` matches no
    column."""
    dt = DataTable(sample_df,
                   name='dt_name',
                   index='id',
                   time_index='signup_date')

    assert dt._filter_cols(include='nothing') == []
Exemplo n.º 8
0
def test_unserializable_table(sample_df, tmpdir):
    """Serializing a table whose metadata holds a dtype object raises
    ``TypeError`` with the expected message."""
    bad_metadata = {'not_serializable': sample_df['is_registered'].dtype}
    dt = DataTable(sample_df, table_metadata=bad_metadata)

    expected_msg = ("DataTable is not json serializable. "
                    "Check table and column metadata for values that may not be serializable.")
    with pytest.raises(TypeError, match=expected_msg):
        dt.to_csv(str(tmpdir), encoding='utf-8', engine='python')
Exemplo n.º 9
0
def test_sets_object_dtype_on_update(latlong_df):
    """Switching a column from NaturalLanguage to LatLong via ``set_types``
    updates the logical type and the dtype to ``LatLong.pandas_dtype``."""
    expected_dtype = LatLong.pandas_dtype
    for name in latlong_df.columns:
        single_col_df = latlong_df.loc[:, [name]]
        initial = DataTable(single_col_df,
                            logical_types={name: NaturalLanguage})
        updated = initial.set_types(logical_types={name: LatLong})

        data_column = updated.columns[name]
        assert data_column.logical_type == LatLong
        assert data_column.dtype == expected_dtype
        assert updated.to_dataframe()[name].dtype == expected_dtype
Exemplo n.º 10
0
def test_datatable_getitem_list_input(sample_df):
    """Selecting columns with a list returns a new, correctly scoped table.

    Covers: plain columns, a selection containing the index, a selection
    containing the index and the time index, an empty selection, and a
    selection of every column in reversed order. In each case the result must
    be a new DataTable with a new dataframe, and ``index``/``time_index`` must
    be kept only when the corresponding column is part of the selection.
    """
    # Test regular columns
    dt = DataTable(sample_df, time_index='signup_date', index='id', name='dt_name')
    df = dt.to_dataframe()
    columns = ['age', 'full_name']
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[columns]).reset_index(drop=True), to_pandas(new_dt.to_dataframe()))
    assert all(new_dt.to_dataframe().columns == ['age', 'full_name'])
    assert set(new_dt.columns.keys()) == {'age', 'full_name'}
    assert new_dt.index is None
    assert new_dt.time_index is None

    # Test with index
    columns = ['id', 'full_name']
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[columns]), to_pandas(new_dt.to_dataframe()))
    assert all(new_dt.to_dataframe().columns == ['id', 'full_name'])
    assert set(new_dt.columns.keys()) == {'id', 'full_name'}
    assert new_dt.index == 'id'
    assert new_dt.time_index is None

    # Test with time_index
    columns = ['id', 'signup_date', 'full_name']
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    pd.testing.assert_frame_equal(to_pandas(df[columns]), to_pandas(new_dt.to_dataframe()), check_index_type=False)
    assert all(new_dt.to_dataframe().columns == ['id', 'signup_date', 'full_name'])
    assert set(new_dt.columns.keys()) == {'id', 'signup_date', 'full_name'}
    assert new_dt.index == 'id'
    # The time index column is part of the selection, so it must be retained
    assert new_dt.time_index == 'signup_date'

    # Test with empty list selector
    columns = []
    new_dt = dt[columns]
    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    assert to_pandas(new_dt.to_dataframe()).empty
    assert set(new_dt.columns.keys()) == set()
    assert new_dt.index is None
    assert new_dt.time_index is None

    # Test that reversed column order reverses resulting column order
    columns = list(reversed(list(dt.columns.keys())))
    new_dt = dt[columns]

    assert new_dt is not dt
    assert new_dt.to_dataframe() is not df
    assert all(df.columns[::-1] == new_dt.to_dataframe().columns)
    assert all(dt.types.index[::-1] == new_dt.types.index)
    assert all(new_dt.to_dataframe().columns == new_dt.types.index)
    assert set(new_dt.columns.keys()) == set(dt.columns.keys())
    assert new_dt.index == 'id'
    assert new_dt.time_index == 'signup_date'
Exemplo n.º 11
0
def test_sets_object_dtype_on_init(latlong_df):
    """Initializing a column as LatLong sets the logical type and the dtype
    to ``LatLong.pandas_dtype``."""
    expected_dtype = LatLong.pandas_dtype
    for name in latlong_df.columns:
        single_col_df = latlong_df.loc[:, [name]]
        dt = DataTable(single_col_df, logical_types={name: LatLong})

        data_column = dt.columns[name]
        assert data_column.logical_type == LatLong
        assert data_column.dtype == expected_dtype
        assert dt.to_dataframe()[name].dtype == expected_dtype
Exemplo n.º 12
0
def test_setitem_new_column(sample_df):
    """Assigning a DataColumn to a new key adds it to the DataTable.

    Three insertions are checked: an unnamed integer column without standard
    tags, a named string column with standard tags, and an integer column
    with an explicit ``Double`` logical type and a custom semantic tag. After
    each insertion the column's logical type, semantic tags, name, series
    name and underlying dataframe dtype are verified.
    """
    dt = DataTable(sample_df)
    # Koalas inputs need Koalas series and produce different dtypes
    is_koalas = bool(ks and isinstance(sample_df, ks.DataFrame))

    # No name, no standard tags, no logical type
    int_series = pd.Series([1, 2, 3])
    if is_koalas:
        int_series = ks.Series(int_series)
    expected_dtype = 'int64' if is_koalas else 'Int64'

    unnamed_col = DataColumn(int_series, use_standard_tags=False)
    assert unnamed_col.name is None

    dt['test_col2'] = unnamed_col
    updated_df = dt.to_dataframe()
    assert 'test_col2' in dt.columns
    inserted = dt['test_col2']
    assert inserted.logical_type == Integer
    assert inserted.semantic_tags == set()
    assert inserted.name == 'test_col2'
    assert inserted._series.name == 'test_col2'
    assert 'test_col2' in updated_df.columns
    assert updated_df['test_col2'].dtype == expected_dtype

    # Standard tags and no logical type
    str_series = pd.Series(['new', 'column', 'inserted'], name='test_col')
    if is_koalas:
        str_series = ks.Series(str_series)
    expected_dtype = 'object' if is_koalas else 'category'

    dt['test_col'] = DataColumn(str_series, use_standard_tags=True)
    updated_df = dt.to_dataframe()
    assert 'test_col' in dt.columns
    inserted = dt['test_col']
    assert inserted.logical_type == Categorical
    assert inserted.semantic_tags == {'category'}
    assert inserted.name == 'test_col'
    assert inserted._series.name == 'test_col'
    assert 'test_col' in updated_df.columns
    assert updated_df['test_col'].dtype == expected_dtype

    # Add with logical type and semantic tag
    double_series = pd.Series([1, 2, 3])
    if is_koalas:
        double_series = ks.Series(double_series)

    dt['test_col3'] = DataColumn(double_series,
                                 logical_type=Double,
                                 use_standard_tags=False,
                                 semantic_tags={'test_tag'})
    updated_df = dt.to_dataframe()
    assert 'test_col3' in dt.columns
    inserted = dt['test_col3']
    assert inserted.logical_type == Double
    assert inserted.semantic_tags == {'test_tag'}
    assert inserted.name == 'test_col3'
    assert inserted._series.name == 'test_col3'
    assert 'test_col3' in updated_df.columns
    assert updated_df['test_col3'].dtype == 'float'
Exemplo n.º 13
0
def test_pop_error(sample_df):
    """``pop`` raises ``KeyError`` with the expected message for a column
    name that is not in the DataTable."""
    dt = DataTable(sample_df,
                   name='datatable',
                   logical_types={'age': Integer},
                   semantic_tags={'age': 'custom_tag'},
                   use_standard_tags=True)

    expected_msg = "Column with name missing not found in DataTable"
    with pytest.raises(KeyError, match=expected_msg):
        dt.pop("missing")
Exemplo n.º 14
0
def test_serialize_s3_pickle_anon(sample_df_pandas, s3_client, s3_bucket):
    """Round-trip a DataTable through ``to_pickle`` to ``TEST_S3_URL`` and
    ``read_datatable``, both with ``profile_name=False``."""
    original = DataTable(sample_df_pandas)
    original.to_pickle(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    restored = deserialize.read_datatable(TEST_S3_URL, profile_name=False)

    original_df = to_pandas(original.to_dataframe(), index=original.index)
    restored_df = to_pandas(restored.to_dataframe(), index=restored.index)
    pd.testing.assert_frame_equal(original_df, restored_df)
    assert original == restored
Exemplo n.º 15
0
def test_iloc_indices(sample_df):
    """``iloc`` keeps ``index``/``time_index`` only if that column is selected.

    Column positions ``[0, 5]`` contain both the ``'id'`` and the
    ``'signup_date'`` columns; positions ``[1, 2]`` contain neither. Dask
    inputs are expected to fail because ``iloc`` is not supported for them.
    """
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail('iloc is not supported with Dask inputs')
    dt_with_index = DataTable(sample_df, index='id')
    assert dt_with_index.iloc[:, [0, 5]].index == 'id'
    assert dt_with_index.iloc[:, [1, 2]].index is None

    dt_with_time_index = DataTable(sample_df, time_index='signup_date')
    assert dt_with_time_index.iloc[:, [0, 5]].time_index == 'signup_date'
    without_time_index = dt_with_time_index.iloc[:, [1, 2]]
    assert without_time_index.index is None
    # This table never had an index, so the check above cannot fail on its
    # own; the time index is what must be dropped by this selection.
    assert without_time_index.time_index is None
def test_set_semantic_tags_with_index(sample_df):
    """Setting semantic tags on the index column keeps the ``'index'`` tag by
    default and drops it when ``retain_index_tags=False``."""
    dt = DataTable(sample_df, index='id', use_standard_tags=False)
    assert dt.columns['id'].semantic_tags == {'index'}

    tag_update = {'id': 'new_tag'}

    retained = dt.set_types(semantic_tags=tag_update)
    assert retained.columns['id'].semantic_tags == {'index', 'new_tag'}

    replaced = retained.set_types(semantic_tags=tag_update,
                                  retain_index_tags=False)
    assert replaced.columns['id'].semantic_tags == {'new_tag'}
Exemplo n.º 17
0
def test_datatable_init(sample_df):
    """A DataTable built with defaults has no name, index or time index, has
    one column per input column, and returns the very same dataframe object
    from ``to_dataframe``."""
    dt = DataTable(sample_df)
    df = dt.to_dataframe()

    for attribute in ('name', 'index', 'time_index'):
        assert getattr(dt, attribute) is None

    assert set(dt.columns.keys()) == set(sample_df.columns)
    assert df is sample_df
    pd.testing.assert_frame_equal(to_pandas(df), to_pandas(sample_df))
def test_reset_selected_column_semantic_tags(sample_df):
    """Resetting tags for ``'age'`` (given as a string, list or set) leaves
    only its standard tag and does not touch ``'full_name'``."""
    semantic_tags = {'full_name': 'tag1', 'age': 'age'}

    for selector in ('age', ['age'], {'age'}):
        original = DataTable(sample_df,
                             semantic_tags=semantic_tags,
                             use_standard_tags=True)
        reset = original.reset_semantic_tags(selector)
        assert reset.columns['full_name'].semantic_tags == {'tag1'}
        assert reset.columns['age'].semantic_tags == {'numeric'}
Exemplo n.º 19
0
def test_sets_float64_dtype_on_update():
    """Switching a column from Integer to Double via ``set_types`` updates
    the logical type and the dtype to ``Double.pandas_dtype``."""
    column_name = 'test_series'
    obj_series = pd.Series([0, 1, 0], name=column_name).astype('object')

    initial = DataTable(pd.DataFrame(obj_series),
                        logical_types={column_name: Integer})
    updated = initial.set_types(logical_types={column_name: Double})

    expected_dtype = Double.pandas_dtype
    data_column = updated.columns[column_name]
    assert data_column.logical_type == Double
    assert data_column.dtype == expected_dtype
    assert updated.to_dataframe()[column_name].dtype == expected_dtype
Exemplo n.º 20
0
def test_to_parquet_with_latlong(latlong_df, tmpdir):
    """Round-trip a DataTable whose columns are all typed ``'LatLong'``
    through ``to_parquet`` and ``read_datatable``."""
    target_dir = str(tmpdir)
    latlong_types = dict.fromkeys(latlong_df.columns, 'LatLong')

    original = DataTable(latlong_df, logical_types=latlong_types)
    original.to_parquet(target_dir)
    restored = deserialize.read_datatable(target_dir)

    original_df = to_pandas(original.to_dataframe(),
                            index=original.index,
                            sort_index=True)
    restored_df = to_pandas(restored.to_dataframe(),
                            index=restored.index,
                            sort_index=True)
    pd.testing.assert_frame_equal(original_df, restored_df)
    assert original == restored
Exemplo n.º 21
0
def test_serialize_s3_parquet_anon(sample_df, s3_client, s3_bucket):
    """Round-trip a DataTable through ``to_parquet`` to ``TEST_S3_URL`` and
    ``read_datatable``, both with ``profile_name=False``."""
    xfail_tmp_disappears(sample_df)

    original = DataTable(sample_df)
    original.to_parquet(TEST_S3_URL, profile_name=False)
    make_public(s3_client, s3_bucket)
    restored = deserialize.read_datatable(TEST_S3_URL, profile_name=False)

    original_df = to_pandas(original.to_dataframe(), index=original.index)
    restored_df = to_pandas(restored.to_dataframe(), index=restored.index)
    pd.testing.assert_frame_equal(original_df, restored_df)
    assert original == restored
Exemplo n.º 22
0
def test_sets_datetime_dtype_on_update():
    """Switching a column of date strings from NaturalLanguage to Datetime
    via ``set_types`` updates the logical type and the dtype to
    ``Datetime.pandas_dtype``."""
    column_name = 'test_series'
    date_strings = ['2020-01-01', '2020-01-02', '2020-01-03']
    obj_series = pd.Series(date_strings, name=column_name).astype('object')

    initial = DataTable(pd.DataFrame(obj_series),
                        logical_types={column_name: NaturalLanguage})
    updated = initial.set_types(logical_types={column_name: Datetime})

    expected_dtype = Datetime.pandas_dtype
    data_column = updated.columns[column_name]
    assert data_column.logical_type == Datetime
    assert data_column.dtype == expected_dtype
    assert updated.to_dataframe()[column_name].dtype == expected_dtype
def test_reset_semantic_tags_with_index(sample_df):
    """Resetting tags on the index column keeps ``'index'`` only when
    ``retain_index_tags=True``; a plain reset removes every tag."""
    initial = DataTable(sample_df,
                        index='id',
                        semantic_tags={'id': 'tag1'},
                        use_standard_tags=False)
    assert initial['id'].semantic_tags == {'index', 'tag1'}

    index_kept = initial.reset_semantic_tags('id', retain_index_tags=True)
    assert index_kept['id'].semantic_tags == {'index'}

    fully_reset = index_kept.reset_semantic_tags('id')
    assert fully_reset['id'].semantic_tags == set()
Exemplo n.º 24
0
def test_select_ltypes_mixed(sample_df):
    """``select`` accepts a mix of logical type names given as strings and a
    logical type class in the same list."""
    assigned_ltypes = {
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    }
    dt = DataTable(sample_df).set_types(logical_types=assigned_ltypes)

    selected = dt.select(['FullName', 'email_address', Double])
    assert len(selected.columns) == 3
    assert 'phone_number' not in selected.columns
def test_set_semantic_tags_with_time_index(sample_df):
    """Setting semantic tags on the time index column keeps the
    ``'time_index'`` tag by default and drops it when
    ``retain_index_tags=False``."""
    dt = DataTable(sample_df,
                   use_standard_tags=False,
                   time_index='signup_date')
    assert dt.columns['signup_date'].semantic_tags == {'time_index'}

    tag_update = {'signup_date': 'new_tag'}

    retained = dt.set_types(semantic_tags=tag_update)
    assert retained.columns['signup_date'].semantic_tags == {'time_index', 'new_tag'}

    replaced = retained.set_types(semantic_tags=tag_update,
                                  retain_index_tags=False)
    assert replaced.columns['signup_date'].semantic_tags == {'new_tag'}
def test_reset_semantic_tags_with_time_index(sample_df):
    """Resetting tags on the time index column keeps ``'time_index'`` only
    when ``retain_index_tags=True``; a plain reset removes every tag."""
    initial = DataTable(sample_df,
                        time_index='signup_date',
                        semantic_tags={'signup_date': 'tag1'},
                        use_standard_tags=False)
    assert initial['signup_date'].semantic_tags == {'time_index', 'tag1'}

    index_kept = initial.reset_semantic_tags('signup_date',
                                             retain_index_tags=True)
    assert index_kept['signup_date'].semantic_tags == {'time_index'}

    fully_reset = index_kept.reset_semantic_tags('signup_date')
    assert fully_reset['signup_date'].semantic_tags == set()
def test_datatable_drop_indices(sample_df):
    """Dropping the index or time index column clears the matching attribute
    on the returned table and leaves the other one in place."""
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    assert dt.index == 'id'
    assert dt.time_index == 'signup_date'

    # Drop by a single column name
    without_index = dt.drop('id')
    assert 'id' not in without_index.columns
    assert without_index.index is None
    assert without_index.time_index == 'signup_date'

    # Drop by a list of column names
    without_time_index = dt.drop(['signup_date'])
    assert 'signup_date' not in without_time_index.columns
    assert without_time_index.time_index is None
    assert without_time_index.index == 'id'
def test_numeric_index_strings(time_index_df):
    """Time index validation for columns of ``time_index_df``.

    Three constructor calls must raise ``TypeError`` with the time index
    message. Typing ``'strs'`` as ``'Double'`` makes it usable as a time
    index, both at construction and through ``set_time_index``.
    """
    error_msg = 'Time index column must contain datetime or numeric values'
    rejected_kwargs = (
        {'time_index': 'strs'},
        {'time_index': 'ints', 'logical_types': {'ints': 'Categorical'}},
        {'time_index': 'letters', 'logical_types': {'strs': 'Integer'}},
    )
    for kwargs in rejected_kwargs:
        with pytest.raises(TypeError, match=error_msg):
            DataTable(time_index_df, **kwargs)

    expected_tags = {'time_index', 'numeric'}

    # Time index supplied at construction
    dt = DataTable(time_index_df,
                   logical_types={'strs': 'Double'},
                   time_index='strs')
    time_col = dt['strs']
    assert dt.time_index == 'strs'
    assert time_col.logical_type == Double
    assert time_col.semantic_tags == expected_tags

    # Time index supplied after construction
    dt = DataTable(time_index_df, logical_types={'strs': 'Double'})
    dt = dt.set_time_index('strs')
    time_col = dt['strs']
    assert dt.time_index == 'strs'
    assert time_col.logical_type == Double
    assert time_col.semantic_tags == expected_tags
Exemplo n.º 29
0
def test_select_ltypes_table(sample_df):
    """``select`` returns a table that keeps only the relevant table state.

    Checks that the index and time index survive only when their columns are
    selected, that the table name is carried over, and that a selected column
    keeps its logical type, data, dtype and semantic tags.
    """
    dt = DataTable(sample_df, time_index='signup_date', index='id')
    dt = dt.set_types(logical_types={
        'full_name': FullName,
        'email': EmailAddress,
        'phone_number': PhoneNumber,
        'age': Double,
        'signup_date': Datetime,
    })
    # set_types returns a new DataTable; the result must be kept, otherwise
    # the custom tags never reach the table used in the checks below.
    dt = dt.set_types(semantic_tags={
        'full_name': ['new_tag', 'tag2'],
        'age': 'numeric',
    })

    dt_no_indices = dt.select('phone_number')
    assert dt_no_indices.index is None
    assert dt_no_indices.time_index is None

    dt_with_indices = dt.select(['Datetime', 'Integer'])
    assert dt_with_indices.index == 'id'
    assert dt_with_indices.time_index == 'signup_date'

    dt_values = dt.select(['FullName'])
    assert dt_values.name == dt.name
    selected_col = dt_values.columns['full_name']
    source_col = dt.columns['full_name']
    assert source_col.logical_type == selected_col.logical_type
    assert to_pandas(source_col.to_series()).equals(to_pandas(selected_col.to_series()))
    assert source_col.dtype == selected_col.dtype
    assert source_col.semantic_tags == selected_col.semantic_tags
def test_reset_all_semantic_tags(sample_df):
    """``reset_semantic_tags`` without arguments returns a new table holding
    only standard tags and leaves the original table's tags unchanged."""
    original = DataTable(sample_df,
                         use_standard_tags=True,
                         semantic_tags={'full_name': 'tag1', 'age': 'age'})

    reset = original.reset_semantic_tags()

    # Verify original tags were not changed
    assert original.columns['full_name'].semantic_tags == {'tag1'}
    assert original.columns['age'].semantic_tags == {'numeric', 'age'}

    assert reset is not original
    assert reset.columns['full_name'].semantic_tags == set()
    assert reset.columns['age'].semantic_tags == {'numeric'}