def test_one_hot_legacy(clickstream):
    """Check one_hot_encode when its arguments are passed positionally.

    The lazy result must expose an empty ``_meta``; once computed it must
    have shape (100, 5) and be indexed by a ``pd.MultiIndex``.
    """
    partitioned = dd.from_pandas(clickstream, npartitions=10)
    page_categories = list('ABCDE')
    index_columns = ['index', 'id']

    # Positional call form: column name, categories, index columns.
    encoded = one_hot_encode(partitioned, 'page_id', page_categories,
                             index_columns)
    assert encoded._meta.empty

    computed = encoded.compute()
    assert computed.shape == (100, 5)
    assert isinstance(computed.index, pd.MultiIndex)
def test_one_hot(clickstream):
    """Check one_hot_encode with ``column``/``categories`` keyword arguments.

    The computed result must have shape (100, 5) and a ``pd.MultiIndex``.
    """
    partitioned = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(
        partitioned,
        column='page_id',
        categories=list('ABCDE'),
        index_col=['index', 'id'],
    )

    computed = encoded.compute()
    assert computed.shape == (100, 5)
    assert isinstance(computed.index, pd.MultiIndex)
def test_dask_multi_index_loc(clickstream):
    """Check label slicing with ``.loc`` on the lazy one-hot encoded frame.

    Slicing '2016-01-15':'2016-02-15' must yield rows whose first index
    level spans exactly 2016-01-15 through 2016-02-15 (both inclusive).
    """
    category_map = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ'),
    }
    encoded = one_hot_encode(
        dd.from_pandas(clickstream, npartitions=10),
        categories=category_map,
        index_col=['index', 'id'],
    )

    sliced = encoded.loc['2016-01-15':'2016-02-15'].compute()

    # Dates of the first index level of the computed slice.
    first_level_dates = sliced.index.get_level_values(0).date
    assert first_level_dates.min() == dt.date(2016, 1, 15)
    assert first_level_dates.max() == dt.date(2016, 2, 15)
def test_one_hot_disk_categories(clickstream):
    """Check one_hot_encode when categories are given as a pickle file path.

    A Series of the categories 'A'..'E' is pickled into a temporary
    directory and its path is passed in place of the category list.
    """
    with tmpdir() as scratch_dir:
        pickle_path = os.path.join(scratch_dir, 'cat.pickle')
        pd.Series(list('ABCDE')).to_pickle(pickle_path)

        partitioned = dd.from_pandas(clickstream, npartitions=10)
        encoded = one_hot_encode(
            partitioned,
            categories={'page_id': pickle_path},
            index_col=['index', 'id'],
        )
        assert encoded._meta.empty

        # Compute while the temporary directory (and the pickle) still exist.
        computed = encoded.compute()
        assert computed.shape == (100, 5)
        assert isinstance(computed.index, pd.MultiIndex)
def test_set_index(clickstream):
    """Check that ``set_index(level=1)`` agrees before and after ``compute``.

    Re-indexing the computed frame and re-indexing the lazy frame must
    produce equal dense frames.
    """
    category_map = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ'),
    }
    partitioned = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(
        partitioned,
        categories=category_map,
        order=['other_categorical', 'page_id'],
        index_col=['index', 'id'],
    )

    # Reference: compute first, then set the index on the in-memory frame.
    expected = encoded.compute().set_index(level=1).todense()
    # Under test: set the index lazily, then compute.
    actual = encoded.set_index(level=1).compute().todense()

    pdt.assert_frame_equal(expected, actual)
def test_one_hot_prefixes_sep(clickstream):
    """Check column names when ``prefixes=True`` and ``sep='='`` are passed.

    Every output column must be named ``<source column>=<category>``.
    """
    partitioned = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(
        partitioned,
        categories={
            'page_id': list('ABCDE'),
            'other_categorical': list('FGHIJ'),
        },
        index_col=['index', 'id'],
        prefixes=True,
        sep='=',
    )

    page_columns = ['page_id=' + letter for letter in 'ABCDE']
    other_columns = ['other_categorical=' + letter for letter in 'FGHIJ']
    expected_columns = page_columns + other_columns

    # Compare sorted names, so column order is not part of the check.
    assert sorted(encoded.columns) == sorted(expected_columns)
def test_one_hot_no_order(clickstream):
    """Check encoding of two columns when no ``order`` argument is given.

    Both the lazy and the computed frame must expose exactly the columns
    'A'..'J' (compared after sorting); the computed frame must have shape
    (100, 10) and a ``pd.MultiIndex``.
    """
    expected_columns = list('ABCDEFGHIJ')

    partitioned = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(
        partitioned,
        categories={
            'page_id': list('ABCDE'),
            'other_categorical': list('FGHIJ'),
        },
        index_col=['index', 'id'],
    )
    assert encoded._meta.empty
    assert sorted(encoded.columns) == expected_columns

    computed = encoded.compute()
    assert computed.shape == (100, 10)
    assert isinstance(computed.index, pd.MultiIndex)
    assert sorted(computed.columns) == expected_columns
def test_one_hot_dense_column(clickstream):
    """Check encoding when ``categories`` maps the 'id' column to ``False``.

    The result must contain the ten category columns 'A'..'J' plus an
    'id' column, giving a computed shape of (100, 11).
    """
    # NOTE(review): judging by the test name, False presumably marks a
    # column to be carried over as-is rather than encoded -- confirm
    # against one_hot_encode.
    category_map = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ'),
        'id': False,
    }
    expected_columns = set('ABCDEFGHIJ') | {'id'}

    partitioned = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(partitioned, categories=category_map)
    assert encoded._meta.empty
    assert set(encoded.columns) == expected_columns

    computed = encoded.compute()
    assert computed.shape == (100, 11)
    assert set(computed.columns) == expected_columns
def test_one_hot_prefixes(clickstream):
    """Check column names when ``prefixes=True`` is passed without ``sep``.

    Every output column must be named ``<source column>_<category>``, on
    both the lazy and the computed frame; the computed frame must have
    shape (100, 10) and a ``pd.MultiIndex``.
    """
    partitioned = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(
        partitioned,
        categories={
            'page_id': list('ABCDE'),
            'other_categorical': list('FGHIJ'),
        },
        index_col=['index', 'id'],
        prefixes=True,
    )

    page_columns = ['page_id_' + letter for letter in 'ABCDE']
    other_columns = ['other_categorical_' + letter for letter in 'FGHIJ']
    # Sorted once; both comparisons below ignore column order.
    expected_sorted = sorted(page_columns + other_columns)

    assert encoded._meta.empty
    assert sorted(encoded.columns) == expected_sorted

    computed = encoded.compute()
    assert computed.shape == (100, 10)
    assert isinstance(computed.index, pd.MultiIndex)
    assert sorted(computed.columns) == expected_sorted