예제 #1
0
def test_one_hot_legacy(clickstream):
    """Encode ``page_id`` using the purely positional call signature.

    Verifies that the lazy result reports an empty ``_meta`` and that the
    computed frame has shape (100, 5) with a ``pd.MultiIndex``.
    """
    page_categories = ['A', 'B', 'C', 'D', 'E']
    index_columns = ['index', 'id']

    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    lazy_encoded = one_hot_encode(lazy_input, 'page_id', page_categories,
                                  index_columns)
    assert lazy_encoded._meta.empty

    computed = lazy_encoded.compute()
    assert computed.shape == (100, 5)
    assert isinstance(computed.index, pd.MultiIndex)
예제 #2
0
def test_one_hot(clickstream):
    """Encode ``page_id`` using the keyword-argument call signature.

    The computed frame is expected to have shape (100, 5) and to be
    indexed by a ``pd.MultiIndex``.
    """
    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    encode_kwargs = {
        'column': 'page_id',
        'categories': ['A', 'B', 'C', 'D', 'E'],
        'index_col': ['index', 'id'],
    }
    computed = one_hot_encode(lazy_input, **encode_kwargs).compute()

    assert computed.shape == (100, 5)
    assert isinstance(computed.index, pd.MultiIndex)
예제 #3
0
def test_dask_multi_index_loc(clickstream):
    """Slice the lazy encoded frame with ``.loc`` over a date-string range.

    After computing the slice, the smallest and largest dates found on
    index level 0 must equal the two slice bounds.
    """
    category_spec = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ')
    }
    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    encoded = one_hot_encode(lazy_input,
                             categories=category_spec,
                             index_col=['index', 'id'])

    window = encoded.loc['2016-01-15':'2016-02-15'].compute()

    # Only the first index level is inspected for the date bounds.
    level0_dates = window.index.get_level_values(0).date
    assert level0_dates.min() == dt.date(2016, 1, 15)
    assert level0_dates.max() == dt.date(2016, 2, 15)
예제 #4
0
def test_one_hot_disk_categories(clickstream):
    """Encode ``page_id`` with its categories given as a pickle file path.

    A Series holding the letters A-E is pickled into a temporary directory
    and the resulting path is handed over in place of the category list.
    """
    with tmpdir() as workdir:
        pickle_path = os.path.join(workdir, 'cat.pickle')
        stored_categories = pd.Series(['A', 'B', 'C', 'D', 'E'])
        stored_categories.to_pickle(pickle_path)

        lazy_input = dd.from_pandas(clickstream, npartitions=10)
        lazy_encoded = one_hot_encode(lazy_input,
                                      categories={'page_id': pickle_path},
                                      index_col=['index', 'id'])
        assert lazy_encoded._meta.empty

        computed = lazy_encoded.compute()
        assert computed.shape == (100, 5)
        assert isinstance(computed.index, pd.MultiIndex)
예제 #5
0
def test_set_index(clickstream):
    """Compare ``set_index(level=1)`` applied after and before ``compute``.

    Both routes are densified and the two dense frames must be equal.
    """
    category_spec = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ')
    }
    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    lazy_encoded = one_hot_encode(lazy_input,
                                  categories=category_spec,
                                  order=['other_categorical', 'page_id'],
                                  index_col=['index', 'id'])

    # Route 1: materialise first, then re-index the in-memory frame.
    materialised = lazy_encoded.compute()
    dense = materialised.set_index(level=1).todense()

    # Route 2: re-index the lazy collection, then materialise.
    reindexed_lazy = lazy_encoded.set_index(level=1)
    res = reindexed_lazy.compute().todense()

    pdt.assert_frame_equal(dense, res)
예제 #6
0
def test_one_hot_prefixes_sep(clickstream):
    """Check prefixed column names when ``sep='='`` is passed.

    Every expected name is the source column, an equals sign, and the
    category letter; only the lazy frame's columns are inspected.
    """
    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    lazy_encoded = one_hot_encode(lazy_input,
                                  categories={
                                      'page_id': list('ABCDE'),
                                      'other_categorical': list('FGHIJ')
                                  },
                                  index_col=['index', 'id'],
                                  prefixes=True,
                                  sep='=')

    page_columns = ['page_id=' + letter for letter in 'ABCDE']
    other_columns = ['other_categorical=' + letter for letter in 'FGHIJ']
    expected_columns = page_columns + other_columns

    # Order is not part of the contract here; compare sorted names.
    assert sorted(lazy_encoded.columns) == sorted(expected_columns)
예제 #7
0
def test_one_hot_no_order(clickstream):
    """Encode two categorical columns without passing ``order``.

    The sorted column names must be the letters A-J, both on the lazy
    frame and on the computed one, which must have shape (100, 10).
    """
    expected_sorted = list('ABCDEFGHIJ')
    category_spec = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ')
    }

    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    lazy_encoded = one_hot_encode(lazy_input,
                                  categories=category_spec,
                                  index_col=['index', 'id'])
    assert lazy_encoded._meta.empty
    assert sorted(lazy_encoded.columns) == expected_sorted

    computed = lazy_encoded.compute()
    assert computed.shape == (100, 10)
    assert isinstance(computed.index, pd.MultiIndex)
    assert sorted(computed.columns) == expected_sorted
예제 #8
0
def test_one_hot_dense_column(clickstream):
    """Encode two categorical columns while mapping ``'id'`` to ``False``.

    The result is expected to carry the ten category columns plus an
    ``'id'`` column, giving a computed shape of (100, 11).
    """
    category_spec = {
        'page_id': list('ABCDE'),
        'other_categorical': list('FGHIJ'),
        'id': False
    }
    expected_columns = set('ABCDEFGHIJ') | {'id'}

    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    lazy_encoded = one_hot_encode(lazy_input, categories=category_spec)
    assert lazy_encoded._meta.empty
    assert set(lazy_encoded.columns) == expected_columns

    computed = lazy_encoded.compute()
    assert computed.shape == (100, 11)
    assert set(computed.columns) == expected_columns
예제 #9
0
def test_one_hot_prefixes(clickstream):
    """Check prefixed column names when no ``sep`` argument is passed.

    Every expected name is the source column, an underscore, and the
    category letter. Names are compared in sorted order on both the lazy
    and the computed frame.
    """
    lazy_input = dd.from_pandas(clickstream, npartitions=10)
    lazy_encoded = one_hot_encode(lazy_input,
                                  categories={
                                      'page_id': list('ABCDE'),
                                      'other_categorical': list('FGHIJ')
                                  },
                                  index_col=['index', 'id'],
                                  prefixes=True)

    page_columns = ['page_id_' + letter for letter in 'ABCDE']
    other_columns = ['other_categorical_' + letter for letter in 'FGHIJ']
    expected_sorted = sorted(page_columns + other_columns)

    assert lazy_encoded._meta.empty
    assert sorted(lazy_encoded.columns) == expected_sorted

    computed = lazy_encoded.compute()
    assert computed.shape == (100, 10)
    assert isinstance(computed.index, pd.MultiIndex)
    assert sorted(computed.columns) == expected_sorted