def test_assign_number():
    """Assigning a scalar through ``assign`` appends a constant column."""
    frame = SparseFrame(np.identity(5), columns=list('ABCDE')).assign(F=1)

    # Expected dense layout: the identity block followed by a column of ones.
    expected = np.hstack([np.identity(5), np.ones((5, 1))])

    assert 'F' in set(frame.columns)
    assert frame.shape == (5, 6)
    assert np.all(expected == frame.data.todense())
def test_init_with_pandas():
    """Construct a SparseFrame from pandas DataFrame and Series objects."""
    dates = pd.date_range("2100-01-01", periods=5)
    df = pd.DataFrame(np.identity(5),
                      index=[dates, np.arange(5)],
                      columns=list('ABCDE'))

    # Without overrides the (multi-level) index and the columns are kept.
    sf = SparseFrame(df)
    assert sf.shape == (5, 5)
    assert isinstance(sf.index, pd.MultiIndex)
    assert (sf.index == df.index).all()
    assert (sf.columns == df.columns).all()

    # Explicit index/columns replace the DataFrame's own and emit a warning.
    with pytest.warns(SyntaxWarning):
        sf = SparseFrame(df, index=np.arange(10, 15), columns=list('VWXYZ'))
    assert sf.index.tolist() == list(range(10, 15))
    assert sf.columns.tolist() == list('VWXYZ')

    # A Series becomes a single-column frame.
    series = pd.Series(np.ones(10))
    sf = SparseFrame(series)
    assert sf.shape == (10, 1)
    assert np.all(sf.data.todense() == np.ones((10, 1)))

    # A string column makes the construction fail with a TypeError.
    df['A'] = 'bla'
    with pytest.raises(TypeError):
        SparseFrame(df)
Exemplo n.º 3
0
def test_groupby():
    """``groupby_sum`` adds up all rows that share an index label."""
    order = np.random.permutation(np.arange(100))
    labels = np.tile(np.arange(10), 10)
    stacked = np.vstack([np.identity(10)] * 10)

    # Rows and labels are shuffled together, so each label still marks
    # ten copies of the same identity row.
    frame = SparseFrame(stacked[order, :], index=labels[order])
    summed = frame.groupby_sum().data.todense()

    assert np.all(summed == (np.identity(10) * 10))
def test_drop_single_label():
    """Dropping one column label removes that column and its data."""
    frame = SparseFrame(np.identity(5), columns=list('ABCDE'))
    remaining = frame.drop('A', axis=1)

    expected = np.identity(5)[:, 1:]
    assert remaining.columns.tolist() == list('BCDE')
    np.testing.assert_array_equal(remaining.data.todense(), expected)
def test_drop_multiple_labels():
    """Dropping a list of column labels removes each of those columns."""
    frame = SparseFrame(np.identity(5), columns=list('ABCDE'))
    remaining = frame.drop(['A', 'C'], axis=1)

    # Columns B, D and E sit at positions 1, 3 and 4 of the original frame.
    expected = np.identity(5)[:, [1, 3, 4]]
    assert remaining.columns.tolist() == list('BDE')
    np.testing.assert_array_equal(remaining.data.todense(), expected)
def test_groupby_dense_random_data():
    """``groupby_sum`` on dense random rows equals ten times one tile."""
    order = np.random.permutation(np.arange(100))
    labels = np.tile(np.arange(10), 10)
    tile = np.random.rand(10, 10)
    stacked = np.vstack([tile] * 10)

    frame = SparseFrame(stacked[order, :], index=labels[order])
    summed = frame.groupby_sum().data.todense()

    # Floating point sums: compare approximately rather than exactly.
    np.testing.assert_array_almost_equal(summed, (tile * 10))
def test_simple_add_partial_overlap(complex_example):
    """Adding frames whose indices overlap in one label sums only that row.

    The ``complex_example`` argument is requested but not used by the body.
    """
    left = SparseFrame(np.ones((3, 5)), index=[0, 1, 2])
    right = SparseFrame(np.ones((3, 5)), index=[2, 3, 4])

    # Label 2 is present in both operands, so its row is counted twice.
    expected = np.ones((5, 5))
    expected[2, :] += 1

    total = left.add(right)
    assert np.all(total.data.todense() == expected)
    assert np.all(total.index == range(5))
def test_simple_join():
    """Joining a frame with itself stacks the data along the given axis."""
    frame = SparseFrame(np.identity(10))

    # axis=0 is compared against a vertical stack, axis=1 a horizontal one.
    for axis, stack in ((0, np.vstack), (1, np.hstack)):
        joined = frame.join(frame, axis=axis).data.todense()
        expected = stack([np.identity(10), np.identity(10)])
        assert np.all(joined == expected)
def test_dropna():
    """``dropna`` discards the rows whose index label is NaN."""
    labels = np.array([0, np.nan, 2, np.nan, 4], dtype=float)
    frame = SparseFrame(np.identity(5), index=labels)

    cleaned = frame.dropna()

    # Only rows 0, 2 and 4 of the identity matrix have a valid label.
    expected = np.identity(5)[[0, 2, 4]]
    assert np.all(cleaned.data.todense() == expected)
def test_rename():
    """``rename`` returns a renamed copy, or renames in place on request."""
    def add_suffix(name):
        return name + '_new'

    old_names = list('ABCDE')
    new_names = [add_suffix(name) for name in old_names]
    frame = SparseFrame(np.identity(5), columns=old_names)

    # Default call: the original keeps its names, the result gets new ones.
    renamed = frame.rename(columns=add_suffix)
    assert np.all(frame.columns == old_names), "Original frame was changed."
    assert np.all(renamed.columns == new_names), "New frame has old names."

    # In-place call: the original itself is updated.
    frame.rename(columns=add_suffix, inplace=True)
    assert np.all(frame.columns == new_names), "In-place renaming didn't work."
def test_error_unaligned_indices():
    """Mismatching index/columns lengths raise a ValueError naming both shapes.

    For each case the error message is expected to contain the shape of the
    passed data, ``(5, 5)``, and the shape implied by the given labels.
    """
    data = np.identity(5)
    cases = [
        ({'index': np.arange(6)}, '(6, 5)'),
        ({'columns': np.arange(6)}, '(5, 6)'),
        ({'columns': np.arange(6), 'index': np.arange(6)}, '(6, 6)'),
    ]

    for kwargs, implied_shape in cases:
        with pytest.raises(ValueError) as excinfo:
            SparseFrame(data, **kwargs)
        # The message checks must live outside the ``raises`` block:
        # statements placed after the raising call inside it never run.
        # The raised exception itself is available as ``excinfo.value``.
        message = str(excinfo.value)
        assert '(5, 5)' in message and implied_shape in message
def test_multiply_rowwise():
    """Row-wise ``multiply`` accepts several kinds of right-hand operand."""
    frame = SparseFrame(np.ones((5, 5)))
    factors = np.arange(5)
    msg = "Row wise multiplication failed"

    def check(operand):
        # Each row of ones is scaled by its factor, so row sums are 5 * factor.
        product = frame.multiply(operand, axis=0)
        assert np.all(product.sum(axis=1).T == 5 * factors), msg

    check(list(factors))           # list
    check(factors)                 # 1D array
    check(factors.reshape(-1, 1))  # 2D array

    factor_frame = SparseFrame(factors)
    check(factor_frame)            # SparseFrame
    check(factor_frame.data)       # csr_matrix
def test_vstack():
    """``SparseFrame.vstack`` stacks frames and rejects differing columns."""
    blocks = [np.identity(5) for _ in range(10)]
    frames = [SparseFrame(block, columns=list('ABCDE')) for block in blocks]

    stacked = SparseFrame.vstack(frames)
    assert np.all(stacked.data.todense() == np.vstack(blocks))

    # Build the mismatching frame outside the ``raises`` block so that only
    # the vstack call itself can satisfy the expected AssertionError; an
    # error during setup must not make the test pass.
    frames[2] = SparseFrame(np.identity(5), columns=list('XYZWQ'))
    with pytest.raises(AssertionError):
        SparseFrame.vstack(frames)
Exemplo n.º 14
0
def sf_midx():
    """Return a 5x5 identity SparseFrame with a (date, integer) MultiIndex.

    NOTE(review): consumed as a test argument elsewhere, so this is
    presumably a pytest fixture whose decorator is not visible here.
    """
    levels = [pd.date_range("2016-10-01", periods=5), np.arange(5)]
    return SparseFrame(np.identity(5),
                       index=pd.MultiIndex.from_arrays(levels),
                       columns=list('ABCDE'))
def test_getitem_missing_col():
    """Selecting missing columns raises, whatever container holds the keys."""
    frame = SparseFrame(np.identity(10), columns=list('abcdefghij'))

    # ``None`` is rejected with a ValueError rather than a KeyError.
    with pytest.raises(ValueError):
        frame[None]

    # A single unknown label, alone or mixed with known ones.
    for key in ('x', ['x'], ['a', 'x'], ['y', 'x']):
        with pytest.raises(KeyError):
            frame[key]

    # The same partially unknown labels wrapped in different containers.
    idx = pd.Index(list('abx'))
    for key in (idx, idx.to_series(), idx.tolist(), tuple(idx), idx.values):
        with pytest.raises(KeyError):
            frame[key]
Exemplo n.º 16
0
def test_getitem():
    """Column selection by one label and by a list of labels."""
    frame = SparseFrame(np.identity(10), columns=list('abcdefghij'))

    # Single labels: the diagonal entry of the chosen column is present.
    first_col = frame['a'].data.todense()
    assert first_col[0] == 1
    last_col = frame['j'].data.todense()
    assert last_col[9] == 1

    # A list of labels returns the columns in the requested order.
    pair = frame[['j', 'a']].data.todense()
    assert pair[9, 0] == 1
    assert pair[0, 1] == 1
def test_existing_column_assign_number():
    """Overwriting an existing column with a scalar raises NotImplementedError.

    The original test also carried a check of the resulting data (column 0
    set to all ones) inside the ``raises`` block. Those statements could
    never run while the assignment raises, so they were removed; restore
    them outside a ``raises`` block once the operation is implemented.
    """
    sf = SparseFrame(np.identity(5))
    with pytest.raises(NotImplementedError):
        sf[0] = 1
def test_npz_io_s3(complex_example):
    """Round-trip a frame through ``to_npz``/``read_npz`` on an s3:// URL."""
    url = 's3://sparsity/sparse.npz'
    with mock_s3_fs('sparsity'):
        # Only the first frame of the example triple is exercised.
        original, _, _ = complex_example
        original.to_npz(url)
        restored = SparseFrame.read_npz(url)

        assert np.all(restored.data.todense() == original.data.todense())
        assert np.all(restored.index == original.index)
        assert np.all(restored.columns == original.columns)
def test_getitem_empty():
    """Column selection on a frame without rows keeps the column labels."""
    empty_df = pd.DataFrame([], columns=list('abcdefghij'), dtype=float)
    frame = SparseFrame(empty_df)

    # Same expectations for a single label and for a list of labels.
    for key, expected_columns in (('a', ['a']), (['a', 'b'], ['a', 'b'])):
        assert frame[key].empty
        assert frame[key].columns.tolist() == expected_columns
def test_npz_io(complex_example):
    """Round-trip a frame through ``to_npz``/``read_npz`` on the local disk.

    The file is removed in a ``finally`` clause so that a failing assertion
    (or a failing read) does not leave it behind for later runs.
    """
    sf, second, third = complex_example
    path = '/tmp/sparse.npz'
    try:
        sf.to_npz(path)
        loaded = SparseFrame.read_npz(path)
        assert np.all(loaded.data.todense() == sf.data.todense())
        assert np.all(loaded.index == sf.index)
        assert np.all(loaded.columns == sf.columns)
    finally:
        # The file may not exist if writing itself failed; do not mask
        # that original error with a FileNotFoundError from the cleanup.
        if os.path.exists(path):
            os.remove(path)
Exemplo n.º 21
0
def sf_midx_int():
    """Return a 5x5 identity SparseFrame with a (number, date) MultiIndex.

    The first index level is four ones followed by one zero; the second
    level is five consecutive dates starting at 2016-10-01.
    """
    first_level = np.concatenate([np.ones(4), np.zeros(1)])
    second_level = pd.date_range("2016-10-01", periods=5)
    midx = pd.MultiIndex.from_arrays([first_level, second_level])
    return SparseFrame(np.identity(5), index=midx, columns=list('ABCDE'))
def test__array___():
    """``np.asarray`` turns a SparseFrame into a plain numpy array."""
    expected = np.identity(5)
    frame = SparseFrame(expected, index=list('ABCDE'), columns=list('ABCDE'))

    dense = np.asarray(frame)
    assert np.all(dense == expected)
    assert isinstance(dense, np.ndarray)

    # A single selected column converts to a one-dimensional array.
    column = np.asarray(frame['A'])
    assert column.ndim == 1
def test_save_load_multiindex(sf_midx):
    """A MultiIndex survives an npz round trip, for new and legacy files."""
    with tmpdir() as tmp:
        path = os.path.join(tmp, 'sf.npz')

        # File written by the current ``to_npz``.
        sf_midx.to_npz(path)
        restored = SparseFrame.read_npz(path)
        assert isinstance(restored.index, pd.MultiIndex)

        # Backwards compatibility: write the file by hand, storing index
        # and columns as plain value arrays next to the matrix parts.
        payload = _csr_to_dict(sf_midx.data)
        payload['frame_index'] = sf_midx.index.values
        payload['frame_columns'] = sf_midx.columns.values
        np.savez(path, **payload)

        restored = SparseFrame.read_npz(path)
        assert isinstance(restored.index, pd.MultiIndex)
def test_iloc():
    """Positional ``iloc`` access with a slice, a list and a single integer."""
    # String labels on both axes ensure positions cannot be taken for labels.
    frame = SparseFrame(np.identity(5),
                        index=list('ABCDE'),
                        columns=list('ABCDE'))

    head = frame.iloc[:2].data.todense()
    assert np.all(head == np.identity(5)[:2])

    picked = frame.iloc[[3, 4]].data.todense()
    assert np.all(picked == np.identity(5)[[3, 4]])

    single = frame.iloc[3].data.todense()
    assert np.all(single == np.identity(5)[3])

    assert frame.iloc[1:].shape == (4, 5)
def test_groupby_agg_multiindex():
    """``groupby_agg`` labels match pandas for ``level=`` and ``by=`` grouping.

    Only the index and the columns of the result are compared with the
    pandas result, not the aggregated values.
    """
    def column_mean(x):
        return x.mean(axis=0)

    df = pd.DataFrame({
        'X': [1, 1, 1, 0],
        'Y': [0, 1, 0, 1],
        'gr': ['a', 'a', 'b', 'b'],
        'day': [10, 11, 11, 12]
    }).set_index(['day', 'gr'])
    sf = SparseFrame(df)

    # First group by the second index level, then by the values of column Y.
    for grouping in ({'level': 1}, {'by': 'Y'}):
        expected = df.groupby(**grouping).mean()
        result = sf.groupby_agg(agg_func=column_mean, **grouping)
        assert np.all(result.index == expected.index)
        assert np.all(result.columns == expected.columns)
Exemplo n.º 26
0
def test_loc():
    """Label-based ``loc``: single label, label slice and date slices."""
    frame = SparseFrame(np.identity(5), index=list("ABCDE"))

    # Single label.
    first_row = np.matrix([[1, 0, 0, 0, 0]])
    assert np.all(frame.loc['A'].data.todense() == first_row)

    # Label slice; the expected rows show that the end label is included.
    assert np.all(frame.loc[:'B'].data.todense() == np.identity(5)[:2])

    # Date-indexed frame: the same three-day window expressed as strings,
    # pandas Timestamps and datetime.date objects.
    frame = SparseFrame(np.identity(5), pd.date_range("2016-10-01", periods=5))
    windows = (
        slice('2016-10-01', "2016-10-03"),
        slice(pd.Timestamp('2016-10-01'), pd.Timestamp("2016-10-03")),
        slice(dt.date(2016, 10, 1), dt.date(2016, 10, 3)),
    )
    for window in windows:
        assert np.all(frame.loc[window].data.todense() == np.identity(5)[:3])
Exemplo n.º 27
0
def shuffle_group_2(sf: sp.SparseFrame):
    """Split ``sf`` into one frame per value of its ``_partitions`` column.

    Returns a pair ``(parts, empty)``. For a frame without rows this is
    ``({}, sf)``. Otherwise ``parts`` maps ``0 .. max partition id`` to
    row slices of the reordered frame and ``empty`` is ``sf.iloc[:0]``.
    """
    if len(sf) == 0:
        return {}, sf

    partition_ids = sf['_partitions'].todense()._values.astype(np.int64)
    n_parts = partition_ids.max() + 1

    # NOTE(review): groupsort_indexer presumably returns a row ordering that
    # groups equal ids, plus per-group counts with a leading zero - confirm.
    order, counts = groupsort_indexer(partition_ids.view(np.int64), n_parts)
    grouped = sf.take(order)

    # Cumulative counts are used as consecutive slice boundaries.
    bounds = counts.cumsum()
    chunks = [grouped.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    return dict(zip(range(n_parts), chunks)), sf.iloc[:0]
def test_loc_duplicate_index():
    """``loc`` returns every row or column that carries a duplicated label."""
    identity = np.identity(5)
    frame = SparseFrame(np.identity(5),
                        columns=list('UUXYZ'),
                        index=list('AAABB'))

    # Rows: 'A' labels the first three rows, 'B' the last two.
    assert len(frame.loc['A'].index) == 3
    assert len(frame.loc['B'].index) == 2
    assert np.all(frame.loc['A'].todense().values == identity[:3])
    assert np.all(frame.loc['B'].todense().values == identity[3:])

    # Columns: 'U' labels the first two columns.
    assert len(frame.loc[:, 'U'].columns) == 2
    assert np.all(frame.loc[:, 'U'].todense().values == identity[:, :2])
def test_vstack_multi_index(clickstream):
    """Stacking frames that carry a MultiIndex yields a MultiIndex again."""
    def encode(df):
        # One-hot encode ``page_id`` using a two-column index.
        return sparse_one_hot(df,
                              categories={'page_id': list('ABCDE')},
                              index_col=['index', 'id'])

    middle = len(clickstream) // 2
    upper = encode(clickstream.iloc[:middle])
    lower = encode(clickstream.iloc[middle:])

    stacked = SparseFrame.vstack([upper, lower])
    assert isinstance(stacked.index, pd.MultiIndex)
Exemplo n.º 30
0
def test_dask_multi_index_loc(clickstream):
    """A date-range ``loc`` on dask partitions keeps only the requested dates."""
    ddf = dd.from_pandas(clickstream, npartitions=10)
    encoded = ddf.map_partitions(
        sparse_one_hot,
        column='page_id',
        index_col=['index', 'id'],
        categories=list('ABCDE'),
        meta=list
    )

    window = encoded.loc['2016-01-15':'2016-02-15']
    parts = window.compute(get=get_sync).tolist()
    stacked = SparseFrame.vstack(parts)

    # The first index level holds timestamps; check both window boundaries.
    dates = stacked.index.get_level_values(0).date
    assert dates.min() == dt.date(2016, 1, 15)
    assert dates.max() == dt.date(2016, 2, 15)