def test_assign_number():
    """Assigning a scalar through ``assign`` appends a constant column."""
    frame = SparseFrame(np.identity(5), columns=list('ABCDE'))
    frame = frame.assign(F=1)

    # Expected dense layout: the identity block followed by a column of ones.
    ones_column = np.ones(5).reshape(-1, 1)
    expected = np.hstack([np.identity(5), ones_column])

    assert 'F' in set(frame.columns)
    assert frame.shape == (5, 6)
    assert np.all(expected == frame.data.todense())
def test_init_with_pandas():
    """SparseFrame can be built from pandas DataFrame and Series objects.

    Covers four cases: a multi-indexed DataFrame, explicit ``index`` /
    ``columns`` passed next to a DataFrame (expected to emit a
    ``SyntaxWarning``), a plain Series, and a DataFrame holding a string
    column (expected to raise ``TypeError``).
    """
    index_levels = [pd.date_range("2100-01-01", periods=5), np.arange(5)]
    df = pd.DataFrame(np.identity(5), index=index_levels,
                      columns=list('ABCDE'))

    # DataFrame: shape, index and columns are taken over from pandas.
    sf = SparseFrame(df)
    assert sf.shape == (5, 5)
    assert isinstance(sf.index, pd.MultiIndex)
    assert (sf.index == df.index).all()
    assert (sf.columns == df.columns).all()

    # Explicit index/columns next to a DataFrame: a warning is expected and
    # the passed labels are the ones that end up on the frame.
    new_index = np.arange(10, 15)
    new_columns = list('VWXYZ')
    with pytest.warns(SyntaxWarning):
        sf = SparseFrame(df, index=new_index, columns=new_columns)
    assert sf.index.tolist() == np.arange(10, 15).tolist()
    assert sf.columns.tolist() == list('VWXYZ')

    # Series: becomes a single-column frame.
    series = pd.Series(np.ones(10))
    sf = SparseFrame(series)
    assert sf.shape == (10, 1)
    assert np.all(sf.data.todense() == np.ones(10).reshape(-1, 1))

    # A string column makes the DataFrame unusable as input.
    df['A'] = 'bla'
    with pytest.raises(TypeError):
        SparseFrame(df)
def test_groupby():
    """``groupby_sum`` adds up rows that share an index label.

    Ten stacked identity matrices are shuffled; each of the ten labels then
    owns ten copies of one identity row, so the grouped sum is 10 * I.
    """
    order = np.random.permutation(np.arange(100))
    labels = np.tile(np.arange(10), 10)
    stacked = np.tile(np.identity(10), (10, 1))

    frame = SparseFrame(stacked[order, :], index=labels[order])
    summed = frame.groupby_sum().data.todense()

    assert np.all(summed == (np.identity(10) * 10))
def test_drop_single_label():
    """Dropping one column label removes that column and its data."""
    frame = SparseFrame(np.identity(5), columns=list('ABCDE'))

    remaining = frame.drop('A', axis=1)

    assert remaining.columns.tolist() == list('BCDE')
    # Column 'A' was the first column, so the rest of the identity is left.
    expected = np.identity(5)[:, 1:]
    np.testing.assert_array_equal(remaining.data.todense(), expected)
def test_drop_multiple_labels():
    """Dropping a list of column labels removes all of those columns."""
    frame = SparseFrame(np.identity(5), columns=list('ABCDE'))

    remaining = frame.drop(['A', 'C'], axis=1)

    assert remaining.columns.tolist() == list('BDE')
    # Positions of the surviving columns B, D and E in the original frame.
    kept_positions = [1, 3, 4]
    expected = np.identity(5)[:, kept_positions]
    np.testing.assert_array_equal(remaining.data.todense(), expected)
def test_groupby_dense_random_data():
    """``groupby_sum`` on dense random data matches the analytic result.

    One random 10x10 tile is repeated ten times and shuffled, so every
    label groups ten identical rows and the sum equals ``10 * tile``.
    """
    order = np.random.permutation(np.arange(100))
    labels = np.tile(np.arange(10), 10)
    tile = np.random.rand(10, 10)
    stacked = np.tile(tile, (10, 1))

    frame = SparseFrame(stacked[order, :], index=labels[order])
    summed = frame.groupby_sum().data.todense()

    # Floating point sums: compare approximately.
    np.testing.assert_array_almost_equal(summed, (tile * 10))
def test_simple_add_partial_overlap(complex_example):
    """``add`` aligns on the index when the two frames overlap partially.

    Index label 2 is the only one present in both operands, so its row is
    the only one whose values are summed.
    """
    left = SparseFrame(np.ones((3, 5)), index=[0, 1, 2])
    right = SparseFrame(np.ones((3, 5)), index=[2, 3, 4])

    total = left.add(right)

    expected = np.ones((5, 5))
    expected[2, :] += 1
    assert np.all(total.data.todense() == expected)
    assert np.all(total.index == range(5))
def test_simple_join():
    """Joining a frame with itself stacks the data along the given axis."""
    eye = np.identity(10)
    frame = SparseFrame(np.identity(10))

    # axis=0: rows are appended.
    joined_rows = frame.join(frame, axis=0).data.todense()
    assert np.all(joined_rows == np.vstack([eye, eye]))

    # axis=1: columns are appended.
    joined_cols = frame.join(frame, axis=1).data.todense()
    assert np.all(joined_cols == np.hstack([eye, eye]))
def test_dropna():
    """``dropna`` removes the rows whose index label is NaN."""
    labels = np.array([0.0, np.nan, 2.0, np.nan, 4.0])
    frame = SparseFrame(np.identity(5), index=labels)

    cleaned = frame.dropna()

    # Rows 1 and 3 carried NaN labels; identity rows 0, 2 and 4 survive.
    expected = np.identity(5)[[0, 2, 4]]
    assert np.all(cleaned.data.todense() == expected)
def test_rename():
    """``rename`` returns a renamed copy, or renames in place on request."""

    def add_suffix(name):
        return name + '_new'

    old_names = list('ABCDE')
    new_names = [add_suffix(name) for name in old_names]
    sf = SparseFrame(np.identity(5), columns=old_names)

    # Default call: the result carries new names, the source keeps its own.
    sf_renamed = sf.rename(columns=add_suffix)
    assert np.all(sf.columns == old_names), "Original frame was changed."
    assert np.all(sf_renamed.columns == new_names), "New frame has old names."

    # inplace=True: the source frame itself is renamed.
    sf.rename(columns=add_suffix, inplace=True)
    assert np.all(sf.columns == new_names), "In-place renaming didn't work."
def test_error_unaligned_indices():
    """Mismatched index/columns lengths raise an informative ``ValueError``.

    For each case the error message must mention both the shape of the
    passed data, ``(5, 5)``, and the shape implied by the passed labels.
    """
    data = np.identity(5)
    cases = [
        ({'index': np.arange(6)}, '(6, 5)'),
        ({'columns': np.arange(6)}, '(5, 6)'),
        ({'columns': np.arange(6), 'index': np.arange(6)}, '(6, 6)'),
    ]

    for kwargs, implied_shape in cases:
        with pytest.raises(ValueError) as excinfo:
            SparseFrame(data, **kwargs)
        # Check the message after the block: statements placed after the
        # raising call inside the block would never run. Use the raised
        # exception itself, not the ExceptionInfo wrapper, whose string
        # form is not the plain message.
        message = str(excinfo.value)
        assert '(5, 5)' in message and implied_shape in message
def test_multiply_rowwise():
    """Row-wise ``multiply`` gives the same result for every operand type."""
    # Row wise multiplication with different types
    sf = SparseFrame(np.ones((5, 5)))
    other = np.arange(5)
    msg = "Row wise multiplication failed"

    other_frame = SparseFrame(other)
    operands = (
        list(other),           # list
        other,                 # 1D array
        other.reshape(-1, 1),  # 2D array
        other_frame,           # SparseFrame
        other_frame.data,      # csr_matrix
    )

    for operand in operands:
        res = sf.multiply(operand, axis=0)
        # Each row of ones is scaled by its factor, so row i sums to 5 * i.
        assert np.all(res.sum(axis=1).T == 5 * other), msg
def test_vstack():
    """``SparseFrame.vstack`` stacks frames and rejects mismatched columns."""
    frames = []
    data = []
    for _ in range(10):
        values = np.identity(5)
        data.append(values)
        frames.append(SparseFrame(values, columns=list('ABCDE')))

    sf = SparseFrame.vstack(frames)
    assert np.all(sf.data.todense() == np.vstack(data))

    # Build the offending frame outside the ``raises`` block so that only
    # the vstack call itself can satisfy the expected AssertionError.
    frames[2] = SparseFrame(np.identity(5), columns=list('XYZWQ'))
    with pytest.raises(AssertionError):
        SparseFrame.vstack(frames)
def sf_midx():
    """Return a 5x5 identity SparseFrame with a (date, int) MultiIndex.

    The first index level is five consecutive days starting 2016-10-01,
    the second level is 0..4; the columns are labelled 'A' to 'E'.
    """
    dates = pd.date_range("2016-10-01", periods=5)
    counter = np.arange(5)
    multi_index = pd.MultiIndex.from_arrays([dates, counter])
    return SparseFrame(np.identity(5), index=multi_index,
                       columns=list('ABCDE'))
def test_getitem_missing_col():
    """Item access with unknown column labels fails for every key type.

    ``None`` is expected to raise ``ValueError``; any key that contains at
    least one missing label is expected to raise ``KeyError``.
    """
    sf = SparseFrame(np.identity(10), columns=list('abcdefghij'))

    with pytest.raises(ValueError):
        sf[None]

    # 'x' and 'y' are not columns of the frame; 'a' and 'b' are.
    idx = pd.Index(list('abx'))
    missing_keys = [
        'x',
        ['x'],
        ['a', 'x'],
        ['y', 'x'],
        idx,
        idx.to_series(),
        idx.tolist(),
        tuple(idx),
        idx.values,
    ]
    for key in missing_keys:
        with pytest.raises(KeyError):
            sf[key]
def test_getitem():
    """Item access selects columns by label, in the requested order."""
    sf = SparseFrame(np.identity(10), columns=list('abcdefghij'))

    # Single labels: 'a' is the first identity column, 'j' the last.
    first_column = sf['a'].data.todense()
    last_column = sf['j'].data.todense()
    assert first_column[0] == 1
    assert last_column[9] == 1

    # List of labels: result columns follow the order of the list.
    swapped = sf[['j', 'a']].data.todense()
    assert swapped[9, 0] == 1
    assert swapped[0, 1] == 1
def test_existing_column_assign_number():
    """Overwriting an existing column by item assignment is not supported.

    The assignment is expected to raise ``NotImplementedError``.
    """
    sf = SparseFrame(np.identity(5))

    # No value assertions follow the assignment: it raises, so there is no
    # result to compare, and nothing after it inside the block would run.
    # NOTE(review): once assignment to existing columns is implemented, the
    # expected dense result is the identity with column 0 set to 1.
    with pytest.raises(NotImplementedError):
        sf[0] = 1
def test_npz_io_s3(complex_example):
    """A frame written with ``to_npz`` to an s3:// URL reads back unchanged.

    Runs inside the ``mock_s3_fs('sparsity')`` context manager.
    """
    url = 's3://sparsity/sparse.npz'
    with mock_s3_fs('sparsity'):
        sf, _second, _third = complex_example
        sf.to_npz(url)
        loaded = SparseFrame.read_npz(url)

        # Data, index and columns must all survive the round trip.
        assert np.all(loaded.data.todense() == sf.data.todense())
        assert np.all(loaded.index == sf.index)
        assert np.all(loaded.columns == sf.columns)
def test_getitem_empty():
    """Column selection works on a frame that has columns but no rows."""
    empty_df = pd.DataFrame([], columns=list('abcdefghij'), dtype=float)
    sf = SparseFrame(empty_df)

    single = sf['a']
    assert single.empty
    assert single.columns.tolist() == ['a']

    pair = sf[['a', 'b']]
    assert pair.empty
    assert pair.columns.tolist() == ['a', 'b']
def test_npz_io(complex_example):
    """A frame written with ``to_npz`` reads back unchanged via ``read_npz``.

    The file lives in a fresh temporary directory, so concurrent test runs
    do not share a path and the file is removed even if an assertion fails.
    """
    import tempfile

    sf, _second, _third = complex_example
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, 'sparse.npz')
        sf.to_npz(path)
        loaded = SparseFrame.read_npz(path)

        # Data, index and columns must all survive the round trip.
        assert np.all(loaded.data.todense() == sf.data.todense())
        assert np.all(loaded.index == sf.index)
        assert np.all(loaded.columns == sf.columns)
def sf_midx_int():
    """Return a 5x5 identity SparseFrame with a (number, date) MultiIndex.

    The first index level is four ones followed by one zero, the second
    level is five consecutive days starting 2016-10-01; the columns are
    labelled 'A' to 'E'.
    """
    numbers = np.concatenate([np.ones(4), np.zeros(1)])
    dates = pd.date_range("2016-10-01", periods=5)
    multi_index = pd.MultiIndex.from_arrays([numbers, dates])
    return SparseFrame(np.identity(5), index=multi_index,
                       columns=list('ABCDE'))
def test__array___():
    """``np.asarray`` converts a SparseFrame into a plain ndarray."""
    labels = list('ABCDE')
    correct = np.identity(5)
    sf = SparseFrame(correct, index=labels, columns=list('ABCDE'))

    # Whole frame: same values, plain ndarray type.
    dense = np.asarray(sf)
    assert np.all(dense == correct)
    assert isinstance(dense, np.ndarray)

    # Single column: the result is one-dimensional.
    column = np.asarray(sf['A'])
    assert len(column.shape) == 1
def test_save_load_multiindex(sf_midx):
    """A MultiIndex survives an npz round trip in both file layouts.

    The legacy layout is produced by a local helper that stores the raw
    index and column values under 'frame_index' and 'frame_columns'.
    """

    def _to_npz_legacy(sf, filename):
        payload = _csr_to_dict(sf.data)
        payload['frame_index'] = sf.index.values
        payload['frame_columns'] = sf.columns.values
        np.savez(filename, **payload)

    with tmpdir() as tmp:
        path = os.path.join(tmp, 'sf.npz')

        # test new
        sf_midx.to_npz(path)
        reloaded = SparseFrame.read_npz(path)
        assert isinstance(reloaded.index, pd.MultiIndex)

        # test backwards compatibility
        _to_npz_legacy(sf_midx, path)
        reloaded = SparseFrame.read_npz(path)
        assert isinstance(reloaded.index, pd.MultiIndex)
def test_iloc():
    """``iloc`` selects rows by position for slices, lists and scalars."""
    # name index and columns somehow so that their names are not integers
    labels = list('ABCDE')
    eye = np.identity(5)
    sf = SparseFrame(np.identity(5), index=labels, columns=list('ABCDE'))

    head = sf.iloc[:2].data.todense()
    assert np.all(head == eye[:2])

    picked = sf.iloc[[3, 4]].data.todense()
    assert np.all(picked == eye[[3, 4]])

    single = sf.iloc[3].data.todense()
    assert np.all(single == eye[3])

    assert sf.iloc[1:].shape == (4, 5)
def test_groupby_agg_multiindex():
    """``groupby_agg`` yields the same labels as pandas ``groupby``.

    Grouping is checked by index level and by column; only the index and
    columns of the result are compared with pandas, not the values.
    """

    def column_mean(block):
        return block.mean(axis=0)

    df = pd.DataFrame({
        'X': [1, 1, 1, 0],
        'Y': [0, 1, 0, 1],
        'gr': ['a', 'a', 'b', 'b'],
        'day': [10, 11, 11, 12],
    }).set_index(['day', 'gr'])
    sf = SparseFrame(df)

    # Group by the second index level ('gr').
    expected = df.groupby(level=1).mean()
    result = sf.groupby_agg(level=1, agg_func=column_mean)
    assert np.all(result.index == expected.index)
    assert np.all(result.columns == expected.columns)

    # Group by the values of column 'Y'.
    expected = df.groupby(by='Y').mean()
    result = sf.groupby_agg(by='Y', agg_func=column_mean)
    assert np.all(result.index == expected.index)
    assert np.all(result.columns == expected.columns)
def test_loc():
    """``loc`` selects rows by label, label slice and date-like slices."""
    eye = np.identity(5)

    sf = SparseFrame(np.identity(5), index=list("ABCDE"))
    # test single
    first_row = sf.loc['A'].data.todense()
    assert np.all(first_row == np.matrix([[1, 0, 0, 0, 0]]))
    # test slices
    up_to_b = sf.loc[:'B'].data.todense()
    assert np.all(up_to_b == eye[:2])

    # Date index: the same three-day window expressed as strings,
    # Timestamps and datetime.date objects must select the same rows.
    sf = SparseFrame(np.identity(5), pd.date_range("2016-10-01", periods=5))
    windows = (
        slice('2016-10-01', "2016-10-03"),
        slice(pd.Timestamp('2016-10-01'), pd.Timestamp("2016-10-03")),
        slice(dt.date(2016, 10, 1), dt.date(2016, 10, 3)),
    )
    for window in windows:
        assert np.all(sf.loc[window].data.todense() == eye[:3])
def shuffle_group_2(sf: sp.SparseFrame):
    """Split ``sf`` into one sub-frame per value of its '_partitions' column.

    Returns a pair ``(groups, empty)``: ``groups`` maps the integers
    ``0 .. max(partition id)`` to row slices of the reordered frame, and
    ``empty`` is ``sf.iloc[:0]``. For a frame of length zero the result is
    ``({}, sf)``.
    """
    if len(sf) == 0:
        return {}, sf

    partition_ids = sf['_partitions'].todense()._values.astype(np.int64)
    n_groups = partition_ids.max() + 1

    # NOTE(review): presumably pandas' groupsort_indexer, returning a row
    # ordering plus per-group counts - confirm against the import.
    order, counts = groupsort_indexer(partition_ids.view(np.int64), n_groups)
    reordered = sf.take(order)

    # The running total of the counts gives the slice boundaries: group k
    # spans rows bounds[k]:bounds[k + 1] of the reordered frame.
    bounds = counts.cumsum()
    groups = {
        key: reordered.iloc[start:stop]
        for key, start, stop in zip(range(n_groups), bounds[:-1], bounds[1:])
    }
    return groups, sf.iloc[:0]
def test_loc_duplicate_index():
    """``loc`` returns every match when index or column labels repeat."""
    eye = np.identity(5)
    sf = SparseFrame(np.identity(5), columns=list('UUXYZ'),
                     index=list('AAABB'))

    # Rows: 'A' labels the first three rows, 'B' the last two.
    rows_a = sf.loc['A']
    rows_b = sf.loc['B']
    assert len(rows_a.index) == 3
    assert len(rows_b.index) == 2
    assert np.all(rows_a.todense().values == eye[:3])
    assert np.all(rows_b.todense().values == eye[3:])

    # Columns: 'U' labels the first two columns.
    cols_u = sf.loc[:, 'U']
    assert len(cols_u.columns) == 2
    assert np.all(cols_u.todense().values == eye[:, :2])
def test_vstack_multi_index(clickstream):
    """Stacking two multi-indexed frames keeps the MultiIndex.

    The ``clickstream`` fixture is split in half, each half is one-hot
    encoded with ``['index', 'id']`` as index columns, and the two encoded
    halves are stacked again.
    """
    midpoint = len(clickstream) // 2
    halves = [clickstream.iloc[:midpoint], clickstream.iloc[midpoint:]]

    encoded = [
        sparse_one_hot(half,
                       categories={'page_id': list('ABCDE')},
                       index_col=['index', 'id'])
        for half in halves
    ]

    stacked = SparseFrame.vstack(encoded)
    assert isinstance(stacked.index, pd.MultiIndex)
def test_dask_multi_index_loc(clickstream):
    """Date-string ``loc`` slicing works on a dask collection of frames.

    The ``clickstream`` fixture is one-hot encoded per partition, sliced by
    date strings, computed, and stacked into one frame whose first index
    level must span exactly the requested dates.
    """
    ddf = dd.from_pandas(clickstream, npartitions=10)
    sf = ddf.map_partitions(
        sparse_one_hot,
        column='page_id',
        index_col=['index', 'id'],
        categories=list('ABCDE'),
        meta=list,
    )

    selected = sf.loc['2016-01-15':'2016-02-15']
    # NOTE(review): the ``get=`` keyword of ``compute`` may be deprecated in
    # newer dask releases - confirm against the pinned dask version.
    partitions = selected.compute(get=get_sync).tolist()
    res = SparseFrame.vstack(partitions)

    first_level_dates = res.index.get_level_values(0).date
    assert first_level_dates.min() == dt.date(2016, 1, 15)
    assert first_level_dates.max() == dt.date(2016, 2, 15)