Exemplo n.º 1
0
    def test_constructor_ndarray(self):
        # no index or columns
        sp = SparseDataFrame(self.frame.values)

        # 1d
        sp = SparseDataFrame(self.data['A'], index=self.dates, columns=['A'])
        tm.assert_sp_frame_equal(sp, self.frame.reindex(columns=['A']))

        # raise on level argument
        pytest.raises(TypeError, self.frame.reindex, columns=['A'], level=1)

        # wrong length index / columns
        with tm.assert_raises_regex(ValueError, "^Index length"):
            SparseDataFrame(self.frame.values, index=self.frame.index[:-1])

        with tm.assert_raises_regex(ValueError, "^Column length"):
            SparseDataFrame(self.frame.values, columns=self.frame.columns[:-1])
Exemplo n.º 2
0
    def test_pickle(self):
        def _test_roundtrip(frame, orig):
            result = tm.round_trip_pickle(frame)
            tm.assert_sp_frame_equal(frame, result)
            tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False)

        _test_roundtrip(SparseDataFrame(), DataFrame())
        self._check_all(_test_roundtrip)
Exemplo n.º 3
0
    def test_fill_value_when_combine_const(self):
        # GH12723
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
        df = SparseDataFrame({'foo': dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        tm.assert_sp_frame_equal(res, exp)
Exemplo n.º 4
0
 def test_constructor_from_unknown_type(self):
     # GH 19393
     class Unknown(object):
         pass
     with pytest.raises(TypeError,
                        message='SparseDataFrame called with unknown type '
                                '"Unknown" for data argument'):
         SparseDataFrame(Unknown())
Exemplo n.º 5
0
    def test_constructor(self, float_frame, float_frame_int_kind,
                         float_frame_fill0):
        for col, series in compat.iteritems(float_frame):
            assert isinstance(series, SparseSeries)

        assert isinstance(float_frame_int_kind['A'].sp_index, IntIndex)

        # constructed zframe from matrix above
        assert float_frame_fill0['A'].fill_value == 0
        tm.assert_numpy_array_equal(pd.SparseArray([1., 2., 3., 4., 5., 6.]),
                                    float_frame_fill0['A'].values)
        tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2.,
                                              3., 4., 5., 6.]),
                                    float_frame_fill0['A'].to_dense().values)

        # construct no data
        sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10))
        for col, series in compat.iteritems(sdf):
            assert isinstance(series, SparseSeries)

        # construct from nested dict
        data = {}
        for c, s in compat.iteritems(float_frame):
            data[c] = s.to_dict()

        sdf = SparseDataFrame(data)
        tm.assert_sp_frame_equal(sdf, float_frame)

        # TODO: test data is copied from inputs

        # init dict with different index
        idx = float_frame.index[:5]
        cons = SparseDataFrame(
            float_frame, index=idx, columns=float_frame.columns,
            default_fill_value=float_frame.default_fill_value,
            default_kind=float_frame.default_kind, copy=True)
        reindexed = float_frame.reindex(idx)

        tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False)

        # assert level parameter breaks reindex
        with pytest.raises(TypeError):
            float_frame.reindex(idx, level=0)

        repr(float_frame)
Exemplo n.º 6
0
    def setup_method(self, method):
        self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                     'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                     'C': np.arange(10, dtype=float),
                     'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}

        self.dates = bdate_range('1/1/2011', periods=10)

        self.frame = SparseDataFrame(self.data, index=self.dates)
Exemplo n.º 7
0
    def test_getitem(self):
        # 1585 select multiple columns
        sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c'])

        result = sdf[['a', 'b']]
        exp = sdf.reindex(columns=['a', 'b'])
        tm.assert_sp_frame_equal(result, exp)

        pytest.raises(Exception, sdf.__getitem__, ['a', 'd'])
Exemplo n.º 8
0
 def get_empty_Frame(data, sparse):
     if isinstance(data, Series):
         index = data.index
     else:
         index = np.arange(len(data))
     if not sparse:
         return DataFrame(index=index)
     else:
         return SparseDataFrame(index=index, default_fill_value=0)
Exemplo n.º 9
0
    def setup_method(self, method):
        self.data = {
            'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
            'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
            'C': np.arange(10, dtype=np.float64),
            'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]
        }

        self.dates = bdate_range('1/1/2011', periods=10)

        self.orig = pd.DataFrame(self.data, index=self.dates)
        self.iorig = pd.DataFrame(self.data, index=self.dates)

        self.frame = SparseDataFrame(self.data, index=self.dates)
        self.iframe = SparseDataFrame(self.data,
                                      index=self.dates,
                                      default_kind='integer')
        self.mixed_frame = self.frame.copy(False)
        self.mixed_frame['foo'] = pd.SparseArray(['bar'] * len(self.dates))

        values = self.frame.values.copy()
        values[np.isnan(values)] = 0

        self.zorig = pd.DataFrame(values,
                                  columns=['A', 'B', 'C', 'D'],
                                  index=self.dates)
        self.zframe = SparseDataFrame(values,
                                      columns=['A', 'B', 'C', 'D'],
                                      default_fill_value=0,
                                      index=self.dates)

        values = self.frame.values.copy()
        values[np.isnan(values)] = 2

        self.fill_orig = pd.DataFrame(values,
                                      columns=['A', 'B', 'C', 'D'],
                                      index=self.dates)
        self.fill_frame = SparseDataFrame(values,
                                          columns=['A', 'B', 'C', 'D'],
                                          default_fill_value=2,
                                          index=self.dates)

        self.empty = SparseDataFrame()
Exemplo n.º 10
0
    def test_as_blocks(self):
        df = SparseDataFrame({
            'A': [1.1, 3.3],
            'B': [nan, -3.9]
        },
                             dtype='float64')

        df_blocks = df.blocks
        assert list(df_blocks.keys()) == ['float64']
        tm.assert_frame_equal(df_blocks['float64'], df)
Exemplo n.º 11
0
    def test_density(self):
        df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6])
        self.assertEqual(df.density, 0.7)

        df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                              'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                              'C': np.arange(10),
                              'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]})

        self.assertEqual(df.density, 0.75)
Exemplo n.º 12
0
    def test_as_blocks(self):
        df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]},
                             dtype='float64')

        # deprecated 0.21.0
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            df_blocks = df.blocks
        assert list(df_blocks.keys()) == ['float64']
        tm.assert_frame_equal(df_blocks['float64'], df)
Exemplo n.º 13
0
    def test_density(self):
        df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6])
        assert df.density == 0.7

        df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                              'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                              'C': np.arange(10),
                              'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]})

        assert df.density == 0.75
Exemplo n.º 14
0
    def test_cumsum(self, float_frame):
        expected = SparseDataFrame(float_frame.to_dense().cumsum())

        result = float_frame.cumsum()
        tm.assert_sp_frame_equal(result, expected)

        result = float_frame.cumsum(axis=None)
        tm.assert_sp_frame_equal(result, expected)

        result = float_frame.cumsum(axis=0)
        tm.assert_sp_frame_equal(result, expected)
Exemplo n.º 15
0
    def test_iloc(self):

        # 2227
        result = self.frame.iloc[:, 0]
        assert isinstance(result, SparseSeries)
        tm.assert_sp_series_equal(result, self.frame['A'])

        # preserve sparse index type. #2251
        data = {'A': [0, 1]}
        iframe = SparseDataFrame(data, default_kind='integer')
        tm.assert_class_equal(iframe['A'].sp_index, iframe.iloc[:, 0].sp_index)
Exemplo n.º 16
0
    def setUp(self):
        self.data = {
            'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
            'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
            'C': np.arange(10, dtype=np.float64),
            'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]
        }

        self.dates = bdate_range('1/1/2011', periods=10)

        self.orig = pd.DataFrame(self.data, index=self.dates)
        self.iorig = pd.DataFrame(self.data, index=self.dates)

        self.frame = SparseDataFrame(self.data, index=self.dates)
        self.iframe = SparseDataFrame(self.data,
                                      index=self.dates,
                                      default_kind='integer')

        values = self.frame.values.copy()
        values[np.isnan(values)] = 0

        self.zorig = pd.DataFrame(values,
                                  columns=['A', 'B', 'C', 'D'],
                                  index=self.dates)
        self.zframe = SparseDataFrame(values,
                                      columns=['A', 'B', 'C', 'D'],
                                      default_fill_value=0,
                                      index=self.dates)

        values = self.frame.values.copy()
        values[np.isnan(values)] = 2

        self.fill_orig = pd.DataFrame(values,
                                      columns=['A', 'B', 'C', 'D'],
                                      index=self.dates)
        self.fill_frame = SparseDataFrame(values,
                                          columns=['A', 'B', 'C', 'D'],
                                          default_fill_value=2,
                                          index=self.dates)

        self.empty = SparseDataFrame()
Exemplo n.º 17
0
    def test_numpy_cumsum(self, float_frame):
        result = np.cumsum(float_frame)
        expected = SparseDataFrame(float_frame.to_dense().cumsum())
        tm.assert_sp_frame_equal(result, expected)

        msg = "the 'dtype' parameter is not supported"
        tm.assert_raises_regex(ValueError, msg, np.cumsum,
                               float_frame, dtype=np.int64)

        msg = "the 'out' parameter is not supported"
        tm.assert_raises_regex(ValueError, msg, np.cumsum,
                               float_frame, out=result)
Exemplo n.º 18
0
    def test_pickle(self, float_frame, float_frame_int_kind, float_frame_dense,
                    float_frame_fill0, float_frame_fill0_dense,
                    float_frame_fill2, float_frame_fill2_dense):

        def _test_roundtrip(frame, orig):
            result = tm.round_trip_pickle(frame)
            tm.assert_sp_frame_equal(frame, result)
            tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False)

        _test_roundtrip(SparseDataFrame(), DataFrame())
        _test_roundtrip(float_frame, float_frame_dense)
        _test_roundtrip(float_frame_int_kind, float_frame_dense)
        _test_roundtrip(float_frame_fill0, float_frame_fill0_dense)
        _test_roundtrip(float_frame_fill2, float_frame_fill2_dense)
Exemplo n.º 19
0
    def test_quantile(self):
        # GH 17386
        data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
        q = 0.1

        sparse_df = SparseDataFrame(data)
        result = sparse_df.quantile(q)

        dense_df = DataFrame(data)
        dense_expected = dense_df.quantile(q)
        sparse_expected = SparseSeries(dense_expected)

        tm.assert_series_equal(result, dense_expected)
        tm.assert_sp_series_equal(result, sparse_expected)
Exemplo n.º 20
0
    def test_constructor_from_series(self):

        # GH 2873
        x = Series(np.random.randn(10000), name='a')
        x = x.to_sparse(fill_value=0)
        assert isinstance(x, SparseSeries)
        df = SparseDataFrame(x)
        assert isinstance(df, SparseDataFrame)

        x = Series(np.random.randn(10000), name='a')
        y = Series(np.random.randn(10000), name='b')
        x2 = x.astype(float)
        x2.loc[:9998] = np.NaN
        # TODO: x_sparse is unused...fix
        x_sparse = x2.to_sparse(fill_value=np.NaN)  # noqa

        # Currently fails too with weird ufunc error
        # df1 = SparseDataFrame([x_sparse, y])

        y.loc[:9998] = 0
        # TODO: y_sparse is unsused...fix
        y_sparse = y.to_sparse(fill_value=0)  # noqa
Exemplo n.º 21
0
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [
            u'{prefix}{sep}{level}'
            if isinstance(v, text_type) else '{prefix}{sep}{level}'
            for v in levels
        ]
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
            for dummy_str, v in zip(dummy_strs, levels)
        ]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=np.uint8)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Exemplo n.º 22
0
    def test_reindex_method(self):

        sparse = SparseDataFrame(data=[[11., 12., 14.],
                                       [21., 22., 24.],
                                       [41., 42., 44.]],
                                 index=[1, 2, 4],
                                 columns=[1, 2, 4],
                                 dtype=float)

        # Over indices

        # default method
        result = sparse.reindex(index=range(6))
        expected = SparseDataFrame(data=[[nan, nan, nan],
                                         [11., 12., 14.],
                                         [21., 22., 24.],
                                         [nan, nan, nan],
                                         [41., 42., 44.],
                                         [nan, nan, nan]],
                                   index=range(6),
                                   columns=[1, 2, 4],
                                   dtype=float)
        tm.assert_sp_frame_equal(result, expected)

        # method='bfill'
        result = sparse.reindex(index=range(6), method='bfill')
        expected = SparseDataFrame(data=[[11., 12., 14.],
                                         [11., 12., 14.],
                                         [21., 22., 24.],
                                         [41., 42., 44.],
                                         [41., 42., 44.],
                                         [nan, nan, nan]],
                                   index=range(6),
                                   columns=[1, 2, 4],
                                   dtype=float)
        tm.assert_sp_frame_equal(result, expected)

        # method='ffill'
        result = sparse.reindex(index=range(6), method='ffill')
        expected = SparseDataFrame(data=[[nan, nan, nan],
                                         [11., 12., 14.],
                                         [21., 22., 24.],
                                         [21., 22., 24.],
                                         [41., 42., 44.],
                                         [41., 42., 44.]],
                                   index=range(6),
                                   columns=[1, 2, 4],
                                   dtype=float)
        tm.assert_sp_frame_equal(result, expected)

        # Over columns

        # default method
        result = sparse.reindex(columns=range(6))
        expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan],
                                         [nan, 21., 22., nan, 24., nan],
                                         [nan, 41., 42., nan, 44., nan]],
                                   index=[1, 2, 4],
                                   columns=range(6),
                                   dtype=float)
        tm.assert_sp_frame_equal(result, expected)

        # method='bfill'
        with pytest.raises(NotImplementedError):
            sparse.reindex(columns=range(6), method='bfill')

        # method='ffill'
        with pytest.raises(NotImplementedError):
            sparse.reindex(columns=range(6), method='ffill')
Exemplo n.º 23
0
 def test_constructor_empty(self):
     sp = SparseDataFrame()
     self.assertEqual(len(sp.index), 0)
     self.assertEqual(len(sp.columns), 0)
Exemplo n.º 24
0
 def test_constructor_convert_index_once(self):
     arr = np.array([1.5, 2.5, 3.5])
     sdf = SparseDataFrame(columns=lrange(4), index=arr)
     assert sdf[0].index is sdf[1].index
Exemplo n.º 25
0
 def test_constructor_dataframe(self, float_frame):
     dense = float_frame.to_dense()
     sp = SparseDataFrame(dense)
     tm.assert_sp_frame_equal(sp, float_frame)
Exemplo n.º 26
0
 def test_constructor_empty(self):
     sp = SparseDataFrame()
     assert len(sp.index) == 0
     assert len(sp.columns) == 0
Exemplo n.º 27
0
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False,
                    dtype=None):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = '{prefix}{prefix_sep}{level}'
            if PY2 and (isinstance(prefix, text_type) or isinstance(
                    prefix_sep, text_type) or isinstance(level, text_type)):
                fstr = u(fstr)
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [
            _make_col_name(prefix, prefix_sep, level) for level in levels
        ]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=dtype)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=dtype)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)