Пример #1
0
    def test_basic(self, sparse, dtype):
        """Check ``get_dummies`` on a list, a Series and an indexed Series.

        ``sparse`` and ``dtype`` are forwarded unchanged to ``get_dummies``
        (presumably supplied by test parametrization -- confirm against the
        enclosing class).  The expected dtype comes from
        ``self.effective_dtype(dtype)``.
        """
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({
            'a': [1, 0, 0],
            'b': [0, 1, 0],
            'c': [0, 0, 1]
        }, dtype=self.effective_dtype(dtype))

        # Plain list input.
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        if sparse:
            tm.assert_sp_frame_equal(
                result, expected.to_sparse(kind='integer', fill_value=0))
        else:
            assert_frame_equal(result, expected)

        # Series input with the default index.
        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        if sparse:
            # From here on ``expected`` itself is the sparse frame.
            expected = expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)

        # Series input with a custom index.  ``expected`` has already been
        # converted above when ``sparse`` is set, so it is not converted
        # again here (a second ``to_sparse`` call whose result was thrown
        # away used to sit at this point).
        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)
Пример #2
0
    def test_basic_drop_first_NA(self, sparse):
        """Combine ``drop_first=True`` with NaN values in the input.

        ``sparse`` is forwarded to ``get_dummies``; when it is truthy the
        expected frames are converted with ``to_sparse`` before comparing.
        """
        values = ['a', 'b', np.nan]

        # Default NaN handling: the expected frame holds only the 'b' column.
        expected = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
        if sparse:
            expected = expected.to_sparse(fill_value=0, kind='integer')
        result = get_dummies(values, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        # dummy_na=True: a NaN-labelled column is expected after 'b'.
        expected_na = DataFrame({'b': [0, 1, 0], nan: [0, 0, 1]},
                                dtype=np.uint8)
        expected_na = expected_na.reindex(['b', nan], axis=1)
        if sparse:
            expected_na = expected_na.to_sparse(fill_value=0, kind='integer')
        result_na = get_dummies(values, dummy_na=True, drop_first=True,
                                sparse=sparse)
        assert_frame_equal(result_na, expected_na)

        # A lone NaN: the expected frame has one row and no columns.
        expected_only_na = DataFrame(index=np.arange(1))
        result_only_na = get_dummies([nan], dummy_na=True, drop_first=True,
                                     sparse=sparse)
        assert_frame_equal(result_only_na, expected_only_na)
Пример #3
0
def sparse_pickle_df(df: pd.DataFrame, filename, suffix='.pkl'):
    """
    Pickle the sparse form of ``df`` to disk.

    ``filename`` may be a ``str`` (wrapped in ``Path``) or any object that
    already provides ``with_suffix``.  The target path is ``filename`` with
    its suffix replaced by ``suffix``.
    """
    target = Path(filename) if isinstance(filename, str) else filename
    sparse_frame = df.to_sparse()
    sparse_frame.to_pickle(target.with_suffix(suffix))
Пример #4
0
    def test_basic(self, sparse, dtype):
        """Check ``get_dummies`` on a list, a Series and an indexed Series.

        ``sparse`` and ``dtype`` are passed straight through to
        ``get_dummies`` (presumably test parameters -- confirm against the
        enclosing class); the expected dtype is
        ``self.effective_dtype(dtype)``.
        """
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype))

        # Plain list input.
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        if sparse:
            tm.assert_sp_frame_equal(result,
                                     expected.to_sparse(kind='integer',
                                                        fill_value=0))
        else:
            assert_frame_equal(result, expected)

        # Series input with the default index; ``expected`` becomes the
        # sparse frame for the remaining comparisons when ``sparse`` is set.
        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        if sparse:
            expected = expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)

        # Series input with a custom index.  No further conversion is
        # needed: ``expected`` is already sparse when ``sparse`` is set, and
        # the former ``to_sparse`` call here discarded its result anyway.
        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)
Пример #5
0
    def test_dense_to_sparse(self):
        """Convert dense frames with ``to_sparse`` and inspect the result.

        Checks the result type, the default fill value, the sparse index
        type for the default and ``kind="integer"`` conversions, and the
        round trip back through ``to_dense``.
        """
        # The unittest aliases ``assert_`` and ``assertEquals`` are deprecated
        # (and removed in Python 3.12); the canonical names are used instead.
        df = DataFrame({"A": [nan, nan, nan, 1, 2], "B": [1, 2, nan, nan, nan]})
        sdf = df.to_sparse()
        self.assertIsInstance(sdf, SparseDataFrame)
        self.assertTrue(np.isnan(sdf.default_fill_value))
        self.assertIsInstance(sdf["A"].sp_index, BlockIndex)
        tm.assert_frame_equal(sdf.to_dense(), df)

        # kind="integer" is expected to yield an IntIndex.
        sdf = df.to_sparse(kind="integer")
        self.assertIsInstance(sdf["A"].sp_index, IntIndex)

        # An explicit fill value must be reported back unchanged.
        df = DataFrame({"A": [0, 0, 0, 1, 2], "B": [1, 2, 0, 0, 0]}, dtype=float)
        sdf = df.to_sparse(fill_value=0)
        self.assertEqual(sdf.default_fill_value, 0)
        tm.assert_frame_equal(sdf.to_dense(), df)
Пример #6
0
    def test_basic_types(self):
        """Check the frames and dtype counts produced by ``get_dummies``.

        Reads ``self.sparse`` to decide whether the expected frame is
        converted with ``to_sparse`` and which comparison helper is used.
        """
        # GH 10531
        letters = list('abc')
        frame = DataFrame({'a': [0, 1, 0, 1, 2],
                           'b': ['A', 'A', 'B', 'C', 'C'],
                           'c': [2, 3, 3, 3, 2]})

        expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
                             dtype='uint8',
                             columns=list('abc'))
        if self.sparse:
            expected = expected.to_sparse(fill_value=0, kind='integer')
            check = tm.assert_sp_frame_equal
        else:
            check = tm.assert_frame_equal

        # List and Series inputs must both give the same indicator frame.
        check(get_dummies(letters, sparse=self.sparse), expected)
        check(get_dummies(Series(letters), sparse=self.sparse), expected)

        # Encoding every column: eight uint8 columns are expected.
        encoded = get_dummies(frame, sparse=self.sparse,
                              columns=frame.columns)
        tm.assert_series_equal(encoded.get_dtype_counts(),
                               Series({'uint8': 8}))

        # Encoding only 'a': the other two columns keep their dtypes.
        encoded = get_dummies(frame, sparse=self.sparse, columns=['a'])
        counts = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values()
        tm.assert_series_equal(encoded.get_dtype_counts().sort_values(),
                               counts)
Пример #7
0
    def test_basic_types(self):
        """Verify ``get_dummies`` output frames and their dtype counts.

        ``self.sparse`` selects both the sparse conversion of the expected
        frame and the assertion helper used for the frame comparisons.
        """
        # GH 10531
        categories = list('abc')
        mixed = DataFrame({'a': [0, 1, 0, 1, 2],
                           'b': ['A', 'A', 'B', 'C', 'C'],
                           'c': [2, 3, 3, 3, 2]})

        expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
                             dtype='uint8',
                             columns=list('abc'))
        if self.sparse:
            expected = expected.to_sparse(fill_value=0, kind='integer')
            compare = tm.assert_sp_frame_equal
        else:
            compare = tm.assert_frame_equal

        # Same expected frame for list input and Series input.
        for source in (categories, Series(categories)):
            compare(get_dummies(source, sparse=self.sparse), expected)

        # All columns encoded.
        dummies = get_dummies(mixed, sparse=self.sparse,
                              columns=mixed.columns)
        tm.assert_series_equal(dummies.get_dtype_counts(),
                               Series({'uint8': 8}))

        # Only column 'a' encoded.
        dummies = get_dummies(mixed, sparse=self.sparse, columns=['a'])
        wanted = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values()
        tm.assert_series_equal(dummies.get_dtype_counts().sort_values(),
                               wanted)
Пример #8
0
    def test_dtypes(self):
        """Dtype counts of a mostly-NaN frame after ``to_sparse``."""
        dense = DataFrame(np.random.randn(10000, 4))
        # Label slicing is inclusive: rows 0..9998 become NaN and only the
        # last row keeps its random values.
        dense.loc[:9998] = np.nan

        dtype_counts = dense.to_sparse().get_dtype_counts()
        tm.assert_series_equal(dtype_counts, Series({'float64': 4}))
Пример #9
0
    def test_dense_to_sparse(self):
        """Convert dense frames with ``to_sparse`` and inspect the result."""
        dense = DataFrame({'A': [nan, nan, nan, 1, 2],
                           'B': [1, 2, nan, nan, nan]})

        # Default conversion: NaN fill value and a block-style sparse index.
        converted = dense.to_sparse()
        assert isinstance(converted, SparseDataFrame)
        assert np.isnan(converted.default_fill_value)
        assert isinstance(converted['A'].sp_index, BlockIndex)
        tm.assert_frame_equal(converted.to_dense(), dense)

        # kind='integer' switches the sparse index type.
        as_integer = dense.to_sparse(kind='integer')
        assert isinstance(as_integer['A'].sp_index, IntIndex)

        # An explicit fill value is reported back and round-trips.
        zeros = DataFrame({'A': [0, 0, 0, 1, 2],
                           'B': [1, 2, 0, 0, 0]}, dtype=float)
        zero_filled = zeros.to_sparse(fill_value=0)
        assert zero_filled.default_fill_value == 0
        tm.assert_frame_equal(zero_filled.to_dense(), zeros)
Пример #10
0
    def test_dense_to_sparse(self):
        """Convert dense frames with ``to_sparse`` and inspect the result."""
        dense = DataFrame({'A': [nan, nan, nan, 1, 2],
                           'B': [1, 2, nan, nan, nan]})

        # Default conversion: NaN fill value and a block-style sparse index.
        converted = dense.to_sparse()
        tm.assertIsInstance(converted, SparseDataFrame)
        self.assertTrue(np.isnan(converted.default_fill_value))
        tm.assertIsInstance(converted['A'].sp_index, BlockIndex)
        tm.assert_frame_equal(converted.to_dense(), dense)

        # kind='integer' switches the sparse index type.
        as_integer = dense.to_sparse(kind='integer')
        tm.assertIsInstance(as_integer['A'].sp_index, IntIndex)

        # An explicit fill value is reported back and round-trips.
        zeros = DataFrame({'A': [0, 0, 0, 1, 2],
                           'B': [1, 2, 0, 0, 0]}, dtype=float)
        zero_filled = zeros.to_sparse(fill_value=0)
        self.assertEqual(zero_filled.default_fill_value, 0)
        tm.assert_frame_equal(zero_filled.to_dense(), zeros)
Пример #11
0
    def test_dtypes(self):
        """Dtype counts of a mostly-NaN frame after ``to_sparse``."""
        df = DataFrame(np.random.randn(10000, 4))
        # ``.loc`` replaces the deprecated ``.ix`` indexer.  On the default
        # integer index both select labels 0..9998 inclusive, so every row
        # except the last is set to NaN.
        df.loc[:9998] = np.nan
        sdf = df.to_sparse()

        result = sdf.get_dtype_counts()
        expected = Series({'float64': 4})
        tm.assert_series_equal(result, expected)
Пример #12
0
 def test_dataframe_dummies_drop_first(self, df, sparse):
     """``get_dummies(..., drop_first=True)`` on columns 'A' and 'B' of ``df``.

     ``df`` is supplied by the caller (presumably a fixture -- confirm);
     ``sparse`` is forwarded to ``get_dummies``.
     """
     subset = df[['A', 'B']]

     expected = DataFrame({'A_b': [0, 1, 0], 'B_c': [0, 0, 1]},
                          dtype=np.uint8)
     if sparse:
         expected = expected.to_sparse(fill_value=0, kind='integer')

     result = get_dummies(subset, drop_first=True, sparse=sparse)
     assert_frame_equal(result, expected)
Пример #13
0
    def test_apply_nonuq(self):
        """Row-wise ``apply`` on a sparse frame with a non-unique index."""
        dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          index=['a', 'a', 'c'])

        # First element of every row, compared against float values.
        first_per_row = dense.to_sparse().apply(lambda row: row[0], axis=1)
        wanted = Series([1., 4., 7.], ['a', 'a', 'c'])
        tm.assert_series_equal(first_per_row, wanted)

        # Original note: "df.T breaks".  The transposed case is only
        # executed; its result is not checked.
        transposed = dense.T.to_sparse()
        transposed.apply(lambda col: col[0], axis=0)
Пример #14
0
    def test_apply_nonuq(self):
        """Row-wise ``apply`` on a sparse frame whose index has duplicates."""
        labels = ['a', 'a', 'c']
        source = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=labels)

        # Pick the first element of each row of the sparse frame.
        sparse_frame = source.to_sparse()
        picked = sparse_frame.apply(lambda row: row[0], axis=1)
        tm.assert_series_equal(picked, Series([1., 4., 7.], ['a', 'a', 'c']))

        # Original note: "df.T breaks".  Only run the transposed variant;
        # nothing is asserted about its result.
        sparse_transposed = source.T.to_sparse()
        sparse_transposed.apply(lambda col: col[0], axis=0)
Пример #15
0
 def test_dataframe_dummies_drop_first(self, df, sparse):
     """Encode columns 'A' and 'B' of ``df`` with ``drop_first=True``.

     ``df`` comes from the caller (presumably a fixture -- confirm);
     ``sparse`` is passed on to ``get_dummies``.
     """
     selected = df[['A', 'B']]
     actual = get_dummies(selected, drop_first=True, sparse=sparse)

     wanted = DataFrame({'A_b': [0, 1, 0], 'B_c': [0, 0, 1]}, dtype=np.uint8)
     if sparse:
         wanted = wanted.to_sparse(fill_value=0, kind='integer')
     assert_frame_equal(actual, wanted)
Пример #16
0
    def test_basic_types(self, sparse, dtype):
        """Check the frames and dtype counts produced by ``get_dummies``.

        ``sparse`` and ``dtype`` are forwarded to ``get_dummies``; the
        expected dtype is derived through ``self.effective_dtype(dtype)``.
        """
        # GH 10531
        letters = list('abc')
        frame = DataFrame({'a': [0, 1, 0, 1, 2],
                           'b': ['A', 'A', 'B', 'C', 'C'],
                           'c': [2, 3, 3, 3, 2]})

        expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype),
                             columns=list('abc'))
        if sparse:
            expected = expected.to_sparse(fill_value=0, kind='integer')
            check = tm.assert_sp_frame_equal
        else:
            check = tm.assert_frame_equal

        # List and Series inputs must both give the same indicator frame.
        check(get_dummies(letters, sparse=sparse, dtype=dtype), expected)
        check(get_dummies(Series(letters), sparse=sparse, dtype=dtype),
              expected)

        # Encoding every column: eight columns of one single dtype.
        encoded = get_dummies(frame, columns=frame.columns, sparse=sparse,
                              dtype=dtype)
        base_name = self.effective_dtype(dtype).name
        if sparse:
            dtype_name = 'Sparse[{}, 0]'.format(base_name)
        else:
            dtype_name = base_name
        tm.assert_series_equal(encoded.get_dtype_counts(),
                               Series({dtype_name: 8}))

        # Encoding only 'a': three dummy columns are added on top of the
        # untouched int64 and object columns.  ``get`` covers the case where
        # the dummy dtype name collides with an existing key.
        encoded = get_dummies(frame, columns=['a'], sparse=sparse, dtype=dtype)
        counts = {'int64': 1, 'object': 1}
        counts[dtype_name] = counts.get(dtype_name, 0) + 3
        tm.assert_series_equal(encoded.get_dtype_counts().sort_index(),
                               Series(counts).sort_index())
Пример #17
0
 def test_unicode(self, sparse):
     """``get_dummies`` with a prefix on values containing a non-ASCII letter."""
     # See GH 6885 - get_dummies chokes on unicode values
     import unicodedata
     plain = 'e'
     accented = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')

     result = get_dummies([plain, accented, accented], prefix='letter',
                          sparse=sparse)
     expected = DataFrame({'letter_e': [1, 0, 0],
                           u('letter_%s') % accented: [0, 1, 1]},
                          dtype=np.uint8)

     if not sparse:
         assert_frame_equal(result, expected)
         return
     # Sparse output is compared against the sparse form of ``expected``.
     sparse_expected = expected.to_sparse(fill_value=0, kind='integer')
     tm.assert_sp_frame_equal(result, sparse_expected)
Пример #18
0
    def test_basic_drop_first_NA(self, sparse):
        """NA handling together with ``drop_first=True``.

        ``sparse`` is passed on to ``get_dummies``; expected frames are
        converted with ``to_sparse`` when it is truthy.
        """
        with_na = ['a', 'b', np.nan]

        # Case 1: default NaN handling, expected frame has only column 'b'.
        actual = get_dummies(with_na, drop_first=True, sparse=sparse)
        wanted = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
        if sparse:
            wanted = wanted.to_sparse(fill_value=0, kind='integer')
        assert_frame_equal(actual, wanted)

        # Case 2: dummy_na=True, a NaN-labelled column follows 'b'.
        actual_na = get_dummies(with_na, dummy_na=True, drop_first=True,
                                sparse=sparse)
        columns = {'b': [0, 1, 0], nan: [0, 0, 1]}
        wanted_na = DataFrame(columns, dtype=np.uint8)
        wanted_na = wanted_na.reindex(['b', nan], axis=1)
        if sparse:
            wanted_na = wanted_na.to_sparse(fill_value=0, kind='integer')
        assert_frame_equal(actual_na, wanted_na)

        # Case 3: a lone NaN, expected frame has one row and no columns.
        actual_single = get_dummies([nan], dummy_na=True, drop_first=True,
                                    sparse=sparse)
        wanted_single = DataFrame(index=np.arange(1))
        assert_frame_equal(actual_single, wanted_single)
Пример #19
0
    def test_apply_nonuq(self):
        """Row-wise ``apply`` on a sparse frame with a non-unique index."""
        dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          index=["a", "a", "c"])

        sparse_result = dense.to_sparse().apply(lambda row: row[0], axis=1)
        dense_result = dense.apply(lambda row: row[0], axis=1)

        # dtype must be kept
        self.assertEqual(sparse_result.dtype, np.int64)
        # ToDo: apply must return subclassed dtype
        self.assertIsInstance(sparse_result, pd.Series)
        tm.assert_series_equal(sparse_result.to_dense(), dense_result)

        # Original note: "df.T breaks".  Both transposed variants are only
        # executed; their results are not compared.
        dense.T.to_sparse().apply(lambda col: col[0], axis=0)
        dense.T.apply(lambda col: col[0], axis=0)
Пример #20
0
    def test_apply_nonuq(self):
        """Row-wise ``apply`` on a sparse frame with a non-unique index."""
        dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          index=['a', 'a', 'c'])

        sparse_result = dense.to_sparse().apply(lambda row: row[0], axis=1)
        dense_result = dense.apply(lambda row: row[0], axis=1)

        # dtype must be kept
        assert sparse_result.dtype == np.int64
        # ToDo: apply must return subclassed dtype
        assert isinstance(sparse_result, pd.Series)
        tm.assert_series_equal(sparse_result.to_dense(), dense_result)

        # Original note: "df.T breaks".  Both transposed variants are only
        # executed; their results are not compared.
        dense.T.to_sparse().apply(lambda col: col[0], axis=0)
        dense.T.apply(lambda col: col[0], axis=0)
Пример #21
0
    def test_basic_types(self, sparse, dtype):
        """Verify ``get_dummies`` output frames and their dtype counts.

        ``sparse`` and ``dtype`` go straight to ``get_dummies``; the expected
        dtype is obtained from ``self.effective_dtype(dtype)``.
        """
        # GH 10531
        categories = list('abc')
        mixed = DataFrame({'a': [0, 1, 0, 1, 2],
                           'b': ['A', 'A', 'B', 'C', 'C'],
                           'c': [2, 3, 3, 3, 2]})

        expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype),
                             columns=list('abc'))
        if sparse:
            expected = expected.to_sparse(fill_value=0, kind='integer')
            compare = tm.assert_sp_frame_equal
        else:
            compare = tm.assert_frame_equal

        # Same expected frame for list input and Series input.
        for source in (categories, Series(categories)):
            compare(get_dummies(source, sparse=sparse, dtype=dtype), expected)

        # All columns encoded: eight columns sharing one dtype name.
        dummies = get_dummies(mixed, columns=mixed.columns,
                              sparse=sparse, dtype=dtype)
        plain_name = self.effective_dtype(dtype).name
        dtype_name = ('Sparse[{}, 0]'.format(plain_name) if sparse
                      else plain_name)
        tm.assert_series_equal(dummies.get_dtype_counts(),
                               Series({dtype_name: 8}))

        # Only 'a' encoded: three dummy columns plus the untouched int64 and
        # object columns; ``get`` handles a dtype name that is already a key.
        dummies = get_dummies(mixed, columns=['a'], sparse=sparse, dtype=dtype)
        tally = {'int64': 1, 'object': 1}
        tally[dtype_name] = tally.get(dtype_name, 0) + 3
        wanted = Series(tally).sort_index()
        tm.assert_series_equal(dummies.get_dtype_counts().sort_index(),
                               wanted)
Пример #22
0
def test_apply_nonuq():
    """Row-wise ``apply`` on a sparse frame with a non-unique index."""
    dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      index=['a', 'a', 'c'])

    sparse_result = dense.to_sparse().apply(lambda row: row[0], axis=1)
    dense_result = dense.apply(lambda row: row[0], axis=1)

    # dtype must be kept
    assert sparse_result.dtype == SparseDtype(np.int64)

    # ToDo: apply must return subclassed dtype
    assert isinstance(sparse_result, Series)
    tm.assert_series_equal(sparse_result.to_dense(), dense_result)

    # Original note: "df.T breaks".  Both transposed variants are only
    # executed; their results are not compared.
    dense.T.to_sparse().apply(lambda col: col[0], axis=0)
    dense.T.apply(lambda col: col[0], axis=0)
Пример #23
0
 def test_unicode(self, sparse):
     """Prefix-encode values that include a non-ASCII letter."""
     # See GH 6885 - get_dummies chokes on unicode values
     import unicodedata
     ascii_e = 'e'
     e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
     letters = [ascii_e, e_acute, e_acute]

     actual = get_dummies(letters, prefix='letter', sparse=sparse)

     columns = {'letter_e': [1, 0, 0],
                u('letter_%s') % e_acute: [0, 1, 1]}
     wanted = DataFrame(columns, dtype=np.uint8)

     if sparse:
         # Sparse output is checked against the sparse form of ``wanted``.
         wanted_sparse = wanted.to_sparse(fill_value=0, kind='integer')
         tm.assert_sp_frame_equal(actual, wanted_sparse)
     else:
         assert_frame_equal(actual, wanted)
Пример #24
0
    def test_sparse_frame_pad_backfill_limit(self):
        """``reindex`` with ``method`` and ``limit=5`` on a sparse frame.

        The expected frames are built by filling without a limit and then
        overwriting the rows beyond the limit with NaN.
        """
        index = np.arange(10)
        sdf = DataFrame(np.random.randn(10, 4), index=index).to_sparse()

        # Forward fill from the first two rows: the last three rows stay NaN.
        padded = sdf[:2].reindex(index, method='pad', limit=5)
        wanted = sdf[:2].reindex(index).fillna(method='pad').to_dense()
        wanted.values[-3:] = np.nan
        tm.assert_frame_equal(padded, wanted.to_sparse())

        # Backward fill from the last two rows: the first three rows stay NaN.
        backfilled = sdf[-2:].reindex(index, method='backfill', limit=5)
        wanted = sdf[-2:].reindex(index).fillna(method='backfill').to_dense()
        wanted.values[:3] = np.nan
        tm.assert_frame_equal(backfilled, wanted.to_sparse())
Пример #25
0
    def test_sparse_frame_pad_backfill_limit(self):
        """Limited pad / backfill via ``reindex`` on a sparse frame.

        Each expectation is an unlimited fill, converted to dense, with the
        rows past the limit of five reset to NaN and converted back.
        """
        labels = np.arange(10)
        dense = DataFrame(np.random.randn(10, 4), index=labels)
        sparse_frame = dense.to_sparse()

        # method='pad' starting from the first two rows.
        actual = sparse_frame[:2].reindex(labels, method='pad', limit=5)
        reference = sparse_frame[:2].reindex(labels).fillna(method='pad')
        reference = reference.to_dense()
        reference.values[-3:] = np.nan
        tm.assert_frame_equal(actual, reference.to_sparse())

        # method='backfill' starting from the last two rows.
        actual = sparse_frame[-2:].reindex(labels, method='backfill', limit=5)
        reference = sparse_frame[-2:].reindex(labels).fillna(method='backfill')
        reference = reference.to_dense()
        reference.values[:3] = np.nan
        tm.assert_frame_equal(actual, reference.to_sparse())
Пример #26
0
    def test_basic_drop_first(self, sparse):
        """``drop_first=True`` on a list, a Series and an indexed Series."""
        # GH12402 Add a new parameter `drop_first` to avoid collinearity
        # Basic case
        letters = list('abc')
        as_series = Series(letters)
        as_indexed_series = Series(letters, list('ABC'))

        expected = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=np.uint8)

        # Plain list input; ``expected`` stays sparse for the later checks.
        from_list = get_dummies(letters, drop_first=True, sparse=sparse)
        if sparse:
            expected = expected.to_sparse(fill_value=0, kind='integer')
        assert_frame_equal(from_list, expected)

        # Series input with the default index.
        from_series = get_dummies(as_series, drop_first=True, sparse=sparse)
        assert_frame_equal(from_series, expected)

        # Series input with a custom index.
        expected.index = list('ABC')
        from_indexed = get_dummies(as_indexed_series, drop_first=True,
                                   sparse=sparse)
        assert_frame_equal(from_indexed, expected)
Пример #27
0
    def test_basic_drop_first(self, sparse):
        """Drop the first level for list, Series and indexed-Series input."""
        # GH12402 Add a new parameter `drop_first` to avoid collinearity
        # Basic case
        values = list('abc')
        plain_series = Series(values)
        labelled_series = Series(values, list('ABC'))

        wanted = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=np.uint8)

        # List input; the sparse conversion below also applies to the
        # remaining comparisons.
        actual = get_dummies(values, drop_first=True, sparse=sparse)
        if sparse:
            wanted = wanted.to_sparse(fill_value=0, kind='integer')
        assert_frame_equal(actual, wanted)

        # Series with the default index.
        actual = get_dummies(plain_series, drop_first=True, sparse=sparse)
        assert_frame_equal(actual, wanted)

        # Series with the 'A', 'B', 'C' index.
        wanted.index = list('ABC')
        actual = get_dummies(labelled_series, drop_first=True, sparse=sparse)
        assert_frame_equal(actual, wanted)
Пример #28
0
    def test_str(self):
        """``str()`` of a large, mostly-NaN sparse frame must not raise."""
        dense = DataFrame(np.random.randn(10000, 4))
        # Label slicing is inclusive: rows 0..9998 become NaN.
        dense.loc[:9998] = np.nan

        sparse_frame = dense.to_sparse()
        str(sparse_frame)
Пример #29
0
 def test_nan_columnname(self):
     """A NaN column label must still be NaN after ``to_sparse``."""
     # GH 8822
     dense = DataFrame(Series(1.0, index=[0]), columns=[nan])
     first_label = dense.to_sparse().columns[0]
     assert np.isnan(first_label)
Пример #30
0
    def test_str(self):
        """``str()`` of a large, mostly-NaN sparse frame must not raise."""
        df = DataFrame(np.random.randn(10000, 4))
        # ``.loc`` replaces the deprecated ``.ix`` indexer.  On the default
        # integer index both select labels 0..9998 inclusive.
        df.loc[:9998] = np.nan

        sdf = df.to_sparse()
        str(sdf)
Пример #31
0
 def test_nan_columnname(self):
     """A NaN column label must still be NaN after ``to_sparse``."""
     # GH 8822
     dense = DataFrame(Series(1.0, index=[0]), columns=[nan])
     first_label = dense.to_sparse().columns[0]
     self.assertTrue(np.isnan(first_label))