def test_basic(self, sparse, dtype):
    """Check ``get_dummies`` on a list, a Series and a labelled Series.

    All three inputs hold the values ``'a'``, ``'b'``, ``'c'``, so every
    call is compared against the same 3x3 indicator frame built with
    ``self.effective_dtype(dtype)``. When ``sparse`` is truthy the expected
    frame is converted with ``to_sparse(kind='integer', fill_value=0)``
    before it is compared.
    """
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({
        'a': [1, 0, 0],
        'b': [0, 1, 0],
        'c': [0, 0, 1]
    }, dtype=self.effective_dtype(dtype))

    # Plain list input.
    result = get_dummies(s_list, sparse=sparse, dtype=dtype)
    if sparse:
        tm.assert_sp_frame_equal(
            result, expected.to_sparse(kind='integer', fill_value=0))
    else:
        assert_frame_equal(result, expected)

    # Series input; from here on ``expected`` stays in its sparse form
    # whenever ``sparse`` is truthy.
    result = get_dummies(s_series, sparse=sparse, dtype=dtype)
    if sparse:
        expected = expected.to_sparse(kind='integer', fill_value=0)
    assert_frame_equal(result, expected)

    # Series with a custom index. ``expected`` has already been converted
    # above, so only its index needs updating; a further ``to_sparse`` call
    # whose result is thrown away would have no effect on the comparison.
    expected.index = list('ABC')
    result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)
def test_basic_drop_first_NA(self, sparse):
    """Check ``get_dummies(..., drop_first=True)`` on data containing NaN.

    Three results are compared against hand-built frames: the default
    call, the call with ``dummy_na=True``, and the call on an input that
    holds nothing but NaN.
    """
    # Test NA handling together with drop_first
    values = ['a', 'b', np.nan]

    dummies = get_dummies(values, drop_first=True, sparse=sparse)
    wanted = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
    if sparse:
        wanted = wanted.to_sparse(fill_value=0, kind='integer')
    assert_frame_equal(dummies, wanted)

    dummies_na = get_dummies(values, dummy_na=True, drop_first=True,
                             sparse=sparse)
    wanted_na = DataFrame({'b': [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8)
    # Pin the column order explicitly: 'b' first, then the NaN column.
    wanted_na = wanted_na.reindex(['b', nan], axis=1)
    if sparse:
        wanted_na = wanted_na.to_sparse(fill_value=0, kind='integer')
    assert_frame_equal(dummies_na, wanted_na)

    # A lone NaN: the expected frame has a single row and no columns.
    dummies_only_na = get_dummies([nan], dummy_na=True, drop_first=True,
                                  sparse=sparse)
    wanted_only_na = DataFrame(index=np.arange(1))
    assert_frame_equal(dummies_only_na, wanted_only_na)
def sparse_pickle_df(df: pd.DataFrame, filename, suffix='.pkl'):
    """
    Pickle the sparse form of ``df`` to disk.

    ``filename`` may be a ``str`` (wrapped in ``Path``) or an object that
    already provides ``with_suffix``. The file is written to
    ``filename.with_suffix(suffix)``.
    """
    target = Path(filename) if isinstance(filename, str) else filename
    sparse_df = df.to_sparse()
    destination = target.with_suffix(suffix)
    sparse_df.to_pickle(destination)
def test_basic(self, sparse, dtype):
    """Compare ``get_dummies`` output for three equivalent inputs.

    A list, a Series and a Series indexed by ``'A'``, ``'B'``, ``'C'`` all
    carry the values ``'a'``, ``'b'``, ``'c'``. Each result is checked
    against one 3x3 indicator frame of dtype
    ``self.effective_dtype(dtype)``, converted with
    ``to_sparse(kind='integer', fill_value=0)`` when ``sparse`` is truthy.
    """
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype=self.effective_dtype(dtype))

    # Plain list input.
    result = get_dummies(s_list, sparse=sparse, dtype=dtype)
    if sparse:
        tm.assert_sp_frame_equal(
            result, expected.to_sparse(kind='integer', fill_value=0))
    else:
        assert_frame_equal(result, expected)

    # Series input; ``expected`` is rebound to its sparse form here and
    # keeps that form for the final comparison as well.
    result = get_dummies(s_series, sparse=sparse, dtype=dtype)
    if sparse:
        expected = expected.to_sparse(kind='integer', fill_value=0)
    assert_frame_equal(result, expected)

    # Series with a custom index: relabel ``expected`` and compare. No
    # second conversion is needed; calling ``to_sparse`` again without
    # keeping its return value would change nothing.
    expected.index = list('ABC')
    result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)
def test_dense_to_sparse(self):
    """Round-trip dense frames through ``to_sparse`` / ``to_dense``.

    Asserts on the resulting container type, the default fill value and
    the kind of sparse index produced by the default call and by
    ``kind="integer"``.
    """
    df = DataFrame({"A": [nan, nan, nan, 1, 2],
                    "B": [1, 2, nan, nan, nan]})

    # Default conversion.
    sdf = df.to_sparse()
    self.assertIsInstance(sdf, SparseDataFrame)
    self.assertTrue(np.isnan(sdf.default_fill_value))
    self.assertIsInstance(sdf["A"].sp_index, BlockIndex)
    tm.assert_frame_equal(sdf.to_dense(), df)

    # Explicit integer-kind conversion.
    sdf = df.to_sparse(kind="integer")
    self.assertIsInstance(sdf["A"].sp_index, IntIndex)

    # Explicit fill value of 0 on a float frame.
    df = DataFrame({"A": [0, 0, 0, 1, 2],
                    "B": [1, 2, 0, 0, 0]}, dtype=float)
    sdf = df.to_sparse(fill_value=0)
    self.assertEqual(sdf.default_fill_value, 0)
    tm.assert_frame_equal(sdf.to_dense(), df)
def test_basic_types(self):
    """Check the frames and dtype counts produced by ``get_dummies``.

    Reads ``self.sparse`` to decide whether the sparse or the dense
    comparison helper is used and whether the expected frame is converted
    with ``to_sparse``.
    """
    # GH 10531
    s_list = list('abc')
    s_series = Series(s_list)
    s_df = DataFrame({'a': [0, 1, 0, 1, 2],
                      'b': ['A', 'A', 'B', 'C', 'C'],
                      'c': [2, 3, 3, 3, 2]})

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype='uint8', columns=list('abc'))
    if not self.sparse:
        compare = tm.assert_frame_equal
    else:
        expected = expected.to_sparse(fill_value=0, kind='integer')
        compare = tm.assert_sp_frame_equal

    result = get_dummies(s_list, sparse=self.sparse)
    compare(result, expected)

    result = get_dummies(s_series, sparse=self.sparse)
    compare(result, expected)

    # Encoding every column of the frame.
    result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns)
    tm.assert_series_equal(result.get_dtype_counts(),
                           Series({'uint8': 8}))

    # Encoding only column 'a'. Two of the expected counts are equal
    # (int64: 1, object: 1), so ordering by value would leave the order of
    # those labels to the inputs' order and the sort algorithm. Ordering
    # both sides by label makes the comparison deterministic.
    result = get_dummies(s_df, sparse=self.sparse, columns=['a'])
    expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_index()
    tm.assert_series_equal(result.get_dtype_counts().sort_index(),
                           expected)
def test_dtypes(self):
    """Check ``get_dtype_counts`` on a sparse frame of four float columns."""
    dense = DataFrame(np.random.randn(10000, 4))
    # Blank out the rows labelled 0 through 9998 so the frame is almost
    # entirely NaN before conversion.
    dense.loc[:9998] = np.nan

    dtype_counts = dense.to_sparse().get_dtype_counts()
    tm.assert_series_equal(dtype_counts, Series({'float64': 4}))
def test_dense_to_sparse(self):
    """Round-trip dense frames through ``to_sparse`` / ``to_dense``.

    Asserts on the container type, the default fill value and the sparse
    index type for the default call, for ``kind='integer'`` and for
    ``fill_value=0``.
    """
    nan_frame = DataFrame({'A': [nan, nan, nan, 1, 2],
                           'B': [1, 2, nan, nan, nan]})

    # Default conversion.
    block_sparse = nan_frame.to_sparse()
    assert isinstance(block_sparse, SparseDataFrame)
    assert np.isnan(block_sparse.default_fill_value)
    assert isinstance(block_sparse['A'].sp_index, BlockIndex)
    tm.assert_frame_equal(block_sparse.to_dense(), nan_frame)

    # Explicit integer-kind conversion.
    int_sparse = nan_frame.to_sparse(kind='integer')
    assert isinstance(int_sparse['A'].sp_index, IntIndex)

    # Explicit fill value of 0 on a float frame.
    zero_frame = DataFrame({'A': [0, 0, 0, 1, 2],
                            'B': [1, 2, 0, 0, 0]}, dtype=float)
    zero_sparse = zero_frame.to_sparse(fill_value=0)
    assert zero_sparse.default_fill_value == 0
    tm.assert_frame_equal(zero_sparse.to_dense(), zero_frame)
def test_dense_to_sparse(self):
    """Round-trip dense frames through ``to_sparse`` / ``to_dense``.

    Checks the container type, the default fill value and the sparse
    index type for the default call, for ``kind='integer'`` and for
    ``fill_value=0``.
    """
    nan_frame = DataFrame({'A': [nan, nan, nan, 1, 2],
                           'B': [1, 2, nan, nan, nan]})

    # Default conversion.
    block_sparse = nan_frame.to_sparse()
    tm.assertIsInstance(block_sparse, SparseDataFrame)
    self.assertTrue(np.isnan(block_sparse.default_fill_value))
    tm.assertIsInstance(block_sparse['A'].sp_index, BlockIndex)
    tm.assert_frame_equal(block_sparse.to_dense(), nan_frame)

    # Explicit integer-kind conversion.
    int_sparse = nan_frame.to_sparse(kind='integer')
    tm.assertIsInstance(int_sparse['A'].sp_index, IntIndex)

    # Explicit fill value of 0 on a float frame.
    zero_frame = DataFrame({'A': [0, 0, 0, 1, 2],
                            'B': [1, 2, 0, 0, 0]}, dtype=float)
    zero_sparse = zero_frame.to_sparse(fill_value=0)
    self.assertEqual(zero_sparse.default_fill_value, 0)
    tm.assert_frame_equal(zero_sparse.to_dense(), zero_frame)
def test_dtypes(self):
    """Check ``get_dtype_counts`` on a sparse frame of four float columns."""
    df = DataFrame(np.random.randn(10000, 4))
    # Label-based assignment through ``.loc`` instead of the deprecated
    # ``.ix`` indexer. The frame has a default integer index, so the
    # slice ``:9998`` selects labels 0 through 9998 inclusive.
    df.loc[:9998] = np.nan

    sdf = df.to_sparse()
    result = sdf.get_dtype_counts()
    expected = Series({'float64': 4})
    tm.assert_series_equal(result, expected)
def test_dataframe_dummies_drop_first(self, df, sparse):
    """Check ``get_dummies(drop_first=True)`` on columns 'A' and 'B'."""
    subset = df[['A', 'B']]
    dummies = get_dummies(subset, drop_first=True, sparse=sparse)

    wanted = DataFrame({'A_b': [0, 1, 0],
                        'B_c': [0, 0, 1]}, dtype=np.uint8)
    if sparse:
        wanted = wanted.to_sparse(fill_value=0, kind='integer')

    assert_frame_equal(dummies, wanted)
def test_apply_nonuq(self):
    """Apply a row-wise function to a sparse frame with a duplicated index."""
    dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      index=['a', 'a', 'c'])

    row_firsts = dense.to_sparse().apply(lambda s: s[0], axis=1)
    wanted = Series([1., 4., 7.], ['a', 'a', 'c'])
    tm.assert_series_equal(row_firsts, wanted)

    # df.T breaks
    # The column-wise apply on the transposed frame is only executed; its
    # result is not compared against anything.
    transposed = dense.T.to_sparse()
    transposed.apply(lambda s: s[0], axis=0)
def test_dataframe_dummies_drop_first(self, df, sparse):
    """Encode columns 'A' and 'B' of ``df`` with ``drop_first=True``.

    The result is compared with a two-column ``uint8`` frame, converted
    through ``to_sparse`` when ``sparse`` is truthy.
    """
    selected = df[['A', 'B']]
    encoded = get_dummies(selected, drop_first=True, sparse=sparse)

    reference = DataFrame({'A_b': [0, 1, 0], 'B_c': [0, 0, 1]},
                          dtype=np.uint8)
    if sparse:
        reference = reference.to_sparse(fill_value=0, kind='integer')

    assert_frame_equal(encoded, reference)
def test_basic_types(self, sparse, dtype):
    """Check the frames and dtype counts produced by ``get_dummies``.

    The expected dtype comes from ``self.effective_dtype(dtype)``. In the
    sparse case the expected dtype name is wrapped as ``'Sparse[<name>, 0]'``.
    """
    # GH 10531
    letters = list('abc')
    letter_series = Series(letters)
    mixed_frame = DataFrame({
        'a': [0, 1, 0, 1, 2],
        'b': ['A', 'A', 'B', 'C', 'C'],
        'c': [2, 3, 3, 3, 2],
    })

    expected = DataFrame({
        'a': [1, 0, 0],
        'b': [0, 1, 0],
        'c': [0, 0, 1],
    }, dtype=self.effective_dtype(dtype), columns=list('abc'))
    if sparse:
        expected = expected.to_sparse(fill_value=0, kind='integer')
        compare = tm.assert_sp_frame_equal
    else:
        compare = tm.assert_frame_equal

    result = get_dummies(letters, sparse=sparse, dtype=dtype)
    compare(result, expected)

    result = get_dummies(letter_series, sparse=sparse, dtype=dtype)
    compare(result, expected)

    # Encoding every column of the frame.
    result = get_dummies(mixed_frame, columns=mixed_frame.columns,
                         sparse=sparse, dtype=dtype)
    dtype_name = self.effective_dtype(dtype).name
    if sparse:
        dtype_name = 'Sparse[{}, 0]'.format(dtype_name)
    expected = Series({dtype_name: 8})
    tm.assert_series_equal(result.get_dtype_counts(), expected)

    # Encoding only column 'a'. The three dummy columns are added on top
    # of any count already recorded under the same dtype name.
    result = get_dummies(mixed_frame, columns=['a'], sparse=sparse,
                         dtype=dtype)
    counts = {'int64': 1, 'object': 1}
    counts[dtype_name] = counts.get(dtype_name, 0) + 3
    expected = Series(counts).sort_index()
    tm.assert_series_equal(result.get_dtype_counts().sort_index(),
                           expected)
def test_unicode(self, sparse):
    """Check ``get_dummies`` with a non-ASCII category and a prefix."""
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    plain_e = 'e'
    eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    letters = [plain_e, eacute, eacute]

    res = get_dummies(letters, prefix='letter', sparse=sparse)

    accented_column = u('letter_%s') % eacute
    exp = DataFrame({'letter_e': [1, 0, 0],
                     accented_column: [0, 1, 1]}, dtype=np.uint8)

    if sparse:
        sparse_exp = exp.to_sparse(fill_value=0, kind='integer')
        tm.assert_sp_frame_equal(res, sparse_exp)
    else:
        assert_frame_equal(res, exp)
def test_basic_drop_first_NA(self, sparse):
    """Exercise ``drop_first=True`` on input that contains a NaN entry.

    Compares the default call, the ``dummy_na=True`` call and the call on
    an all-NaN input against frames built by hand.
    """
    # Test NA handling together with drop_first
    data = ['a', 'b', np.nan]

    got = get_dummies(data, drop_first=True, sparse=sparse)
    want = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
    if sparse:
        want = want.to_sparse(fill_value=0, kind='integer')
    assert_frame_equal(got, want)

    got_na = get_dummies(data, dummy_na=True, drop_first=True,
                         sparse=sparse)
    want_na = DataFrame({'b': [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8)
    # Fix the column order: 'b' first, then the NaN column.
    want_na = want_na.reindex(['b', nan], axis=1)
    if sparse:
        want_na = want_na.to_sparse(fill_value=0, kind='integer')
    assert_frame_equal(got_na, want_na)

    # Input holding only NaN: expected frame has one row, zero columns.
    got_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                              sparse=sparse)
    want_just_na = DataFrame(index=np.arange(1))
    assert_frame_equal(got_just_na, want_just_na)
def test_apply_nonuq(self):
    """Apply a row-wise function to a sparse frame with a duplicated index.

    The sparse result is compared with the same ``apply`` on the dense
    frame after checking its dtype and type.
    """
    dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      index=["a", "a", "c"])
    sparse_frame = dense.to_sparse()

    sparse_firsts = sparse_frame.apply(lambda s: s[0], axis=1)
    dense_firsts = dense.apply(lambda s: s[0], axis=1)

    # dtype must be kept
    self.assertEqual(sparse_firsts.dtype, np.int64)
    # ToDo: apply must return subclassed dtype
    self.assertIsInstance(sparse_firsts, pd.Series)
    tm.assert_series_equal(sparse_firsts.to_dense(), dense_firsts)

    # df.T breaks
    # Both applies on the transposed data are only executed here; their
    # results are not compared.
    sparse_transposed = dense.T.to_sparse()
    sparse_transposed.apply(lambda s: s[0], axis=0)
    dense.T.apply(lambda s: s[0], axis=0)
def test_apply_nonuq(self):
    """Apply a row-wise function to a sparse frame with a duplicated index.

    The sparse result is compared with the same ``apply`` on the dense
    frame after asserting its dtype and type.
    """
    dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      index=['a', 'a', 'c'])
    sparse_frame = dense.to_sparse()

    sparse_firsts = sparse_frame.apply(lambda s: s[0], axis=1)
    dense_firsts = dense.apply(lambda s: s[0], axis=1)

    # dtype must be kept
    assert sparse_firsts.dtype == np.int64
    # ToDo: apply must return subclassed dtype
    assert isinstance(sparse_firsts, pd.Series)
    tm.assert_series_equal(sparse_firsts.to_dense(), dense_firsts)

    # df.T breaks
    # Both applies on the transposed data are only executed here; their
    # results are not compared.
    sparse_transposed = dense.T.to_sparse()
    sparse_transposed.apply(lambda s: s[0], axis=0)
    dense.T.apply(lambda s: s[0], axis=0)
def test_basic_types(self, sparse, dtype):
    """Verify ``get_dummies`` output frames and their dtype counts.

    ``self.effective_dtype(dtype)`` supplies the expected dtype. With
    ``sparse`` truthy, the expected dtype name takes the form
    ``'Sparse[<name>, 0]'``.
    """
    # GH 10531
    values = list('abc')
    values_series = Series(values)
    frame = DataFrame({'a': [0, 1, 0, 1, 2],
                       'b': ['A', 'A', 'B', 'C', 'C'],
                       'c': [2, 3, 3, 3, 2]})

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype=self.effective_dtype(dtype),
                         columns=list('abc'))
    if sparse:
        expected = expected.to_sparse(fill_value=0, kind='integer')
        check_frames = tm.assert_sp_frame_equal
    else:
        check_frames = tm.assert_frame_equal

    result = get_dummies(values, sparse=sparse, dtype=dtype)
    check_frames(result, expected)

    result = get_dummies(values_series, sparse=sparse, dtype=dtype)
    check_frames(result, expected)

    # All columns encoded.
    result = get_dummies(frame, columns=frame.columns, sparse=sparse,
                         dtype=dtype)
    dtype_name = self.effective_dtype(dtype).name
    if sparse:
        dtype_name = 'Sparse[{}, 0]'.format(dtype_name)
    expected = Series({dtype_name: 8})
    tm.assert_series_equal(result.get_dtype_counts(), expected)

    # Only column 'a' encoded: its three dummy columns are added to any
    # count already present under the same dtype name.
    result = get_dummies(frame, columns=['a'], sparse=sparse, dtype=dtype)
    tally = {'int64': 1, 'object': 1}
    tally[dtype_name] = tally.get(dtype_name, 0) + 3
    expected = Series(tally).sort_index()
    tm.assert_series_equal(result.get_dtype_counts().sort_index(),
                           expected)
def test_apply_nonuq():
    """Apply a row-wise function to a sparse frame with a duplicated index.

    The sparse result is compared with the same ``apply`` on the dense
    frame after asserting its dtype and type.
    """
    dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      index=['a', 'a', 'c'])
    sparse_frame = dense.to_sparse()

    sparse_firsts = sparse_frame.apply(lambda s: s[0], axis=1)
    dense_firsts = dense.apply(lambda s: s[0], axis=1)

    # dtype must be kept
    assert sparse_firsts.dtype == SparseDtype(np.int64)
    # ToDo: apply must return subclassed dtype
    assert isinstance(sparse_firsts, Series)
    tm.assert_series_equal(sparse_firsts.to_dense(), dense_firsts)

    # df.T breaks
    # Both applies on the transposed data are only executed here; their
    # results are not compared.
    sparse_transposed = dense.T.to_sparse()
    sparse_transposed.apply(lambda s: s[0], axis=0)
    dense.T.apply(lambda s: s[0], axis=0)
def test_unicode(self, sparse):
    """Run ``get_dummies`` with a prefix on values that include 'é'."""
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    ascii_e = 'e'
    eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')

    res = get_dummies([ascii_e, eacute, eacute], prefix='letter',
                      sparse=sparse)

    eacute_label = u('letter_%s') % eacute
    exp = DataFrame(
        {'letter_e': [1, 0, 0], eacute_label: [0, 1, 1]},
        dtype=np.uint8)

    if sparse:
        tm.assert_sp_frame_equal(
            res, exp.to_sparse(fill_value=0, kind='integer'))
    else:
        assert_frame_equal(res, exp)
def test_sparse_frame_pad_backfill_limit(self):
    """Check ``reindex`` with ``method`` and ``limit=5`` on a sparse frame.

    Each expected frame is built by filling without a limit, converting to
    dense, overwriting three rows with NaN and converting back to sparse.
    """
    index = np.arange(10)
    dense = DataFrame(np.random.randn(10, 4), index=index)
    sdf = dense.to_sparse()

    # Forward fill from the first two rows; the expected frame has its
    # last three rows set to NaN.
    padded = sdf[:2].reindex(index, method='pad', limit=5)
    pad_reference = sdf[:2].reindex(index).fillna(method='pad').to_dense()
    pad_reference.values[-3:] = np.nan
    tm.assert_frame_equal(padded, pad_reference.to_sparse())

    # Backward fill from the last two rows; the expected frame has its
    # first three rows set to NaN.
    backfilled = sdf[-2:].reindex(index, method='backfill', limit=5)
    backfill_reference = (
        sdf[-2:].reindex(index).fillna(method='backfill').to_dense())
    backfill_reference.values[:3] = np.nan
    tm.assert_frame_equal(backfilled, backfill_reference.to_sparse())
def test_basic_drop_first(self, sparse):
    """Check ``get_dummies(drop_first=True)`` on three equivalent inputs.

    A list, a Series and a Series indexed by 'A', 'B', 'C' are each
    compared against the same two-column ``uint8`` frame.
    """
    # GH12402 Add a new parameter `drop_first` to avoid collinearity
    # Basic case
    letters = list('abc')
    letter_series = Series(letters)
    labelled_series = Series(letters, list('ABC'))

    wanted = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=np.uint8)

    from_list = get_dummies(letters, drop_first=True, sparse=sparse)
    # Convert once; the sparse form is reused by all three comparisons.
    if sparse:
        wanted = wanted.to_sparse(fill_value=0, kind='integer')
    assert_frame_equal(from_list, wanted)

    from_series = get_dummies(letter_series, drop_first=True,
                              sparse=sparse)
    assert_frame_equal(from_series, wanted)

    wanted.index = list('ABC')
    from_labelled = get_dummies(labelled_series, drop_first=True,
                                sparse=sparse)
    assert_frame_equal(from_labelled, wanted)
def test_str(self):
    """Smoke test: ``str()`` of a large, mostly-NaN sparse frame runs."""
    dense = DataFrame(np.random.randn(10000, 4))
    # Blank out the rows labelled 0 through 9998.
    dense.loc[:9998] = np.nan

    sparse_frame = dense.to_sparse()
    # Only the call matters; the returned text is not inspected.
    str(sparse_frame)
def test_nan_columnname(self):
    """Check that a NaN column label is still NaN after ``to_sparse``."""
    # GH 8822
    dense = DataFrame(Series(1.0, index=[0]), columns=[nan])
    converted = dense.to_sparse()
    first_label = converted.columns[0]
    assert np.isnan(first_label)
def test_str(self):
    """Smoke test: ``str()`` of a large, mostly-NaN sparse frame runs."""
    df = DataFrame(np.random.randn(10000, 4))
    # Label-based assignment through ``.loc`` instead of the deprecated
    # ``.ix`` indexer. The frame has a default integer index, so the
    # slice ``:9998`` selects labels 0 through 9998 inclusive.
    df.loc[:9998] = np.nan

    sdf = df.to_sparse()
    # Only the call matters; the returned text is not inspected.
    str(sdf)
def test_nan_columnname(self):
    """Check that a NaN column label is still NaN after ``to_sparse``."""
    # GH 8822
    dense = DataFrame(Series(1.0, index=[0]), columns=[nan])
    converted = dense.to_sparse()
    first_label = converted.columns[0]
    self.assertTrue(np.isnan(first_label))