def test_basic_types(self):
    """Check the dtypes produced by ``get_dummies`` for several inputs.

    Regression test for GH 10531.
    """
    letters = list('abc')
    letter_series = Series(letters)
    mixed_frame = DataFrame({'a': [0, 1, 0, 1, 2],
                             'b': ['A', 'A', 'B', 'C', 'C'],
                             'c': [2, 3, 3, 3, 2]})

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype='uint8',
                         columns=list('abc'))

    # Pick the comparison helper that matches the requested output kind.
    if self.sparse:
        expected = expected.to_sparse(fill_value=0, kind='integer')
        compare = tm.assert_sp_frame_equal
    else:
        compare = tm.assert_frame_equal

    for data in (letters, letter_series):
        compare(get_dummies(data, sparse=self.sparse), expected)

    # Encoding every column is expected to leave eight uint8 columns.
    all_encoded = get_dummies(mixed_frame, sparse=self.sparse,
                              columns=mixed_frame.columns)
    tm.assert_series_equal(all_encoded.get_dtype_counts(),
                           Series({'uint8': 8}))

    # Encoding only 'a': three uint8 columns plus one int64 and one object.
    only_a = get_dummies(mixed_frame, sparse=self.sparse, columns=['a'])
    expected_counts = Series({'uint8': 3, 'int64': 1,
                              'object': 1}).sort_values()
    tm.assert_series_equal(only_a.get_dtype_counts().sort_values(),
                           expected_counts)
def test_dataframe_dummies_all_obj(self):
    """Encode a frame restricted to columns 'A' and 'B' of ``self.df``."""
    subset = self.df[['A', 'B']]
    indicators = {'A_a': [1, 0, 1],
                  'A_b': [0, 1, 0],
                  'B_b': [1, 1, 0],
                  'B_c': [0, 0, 1]}
    expected = DataFrame(indicators, dtype=np.uint8)
    assert_frame_equal(get_dummies(subset, sparse=self.sparse), expected)
def test_just_na(self): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=['A']) res_list = get_dummies(just_na_list, sparse=self.sparse) res_series = get_dummies(just_na_series, sparse=self.sparse) res_series_index = get_dummies(just_na_series_index, sparse=self.sparse) self.assertEqual(res_list.empty, True) self.assertEqual(res_series.empty, True) self.assertEqual(res_series_index.empty, True) self.assertEqual(res_list.index.tolist(), [0]) self.assertEqual(res_series.index.tolist(), [0]) self.assertEqual(res_series_index.index.tolist(), ['A'])
def test_just_na(self): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=['A']) res_list = get_dummies(just_na_list, sparse=self.sparse) res_series = get_dummies(just_na_series, sparse=self.sparse) res_series_index = get_dummies(just_na_series_index, sparse=self.sparse) assert res_list.empty assert res_series.empty assert res_series_index.empty assert res_list.index.tolist() == [0] assert res_series.index.tolist() == [0] assert res_series_index.index.tolist() == ['A']
def test_basic_drop_first_one_level(self): # Test the case that categorical variable only has one level. s_list = list('aaa') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame(index=np.arange(3)) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) result = get_dummies(s_series, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) expected = DataFrame(index=list('ABC')) result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected)
def test_dataframe_dummies_drop_first(self):
    """With ``drop_first`` only 'A_b' and 'B_c' are expected to remain."""
    subset = self.df[['A', 'B']]
    remaining = {'A_b': [0, 1, 0], 'B_c': [0, 0, 1]}
    expected = DataFrame(remaining, dtype=np.uint8)
    encoded = get_dummies(subset, sparse=self.sparse, drop_first=True)
    assert_frame_equal(encoded, expected)
def test_dataframe_dummies_prefix_str(self):
    """A single string prefix is shared by all encoded columns.

    The expected frame therefore carries the label 'bad_b' twice; this
    is not a recommended way to call ``get_dummies``.
    """
    rows = [[1, 1, 0, 1, 0],
            [2, 0, 1, 1, 0],
            [3, 1, 0, 0, 1]]
    labels = ['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']
    expected = DataFrame(rows, columns=labels, dtype=np.uint8)
    # 'C' is expected as int64, unlike the uint8 indicator columns.
    expected = expected.astype({"C": np.int64})

    result = get_dummies(self.df, prefix='bad', sparse=self.sparse)
    assert_frame_equal(result, expected)
def test_basic(self):
    """One indicator column per distinct value for list and Series input."""
    letters = list('abc')
    expected = DataFrame({'a': {0: 1, 1: 0, 2: 0},
                          'b': {0: 0, 1: 1, 2: 0},
                          'c': {0: 0, 1: 0, 2: 1}},
                         dtype=np.uint8)

    for data in (letters, Series(letters)):
        assert_frame_equal(get_dummies(data, sparse=self.sparse), expected)

    # A labelled Series is expected to carry its labels into the result.
    labelled = Series(letters, list('ABC'))
    expected.index = list('ABC')
    assert_frame_equal(get_dummies(labelled, sparse=self.sparse), expected)
def test_dataframe_dummies_drop_first_with_na(self):
    """Combine ``drop_first`` with NaN data, with and without dummy_na."""
    frame = self.df
    # Set row label 3 to NaN in every column (writes into ``self.df``).
    frame.loc[3, :] = [np.nan] * 3

    indicator_cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
    expected = DataFrame({'C': [1, 2, 3, np.nan],
                          'A_b': [0, 1, 0, 0],
                          'A_nan': [0, 0, 0, 1],
                          'B_c': [0, 0, 1, 0],
                          'B_nan': [0, 0, 0, 1]})
    expected[indicator_cols] = expected[indicator_cols].astype(np.uint8)
    expected = expected[['C'] + indicator_cols]

    with_na = get_dummies(frame, dummy_na=True, sparse=self.sparse,
                          drop_first=True)
    assert_frame_equal(with_na, expected)

    # Without dummy_na the '*_nan' columns are not expected.
    without_na = get_dummies(frame, dummy_na=False, sparse=self.sparse,
                             drop_first=True)
    assert_frame_equal(without_na, expected[['C', 'A_b', 'B_c']])
def test_dataframe_dummies_subset(self):
    """Encode only column 'A' using the prefix 'from_A'."""
    encoded_names = ['from_A_a', 'from_A_b']
    expected = DataFrame({'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'B': ['b', 'b', 'c'],
                          'C': [1, 2, 3]})
    # Only the indicator columns are expected as uint8.
    expected[encoded_names] = expected[encoded_names].astype(np.uint8)

    result = get_dummies(self.df, prefix=['from_A'], columns=['A'],
                         sparse=self.sparse)
    assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_sep(self):
    """Exercise ``prefix_sep`` given as a string, a list and a dict."""
    frame = self.df

    # One separator string applied to every encoded column.
    expected = DataFrame({'C': [1, 2, 3],
                          'A..a': [1, 0, 1],
                          'A..b': [0, 1, 0],
                          'B..b': [1, 1, 0],
                          'B..c': [0, 0, 1]})
    expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    indicator_cols = expected.columns[1:]
    expected[indicator_cols] = expected[indicator_cols].astype(np.uint8)
    result = get_dummies(frame, prefix_sep='..', sparse=self.sparse)
    assert_frame_equal(result, expected)

    # Per-column separators: the list and dict spellings must agree.
    expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
    for separators in (['..', '__'], {'A': '..', 'B': '__'}):
        result = get_dummies(frame, prefix_sep=separators,
                             sparse=self.sparse)
        assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_str(self):
    # Passing one string as prefix is discouraged: the expected columns
    # end up containing 'bad_b' twice.
    result = get_dummies(self.df, prefix='bad', sparse=self.sparse)

    column_labels = ['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']
    data = [[1, 1, 0, 1, 0],
            [2, 0, 1, 1, 0],
            [3, 1, 0, 0, 1]]
    # Build everything as uint8, then widen 'C' to int64.
    expected = DataFrame(data, columns=column_labels,
                         dtype=np.uint8).astype({"C": np.int64})
    assert_frame_equal(result, expected)
def test_dataframe_dummies_mix_default(self):
    """Default encoding of ``self.df``: 'C' first, then the indicators."""
    indicator_cols = ['A_a', 'A_b', 'B_b', 'B_c']
    expected = DataFrame({'C': [1, 2, 3],
                          'A_a': [1, 0, 1],
                          'A_b': [0, 1, 0],
                          'B_b': [1, 1, 0],
                          'B_c': [0, 0, 1]})
    expected[indicator_cols] = expected[indicator_cols].astype(np.uint8)
    expected = expected[['C'] + indicator_cols]

    result = get_dummies(self.df, sparse=self.sparse)
    assert_frame_equal(result, expected)
def test_dataframe_dummies_all_obj(self):
    # Only columns 'A' and 'B' of the fixture frame are encoded here.
    result = get_dummies(self.df[['A', 'B']], sparse=self.sparse)

    names = ['A_a', 'A_b', 'B_b', 'B_c']
    flags = [[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]]
    expected = DataFrame(dict(zip(names, flags)), dtype=np.uint8)
    assert_frame_equal(result, expected)
def test_dataframe_dummies_drop_first_with_categorical(self):
    """``drop_first`` on a frame that also has a Categorical column."""
    frame = self.df
    # Adds the column to ``self.df`` itself (same object).
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    indicator_cols = ['A_b', 'B_c', 'cat_y']
    expected = DataFrame({'C': [1, 2, 3],
                          'A_b': [0, 1, 0],
                          'B_c': [0, 0, 1],
                          'cat_y': [0, 1, 1]})
    expected[indicator_cols] = expected[indicator_cols].astype(np.uint8)
    expected = expected[['C'] + indicator_cols]

    result = get_dummies(frame, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)
def test_include_na(self):
    """NaN handling of ``get_dummies`` with and without ``dummy_na``."""
    values = ['a', 'b', np.nan]

    # Default: no column is expected for the NaN entry.
    plain = get_dummies(values, sparse=self.sparse)
    expected_plain = DataFrame({'a': {0: 1, 1: 0, 2: 0},
                                'b': {0: 0, 1: 1, 2: 0}},
                               dtype=np.uint8)
    assert_frame_equal(plain, expected_plain)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    with_na = get_dummies(values, dummy_na=True, sparse=self.sparse)
    expected_with_na = DataFrame({nan: {0: 0, 1: 0, 2: 1},
                                  'a': {0: 1, 1: 0, 2: 0},
                                  'b': {0: 0, 1: 1, 2: 0}},
                                 dtype=np.uint8)
    expected_with_na = expected_with_na.reindex_axis(['a', 'b', nan], 1)
    # hack (NaN handling in assert_index_equal): reuse the result's
    # column index so only the data is effectively compared.
    expected_with_na.columns = with_na.columns
    assert_frame_equal(with_na, expected_with_na)

    # NaN-only input with dummy_na: compare raw values only.
    only_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
    expected_only_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                 dtype=np.uint8)
    tm.assert_numpy_array_equal(only_na.values, expected_only_na.values)
def test_dataframe_dummies_preserve_categorical_dtype(self):
    """Result columns keep the categories and orderedness (GH13854)."""
    indicator_values = [[1, 0, 0], [0, 1, 0]]
    for ordered in (False, True):
        cat = pd.Categorical(list("xy"), categories=list("xyz"),
                             ordered=ordered)
        result = get_dummies(cat)

        # Expected columns: a CategoricalIndex over all three categories,
        # including 'z', which does not occur in the values.
        columns = pd.CategoricalIndex(cat.categories,
                                      categories=cat.categories,
                                      ordered=ordered)
        expected = DataFrame(np.array(indicator_values, dtype=np.uint8),
                             columns=columns)
        tm.assert_frame_equal(result, expected)
def test_basic_drop_first(self):
    """Basic ``drop_first`` case: column 'a' is absent from the result."""
    letters = list('abc')
    expected = DataFrame({'b': {0: 0, 1: 1, 2: 0},
                          'c': {0: 0, 1: 0, 2: 1}},
                         dtype=np.uint8)

    for data in (letters, Series(letters)):
        encoded = get_dummies(data, sparse=self.sparse, drop_first=True)
        assert_frame_equal(encoded, expected)

    # Same check for a Series carrying explicit labels.
    labelled = Series(letters, list('ABC'))
    expected.index = list('ABC')
    encoded = get_dummies(labelled, sparse=self.sparse, drop_first=True)
    assert_frame_equal(encoded, expected)
def test_dataframe_dummies_prefix_dict(self):
    """Prefixes supplied as a ``{column: prefix}`` mapping."""
    frame = DataFrame({'A': ['a', 'b', 'a'],
                       'B': ['b', 'b', 'c'],
                       'C': [1, 2, 3]})
    prefix_by_column = {'A': 'from_A', 'B': 'from_B'}

    indicator_cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected = DataFrame({'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'from_B_b': [1, 1, 0],
                          'from_B_c': [0, 0, 1],
                          'C': [1, 2, 3]})
    expected[indicator_cols] = expected[indicator_cols].astype(np.uint8)

    result = get_dummies(frame, prefix=prefix_by_column,
                         sparse=self.sparse)
    assert_frame_equal(result, expected)
def test_basic_types(self):
    # GH 10531: dtype bookkeeping of get_dummies for list, Series and
    # DataFrame input.
    sparse = self.sparse
    values = list('abc')
    frame = DataFrame({
        'a': [0, 1, 0, 1, 2],
        'b': ['A', 'A', 'B', 'C', 'C'],
        'c': [2, 3, 3, 3, 2],
    })

    expected = DataFrame(
        {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
        dtype='uint8',
        columns=list('abc'))
    # Dense comparison by default; switched to the sparse variant below.
    compare = tm.assert_frame_equal
    if sparse:
        expected = expected.to_sparse(fill_value=0, kind='integer')
        compare = tm.assert_sp_frame_equal

    from_list = get_dummies(values, sparse=sparse)
    compare(from_list, expected)
    from_series = get_dummies(Series(values), sparse=sparse)
    compare(from_series, expected)

    # All columns encoded: only uint8 columns are expected (8 of them).
    dtype_counts = get_dummies(frame, sparse=sparse,
                               columns=frame.columns).get_dtype_counts()
    tm.assert_series_equal(dtype_counts, Series({'uint8': 8}))

    # Only 'a' encoded: compare dtype counts after sorting both sides.
    dtype_counts = get_dummies(frame, sparse=sparse,
                               columns=['a']).get_dtype_counts()
    expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values()
    tm.assert_series_equal(dtype_counts.sort_values(), expected)
def test_basic_drop_first_NA(self):
    """NA handling in combination with ``drop_first``."""
    values = ['a', 'b', np.nan]

    # Without dummy_na only column 'b' is expected.
    expected = DataFrame({'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8)
    result = get_dummies(values, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

    # With dummy_na a NaN-labelled column follows 'b'.
    expected_na = DataFrame({'b': {0: 0, 1: 1, 2: 0},
                             nan: {0: 0, 1: 0, 2: 1}},
                            dtype=np.uint8)
    expected_na = expected_na.reindex_axis(['b', nan], 1)
    result_na = get_dummies(values, dummy_na=True, sparse=self.sparse,
                            drop_first=True)
    assert_frame_equal(result_na, expected_na)

    # NaN-only input: a one-row frame without columns is expected.
    expected_only_na = DataFrame(index=np.arange(1))
    result_only_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
                                 drop_first=True)
    assert_frame_equal(result_only_na, expected_only_na)
def test_unicode(self):
    """Non-ASCII values must be encodable (see GH 6885)."""
    import unicodedata

    plain = 'e'
    accented = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    values = [plain, accented, accented]

    # The accented letter becomes part of an expected column label.
    accented_label = u('letter_%s') % accented
    expected = DataFrame({'letter_e': {0: 1, 1: 0, 2: 0},
                          accented_label: {0: 0, 1: 1, 2: 1}},
                         dtype=np.uint8)

    result = get_dummies(values, prefix='letter', sparse=self.sparse)
    assert_frame_equal(result, expected)
def make_dummy_columns(mdc_df, column_name, prefix='', append_columns=None):
    """Join dummy columns for ``column_name`` onto ``mdc_df``.

    Parameters
    ----------
    mdc_df : frame-like
        Object supporting ``mdc_df[column_name]`` and ``.join``.
    column_name : label
        Column whose values are passed to ``reshape.get_dummies``.
    prefix : str, default ''
        Forwarded as ``prefix`` to ``reshape.get_dummies``.
    append_columns : list, optional
        Collector handed to ``arr_append`` together with the new dummy
        column labels.  When omitted a fresh list is used for each call.

    Returns
    -------
    The result of ``mdc_df.join(dummies)``.
    """
    # A literal ``[]`` default would be one list shared by every call and
    # would keep growing; create a new one per call instead.
    if append_columns is None:
        append_columns = []
    dummies = reshape.get_dummies(mdc_df[column_name], prefix=prefix)
    arr_append(append_columns, dummies.columns)
    return mdc_df.join(dummies)
def test_dataframe_dummies_prefix_bad_length(self):
    """A one-element prefix list for ``self.df`` must raise ValueError."""
    too_short = ['too few']
    with pytest.raises(ValueError):
        get_dummies(self.df, prefix=too_short, sparse=self.sparse)
def test_dataframe_dummies_drop_first(self):
    # Encode columns 'A' and 'B' only, with drop_first enabled.
    result = get_dummies(self.df[['A', 'B']], sparse=self.sparse,
                         drop_first=True)

    names = ['A_b', 'B_c']
    flags = [[0, 1, 0], [0, 0, 1]]
    expected = DataFrame(dict(zip(names, flags)), dtype=np.uint8)
    assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_sep_bad_length(self):
    """A one-element ``prefix_sep`` list for ``self.df`` must raise."""
    bad_separators = ['bad']
    with pytest.raises(ValueError):
        get_dummies(self.df, prefix_sep=bad_separators, sparse=self.sparse)
def test_dataframe_dummies_prefix_bad_length(self):
    # ValueError is expected for this one-element prefix list.
    short_prefix = ['too few']
    with tm.assertRaises(ValueError):
        get_dummies(self.df, prefix=short_prefix, sparse=self.sparse)
def test_dataframe_dummies_prefix_sep_bad_length(self):
    # ValueError is expected for this one-element prefix_sep list.
    lone_separator = ['bad']
    with tm.assertRaises(ValueError):
        get_dummies(self.df, prefix_sep=lone_separator, sparse=self.sparse)