def test_size(self): grouped = self.df.groupby(['A', 'B']) result = grouped.size() for key, group in grouped: assert result[key] == len(group) grouped = self.df.groupby('A') result = grouped.size() for key, group in grouped: assert result[key] == len(group) grouped = self.df.groupby('B') result = grouped.size() for key, group in grouped: assert result[key] == len(group) df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): left = df.groupby(key, sort=sort).size() right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) assert_series_equal(left, right, check_names=False) # GH11699 df = DataFrame([], columns=['A', 'B']) out = Series([], dtype='int64', index=Index([], name='A')) assert_series_equal(df.groupby('A').size(), out)
def test_size(df): grouped = df.groupby(['A', 'B']) result = grouped.size() for key, group in grouped: assert result[key] == len(group) grouped = df.groupby('A') result = grouped.size() for key, group in grouped: assert result[key] == len(group) grouped = df.groupby('B') result = grouped.size() for key, group in grouped: assert result[key] == len(group) df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): left = df.groupby(key, sort=sort).size() right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) tm.assert_series_equal(left, right, check_names=False) # GH11699 df = DataFrame([], columns=['A', 'B']) out = Series([], dtype='int64', index=Index([], name='A')) tm.assert_series_equal(df.groupby('A').size(), out)
def test_xs_level_multiple(self): text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" df = read_csv(StringIO(text), sep=r'\s+', engine='python') result = df.xs(('a', 4), level=['one', 'four']) expected = df.xs('a').xs(4, level='four') tm.assert_frame_equal(result, expected) # this is a copy in 0.14 result = df.xs(('a', 4), level=['one', 'four']) # setting this will give a SettingWithCopyError # as we are trying to write a view def f(x): x[:] = 10 pytest.raises(com.SettingWithCopyError, f, result) # GH2107 dates = lrange(20111201, 20111205) ids = 'abcde' idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) idx.names = ['date', 'secid'] df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) rs = df.xs(20111201, level='date') xp = df.loc[20111201, :] tm.assert_frame_equal(rs, xp)
def test_series_groupby_nunique(self): def check_nunique(df, keys, as_index=True): for sort, dropna in cart_product((False, True), repeat=2): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10) for n, m in cart_product(10**np.arange(2, 6), (10, 100, 1000)): frame = DataFrame({ 'jim': np.random.choice(list(ascii_lowercase), n), 'joe': np.random.choice(days, n), 'julie': np.random.randint(0, m, n) }) check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) frame.loc[1::17, 'jim'] = None frame.loc[3::37, 'joe'] = None frame.loc[7::19, 'julie'] = None frame.loc[8::19, 'julie'] = None frame.loc[9::19, 'julie'] = None check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) check_nunique(frame, ['jim'], as_index=False) check_nunique(frame, ['jim', 'joe'], as_index=False)
def test_xs_level_multiple(self): text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" df = read_csv(StringIO(text), sep=r'\s+', engine='python') result = df.xs(('a', 4), level=['one', 'four']) expected = df.xs('a').xs(4, level='four') tm.assert_frame_equal(result, expected) # this is a copy in 0.14 result = df.xs(('a', 4), level=['one', 'four']) # setting this will give a SettingWithCopyError # as we are trying to write a view def f(x): x[:] = 10 pytest.raises(com.SettingWithCopyError, f, result) # GH2107 dates = lrange(20111201, 20111205) ids = 'abcde' idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) idx.names = ['date', 'secid'] df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) rs = df.xs(20111201, level='date') xp = df.loc[20111201, :] tm.assert_frame_equal(rs, xp)
def check_nunique(df, keys, as_index=True): for sort, dropna in cart_product((False, True), repeat=2): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) assert_series_equal(left, right, check_names=False)
def check_nunique(df, keys, as_index=True): for sort, dropna in cart_product((False, True), repeat=2): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) assert_series_equal(left, right, check_names=False)
def test_xs_integer_key(): # see gh-2107 dates = lrange(20111201, 20111205) ids = 'abcde' index = MultiIndex.from_tuples([x for x in cart_product(dates, ids)], names=['date', 'secid']) df = DataFrame(np.random.randn(len(index), 3), index, ['X', 'Y', 'Z']) result = df.xs(20111201, level='date') expected = df.loc[20111201, :] tm.assert_frame_equal(result, expected)
def test_ngroup_cumcount_pair(self): # brute force comparison for all small series for p in cart_product(range(3), repeat=4): df = DataFrame({'a': p}) g = df.groupby(['a']) order = sorted(set(p)) ngroupd = [order.index(val) for val in p] cumcounted = [p[:i].count(val) for i, val in enumerate(p)] assert_series_equal(g.ngroup(), Series(ngroupd)) assert_series_equal(g.cumcount(), Series(cumcounted))
def test_xs_integer_key(): # see gh-2107 dates = lrange(20111201, 20111205) ids = 'abcde' index = MultiIndex.from_tuples( [x for x in cart_product(dates, ids)], names=['date', 'secid']) df = DataFrame( np.random.randn(len(index), 3), index, ['X', 'Y', 'Z']) result = df.xs(20111201, level='date') expected = df.loc[20111201, :] tm.assert_frame_equal(result, expected)
def test_series_groupby_nunique(self): def check_nunique(df, keys, as_index=True): for sort, dropna in cart_product((False, True), repeat=2): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10) for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): frame = DataFrame({ 'jim': np.random.choice( list(ascii_lowercase), n), 'joe': np.random.choice(days, n), 'julie': np.random.randint(0, m, n) }) check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) frame.loc[1::17, 'jim'] = None frame.loc[3::37, 'joe'] = None frame.loc[7::19, 'julie'] = None frame.loc[8::19, 'julie'] = None frame.loc[9::19, 'julie'] = None check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) check_nunique(frame, ['jim'], as_index=False) check_nunique(frame, ['jim', 'joe'], as_index=False)
df[df.PRICE == 24990].VOLUME.describe().values.tolist(), df[df.PRICE == 25499].VOLUME.describe().values.tolist() ] expected = pd.DataFrame( data, index=pd.Index([24990, 25499], name='PRICE'), columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) tm.assert_frame_equal(result, expected) # nunique # -------------------------------- @pytest.mark.parametrize("n, m", cart_product(10**np.arange(2, 6), (10, 100, 1000))) @pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2)) def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10)
df = pd.DataFrame({'PRICE': prices, 'VOLUME': volumes}) result = df.groupby('PRICE').VOLUME.describe() data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), df[df.PRICE == 25499].VOLUME.describe().values.tolist()] expected = pd.DataFrame(data, index=pd.Index([24990, 25499], name='PRICE'), columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) tm.assert_frame_equal(result, expected) # nunique # -------------------------------- @pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6), (10, 100, 1000))) @pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2)) def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10)