def test_group_selection_cache():
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    expected = df.iloc[[0, 2]].set_index('A')

    g = df.groupby('A')
    result1 = g.head(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.head(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)
def test_nunique(self):
    df = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })

    expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    result = df.groupby('A', as_index=False).nunique()
    tm.assert_frame_equal(result, expected)

    # as_index
    expected.index = list('abc')
    expected.index.name = 'A'
    result = df.groupby('A').nunique()
    tm.assert_frame_equal(result, expected)

    # with na
    result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
    tm.assert_frame_equal(result, expected)

    # dropna
    expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                         index=list('abc'))
    expected.index.name = 'A'
    result = df.replace({'x': None}).groupby('A').nunique()
    tm.assert_frame_equal(result, expected)
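# For reference, a minimal standalone sketch of the `nunique` behavior the
# test above exercises (the data here is illustrative, not from the suite):
import pandas as pd

df_demo = pd.DataFrame({'A': list('abbacc'), 'B': list('abxacc')})
# Number of distinct values per column within each group
print(df_demo.groupby('A').nunique())
# With dropna=False, a missing value counts as one more distinct value
print(df_demo.replace({'x': None}).groupby('A').nunique(dropna=False))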
def test_groupby_groups_datetimeindex(self):
    # GH#1430
    periods = 1000
    ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)
    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

    # it works!
    groups = grouped.groups
    assert isinstance(list(groups.keys())[0], datetime)

    # GH#11442
    index = pd.date_range('2015/01/01', periods=5, name='date')
    df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                       'B': [1, 2, 3, 4, 5]}, index=index)
    result = df.groupby(level='date').groups
    dates = ['2015-01-05', '2015-01-04', '2015-01-03',
             '2015-01-02', '2015-01-01']
    expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                for date in dates}
    tm.assert_dict_equal(result, expected)

    grouped = df.groupby(level='date')
    for date in dates:
        result = grouped.get_group(date)
        data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
        expected_index = pd.DatetimeIndex([date], name='date')
        expected = pd.DataFrame(data,
                                columns=list('AB'),
                                index=expected_index)
        tm.assert_frame_equal(result, expected)
def test_crosstab_margins(self):
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)

    df = DataFrame({'a': a, 'b': b, 'c': c})

    result = crosstab(a, [b, c], rownames=['a'],
                      colnames=('b', 'c'), margins=True)

    self.assertEqual(result.index.names, ('a',))
    self.assertEqual(result.columns.names, ['b', 'c'])

    all_cols = result['All', '']
    exp_cols = df.groupby(['a']).size().astype('i8')
    exp_cols = exp_cols.append(Series([len(df)], index=['All']))

    tm.assert_series_equal(all_cols, exp_cols)

    all_rows = result.ix['All']
    exp_rows = df.groupby(['b', 'c']).size().astype('i8')
    exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))

    exp_rows = exp_rows.reindex(all_rows.index)
    exp_rows = exp_rows.fillna(0).astype(np.int64)
    tm.assert_series_equal(all_rows, exp_rows)
def test_resample_timegrouper():
    # GH 7227
    dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
              datetime(2014, 11, 5), datetime(2014, 9, 5),
              datetime(2014, 10, 8), datetime(2014, 7, 15)]

    dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
    dates3 = [pd.NaT] + dates1 + [pd.NaT]

    for dates in [dates1, dates2, dates3]:
        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
                                    '2014-10-31', '2014-11-30'],
                                   freq='M', name='A')
        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)

        df = DataFrame(dict(A=dates, B=np.arange(len(dates)),
                            C=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()

        expected = DataFrame({'B': [1, 0, 2, 2, 1],
                              'C': [1, 0, 2, 2, 1]},
                             index=exp_idx, columns=['B', 'C'])
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)
class Size(object):

    def setup(self):
        n = 10**5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                             'key2': np.random.randint(0, 100, size=n),
                             'value1': np.random.randn(n),
                             'value2': np.random.randn(n),
                             'value3': np.random.randn(n),
                             'dates': dates})
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        self.df.groupby(['key1', 'key2']).size()

    def time_dt_timegrouper_size(self):
        with warnings.catch_warnings(record=True):
            self.df.groupby(TimeGrouper(key='dates', freq='M')).size()

    def time_category_size(self):
        self.draws.groupby(self.cats).size()
def test_preserve_categorical_dtype():
    # GH13743, GH13854
    df = DataFrame({'A': [1, 2, 1, 1, 2],
                    'B': [10, 16, 22, 28, 34],
                    'C1': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=False),
                    'C2': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=True)})
    # single grouper
    exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
                          'B': [25.0, 20.0, np.nan],
                          'C1': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=False),
                          'C2': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=True)})
    for col in ['C1', 'C2']:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean()
        result2 = df.groupby(
            by=col, as_index=True, observed=False).mean().reset_index()
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
def test_aggregate_normal(resample_method):
    """Check that TimeGrouper's aggregation is identical to a normal groupby."""

    if resample_method == 'ohlc':
        pytest.xfail(reason='DataError: No numeric types to aggregate')

    data = np.random.randn(20, 4)
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, 3, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
                    datetime(2013, 1, 3), datetime(2013, 1, 4),
                    datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    expected = getattr(normal_grouped, resample_method)()
    dt_result = getattr(dt_grouped, resample_method)()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    tm.assert_equal(expected, dt_result)

    # if TimeGrouper is used, 'nth' doesn't work yet
def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
            ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
            ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
            ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
            ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]

    df = DataFrame(data, columns=['date', 'id', 'score'])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index('date').resample('D').asfreq()

    expected = df.groupby('id').apply(f)
    result = df.set_index('date').groupby('id').resample('D').asfreq()
    assert_frame_equal(result, expected)

    df = DataFrame({'date': pd.date_range(start='2016-01-01',
                                          periods=4, freq='W'),
                    'group': [1, 1, 2, 2],
                    'val': [5, 6, 7, 8]}).set_index('date')

    def f(x):
        return x.resample('1D').ffill()

    expected = df.groupby('group').apply(f)
    result = df.groupby('group').resample('1D').ffill()
    assert_frame_equal(result, expected)
def test_preserve_categories():
    # GH-13179
    categories = list('abc')

    # ordered=True
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=True)})
    index = pd.CategoricalIndex(categories, categories, ordered=True)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index, index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index, index)

    # ordered=False
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=False)})
    sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
    nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
                                       ordered=False)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index,
        sort_index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index,
        nosort_index)
def test_cython_agg_nothing_to_agg_with_dates(self):
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': ['foo', 'bar'] * 25,
                       'dates': pd.date_range('now', periods=50, freq='T')})
    with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"):
        frame.groupby('b').dates.mean()
def test_rank_object_raises(ties_method, ascending, na_option, pct, vals):
    df = DataFrame({'key': ['foo'] * 5, 'val': vals})

    with tm.assert_raises_regex(TypeError, "not callable"):
        df.groupby('key').rank(method=ties_method,
                               ascending=ascending,
                               na_option=na_option, pct=pct)
class LogAggregate:

    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]

    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
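# A minimal usage sketch of LogAggregate; the records and column names below
# are invented for illustration, and kept numeric so that median/mean work on
# every column:
records = [{'server_id': 1, 'latency': 10},
           {'server_id': 1, 'latency': 30},
           {'server_id': 2, 'latency': 20}]
agg = LogAggregate(records)
print(agg.get_median(key='latency'))                        # overall median
print(agg.get_median(key='latency', group_by='server_id'))  # median per group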
def test_agg_datetimes_mixed():
    data = [[1, '2012-01-01', 1.0],
            [2, '2012-01-02', 2.0],
            [3, None, 3.0]]

    df1 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    data = [[row[0],
             (dt.datetime.strptime(row[1], '%Y-%m-%d').date()
              if row[1] else None),
             row[2]] for row in data]

    df2 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    df1['weights'] = df1['value'] / df1['value'].sum()
    gb1 = df1.groupby('date').aggregate(np.sum)

    df2['weights'] = df1['value'] / df1['value'].sum()
    gb2 = df2.groupby('date').aggregate(np.sum)

    assert (len(gb1) == len(gb2))
def test_groupby_categorical_index_and_columns(self, observed):
    # GH18432, adapted for GH25871
    columns = ['A', 'B', 'A', 'B']
    categories = ['B', 'A']
    data = np.array([[1, 2, 1, 2],
                     [1, 2, 1, 2],
                     [1, 2, 1, 2],
                     [1, 2, 1, 2],
                     [1, 2, 1, 2]], int)
    cat_columns = CategoricalIndex(columns,
                                   categories=categories,
                                   ordered=True)
    df = DataFrame(data=data, columns=cat_columns)
    result = df.groupby(axis=1, level=0, observed=observed).sum()
    expected_data = np.array([[4, 2],
                              [4, 2],
                              [4, 2],
                              [4, 2],
                              [4, 2]], int)
    expected_columns = CategoricalIndex(categories,
                                        categories=categories,
                                        ordered=True)
    expected = DataFrame(data=expected_data, columns=expected_columns)
    assert_frame_equal(result, expected)

    # test transposed version
    df = DataFrame(data.T, index=cat_columns)
    result = df.groupby(axis=0, level=0, observed=observed).sum()
    expected = DataFrame(data=expected_data.T, index=expected_columns)
    assert_frame_equal(result, expected)
def test_ops_general():
    ops = [('mean', np.mean),
           ('median', np.median),
           ('std', np.std),
           ('var', np.var),
           ('sum', np.sum),
           ('prod', np.prod),
           ('min', np.min),
           ('max', np.max),
           ('first', lambda x: x.iloc[0]),
           ('last', lambda x: x.iloc[-1]),
           ('count', np.size), ]
    try:
        from scipy.stats import sem
    except ImportError:
        pass
    else:
        ops.append(('sem', sem))

    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    for op, targop in ops:
        result = getattr(df.groupby(labels), op)().astype(float)
        expected = df.groupby(labels).agg(targop)
        try:
            tm.assert_frame_equal(result, expected)
        except BaseException as exc:
            exc.args += ('operation: %s' % op, )
            raise
def test_multi_iter_frame(self, three_group):
    k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
    k2 = np.array(['1', '2', '1', '2', '1', '2'])
    df = DataFrame({'v1': np.random.randn(6),
                    'v2': np.random.randn(6),
                    'k1': k1, 'k2': k2},
                   index=['one', 'two', 'three', 'four', 'five', 'six'])

    grouped = df.groupby(['k1', 'k2'])

    # things get sorted!
    iterated = list(grouped)
    idx = df.index
    expected = [('a', '1', df.loc[idx[[4]]]),
                ('a', '2', df.loc[idx[[3, 5]]]),
                ('b', '1', df.loc[idx[[0, 2]]]),
                ('b', '2', df.loc[idx[[1]]])]
    for i, ((one, two), three) in enumerate(iterated):
        e1, e2, e3 = expected[i]
        assert e1 == one
        assert e2 == two
        assert_frame_equal(three, e3)

    # don't iterate through groups with no data
    df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
    df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
    grouped = df.groupby(['k1', 'k2'])
    groups = {key: gp for key, gp in grouped}
    assert len(groups) == 2

    # axis = 1
    three_levels = three_group.groupby(['A', 'B', 'C']).mean()
    grouped = three_levels.T.groupby(axis=1, level=(1, 2))
    for key, group in grouped:
        pass
def test_aggregate_with_nat(func, fill_value):
    # check that TimeGrouper's aggregation is identical to a normal groupby
    # if NaT is included; 'var', 'std', 'mean', 'first', 'last'
    # and 'nth' don't work yet

    n = 20
    data = np.random.randn(n, 4).astype('int64')
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
                    datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    normal_result = getattr(normal_grouped, func)()
    dt_result = getattr(dt_grouped, func)()

    pad = DataFrame([[fill_value] * 4], index=[3],
                    columns=['A', 'B', 'C', 'D'])
    expected = normal_result.append(pad)
    expected = expected.sort_index()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == 'key'
def test__cython_agg_general(op, targop):
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = df.groupby(labels)._cython_agg_general(op)
    expected = df.groupby(labels).agg(targop)

    tm.assert_frame_equal(result, expected)
def test_rank_apply():
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame({'value': np.random.randn(500),
                    'key1': lev1.take(lab1),
                    'key2': lev2.take(lab2)})

    result = df.groupby(['key1', 'key2']).value.rank()

    expected = []
    for key, piece in df.groupby(['key1', 'key2']):
        expected.append(piece.value.rank())
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    result = df.groupby(['key1', 'key2']).value.rank(pct=True)

    expected = []
    for key, piece in df.groupby(['key1', 'key2']):
        expected.append(piece.value.rank(pct=True))
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)
def test_cython_agg_nothing_to_agg_with_dates():
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': ['foo', 'bar'] * 25,
                       'dates': pd.date_range('now', periods=50, freq='T')})
    msg = "No numeric types to aggregate"
    with pytest.raises(DataError, match=msg):
        frame.groupby('b').dates.mean()
def setup(self, dtype, method, application):
    if method in method_blacklist.get(dtype, {}):
        raise NotImplementedError  # skip benchmark
    ngroups = 1000
    size = ngroups * 2
    rng = np.arange(ngroups)
    values = rng.take(np.random.randint(0, ngroups, size=size))
    if dtype == 'int':
        key = np.random.randint(0, size, size=size)
    elif dtype == 'float':
        key = np.concatenate([np.random.random(ngroups) * 0.1,
                              np.random.random(ngroups) * 10.0])
    elif dtype == 'object':
        key = ['foo'] * size
    elif dtype == 'datetime':
        key = date_range('1/1/2011', periods=size, freq='s')

    df = DataFrame({'values': values, 'key': key})

    if application == 'transform':
        if method == 'describe':
            raise NotImplementedError

        self.as_group_method = lambda: df.groupby(
            'key')['values'].transform(method)
        self.as_field_method = lambda: df.groupby(
            'values')['key'].transform(method)
    else:
        self.as_group_method = getattr(df.groupby('key')['values'], method)
        self.as_field_method = getattr(df.groupby('values')['key'], method)
def test_crosstab_margins(self):
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)

    df = DataFrame({"a": a, "b": b, "c": c})

    result = crosstab(a, [b, c], rownames=["a"],
                      colnames=("b", "c"), margins=True)

    self.assertEqual(result.index.names, ("a",))
    self.assertEqual(result.columns.names, ["b", "c"])

    all_cols = result["All", ""]
    exp_cols = df.groupby(["a"]).size().astype("i8")
    exp_cols = exp_cols.append(Series([len(df)], index=["All"]))
    exp_cols.name = ("All", "")

    tm.assert_series_equal(all_cols, exp_cols)

    all_rows = result.ix["All"]
    exp_rows = df.groupby(["b", "c"]).size().astype("i8")
    exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")]))
    exp_rows.name = "All"

    exp_rows = exp_rows.reindex(all_rows.index)
    exp_rows = exp_rows.fillna(0).astype(np.int64)
    tm.assert_series_equal(all_rows, exp_rows)
def test_timegrouper_with_reg_groups_freq(self, freq):
    # GH 6764 multiple grouping with/without sort
    df = DataFrame({
        'date': pd.to_datetime([
            '20121002', '20121007', '20130130', '20130202', '20130305',
            '20121002', '20121207', '20130130', '20130202', '20130305',
            '20130202', '20130305'
        ]),
        'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
        'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                       359, 801],
        'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
    }).set_index('date')

    expected = (
        df.groupby('user_id')['whole_cost']
          .resample(freq)
          .sum(min_count=1)  # XXX
          .dropna()
          .reorder_levels(['date', 'user_id'])
          .sort_index()
          .astype('int64')
    )
    expected.name = 'whole_cost'

    result1 = df.sort_index().groupby(
        [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
    assert_series_equal(result1, expected)

    result2 = df.groupby(
        [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
    assert_series_equal(result2, expected)
def test_groupby_corner(self):
    midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
                      labels=[[0], [0], [0]],
                      names=['one', 'two', 'three'])
    df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'],
                   index=midx)
    # should work
    df.groupby(level='three')
def test_groupby_categorical_index_and_columns(self, observed):
    # GH18432
    columns = ['A', 'B', 'A', 'B']
    categories = ['B', 'A']
    data = np.ones((5, 4), int)
    cat_columns = CategoricalIndex(columns,
                                   categories=categories,
                                   ordered=True)
    df = DataFrame(data=data, columns=cat_columns)
    result = df.groupby(axis=1, level=0, observed=observed).sum()
    expected_data = 2 * np.ones((5, 2), int)

    if observed:
        # if we are not-observed we undergo a reindex
        # so need to adjust the output as our expected sets us up
        # to be non-observed
        expected_columns = CategoricalIndex(['A', 'B'],
                                            categories=categories,
                                            ordered=True)
    else:
        expected_columns = CategoricalIndex(categories,
                                            categories=categories,
                                            ordered=True)
    expected = DataFrame(data=expected_data, columns=expected_columns)
    assert_frame_equal(result, expected)

    # test transposed version
    df = DataFrame(data.T, index=cat_columns)
    result = df.groupby(axis=0, level=0, observed=observed).sum()
    expected = DataFrame(data=expected_data.T, index=expected_columns)
    assert_frame_equal(result, expected)
def test_groupby_as_index_apply(df):
    # GH #4648 and #3417
    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'user_id': [1, 2, 1, 1, 3, 1],
                    'time': range(6)})

    g_as = df.groupby('user_id', as_index=True)
    g_not_as = df.groupby('user_id', as_index=False)

    res_as = g_as.head(2).index
    res_not_as = g_not_as.head(2).index
    exp = Index([0, 1, 2, 4])
    tm.assert_index_equal(res_as, exp)
    tm.assert_index_equal(res_not_as, exp)

    res_as_apply = g_as.apply(lambda x: x.head(2)).index
    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1),
                                               (2, 4)])
    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
    exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])

    tm.assert_index_equal(res_as_apply, exp_as_apply)
    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

    ind = Index(list('abcde'))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    res = df.groupby(0, as_index=False).apply(lambda x: x).index
    tm.assert_index_equal(res, ind)
def test_size(self):
    grouped = self.df.groupby(['A', 'B'])
    result = grouped.size()
    for key, group in grouped:
        assert result[key] == len(group)

    grouped = self.df.groupby('A')
    result = grouped.size()
    for key, group in grouped:
        assert result[key] == len(group)

    grouped = self.df.groupby('B')
    result = grouped.size()
    for key, group in grouped:
        assert result[key] == len(group)

    df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
    for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
        left = df.groupby(key, sort=sort).size()
        right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
        assert_series_equal(left, right, check_names=False)

    # GH11699
    df = DataFrame([], columns=['A', 'B'])
    out = Series([], dtype='int64', index=Index([], name='A'))
    assert_series_equal(df.groupby('A').size(), out)
def test_cython_agg_boolean(self):
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': np.random.randint(0, 2, 50).astype('bool')})
    result = frame.groupby('a')['b'].mean()
    expected = frame.groupby('a')['b'].agg(np.mean)

    assert_series_equal(result, expected)
def test_groupby_max_datetime64(self):
    # GH 5869
    # datetimelike dtype conversion from int
    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
    expected = df.groupby('A')['A'].apply(lambda x: x.max())
    result = df.groupby('A')['A'].max()
    assert_series_equal(result, expected)
def test_multiindex_columns_empty_level(self):
    lst = [['count', 'values'], ['to filter', '']]
    midx = MultiIndex.from_tuples(lst)

    df = DataFrame([[1, 'A']], columns=midx)

    grouped = df.groupby('to filter').groups
    assert grouped['A'] == [0]

    grouped = df.groupby([('to filter', '')]).groups
    assert grouped['A'] == [0]

    df = DataFrame([[1, 'A'], [2, 'B']], columns=midx)

    expected = df.groupby('to filter').groups
    result = df.groupby([('to filter', '')]).groups
    assert result == expected

    df = DataFrame([[1, 'A'], [2, 'A']], columns=midx)

    expected = df.groupby('to filter').groups
    result = df.groupby([('to filter', '')]).groups
    tm.assert_dict_equal(result, expected)
class Nth:

    param_names = ["dtype"]
    params = ["float32", "float64", "datetime", "object"]

    def setup(self, dtype):
        N = 10**5
        # with datetimes (GH7555)
        if dtype == "datetime":
            values = date_range("1/1/2011", periods=N, freq="s")
        elif dtype == "object":
            values = ["foo"] * N
        else:
            values = np.arange(N).astype(dtype)

        key = np.arange(N)
        self.df = DataFrame({"key": key, "values": values})
        self.df.iloc[1, 1] = np.nan  # insert missing data

    def time_frame_nth_any(self, dtype):
        self.df.groupby("key").nth(0, dropna="any")

    def time_groupby_nth_all(self, dtype):
        self.df.groupby("key").nth(0, dropna="all")

    def time_frame_nth(self, dtype):
        self.df.groupby("key").nth(0)

    def time_series_nth_any(self, dtype):
        self.df["values"].groupby(self.df["key"]).nth(0, dropna="any")

    def time_series_nth_all(self, dtype):
        self.df["values"].groupby(self.df["key"]).nth(0, dropna="all")

    def time_series_nth(self, dtype):
        self.df["values"].groupby(self.df["key"]).nth(0)
def test_mangled(self):
    df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
    result = df.groupby("A").agg(b=("B", lambda x: 0),
                                 c=("C", lambda x: 1))
    expected = DataFrame({"b": [0, 0], "c": [1, 1]},
                         index=Index([0, 1], name="A"))
    tm.assert_frame_equal(result, expected)
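# Background for the test above: `agg(name=(column, func))` is pandas' named
# aggregation syntax (available since pandas 0.25). A small standalone
# illustration, with invented data:
import pandas as pd

df_demo = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
# Output column 'b_max' takes the max of input column 'B' within each group.
print(df_demo.groupby("A").agg(b_max=("B", "max"), b_sum=("B", "sum")))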
class CocoModel(object):

    def __init__(self, path):
        super(CocoModel, self).__init__()
        self.path = path
        self.coco = COCO(self.path)
        self.annotations = self.coco.loadAnns(self.coco.getAnnIds())
        self.df = DataFrame(self.annotations)
        self.imgs_ids = None
        self.catgs_ids = None
        self.catgs_names = None
        self.cooc_matrix = None

    def images_ids(self):
        if not self.imgs_ids:
            self.imgs_ids = list(self.df['image_id'].unique())
        return self.imgs_ids

    def categories_ids(self):
        if not self.catgs_ids:
            self.catgs_ids = list(self.df['category_id'].unique())
        return self.catgs_ids

    def categories_names(self):
        if not self.catgs_names:
            catgs = self.coco.loadCats(self.categories_ids())
            self.catgs_names = [c['name'].replace(' ', '_') for c in catgs]
        return self.catgs_names

    def max_objects_per_image(self):
        return self.df.groupby('image_id').size().max()

    def get_image_categories(self, img_id):
        return list(self.df[self.df.image_id == img_id]['category_id'])

    def get_category_name(self, catg_id):
        return self.categories_names()[self.categories_ids().index(catg_id)]

    def get_category_id_by_name(self, catg_name):
        if catg_name == "nop":
            return -1
        return self.categories_ids()[self.categories_names().index(catg_name)]

    def cooccurrence_matrix(self):
        if self.cooc_matrix is None:
            self.cooc_matrix = np.zeros(
                (len(self.categories_ids()), len(self.categories_ids())),
                dtype=np.int32)
            for img in self.images_ids():
                catgs_img = self.get_image_categories(img)
                for i, c_id in enumerate(catgs_img):
                    i_id = self.categories_ids().index(c_id)
                    for j in range(i + 1, len(catgs_img)):
                        j_id = self.categories_ids().index(catgs_img[j])
                        self.cooc_matrix[i_id, j_id] += 1
            np.fill_diagonal(self.cooc_matrix, 0)
        return self.cooc_matrix

    def topn_coocurrences(self, catg_id, n=10):
        idx = self.categories_ids().index(catg_id)
        # Get the catg_id equivalent cooc_matrix row
        # Get the indices that would sort this row
        # Reverse the indices array to get coocurrences in descending order
        # Get only the n most coocurring categories indices
        # Map the indices to the categories id and return them
        sort_row = np.argsort(self.cooccurrence_matrix()[idx, :])[::-1][:n]
        return [self.categories_ids()[i] for i in sort_row]
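# A minimal usage sketch of CocoModel. It assumes pycocotools is installed
# and that the annotation path below (a placeholder) points at a valid COCO
# annotation file:
model = CocoModel('annotations/instances_val2017.json')  # placeholder path
print(model.max_objects_per_image())  # largest annotation count in one image
first_cat = model.categories_ids()[0]
print(model.topn_coocurrences(first_cat, n=5))  # most co-occurring category ids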
def analysedata():
    # Read the dictionary data from the pickle file generated above
    pkl_file = open('analyse.pkl', 'rb')
    dic_dystany = pickle.load(pkl_file)
    pkl_file.close()

    # Handle stop words
    stp_file = codecs.open('gswstpwrds.txt', 'r', 'utf8')
    stop_words = stp_file.read()
    stp_file.close()
    stop_words = stop_words.split('\n')
    stop_words = stop_words + [
        u'\r\n', u'...', u'\r\r', u'so.gushiwen.org', u'佚名\r',
    ]
    stop_words = Series(stop_words)

    # dy_fanyi_list = [dic_dystany[dy]['content_fanyi_merge']
    #                  for dy in dic_dystany.keys()]
    dy_list = [
        u'先秦', u'两汉', u'魏晋', u'南北朝', u'隋代', u'唐代',
        u'五代', u'宋代', u'金朝', u'元代', u'明代', u'清代',
    ]
    # The Pre-Qin, Tang, Song, Qing, and Yuan dynasties account for roughly
    # four fifths of all content, so for now only those five are considered
    # dy_list = [u'先秦', u'唐代', u'宋代', u'元代', u'清代']
    # dy_list = [u'金朝']

    # Merge all translation files, to be used as training source data
    # for dystany in dy_list:
    #     filename = dystany + 'fanyi.txt'
    #     merger_file = codecs.open(filename, 'w', 'utf8')
    #     fy_content = dic_dystany[dystany]['content_fanyi_merge']
    #     fy_content = fy_content.replace('\r', '\n').replace('\n\n', '\n').replace('\n\n', '\n')
    #     fy_content = fy_content.replace(u'\u3000\u3000\n', '').replace(u'\u3000\n', '')
    #     merger_file.write(fy_content)
    #     merger_file.close()

    thul = thulac.thulac('-seg_only')
    thul.run()
    writer = ExcelWriter('gushiwen.xlsx')
    for dystany in dy_list:
        fy_content = dic_dystany[dystany]['content_fanyi_merge']
        ls = []
        while len(fy_content) > 10000:
            con = fy_content[:10000]
            fy_content = fy_content[10000:]
            ls = ls + thul.cut(con.encode('utf8'))
        if fy_content:
            ls += thul.cut(fy_content.encode('utf8'))
        # Drop all single characters; thulac returns str, and a single
        # character has byte length 3 here
        fy_cont_seg = [val for val in ls if len(val) > 3]
        print(type(fy_cont_seg), ' ', len(fy_cont_seg), ' ', dystany)
        fy_cont_seg = [val.decode('utf8') for val in fy_cont_seg]
        fy_cont_seg_df = DataFrame({'segment': fy_cont_seg})
        fy_cont_seg_df = fy_cont_seg_df[~fy_cont_seg_df.segment
                                        .isin(stop_words)]
        segtat = fy_cont_seg_df.groupby(by=['segment'])['segment'].agg({
            'count': numpy.size
        }).reset_index().sort_values(by=['count'], ascending=False)
        segtat.to_excel(writer, dystany)
        writer.save()
        print('End of dystany : ', dystany)
    writer.close()
def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2, ['foo', 'bar'] * 2],
                                          names=['A', 'B', 'C'])
    expected = DataFrame({'values': Series(
        [1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ['foo', 'bar']], list('ABC'))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2], list('AB'))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {'cat': pd.Categorical(["a", "b", "a", "b"],
                               categories=["a", "b", "c"],
                               ordered=True),
         'ints': [1, 1, 2, 2],
         'val': [10, 20, 30, 40]}
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = pd.CategoricalIndex(list('ab'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
                         index=exp_index)
    if not observed:
        index = pd.CategoricalIndex(list('abc'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame({"val": [10, 30, 20, 40],
                          "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                                                categories=['a', 'b', 'c'],
                                                ordered=True),
                          "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ['cat', 'ints'])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {'foo': [10, 8, 4, 8, 4, 1, 1],
         'bar': [10, 20, 30, 40, 50, 60, 70],
         'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
    df = pd.DataFrame(d)
    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
    df['range'] = cat
    groups = df.groupby(['range', 'baz'], as_index=False,
                        observed=observed)
    result = groups.agg('mean')

    groups2 = df.groupby(['range', 'baz'], as_index=True,
                         observed=observed)
    expected = groups2.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)
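# Since `observed` drives most of the branching above, a tiny standalone
# illustration of what it controls (data invented for illustration):
import pandas as pd

cat = pd.Categorical(["a", "b"], categories=["a", "b", "z"])
df_demo = pd.DataFrame({"key": cat, "val": [1, 2]})
# observed=False keeps the unobserved category 'z' as an (empty) group;
# observed=True drops it from the result index.
print(df_demo.groupby("key", observed=False)["val"].sum())  # includes 'z'
print(df_demo.groupby("key", observed=True)["val"].sum())   # only 'a', 'b'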
def transform(df: pandas.DataFrame) -> pandas.DataFrame:
    df = _apply_transform_list(df, TRANSFORMERS)
    return df.groupby('uuid').apply(_group_transform_df)
def portfolio_cross_section(
        self, pipeline_df: pd.DataFrame,
        allocation_method: PortfolioAllocationModel = ValueWeightedPortfolio):
    """
    We cross-split the portfolio based on factors and percentiles, and pick
    the resulting portfolios to go long and short on. For example:

        +---------------+--------------+
        |           Median ME          |
        +---------------+--------------+
        | Small Value   | Big Value    |  70th BE/ME Percentile
        +---------------+--------------+
        | Small Neutral | Big Neutral  |  30th BE/ME Percentile
        +---------------+--------------+
        | Small Growth  | Big Growth   |
        +---------------+--------------+

    :param pipeline_df:
    :return:
    """

    def split_stocks_quantile(df: pd.DataFrame, factor: str):
        factor_rank = None
        for factor_ in self.factors:
            if factor_.__class__.__name__ == factor:
                factor_rank = factor_.rank_split

        bottom_quantile = df[
            df[factor] <= df[factor].quantile(factor_rank[0])]
        bottom_quantile_stocks = [stock for date, stock
                                  in bottom_quantile.index.values]

        top_quantile = df[
            df[factor] >= df[factor].quantile(factor_rank[1])]
        top_quantile_stocks = [stock for date, stock
                               in top_quantile.index.values]

        return bottom_quantile_stocks, top_quantile_stocks

    def select_stocks(group):
        returns_df = pd.DataFrame()
        from_date = group.index.values[0][0]
        from_date_idx = group.index.levels[0].to_list().index(from_date)
        try:  # TODO think about inclusive at rebalancing day
            to_date = group.index.levels[0][from_date_idx + 1] \
                - timedelta(days=1)
        except:
            to_date = self.end_date

        if 'SMB' in group.columns:  # Do the Fama French / AQR way
            small_size_stocks, large_size_stocks = split_stocks_quantile(
                df=group, factor='SMB')

            for factor in group.columns:
                if factor != 'SMB':
                    bottom_quantile_stocks, top_quantile_stocks = \
                        split_stocks_quantile(df=group, factor=factor)

                    small_top_stocks = set.intersection(
                        set(small_size_stocks), set(top_quantile_stocks))
                    big_top_stocks = set.intersection(
                        set(large_size_stocks), set(top_quantile_stocks))
                    small_bottom_stocks = set.intersection(
                        set(small_size_stocks), set(bottom_quantile_stocks))
                    big_bottom_stocks = set.intersection(
                        set(large_size_stocks), set(bottom_quantile_stocks))

                    cross_section_returns = {
                        name: {} for name in
                        ['Small Top', 'Big Top', 'Small Bottom',
                         'Big Bottom']
                    }
                    for name, stocks in zip(
                            ['Small Top', 'Big Top',
                             'Small Bottom', 'Big Bottom'],
                            [small_top_stocks, big_top_stocks,
                             small_bottom_stocks, big_bottom_stocks]):
                        if len(stocks) > 0:
                            portfolio = self.asset_returns[stocks]
                            # To allocate weight, need history of returns
                            # up to now
                            weights = allocation_method(
                                Portfolio(portfolio.loc[:from_date])
                            ).solve_weights()
                            cross_section_returns[name]['Weight Allocation'] \
                                = [(stock, weight) for stock, weight
                                   in zip(portfolio.columns, weights)]
                            returns = np.sum(
                                weights * portfolio.loc[from_date:to_date],
                                axis=1)
                            cross_section_returns[name]['Returns'] = returns
                        else:
                            dates = pd.date_range(
                                start=from_date + timedelta(days=1)
                                - timedelta(seconds=1),
                                end=to_date + timedelta(days=1)
                                - timedelta(seconds=1)).to_list()
                            cross_section_returns[name]['Returns'] = \
                                pd.Series(
                                    np.zeros((to_date - from_date).days + 1),
                                    index=dates)
                            cross_section_returns[name]['Weight Allocation'] \
                                = [('', 0)]

                    # HML = 1/2 (Small Value + Big Value)
                    #     - 1/2 (Small Growth + Big Growth).
                    long_stocks = small_top_stocks | big_top_stocks
                    short_stocks = small_bottom_stocks | big_bottom_stocks

                    for factor_ in self.factors:
                        if factor_.__class__.__name__ == factor:
                            # TODO
                            df = pd.DataFrame(
                                columns=['Long Stocks', 'Short Stocks'],
                                data=0)
                            factor_.holdings.append()

                    returns = 0.5 * (
                        cross_section_returns['Small Top']['Returns'].add(
                            cross_section_returns['Big Top']['Returns'],
                            fill_value=0)) \
                        - 0.5 * (
                        cross_section_returns['Small Bottom']['Returns'].add(
                            cross_section_returns['Big Bottom']['Returns'],
                            fill_value=0))
                    returns.name = factor

                    returns_df = returns_df.join(
                        [returns], how='inner'
                    ) if not returns_df.empty else returns.to_frame()

        for factor, returns in returns_df.iteritems():
            factor_obj = None
            for f_ in self.factors:
                if f_.__class__.__name__ == factor:
                    factor_obj = f_
            factor_obj.returns = returns
            # factor_obj.holdings =

        return returns_df

    factor_returns = pipeline_df.groupby(level=0, axis=0).apply(select_stocks)
    factor_returns.index = factor_returns.index.droplevel(0)

    return factor_returns
def get_var(var):
    variable = []
    for i in friends:
        value = i[var]
        variable.append(value)
    return variable

# use get_var to get information
NickName = get_var('NickName')
Province = get_var('Province')
print(NickName, Province)

data = {
    'NickName': NickName,
    'Province': Province
}
frame = DataFrame(data)  # save data
print(frame)

# data processing: group friends by province
aggResult = frame.groupby(by=['Province'])['NickName'].agg({
    '人数': numpy.size,
})
print(aggResult)

# transform data type
aggResult['好友数'] = aggResult.人数.astype(int)
aggResult['地区'] = aggResult.index

# data standardization: new = (raw - min) / (max - min)
aggResult['scala'] = (aggResult.好友数 - aggResult.好友数.min()) / \
    (aggResult.好友数.max() - aggResult.好友数.min())
print(aggResult['好友数'], aggResult['地区'], aggResult['scala'])
def add_realization_traces(data_frame: pd.DataFrame, color_by: str,
                           colors: Dict[str, List[str]],
                           phase: str) -> List[dict]:
    """Renders line traces for individual realizations"""
    # pylint: disable-msg=too-many-locals
    traces = []

    data_frame = data_frame.loc[
        data_frame["KEYWORD"] == PvtPlot.PHASES[phase]]
    column_name = "GOR"

    border_value_pressure: Dict[str, list] = {}
    border_value_viscosity: Dict[str, list] = {}
    border_value_volumefactor: Dict[str, list] = {}

    constant_group = (data_frame["PVTNUM"].iloc[0]
                      if color_by == "ENSEMBLE"
                      else data_frame["ENSEMBLE"].iloc[0])

    for (group, grouped_data_frame) in data_frame.groupby(color_by):
        for ratio_no, gas_oil_ratio in enumerate(
                grouped_data_frame[column_name].unique()):
            for realization_no, (realization, realization_data_frame) in \
                    enumerate(grouped_data_frame.groupby("REAL")):
                if group not in border_value_pressure:
                    border_value_pressure[group] = []
                    border_value_viscosity[group] = []
                    border_value_volumefactor[group] = []
                try:
                    border_value_pressure[group].append(
                        realization_data_frame.loc[
                            realization_data_frame[column_name]
                            == gas_oil_ratio]["PRESSURE"].iloc[0])
                    border_value_volumefactor[group].append(
                        realization_data_frame.loc[
                            realization_data_frame[column_name]
                            == gas_oil_ratio]["VOLUMEFACTOR"].iloc[0])
                    border_value_viscosity[group].append(
                        realization_data_frame.loc[
                            realization_data_frame[column_name]
                            == gas_oil_ratio]["VISCOSITY"].iloc[0])
                except IndexError as exc:
                    raise IndexError(
                        "This error is most likely due to PVT differences "
                        "between realizations within the same ensemble. "
                        "This is currently not supported.") from exc

                traces.extend([{
                    "type": "scatter",
                    "x": realization_data_frame.loc[
                        realization_data_frame[column_name]
                        == gas_oil_ratio]["PRESSURE"],
                    "y": realization_data_frame.loc[
                        realization_data_frame[column_name]
                        == gas_oil_ratio]["VOLUMEFACTOR"],
                    "xaxis": "x",
                    "yaxis": "y",
                    "hovertext": (
                        f"{'Rs' if phase == 'OIL' else 'Rv'}"
                        f" = {gas_oil_ratio}, Pvtnum: "
                        f"{group if color_by == 'PVTNUM' else constant_group}<br>"
                        f"Realization: {realization}, Ensemble: "
                        f"{group if color_by == 'ENSEMBLE' else constant_group}"),
                    "name": group,
                    "legendgroup": group,
                    "marker": {
                        "color": colors.get(
                            group, colors[list(colors.keys())[-1]])
                    },
                    "showlegend": realization_no == 0 and ratio_no == 0,
                }])
                traces.extend([{
                    "type": "scatter",
                    "x": realization_data_frame.loc[
                        realization_data_frame[column_name]
                        == gas_oil_ratio]["PRESSURE"],
                    "y": realization_data_frame.loc[
                        realization_data_frame[column_name]
                        == gas_oil_ratio]["VISCOSITY"],
                    "xaxis": "x2",
                    "yaxis": "y2",
                    "hovertext": (
                        f"{'Rs' if phase == 'OIL' else 'Rv'}"
                        f" = {gas_oil_ratio}, Pvtnum: "
                        f"{group if color_by == 'PVTNUM' else constant_group}<br>"
                        f"Realization: {realization}, Ensemble: "
                        f"{group if color_by == 'ENSEMBLE' else constant_group}"),
                    "name": group,
                    "legendgroup": group,
                    "marker": {
                        "color": colors.get(
                            group, colors[list(colors.keys())[-1]])
                    },
                    "showlegend": False,
                }])

    for group in border_value_pressure:
        traces.extend([{
            "type": "scatter",
            "mode": "lines",
            "x": border_value_pressure[group],
            "y": border_value_volumefactor[group],
            "xaxis": "x",
            "yaxis": "y",
            "line": {
                "width": 1,
                "color": colors.get(group, colors[list(colors.keys())[-1]]),
            },
            "showlegend": False,
        }])
        traces.extend([{
            "type": "scatter",
            "mode": "lines",
            "x": border_value_pressure[group],
            "y": border_value_viscosity[group],
            "xaxis": "x2",
            "yaxis": "y2",
            "line": {
                "width": 1,
                "color": colors.get(group, colors[list(colors.keys())[-1]]),
            },
            "showlegend": False,
        }])

    return traces
def performance_scaling(data: pd.DataFrame,
                        set_axes_limits: bool = True,
                        plot_regression: bool = True) -> (plt.Figure,
                                                          plt.Axes):
    """
    Parameters
    ----------
    data : pd.DataFrame with 6 columns:
        "year",
        "performance",
        "kind" ∈ ["compute", "memory", "interconnect"],
        "name" (label shown in the plot, it can be empty),
        "base" (base value used for speedup, it can be empty),
        "comment" (e.g. data source or non-used label, it can be empty).

    Returns
    -------
    fig : matplotlib figure containing the plot
    ax : matplotlib axis containing the plot
    """

    ##############
    # Plot setup #
    ##############

    # Reset matplotlib settings;
    plt.rcdefaults()
    # Setup general plotting settings;
    sns.set_style("white", {"ytick.left": True, "xtick.bottom": True})
    plt.rcParams["font.family"] = ["Latin Modern Roman Demi"]
    plt.rcParams['axes.labelpad'] = 0  # Padding between axis and axis label;
    plt.rcParams['xtick.major.pad'] = 1  # Padding between axis ticks and tick labels;
    plt.rcParams['ytick.major.pad'] = 1  # Padding between axis ticks and tick labels;
    plt.rcParams['axes.linewidth'] = 0.8  # Line width of the axis borders;

    # Create a figure for the plot, and adjust margins;
    fig = plt.figure(figsize=(6, 2.5))
    gs = gridspec.GridSpec(1, 1)
    plt.subplots_adjust(top=0.98, bottom=0.1, left=0.12, right=0.99)
    ax = fig.add_subplot(gs[0, 0])

    # Set axes limits;
    if set_axes_limits:
        ax.set_xlim(X_LIMITS)
        ax.set_ylim(Y_LIMITS)

    #################
    # Main plot #####
    #################

    # Measure performance increase over 20 and 2 years;
    kind_increase = {}

    # Add a scatterplot for individual elements of the dataset,
    # and change color based on hardware type;
    ax = sns.scatterplot(x="year", y="performance", hue="kind",
                         style="kind", palette=PALETTE, markers=MARKERS,
                         s=15, data=data, ax=ax, edgecolor="#2f2f2f",
                         linewidth=0.5, zorder=4)

    # Add a regression plot to highlight the correlation between variables,
    # with 95% confidence intervals;
    if plot_regression:
        for i, (kind, g) in enumerate(data.groupby("kind", sort=False)):
            data_tmp = g.copy()
            # We fit a straight line on the log of the relative performance,
            # as the scaling is exponential.
            # Then, the real prediction is 10**prediction;
            regr = linear_model.LinearRegression()
            regr.fit(data_tmp["year"].values.reshape(-1, 1),
                     np.log10(data_tmp["performance"].values.reshape(-1, 1)))
            data_tmp["prediction"] = np.power(10, regr.predict(
                data_tmp["year"].values.astype(float).reshape(-1, 1)))
            ax = sns.lineplot(x=[data_tmp["year"].iloc[0],
                                 data_tmp["year"].iloc[-1]],
                              y=[data_tmp["prediction"].iloc[0],
                                 data_tmp["prediction"].iloc[-1]],
                              color=PALETTE[i], ax=ax, alpha=0.5,
                              linewidth=6)
            # Use the regression line to obtain the slope over 2 and 10 years;
            slope = (np.log10(data_tmp["prediction"].iloc[-1])
                     - np.log10(data_tmp["prediction"].iloc[0])) / \
                ((data_tmp["year"].iloc[-1]
                  - data_tmp["year"].iloc[0]).days / 365)
            slope_2_years = 10**(slope * 2)
            slope_20_years = 10**(slope * 20)
            kind_increase[kind] = (slope_2_years, slope_20_years)
        ax.legend_.remove()  # Hack to remove legend;

    #####################
    # Add labels ########
    #####################

    # Associate a color to each kind of hardware
    # (compute, memory, interconnection)
    def get_color(c):
        # Make the color darker, to use it for text;
        hue, saturation, brightness = colors.rgb_to_hsv(colors.to_rgb(c))
        return sns.set_hls_values(c, l=brightness * 0.6,
                                  s=saturation * 0.7)
    kind_to_col = {k: get_color(PALETTE[i])
                   for i, k in enumerate(data["kind"].unique())}

    data["name"] = data["name"].fillna("")
    for i, row in data.iterrows():
        label = row["name"]
        # Label-specific adjustments;
        if label:
            if label == "Pentium II Xeon":
                xytext = (5, -9)
            elif label == "PCIe 4.0":
                xytext = (5, -9)
            elif label == "Radeon Fiji":
                xytext = (-7, 5)
            elif label == "TPUv2":
                xytext = (-7, 5)
            elif row["kind"] == "interconnect":
                xytext = (0, -9)
            else:
                xytext = (0, 5)
            ax.annotate(label, xy=(row["year"], row["performance"]),
                        size=7, xytext=xytext,
                        textcoords="offset points", ha="center",
                        color=kind_to_col[row["kind"]])

    #####################
    # Style fine-tuning #
    #####################

    # Log-scale y-axis;
    plt.yscale("log")

    # Turn on the grid;
    ax.yaxis.grid(True, linewidth=0.3)
    ax.xaxis.grid(True, linewidth=0.3)

    # Set tick number and parameters on x and y axes;
    def year_formatter(x, pos=None):
        d = num2date(x)
        if (d.year - X_LIMITS[0].year) % 3 != 0:
            return ""
        else:
            return d.year
    ax.xaxis.set_major_locator(YearLocator())
    ax.xaxis.set_minor_locator(MonthLocator(interval=3))
    ax.xaxis.set_major_formatter(FuncFormatter(year_formatter))
    ax.yaxis.set_major_locator(plt.LogLocator(base=10, numticks=15))
    ax.tick_params(axis="x", direction="out", which="both", bottom=True,
                   top=False, labelsize=7, width=0.5, size=5)
    ax.tick_params(axis="x", direction="out", which="minor", size=2)  # Update size of minor ticks;
    ax.tick_params(axis="y", direction="out", which="both", left=True,
                   right=False, labelsize=7, width=0.5, size=5)
    ax.tick_params(axis="y", direction="out", which="minor", size=2)  # Update size of minor ticks;

    # Ticks, showing relative performance;
    def format_speedup(l):
        if l >= 1:
            return str(int(l))
        else:
            return f"{l:.1f}"
    ax.set_yticklabels(
        labels=[format_speedup(l) + r"$\mathdefault{\times}$"
                for l in ax.get_yticks()],
        ha="right", fontsize=7)

    # Add a fake legend with summary data.
    # We don't use a real legend as we need rows with different colors
    # and we don't want patches on the left.
    # Also, we want the text to look justified.
    def get_kind_label(k):
        kind_name = ""
        if k == "compute":
            kind_name = "HW FLOPS"
        elif k == "memory":
            kind_name = "DRAM BW"
        else:
            kind_name = "Interconnect BW"
        return kind_name

    # Create a rectangle used as background;
    rectangle = {"boxstyle": "round", "facecolor": "white", "alpha": 0.8,
                 "edgecolor": "#B8B8B8", "linewidth": 0.5, "pad": 0.5}
    for i, (k, v) in enumerate(kind_increase.items()):
        # Add padding to first label, to create a large rectangle
        # that covers other labels;
        pad = " " * 48 + "\n\n"
        # Use two annotations, to make the text look justified;
        ax.annotate(get_kind_label(k) + ":" + (pad if i == 0 else ""),
                    xy=(0.023, 0.94 - 0.05 * i), xycoords="axes fraction",
                    fontsize=7, color=kind_to_col[k], ha="left", va="top",
                    bbox=rectangle if i == 0 else None)
        ax.annotate(f"{v[1]:.0f}" + r"$\mathdefault{\times}$"
                    + f"/20 years ({v[0]:.1f}" + r"$\mathdefault{\times}$"
                    + "/2 years)",
                    xy=(0.43, 0.941 - 0.05 * i), xycoords="axes fraction",
                    fontsize=7, color=kind_to_col[k], ha="right", va="top")

    # Add axes labels;
    plt.ylabel("Performance Scaling", fontsize=8)
    plt.xlabel(None)

    return fig, ax
def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
    df = (df.groupby("StatisticsDate", as_index=False)
            .sum()
            .sort_values("StatisticsDate"))
    return df.pipe(self.pipe_rename_columns).pipe(self.pipe_metadata)
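# A note on the `pipe` chaining used above: each `pipe(f)` passes the frame
# through `f` and returns the result, so custom steps read left to right.
# A standalone sketch with invented stand-ins for the two pipeline steps:
import pandas as pd

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.rename(columns=str.lower)  # illustrative stand-in

def add_metadata(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(source="example")  # illustrative stand-in

demo = pd.DataFrame({"StatisticsDate": ["2021-01-02", "2021-01-01"],
                     "Cases": [1, 2]})
print(demo.groupby("StatisticsDate", as_index=False).sum()
          .sort_values("StatisticsDate")
          .pipe(rename_columns)
          .pipe(add_metadata))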
def test_nth():
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')

    assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
    assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
    assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
    assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
    assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
    assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
    assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
    assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
    assert_frame_equal(g[['B']].nth(0),
                       df.loc[[0, 2], ['A', 'B']].set_index('A'))

    exp = df.set_index('A')
    assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])

    exp['B'] = np.nan
    assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame({'color': {0: 'green', 1: 'green', 2: 'red', 3: 'red',
                              4: 'red'},
                    'food': {0: 'ham', 1: 'eggs', 2: 'eggs', 3: 'ham',
                             4: 'pork'},
                    'two': {0: 1.5456590000000001,
                            1: -0.070345000000000005,
                            2: -2.4004539999999999,
                            3: 0.46206000000000003,
                            4: 0.52350799999999997},
                    'one': {0: 0.56573799999999996,
                            1: -0.9742360000000001,
                            2: 1.033801,
                            3: -0.78543499999999999,
                            4: 0.70422799999999997}}).set_index(['color',
                                                                 'food'])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    assert_frame_equal(result, expected)

    # GH 7559
    # from the vbench
    df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
    s = df[1]
    g = df[0]
    expected = s.groupby(g).first()
    expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
    assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = s[g == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = s.groupby(g, sort=False).first()
    result = s.groupby(g, sort=False).nth(0, dropna='all')
    assert_series_equal(result, expected)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    # PR 17493, related to issue 11038
    # test Series.nth with True for dropna produces FutureWarning
    with assert_produces_warning(FutureWarning):
        result = g.B.nth(0, dropna=True)
    expected = g.B.first()
    assert_series_equal(result, expected)

    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
                   columns=['A', 'B'])
    g = df.groupby('A')

    assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
    assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
    assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
    assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
    assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))

    business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
                                   freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
         '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
         '2014/6/27', '2014/6/30'])
    expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
    assert_frame_equal(result, expected)
def data(ws, mongodb, slug):
    if not ws:
        abort(400, 'Expected WebSocket request.')

    DW = DataWarehouse()

    element = mongodb['element'].find_one({'slug': slug})
    element['page_limit'] = 50
    if request.GET.get('limit', True) is False:
        element['page_limit'] = 9999999999

    data = DW.get(element.get('cube'))
    columns = data.get('columns') or []

    fields = columns
    if request.GET.get('fields', None):
        fields = request.GET.get('fields').split(',')

    cube_last_update = mongodb['cube'].find_one(
        {'slug': element.get('cube')})
    ws.send(json.dumps({
        'type': 'last_update',
        'data': str(cube_last_update.get('lastupdate', ''))
    }))

    ws.send(json.dumps({'type': 'columns', 'data': fields}))

    filters = [i[0] for i in request.GET.iteritems()
               if len(i[0].split('filter__')) > 1]

    if element['type'] == 'grid':
        page = int(request.GET.get('page', 1))
        page_start = 0
        page_end = element['page_limit']
        if page >= 2:
            page_end = element['page_limit'] * page
            page_start = page_end - element['page_limit']
    else:
        page_start = None
        page_end = None

    df = DataFrame(data.get('data') or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split('__')
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == 'like':
                df = df[df[field].str.contains(value)]
            elif operator == 'regex':
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get('groupby', None):
        groupby = request.GET.get('groupby', ).split(',')
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if request.GET.get('orderby', element.get('orderby', None)) and \
            request.GET.get('orderby',
                            element.get('orderby', None)) in fields:

        orderby = request.GET.get('orderby', element.get('orderby', ''))
        if type(orderby) == str:
            orderby = orderby.split(',')
        orderby__order = request.GET.get('orderby__order',
                                         element.get('orderby__order', ''))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(',')
        ind = 0
        for orde in orderby__order:
            if orde == '0':
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    ws.send(json.dumps({'type': 'max_page', 'data': len(df)}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()

    categories = []
    for i in df.to_dict(outtype='records')[page_start:page_end]:
        if element.get('categories', None):
            categories.append(i[element.get('categories')])
        ws.send(json.dumps({'type': 'data', 'data': i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({'type': 'categories', 'data': categories}))
    ws.send(json.dumps({'type': 'close'}))

    # CLEAN MEMORY
    del categories
    gc.collect()
def test_agg():
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10),
                       freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=["index", "date"])
    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    a_mean = r["A"].mean()
    a_std = r["A"].std()
    a_sum = r["A"].sum()
    b_mean = r["B"].mean()
    b_std = r["B"].std()
    b_sum = r["B"].sum()

    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_product([["A", "B"],
                                                   ["mean", "std"]])
    for t in cases:
        result = t.aggregate([np.mean, np.std])
        tm.assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, b_std], axis=1)
    for t in cases:
        result = t.aggregate({"A": np.mean, "B": np.std})
        tm.assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"),
                                                  ("A", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"]})
        tm.assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = ["mean", "sum"]
    for t in cases:
        result = t["A"].aggregate(["mean", "sum"])
        tm.assert_frame_equal(result, expected)

    msg = "nested renamer is not supported"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({"A": {"mean": "mean", "sum": "sum"}})

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"),
                                                  ("A", "sum"),
                                                  ("B", "mean2"),
                                                  ("B", "sum2")])
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({
                "A": {"mean": "mean", "sum": "sum"},
                "B": {"mean2": "mean", "sum2": "sum"},
            })

    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"),
                                                  ("A", "std"),
                                                  ("B", "mean"),
                                                  ("B", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"],
                              "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([
        ("r1", "A", "mean"),
        ("r1", "A", "sum"),
        ("r2", "B", "mean"),
        ("r2", "B", "sum"),
    ])
def test_groupby_aggregate_empty_key_empty_return():
    # GH: 32580 Check if everything works, when return is empty
    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
    result = df.groupby("a").agg({"b": []})
    expected = DataFrame(columns=pd.MultiIndex(levels=[["b"], []],
                                               codes=[[], []]))
    tm.assert_frame_equal(result, expected)
def _plot_per_cenwave(self, df: pd.DataFrame, shift: str, outliers: pd.DataFrame = None) -> int: """Plot shift v time and A-B v time by grating/cenwave""" trace_number = 0 # Keep track of the number of traces created and added groups = df.groupby(['OPT_ELEM', 'CENWAVE']) # Set symbols for different FP-POS fp_symbols = { 1: 'circle', 2: 'cross', 3: 'triangle-up', 4: 'x' } # Compute A-B shift difference seg_diff_results = compute_segment_diff(df, shift, 'FUVA', 'FUVB') # Plot A-B v time self.figure.add_trace( go.Scattergl( x=seg_diff_results.lamp_time, y=seg_diff_results.seg_diff, name='FUVA - FUVB', mode='markers', text=seg_diff_results.hover_text, visible=False, ), row=1, col=1 ) trace_number += 1 # Plot shift v time per grating/cenwave group for i, (name, group) in enumerate(groups): trace_number += 1 grating, cenwave = name lamp_time = absolute_time(df=group) self.figure.add_trace( go.Scattergl( x=lamp_time.to_datetime(), y=group[shift], name=f'{grating}-{cenwave}', mode='markers', text=group.hover_text, visible=False, marker=dict( # Color markers based on cenwave cmax=len(df.CENWAVE.unique()) - 1, # Individual plots need to be on the same scale cmin=0, color=list(repeat(i, len(group))), colorscale='Viridis', symbol=[fp_symbols[fp] for fp in group.FPPOS], size=[ 10 if time > LP_MOVES[4] and lp == 3 else 6 for lp, time in zip(group.LIFE_ADJ, Time(group.EXPSTART, format='mjd').to_datetime()) ] # Set the size to distinguish exposures taken at LP3 after the move to LP4 ) ), row=2, col=1, ) if outliers is not None: self.figure.add_trace( go.Scattergl( x=outliers.lamp_time, y=outliers.seg_diff, name='A - B Outliers', mode='markers', text=outliers.hover_text, visible=False, marker=dict(color='red'), ), row=1, col=1 ) trace_number += 1 # Plot outlier points in a different color outlier_mainplot = df[df.apply(lambda x: x.ROOTNAME in outliers.ROOTNAME.values, axis=1)] outlier_groups = outlier_mainplot.groupby(['OPT_ELEM', 'CENWAVE']) for name, group in outlier_groups: trace_number += 1 grating, cenwave = name lamp_time = absolute_time(df=group) self.figure.add_trace( go.Scattergl( x=lamp_time.to_datetime(), y=group[shift], name=f'{grating}-{cenwave} Outliers', mode='markers', text=group.hover_text, visible=False, marker=dict( color='red', symbol=[fp_symbols[fp] for fp in group.FPPOS], size=[ 10 if time > LP_MOVES[4] and lp == 3 else 6 for lp, time in zip(group.LIFE_ADJ, Time(group.EXPSTART, format='mjd').to_datetime()) ] # Set the size to distinguish exposures taken at LP3 after the move to LP4 ) ), row=2, col=1, ) return trace_number
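# `absolute_time` is imported from elsewhere in the monitor. A minimal
# sketch, assuming it simply wraps the MJD exposure start times in an
# astropy Time object; the real helper may also fold in intra-exposure
# offsets, so treat this as illustrative only:
from astropy.time import Time


def absolute_time(df, time_key='EXPSTART'):
    # EXPSTART is stored as MJD; Time(..., format='mjd') supports
    # .to_datetime(), which is all the plotting code above relies on.
    return Time(df[time_key].values, format='mjd')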
def test_cython_transform_frame(self, op, args, targop):
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list('qwertyuiopasdfghjklz')
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame({
        'float': s,
        'float_missing': s_missing,
        'int': [1, 1, 1, 1, 2] * 200,
        'datetime': pd.date_range('1990-1-1', periods=1000),
        'timedelta': pd.timedelta_range(1, freq='s', periods=1000),
        'string': strings * 50,
        'string_missing': strings_missing * 50
    })
    df['cat'] = df['string'].astype('category')

    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    for df in [df, df2]:
        for gb_target in [
                dict(by=labels), dict(level=0), dict(by='string')
        ]:  # dict(by='string_missing')]:
            # dict(by=['int','string'])]:

            gb = df.groupby(**gb_target)
            # whitelisted methods set the selection before applying;
            # a bit of a hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == 'shift':
                gb._set_group_selection()

            if op != 'shift' and 'int' not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                i = gb[['int']].apply(targop)
                f = gb[['float', 'float_missing']].apply(targop)
                expected = pd.concat([f, i], axis=1)
            else:
                expected = gb.apply(targop)

            expected = expected.sort_index(axis=1)
            tm.assert_frame_equal(
                expected,
                gb.transform(op, *args).sort_index(axis=1))
            tm.assert_frame_equal(expected, getattr(gb, op)(*args))
            # individual columns
            for c in df:
                if c not in ['float', 'int', 'float_missing'
                             ] and op != 'shift':
                    pytest.raises(DataError, gb[c].transform, op)
                    pytest.raises(DataError, getattr(gb[c], op))
                else:
                    expected = gb[c].apply(targop)
                    expected.name = c
                    tm.assert_series_equal(expected,
                                           gb[c].transform(op, *args))
                    tm.assert_series_equal(expected,
                                           getattr(gb[c], op)(*args))
def _plot_per_grating(self, df: pd.DataFrame): trace_number = 0 # Keep track of the number of traces created and added all_b_c_outliers = self.results['B-C'][self.outliers['B-C']] all_c_a_outliers = self.results['C-A'][self.outliers['C-A']] # Find matching stripe differences and outliers b_c = match_dfs(self.results['B-C'], df, 'ROOTNAME') c_a = match_dfs(self.results['C-A'], df, 'ROOTNAME') b_c_outliers = match_dfs(all_b_c_outliers, df, 'ROOTNAME') if not all_b_c_outliers.empty else None c_a_outliers = match_dfs(all_c_a_outliers, df, 'ROOTNAME') if not all_c_a_outliers.empty else None # Plot diffs v time if not b_c.empty: self.figure.add_trace( go.Scattergl( x=b_c.lamp_time, y=b_c.seg_diff, name='NUVB - NUVC', mode='markers', text=b_c.hover_text, visible=False, marker=dict(color='#1f77b4') # "muted blue" ), row=1, col=1 ) trace_number += 1 if c_a is not None and not c_a.empty: self.figure.add_trace( go.Scattergl( x=c_a.lamp_time, y=c_a.seg_diff, name='NUVC - NUVA', mode='markers', text=c_a.hover_text, visible=False, marker=dict(color='#1f77b4') ), row=2, col=1 ) trace_number += 1 # Plot shift v time per grating group groups = df.groupby('OPT_ELEM') for i, (grating, group) in enumerate(groups): trace_number += 2 abstime = absolute_time(df=group) group = group.set_index(abstime.to_datetime()) group = group.sort_index() rolling_mean = group.rolling('180D').mean() self.figure.add_trace( go.Scattergl( x=group.index, y=group[self.shift], name=grating, mode='markers', text=group.hover_text, visible=False, marker=dict( cmax=len(df.OPT_ELEM.unique()) - 1, # Individual plots need to be on the same scale cmin=0, color=list(repeat(i, len(group))), colorscale='Viridis', opacity=0.5 ) ), row=3, col=1, ) # Plot a rolling average of the shift value self.figure.add_trace( go.Scattergl( x=rolling_mean.index, y=rolling_mean[self.shift], name=f'{grating} Rolling Mean', mode='lines', visible=False ), row=3, col=1 ) # Plot each set of potential outliers outlier_sets = [b_c_outliers, c_a_outliers] position = [(1, 1), (2, 1)] labels = ['B-C Outliers', 'C-A Outliers'] for outliers, (row, col), label in zip(outlier_sets, position, labels): if outliers is not None and not outliers.empty: self.figure.add_trace( go.Scattergl( x=outliers.lamp_time, y=outliers.seg_diff, name=label, mode='markers', text=outliers.hover_text, visible=False, marker=dict(color='red'), ), row=row, col=col ) trace_number += 1 # Plot outlier points in a different color outliers_main = match_dfs(df, outliers, 'ROOTNAME') outlier_groups = outliers_main.groupby('OPT_ELEM') for grating, group in outlier_groups: trace_number += 1 lamp_time = absolute_time(df=group) self.figure.add_trace( go.Scattergl( x=lamp_time.to_datetime(), y=group[self.shift], name=f'{grating} {label}', mode='markers', text=group.hover_text, visible=False, marker=dict(color='red'), legendgroup=f'{grating} outliers' ), row=3, col=1, ) return trace_number
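# `match_dfs` is also defined elsewhere in the monitor. A plausible sketch,
# assuming it keeps the rows of the first frame whose key values appear in
# the second frame (which is consistent with every call site above):
import pandas as pd


def match_dfs(first: pd.DataFrame, second: pd.DataFrame, key: str) -> pd.DataFrame:
    # Row-subset `first` on membership of its `key` column in `second[key]`.
    return first[first[key].isin(second[key])].copy()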
def test_sort_datetimelike(): # GH10505 # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month df = DataFrame( { 'dt': [ datetime(2011, 7, 1), datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 2, 1), datetime(2011, 1, 1), datetime(2011, 5, 1) ], 'foo': [10, 8, 5, 6, 4, 1, 7], 'bar': [10, 20, 30, 40, 50, 60, 70] }, columns=['dt', 'foo', 'bar']) # ordered=True df['dt'] = Categorical(df['dt'], ordered=True) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1) ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) result_sort.index = CategoricalIndex(index, name='dt', ordered=True) index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1) ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=['foo', 'bar']) result_nosort.index = CategoricalIndex(index, categories=index, name='dt', ordered=True) col = 'dt' assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) # when categories is ordered, group is ordered by category's order assert_frame_equal(result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False df['dt'] = Categorical(df['dt'], ordered=False) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1) ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) result_sort.index = CategoricalIndex(index, name='dt') index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1) ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=['foo', 'bar']) result_nosort.index = CategoricalIndex(index, categories=index, name='dt') col = 'dt' assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) assert_frame_equal(result_nosort, df.groupby(col, sort=False, observed=False).first())
def test_groupby_get_by_index(): # GH 33439 df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]}) res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A") pd.testing.assert_frame_equal(res, expected)
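# For reference, a small sketch (not part of the test suite) showing that
# the lambda above, which looks up each group's final index label, matches
# the built-in "last" aggregation on this data:
import pandas as pd

df = pd.DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
alt = df.groupby("A").agg({"B": "last"})
expected = pd.DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A")
pd.testing.assert_frame_equal(alt, expected)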
def test_basic():
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories,
                                 ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)
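# A small standalone illustration of the observed=False behavior exercised
# above: unobserved categories still appear in the result, with NaN for
# mean-like reductions and 0 for sums/counts.
import pandas as pd

df = pd.DataFrame({"x": [1, 2],
                   "c": pd.Categorical(["a", "b"], categories=["a", "b", "z"])})
print(df.groupby("c", observed=False)["x"].sum())   # "z" -> 0
print(df.groupby("c", observed=False)["x"].mean())  # "z" -> NaN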
def test_func_duplicates_raises(): # GH28426 msg = "Function names" df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) with pytest.raises(SpecificationError, match=msg): df.groupby("A").agg(["min", "min"])
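# A sketch of the usual workaround for the error above: named aggregation
# (pandas >= 0.25) accepts the same function twice because each output
# column gets its own explicit name.
import pandas as pd

df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
result = df.groupby("A").agg(b_min=("B", "min"), b_min_again=("B", "min"))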
digits_spectrals.append(
    SpectralClustering(n_clusters=digits_num_clusters,
                       eigen_solver='arpack',
                       affinity=digits_affin).fit_predict(digits_X))
moons_spectrals.append(
    SpectralClustering(n_clusters=moons_num_clusters,
                       eigen_solver='arpack',
                       affinity=moons_affin).fit_predict(moons_X))
print(
    f"*****************Metrics of Spectral Clustering****************************\n"
    f"Digits model #{i+1}: \n {digits_num_clusters} clusters, {digits_affin} affinity"
    f"\nMoons model #{i+1}: \n {moons_num_clusters} clusters, {moons_affin} affinity"
    f"\n{clustering_metrics(moons_spectrals[i], digits_spectrals[i])}")

df = DataFrame(
    dict(x=moons_X[:, 0], y=moons_X[:, 1], label=moons_spectrals[i]))
colors = {0: 'orange', 1: 'purple'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(
        ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key],
        title=f'Spectral Clustering on Moons, affinity = {moons_affin}')
plt.show()
skplt.metrics.plot_silhouette(
    digits_X, digits_spectrals[i],
    title=f'Spectral Clustering on Digits Silhouette Analysis, '
          f'affinity = {digits_affin}')
def test_agg_index_has_complex_internals(index): # GH 31223 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) result = df.groupby("group").agg({"value": Series.nunique}) expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group") tm.assert_frame_equal(result, expected)
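# An equivalent, more direct spelling of the aggregation above using the
# built-in GroupBy.nunique instead of passing Series.nunique to agg:
import pandas as pd

df = pd.DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]})
result = df.groupby("group")[["value"]].nunique()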
def highly_variable_genes_single_batch_seurat(
        adata,  # AnnData-like object holding natural-log-transformed data
        genes: pd.DataFrame,
        layer=None,
        min_disp=0.5,
        max_disp=np.inf,
        min_mean=0.0125,
        max_mean=3,
        n_top_genes: int = 0,
        n_bins=20,
        flavor='seurat'
) -> None:
    X = adata.layers[layer] if layer is not None else adata.X
    if flavor == 'seurat':
        # the data is assumed to be log-transformed with base e;
        # undo the log to restore the original scale before computing stats
        X = np.expm1(X)
    mean, var = my_get_mean_var(X, axis='gene')
    mean[mean == 0] = 1e-12  # avoid division by zero
    dispersion = var / mean
    if flavor == 'seurat':
        dispersion[dispersion == 0] = np.nan
        dispersion = np.log(dispersion)
        mean = np.log1p(mean)
    genes['dispersions'] = dispersion
    genes['means'] = mean
    genes['vars'] = var

    if flavor == 'seurat':
        genes['mean_bin'] = pd.cut(genes.means, bins=n_bins)
        disp_grouped = genes.groupby('mean_bin')['dispersions']
        single_bin_gene = []

        def find_nan_interval(x):
            # bins holding a single gene get normalized dispersion 1:
            # with std = x.mean() and mean = 0, (x - mean) / std == 1
            if len(x) == 1:
                single_bin_gene.extend(x.index)
                std, mean = x.mean(), 0
            else:
                mean = x.mean()
                std = x.std(ddof=1)
            return (x - mean) / std

        genes['dispersions_norm'] = disp_grouped.transform(find_nan_interval)
        if len(single_bin_gene) > 0:
            print(
                f'Gene indices {single_bin_gene} fell into a single bin: their '
                'normalized dispersion was set to 1.',
                ' Decreasing `n_bins` will likely avoid this effect.'
            )
    if n_top_genes > adata.shape[1]:
        print('`n_top_genes` > `adata.n_var`, returning all genes.')
        genes['highly_variable'] = np.ones(adata.shape[1], dtype=bool)
    elif n_top_genes > 0:
        genes_largest = genes.nlargest(n_top_genes, 'dispersions_norm')
        disp_cut_off = genes_largest['dispersions_norm'].iloc[-1]
        genes['highly_variable'] = np.zeros(adata.shape[1], dtype=bool)
        genes.loc[genes_largest.index, 'highly_variable'] = True
        print(
            f'the {n_top_genes} top genes correspond to a '
            f'normalized dispersion cutoff of {disp_cut_off}'
        )
    else:
        dispersion_norm = genes.dispersions_norm.values.astype('float32')
        dispersion_norm = np.nan_to_num(dispersion_norm)  # similar to Seurat
        gene_subset = np.logical_and.reduce(
            (
                mean > min_mean,
                mean < max_mean,
                dispersion_norm > min_disp,
                dispersion_norm < max_disp,
            )
        )
        genes['highly_variable'] = gene_subset

    sns.scatterplot(data=genes, x="means", y="dispersions",
                    hue="highly_variable", s=7, alpha=0.5)
    plt.savefig('6.jpg')
    plt.cla()
    sns.scatterplot(data=genes, x="means", y="dispersions_norm",
                    hue="highly_variable", s=7, alpha=0.5)
    plt.savefig('7.jpg')
    plt.cla()
    return None
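# `my_get_mean_var` is not defined in this snippet. A minimal sketch,
# assuming it returns per-gene (column-wise) mean and variance and accepts
# either dense arrays or scipy sparse matrices:
import numpy as np
from scipy import sparse


def my_get_mean_var(X, axis='gene'):
    ax = 0 if axis == 'gene' else 1  # genes assumed to be columns
    if sparse.issparse(X):
        mean = np.asarray(X.mean(axis=ax)).ravel()
        mean_sq = np.asarray(X.multiply(X).mean(axis=ax)).ravel()
        var = mean_sq - mean ** 2  # E[X^2] - E[X]^2
    else:
        mean = X.mean(axis=ax)
        var = X.var(axis=ax)
    return mean, var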
def test_missing_raises(self): df = DataFrame({"A": [0, 1], "B": [1, 2]}) with pytest.raises(KeyError, match="Column 'C' does not exist"): df.groupby("A").agg(c=("C", "sum"))
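# The non-raising counterpart, for contrast: the same named-aggregation
# syntax works as long as the referenced column exists.
import pandas as pd

df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
result = df.groupby("A").agg(b_sum=("B", "sum"))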
def test_to_latex_multiindex(self): df = DataFrame({('x', 'y'): ['a']}) result = df.to_latex() expected = r"""\begin{tabular}{ll} \toprule {} & x \\ {} & y \\ \midrule 0 & a \\ \bottomrule \end{tabular} """ assert result == expected result = df.T.to_latex() expected = r"""\begin{tabular}{lll} \toprule & & 0 \\ \midrule x & y & a \\ \bottomrule \end{tabular} """ assert result == expected df = DataFrame.from_dict({ ('c1', 0): pd.Series({x: x for x in range(4)}), ('c1', 1): pd.Series({x: x + 4 for x in range(4)}), ('c2', 0): pd.Series({x: x for x in range(4)}), ('c2', 1): pd.Series({x: x + 4 for x in range(4)}), ('c3', 0): pd.Series({x: x for x in range(4)}), }).T result = df.to_latex() expected = r"""\begin{tabular}{llrrrr} \toprule & & 0 & 1 & 2 & 3 \\ \midrule c1 & 0 & 0 & 1 & 2 & 3 \\ & 1 & 4 & 5 & 6 & 7 \\ c2 & 0 & 0 & 1 & 2 & 3 \\ & 1 & 4 & 5 & 6 & 7 \\ c3 & 0 & 0 & 1 & 2 & 3 \\ \bottomrule \end{tabular} """ assert result == expected # GH 14184 df = df.T df.columns.names = ['a', 'b'] result = df.to_latex() expected = r"""\begin{tabular}{lrrrrr} \toprule a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ b & 0 & 1 & 0 & 1 & 0 \\ \midrule 0 & 0 & 4 & 0 & 4 & 0 \\ 1 & 1 & 5 & 1 & 5 & 1 \\ 2 & 2 & 6 & 2 & 6 & 2 \\ 3 & 3 & 7 & 3 & 7 & 3 \\ \bottomrule \end{tabular} """ assert result == expected # GH 10660 df = pd.DataFrame({ 'a': [0, 0, 1, 1], 'b': list('abab'), 'c': [1, 2, 3, 4] }) result = df.set_index(['a', 'b']).to_latex() expected = r"""\begin{tabular}{llr} \toprule & & c \\ a & b & \\ \midrule 0 & a & 1 \\ & b & 2 \\ 1 & a & 3 \\ & b & 4 \\ \bottomrule \end{tabular} """ assert result == expected result = df.groupby('a').describe().to_latex() expected = r"""\begin{tabular}{lrrrrrrrr} \toprule {} & \multicolumn{8}{l}{c} \\ {} & count & mean & std & min & 25\% & 50\% & 75\% & max \\ a & & & & & & & & \\ \midrule 0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ 1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ \bottomrule \end{tabular} """ assert result == expected
def test_agg_misc(): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") index.name = "date" df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], names=["index", "date"]) r = df.resample("2D") cases = [ r, df_col.resample("2D", on="date"), df_mult.resample("2D", level="date"), df.groupby(pd.Grouper(freq="2D")), ] # passed lambda for t in cases: result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) expected = pd.concat([r["A"].sum(), rcustom], axis=1) tm.assert_frame_equal(result, expected, check_like=True) # agg with renamers expected = pd.concat( [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1) expected.columns = pd.MultiIndex.from_tuples([("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")]) msg = r"Column\(s\) \['result1', 'result2'\] do not exist" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean}) # agg with different hows expected = pd.concat( [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")]) for t in cases: result = t.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) tm.assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not for t in cases: result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) tm.assert_frame_equal(result, expected, check_like=True) msg = "nested renamer is not supported" # series like aggs for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t["A"].agg({"A": ["sum", "std"]}) with pytest.raises(pd.core.base.SpecificationError, match=msg): t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) # errors # invalid names in the agg specification msg = "\"Column 'B' does not exist!\"" for t in cases: with pytest.raises(KeyError, match=msg): t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})