def test_gb_categoricals(self):
    """Grouped sums must agree whether the key is an enum-backed Categorical,
    a string-list Categorical, or a multikey combination of both orderings.

    Fix over previous version: every failure message now reports the expected
    and actual arrays (three of the four messages previously carried an `f`
    prefix with no placeholders and no diagnostics).
    """
    codes = [1, 44, 44, 133, 75, 75, 75, 1]
    stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
    c1 = Categorical(codes, LikertDecision, sort_gb=True)
    c2 = Categorical(stringlist)
    d = {'nums': np.arange(8)}

    # from enum only
    d_enum = d.copy()
    d_enum['cat_from_enum'] = c1
    ds_enum = Dataset(d_enum)
    enum_result = ds_enum.gb('cat_from_enum').sum()
    correct = FastArray([3, 15, 3, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, enum_result.nums),
        msg=f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
    )

    # from list only
    d_list = d.copy()
    d_list['cat_from_list'] = c2
    ds_list = Dataset(d_list)
    list_result = ds_list.gb('cat_from_list').sum()
    correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, list_result.nums),
        msg=f"Incorrect sum when grouping by list categorical.\nExpected {correct}\nActual {list_result.nums}",
    )

    # multikey: both categoricals together, in each key order
    d_both = d_enum.copy()
    d_both['cat_from_list'] = c2
    ds_both = Dataset(d_both)

    # by enum, list
    result = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum()
    num_result = result.nums
    correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, num_result),
        msg=f"Incorrect sum when grouping by enum, list categoricals.\nExpected {correct}\nActual {num_result}",
    )

    # by list, enum
    result = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum()
    num_result = result.nums
    correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, num_result),
        msg=f"Incorrect sum when grouping by list, enum categoricals.\nExpected {correct}\nActual {num_result}",
    )
def test_nan_funcs(self):
    """Grouped nansum must ignore NaNs in float32/float64 columns.

    Fix over previous version: ``GroupBy.include_keys`` is class-level state;
    the reset to ``False`` now lives in a ``finally`` so a failing assertion
    cannot leak ``include_keys = True`` into subsequent tests.
    """
    GroupBy.include_keys = True
    try:
        nan_dict = {
            'f32': np.arange(5, dtype=np.float32),
            'f64': np.arange(5, dtype=np.float64),
        }
        # poison the first element of each column
        nan_dict['f32'][0] = np.nan
        nan_dict['f64'][0] = np.nan
        nan_ds = Dataset(nan_dict)

        # TODO: add when more nan functions have been implemented
        # each group is a single row, so nansum returns the row's value,
        # except the NaN row which sums to 0.0
        correct = np.array([1.0, 2.0, 3.0, 4.0, 0.0])
        nan_funcs = ['nansum']
        col_names = ['f32', 'f64']
        for c_name in col_names:
            gb = nan_ds.gb(c_name)
            for f_name in nan_funcs:
                result_ds = getattr(gb, f_name)()
                for result_name in col_names:
                    # skip the groupby key column itself
                    if c_name == result_name:
                        continue
                    column = getattr(result_ds, result_name)
                    self.assertTrue(
                        bool(np.all(column == correct)),
                        msg=f"Incorrect result for groupby {c_name} in column {result_name} for {f_name} operation.",
                    )
    finally:
        GroupBy.include_keys = False
def test_shift(self):
    """Grouped shift must match pandas ``groupby().shift()`` for both a
    forward (negative) and backward (positive) period.

    Fix over previous version: the two copy-pasted stanzas (periods=-2 and
    periods=3) are consolidated into one loop, and failure messages identify
    the period and column.
    """
    import pandas as pd

    ds = Dataset({'col_' + str(i): np.random.rand(30) for i in range(5)})
    ds.keycol = np.random.choice(['a', 'b', 'c'], 30)
    df = pd.DataFrame(ds.asdict())

    for periods in (-2, 3):
        # trim() drops the invalid edge rows, mirroring pandas' dropna
        rt_result = ds.gb('keycol').shift(periods=periods).trim()
        pd_result = df.groupby('keycol').shift(periods=periods).dropna(axis='rows')
        for k, v in rt_result.items():
            self.assertTrue(
                bool(np.all(v == pd_result[k])),
                msg=f"shift(periods={periods}) mismatch in column {k}",
            )
def test_pad(self):
    """Grouped pad/backfill fill NaNs only from rows of the same symbol;
    get_group extracts exactly one symbol's rows."""
    size = 100
    bucket_count = 20
    ds = Dataset({'time': arange(size * 1.0)})
    ds.data = np.random.randint(bucket_count, size=size)
    ds.data2 = np.random.randint(bucket_count, size=size)
    symbols = ['ZYGO', 'YHOO', 'FB', 'GOOG', 'IBM']
    ds.symbol2 = Cat(1 + ds.data, list('ABCDEFGHIJKLMNOPQRST'))
    # symbols repeat cyclically, so rows 2 and 7 share a symbol
    ds.symbol = Cat(1 + arange(size) % len(symbols), symbols)
    # knock out three time values
    ds.time[[3, 4, 7]] = nan

    padded = ds.gb('symbol').pad()
    # row 7 forward-fills from row 2 (same group)
    self.assertTrue(padded.time[7] == 2.00)
    # row 3 has no earlier value in its group; NaN != NaN detects it
    self.assertTrue(padded.time[3] != padded.time[3])

    filled_back = ds.gb('symbol').backfill()
    # row 7 backfills from row 12 (same group)
    self.assertTrue(filled_back.time[7] == 12.00)

    # pull a single group and confirm it is homogeneous
    one_group = ds.gb('symbol').get_group('YHOO')
    self.assertTrue(np.all(one_group.symbol == 'YHOO'))
def test_iter(self):
    """Iterating a grouped Dataset yields (key, subset) pairs whose keys and
    member row indices match the expected grouping."""
    expected_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
    expected_rows = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]
    str_arr = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])
    grouped = Dataset({'keycol': str_arr, 'idxcol': arange(10)}).gb('keycol')
    for i, (key, subset) in enumerate(grouped):
        self.assertEqual(key, expected_keys[i])
        self.assertTrue(bool(np.all(subset.idxcol == expected_rows[i])))
def test_transform(self):
    """transform=True must produce per-row results identical between the
    gb() path and the Categorical reduce path; grouped diff must match
    apply_nonreduce with FastArray.diff."""
    arrsize = 200
    numrows = 7
    ds = Dataset({'time': arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
    ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)

    via_gb = ds.gb('symbol')['data'].sum(transform=True)
    # NOTE: gbkeys are not returned in transform mode, so the symbol
    # column itself is not compared here (assertion removed previously).
    via_cat = ds.symbol.sum(ds.data, transform=True)
    self.assertTrue(np.all(via_gb[0] == via_cat[0]))

    # showfilter must not alter transformed values
    via_cat = ds.symbol.sum(ds.data, showfilter=True, transform=True)
    self.assertTrue(np.all(via_gb[0] == via_cat[0]))

    # grouped diff: built-in vs explicit apply_nonreduce
    by_apply = ds.gb('symbol').apply_nonreduce(TypeRegister.FastArray.diff)
    by_diff = ds.gb('symbol').diff()
    self.assertTrue(by_apply.equals(by_diff))
def test_cumcount_vs_gb(self):
    """Categorical.cumcount must match GroupBy.cumcount, and a filter must
    leave filtered-out rows as NaN.

    Fix over previous version: bare ``assert`` statements (silently stripped
    under ``python -O``) replaced with unittest assertions, consistent with
    the rest of this suite.
    """
    arr = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
    ds = Dataset({'keycol': arr, 'col1': arange(50), 'col2': arange(50)})
    gb_result = ds.gb('keycol').cumcount()
    c = Categorical(ds.keycol)
    c_result = c.cumcount()
    # identical results -> elementwise difference sums to zero
    rdiff = gb_result - c_result
    self.assertEqual(sum(rdiff), 0)

    # with a filter: odd rows participate, even rows are excluded
    f = logical(arange(50) % 2)
    c_result = c.cumcount(filter=f)
    self.assertTrue(bool(np.all(isnotnan(c_result[f]))))
    self.assertTrue(bool(np.all(isnan(c_result[~f]))))
def test_diff(self):
    """rolling_diff on a grouped Dataset must agree with pandas
    ``groupby().diff()`` both in NaN placement and in which rows are valid."""
    import pandas as pd

    ds = Dataset({'col_' + str(i): np.random.rand(10) for i in range(5)})
    ds.keycol = np.random.choice(['a', 'b', 'c'], 10)
    df = pd.DataFrame(ds.asdict())

    rt_result = ds.gb('keycol').rolling_diff()
    pd_result = df.groupby('keycol').diff()
    for k, v in rt_result.items():
        pdc = pd_result[k]
        # NaN positions must line up exactly
        self.assertTrue(bool(np.all(isnan(v) == isnan(pdc))), msg=f'{v} {pdc}')
        # and the valid-value masks must therefore agree as well
        self.assertTrue(bool(np.all(isnotnan(pdc) == isnotnan(v))))
49.55, 65.82, 85.28, 61.68, 72.85, 91.71, 61.12, ]) tens = FastArray([10] * 30) ds = Dataset({ 'strings': str_fa.copy(), 'ints': int_fa, 'floats': flt_fa, 'tens': tens }) gb = ds.gb('strings') ds_nums = Dataset({'ints': int_fa, 'floats': flt_fa, 'tens': tens}) data_to_compare = ['ints', 'floats', 'tens'] gbu = ds.gbu('strings') gb_funcs_L1 = [ 'sum', 'mean', 'min', 'max', 'var', 'std', 'nansum', 'nanmean', 'nanmin', 'nanmax',