def test_nan_funcs(self):
    GroupBy.include_keys = True
    nan_dict = {
        'f32': np.arange(5, dtype=np.float32),
        'f64': np.arange(5, dtype=np.float64),
    }
    nan_dict['f32'][0] = np.nan
    nan_dict['f64'][0] = np.nan
    nan_ds = Dataset(nan_dict)

    # TODO: add when more nan functions have been implemented
    correct = np.array([1.0, 2.0, 3.0, 4.0, 0.0])
    nan_funcs = ['nansum']
    col_names = ['f32', 'f64']

    for c_name in col_names:
        gb = nan_ds.gb(c_name)
        for f_name in nan_funcs:
            call = getattr(gb, f_name)
            result_ds = call()
            for result_name in col_names:
                column = getattr(result_ds, result_name)
                # skip groupby column
                if c_name != result_name:
                    is_correct = bool(np.all(column == correct))
                    self.assertTrue(
                        is_correct,
                        msg=f"Incorrect result for groupby {c_name} in column {result_name} for {f_name} operation.",
                    )
    GroupBy.include_keys = False
def get_basic_dataset(keyfield_value=None, nr=10):
    ds = Dataset(
        {_k: list(range(_i * nr, (_i + 1) * nr)) for _i, _k in enumerate('abcdefghijklmnop')}
    )
    if keyfield_value is not None:
        ds.keyfield = keyfield_value
    return ds
def test_iter(self):
    correct_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
    correct_idx = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]
    str_arr = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])

    gb = Dataset({'keycol': str_arr, 'idxcol': arange(10)})
    gb = gb.gb('keycol')

    for i, tup in enumerate(gb):
        self.assertEqual(tup[0], correct_keys[i])
        self.assertTrue(bool(np.all(tup[1].idxcol == correct_idx[i])))
def test_roundtrip_rt_pa_rt(self, rt_dset: rt.Dataset) -> None:
    """Test round-tripping from rt.Dataset to pyarrow.Table and back."""
    result_pa_tbl = rt_dset.to_arrow()
    result_rt_dset = rt.Dataset.from_arrow(result_pa_tbl, zero_copy_only=False)

    assert rt_dset.keys() == result_rt_dset.keys()
    for col_name in rt_dset.keys():
        # relaxed_cat_check=True, because we're not trying to test specific details of Categorical
        # conversion here; we're more interested in the dataset-level stuff.
        assert_array_or_cat_equal(rt_dset[col_name], result_rt_dset[col_name], relaxed_cat_check=True)
def test_nobins(self):
    """
    Tests that Categorical.median() works correctly when there are no bins with data
    because the Categorical has been created with a pre-filter which has filtered out
    all data in the Dataset.
    """
    data = Dataset()
    data.Group = np.random.randint(0, 10, 100_000)
    data.Values = np.random.randint(0, 10, 100_000)
    x = data.cat('Group', filter=data.Group < 0)
    x.median(data.Values)
def test_gb_categoricals(self):
    codes = [1, 44, 44, 133, 75, 75, 75, 1]
    stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
    c1 = Categorical(codes, LikertDecision, sort_gb=True)
    c2 = Categorical(stringlist)
    d = {'nums': np.arange(8)}

    # from enum only
    d_enum = d.copy()
    d_enum['cat_from_enum'] = c1
    ds_enum = Dataset(d_enum)
    enum_result = ds_enum.gb('cat_from_enum').sum()
    correct = FastArray([3, 15, 3, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, enum_result.nums),
        msg=f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
    )

    # from list only
    d_list = d.copy()
    d_list['cat_from_list'] = c2
    ds_list = Dataset(d_list)
    list_result = ds_list.gb('cat_from_list').sum()
    correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, list_result.nums),
        msg="Incorrect sum when grouping by list categorical.",
    )

    d_both = d_enum.copy()
    d_both['cat_from_list'] = c2
    ds_both = Dataset(d_both)

    # by enum, list
    result = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum()
    num_result = result.nums
    correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, num_result),
        msg="Incorrect sum when grouping by enum, list categoricals.",
    )

    # by list, enum
    result = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum()
    num_result = result.nums
    correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, num_result),
        msg="Incorrect sum when grouping by list, enum categoricals.",
    )
def test_ops(self):
    ds = Dataset({
        'test': arange(300000) % 3,
        'test2': arange(300000.0),
        'test2i': arange(300000),
        'test3': arange(300000) % 3,
    })
    gb = ds.groupby('test')

    result = gb.mean()
    self.assertTrue(result.test2[0] == result.test2i[0])
    self.assertTrue(result.test2[1] == result.test2i[1])
    self.assertTrue(result.test3[1] == 1.0)

    result = gb.median()
    result = gb.trimbr()
    result = gb.nanmedian()
def test_cumcount_vs_gb(self):
    arr = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
    ds = Dataset({'keycol': arr, 'col1': arange(50), 'col2': arange(50)})
    gb_result = ds.gb('keycol').cumcount()

    c = Categorical(ds.keycol)
    c_result = c.cumcount()

    rdiff = gb_result - c_result
    assert sum(rdiff) == 0

    f = logical(arange(50) % 2)
    c_result = c.cumcount(filter=f)
    assert bool(np.all(isnotnan(c_result[f])))
    assert bool(np.all(isnan(c_result[~f])))
def test_gb_labels_enum(self):
    # make sure enum groupby keys are displayed as string, not integer code
    c = Categorical([10, 10, 10, 20, 30, 20, 10, 20, 20], {'a': 30, 'b': 20, 'c': 10})
    c_result = c.count()
    c_labels = c_result[c_result.label_get_names()][0]

    ds = Dataset({'catcol': c, 'data': arange(9)})
    ds_result = ds.gbu('catcol').count()
    ds_labels = ds_result[ds_result.label_get_names()][0]

    assert c_labels.dtype.char == ds_labels.dtype.char
    assert bool(np.all(c_labels == ds_labels))
def test_specify_gb_data(self):
    str_col = ['a', 'a', 'b', 'c', 'a']
    num_col = [10, 10, 20, 30, 10]
    col1 = np.arange(5)
    col2 = np.arange(5)
    small_ds = Dataset({'str_col': str_col, 'num_col': num_col, 'col1': col1, 'col2': col2})
    ds_to_operate_on = small_ds[['col1', 'col2']]
    c = Categorical(str_col)

    # dataset
    d = c.sum(ds_to_operate_on)

    # single
    # list
    d = c.sum([col1, col2])
    # tuple
    d = c.sum((col1, col2))
    # dict
    d = c.sum({'a': col1, 'b': col2})

    # multiple
    d = c.sum(col1, col2)
def test_groupby_ops_multikey_dict(self):
    mk_dict = {'string1': str_fa, 'string2': str_fa}
    mk_gb = Dataset({
        'string1': str_fa.copy(),
        'string2': str_fa.copy(),
        'ints': int_fa,
        'floats': flt_fa,
        'tens': tens,
    }).gbu(['string1', 'string2'])
    c = Categorical(mk_dict)
    self.funnel_all_tests(c, mk_gb, "multikey dictionary", sorted=False)

    # setitem hits comparison functions - need to rewrite these tests after comparison behavior change
    # self.mk_set_item(mk_dict, constructor_name="multikey dictionary")

    # conflicting names
    x = str_fa.copy()
    y = str_fa.copy()
    z = str_fa.copy()
    x.set_name('strings')
    y.set_name('strings')
    z.set_name('strings1')
    c = Categorical([x, y, z])
    assert (
        c._categories_wrap.ncols == 3
    ), f"incorrect number of columns for multikey from list. {c._categories_wrap.ncols} vs. 3"
    # 04/25/2019 all default column names now happen in grouping object
    assert list(c.categories().keys()) == [
        'strings',
        GROUPBY_KEY_PREFIX + '_c1',
        'strings1',
    ], f"column names did not match for multikey from list. {list(c.categories().keys())} vs. ['strings','strings2','strings1']"
def test_as_matrix_metadata(self):
    error_tol = 0.00001
    ds = Dataset({
        'A': ['EXCH1', 'EXCH2', 'EXCH1', 'EXCH3', 'EXCH3'],
        'B': [-1.6, 2.7, 4.6, 5.7, 8.9],
        'C': Categorical([0, 0, 1, 0, 2], ['CPTYA', 'CPTYB', 'CPTYC']),
    })
    X, X_data = dataset_as_matrix(ds)
    self.assertIsInstance(X, numpy.ndarray)
    self.assertEqual(X.shape[0], ds.shape[0])
    self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later
    self.assertEqual(X_data['A']['dtype'], ds.A.dtype)
    self.assertEqual(X_data['B']['dtype'], ds.B.dtype)
    self.assertEqual(X_data['C']['dtype'], ds.C.dtype)
    self.assertEqual(X_data['A']['is_categorical'], False)
    self.assertEqual(X_data['B']['is_categorical'], False)
    self.assertEqual(X_data['C']['is_categorical'], True)
    self.assertTrue(
        (numpy.abs(X[:, 0] - numpy.array([0., 1., 0., 2., 2.])) < error_tol).all(),
        msg=f"got {X[:, 0]}",
    )
    self.assertTrue(
        (numpy.abs(X[:, 2] - numpy.array([0, 0, 1, 0, 2])) < error_tol).all(),
        msg=f"got {X[:, 2]}",
    )
    self.assertTrue(
        (X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])] == ds.A).all(),
        msg=f"X_data {X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])]}\nds.A {ds.A}",
    )
def test_dset_dict_to_list(self):
    ds = Dataset(
        {_k: list(range(_i * 10, (_i + 1) * 10)) for _i, _k in enumerate('abcdefghijklmnop')}
    )
    ds0 = ds[:3].copy()
    ds1 = ds[6:9].copy()
    ds2 = ds[11:15].copy()

    dd = {'one': ds0, 'two': ds1, 'μεαν': ds2}
    with self.assertRaises(ValueError):
        _ = dset_dict_to_list(dd, 'keyfield')

    dd = {'one': ds0, 'two': ds1, 3: ds2}
    with self.assertRaises(ValueError):
        _ = dset_dict_to_list(dd, 'keyfield')

    dd = {'one': ds0, 'two': ds1, 'three': ds2}
    with self.assertRaises(ValueError):
        _ = dset_dict_to_list(dd, 'a')

    lst1 = dset_dict_to_list(dd, 'keyfield')
    self.assertEqual(id(ds0), id(lst1[0]))
    self.assertEqual(id(ds1), id(lst1[1]))
    self.assertEqual(id(ds2), id(lst1[2]))
    self.assertEqual(list(ds0.keys()), ['a', 'b', 'c', 'keyfield'])
    self.assertTrue((ds0.a == list(range(10))).all())
    self.assertTrue((ds0.keyfield == 'one').all())

    lst2 = dset_dict_to_list(dd, 'a', allow_overwrite=True)
    self.assertEqual(id(ds0), id(lst1[0]))
    self.assertEqual(list(ds0.keys()), ['a', 'b', 'c', 'keyfield'])
    self.assertTrue((ds0.a == 'one').all())
    self.assertTrue((ds0.b == list(range(10, 20))).all())
    self.assertTrue((ds0.keyfield == 'one').all())
def read_dset_from_np(outdir: str, fname: str, mmap: bool = False) -> Dataset:
    """
    Read columns stored as numpy binary files into a Dataset.

    Parameters
    ----------
    outdir : str
        The path to the folder containing the dataset subdirectory.
    fname : str
        The name of the subdirectory containing the columns of the dataset.
    mmap : bool
        Set to True for memory mapping. Note this allows quick loading,
        but has some latency cost elsewhere.

    Returns
    -------
    Dataset
        The dataset read in from the specified folder.

    See Also
    --------
    write_dset_to_np
    """
    mmap_mode = None
    if mmap:
        mmap_mode = 'r'
    fname = os.path.join(outdir, fname)
    fname = os.path.join(fname, 'columns')
    col_dict = dict()
    col_names = os.listdir(path2platform(fname))
    for i in range(len(col_names)):
        fname_col = path2platform(os.path.join(fname, col_names[i]))
        curr_col_name = col_names[i].replace('.npy', '')
        col_dict[curr_col_name] = np.load(fname_col, mmap_mode=mmap_mode)
    return Dataset(col_dict)
def test_dataset_as_pandas_df_warn(self):
    ds = Dataset({'a': [1, 2, 3]})
    if pd_version >= pd_ver_0_24:
        with self.assertWarns(FutureWarning):
            df = dataset_as_pandas_df(ds)
    else:
        df = dataset_as_pandas_df(ds)
def test_total_sizes(self):
    st = Struct({
        'a': Dataset({
            # 10x int32 => 40B
            'A': range(10),
            # 10x int32 => 40B
            'B': range(10, 20),
        }),
        'b': Struct({
            # 1x int32 => 4B
            'C': 0,
            # 1x int32 => 4B
            'D': 1,
            # 1x int32 => 4B
            'E': 2,
        }),
        # 5x int32 => 20B
        'c': FastArray(np.arange(5)),
        # 5x int32 => 20B
        'd': np.arange(5, 10),
        # ???
        'e': ['abc', 'def', 'ghi'],
        'f': {
            # 1x int32 => 4B
            'q': 1,
            # 1x int32 => 4B
            'r': 2,
        },
        # 1x float64 => 8B
        'g': 3.14,
        # 1x int32 => 4B
        'h': 84,
        # ???
        'i': None,
        # ???
        'j': slice(None),
    })

    # Create some duplicated/aliased data within the struct.
    st.z = st.c

    # Calculate the sizes of the Struct's data in bytes.
    (physical, logical) = st.total_sizes

    # For now, we only check that the logical size is larger than the physical size
    # (due to the presence of aliased array(s) somewhere within the Struct).
    # TODO: Strengthen this test by checking the actual computed sizes to make sure they're correct.
    self.assertLess(physical, logical, "The physical size is not less than the logical size.")
def test_as_categorical(self):
    ds = Dataset({
        'keycol1': np.random.choice(['a', 'b', 'c'], 30),
        'keycol2': np.random.choice(['a', 'b', 'c'], 30),
        'data': np.random.rand(30),
    })

    gbu = ds.gbu('keycol1')
    c = Categorical(ds.keycol1, ordered=False, sort_gb=False)
    cgbu = gbu.as_categorical()

    gbu_result = gbu.sum()
    c_result = c.sum(ds.data)
    cgbu_result = cgbu.sum(ds.data)

    for name, col in gbu_result.items():
        assert bool(np.all(c_result[name] == col))
        assert bool(np.all(cgbu_result[name] == col))
def test_as_matrix(self):
    error_tol = 0.00001
    ds = Dataset({'A': [1.2, 3.1, 9.6], 'B': [-1.6, 2.7, 4.6]})
    X, _ = dataset_as_matrix(ds)
    self.assertIsInstance(X, numpy.ndarray)
    self.assertEqual(X.shape[0], ds.shape[0])
    self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later
    self.assertTrue((numpy.abs(ds.A._np - X[:, 0]) < error_tol).all())
    self.assertTrue((numpy.abs(ds.B._np - X[:, 1]) < error_tol).all())
def test_pad(self):
    arrsize = 100
    numrows = 20
    ds = Dataset({'time': arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    symbols = ['ZYGO', 'YHOO', 'FB', 'GOOG', 'IBM']
    ds.symbol2 = Cat(1 + ds.data, list('ABCDEFGHIJKLMNOPQRST'))
    ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
    ds.time[[3, 4, 7]] = nan

    newds = ds.gb('symbol').pad()
    self.assertTrue(newds.time[7] == 2.00)
    self.assertTrue(newds.time[3] != newds.time[3])

    newds = ds.gb('symbol').backfill()
    self.assertTrue(newds.time[7] == 12.00)

    # see if we can pull a group
    newds = ds.gb('symbol').get_group('YHOO')
    self.assertTrue(np.all(newds.symbol == 'YHOO'))
def test_diff(self):
    import pandas as pd

    ds = Dataset({'col_' + str(i): np.random.rand(10) for i in range(5)})
    ds.keycol = np.random.choice(['a', 'b', 'c'], 10)
    df = pd.DataFrame(ds.asdict())

    rt_result = ds.gb('keycol').rolling_diff()
    pd_result = df.groupby('keycol').diff()
    for k, v in rt_result.items():
        pdc = pd_result[k]
        pdcnan = isnan(pdc)
        self.assertTrue(bool(np.all(isnan(v) == pdcnan)), msg=f'{v} {pdc}')
        masked_valid_pd = isnotnan(pdc)
        masked_valid_rt = isnotnan(v)
        self.assertTrue(bool(np.all(masked_valid_pd == masked_valid_rt)))
def test_groupby_ops_multikey_list(self):
    mk_list = [str_fa.copy(), str_fa.copy()]
    mk_gb = Dataset({
        'string1': str_fa.copy(),
        'string2': str_fa.copy(),
        'ints': int_fa,
        'floats': flt_fa,
        'tens': tens,
    }).gbu(['string1', 'string2'])
    c = Categorical(mk_list)
    self.funnel_all_tests(c, mk_gb, "multikey list", sorted=False)
def test_as_matrix_int(self):
    error_tol = 0.00001
    ds = Dataset(
        {_k: list(range(_i * 10, (_i + 1) * 10)) for _i, _k in enumerate('ABCDEFGHIJKLMNOP')}
    )
    X, _ = dataset_as_matrix(ds)
    self.assertIsInstance(X, numpy.ndarray)
    self.assertEqual(X.shape[0], ds.shape[0])
    self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later
    self.assertTrue((numpy.abs(ds.A._np - X[:, 0]) < error_tol).all())
    self.assertTrue((numpy.abs(ds.B._np - X[:, 1]) < error_tol).all())
def test_append_dataset_dict(self):
    ds = Dataset(
        {_k: list(range(_i * 10, (_i + 1) * 10)) for _i, _k in enumerate('abcdefghijklmnop')}
    )
    ds0 = ds[:3].copy()
    ds1 = ds[6:9].copy()
    ds2 = ds[11:15].copy()
    dd = {'one': ds0, 'two': ds1, 'three': ds2}

    ds = append_dataset_dict(dd, 'keyfield')
    ucols = set()
    for _d in dd.values():
        ucols.update(_d)
    self.assertEqual(set(ds.keys()), ucols)
    self.assertEqual(ds.get_nrows(), sum(_d.get_nrows() for _d in dd.values()))

    keyfield = []
    for _d in dd.values():
        keyfield.extend(_d.keyfield)
    self.assertTrue((ds.keyfield == keyfield).all())
    self.assertTrue((ds.a[:10] == range(10)).all())
    self.assertTrue((ds.g[10:20] == range(60, 70)).all())
    self.assertTrue((ds.l[20:30] == range(110, 120)).all())
def test_add_dataset(self):
    arrsize = 200
    numrows = 7
    ds = Dataset({'time': np.arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    symbols = [
        'AAPL', 'AMZN', 'FB', 'GOOG', 'IBM',
        '6', '7', '8', '9', '10',
        '11', '12', '13', '14', '15',
        '16', '17', '18',
    ]
    symbol2 = ['A', 'X', 'P', 'C', 'D', 'E', 'F', 'G', 'G', 'I', 'J', 'K']
    ds.symbol2 = Cat(1 + np.arange(arrsize) % len(symbol2), symbol2)
    ds.symbol = Cat(1 + np.arange(arrsize) % len(symbols), symbols)

    x = ds.copy()
    del x.symbol
    del x.data
    del x.time
    x.label_set_names('data2')
    # now x has two columns, and one is labelled, so adding an entire dataset should just add x.symbol2
    ds.junk = x
def test_transform(self):
    arrsize = 200
    numrows = 7
    ds = Dataset({'time': arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
    ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
    newds = ds.gb('symbol')['data'].sum(transform=True)

    # removed from test since gbkeys not returned in transform
    # self.assertTrue(np.all(newds.symbol == ds.symbol))
    catds = ds.symbol.sum(ds.data, transform=True)
    self.assertTrue(np.all(newds[0] == catds[0]))

    # test showfilter
    catds = ds.symbol.sum(ds.data, showfilter=True, transform=True)
    self.assertTrue(np.all(newds[0] == catds[0]))

    # test diff
    result1 = ds.gb('symbol').apply_nonreduce(TypeRegister.FastArray.diff)
    result2 = ds.gb('symbol').diff()
    self.assertTrue(result1.equals(result2))
def test_shift(self):
    import pandas as pd

    ds = Dataset({'col_' + str(i): np.random.rand(30) for i in range(5)})
    ds.keycol = np.random.choice(['a', 'b', 'c'], 30)
    df = pd.DataFrame(ds.asdict())

    rt_result = ds.gb('keycol').shift(periods=-2).trim()
    pd_result = df.groupby('keycol').shift(periods=-2).dropna(axis='rows')
    for k, v in rt_result.items():
        self.assertTrue(bool(np.all(v == pd_result[k])))

    rt_result = ds.gb('keycol').shift(periods=3).trim()
    pd_result = df.groupby('keycol').shift(periods=3).dropna(axis='rows')
    for k, v in rt_result.items():
        self.assertTrue(bool(np.all(v == pd_result[k])))
def write_dset_to_np(ds: Dataset, outdir: str, fname: str) -> None:
    """
    Write the columns of a dataset to numpy binary files, one file per column in the specified directory.

    Parameters
    ----------
    ds : Dataset
        A Dataset to write out to disk.
    outdir : str
        The path to the folder where the output will be written.
    fname : str
        The name of the subdirectory to store the columns.

    See Also
    --------
    read_dset_from_np
    """
    os.makedirs(os.path.join(outdir, fname))
    fname = os.path.join(outdir, fname)
    fname = os.path.join(fname, 'columns')
    os.makedirs(path2platform(fname))
    for name, value in ds.items():
        fname_col = os.path.join(fname, str(name))
        np.save(path2platform(fname_col), value)
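# Hedged usage sketch for the two helpers above: a hypothetical round-trip check,
# not part of the original module. The function name `_example_np_roundtrip` and the
# temporary-directory handling are assumptions for illustration only.
def _example_np_roundtrip() -> None:
    import tempfile

    tmp_root = tempfile.mkdtemp()  # illustrative scratch location
    small_ds = Dataset({'a': np.arange(5), 'b': np.arange(5.0)})

    # write each column to <tmp_root>/example_dset/columns/<col>.npy, then read it back
    write_dset_to_np(small_ds, tmp_root, 'example_dset')
    roundtrip = read_dset_from_np(tmp_root, 'example_dset')

    # every column written out should come back unchanged
    assert (roundtrip.a == small_ds.a).all()
    assert (roundtrip.b == small_ds.b).all()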
def test_reductions(self):
    message_types = [
        'CREATE', 'RUN', 'CREATE', 'RUN', 'RUN', 'RUN',
        'RUN', 'CANCEL', 'RUN', 'RUN', 'RUN', 'CANCEL',
    ]
    order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
    seconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
    shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
    d2 = dict(
        message_type=message_types,
        order_id=order_ids,
        second=seconds,
        shares=shares,
    )
    dat = Dataset(d2)
    dat = dat[['order_id', 'message_type', 'second', 'shares']]

    # Numeric reduction
    dsr = dat.groupby('order_id').sum()
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.order_id, [1, 2])
    self.assertArrayEqual(dsr.second, [410, 676])
    self.assertArrayEqual(dsr.shares, [800, 1100])

    # Numeric reduction with all columns returned
    dsr = dat.groupby('order_id', return_all=True).sum()
    self.assertEqual(dsr.shape, (2, 4))
    self.assertEqual(dsr.keys()[1], 'message_type')

    # Order-based reduction
    dsr = dat.groupby('order_id').first()
    self.assertEqual(dsr.shape, (2, 4))
    self.assertArrayEqual(dsr.order_id, [1, 2])
    self.assertArrayEqual(dsr.message_type, ['CREATE', 'CREATE'])
    self.assertArrayEqual(dsr.second, [50, 72])
    self.assertArrayEqual(dsr.shares, [0, 0])

    # Order-based reduction, which returns all columns regardless
    dsr = dat.groupby('order_id', return_all=True).first()
    self.assertEqual(dsr.shape, (2, 4))

    # Order-based reduction with multiple keys
    dsr = dat.groupby(['order_id', 'message_type']).first()
    self.assertEqual(dsr.shape, (6, 4))
    self.assertArrayEqual(dsr.order_id, [1, 1, 1, 2, 2, 2])
    self.assertArrayEqual(dsr.message_type, ['CANCEL', 'CREATE', 'RUN', 'CANCEL', 'CREATE', 'RUN'])
    self.assertArrayEqual(dsr.second, [120, 50, 70, 97, 72, 90])
    self.assertArrayEqual(dsr.shares, [0, 0, 200, 0, 0, 100])

    # On a subset of columns
    gb = dat.groupby('order_id')
    dsr = gb['shares'].sum()
    self.assertEqual(dsr.shape, (2, 2))
    self.assertArrayEqual(dsr.shares, [800, 1100])

    # Accumulating function
    dsr = dat.groupby('order_id').cumsum()
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(dsr.shares, [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])

    # return_all has no effect with accumulating functions
    # 8/23/2018 SJK - changed behavior so return all shows the keys
    dsr = dat.groupby('order_id', return_all=True).cumsum()
    self.assertEqual(dsr.shape, (12, 3))

    # Add cum_shares back to a dataset
    dat['cum_shares'] = dat.groupby('order_id').shares.cumsum().shares
    self.assertEqual(dat.shape, (12, 5))
    self.assertArrayEqual(dat.cum_shares, gb.shares.cumsum().shares)

    # On a subset of columns
    dsr = dat.groupby('order_id')[['shares', 'second']].cumsum()
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(dsr.shares, [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])
    self.assertArrayEqual(dsr.second, [50, 120, 72, 195, 162, 250, 290, 347, 445, 560, 676, 410])

    # On a subset of columns with a filter
    f = FastArray([
        True, False, True, False, True, False,
        True, False, True, False, True, False,
    ])
    dsr = dat.groupby('order_id')[['shares', 'second']].cumsum(filter=f)
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(dsr.shares, [0, 0, 0, 0, 100, 100, 100, 100, 400, 400, 550, 100])
    self.assertArrayEqual(dsr.second, [50, 50, 72, 50, 162, 162, 145, 162, 260, 260, 376, 145])

    # On shares and second with filter at groupby construction
    dsr = dat.groupby('order_id', filter=f)[['shares', 'second']].cumsum()
    inv = INVALID_DICT[dsr.shares[0].dtype.num]
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(dsr.shares, [0, inv, 0, inv, 100, inv, 100, inv, 400, inv, 550, inv])
    self.assertArrayEqual(dsr.second, [50, inv, 72, inv, 162, inv, 145, inv, 260, inv, 376, inv])

    # Using agg function
    dsr = gb[['second', 'shares']].agg(['sum', 'mean'])
    self.assertEqual(dsr.shape, (2, 2))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Sum.shares, [800, 1100])
    self.assertArrayAlmostEqual(dsr.Mean.second, [82.00, 96.57], places=2)
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Check for issue when bracket indexing on groupby
    f = open(os.devnull, 'w')
    print(gb, file=f)
    f.close()
    dsr = gb[['second', 'shares']].agg(['sum', 'mean'])

    # Using different functions on different columns
    dsr = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Using numpy functions
    dsr = gb.agg({'second': np.sum, 'shares': [np.max, np.mean]})
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Alternate way to add to multiset
    gb = dat.groupby('order_id')
    ms = gb[['shares']].agg(['max', 'mean'])
    ms.Sum = gb[['second']].sum()
    self.assertEqual(ms.shape, (2, 3))
    self.assertArrayEqual(ms.Sum.second, [410, 676])
    self.assertArrayEqual(ms.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(ms.Mean.shares, [160.00, 157.14], places=2)
def test_projections(self):
    num_rows_trade = 1_000_000
    num_symbols = 450
    Trade_Dates = ['20180602', '20180603', '20180604', '20180605', '20180606']
    Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
    np.random.seed(1234)
    ds = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_trade),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_trade)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)] for i in range(num_rows_trade)
        ],
        'Time': [
            int(i % (num_rows_trade / len(Trade_Dates))) for i in range(num_rows_trade)
        ],
        'Price': 100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
        'Size': 10 * np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
    })
    num_rows_quote = 1_000_000
    ds2 = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_quote),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_quote)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)] for i in range(num_rows_quote)
        ],
        'Time': [
            int(i % (num_rows_quote / len(Trade_Dates))) for i in range(num_rows_quote)
        ],
        'Bid': 100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        'Ask': 100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
    })
    threshold = Dataset({'Is_Below_Thresdhold': np.random.rand(num_rows_quote) < 0.75})
    trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
    trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

    # Create GroupBy and corresponding Categorical
    trade_gb = trades.groupby(['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
    trade_cat = Categorical([ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

    # Call sum() and count()
    self.assertEqual(trade_gb.sum().shape, (455654, 7))
    self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
    self.assertEqual(trade_gb.count().shape, (455654, 5))
    # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
    self.assertEqual(trade_cat.count().shape, (455654, 5))
    b1 = trade_gb.count().Count.mean()
    b1c = trade_cat.count().Count.mean()
    b2 = trade_gb.count().shape[0]
    self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
    self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

    # Create ds augmented with filtered ID
    trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
    trade_ds_below_threshold = ds * threshold.Is_Below_Thresdhold
    trade_ds_below_thresholdb = Dataset.concat_columns([trade_ds_below_threshold, trade_ds], do_copy=False)

    # Create trade_ds size projection using GroupBy
    trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
    trade_sizes_ds = trade_gb_id['Size'].sum()
    trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
    self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

    # Create trade_ds size projection using Categorical
    trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
    trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

    # Create trade_ds size projection using Pandas groupby
    ptrade_ds_below_thresholdb = dataset_as_pandas_df(trade_ds_below_thresholdb)
    ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
    trade_sizes_pd_ds = ptrade_gb_id.sum()
    trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
    np.int16,
    np.uint16,
    np.int32,
    np.uint32,
    np.int64,
    np.uint64,
    np.float32,
    np.float64,
]
arr_types_string = [np.bytes_, np.str_]
test_data = {'bool': np.array([True, False, True, False, True], dtype=np.bool_)}
for dt in arr_types + arr_types_string:
    test_data[dt.__name__] = np.array(num_list, dtype=dt)
test_data['categorical'] = Categorical([str(i) for i in num_list])
all_headers = list(test_data.keys())
ds = Dataset(test_data)
gb_funcs = ['sum', 'mean', 'first', 'last', 'median', 'min', 'max', 'var']
gb_nan_funcs = ['nansum', 'nanmean', 'nanmedian', 'nanvar']  # 'rolling', 'cumsum', 'nth'


class Groupby_Test(unittest.TestCase):
    def test_math_ops_same_return(self):
        result_dict = {
            'sum': [5, 10],
            'nansum': [5, 10],
            'median': [2.5, 3],
            # TODO: add support for min / max on strings
            'min': [1, 2],
            'max': [4, 5],
        }