def test_single_key_string_count(self):
    correct_counts = FastArray([4, 5, 9, 6, 6])

    # for sorting/count bug fix 8/21/2018
    c_make_unique = Categorical(str_fa)
    result_counts = c_make_unique.count().Count
    match = bool(np.all(result_counts == correct_counts))
    assert match

    c_from_codes = Categorical(sorted_codes, complete_unique_cats, base_index=0)
    result_counts = c_from_codes.count().Count
    match = bool(np.all(result_counts == correct_counts))
    assert match

    c_from_codes_unsorted = Categorical(sorted_codes, unsorted_unique_cats, base_index=0)
    result_counts = c_from_codes_unsorted.count().Count
    match = bool(np.all(result_counts == correct_counts))
    assert match

    # 8/24/2018 SJK - default name for groupby key columns might change, so selected this by index
    # also, in most cases (save IntEnum/dict) categorical groupby no longer returns a categorical
    result_keys = c_from_codes_unsorted.count()[1]
    match = bool(np.all(result_keys == unsorted_unique_cats))
    assert match, f"Result: {result_keys} Expected: {unsorted_unique_cats}"
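# NOTE (editor sketch): a minimal, self-contained illustration of the
# single-key count behavior exercised above. The fixtures (str_fa,
# sorted_codes, complete_unique_cats, unsorted_unique_cats) are defined at
# module level elsewhere in this file; the literals below are hypothetical
# stand-ins, assuming string categories default to sorted order. Reuses the
# module-level imports (Categorical, FastArray, np).
def _sketch_single_key_count():
    c = Categorical(['b', 'a', 'a', 'c', 'b', 'a'])
    result = c.count()  # Dataset: one row per category, plus a Count column
    # categories come back sorted ('a', 'b', 'c'), so counts align with them
    assert result.Count.tolist() == [3, 2, 1]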
def test_multikey_count(self):
    mk_list = [str_fa.copy(), int_fa.copy(), str_fa.copy(), int_fa.copy()]
    c_multi = Categorical(mk_list)
    result_counts = c_multi.count().Count
    correct_counts = FastArray([6, 5, 1, 2, 3, 2, 2, 4, 2, 2, 1])
    all_correct = bool(np.all(result_counts == correct_counts))
    assert all_correct, (
        f"Incorrect result for multikey count for 4 keys. {result_counts} vs. {correct_counts}"
    )
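# NOTE (editor sketch): multikey Categoricals group on the tuple of values
# across all key arrays, producing one result row per unique combination.
# Hypothetical data; reuses the module-level imports.
def _sketch_multikey_count():
    strs = FastArray(['a', 'a', 'b', 'a'])
    ints = FastArray([1, 2, 1, 1])
    c = Categorical([strs, ints])
    result = c.count()
    # unique (str, int) pairs: ('a', 1) x2, ('a', 2) x1, ('b', 1) x1
    assert result.shape[0] == 3
    assert result.Count.sum() == len(strs)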
def test_gb_labels_enum(self):
    # make sure enum groupby keys are displayed as string, not integer code
    c = Categorical([10, 10, 10, 20, 30, 20, 10, 20, 20], {'a': 30, 'b': 20, 'c': 10})
    c_result = c.count()
    c_labels = c_result[c_result.label_get_names()][0]

    ds = Dataset({'catcol': c, 'data': arange(9)})
    ds_result = ds.gbu('catcol').count()
    ds_labels = ds_result[ds_result.label_get_names()][0]

    assert c_labels.dtype.char == ds_labels.dtype.char
    assert bool(np.all(c_labels == ds_labels))
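# NOTE (editor sketch): with a dict (or IntEnum) mapping, the groupby label
# column carries the string names, not the integer codes. Hypothetical data;
# the label-extraction pattern mirrors the test above.
def _sketch_enum_labels():
    c = Categorical([10, 20, 10], {'a': 30, 'b': 20, 'c': 10})
    result = c.count()
    labels = result[result.label_get_names()][0]
    assert labels.dtype.char in ('S', 'U')  # string labels, not an integer dtype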
def test_projections(self):
    num_rows_trade = 1_000_000
    num_symbols = 450
    Trade_Dates = ['20180602', '20180603', '20180604', '20180605', '20180606']
    Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
    np.random.seed(1234)
    ds = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_trade),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_trade)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)] for i in range(num_rows_trade)
        ],
        'Time': [int(i % (num_rows_trade / len(Trade_Dates))) for i in range(num_rows_trade)],
        'Price': 100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
        'Size': 10 * np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
    })
    num_rows_quote = 1_000_000
    ds2 = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_quote),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_quote)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)] for i in range(num_rows_quote)
        ],
        'Time': [int(i % (num_rows_quote / len(Trade_Dates))) for i in range(num_rows_quote)],
        'Bid': 100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        'Ask': 100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
    })
    threshold = Dataset({'Is_Below_Threshold': np.random.rand(num_rows_quote) < 0.75})
    trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
    trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

    # Create GroupBy and corresponding Categorical
    trade_gb = trades.groupby(['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
    trade_cat = Categorical([ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

    # Call sum() and count()
    self.assertEqual(trade_gb.sum().shape, (455654, 7))
    self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
    self.assertEqual(trade_gb.count().shape, (455654, 5))
    # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
    self.assertEqual(trade_cat.count().shape, (455654, 5))
    b1 = trade_gb.count().Count.mean()
    b1c = trade_cat.count().Count.mean()
    b2 = trade_gb.count().shape[0]
    self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
    self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

    # Create ds augmented with filtered ID
    trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
    trade_ds_below_threshold = ds * threshold.Is_Below_Threshold
    trade_ds_below_thresholdb = Dataset.concat_columns([trade_ds_below_threshold, trade_ds], do_copy=False)

    # Create trade_ds size projection using GroupBy
    trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
    trade_sizes_ds = trade_gb_id['Size'].sum()
    trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
    self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

    # Create trade_ds size projection using Categorical
    trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
    trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

    # Create trade_ds size projection using pandas groupby
    ptrade_ds_below_thresholdb = dataset_as_pandas_df(trade_ds_below_thresholdb)
    ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
    trade_sizes_pd_ds = ptrade_gb_id.sum()
    trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
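# NOTE (editor sketch): the projection step above broadcasts one-row-per-group
# aggregates back to one row per trade. A Categorical used in arithmetic yields
# its 1-based group ids (the ikey), so `cat - 1` indexes the aggregate Dataset
# positionally. Hypothetical data; same pattern as test_projections.
def _sketch_group_sum_projection():
    ds = Dataset({'sym': FastArray(['A', 'B', 'A', 'B']),
                  'Size': FastArray([10, 20, 30, 40])})
    cat = Categorical(ds.sym)
    per_group = cat.sum(ds.Size)       # one row per symbol: A -> 40, B -> 60
    per_row = per_group.Size[cat - 1]  # expand group sums back to row level
    assert per_row.tolist() == [40, 60, 40, 60]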