def test_categorical_ctor(value_strategy, category_mode, data):
    """Rebuild a drawn Categorical through several constructor paths.

    A Categorical is drawn from ``CategoricalStrategy`` and then re-created
    (a) from its expanded values and categories, (b) from another Categorical,
    and (c) through the ``_from_categorical`` fast path. Every instance must
    satisfy ``_check_categorical``.
    """
    is_ordered: bool = data.draw(booleans())
    strategy = CategoricalStrategy(value_strategy, category_mode=category_mode, ordered=is_ordered)
    original: Categorical = data.draw(strategy)
    assert _check_categorical(original)

    # Rebuild from the original's expanded values and its categories.
    expanded, cats = original.expand_array, original._categories
    if category_mode == CategoryMode.Dictionary:
        # Dictionary Categoricals are rebuilt from the original's category mapping instead.
        cats = original.category_mapping
    from_parts = Categorical(expanded, categories=cats, ordered=is_ordered)
    assert _check_categorical(from_parts)

    # Rebuild from an existing Categorical.
    from_cat = Categorical(from_parts)
    assert _check_categorical(from_cat)

    # _from_categorical is a fast path that skips internal routine checks, sorting,
    # and making values unique, but should be identical to the original Categorical.
    wrapped = original._categories_wrap
    fast_path = Categorical(
        expanded,
        categories=cats,
        _from_categorical=wrapped,
        ordered=is_ordered,
    )
    assert _check_categorical(fast_path)
def test_groupby_ops_multikey_dict(self):
    """Run the groupby-op comparisons for a multikey Categorical built from a dict.

    Also checks that a multikey Categorical built from a list of arrays with
    conflicting names keeps three key columns and gets the expected column names.
    """
    mk_dict = {'string1': str_fa, 'string2': str_fa}
    mk_gb = Dataset({
        'string1': str_fa.copy(),
        'string2': str_fa.copy(),
        'ints': int_fa,
        'floats': flt_fa,
        'tens': tens,
    }).gbu(['string1', 'string2'])
    c = Categorical(mk_dict)
    self.funnel_all_tests(c, mk_gb, "multikey dictionary", sorted=False)
    # setitem hits comparison functions - need to rewrite these tests after comparison behavior change
    # self.mk_set_item(mk_dict, constructor_name="multikey dictionary")

    # conflicting names
    x = str_fa.copy()
    y = str_fa.copy()
    z = str_fa.copy()
    x.set_name('strings')
    y.set_name('strings')
    z.set_name('strings1')
    c = Categorical([x, y, z])
    assert c._categories_wrap.ncols == 3, \
        f"incorrect number of columns for multikey from list. {c._categories_wrap.ncols} vs. 3"

    # 04/25/2019 all default column names now happen in grouping object
    # The expected names are built once so the failure message always reports
    # the same list that is actually asserted.
    expected_names = ['strings', GROUPBY_KEY_PREFIX + '_c1', 'strings1']
    actual_names = list(c.categories().keys())
    assert actual_names == expected_names, \
        f"column names did not match for multikey from list. {actual_names} vs. {expected_names}"
def test_pre_filter(self):
    """Check a Categorical constructed with a filter.

    The constructed Categorical must not retain the filter in ``_filter``,
    and the ``tens`` column of the grouped sum over ``ds_nums`` must total 150.
    """
    c = Categorical(str_fa, filter=even_filter)
    # Identity check: `== None` could be dispatched to an element-wise comparison.
    assert c._filter is None
    result = c.sum(ds_nums)
    one_fifty = sum(result.tens)
    assert one_fifty == 150
def simple_string_set_item(self, *args, **kwargs):
    """Helper: set every 'b' entry of a freshly built Categorical to 'a'.

    ``constructor_name`` (required keyword) is used only in failure messages;
    all other arguments are forwarded to ``Categorical``.

    This test needs to be updated with different data that reflects the new
    comparison behavior. SJK: 9/24/2018
    """
    source = kwargs.pop('constructor_name')
    if 'categories' in kwargs:
        # Work on a copy so the caller's category array is not the one handed to Categorical.
        kwargs['categories'] = kwargs['categories'].copy()
    c = Categorical(*args, **kwargs)

    # (index, replacement) pairs. Variants with an integer replacement (1) and
    # the `c == 2` boolean masks are disabled.
    set_items = [
        # index by string
        (b'b', b'a'),
        (b'b', 'a'),
        ('b', b'a'),
        ('b', 'a'),
        # index by bool array
        # boolean arrays can no longer be generated with these comparisons SJK 9/24/2018
        (c == b'b', b'a'),
        (c == b'b', 'a'),
        (c == 'b', b'a'),
        (c == 'b', 'a'),
        # integer index
        ([5, 9, 16, 18, 21], b'a'),
        ([5, 9, 16, 18, 21], 'a'),
        ([5, 9, 16, 18, 21], b'a'),
        ([5, 9, 16, 18, 21], 'a'),
        ([5, 9, 16, 18, 21], b'a'),
        ([5, 9, 16, 18, 21], 'a'),
    ]

    # this test needs to get reworked
    # no longer produces the correct result for all types of categoricals because of == comparison behavior
    for index, new_value in set_items:
        c = Categorical(*args, **kwargs)
        goal = c == ['a', 'b']
        c[index] = new_value
        result = c == new_value
        all_set = np.sum(goal == result)
        assert all_set == 30, \
            f"did not set c[{index}] to {new_value} for categorical from {source}"
        none_left = np.sum(c == 'b')
        assert none_left == 0, \
            f"did not set c[{index}] to {new_value} for categorical from {source}"
def test_single_key_string_count(self):
    """Compare ``count()`` results for three ways of building a single-key Categorical."""
    expected = FastArray([4, 5, 9, 6, 6])

    # for sorting/count bug fix 8/21/2018
    from_values = Categorical(str_fa)
    counts = from_values.count().Count
    assert bool(np.all(counts == expected))

    from_codes = Categorical(sorted_codes, complete_unique_cats, base_index=0)
    counts = from_codes.count().Count
    assert bool(np.all(counts == expected))

    from_codes_unsorted = Categorical(sorted_codes, unsorted_unique_cats, base_index=0)
    counts = from_codes_unsorted.count().Count
    assert bool(np.all(counts == expected))

    # 8/24/2018 SJK - default name for groupby key columns might change, so selected this by index
    # also, in most cases (save intenum/dict) categorical groupby no longer returns a categorical
    result_keys = from_codes_unsorted.count()[1]
    keys_match = bool(np.all(result_keys == unsorted_unique_cats))
    assert keys_match, f"Result: {result_keys} Expected: {unsorted_unique_cats}"
def test_multikey_count(self):
    """Check ``count()`` for a four-key Categorical built from copies of str_fa and int_fa."""
    keys = [str_fa.copy(), int_fa.copy(), str_fa.copy(), int_fa.copy()]
    result_counts = Categorical(keys).count().Count
    correct_counts = FastArray([6, 5, 1, 2, 3, 2, 2, 4, 2, 2, 1])
    assert bool(np.all(result_counts == correct_counts)), \
        f"Incorrect result for multikey count for 4 keys. {result_counts} vs. {correct_counts}"
def mk_set_item(self, *args, **kwargs):
    """Helper: set every ('b', 'b') entry of a freshly built multikey Categorical.

    ``constructor_name`` (required keyword) is used only in failure messages;
    all other arguments are forwarded to ``Categorical``.
    """
    source = kwargs.pop('constructor_name')
    if 'categories' in kwargs:
        print('copying categories')
        kwargs['categories'] = kwargs['categories'].copy()
    # Constructed up front as in the boolean-mask variants below (currently disabled).
    c = Categorical(*args, **kwargs)

    # (index, replacement) pairs. The boolean-mask indexed variants
    # (c == (b'b', b'b'), c == ('b', 'b'), c == 4) are disabled.
    set_items = [
        # index by string
        ((b'b', b'b'), (b'a', b'a')),
        ((b'b', b'b'), ('a', 'a')),
        ((b'b', b'b'), 5),
        (('b', 'b'), (b'a', b'a')),
        (('b', 'b'), ('a', 'a')),
        (('b', 'b'), 5),
        # integer index
        ([5, 9, 16, 18, 21], (b'a', b'a')),
        ([5, 9, 16, 18, 21], ('a', 'a')),
        ([5, 9, 16, 18, 21], 5),
        ([5, 9, 16, 18, 21], (b'a', b'a')),
        ([5, 9, 16, 18, 21], ('a', 'a')),
        ([5, 9, 16, 18, 21], 5),
        ([5, 9, 16, 18, 21], (b'a', b'a')),
        ([5, 9, 16, 18, 21], ('a', 'a')),
        ([5, 9, 16, 18, 21], 5),
    ]

    for index, new_value in set_items:
        c = Categorical(*args, **kwargs)
        goal = mask_or([c == ('a', 'a'), c == ('b', 'b')])
        c[index] = new_value
        result = c == new_value
        all_set = np.sum(goal == result)
        assert all_set == 30, \
            f"did not set c[{index}] to {new_value} for categorical from {source}"
        none_left = np.sum(c == ('b', 'b'))
        assert none_left == 0, \
            f"did not set c[{index}] to {new_value} for categorical from {source}"
def test_gb_categoricals(self):
    """Group a Dataset by an enum-backed and a list-backed Categorical, alone and combined."""
    enum_cat = Categorical([1, 44, 44, 133, 75, 75, 75, 1], LikertDecision, sort_gb=True)
    list_cat = Categorical(['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g'])
    base_cols = {'nums': np.arange(8)}

    # from enum only
    enum_cols = {**base_cols, 'cat_from_enum': enum_cat}
    enum_result = Dataset(enum_cols).gb('cat_from_enum').sum()
    expected = FastArray([3, 15, 3, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(expected, enum_result.nums),
        msg=f"Incorrect sum when grouping by enum categorical.\nExpected {expected}\nActual {enum_result.nums}",
    )

    # from list only
    list_cols = {**base_cols, 'cat_from_list': list_cat}
    list_result = Dataset(list_cols).gb('cat_from_list').sum()
    expected = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(expected, list_result.nums),
        msg="Incorrect sum when grouping by list categorical.",
    )

    both = Dataset({**enum_cols, 'cat_from_list': list_cat})

    # by enum, list
    sums = both.gb(['cat_from_enum', 'cat_from_list']).sum().nums
    expected = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
    self.assertTrue(
        self.array_equal(expected, sums),
        msg="Incorrect sum when grouping by enum, list categoricals.",
    )

    # by list, enum
    sums = both.gb(['cat_from_list', 'cat_from_enum']).sum().nums
    expected = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(expected, sums),
        msg="Incorrect sum when grouping by list, enum categoricals.",
    )
def test_cumcount_vs_gb(self):
    """Check that Categorical.cumcount matches the Dataset groupby cumcount.

    Also checks the filtered form: rows selected by the filter must be
    non-NaN and rows excluded by it must be NaN.
    """
    arr = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
    ds = Dataset({'keycol': arr, 'col1': arange(50), 'col2': arange(50)})
    gb_result = ds.gb('keycol').cumcount()
    c = Categorical(ds.keycol)
    c_result = c.cumcount()
    rdiff = gb_result - c_result
    # Every difference must be zero. Summing them would let positive and
    # negative mismatches cancel out and hide a real disagreement.
    assert all(diff == 0 for diff in rdiff)

    f = logical(arange(50) % 2)
    c_result = c.cumcount(filter=f)
    assert bool(np.all(isnotnan(c_result[f])))
    assert bool(np.all(isnan(c_result[~f])))
def test_empty_category(self):
    """Check the value each reduction produces for a category that has no rows.

    The values are drawn only from 'a', 'b', 'd', 'e' while the categories
    also list 'c', so the bin read at index 2 of each result is empty.
    """
    # 5/16/2019 invalid category must be in uniques
    # c = Categorical(str_fa_with_invalid, complete_unique_cats, invalid='invalid')

    # can test empty bin like this, the third result will be empty
    c = Categorical(np.random.choice(['a', 'b', 'd', 'e'], 30), ['a', 'b', 'c', 'd', 'e'])
    empty_result = [
        ('sum', 0.0),
        ('mean', np.nan),
        ('min', np.nan),
        ('max', np.nan),
        ('var', np.nan),
        ('std', np.nan),
        ('nansum', 0.0),
        ('nanmean', np.nan),
        ('nanmin', np.nan),
        ('nanmax', np.nan),
        ('nanvar', np.nan),
        ('nanstd', np.nan),
    ]
    for func_name, expected in empty_result:
        func = getattr(c, func_name)
        result = func(ds_nums).floats[2]
        if np.isnan(expected):
            # NaN never compares equal to itself, so self-inequality detects it.
            assert result != result, \
                f"Did not produce correct result for empty category after {func_name} operation."
        else:
            assert result == expected, \
                f"Did not produce correct result for empty category after {func_name} operation."
def test_gb_labels_enum(self):
    """Key labels from Categorical.count() and Dataset.gbu().count() must agree."""
    # make sure enum groupby keys are displayed as string, not integer code
    codes = [10, 10, 10, 20, 30, 20, 10, 20, 20]
    mapping = {'a': 30, 'b': 20, 'c': 10}
    c = Categorical(codes, mapping)

    cat_counts = c.count()
    cat_labels = cat_counts[cat_counts.label_get_names()][0]

    ds = Dataset({'catcol': c, 'data': arange(9)})
    ds_counts = ds.gbu('catcol').count()
    ds_labels = ds_counts[ds_counts.label_get_names()][0]

    assert cat_labels.dtype.char == ds_labels.dtype.char
    assert bool(np.all(cat_labels == ds_labels))
def test_as_matrix_metadata(self):
    """Check the matrix and per-column metadata returned by dataset_as_matrix."""
    error_tol = 0.00001
    ds = Dataset({
        'A': ['EXCH1', 'EXCH2', 'EXCH1', 'EXCH3', 'EXCH3'],
        'B': [-1.6, 2.7, 4.6, 5.7, 8.9],
        'C': Categorical([0, 0, 1, 0, 2], ['CPTYA', 'CPTYB', 'CPTYC']),
    })
    X, X_data = dataset_as_matrix(ds)

    self.assertIsInstance(X, numpy.ndarray)
    self.assertEqual(X.shape[0], ds.shape[0])
    self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later

    for col_name in ('A', 'B', 'C'):
        self.assertEqual(X_data[col_name]['dtype'], getattr(ds, col_name).dtype)
    for col_name, expect_categorical in (('A', False), ('B', False), ('C', True)):
        self.assertEqual(X_data[col_name]['is_categorical'], expect_categorical)

    first_col = X[:, 0]
    self.assertTrue(
        (numpy.abs(first_col - numpy.array([0., 1., 0., 2., 2.])) < error_tol).all(),
        msg=f"got {first_col}",
    )
    third_col = X[:, 2]
    self.assertTrue(
        (numpy.abs(third_col - numpy.array([0, 0, 1, 0, 2])) < error_tol).all(),
        msg=f"got {third_col}",
    )

    # Expanding A's category values with the expected codes must reproduce column A.
    expanded_a = X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])]
    self.assertTrue(
        (expanded_a == ds.A).all(),
        msg=f"X_data {expanded_a}\nds.A {ds.A}",
    )
def test_groupby_ops_mapping(self):
    """Run the groupby-op comparisons for a Categorical built from codes plus a name-to-int mapping."""
    # Convert the enum members to a plain {name: int} dictionary.
    mapping = {name: int(member) for name, member in dict(str_enum.__members__).items()}
    cat = Categorical(sorted_codes, mapping)
    self.funnel_all_tests(cat, gbu, "index + mapping dictionary", sorted=False)
def test_groupby_ops_user_codes_base_0(self):
    """Run the groupby-op comparisons for base-0 Categoricals built two ways."""
    label = "index + categories + base_index 0"

    # Explicit base_index=0 on the Categorical constructor.
    explicit_base = Categorical(sorted_codes.copy(), categories=complete_unique_cats, base_index=0)
    self.funnel_all_tests(explicit_base, gb, label)

    # Same codes and categories through CatZero.
    via_catzero = CatZero(sorted_codes.copy(), categories=complete_unique_cats)
    self.funnel_all_tests(via_catzero, gb, label)
def test_categorical_dict_key_completion(self):
    """Completing ``cat['`` must offer every key of decision_dict."""
    shell = get_ipython()
    complete = shell.Completer.complete
    shell.user_ns["cat"] = Categorical(CODES, decision_dict)
    completions = complete(line_buffer="cat['")[1]
    for key in decision_dict.keys():
        self.assertIn(key, completions)
def test_total_sizes_with_categorical(self):
    """Check Struct.total_sizes when the same Categorical is stored under two names."""
    struct = Struct({'c': Categorical(['aa', 'bb', 'cc', 'dd'])})
    struct.d = struct.c  # second name for the same Categorical
    physical, logical = struct.total_sizes
    self.assertEqual(physical, logical // 2)
    min_physical = np.asarray(struct.c).nbytes + struct.c.category_array.nbytes
    self.assertGreaterEqual(physical, min_physical)
def test_specify_gb_data(self):
    """Call Categorical.sum with each supported way of passing the data.

    No results are asserted; the test only checks that each call completes.
    """
    str_col = ['a', 'a', 'b', 'c', 'a']
    num_col = [10, 10, 20, 30, 10]
    col1 = np.arange(5)
    col2 = np.arange(5)
    small_ds = Dataset({
        'str_col': str_col,
        'num_col': num_col,
        'col1': col1,
        'col2': col2,
    })
    ds_to_operate_on = small_ds[['col1', 'col2']]
    c = Categorical(str_col)

    c.sum(ds_to_operate_on)            # dataset
    c.sum([col1, col2])                # list
    c.sum((col1, col2))                # tuple
    c.sum({'a': col1, 'b': col2})      # dict
    c.sum(col1, col2)                  # multiple positional arrays
def test_groupby_ops_string_list_cats(self):
    """Run the groupby-op and set-item checks for a Categorical built from strings plus categories."""
    label = "string list + categories"
    cat = Categorical(str_fa, complete_unique_cats)
    self.funnel_all_tests(cat, gb, label)
    self.simple_string_set_item(str_fa, categories=complete_unique_cats, constructor_name=label)
def test_as_categorical(self):
    """Sums from a groupby, an equivalent Categorical, and gbu.as_categorical() must agree."""
    ds = Dataset({
        'keycol1': np.random.choice(['a', 'b', 'c'], 30),
        'keycol2': np.random.choice(['a', 'b', 'c'], 30),
        'data': np.random.rand(30),
    })
    grouped = ds.gbu('keycol1')
    direct_cat = Categorical(ds.keycol1, ordered=False, sort_gb=False)
    converted_cat = grouped.as_categorical()

    grouped_sum = grouped.sum()
    direct_sum = direct_cat.sum(ds.data)
    converted_sum = converted_cat.sum(ds.data)

    for col_name, expected_col in grouped_sum.items():
        assert bool(np.all(direct_sum[col_name] == expected_col))
        assert bool(np.all(converted_sum[col_name] == expected_col))
def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
    """Test round-tripping from rt.Categorical to pyarrow.Array/pyarrow.Table and back."""
    shape_before = rt_cat.shape

    if have_nulls:
        # riptable's filtering/masking uses a valid mask (where False means null/NA).
        valid_mask = np.arange(len(rt_cat)) % 3 != 1
        rt_cat = rt_cat.filter(valid_mask)
        assert rt_cat.shape == shape_before

        # isfiltered() doesn't work as expected for Dictionary/IntEnum-mode Categorical as of riptable 1.1.0.
        if rt_cat.category_mode in (rt.rt_enum.CategoryMode.Dictionary, rt.rt_enum.CategoryMode.IntEnum):
            filtered_flags = rt.isnan(rt_cat._fa)
        else:
            filtered_flags = rt_cat.isfiltered()
        filtered_element_count = filtered_flags.sum()
        assert filtered_element_count == (len(rt_cat) - valid_mask.sum())

    pa_arr = rt_cat.to_arrow()

    # Verify the pyarrow array has the correct length, number of categories, etc.
    assert len(rt_cat) == len(pa_arr)
    assert pat.is_dictionary(pa_arr.type)
    assert len(pa_arr.dictionary) >= len(next(iter(rt_cat.category_dict.values()))), \
        "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."

    if have_nulls:
        assert valid_mask.sum() > 0
        assert (len(rt_cat) - valid_mask.sum()) == pa_arr.null_count

    # TEMP: Certain cases are marked as XFAIL here due to issues in Categorical.
    #   * Cannot create a pre-filtered (i.e. filtered at construction time) Dictionary- or IntEnum-mode Categorical.
    #   * Filtering a Dictionary- or IntEnum-mode Categorical causes unused categories to be dropped,
    #     which is not the same behavior as for other Categorical modes.
    #   * MultiKey Categoricals can't be created with an explicit list of category arrays + an index array,
    #     like what is supported for other Categorical modes.
    is_multikey = rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey
    if is_multikey or (have_nulls and rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary):
        pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")

    roundtripped = rt.Categorical.from_arrow(pa_arr, zero_copy_only=False, writable=output_writable)

    # relaxed_cat_check <==> rt_cat.ordered, because if the categories are ordered, we expect them to be
    # in the same position after being roundtripped, so they should be mapped to the same integer before/after.
    # multi-key cats always seem to be ordered, even if ordered=False is specified when creating them.
    # TODO: Remove CategoryMode.Dictionary from the relaxed_cat_check here -- it's failing because our encoding in
    #       pyarrow doesn't currently preserve unused entries from the name <-> code mapping. Once that's fixed
    #       we should be able to use the stronger equality check.
    relaxed = (
        rt_cat.ordered
        or rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey
        or rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary
    )
    assert_array_or_cat_equal(rt_cat, roundtripped, relaxed_cat_check=relaxed)
def test_categorical_numeric_array_key_completion(self):
    """Completing ``cat['`` must offer the string form of every numeric value."""
    shell = get_ipython()
    complete = shell.Completer.complete
    numbers = [1, 44, 44, 133, 75]  # type: List[int]
    shell.user_ns["cat"] = Categorical(FastArray(numbers))
    completions = complete(line_buffer="cat['")[1]
    for text in map(str, numbers):
        self.assertIn(text, completions)
def do_draw(self, data):
    """Draw values from ``self.value_strategy`` and build a Categorical for ``self.category_mode``.

    Categories are supplied if either:
      - ``self.with_categories`` is set (StringArray mode), or
      - the category mode designates a dictionary Categorical.

    Raises
    ------
    ValueError
        If ``self.category_mode`` is neither StringArray nor Dictionary.
    """
    mode = self.category_mode

    if mode == CategoryMode.StringArray:
        string_values = [str(item) for item in data.draw(self.value_strategy)]
        # Without explicit categories the constructor receives categories=None.
        string_cats = [str(item) for item in set(string_values)] if self.with_categories else None
        return Categorical(string_values, categories=string_cats, ordered=self.ordered)

    if mode == CategoryMode.Dictionary:
        drawn_values = data.draw(self.value_strategy)
        mapping = self._construct_dict(data, drawn_values)
        return Categorical(drawn_values, categories=mapping, ordered=self.ordered)

    raise ValueError(
        f"{self._CN}.do_draw: unhandled category mode {self.category_mode}\n\t{self}"
    )
def test_groupby_ops_user_codes_base_1(self):
    """Run the groupby-op and set-item checks for a base-1 Categorical built from codes plus categories."""
    label = "index + categories + base_index 1"
    # The shared codes are shifted by one for the base-1 Categorical.
    cat = Categorical(sorted_codes + 1, complete_unique_cats, base_index=1)
    self.funnel_all_tests(cat, gb, label)
    self.simple_string_set_item(
        sorted_codes.copy(),
        categories=complete_unique_cats,
        base_index=1,
        constructor_name=label,
    )
def test_groupby_ops_multikey_list(self):
    """Run the groupby-op comparisons for a multikey Categorical built from a list of arrays."""
    key_arrays = [str_fa.copy(), str_fa.copy()]
    reference_columns = {
        'string1': str_fa.copy(),
        'string2': str_fa.copy(),
        'ints': int_fa,
        'floats': flt_fa,
        'tens': tens,
    }
    reference_gb = Dataset(reference_columns).gbu(['string1', 'string2'])
    cat = Categorical(key_arrays)
    self.funnel_all_tests(cat, reference_gb, "multikey list", sorted=False)
def test_categorical_string_array_key_completion(self):
    """Completing ``cat['`` must offer every string value of the Categorical."""
    shell = get_ipython()
    complete = shell.Completer.complete
    letters = ['a', 'b', 'c', 'c', 'd', 'a', 'b']  # type: List[str]
    shell.user_ns["cat"] = Categorical(FastArray(letters), ordered=True, base_index=1, filter=None)
    completions = complete(line_buffer="cat['")[1]
    for letter in letters:
        self.assertIn(letter, completions)
def test_categorical_multi_key_completion(self):
    """Completing ``cat['`` must offer every entry of both input lists."""
    shell = get_ipython()
    complete = shell.Completer.complete
    # note - 'e' is not in first list
    first = ['b', 'a', 'a', 'c', 'a', 'b']  # type: List[str]
    second = ['b', 'a', 'c', 'e']  # type: List[str]
    shell.user_ns["cat"] = Categorical(first, second, sort_gb=True)
    completions = complete(line_buffer="cat['")[1]
    for entries in (first, second):
        for entry in entries:
            self.assertIn(entry, completions)
def test_projections(self):
    """Compare multikey GroupBy, Categorical, and pandas groupby on a synthetic trade Dataset.

    The data is generated from a fixed NumPy seed and the expected shapes are
    hard-coded, so the order of the ``np.random`` calls below must not change.
    """
    num_rows_trade = 1_000_000
    num_symbols = 450
    Trade_Dates = [
        '20180602', '20180603', '20180604', '20180605', '20180606'
    ]
    Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
    # Fixed seed: the group counts asserted below depend on it.
    np.random.seed(1234)
    ds = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_trade),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_trade)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
            for i in range(num_rows_trade)
        ],
        'Time': [
            int(i % (num_rows_trade / len(Trade_Dates)))
            for i in range(num_rows_trade)
        ],
        'Price': 100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
        'Size': 10 * np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
    })
    num_rows_quote = 1_000_000
    # ds2 is not referenced again in this test, but building it consumes random
    # draws that precede the ones used for `threshold` below.
    ds2 = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_quote),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_quote)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
            for i in range(num_rows_quote)
        ],
        'Time': [
            int(i % (num_rows_quote / len(Trade_Dates)))
            for i in range(num_rows_quote)
        ],
        'Bid': 100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        'Ask': 100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
    })
    # Boolean column that is True for roughly 75% of the rows.
    threshold = Dataset(
        {'Is_Below_Thresdhold': np.random.rand(num_rows_quote) < 0.75})
    # Bucket the Time column into integer bins of width 2500.
    trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
    trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

    # Create GroupBy and corresponding Categorical
    trade_gb = trades.groupby(
        ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
    trade_cat = Categorical(
        [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

    # Call sum() and count()
    self.assertEqual(trade_gb.sum().shape, (455654, 7))
    self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
    self.assertEqual(trade_gb.count().shape, (455654, 5))
    # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
    self.assertEqual(trade_cat.count().shape, (455654, 5))
    # Mean group size times number of groups must recover the total row count.
    b1 = trade_gb.count().Count.mean()
    b1c = trade_cat.count().Count.mean()
    b2 = trade_gb.count().shape[0]
    self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
    self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

    # Create ds augmented with filtered ID
    trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
    trade_ds_below_threshold = ds * threshold.Is_Below_Thresdhold
    trade_ds_below_thresholdb = Dataset.concat_columns(
        [trade_ds_below_threshold, trade_ds], do_copy=False)

    # Create trade_ds size projection using GroupBy
    trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
    trade_sizes_ds = trade_gb_id['Size'].sum()
    # IDs are shifted down by one before indexing the per-group sums.
    trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
    self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

    # Create trade_ds size projection using Categorical
    trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
    trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

    # Create trade_ds size projection using Pandas groupby
    ptrade_ds_below_thresholdb = dataset_as_pandas_df(
        trade_ds_below_thresholdb)
    ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
    trade_sizes_pd_ds = ptrade_gb_id.sum()
    trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64, np.float32, np.float64, ] arr_types_string = [np.bytes_, np.str_] test_data = {'bool': np.array([True, False, True, False, True], dtype=np.bool)} for dt in arr_types + arr_types_string: test_data[dt.__name__] = np.array(num_list, dtype=dt) test_data['categorical'] = Categorical([str(i) for i in num_list]) all_headers = list(test_data.keys()) ds = Dataset(test_data) gb_funcs = ['sum', 'mean', 'first', 'last', 'median', 'min', 'max', 'var'] gb_nan_funcs = ['nansum', 'nanmean', 'nanmedian', 'nanvar'] #'rolling', 'cumsum', 'nth' class Groupby_Test(unittest.TestCase): def test_math_ops_same_return(self): result_dict = { 'sum': [5, 10], 'nansum': [5, 10], 'median': [2.5, 3], # TODO: add support for min / max on strings 'min': [1, 2],
def categorical_stringarray(
    draw,
    max_length: int,
    max_categories: int,
    *,
    endianness: str = '=',
    min_str_len: int = 1,
    max_str_len: int = 16,
    unicode: Optional[bool] = None,
    ordered: Optional[bool] = None,
) -> Categorical:
    """
    Strategy for creating StringArray-mode Categoricals.

    The Categorical is built either from an array of unique labels plus a fancy index,
    or from the expanded (non-unique) array of labels; which one is decided by a drawn boolean.

    Parameters
    ----------
    draw
        Draw function used to pull values from other strategies.
    max_length : int
        Maximum side length of the fancy index (and therefore of the expanded values).
    max_categories : int
        Maximum side length of the array of unique category labels.
    endianness : str
        Passed to the string-dtype strategies.
    min_str_len : int
        Minimum label length; passed to the string-dtype and label strategies.
    max_str_len : int
        Maximum label length; passed to the string-dtype and label strategies.
    unicode : bool, optional
        Whether labels are unicode (True) or byte (False) strings. When None, a boolean is drawn.
        The resulting value is also passed to the Categorical constructor.
    ordered : bool, optional
        Value for the Categorical's ``ordered`` argument. When None, a boolean is drawn.

    Returns
    -------
    Categorical

    Examples
    --------
    NOTE(review): assumes this function is wrapped as a composite strategy (the decorator is
    not visible in this excerpt), so callers do not pass ``draw``.

    >>> categorical_stringarray(10_000, 1_000, max_str_len=20).example()  # doctest: +SKIP

    Notes
    -----
    TODO: Make sure to include the case where we have category values (in the underlying integer array)
          past the end of the categories array. (Or is that only for a Dictionary mode categorical?)
          To clarify -- this is the behavior where, when we print the Categorical, we get entries like <!456>.
    TODO: Also exercise (in one way or another) the following arguments to the Categorical constructor:
        * base_index
            Add an optional boolean parameter. When None, draw a boolean to fill it in.
            When the bool is false, call rt.Cat() with base_index=0.
            When True, call rt.Cat() with base_index=1.
        * dtype
            Call the ctor with dtype=None or a signed integer dtype that's either the min size given the
            number of categories or any larger signed integer dtype.
            E.g. if len(categories) == 1000, draw from { None, np.int16, np.int32, np.int64 }
        * filter
            Add an optional boolean param to the strategy which defaults to None, in which case we'll fill it
            by drawing a boolean. When the bool is false we call rt.Cat() with filter=None.
            When True, we create a boolean array the same length as our values or fancy index and pass that
            as the filter.
    TODO: Support slicing/strides on the values/categories arrays passed to the Categorical constructor.
    TODO: When creating the fancy index array and we've drawn 'explicit_categories=True', allow the fancy index
          to be created with any applicable integer type (signed or unsigned) whose range is large enough to
          index into the categories array. (Or, should we just allow _any_ integer dtype, even if too small?
          We wouldn't be able to index categories past the range of the dtype, but maybe that's an interesting
          thing to test? Especially around cases like having auto_add=True.)
    """
    # Draw a boolean indicating how the data will be passed to the Categorical constructor later.
    # This is done first since it's one of the most likely things to affect the behavior of the Categorical,
    # and shrinking (in some cases) works better when such values are drawn earlier in strategy.
    explicit_categories: bool = draw(st.booleans())
    if explicit_categories:
        event('Categorical created from unique category array and fancy index.')
    else:
        event('Categorical created from non-unique array of strings.')

    # Draw the string dtype based on whether we want a byte (ascii) string or Unicode.
    is_unicode: bool = draw(st.booleans()) if unicode is None else unicode
    if is_unicode:
        labels_dtype = draw(unicode_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))
    else:
        labels_dtype = draw(byte_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))

    # Create an array of unique category labels.
    cats_shapes = array_shapes(max_dims=1, max_side=max_categories)
    category_label_strat = category_labels(min_str_len, max_str_len, unicode=is_unicode)
    unique_labels = draw(arrays(dtype=labels_dtype, shape=cats_shapes, elements=category_label_strat, unique=True))

    # Use integer_array_indices to create a fancy index into the array of unique category labels.
    # Apply it to expand the array of unique labels into an array where those labels may occur zero or more times.
    fancy_index_shapes = array_shapes(max_dims=1, max_side=max_length)
    fancy_index = draw(integer_array_indices(shape=unique_labels.shape, result_shape=fancy_index_shapes))

    # If the 'ordered' flag is not set, draw a boolean for it now so we have a concrete value
    # to use when creating the categorical.
    is_ordered = draw(st.booleans()) if ordered is None else ordered

    # If the 'explicit_categories' flag is set, create the Categorical by passing in the
    # unique values and fancy index separately.
    # Otherwise, apply the fancy index to the array of unique category values to produce an
    # array where each category appears zero or more times; then create the Categorical from that.
    if explicit_categories:
        return Categorical(fancy_index, categories=unique_labels, ordered=is_ordered, unicode=is_unicode)
    else:
        values = unique_labels[fancy_index]
        return Categorical(values, ordered=is_ordered, unicode=is_unicode)
def categorical_dictmode(
    draw,
    max_length: int,
    max_categories: int,
    *,
    endianness: str = '=',
    min_str_len: int = 1,
    max_str_len: int = 16,
    unicode: Optional[bool] = None,
    ordered: Optional[bool] = None,
) -> Categorical:
    """
    Strategy for creating Dictionary-mode Categoricals.

    This strategy currently only covers creating `Categorical` instances with string-typed category labels.

    Parameters
    ----------
    draw
        Draw function used to pull values from other strategies.
    max_length : int
        Maximum side length of the fancy index (and therefore of the category-value array).
    max_categories : int
        Maximum side length of the arrays of unique category values and labels; also
        determines the integer dtype of the category values.
    endianness : str
        Passed to the string-dtype strategies.
    min_str_len : int
        Minimum label length; passed to the string-dtype and label strategies.
    max_str_len : int
        Maximum label length; passed to the string-dtype and label strategies.
    unicode : bool, optional
        Whether labels are unicode (True) or byte (False) strings. When None, a boolean is drawn
        once and used for the logged event, the label dtype, and the Categorical constructor.
    ordered : bool, optional
        Value for the Categorical's ``ordered`` argument. When None, a boolean is drawn.

    Returns
    -------
    Categorical

    Examples
    --------
    >>> categorical_dictmode(10_000, 1_000, max_str_len=20).example()
    0, 0, 0, 0, 0

    Notes
    -----
    TODO: Make sure to include the case where we have category values (in the underlying integer array)
          past the end of the categories array. (Or is that only for a Dictionary mode categorical?)
          To clarify -- this is the behavior where, when we print the Categorical, we get entries like <!456>.
    TODO: Also exercise (in one way or another) the following arguments to the Categorical constructor:
        * base_index
            Add an optional boolean parameter. When None, draw a boolean to fill it in.
            When the bool is false, call rt.Cat() with base_index=0.
            When True, call rt.Cat() with base_index=1.
        * dtype
            Call the ctor with dtype=None or a signed integer dtype that's either the min size given the
            number of categories or any larger signed integer dtype.
            E.g. if len(categories) == 1000, draw from { None, np.int16, np.int32, np.int64 }
        * filter
            Add an optional boolean param to the strategy which defaults to None, in which case we'll fill it
            by drawing a boolean. When the bool is false we call rt.Cat() with filter=None.
            When True, we create a boolean array the same length as our values or fancy index and pass that
            as the filter.
    TODO: Support slicing/strides on the values/categories arrays passed to the Categorical constructor.
    TODO: Does a Dictionary-mode Categorical allow any other types (e.g. rt.Date) to be used for the category labels?
          If so, these should also be covered by this strategy (though changes will be needed to allow a variety of
          types to be used for category labels).
    TODO: Any possible issues (that we might want to exercise in this strategy) between the string used when
          displaying the invalid category (e.g. 'Inv') and category labels? What happens if we have a category label
          using the same string?
    """
    # Draw a boolean indicating whether we'll use a signed or unsigned integer dtype.
    use_signed_integer_dtype: bool = draw(st.booleans())

    # If using a signed integer dtype, draw another boolean indicating whether we'll
    # generate negative category values.
    allow_negative_category_values: bool = draw(st.booleans()) if use_signed_integer_dtype else False

    if use_signed_integer_dtype:
        if allow_negative_category_values:
            event('Categorical may have a mix of negative, zero, and positive category values.')
        else:
            event('Categorical has only non-negative category values.')

    # If the 'unicode' flag is not set, draw a boolean to fill it in.
    # This is the only place the flag is resolved, so the event logged here always
    # matches the label dtype and the label strategy used further down.
    is_unicode: bool = draw(st.booleans()) if unicode is None else unicode
    event(f'Category labels are {"unicode" if is_unicode else "ascii"} strings.')

    # If the 'ordered' flag is not set, draw a boolean for it now so we have a concrete value
    # to use when creating the categorical.
    is_ordered = draw(st.booleans()) if ordered is None else ordered
    event(f'ordered = {is_ordered}')

    # Draw the dtype for the category values.
    # TODO: Draw a signed or unsigned integer dtype here which is at least as large as needed, but perhaps larger
    #       than needed.
    # For now, we just use the smallest dtype large enough to fit the max number of categories; but allowing for
    # larger (randomly-selected) dtypes later will help ensure we test cases where there are non-consecutive
    # category values even when the max_categories value is near the max value of a dtype.
    # NOTE(review): the dtype depends only on max_categories, not on use_signed_integer_dtype -- confirm
    # whether the signed/negative flags are meant to influence it (see the TODO above).
    values_dtype = np.min_scalar_type(max_categories)

    # Create the strategy for the category values (integer values representing the categories).
    values_dtype_info = np.iinfo(values_dtype)
    values_strat = st.integers(
        min_value=(values_dtype_info.min if allow_negative_category_values else 0),
        max_value=values_dtype_info.max,
    )

    # Create an array of unique category values/codes.
    cats_shapes = array_shapes(max_dims=1, max_side=max_categories)
    unique_cat_values = draw(arrays(dtype=values_dtype, shape=cats_shapes, elements=values_strat, unique=True))

    # Draw the string dtype for the labels based on whether we want a byte (ascii) string or Unicode.
    if is_unicode:
        labels_dtype = draw(unicode_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))
    else:
        labels_dtype = draw(byte_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))

    # Create an array of unique category labels; this must be the same shape as the unique category values array.
    category_label_strat = category_labels(min_str_len, max_str_len, unicode=is_unicode)
    unique_labels = draw(
        arrays(dtype=labels_dtype, shape=unique_cat_values.shape, elements=category_label_strat, unique=True)
    )

    # TODO: Draw a slice (or None) that we'll apply to both arrays of uniques (the labels and values)
    #       before using them to create the category dictionary.
    #       This allows us to cover cases where a category value isn't in the dictionary.

    # Combine the unique category labels and values to create a dictionary.
    category_dict = dict(zip(unique_labels, unique_cat_values))

    # Use integer_array_indices to create a fancy index into the array of unique values.
    # Apply it to expand the array of unique values into an array where those values may occur zero or more times.
    fancy_index_shapes = array_shapes(max_dims=1, max_side=max_length)
    fancy_index = draw(integer_array_indices(shape=unique_cat_values.shape, result_shape=fancy_index_shapes))

    # Apply the fancy index to the array of unique category values to produce an
    # array where each category appears zero or more times; then create the Categorical from that.
    cat_values = unique_cat_values[fancy_index]
    return Categorical(cat_values, categories=category_dict, ordered=is_ordered, unicode=is_unicode)