def compare_strategies(length, ncat, op, dtype):
    """Benchmark global vs. per-locale GroupBy construction and aggregation.

    Builds `length` random keys in [0, ncat) and a values array of the
    requested dtype, times four phases (global groupby/reduce, local
    groupby/reduce), prints each elapsed time plus a key/value sanity
    comparison, and returns the four timings as a tuple:
    (ggtime, grtime, lgtime, lrtime).
    """
    keys = ak.randint(0, ncat, length)
    # Values array shaped by dtype: ints, sparse bools, or floats in [-1, 1]
    if dtype == 'int64':
        vals = ak.randint(0, length // ncat, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        # Flip roughly ncat/2 random positions to True
        for idx in np.random.randint(0, length, ncat // 2):
            vals[idx] = True
    else:
        vals = ak.linspace(-1, 1, length)

    def timed(label, thunk):
        # Run thunk, print its wall-clock time on the same line as label,
        # and return (result, elapsed_seconds).
        print(label, end=' ')
        t0 = time()
        result = thunk()
        elapsed = time() - t0
        print(elapsed)
        return result, elapsed

    gg, ggtime = timed("Global groupby", lambda: ak.GroupBy(keys, False))
    (gk, gv), grtime = timed("Global reduce", lambda: gg.aggregate(vals, op))
    lg, lgtime = timed("Local groupby", lambda: ak.GroupBy(keys, True))
    (lk, lv), lrtime = timed("Local reduce", lambda: lg.aggregate(vals, op))
    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
def test_multi_level_categorical(self):
    """GroupBy.nunique over two-level keys built from Strings, Categoricals
    (including one constructed via from_codes), and mixed combinations all
    produce the same per-key distinct-value counts."""
    string = ak.array(['a', 'b', 'a', 'b', 'c'])
    cat = ak.Categorical(string)
    cat_from_codes = ak.Categorical.from_codes(
        codes=ak.array([0, 1, 0, 1, 2]),
        categories=ak.array(['a', 'b', 'c']))
    i = ak.arange(string.size)
    expected = {('a', 'a'): 2, ('b', 'b'): 2, ('c', 'c'): 1}
    # Every pairing of key types must yield the identical answer
    key_pairings = (
        [string, string],           # list of 2 strings
        [cat, cat_from_codes],      # list of 2 cats (one from_codes)
        [cat_from_codes, string],   # one cat (from_codes) and one string
    )
    for keys in key_pairings:
        labels, values = ak.GroupBy(keys).nunique(i)
        self.assertDictEqual(expected, to_tuple_dict(labels, values))
def run_test(levels, verbose=False):
    '''
    The run_test method enables execution of ak.GroupBy and ak.GroupBy.Reductions
    on a randomized set of arrays on the specified number of levels.

    Note: the current set of valid levels is {1,2}

    :param levels: number of key levels (1 or 2)
    :param verbose: if True, print progress and pandas/arkouda error details
    :raises ValueError: if levels is not 1 or 2
    :return: number of failed comparisons
    '''
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k: ak.array(v) for k, v in d.items()}
    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    else:
        # BUG FIX: previously any other value left akg/keyname unbound and
        # crashed later with UnboundLocalError; fail fast with a clear message.
        raise ValueError(f"levels must be 1 or 2, got {levels!r}")
    tests = 0
    failures = 0
    not_impl = 0
    if verbose:
        print("Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    akkeys, akvals = akg.count()
    akvals = akvals.to_ndarray()
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            if verbose:
                print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op, levels)
            except Exception:
                # pandas lacks this op/dtype combination; still exercise arkouda
                if verbose:
                    print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                akvals = akvals.to_ndarray()
            except RuntimeError as E:
                if verbose:
                    print("Arkouda error: ", E)
                not_impl += 1
                do_check = False
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                # argmin/argmax return indices; compare the extrema they select
                # rather than the indices themselves (ties make indices ambiguous)
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print("Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    print(f"{tests - failures - not_impl} / {tests - not_impl} passed, "
          f"{failures} errors, {not_impl} not implemented")
    return failures
def setUp(self):
    """Create the fixture arrays and GroupBy shared by the tests in this class."""
    ArkoudaTest.setUp(self)  # run the base ArkoudaTest setup first
    # bool- and float-dtype arrays, used to exercise dtype-validation errors
    self.bvalues = ak.randint(0, 1, 10, dtype=bool)
    self.fvalues = ak.randint(0, 1, 10, dtype=float)
    # int64 keys with a known group structure, plus a GroupBy over them
    self.ivalues = ak.array([4, 1, 3, 2, 2, 2, 5, 5, 2, 3])
    self.igb = ak.GroupBy(self.ivalues)
def test_zero_length_groupby(self):
    """
    This tests groupby boundary condition on a zero length pdarray,
    see Issue #900 for details
    """
    empty_keys = ak.zeros(0, dtype=ak.int64)
    grouping = ak.GroupBy(empty_keys)
    # Materializing segments used to fail on empty input; simply reaching
    # this statement without an exception is the passing condition.
    str(grouping.segments)
def run_test_groupby(strings, cat, akset):
    """Assert that GroupBy over Strings and Categorical keys agree with each
    other and correctly partition the data into contiguous groups."""
    str_group = ak.GroupBy(strings)
    cat_group = ak.GroupBy(cat)
    # Unique keys should be same result as ak.unique
    assert akset == set(str_group.unique_keys.to_ndarray())
    assert akset == set(cat_group.unique_keys.to_ndarray())
    assert (cat_group.permutation == str_group.permutation).all()
    permuted = strings[str_group.permutation].to_ndarray()
    # Group lengths are differences of consecutive segment offsets,
    # with the array length appended as the final boundary
    boundaries = np.hstack((str_group.segments.to_ndarray(),
                            np.array([str_group.size])))
    lengths = np.diff(boundaries)
    for key, start, count in zip(str_group.unique_keys.to_ndarray(),
                                 str_group.segments.to_ndarray(),
                                 lengths):
        # Every value inside the group equals its key...
        assert (permuted[start:start + count] == key).all()
        # ...and the key never appears outside the group
        assert not (permuted[:start] == key).any()
        assert not (permuted[start + count:] == key).any()
def test_count(self):
    """GroupBy.count returns the sorted unique keys and per-group sizes."""
    data = ak.array([4, 1, 3, 2, 2, 2, 5, 5, 2, 3])
    grouped = ak.GroupBy(data)
    unique_keys, group_sizes = grouped.count()
    expected_keys = np.array([1, 2, 3, 4, 5])
    expected_sizes = np.array([1, 4, 2, 1, 2])
    self.assertTrue((expected_keys == unique_keys.to_ndarray()).all())
    self.assertTrue((expected_sizes == group_sizes.to_ndarray()).all())
def test_error_handling(self):
    """broadcast must reject non-pdarray values with a descriptive TypeError."""
    raw = make_arrays()
    columns = {name: ak.array(col) for name, col in raw.items()}
    grouping = ak.GroupBy([columns['keys'], columns['keys2']])
    # A plain list is not a pdarray, so broadcast should refuse it
    with self.assertRaises(TypeError) as cm:
        grouping.broadcast([])
    self.assertEqual(
        'type of argument "values" must be arkouda.pdarrayclass.pdarray; got list instead',
        cm.exception.args[0])
def test_nunique_types(self):
    """GroupBy.nunique works for every combination of key and value types,
    including a mixed multi-level key."""
    string = ak.array(['a', 'b', 'a', 'b', 'c'])
    cat = ak.Categorical(string)
    i = ak.array([5, 3, 5, 3, 1])
    expected = ak.array([1, 1, 1])
    # Single-type candidates plus one mixed (Strings, Categorical, int) tuple
    candidates = (string, cat, i, (string, cat, i))
    for key in candidates:
        grouping = ak.GroupBy(key)
        for val in candidates:
            _, counts = grouping.nunique(val)
            self.assertTrue((counts == expected).all())
def test_aggregate_strings(self):
    """nunique over Strings keys yields one distinct-value count per key."""
    s = ak.array(['a', 'b', 'a', 'b', 'c'])
    i = ak.arange(s.size)
    labels, values = ak.GroupBy(s).nunique(i)
    expected = {'a': 2, 'b': 2, 'c': 1}
    # Pair each label with its count and compare as plain dicts
    actual = dict(zip(labels.to_ndarray(), values.to_ndarray()))
    self.assertDictEqual(expected, actual)
def run_test(verbose=True):
    '''
    The run_test method enables execution of ak.GroupBy and ak.GroupBy.Reductions
    for mean, min, max, and sum on a randomized set of arrays including nan values.

    :return: number of failed comparisons
    '''
    raw = make_arrays()
    df = pd.DataFrame(raw)
    columns = {name: ak.array(col) for name, col in raw.items()}
    grouping = ak.GroupBy(columns['keys'])
    keyname = 'keys'
    tests, failures, not_impl = 0, 0, 0
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'float64', 'count')
    akkeys, akvals = grouping.count()
    akvals = akvals.to_ndarray()
    for op in OPS:
        tests += 1
        do_check = True
        try:
            pdkeys, pdvals = groupby_to_arrays(df, keyname, 'float64', op)
        except Exception:
            # pandas lacks this op; still run the arkouda side
            if verbose:
                print("Pandas does not implement")
            do_check = False
        try:
            akkeys, akvals = grouping.aggregate(columns['float64'], op, True)
            akvals = akvals.to_ndarray()
        except RuntimeError as err:
            if verbose:
                print("Arkouda error: ", err)
            not_impl += 1
            do_check = False
            continue
        if not do_check:
            continue
        # clear out any nans to match ak implementation
        pdvals[np.isnan(pdvals)] = 0.0
        failures += compare_keys(pdkeys, akkeys, pdvals, akvals)
    return failures
def testPrecision(self):
    """Grouped mean of large int64 values must not lose float precision.

    See https://github.com/Bears-R-Us/arkouda/issues/964 -- grouped sum was
    exacerbating floating point errors; this test verifies the fix.
    """
    N = 10**6
    G = N // 10
    upper = 2**63 // N  # keep each group's sum within int64 range
    groupnum = ak.randint(0, G, N, seed=1)
    intval = ak.randint(0, upper, N, seed=2)
    floatval = ak.cast(intval, ak.float64)
    grouping = ak.GroupBy(groupnum)
    _, intmean = grouping.mean(intval)
    _, floatmean = grouping.mean(floatval)
    # The int and float paths should produce essentially identical means
    mse = ak.mean((intmean - floatmean)**2)
    self.assertTrue(np.isclose(mse, 0.0))
def test_broadcast_booleans(self):
    """broadcast maps per-group boolean results back to the elements of each
    group (expected arrays are in permuted/grouped element order).

    BUG FIX: the three result checks previously called
    self.assertTrue((np.array(...), results.to_ndarray())) -- passing a
    2-tuple, which is always truthy, so the broadcast output was never
    actually verified. They now compare elementwise and reduce with .all().
    """
    values = ak.array([4, 1, 3, 2, 2, 2, 5, 5, 2, 3])
    gb = ak.GroupBy(values)
    keys, counts = gb.count()
    self.assertTrue((np.array([1, 4, 2, 1, 2]) == counts.to_ndarray()).all())
    self.assertTrue((np.array([1, 2, 3, 4, 5]) == keys.to_ndarray()).all())
    # Only key 2 has count > 2; its four elements occupy positions 1-4
    # of the grouped order [1, 2,2,2,2, 3,3, 4, 5,5]
    results = gb.broadcast(counts > 2)
    self.assertTrue(
        (np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0]) == results.to_ndarray()).all())
    # Keys 3 and 5 each have count == 2
    results = gb.broadcast(counts == 2)
    self.assertTrue(
        (np.array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1]) == results.to_ndarray()).all())
    # Every key except 2 has count < 4
    results = gb.broadcast(counts < 4)
    self.assertTrue(
        (np.array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1]) == results.to_ndarray()).all())
def test_error_handling(self):
    """Verify the exact TypeError messages raised for unsupported argument
    and dtype combinations across GroupBy construction and reductions.

    NOTE(review): relies on fixtures from setUp (self.bvalues, self.fvalues,
    self.igb) and on bare dtype names (bool, float64, int64) being in scope
    at module level -- presumably imported from arkouda; verify.
    """
    d = make_arrays()
    akdf = {k: ak.array(v) for k, v in d.items()}
    gb = ak.GroupBy([akdf['keys'], akdf['keys2']])
    # GroupBy keys must be int64 pdarrays: bool and float keys are rejected
    with self.assertRaises(TypeError) as cm:
        ak.GroupBy(self.bvalues)
    self.assertEqual('GroupBy only supports pdarrays with a dtype int64',
                     cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        ak.GroupBy(self.fvalues)
    self.assertEqual('GroupBy only supports pdarrays with a dtype int64',
                     cm.exception.args[0])
    # broadcast only accepts pdarray values, not plain lists
    with self.assertRaises(TypeError) as cm:
        gb.broadcast([])
    self.assertEqual(
        'type of argument "values" must be arkouda.pdarrayclass.pdarray; got list instead',
        cm.exception.args[0])
    # nunique requires int64 values
    with self.assertRaises(TypeError) as cm:
        self.igb.nunique(ak.randint(0, 1, 10, dtype=bool))
    self.assertEqual('the pdarray dtype must be int64', cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.nunique(ak.randint(0, 1, 10, dtype=float64))
    self.assertEqual('the pdarray dtype must be int64', cm.exception.args[0])
    # any/all require bool values
    with self.assertRaises(TypeError) as cm:
        self.igb.any(ak.randint(0, 1, 10, dtype=float64))
    self.assertEqual('any is only supported for pdarrays of dtype bool',
                     cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.any(ak.randint(0, 1, 10, dtype=int64))
    self.assertEqual('any is only supported for pdarrays of dtype bool',
                     cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.all(ak.randint(0, 1, 10, dtype=float64))
    self.assertEqual('all is only supported for pdarrays of dtype bool',
                     cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.all(ak.randint(0, 1, 10, dtype=int64))
    self.assertEqual('all is only supported for pdarrays of dtype bool',
                     cm.exception.args[0])
    # min/max/argmin/argmax require numeric (float64 or int64) values
    with self.assertRaises(TypeError) as cm:
        self.igb.min(ak.randint(0, 1, 10, dtype=bool))
    self.assertEqual(
        'min is only supported for pdarrays of dtype float64 and int64',
        cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.max(ak.randint(0, 1, 10, dtype=bool))
    self.assertEqual(
        'max is only supported for pdarrays of dtype float64 and int64',
        cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.argmin(ak.randint(0, 1, 10, dtype=bool))
    self.assertEqual(
        'argmin is only supported for pdarrays of dtype float64 and int64',
        cm.exception.args[0])
    with self.assertRaises(TypeError) as cm:
        self.igb.argmax(ak.randint(0, 1, 10, dtype=bool))
    self.assertEqual(
        'argmax is only supported for pdarrays of dtype float64 and int64',
        cm.exception.args[0])
assert (word in more_words) # Exhaustively find all matches to make sure we didn't miss any inds = ak.zeros(strings.size, dtype=ak.bool) for word in more_words: inds |= (strings == word) assert ((inds == matches).all()) print("in1d and iter passed") # argsort test_argsort(strings, test_strings, cat) # unique test_unique(strings, test_strings, cat) # groupby g = ak.GroupBy(strings) gc = ak.GroupBy(cat) # Unique keys should be same result as ak.unique assert (akset == set(g.unique_keys.to_ndarray())) assert (akset == set(gc.unique_keys.to_ndarray())) assert ((gc.permutation == g.permutation).all()) permStrings = strings[g.permutation] # Check each group individually lengths = np.diff(np.hstack((g.segments.to_ndarray(), np.array([g.size])))) for uk, s, l in zip(g.unique_keys, g.segments, lengths): # All values in group should equal key assert ((permStrings[s:s + l] == uk).all()) # Key should not appear anywhere outside of group assert (not (permStrings[:s] == uk).any()) assert (not (permStrings[s + l:] == uk).any()) print("groupby passed")
def test_type_failure_multilevel_groupby_aggregate(self):
    """Regression guard for Issue 858: multi-level GroupBy min must complete
    without raising (hotfix check -- no result assertions needed)."""
    # Two levels of random integer keys
    keys = [ak.randint(0, 10, 100), ak.randint(0, 10, 100)]
    grouping = ak.GroupBy(keys)
    grouping.min(ak.randint(0, 10, 100))
if __name__ == '__main__':
    import sys
    # Expected CLI: server port strategy length num_keys num_vals
    if len(sys.argv) != 7:
        print(
            f"Usage: {sys.argv[0]} <server> <port> <strategy (0=global, 1=perLocale)> <length> <num_keys> <num_vals>"
        )
        sys.exit()
    server, port, strategy, raw_length, raw_nkeys, raw_nvals = sys.argv[1:7]
    per_locale = (strategy == '1')
    print("per_locale = ", per_locale)
    length = int(raw_length)
    print("length = ", length)
    nkeys = int(raw_nkeys)
    print("nkeys = ", nkeys)
    nvals = int(raw_nvals)
    print("nvals = ", nvals)
    ak.connect(server, int(port))
    print("Generating keys and vals...")
    start = time()
    keys, vals = generate_arrays(length, nkeys, nvals)
    print(f"{time() - start:.2f} seconds", end="\n\n")
    print("GroupBy...")
    start = time()
    g = ak.GroupBy(keys, per_locale)
    print(f"{time() - start:.2f} seconds", end="\n\n")
    # Time every supported aggregation on the same grouping
    for op in OPERATORS:
        print(f"Aggregate('{op}') ...")
        start = time()
        uk, rv = g.aggregate(vals, op)
        print(f"{time() - start:.2f} seconds", end="\n\n")
    sys.exit()