def test_int64_overflow_groupby_large_df_shuffled(self, agg): arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) i = np.random.choice(len(arr), len(arr) * 4) arr = np.vstack((arr, arr[i])) # add some duplicate rows i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows df = DataFrame(arr, columns=list("abcde")) df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10 gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! assert is_int64_overflow_possible(gr.grouper.shape) # manually compute groupings jim, joe = defaultdict(list), defaultdict(list) for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]): jim[key].append(a) joe[key].append(b) assert len(gr) == len(jim) mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) f = lambda a: np.fromiter(map(getattr(np, agg), a), dtype="f8") arr = np.vstack((f(jim.values()), f(joe.values()))).T res = DataFrame(arr, columns=["jim", "joe"], index=mi).sort_index() tm.assert_frame_equal(getattr(gr, agg)(), res)
def test_int64_overflow_moar(self): # GH9096 values = range(55109) data = DataFrame.from_dict({ "a": values, "b": values, "c": values, "d": values }) grouped = data.groupby(["a", "b", "c", "d"]) assert len(grouped) == len(values) arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) i = np.random.choice(len(arr), len(arr) * 4) arr = np.vstack((arr, arr[i])) # add sume duplicate rows i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows df = DataFrame(arr, columns=list("abcde")) df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10 gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! assert is_int64_overflow_possible(gr.grouper.shape) # manually compute groupings jim, joe = defaultdict(list), defaultdict(list) for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]): jim[key].append(a) joe[key].append(b) assert len(gr) == len(jim) mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) def aggr(func): f = lambda a: np.fromiter(map(func, a), dtype="f8") arr = np.vstack((f(jim.values()), f(joe.values()))).T res = DataFrame(arr, columns=["jim", "joe"], index=mi) return res.sort_index() tm.assert_frame_equal(gr.mean(), aggr(np.mean)) tm.assert_frame_equal(gr.median(), aggr(np.median))
def test_int64_overflow_moar(self): # GH9096 values = range(55109) data = pd.DataFrame.from_dict({'a': values, 'b': values, 'c': values, 'd': values}) grouped = data.groupby(['a', 'b', 'c', 'd']) assert len(grouped) == len(values) arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) i = np.random.choice(len(arr), len(arr) * 4) arr = np.vstack((arr, arr[i])) # add sume duplicate rows i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows df = DataFrame(arr, columns=list('abcde')) df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 gr = df.groupby(list('abcde')) # verify this is testing what it is supposed to test! assert is_int64_overflow_possible(gr.grouper.shape) # mannually compute groupings jim, joe = defaultdict(list), defaultdict(list) for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']): jim[key].append(a) joe[key].append(b) assert len(gr) == len(jim) mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde')) def aggr(func): f = lambda a: np.fromiter(map(func, a), dtype='f8') arr = np.vstack((f(jim.values()), f(joe.values()))).T res = DataFrame(arr, columns=['jim', 'joe'], index=mi) return res.sort_index() assert_frame_equal(gr.mean(), aggr(np.mean)) assert_frame_equal(gr.median(), aggr(np.median))
def test_int64_overflow_issues(self): # #2690, combinatorial explosion df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"]) df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"]) # it works! result = merge(df1, df2, how="outer") assert len(result) == 2000 low, high, n = -1 << 10, 1 << 10, 1 << 20 left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) left["left"] = left.sum(axis=1) # one-2-one match i = np.random.permutation(len(left)) right = left.iloc[i].copy() right.columns = right.columns[:-1].tolist() + ["right"] right.index = np.arange(len(right)) right["right"] *= -1 out = merge(left, right, how="outer") assert len(out) == len(left) tm.assert_series_equal(out["left"], -out["right"], check_names=False) result = out.iloc[:, :-2].sum(axis=1) tm.assert_series_equal(out["left"], result, check_names=False) assert result.name is None out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) for how in ["left", "right", "outer", "inner"]: tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) # check that left merge w/ sort=False maintains left frame order out = merge(left, right, how="left", sort=False) tm.assert_frame_equal(left, out[left.columns.tolist()]) out = merge(right, left, how="left", sort=False) tm.assert_frame_equal(right, out[right.columns.tolist()]) # one-2-many/none match n = 1 << 11 left = DataFrame( np.random.randint(low, high, (n, 7)).astype("int64"), columns=list("ABCDEFG"), ) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values assert is_int64_overflow_possible(shape) # add duplicates to left frame left = concat([left, left], ignore_index=True) right = DataFrame( np.random.randint(low, high, (n // 2, 7)).astype("int64"), columns=list("ABCDEFG"), ) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) left["left"] = np.random.randn(len(left)) right["right"] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) left = left.iloc[i].copy() left.index = np.arange(len(left)) i = np.random.permutation(len(right)) right = right.iloc[i].copy() right.index = np.arange(len(right)) # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) for idx, row in left.set_index(list("ABCDEFG")).iterrows(): ldict[idx].append(row["left"]) for idx, row in right.set_index(list("ABCDEFG")).iterrows(): rdict[idx].append(row["right"]) vals = [] for k, lval in ldict.items(): rval = rdict.get(k, [np.nan]) for lv, rv in product(lval, rval): vals.append(k + tuple([lv, rv])) for k, rval in rdict.items(): if k not in ldict: for rv in rval: vals.append(k + tuple([np.nan, rv])) def align(df): df = df.sort_values(df.columns.tolist()) df.index = np.arange(len(df)) return df def verify_order(df): kcols = list("ABCDEFG") tm.assert_frame_equal( df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort")) out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) out = align(out) jmask = { "left": out["left"].notna(), "right": out["right"].notna(), "inner": out["left"].notna() & out["right"].notna(), "outer": np.ones(len(out), dtype="bool"), } for how in "left", "right", "outer", "inner": mask = jmask[how] frame = align(out[mask].copy()) assert mask.all() ^ mask.any() or how == "outer" for sort in [False, True]: res = merge(left, right, how=how, sort=sort) if sort: verify_order(res) # as in GH9092 dtypes break with outer/right join tm.assert_frame_equal(frame, align(res), check_dtype=how not in ("right", "outer"))
def test_int64_overflow_one_to_many_none_match(self, how, sort): # one-2-many/none match low, high, n = -1 << 10, 1 << 10, 1 << 11 left = DataFrame( np.random.randint(low, high, (n, 7)).astype("int64"), columns=list("ABCDEFG"), ) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values assert is_int64_overflow_possible(shape) # add duplicates to left frame left = concat([left, left], ignore_index=True) right = DataFrame( np.random.randint(low, high, (n // 2, 7)).astype("int64"), columns=list("ABCDEFG"), ) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) left["left"] = np.random.randn(len(left)) right["right"] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) left = left.iloc[i].copy() left.index = np.arange(len(left)) i = np.random.permutation(len(right)) right = right.iloc[i].copy() right.index = np.arange(len(right)) # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) for idx, row in left.set_index(list("ABCDEFG")).iterrows(): ldict[idx].append(row["left"]) for idx, row in right.set_index(list("ABCDEFG")).iterrows(): rdict[idx].append(row["right"]) vals = [] for k, lval in ldict.items(): rval = rdict.get(k, [np.nan]) for lv, rv in product(lval, rval): vals.append(k + ( lv, rv, )) for k, rval in rdict.items(): if k not in ldict: for rv in rval: vals.append(k + ( np.nan, rv, )) def align(df): df = df.sort_values(df.columns.tolist()) df.index = np.arange(len(df)) return df out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) out = align(out) jmask = { "left": out["left"].notna(), "right": out["right"].notna(), "inner": out["left"].notna() & out["right"].notna(), "outer": np.ones(len(out), dtype="bool"), } mask = jmask[how] frame = align(out[mask].copy()) assert mask.all() ^ mask.any() or how == "outer" res = merge(left, right, how=how, sort=sort) if sort: kcols = list("ABCDEFG") tm.assert_frame_equal( res[kcols].copy(), res[kcols].sort_values(kcols, kind="mergesort")) # as in GH9092 dtypes break with outer/right join # 2021-12-18: dtype does not break anymore tm.assert_frame_equal(frame, align(res))
def test_int64_overflow_issues(self): # #2690, combinatorial explosion df1 = DataFrame(np.random.randn(1000, 7), columns=list('ABCDEF') + ['G1']) df2 = DataFrame(np.random.randn(1000, 7), columns=list('ABCDEF') + ['G2']) # it works! result = merge(df1, df2, how='outer') assert len(result) == 2000 low, high, n = -1 << 10, 1 << 10, 1 << 20 left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) left['left'] = left.sum(axis=1) # one-2-one match i = np.random.permutation(len(left)) right = left.iloc[i].copy() right.columns = right.columns[:-1].tolist() + ['right'] right.index = np.arange(len(right)) right['right'] *= -1 out = merge(left, right, how='outer') assert len(out) == len(left) assert_series_equal(out['left'], - out['right'], check_names=False) result = out.iloc[:, :-2].sum(axis=1) assert_series_equal(out['left'], result, check_names=False) assert result.name is None out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) for how in ['left', 'right', 'outer', 'inner']: assert_frame_equal(out, merge(left, right, how=how, sort=True)) # check that left merge w/ sort=False maintains left frame order out = merge(left, right, how='left', sort=False) assert_frame_equal(left, out[left.columns.tolist()]) out = merge(right, left, how='left', sort=False) assert_frame_equal(right, out[right.columns.tolist()]) # one-2-many/none match n = 1 << 11 left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), columns=list('ABCDEFG')) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values assert is_int64_overflow_possible(shape) # add duplicates to left frame left = concat([left, left], ignore_index=True) right = DataFrame(np.random.randint(low, high, (n // 2, 7)) .astype('int64'), columns=list('ABCDEFG')) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) left['left'] = np.random.randn(len(left)) right['right'] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) left = left.iloc[i].copy() left.index = np.arange(len(left)) i = np.random.permutation(len(right)) right = right.iloc[i].copy() right.index = np.arange(len(right)) # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) for idx, row in left.set_index(list('ABCDEFG')).iterrows(): ldict[idx].append(row['left']) for idx, row in right.set_index(list('ABCDEFG')).iterrows(): rdict[idx].append(row['right']) vals = [] for k, lval in ldict.items(): rval = rdict.get(k, [np.nan]) for lv, rv in product(lval, rval): vals.append(k + tuple([lv, rv])) for k, rval in rdict.items(): if k not in ldict: for rv in rval: vals.append(k + tuple([np.nan, rv])) def align(df): df = df.sort_values(df.columns.tolist()) df.index = np.arange(len(df)) return df def verify_order(df): kcols = list('ABCDEFG') assert_frame_equal(df[kcols].copy(), df[kcols].sort_values(kcols, kind='mergesort')) out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) out = align(out) jmask = {'left': out['left'].notna(), 'right': out['right'].notna(), 'inner': out['left'].notna() & out['right'].notna(), 'outer': np.ones(len(out), dtype='bool')} for how in 'left', 'right', 'outer', 'inner': mask = jmask[how] frame = align(out[mask].copy()) assert mask.all() ^ mask.any() or how == 'outer' for sort in [False, True]: res = merge(left, right, how=how, sort=sort) if sort: verify_order(res) # as in GH9092 dtypes break with outer/right join assert_frame_equal(frame, align(res), check_dtype=how not in ('right', 'outer'))
def test_int64_overflow(self): B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) A = np.arange(2500) df = DataFrame({'A': A, 'B': B, 'C': A, 'D': B, 'E': A, 'F': B, 'G': A, 'H': B, 'values': np.random.randn(2500)}) lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) left = lg.sum()['values'] right = rg.sum()['values'] exp_index, _ = left.index.sortlevel() self.assert_index_equal(left.index, exp_index) exp_index, _ = right.index.sortlevel(0) self.assert_index_equal(right.index, exp_index) tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' ]].values)) tups = com._asarray_tuplesafe(tups) expected = df.groupby(tups).sum()['values'] for k, v in compat.iteritems(expected): self.assertEqual(left[k], right[k[::-1]]) self.assertEqual(left[k], v) self.assertEqual(len(left), len(right)) # GH9096 values = range(55109) data = pd.DataFrame.from_dict({'a': values, 'b': values, 'c': values, 'd': values}) grouped = data.groupby(['a', 'b', 'c', 'd']) self.assertEqual(len(grouped), len(values)) arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) i = np.random.choice(len(arr), len(arr) * 4) arr = np.vstack((arr, arr[i])) # add sume duplicate rows i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows df = DataFrame(arr, columns=list('abcde')) df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 gr = df.groupby(list('abcde')) # verify this is testing what it is supposed to test! self.assertTrue(is_int64_overflow_possible(gr.grouper.shape)) # mannually compute groupings jim, joe = defaultdict(list), defaultdict(list) for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']): jim[key].append(a) joe[key].append(b) self.assertEqual(len(gr), len(jim)) mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde')) def aggr(func): f = lambda a: np.fromiter(map(func, a), dtype='f8') arr = np.vstack((f(jim.values()), f(joe.values()))).T res = DataFrame(arr, columns=['jim', 'joe'], index=mi) return res.sort_index() assert_frame_equal(gr.mean(), aggr(np.mean)) assert_frame_equal(gr.median(), aggr(np.median))