def testWithoutShuffleAndWithOneChunk(self):
    # Only the index axis is monotonic. The columns are unordered, but each
    # operand is tiled with all 10 columns in one chunk, and the test expects
    # every aligned input to be a plain DataFrameIndexAlignMap.

    # left operand: increasing index 0..9, tiled with chunk_size=(5, 10)
    left_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    left_df = from_pandas(left_raw, chunk_size=(5, 10))
    # right operand: decreasing index 11..2, tiled with chunk_size=(6, 10)
    right_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    right_df = from_pandas(right_raw, chunk_size=(6, 10))

    total = add(left_df, right_df)

    # metadata of the result before tiling
    pd.testing.assert_index_equal(total.columns.to_pandas(),
                                  (left_raw + right_raw).columns)
    self.assertTrue(total.columns.should_be_monotonic)
    self.assertIsInstance(total.index_value.value, IndexValue.Int64Index)
    self.assertTrue(total.index_value.should_be_monotonic)
    pd.testing.assert_index_equal(total.index_value.to_pandas(), pd.Int64Index([]))
    for operand in (left_df, right_df):
        self.assertNotEqual(total.index_value.key, operand.index_value.key)
    # columns is recorded, so we can get it
    self.assertEqual(total.shape[1], 12)

    total.tiles()

    # per-chunk (min, min_close, max, max_close) of each operand's index
    left_bounds = [(0, True, 4, True), (5, True, 9, True)]
    right_bounds = [(2, True, 5, True), (6, True, 11, True)]
    left_splits, right_splits = split_monotonic_index_min_max(
        left_bounds, True, right_bounds, False)
    left_lookup = build_split_idx_to_origin_idx(left_splits)
    right_lookup = build_split_idx_to_origin_idx(right_splits, False)

    # (tiled operand, raw pandas data, index splits, split idx -> origin idx)
    sides = (
        (left_df, left_raw, left_splits, left_lookup),
        (right_df, right_raw, right_splits, right_lookup),
    )
    index_attrs = ('index_min', 'index_min_close', 'index_max', 'index_max_close')
    column_attrs = (
        ('column_min', 'min_val'),
        ('column_min_close', 'min_val_close'),
        ('column_max', 'max_val'),
        ('column_max_close', 'max_val_close'),
    )

    self.assertEqual(total.chunk_shape, (7, 1))
    for out_chunk in total.chunks:
        self.assertIsInstance(out_chunk.op, DataFrameAdd)
        self.assertEqual(len(out_chunk.inputs), 2)
        out_row = out_chunk.index[0]

        # inputs[0] is checked against the left operand, inputs[1] against the right
        for side, (operand, raw, splits, lookup) in enumerate(sides):
            aligned = out_chunk.inputs[side]
            self.assertIsInstance(aligned.op, DataFrameIndexAlignMap)

            origin_row, inner_pos = lookup[out_row]
            origin_chunk = operand.cix[origin_row, 0].data
            self.assertIs(aligned.inputs[0], origin_chunk)

            # index range handled by this map chunk
            bounds = splits[origin_row][inner_pos]
            for pos, attr in enumerate(index_attrs):
                self.assertEqual(getattr(aligned.op, attr), bounds[pos])
            self.assertIsInstance(aligned.index_value.to_pandas(), type(raw.index))

            # column range is taken over unchanged from the origin chunk
            origin_columns = origin_chunk.columns
            for op_attr, col_attr in column_attrs:
                self.assertEqual(getattr(aligned.op, op_attr),
                                 getattr(origin_columns, col_attr))

            pd.testing.assert_index_equal(aligned.columns.to_pandas(),
                                          origin_columns.to_pandas())
            pd.testing.assert_index_equal(aligned.dtypes.index,
                                          origin_columns.to_pandas())
def testAddWithOneShuffle(self):
    # Only the index axis is monotonic; the columns of both operands are
    # unordered and split over several chunks. The test expects each side of
    # every output chunk to be a DataFrameIndexAlignReduce fed by a
    # DataFrameShuffleProxy whose inputs are DataFrameIndexAlignMap chunks.

    # data1 with an increasing index 0..9, tiled with chunk_size=5
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas(data1, chunk_size=5)
    # data2 with a decreasing index 11..2, tiled with chunk_size=6
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    # test df3's index and columns
    pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
    self.assertTrue(df3.columns.should_be_monotonic)
    self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
    self.assertTrue(df3.index_value.should_be_monotonic)
    pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
    self.assertNotEqual(df3.index_value.key, df1.index_value.key)
    self.assertNotEqual(df3.index_value.key, df2.index_value.key)
    self.assertEqual(df3.shape[1], 12)  # columns is recorded, so we can get it

    df3.tiles()

    # per-chunk (min, min_close, max, max_close) of each operand's index
    data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
    data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]
    left_index_splits, right_index_splits = split_monotonic_index_min_max(
        data1_index_min_max, True, data2_index_min_max, False)

    left_index_idx_to_original_idx = build_split_idx_to_origin_idx(
        left_index_splits)
    right_index_idx_to_original_idx = build_split_idx_to_origin_idx(
        right_index_splits, False)

    self.assertEqual(df3.chunk_shape, (7, 2))
    for c in df3.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        idx = c.index

        # test the left side
        self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignReduce)
        # expected dtypes: the hash bucket of this column position,
        # concatenated over every map chunk feeding the shuffle proxy
        expect_dtypes = pd.concat([
            hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]]
            for ic in c.inputs[0].inputs[0].inputs
        ])
        pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes)
        pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(),
                                      c.inputs[0].dtypes.index)
        self.assertIsInstance(c.inputs[0].index_value.to_pandas(),
                              type(data1.index))
        self.assertIsInstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy)
        left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
        left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
        ics = [ic for ic in df1.chunks if ic.index[0] == left_row_idx]
        for j, (ci, ic) in enumerate(zip(c.inputs[0].inputs[0].inputs, ics)):
            self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
            self.assertEqual(ci.index, (idx[0], j))
            self.assertEqual(ci.op.index_min, left_index_min_max[0])
            self.assertEqual(ci.op.index_min_close, left_index_min_max[1])
            self.assertEqual(ci.op.index_max, left_index_min_max[2])
            self.assertEqual(ci.op.index_max_close, left_index_min_max[3])
            self.assertIsInstance(ci.index_value.to_pandas(), type(data1.index))
            # assertEqual, not assertTrue: assertTrue would treat 2 as the
            # failure message and accept any truthy shuffle size
            self.assertEqual(ci.op.column_shuffle_size, 2)
            shuffle_segments = ci.op.column_shuffle_segments
            expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2)
            self.assertEqual(len(shuffle_segments),
                             len(expected_shuffle_segments))
            for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                pd.testing.assert_series_equal(ss, ess)
            self.assertIs(ci.inputs[0], ic.data)

        # test the right side
        self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignReduce)
        expect_dtypes = pd.concat([
            hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]]
            for ic in c.inputs[1].inputs[0].inputs
        ])
        pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes)
        pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(),
                                      c.inputs[1].dtypes.index)
        # the right input is derived from data2, so compare with its index type
        self.assertIsInstance(c.inputs[1].index_value.to_pandas(),
                              type(data2.index))
        self.assertIsInstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy)
        right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
        right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
        ics = [ic for ic in df2.chunks if ic.index[0] == right_row_idx]
        for j, (ci, ic) in enumerate(zip(c.inputs[1].inputs[0].inputs, ics)):
            self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
            self.assertEqual(ci.index, (idx[0], j))
            self.assertEqual(ci.op.index_min, right_index_min_max[0])
            self.assertEqual(ci.op.index_min_close, right_index_min_max[1])
            self.assertEqual(ci.op.index_max, right_index_min_max[2])
            self.assertEqual(ci.op.index_max_close, right_index_min_max[3])
            self.assertEqual(ci.op.column_shuffle_size, 2)
            shuffle_segments = ci.op.column_shuffle_segments
            expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2)
            self.assertEqual(len(shuffle_segments),
                             len(expected_shuffle_segments))
            for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                pd.testing.assert_series_equal(ss, ess)
            self.assertIs(ci.inputs[0], ic.data)

    # make sure shuffle proxies' key are different:
    # one proxy per side per output row, all keys distinct
    proxy_keys = set()
    for i in range(df3.chunk_shape[0]):
        cs = [c for c in df3.chunks if c.index[0] == i]
        lps = {c.inputs[0].inputs[0].op.key for c in cs}
        self.assertEqual(len(lps), 1)
        proxy_keys.add(lps.pop())
        rps = {c.inputs[1].inputs[0].op.key for c in cs}
        self.assertEqual(len(rps), 1)
        proxy_keys.add(rps.pop())
    self.assertEqual(len(proxy_keys), 2 * df3.chunk_shape[0])
def testAddWithoutShuffle(self):
    # All the axes are monotonic, and the test expects every aligned input
    # to be a plain DataFrameIndexAlignMap on both sides.

    # data1 with index split into [0...4], [5...9],
    # columns [3...7], [8...12]
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=np.arange(3, 13))
    df1 = from_pandas(data1, chunk_size=5)
    # data2 with a decreasing index split into [11...6], [5...2],
    # columns [4...9], [10...13]
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=np.arange(4, 14))
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    def check_result_meta():
        # index/columns metadata of df3; identical before and after tiling
        pd.testing.assert_index_equal(df3.columns.to_pandas(),
                                      (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(),
                                      pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1], 11)  # columns is recorded, so we can get it

    # test df3's index and columns
    check_result_meta()

    df3.tiles()

    # test df3's index and columns after tiling
    check_result_meta()

    # per-chunk (min, min_close, max, max_close) of each operand's axes
    data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
    data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]]
    data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]
    data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)]

    left_index_splits, right_index_splits = split_monotonic_index_min_max(
        data1_index_min_max, True, data2_index_min_max, False)
    left_columns_splits, right_columns_splits = split_monotonic_index_min_max(
        data1_columns_min_max, True, data2_columns_min_max, True)

    left_index_idx_to_original_idx = build_split_idx_to_origin_idx(
        left_index_splits)
    right_index_idx_to_original_idx = build_split_idx_to_origin_idx(
        right_index_splits, False)
    left_columns_idx_to_original_idx = build_split_idx_to_origin_idx(
        left_columns_splits)
    right_columns_idx_to_original_idx = build_split_idx_to_origin_idx(
        right_columns_splits)

    self.assertEqual(df3.chunk_shape, (7, 7))
    for c in df3.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        idx = c.index

        # test the left side
        self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignMap)
        left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
        left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]]
        expect_df1_input = df1.cix[left_row_idx, left_col_idx].data
        self.assertIs(c.inputs[0].inputs[0], expect_df1_input)
        left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
        self.assertEqual(c.inputs[0].op.index_min, left_index_min_max[0])
        self.assertEqual(c.inputs[0].op.index_min_close, left_index_min_max[1])
        self.assertEqual(c.inputs[0].op.index_max, left_index_min_max[2])
        self.assertEqual(c.inputs[0].op.index_max_close, left_index_min_max[3])
        self.assertIsInstance(c.inputs[0].index_value.to_pandas(),
                              type(data1.index))
        left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx]
        self.assertEqual(c.inputs[0].op.column_min, left_column_min_max[0])
        self.assertEqual(c.inputs[0].op.column_min_close, left_column_min_max[1])
        self.assertEqual(c.inputs[0].op.column_max, left_column_min_max[2])
        self.assertEqual(c.inputs[0].op.column_max_close, left_column_min_max[3])
        expect_left_columns = filter_index_value(
            expect_df1_input.columns, left_column_min_max, store_data=True)
        pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(),
                                      expect_left_columns.to_pandas())
        pd.testing.assert_index_equal(c.inputs[0].dtypes.index,
                                      expect_left_columns.to_pandas())

        # test the right side
        self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignMap)
        right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
        right_col_idx, right_col_inner_idx = right_columns_idx_to_original_idx[idx[1]]
        expect_df2_input = df2.cix[right_row_idx, right_col_idx].data
        self.assertIs(c.inputs[1].inputs[0], expect_df2_input)
        right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
        self.assertEqual(c.inputs[1].op.index_min, right_index_min_max[0])
        self.assertEqual(c.inputs[1].op.index_min_close, right_index_min_max[1])
        self.assertEqual(c.inputs[1].op.index_max, right_index_min_max[2])
        self.assertEqual(c.inputs[1].op.index_max_close, right_index_min_max[3])
        self.assertIsInstance(c.inputs[1].index_value.to_pandas(),
                              type(data2.index))
        right_column_min_max = right_columns_splits[right_col_idx][right_col_inner_idx]
        self.assertEqual(c.inputs[1].op.column_min, right_column_min_max[0])
        self.assertEqual(c.inputs[1].op.column_min_close, right_column_min_max[1])
        self.assertEqual(c.inputs[1].op.column_max, right_column_min_max[2])
        self.assertEqual(c.inputs[1].op.column_max_close, right_column_min_max[3])
        # the right operand's columns are cut by the right-hand split range
        expect_right_columns = filter_index_value(
            expect_df2_input.columns, right_column_min_max, store_data=True)
        pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(),
                                      expect_right_columns.to_pandas())
        pd.testing.assert_index_equal(c.inputs[1].dtypes.index,
                                      expect_right_columns.to_pandas())
def testSplitMonotonicIndexMinMax(self):
    # Each entry is (min, min_close, max, max_close). The boolean passed
    # after each operand to split_monotonic_index_min_max is True for the
    # increasing cases and False for the decreasing one.

    # overlapping ranges: both operands end up with the same pieces
    first = [[0, True, 3, True], [3, False, 5, False]]
    second = [[1, False, 3, True], [4, False, 6, True]]
    shared_pieces = [
        [(0, True, 1, True), (1, False, 3, True)],
        [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)],
    ]
    got_left, got_right = split_monotonic_index_min_max(
        first, True, second, True)
    self.assertEqual(got_left, shared_pieces)
    self.assertEqual(got_right, shared_pieces)

    # same inputs swapped and flagged as decreasing: the outer lists come
    # back in reverse order
    got_left, got_right = split_monotonic_index_min_max(
        second, False, first, False)
    self.assertEqual(list(reversed(got_left)), shared_pieces)
    self.assertEqual(list(reversed(got_right)), shared_pieces)

    # a gap between 6 and 8 that neither operand covers
    first = [[2, True, 4, True], [8, True, 9, False]]
    second = [[1, False, 3, True], [4, False, 6, True]]
    got_left, got_right = split_monotonic_index_min_max(
        first, True, second, True)
    self.assertEqual(
        got_left,
        [[(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)],
         [(4, False, 6, True), (8, True, 9, False)]])
    self.assertEqual(
        got_right,
        [[(1, False, 2, False), (2, True, 3, True)],
         [(3, False, 4, True), (4, False, 6, True), (8, True, 9, False)]])

    # operands with a different number of chunks (4 versus 2)
    four_chunks = [[1, False, 3, True], [4, False, 6, True],
                   [10, True, 12, False], [13, True, 14, False]]
    two_chunks = [[2, True, 4, True], [5, True, 7, False]]
    four_chunk_pieces = [
        [(1, False, 2, False), (2, True, 3, True)],
        [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)],
        [(6, False, 7, False), (10, True, 12, False)],
        [(13, True, 14, False)],
    ]
    two_chunk_pieces = [
        [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)],
        [(4, False, 5, False), (5, True, 6, True), (6, False, 7, False),
         (10, True, 12, False), (13, True, 14, False)],
    ]
    got_left, got_right = split_monotonic_index_min_max(
        four_chunks, True, two_chunks, True)
    self.assertEqual(got_left, four_chunk_pieces)
    self.assertEqual(got_right, two_chunk_pieces)

    # swapping the operands swaps the expected results
    got_left, got_right = split_monotonic_index_min_max(
        two_chunks, True, four_chunks, True)
    self.assertEqual(got_left, two_chunk_pieces)
    self.assertEqual(got_right, four_chunk_pieces)

    # left min_max like ([.., .., 4 True], [4, False, ..., ...]
    # right min_max like ([..., ..., 4 False], [4, True, ..., ...]
    first = [[1, False, 4, True], [4, False, 6, True]]
    second = [[1, False, 4, False], [4, True, 6, True]]
    got_left, got_right = split_monotonic_index_min_max(
        first, True, second, True)
    self.assertEqual(
        got_left,
        [[(1, False, 4, False), (4, True, 4, True)], [(4, False, 6, True)]])
    self.assertEqual(
        got_right,
        [[(1, False, 4, False)], [(4, True, 4, True), (4, False, 6, True)]])

    # identical index: every chunk maps to exactly one piece
    first = [[1, False, 3, True], [4, False, 6, True]]
    second = [[1, False, 3, True], [4, False, 6, True]]
    got_left, got_right = split_monotonic_index_min_max(
        first, True, second, True)
    untouched = [[tuple(bounds)] for bounds in first]
    self.assertEqual(got_left, untouched)
    self.assertEqual(got_right, untouched)