예제 #1
0
    def testWithoutShuffleAndWithOneChunk(self):
        """Check how ``add`` tiles when only the row axis has to be aligned.

        Both operands keep all ten columns in a single chunk
        (``chunk_size=(5, 10)`` and ``(6, 10)``).  Every output chunk is
        expected to be a ``DataFrameAdd`` over two ``DataFrameIndexAlignMap``
        chunks, each of which reads one source chunk directly.
        """
        # Left operand: ascending index 0..9 cut into rows [0..4] and [5..9];
        # the column labels are deliberately unsorted.
        lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=np.arange(10),
                               columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(lhs_raw, chunk_size=(5, 10))
        # Right operand: descending index 11..2 cut into rows [11..6] and
        # [5..2]; the column labels are unsorted here as well.
        rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=np.arange(11, 1, -1),
                               columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(rhs_raw, chunk_size=(6, 10))

        df3 = add(df1, df2)

        # Metadata of the result before tiling.
        out_columns = df3.columns
        pd.testing.assert_index_equal(out_columns.to_pandas(),
                                      (lhs_raw + rhs_raw).columns)
        self.assertTrue(out_columns.should_be_monotonic)
        out_index = df3.index_value
        self.assertIsInstance(out_index.value, IndexValue.Int64Index)
        self.assertTrue(out_index.should_be_monotonic)
        pd.testing.assert_index_equal(out_index.to_pandas(),
                                      pd.Int64Index([]))
        self.assertNotEqual(out_index.key, df1.index_value.key)
        self.assertNotEqual(out_index.key, df2.index_value.key)
        # The columns are recorded, so the column count is already known.
        self.assertEqual(df3.shape[1], 12)

        df3.tiles()

        # Row ranges of the source chunks in ascending order; the trailing
        # False in the calls below marks the right operand, whose index
        # runs downwards.
        lhs_row_ranges = [(0, True, 4, True), (5, True, 9, True)]
        rhs_row_ranges = [(2, True, 5, True), (6, True, 11, True)]
        lhs_splits, rhs_splits = split_monotonic_index_min_max(
            lhs_row_ranges, True, rhs_row_ranges, False)
        lhs_lookup = build_split_idx_to_origin_idx(lhs_splits)
        rhs_lookup = build_split_idx_to_origin_idx(rhs_splits, False)

        # Op attributes holding the row bounds, in the order used by a split.
        row_bound_attrs = ('index_min', 'index_min_close',
                           'index_max', 'index_max_close')
        # Op attribute -> attribute of the source chunk's columns it mirrors.
        column_bound_attrs = (('column_min', 'min_val'),
                              ('column_min_close', 'min_val_close'),
                              ('column_max', 'max_val'),
                              ('column_max_close', 'max_val_close'))
        # Position in ``inputs`` follows the order of this tuple.
        operands = ((df1, lhs_raw, lhs_splits, lhs_lookup),
                    (df2, rhs_raw, rhs_splits, rhs_lookup))

        self.assertEqual(df3.chunk_shape, (7, 1))
        for out_chunk in df3.chunks:
            self.assertIsInstance(out_chunk.op, DataFrameAdd)
            self.assertEqual(len(out_chunk.inputs), 2)
            out_row = out_chunk.index[0]
            for pos, (tiled, raw, splits, lookup) in enumerate(operands):
                aligned = out_chunk.inputs[pos]
                self.assertIsInstance(aligned.op, DataFrameIndexAlignMap)
                # Map the output row back to (source row chunk, piece of it).
                src_row, piece = lookup[out_row]
                source = tiled.cix[src_row, 0].data
                self.assertIs(aligned.inputs[0], source)
                bounds = splits[src_row][piece]
                for k, attr in enumerate(row_bound_attrs):
                    self.assertEqual(getattr(aligned.op, attr), bounds[k])
                self.assertIsInstance(aligned.index_value.to_pandas(),
                                      type(raw.index))
                # Columns are not split, so the op must carry the source
                # chunk's own column bounds and labels.
                for op_attr, columns_attr in column_bound_attrs:
                    self.assertEqual(getattr(aligned.op, op_attr),
                                     getattr(source.columns, columns_attr))
                expected_columns = source.columns.to_pandas()
                pd.testing.assert_index_equal(aligned.columns.to_pandas(),
                                              expected_columns)
                pd.testing.assert_index_equal(aligned.dtypes.index,
                                              expected_columns)
예제 #2
0
    def testAddWithOneShuffle(self):
        """Check how ``add`` tiles when the column axis has to be shuffled.

        Row labels of both operands are monotonic while column labels are
        not.  Every output chunk is expected to be a ``DataFrameAdd`` over two
        ``DataFrameIndexAlignReduce`` chunks; each of those reads from a
        ``DataFrameShuffleProxy`` whose inputs are ``DataFrameIndexAlignMap``
        chunks built on the source chunks of the matching row range.
        """
        # Number of column partitions the shuffle is expected to produce.
        shuffle_size = 2

        # only 1 axis is monotonic
        # data1 with index split into [0...4], [5...9]
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=5)
        # data2 with descending index split into [11...6], [5...2]
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        # test df3's index and columns
        pd.testing.assert_index_equal(df3.columns.to_pandas(),
                                      (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(),
                                      pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1],
                         12)  # columns is recorded, so we can get it

        df3.tiles()

        # Row ranges of the source chunks in ascending order; data2's index
        # is descending, hence the False flags for the right operand below.
        data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
        data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]

        left_index_splits, right_index_splits = split_monotonic_index_min_max(
            data1_index_min_max, True, data2_index_min_max, False)

        left_index_idx_to_original_idx = build_split_idx_to_origin_idx(
            left_index_splits)
        right_index_idx_to_original_idx = build_split_idx_to_origin_idx(
            right_index_splits, False)

        # One entry per operand, in the order they appear in ``c.inputs``.
        # Driving both sides through the same loop keeps their checks
        # symmetric (the right side used to compare against data1's index
        # type and skipped the per-map-chunk index type check).
        operands = (
            (df1, data1, left_index_splits, left_index_idx_to_original_idx),
            (df2, data2, right_index_splits, right_index_idx_to_original_idx),
        )

        self.assertEqual(df3.chunk_shape, (7, shuffle_size))
        for c in df3.chunks:
            self.assertIsInstance(c.op, DataFrameAdd)
            self.assertEqual(len(c.inputs), 2)
            idx = c.index
            for side, (tiled, raw, index_splits,
                       idx_to_original_idx) in enumerate(operands):
                reduce_chunk = c.inputs[side]
                self.assertIsInstance(reduce_chunk.op,
                                      DataFrameIndexAlignReduce)
                proxy_chunk = reduce_chunk.inputs[0]
                # The reducer's dtypes are the concatenation of the hash
                # segment selected by the output column position, taken from
                # every map chunk's source data.
                expect_dtypes = pd.concat([
                    hash_dtypes(ic.inputs[0].op.data.dtypes,
                                shuffle_size)[idx[1]]
                    for ic in proxy_chunk.inputs
                ])
                pd.testing.assert_series_equal(reduce_chunk.dtypes,
                                               expect_dtypes)
                pd.testing.assert_index_equal(reduce_chunk.columns.to_pandas(),
                                              reduce_chunk.dtypes.index)
                self.assertIsInstance(reduce_chunk.index_value.to_pandas(),
                                      type(raw.index))
                self.assertIsInstance(proxy_chunk.op, DataFrameShuffleProxy)

                row_idx, row_inner_idx = idx_to_original_idx[idx[0]]
                index_min_max = index_splits[row_idx][row_inner_idx]
                ics = [ic for ic in tiled.chunks if ic.index[0] == row_idx]
                for j, (ci, ic) in enumerate(zip(proxy_chunk.inputs, ics)):
                    self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
                    self.assertEqual(ci.index, (idx[0], j))
                    self.assertEqual(ci.op.index_min, index_min_max[0])
                    self.assertEqual(ci.op.index_min_close, index_min_max[1])
                    self.assertEqual(ci.op.index_max, index_min_max[2])
                    self.assertEqual(ci.op.index_max_close, index_min_max[3])
                    self.assertIsInstance(ci.index_value.to_pandas(),
                                          type(raw.index))
                    # assertEqual, not assertTrue: assertTrue(value, 2) would
                    # treat 2 as the failure message and accept any truthy
                    # shuffle size.
                    self.assertEqual(ci.op.column_shuffle_size, shuffle_size)
                    shuffle_segments = ci.op.column_shuffle_segments
                    expected_shuffle_segments = hash_dtypes(ic.data.dtypes,
                                                            shuffle_size)
                    self.assertEqual(len(shuffle_segments),
                                     len(expected_shuffle_segments))
                    for ss, ess in zip(shuffle_segments,
                                       expected_shuffle_segments):
                        pd.testing.assert_series_equal(ss, ess)
                    self.assertIs(ci.inputs[0], ic.data)

        # make sure shuffle proxies' keys are different: all chunks of one
        # output row share a single proxy per side, and no proxy is reused
        # across rows or sides
        proxy_keys = set()
        for i in range(df3.chunk_shape[0]):
            cs = [c for c in df3.chunks if c.index[0] == i]
            lps = {c.inputs[0].inputs[0].op.key for c in cs}
            self.assertEqual(len(lps), 1)
            proxy_keys.add(lps.pop())
            rps = {c.inputs[1].inputs[0].op.key for c in cs}
            self.assertEqual(len(rps), 1)
            proxy_keys.add(rps.pop())
        self.assertEqual(len(proxy_keys), 2 * df3.chunk_shape[0])
예제 #3
0
    def testAddWithoutShuffle(self):
        """Check how ``add`` tiles when both axes can be aligned by splitting.

        Row and column labels of both operands are monotonic.  Every output
        chunk is expected to be a ``DataFrameAdd`` over two
        ``DataFrameIndexAlignMap`` chunks, each reading exactly one source
        chunk and restricted to one row range and one column range.
        """
        # all the axes are monotonic
        # data1 with index split into [0...4], [5...9],
        # columns [3...7], [8...12]
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=np.arange(3, 13))
        df1 = from_pandas(data1, chunk_size=5)
        # data2 with descending index split into [11...6], [5...2],
        # columns [4...9], [10...13]
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=np.arange(4, 14))
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        def check_result_metadata():
            # Index/column metadata of df3; must hold before and after tiling.
            pd.testing.assert_index_equal(df3.columns.to_pandas(),
                                          (data1 + data2).columns)
            self.assertTrue(df3.columns.should_be_monotonic)
            self.assertIsInstance(df3.index_value.value,
                                  IndexValue.Int64Index)
            self.assertTrue(df3.index_value.should_be_monotonic)
            pd.testing.assert_index_equal(df3.index_value.to_pandas(),
                                          pd.Int64Index([]))
            self.assertNotEqual(df3.index_value.key, df1.index_value.key)
            self.assertNotEqual(df3.index_value.key, df2.index_value.key)
            self.assertEqual(df3.shape[1],
                             11)  # columns is recorded, so we can get it

        check_result_metadata()
        df3.tiles()
        check_result_metadata()

        # Ranges of the source chunks in ascending order; data2's index is
        # descending, hence the False flags for its row axis below.
        data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
        data1_columns_min_max = [(3, True, 7, True), (8, True, 12, True)]
        data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]
        data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)]

        left_index_splits, right_index_splits = split_monotonic_index_min_max(
            data1_index_min_max, True, data2_index_min_max, False)
        left_columns_splits, right_columns_splits = split_monotonic_index_min_max(
            data1_columns_min_max, True, data2_columns_min_max, True)

        left_index_idx_to_original_idx = build_split_idx_to_origin_idx(
            left_index_splits)
        right_index_idx_to_original_idx = build_split_idx_to_origin_idx(
            right_index_splits, False)
        left_columns_idx_to_original_idx = build_split_idx_to_origin_idx(
            left_columns_splits)
        right_columns_idx_to_original_idx = build_split_idx_to_origin_idx(
            right_columns_splits)

        # One entry per operand, in the order they appear in ``c.inputs``.
        # Each side is checked against its OWN splits; the right side used to
        # build its expected columns from the left side's column range.
        operands = (
            (df1, data1,
             left_index_splits, left_index_idx_to_original_idx,
             left_columns_splits, left_columns_idx_to_original_idx),
            (df2, data2,
             right_index_splits, right_index_idx_to_original_idx,
             right_columns_splits, right_columns_idx_to_original_idx),
        )

        self.assertEqual(df3.chunk_shape, (7, 7))
        for c in df3.chunks:
            self.assertIsInstance(c.op, DataFrameAdd)
            self.assertEqual(len(c.inputs), 2)
            idx = c.index
            for side, (tiled, raw, index_splits, index_lookup,
                       columns_splits, columns_lookup) in enumerate(operands):
                aligned = c.inputs[side]
                self.assertIsInstance(aligned.op, DataFrameIndexAlignMap)
                # Map the output position back to the source chunk and to the
                # piece of that chunk on each axis.
                row_idx, row_inner_idx = index_lookup[idx[0]]
                col_idx, col_inner_idx = columns_lookup[idx[1]]
                expect_input = tiled.cix[row_idx, col_idx].data
                self.assertIs(aligned.inputs[0], expect_input)

                index_min_max = index_splits[row_idx][row_inner_idx]
                self.assertEqual(aligned.op.index_min, index_min_max[0])
                self.assertEqual(aligned.op.index_min_close, index_min_max[1])
                self.assertEqual(aligned.op.index_max, index_min_max[2])
                self.assertEqual(aligned.op.index_max_close, index_min_max[3])
                self.assertIsInstance(aligned.index_value.to_pandas(),
                                      type(raw.index))

                column_min_max = columns_splits[col_idx][col_inner_idx]
                self.assertEqual(aligned.op.column_min, column_min_max[0])
                self.assertEqual(aligned.op.column_min_close,
                                 column_min_max[1])
                self.assertEqual(aligned.op.column_max, column_min_max[2])
                self.assertEqual(aligned.op.column_max_close,
                                 column_min_max[3])

                expect_columns = filter_index_value(expect_input.columns,
                                                    column_min_max,
                                                    store_data=True)
                pd.testing.assert_index_equal(aligned.columns.to_pandas(),
                                              expect_columns.to_pandas())
                pd.testing.assert_index_equal(aligned.dtypes.index,
                                              expect_columns.to_pandas())
예제 #4
0
    def testSplitMonotonicIndexMinMax(self):
        """Exercise ``split_monotonic_index_min_max`` on hand-written ranges.

        Every range is a 4-item sequence ``(min, min_close, max, max_close)``.
        The boolean passed after each list of ranges presumably states whether
        that side is increasing (verify against the function's signature).
        """
        # --- overlapping ranges: both sides end up with the same pieces ----
        left_min_max = [[0, True, 3, True], [3, False, 5, False]]
        right_min_max = [[1, False, 3, True], [4, False, 6, True]]
        shared_pieces = [
            [(0, True, 1, True), (1, False, 3, True)],
            [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)],
        ]
        left_splits, right_splits = split_monotonic_index_min_max(
            left_min_max, True, right_min_max, True)
        self.assertEqual(left_splits, shared_pieces)
        self.assertEqual(right_splits, shared_pieces)

        # Same inputs swapped and flagged False: the pieces come back in
        # reverse order.
        left_splits, right_splits = split_monotonic_index_min_max(
            right_min_max, False, left_min_max, False)
        self.assertEqual(list(reversed(left_splits)), shared_pieces)
        self.assertEqual(list(reversed(right_splits)), shared_pieces)

        # --- left side has a gap between 4 and 8 ---------------------------
        left_min_max = [[2, True, 4, True], [8, True, 9, False]]
        right_min_max = [[1, False, 3, True], [4, False, 6, True]]
        left_splits, right_splits = split_monotonic_index_min_max(
            left_min_max, True, right_min_max, True)
        self.assertEqual(left_splits, [
            [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)],
            [(4, False, 6, True), (8, True, 9, False)],
        ])
        self.assertEqual(right_splits, [
            [(1, False, 2, False), (2, True, 3, True)],
            [(3, False, 4, True), (4, False, 6, True), (8, True, 9, False)],
        ])

        # --- sides with a different number of ranges -----------------------
        left_min_max = [[1, False, 3, True], [4, False, 6, True],
                        [10, True, 12, False], [13, True, 14, False]]
        right_min_max = [[2, True, 4, True], [5, True, 7, False]]
        four_range_pieces = [
            [(1, False, 2, False), (2, True, 3, True)],
            [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)],
            [(6, False, 7, False), (10, True, 12, False)],
            [(13, True, 14, False)],
        ]
        two_range_pieces = [
            [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)],
            [(4, False, 5, False), (5, True, 6, True), (6, False, 7, False),
             (10, True, 12, False), (13, True, 14, False)],
        ]
        left_splits, right_splits = split_monotonic_index_min_max(
            left_min_max, True, right_min_max, True)
        self.assertEqual(left_splits, four_range_pieces)
        self.assertEqual(right_splits, two_range_pieces)

        # Swapping the arguments swaps the results.
        left_splits, right_splits = split_monotonic_index_min_max(
            right_min_max, True, left_min_max, True)
        self.assertEqual(left_splits, two_range_pieces)
        self.assertEqual(right_splits, four_range_pieces)

        # --- the boundary value 4 is owned by different chunks -------------
        # left ranges look like  [..., ..., 4, True],  [4, False, ..., ...]
        # right ranges look like [..., ..., 4, False], [4, True, ..., ...]
        left_min_max = [[1, False, 4, True], [4, False, 6, True]]
        right_min_max = [[1, False, 4, False], [4, True, 6, True]]
        left_splits, right_splits = split_monotonic_index_min_max(
            left_min_max, True, right_min_max, True)
        self.assertEqual(left_splits, [
            [(1, False, 4, False), (4, True, 4, True)],
            [(4, False, 6, True)],
        ])
        self.assertEqual(right_splits, [
            [(1, False, 4, False)],
            [(4, True, 4, True), (4, False, 6, True)],
        ])

        # --- identical ranges: nothing is split ----------------------------
        left_min_max = [[1, False, 3, True], [4, False, 6, True]]
        right_min_max = [[1, False, 3, True], [4, False, 6, True]]
        left_splits, right_splits = split_monotonic_index_min_max(
            left_min_max, True, right_min_max, True)
        unsplit = [[tuple(bounds)] for bounds in left_min_max]
        self.assertEqual(left_splits, unsplit)
        self.assertEqual(right_splits, unsplit)