def testFilterIndexValue(self):
    """Compare ``filter_index_value`` with plain pandas boolean masking.

    Every ``min_max`` tuple used here is laid out as
    ``(lower, lower_inclusive, upper, upper_inclusive)``; the reference
    result is obtained by applying the matching comparisons directly to
    the pandas index.
    """
    def masked(index, bounds):
        # Reference answer computed with ordinary pandas comparisons.
        low, low_closed, high, high_closed = bounds
        above = (index >= low) if low_closed else (index > low)
        below = (index <= high) if high_closed else (index < high)
        return index[above & below].tolist()

    # (pandas index, extra kwargs for parse_index, lower bound, upper bound)
    range_cases = [
        (pd.RangeIndex(10), {}, 0, 9),
        (pd.RangeIndex(1, 11, 3), {}, 2, 10),
        (pd.RangeIndex(9, -1, -1), {}, 0, 9),
        (pd.RangeIndex(10, 0, -3), {'store_data': False}, 2, 10),
    ]
    for pd_index, parse_kwargs, low, high in range_cases:
        index_value = parse_index(pd_index, **parse_kwargs)
        # Both ends inclusive first, then both ends exclusive.
        for closed in (True, False):
            min_max = (low, closed, high, closed)
            result = filter_index_value(index_value, min_max)
            self.assertEqual(result.to_pandas().tolist(),
                             masked(pd_index, min_max))

    pd_index = pd.Int64Index([0, 3, 8])
    min_max = (2, True, 8, False)

    # Data stored on both the parsed and the filtered value: the filtered
    # labels must match the pandas reference.
    stored = parse_index(pd_index, store_data=True)
    self.assertEqual(
        filter_index_value(stored, min_max, store_data=True).to_pandas().tolist(),
        masked(pd_index, min_max))

    # Parsed without ``store_data``: the filtered value carries no labels
    # but is still reported as an Int64Index.
    unstored = parse_index(pd_index)
    filtered = filter_index_value(unstored, min_max)
    self.assertEqual(len(filtered.to_pandas().tolist()), 0)
    self.assertIsInstance(filtered.value, IndexValue.Int64Index)
def test_filter_index_value():
    """Check ``filter_index_value`` against direct pandas filtering.

    The ``min_max`` tuples are ``(lower, lower_inclusive, upper,
    upper_inclusive)``; expected labels are produced by applying the
    corresponding comparisons to the original pandas index.
    """
    def expected_labels(index, min_max):
        # Build the reference result straight from pandas comparisons.
        lower, lower_closed, upper, upper_closed = min_max
        lower_mask = (index >= lower) if lower_closed else (index > lower)
        upper_mask = (index <= upper) if upper_closed else (index < upper)
        return index[lower_mask & upper_mask].tolist()

    closed_0_9 = (0, True, 9, True)
    open_0_9 = (0, False, 9, False)
    closed_2_10 = (2, True, 10, True)
    open_2_10 = (2, False, 10, False)

    # Each scenario: pandas index, kwargs for parse_index, bounds to try.
    scenarios = (
        (pd.RangeIndex(10), {}, (closed_0_9, open_0_9)),
        (pd.RangeIndex(1, 11, 3), {}, (closed_2_10, open_2_10)),
        (pd.RangeIndex(9, -1, -1), {}, (closed_0_9, open_0_9)),
        (pd.RangeIndex(10, 0, -3), {'store_data': False},
         (closed_2_10, open_2_10)),
    )
    for pd_index, parse_kwargs, all_bounds in scenarios:
        index_value = parse_index(pd_index, **parse_kwargs)
        for min_max in all_bounds:
            actual = filter_index_value(index_value, min_max).to_pandas().tolist()
            assert actual == expected_labels(pd_index, min_max)

    pd_index = pd.Int64Index([0, 3, 8])
    min_max = (2, True, 8, False)

    # With data stored on both sides the filtered labels are available.
    index_value = parse_index(pd_index, store_data=True)
    actual = filter_index_value(
        index_value, min_max, store_data=True).to_pandas().tolist()
    assert actual == expected_labels(pd_index, min_max)

    # Parsed without ``store_data``: no labels come back, but the value
    # is still an Int64Index.
    index_value = parse_index(pd_index)
    filtered = filter_index_value(index_value, min_max)
    assert len(filtered.to_pandas().tolist()) == 0
    assert isinstance(filtered.value, IndexValue.Int64Index)
def testAddWithoutShuffle(self):
    """Check ``add`` on two DataFrames whose index and columns are monotonic.

    The test verifies df3's index/columns metadata before and after
    tiling, then walks every output chunk and asserts that it is a
    ``DataFrameAdd`` over two ``DataFrameIndexAlignMap`` inputs whose
    index/column bounds and columns agree with the splits returned by
    ``split_monotonic_index_min_max``.
    """
    # all the axes are monotonic
    # data1 with index split into [0...4], [5...9],
    # columns [3...7], [8...12]
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=np.arange(3, 13))
    df1 = from_pandas(data1, chunk_size=5)
    # data2 with index split into [6...11], [2...5],
    # columns [4...9], [10...13]
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=np.arange(4, 14))
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    def check_index_and_columns():
        # Metadata that must hold for df3 both before and after tiling.
        pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1], 11)  # columns is recorded, so we can get it

    def check_aligned_input(aligned, source_df, source_data, out_idx,
                            row_lookup, col_lookup, row_splits, col_splits):
        # Validate one DataFrameIndexAlignMap input of an output chunk.
        # Using a single helper for both sides guarantees each side is
        # compared against its *own* split bounds.
        self.assertIsInstance(aligned.op, DataFrameIndexAlignMap)
        row_idx, row_inner_idx = row_lookup[out_idx[0]]
        col_idx, col_inner_idx = col_lookup[out_idx[1]]
        expect_input = source_df.cix[row_idx, col_idx].data
        self.assertIs(aligned.inputs[0], expect_input)

        index_min_max = row_splits[row_idx][row_inner_idx]
        self.assertEqual(aligned.op.index_min, index_min_max[0])
        self.assertEqual(aligned.op.index_min_close, index_min_max[1])
        self.assertEqual(aligned.op.index_max, index_min_max[2])
        self.assertEqual(aligned.op.index_max_close, index_min_max[3])
        self.assertIsInstance(aligned.index_value.to_pandas(), type(source_data.index))

        column_min_max = col_splits[col_idx][col_inner_idx]
        self.assertEqual(aligned.op.column_min, column_min_max[0])
        self.assertEqual(aligned.op.column_min_close, column_min_max[1])
        self.assertEqual(aligned.op.column_max, column_min_max[2])
        self.assertEqual(aligned.op.column_max_close, column_min_max[3])

        expect_columns = filter_index_value(expect_input.columns, column_min_max,
                                            store_data=True)
        pd.testing.assert_index_equal(aligned.columns.to_pandas(),
                                      expect_columns.to_pandas())
        pd.testing.assert_index_equal(aligned.dtypes.index,
                                      expect_columns.to_pandas())

    # test df3's index and columns
    check_index_and_columns()

    df3.tiles()

    # test df3's index and columns after tiling
    check_index_and_columns()

    data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
    data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]]
    data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]
    data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)]

    # NOTE(review): the boolean after each min/max list presumably marks
    # whether that axis is increasing (data2's index is built decreasing,
    # hence False) -- confirm against split_monotonic_index_min_max.
    left_index_splits, right_index_splits = split_monotonic_index_min_max(
        data1_index_min_max, True, data2_index_min_max, False)
    left_columns_splits, right_columns_splits = split_monotonic_index_min_max(
        data1_columns_min_max, True, data2_columns_min_max, True)

    left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits)
    right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits, False)
    left_columns_idx_to_original_idx = build_split_idx_to_origin_idx(left_columns_splits)
    right_columns_idx_to_original_idx = build_split_idx_to_origin_idx(right_columns_splits)

    self.assertEqual(df3.chunk_shape, (7, 7))
    for c in df3.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        # chunk index, used below as the (row, column) position when
        # looking up the originating input chunks
        idx = c.index

        # test the left side
        check_aligned_input(c.inputs[0], df1, data1, idx,
                            left_index_idx_to_original_idx,
                            left_columns_idx_to_original_idx,
                            left_index_splits, left_columns_splits)

        # test the right side (checked against the right-hand column
        # bounds, not the left-hand ones)
        check_aligned_input(c.inputs[1], df2, data2, idx,
                            right_index_idx_to_original_idx,
                            right_columns_idx_to_original_idx,
                            right_index_splits, right_columns_splits)