def test_always_make_a_copy(self):
  """Slicing must return freshly-copied data, never a view of the input.

  A zero offset and a single chunk in the result are the observable
  evidence that `SliceTableByRowIndices` copied rather than sliced in
  place.
  """
  source = pa.Table.from_arrays(
      [pa.array([[1], [2], [3]], type=pa.list_(pa.int64()))], ["f1"])
  result = table_util.SliceTableByRowIndices(
      source, pa.array([1, 2], type=pa.int32()))
  result_column = result.columns[0]
  # One chunk starting at offset 0 => the backing buffers were copied.
  self.assertEqual(1, result_column.num_chunks)
  self.assertEqual(0, result_column.chunk(0).offset)
def test_success(self, row_indices, expected_output):
  """Checks that slicing a two-column table yields the expected rows.

  Args:
    row_indices: row indices (supplied by the parameterization) to select.
    expected_output: the Arrow table the slice is expected to equal.
  """
  source = pa.Table.from_arrays([
      pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
               type=pa.list_(pa.int32())),
      pa.array(
          [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
          type=pa.list_(pa.binary())),
  ], ["f1", "f2"])
  sliced = table_util.SliceTableByRowIndices(
      source, pa.array(row_indices, type=pa.int32()))
  self.assertTrue(
      sliced.equals(expected_output),
      "Expected {}, got {}".format(expected_output, sliced))
  if sliced.num_rows > 0:
    # A non-empty result must come back as exactly one chunk per column.
    # TODO(zhuo): stop using column.data after arrow dep is bumped to 0.15.
    for column in sliced.columns:
      self.assertEqual(column.data.num_chunks, 1)
def test_success(self, row_indices, expected_output):
  """Checks slicing with both int32 and int64 row-index arrays.

  Args:
    row_indices: row indices (supplied by the parameterization) to select.
    expected_output: the Arrow table the slice is expected to equal.
  """
  source = pa.Table.from_arrays([
      pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
               type=pa.list_(pa.int32())),
      pa.array(
          [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
          type=pa.list_(pa.binary())),
  ], ["f1", "f2"])
  # The slicer should accept either integer width for the index array.
  for index_type in (pa.int32(), pa.int64()):
    sliced = table_util.SliceTableByRowIndices(
        source, pa.array(row_indices, type=index_type))
    self.assertTrue(
        sliced.equals(expected_output),
        "Expected {}, got {}".format(expected_output, sliced))
    if sliced.num_rows > 0:
      # A non-empty result must come back as exactly one chunk per column.
      for column in sliced.columns:
        self.assertEqual(column.num_chunks, 1)
def feature_value_slicer(table):
  """A function that generates sliced tables.

  The naive approach of doing this would be to iterate each row, identify
  slice keys for the row and keep track of index ranges for each slice key.
  And then generate an arrow table for each slice key based on the index
  ranges. This would be expensive as we are identifying the slice keys for
  each row individually and we would have to loop over the feature values
  including crossing them when we have to slice on multiple features. The
  current approach generates the slice keys for a batch by performing joins
  over indices of individual features. And then groups the joined table by
  slice key to get the row indices corresponding to a slice.

  Args:
    table: Arrow table.

  Yields:
    Sliced table (slice_key, Arrow table) where the table contains the rows
    corresponding to a slice.
  """
  # NOTE(review): `features`, `_PARENT_INDEX_COLUMN`, `_SLICE_KEY_COLUMN` and
  # `_to_slice_key` are captured from the enclosing scope (not visible in this
  # chunk). `features` presumably maps feature name -> optional collection of
  # values to slice on (None meaning "slice on every value") — confirm against
  # the outer factory function.
  per_feature_parent_indices = []
  for feature_name, values in six.iteritems(features):
    column = table.column(feature_name)
    # Assume we have a single chunk.
    feature_array = column.data.chunk(0)
    # Flatten the list array so each element value becomes one row, paired
    # below with the index of the parent row it came from.
    non_missing_values = arrow_util.primitive_array_to_numpy(
        feature_array.flatten())
    value_parent_indices = array_util.GetFlattenedArrayParentIndices(
        feature_array).to_numpy()
    # Create dataframe with feature value and parent index.
    df = pd.DataFrame({
        feature_name: non_missing_values,
        _PARENT_INDEX_COLUMN: value_parent_indices
    })
    # A row may repeat a value; each (value, row) pair should count once.
    df.drop_duplicates(inplace=True)
    # Filter based on slice values
    if values is not None:
      df = df.loc[df[feature_name].isin(values)]
    per_feature_parent_indices.append(df)
  # Join dataframes based on parent indices.
  # Note that we want the parent indices per slice key to be sorted in the
  # merged dataframe. The individual dataframes have the parent indices in
  # sorted order. We use "inner" join type to preserve the order of the left
  # keys (also note that same parent index rows would be consecutive). Hence
  # we expect the merged dataframe to have sorted parent indices per
  # slice key.
  merged_df = functools.reduce(
      lambda base, update: pd.merge(
          base, update, how='inner',  # pylint: disable=g-long-lambda
          on=_PARENT_INDEX_COLUMN),
      per_feature_parent_indices)

  # Construct a new column in the merged dataframe with the slice keys.
  merged_df[_SLICE_KEY_COLUMN] = ''
  index = 0
  # Iterate feature columns in sorted order so the composed slice key is
  # deterministic regardless of dict iteration order.
  for col_name in sorted(merged_df.columns):
    if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
      continue
    slice_key_col = (_to_slice_key(col_name) + '_' +
                     merged_df[col_name].apply(_to_slice_key))
    if index == 0:
      merged_df[_SLICE_KEY_COLUMN] = slice_key_col
      index += 1
    else:
      # Subsequent features are appended with a '_' separator.
      merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

  # Since the parent indices are sorted per slice key, the groupby would
  # preserve the sorted order within each group.
  per_slice_parent_indices = merged_df.groupby(
      _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
  for slice_key, parent_indices in per_slice_parent_indices:
    yield (slice_key,
           table_util.SliceTableByRowIndices(
               table, pa.array(parent_indices.to_numpy())))
def test_invalid_inputs(self, row_indices, expected_error_type,
                        expected_error_regexp):
  """Checks that invalid row-index inputs raise the expected error.

  Args:
    row_indices: the (invalid) row indices value to pass to the slicer.
    expected_error_type: exception type the call is expected to raise.
    expected_error_regexp: regexp the raised error message must match.
  """
  # assertRaisesRegexp is a deprecated alias removed in Python 3.12;
  # assertRaisesRegex is the supported spelling (Python 3.2+).
  with self.assertRaisesRegex(expected_error_type, expected_error_regexp):
    table_util.SliceTableByRowIndices(
        pa.Table.from_arrays([pa.array([1])], ["f1"]), row_indices)