Example #1
def test_always_make_a_copy(self):
    table = pa.Table.from_arrays(
        [pa.array([[1], [2], [3]], type=pa.list_(pa.int64()))], ["f1"])
    sliced = table_util.SliceTableByRowIndices(
        table, pa.array([1, 2], type=pa.int32()))
    self.assertEqual(1, sliced.columns[0].num_chunks)
    self.assertEqual(0, sliced.columns[0].chunk(0).offset)
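For context, the single-chunk / zero-offset guarantee asserted here can be reproduced with stock pyarrow. A minimal sketch, assuming pyarrow's Table.take and Table.combine_chunks (SliceTableByRowIndices is the helper under test and is not shown here):

import pyarrow as pa

table = pa.Table.from_arrays(
    [pa.array([[1], [2], [3]], type=pa.list_(pa.int64()))], ["f1"])
# take() gathers rows by index; combine_chunks() concatenates the result
# into a single fresh chunk, hence one chunk with a zero offset.
sliced = table.take(pa.array([1, 2], type=pa.int32())).combine_chunks()
print(sliced.column(0).num_chunks)       # 1
print(sliced.column(0).chunk(0).offset)  # 0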
Example #2
    def test_success(self, row_indices, expected_output):
        table = pa.Table.from_arrays([
            pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
                     type=pa.list_(pa.int32())),
            pa.array([["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"],
                      None, ["g"]],
                     type=pa.list_(pa.binary())),
        ], ["f1", "f2"])

        sliced = table_util.SliceTableByRowIndices(
            table, pa.array(row_indices, type=pa.int32()))
        self.assertTrue(sliced.equals(expected_output),
                        "Expected {}, got {}".format(expected_output, sliced))
        if sliced.num_rows > 0:
            for c in sliced.columns:
                # TODO(zhuo): stop using column.data after arrow dep is bumped to 0.15.
                self.assertEqual(c.data.num_chunks, 1)
Example #3
  def test_success(self, row_indices, expected_output):
    table = pa.Table.from_arrays([
        pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
                 type=pa.list_(pa.int32())),
        pa.array(
            [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
            type=pa.list_(pa.binary())),
    ], ["f1", "f2"])

    for row_indices_type in (pa.int32(), pa.int64()):
      sliced = table_util.SliceTableByRowIndices(
          table, pa.array(row_indices, type=row_indices_type))
      self.assertTrue(
          sliced.equals(expected_output),
          "Expected {}, got {}".format(expected_output, sliced))
      if sliced.num_rows > 0:
        for c in sliced.columns:
          self.assertEqual(c.num_chunks, 1)
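The parameterized (row_indices, expected_output) pairs are not included in these snippets. As a purely hypothetical illustration (not taken from the original tests), one pair consistent with the nine-row table built in Examples #2 and #3 would be:

row_indices = [1, 3]
expected_output = pa.Table.from_arrays([
    pa.array([None, []], type=pa.list_(pa.int32())),         # rows 1, 3 of "f1"
    pa.array([["b", "c"], []], type=pa.list_(pa.binary())),  # rows 1, 3 of "f2"
], ["f1", "f2"])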
Example #4
    def feature_value_slicer(table):
        """A function that generates sliced tables.

    The naive approach of doing this would be to iterate each row, identify
    slice keys for the row and keep track of index ranges for each slice key.
    And then generate an arrow table for each slice key based on the index
    ranges. This would be expensive as we are identifying the slice keys for
    each row individually and we would have to loop over the feature values
    including crossing them when we have to slice on multiple features. The
    current approach generates the slice keys for a batch by performing joins
    over indices of individual features. And then groups the joined table by
    slice key to get the row indices corresponding to a slice.

    Args:
      table: Arrow table.

    Yields:
      Sliced table (slice_key, Arrow table) where the table contains the rows
      corresponding to a slice.
    """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            column = table.column(feature_name)
            # Assume we have a single chunk.
            feature_array = column.data.chunk(0)
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            value_parent_indices = array_util.GetFlattenedArrayParentIndices(
                feature_array).to_numpy()
            # Create dataframe with feature value and parent index.
            df = pd.DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   table_util.SliceTableByRowIndices(
                       table, pa.array(parent_indices.to_numpy())))
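The join-and-group strategy described in the docstring can be seen in isolation on toy data. A minimal, self-contained sketch (hypothetical feature names and column labels; pandas only, no Arrow):

import functools

import pandas as pd

# One (value, parent_index) frame per feature, mirroring
# per_feature_parent_indices above.
per_feature = [
    pd.DataFrame({"country": ["US", "US", "CA"], "parent_index": [0, 1, 2]}),
    pd.DataFrame({"lang": ["en", "fr", "en"], "parent_index": [0, 1, 2]}),
]
# An inner merge on the parent index preserves the order of the left keys,
# so parent indices stay sorted within each slice key.
merged = functools.reduce(
    lambda base, update: pd.merge(base, update, how="inner", on="parent_index"),
    per_feature)
# Compose the slice key, then group by it to recover row indices per slice.
merged["slice_key"] = ("country_" + merged["country"] +
                       "_lang_" + merged["lang"])
for slice_key, indices in merged.groupby("slice_key", sort=False)["parent_index"]:
    print(slice_key, indices.to_numpy())
# country_US_lang_en [0]
# country_US_lang_fr [1]
# country_CA_lang_en [2]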
Example #5
def test_invalid_inputs(self, row_indices, expected_error_type,
                        expected_error_regexp):
    with self.assertRaisesRegexp(expected_error_type,
                                 expected_error_regexp):
        table_util.SliceTableByRowIndices(
            pa.Table.from_arrays([pa.array([1])], ["f1"]), row_indices)