Example #1
  def test_simple(self):
    record_batch = pa.RecordBatch.from_arrays(
        [pa.array([[1, 2], None, [3]]),
         pa.array([["a", "b"], ["c"], None])], ["a", "b"])
    self.assertTrue(
        table_util.RecordBatchTake(record_batch,
                                   pa.array([1, 2]))
        .equals(record_batch.slice(1)))
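
table_util.RecordBatchTake(record_batch, indices) returns the rows of the record batch at the given indices. As a point of reference, here is a minimal standalone sketch of the same assertion using plain pyarrow, with RecordBatch.take standing in for RecordBatchTake (that equivalence is an assumption, not something stated in the example):

import pyarrow as pa

record_batch = pa.RecordBatch.from_arrays(
    [pa.array([[1, 2], None, [3]]),
     pa.array([["a", "b"], ["c"], None])], ["a", "b"])

# Taking rows 1 and 2 by index yields the same batch as slicing off row 0.
taken = record_batch.take(pa.array([1, 2]))
assert taken.equals(record_batch.slice(1))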
Example #2
  def test_success(self, row_indices, expected_output):
    record_batch = pa.RecordBatch.from_arrays([
        pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
                 type=pa.list_(pa.int32())),
        pa.array(
            [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
            type=pa.list_(pa.binary())),
    ], ["f1", "f2"])

    for row_indices_type in (pa.int32(), pa.int64()):
      sliced = table_util.RecordBatchTake(
          record_batch, pa.array(row_indices, type=row_indices_type))
      self.assertTrue(
          sliced.equals(expected_output),
          "Expected {}, got {}".format(expected_output, sliced))
Example #3
    def process(
            self,
            record_batch: pa.RecordBatch) -> Iterable[types.SlicedRecordBatch]:
        # Keep track of row indices per slice key.
        per_slice_indices = collections.defaultdict(set)
        for query in self._get_queries_for_schema(record_batch.schema):
            # Example of result with batch size = 3:
            # result = [[[('feature', 'value_1')]],
            #           [[('feature', 'value_2')]],
            #           []
            #          ]
            result = query.Execute(record_batch)
            for i, per_row_slices in enumerate(result):
                for slice_tuples in per_row_slices:
                    slice_key = '_'.join(map('_'.join, slice_tuples))
                    per_slice_indices[slice_key].add(i)

        yield (constants.DEFAULT_SLICE_KEY, record_batch)
        for slice_key, row_indices in per_slice_indices.items():
            yield (slice_key,
                   table_util.RecordBatchTake(record_batch,
                                              pa.array(row_indices)))
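
For the three-row result shown in the comment above, the bookkeeping loop collects one set of row indices per slice key. A standalone sketch of just that accumulation step, using the commented result as made-up input:

import collections

result = [[[('feature', 'value_1')]],
          [[('feature', 'value_2')]],
          []]

per_slice_indices = collections.defaultdict(set)
for i, per_row_slices in enumerate(result):
    for slice_tuples in per_row_slices:
        slice_key = '_'.join(map('_'.join, slice_tuples))
        per_slice_indices[slice_key].add(i)

assert per_slice_indices == {'feature_value_1': {0}, 'feature_value_2': {1}}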
Example #4
    def feature_value_slicer(
            record_batch: pa.RecordBatch) -> Iterable[types.SlicedRecordBatch]:
        """A function that generates sliced record batches.

        The naive approach would be to iterate over each row, identify the
        slice keys for the row, and keep track of index ranges for each slice
        key, and then generate an Arrow record batch for each slice key based
        on those index ranges. This would be expensive, as we would be
        identifying the slice keys for each row individually and would have to
        loop over the feature values, including crossing them when slicing on
        multiple features. The current approach instead generates the slice
        keys for a batch by performing joins over the indices of the individual
        features, and then groups the joined record batch by slice key to get
        the row indices corresponding to each slice.

        Args:
          record_batch: Arrow RecordBatch.

        Yields:
          Sliced record batch (slice_key, record_batch) where record_batch
          contains the rows corresponding to a slice.
        """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            feature_array = record_batch.column(
                record_batch.schema.get_field_index(feature_name))
            flattened, value_parent_indices = arrow_util.flatten_nested(
                feature_array, True)
            non_missing_values = np.asarray(flattened)
            # Create dataframe with feature value and parent index.
            df = DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   table_util.RecordBatchTake(
                       record_batch, pa.array(parent_indices.to_numpy())))
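
The docstring above describes the join-based approach: each sliced feature contributes a DataFrame of (value, parent row index) pairs, and an inner join on the parent index keeps only the rows that match every feature. A toy illustration of that step (the feature names and values here are made up, not taken from the original code):

import functools
import pandas as pd

_PARENT_INDEX_COLUMN = 'parent_index'

per_feature_parent_indices = [
    pd.DataFrame({'country': ['US', 'CA', 'US'],
                  _PARENT_INDEX_COLUMN: [0, 1, 2]}),
    pd.DataFrame({'device': ['mobile', 'mobile'],
                  _PARENT_INDEX_COLUMN: [0, 2]}),
]

# Inner join on the parent index: only rows 0 and 2 have values for both
# features, so only they survive the merge.
merged_df = functools.reduce(
    lambda base, update: pd.merge(base, update, how='inner',
                                  on=_PARENT_INDEX_COLUMN),
    per_feature_parent_indices)

# Simplified version of the slice-key construction above; it yields
# 'country_US_device_mobile' for both surviving rows.
merged_df['slice_key'] = ('country_' + merged_df['country'] + '_' +
                          'device_' + merged_df['device'])
print(merged_df)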
Example #5
    def _compact_impl(
        self, accumulator: _SampleRecordBatchRowsAccumulator
    ) -> _SampleRecordBatchRowsAccumulator:
        """Compacts the accumulator.

        This compaction step selects sample rows from the record batches and
        merges them into one record batch. We can then clear the cache of all
        record batches seen so far. If the accumulator holds too few record
        batches, then nothing will be compacted.

        The sampling is done by assigning each row in the record batch a random
        number. Then we choose the top-k of the random numbers to get a sample
        of size k.

        Args:
          accumulator: The _SampleRecordBatchRowsAccumulator to compact.

        Returns:
          A _SampleRecordBatchRowsAccumulator that contains a single merged
          record batch, or the original record batches if nothing was
          compacted.
        """
        self._combine_num_record_batches.update(len(
            accumulator.record_batches))

        # There is nothing to compact.
        if accumulator.curr_num_rows <= 1:
            return accumulator

        # There is no need to compact yet.
        if (len(accumulator.record_batches) <= 1
                and accumulator.curr_num_rows <= self._sample_size):
            return accumulator
        self._num_compacts.inc(1)
        k = min(self._sample_size, accumulator.curr_num_rows)

        rand_ints = np.concatenate(accumulator.random_ints)

        # Find the value that is the breakpoint for the top-k.
        kth_value = np.partition(rand_ints, k - 1)[k - 1]

        # This mask will always have >= 1 Trues.
        equals_to_kth = (rand_ints == kth_value)

        # This mask will always have < k Trues.
        less_than_kth = rand_ints < kth_value

        # Since there may be duplicate values, `equals_to_kth + less_than_kth` might
        # be greater than `k`. We need to keep track of how many to add, without
        # surpassing `k`.
        kth_to_add = k - np.sum(less_than_kth)

        # Preserve the random integers that we had assigned to each row.
        sample_random_ints = rand_ints[rand_ints <= kth_value][:k]

        beg = 0
        sample_indices = []
        for rb in accumulator.record_batches:
            size = rb.num_rows
            end = beg + size
            less_than_kth_indices = np.nonzero(less_than_kth[beg:end])[0]
            indices = less_than_kth_indices

            # Add indices of any duplicate values that are equal to `kth_value`.
            if kth_to_add > 0:
                equals_to_kth_indices = np.nonzero(equals_to_kth[beg:end])[0]
                if equals_to_kth_indices.size > 0:
                    if equals_to_kth_indices.size >= kth_to_add:
                        indices = np.concatenate([
                            less_than_kth_indices,
                            equals_to_kth_indices[:kth_to_add]
                        ])
                        kth_to_add = 0
                    else:
                        indices = np.concatenate(
                            [less_than_kth_indices, equals_to_kth_indices])
                        kth_to_add -= equals_to_kth_indices.size

            sample_indices.append(indices)
            beg += size

        result = _SampleRecordBatchRowsAccumulator()

        # Take and merge the record batches, based on the sampled indices.
        rbs = []
        for rb, indices in zip(accumulator.record_batches, sample_indices):
            rbs.append(table_util.RecordBatchTake(rb, pa.array(indices)))
        compressed_rb = table_util.MergeRecordBatches(rbs)
        result.record_batches = [compressed_rb]
        result.curr_num_rows = compressed_rb.num_rows
        result.curr_byte_size = compressed_rb.nbytes
        result.random_ints = [sample_random_ints]

        self._combine_byte_size.update(result.curr_byte_size)

        return result
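
The core of the sampling above is the np.partition selection: each row is assigned a random integer, the rows whose numbers fall strictly below the k-th smallest value are always kept, and ties at that value are trimmed so that exactly k rows survive. A standalone sketch of just that selection logic on made-up data (variable names mirror the code above):

import numpy as np

rng = np.random.default_rng(0)
num_rows, k = 10, 4
rand_ints = rng.integers(0, 100, size=num_rows)

# kth_value is the k-th smallest assigned number.
kth_value = np.partition(rand_ints, k - 1)[k - 1]
less_than_kth = rand_ints < kth_value
equals_to_kth = rand_ints == kth_value
kth_to_add = k - np.sum(less_than_kth)

sample_indices = np.concatenate([
    np.nonzero(less_than_kth)[0],
    np.nonzero(equals_to_kth)[0][:kth_to_add],
])
assert len(sample_indices) == k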