def test_simple(self):
  record_batch = pa.RecordBatch.from_arrays(
      [pa.array([[1, 2], None, [3]]),
       pa.array([["a", "b"], ["c"], None])], ["a", "b"])
  self.assertTrue(
      table_util.RecordBatchTake(record_batch, pa.array([1, 2]))
      .equals(record_batch.slice(1)))
def test_success(self, row_indices, expected_output):
  record_batch = pa.RecordBatch.from_arrays([
      pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
               type=pa.list_(pa.int32())),
      pa.array(
          [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
          type=pa.list_(pa.binary())),
  ], ["f1", "f2"])
  for row_indices_type in (pa.int32(), pa.int64()):
    sliced = table_util.RecordBatchTake(
        record_batch, pa.array(row_indices, type=row_indices_type))
    self.assertTrue(
        sliced.equals(expected_output),
        "Expected {}, got {}".format(expected_output, sliced))
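# A minimal, self-contained sketch of the Take semantics exercised by the two
# tests above. It uses pyarrow's built-in RecordBatch.take, which is assumed
# here to behave like table_util.RecordBatchTake for illustration purposes;
# it is not the library's implementation.
import pyarrow as pa

sketch_batch = pa.RecordBatch.from_arrays(
    [pa.array([[1, 2], None, [3]]),
     pa.array([["a", "b"], ["c"], None])], ["a", "b"])

# Selecting rows 1 and 2 yields the same logical contents as slicing from
# row 1 to the end of the batch.
taken = sketch_batch.take(pa.array([1, 2]))
assert taken.equals(sketch_batch.slice(1))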
def process(
    self, record_batch: pa.RecordBatch) -> Iterable[types.SlicedRecordBatch]:
  # Keep track of row indices per slice key.
  per_slice_indices = collections.defaultdict(set)
  for query in self._get_queries_for_schema(record_batch.schema):
    # Example of result with batch size = 3:
    # result = [[[('feature', 'value_1')]],
    #           [[('feature', 'value_2')]],
    #           []
    #          ]
    result = query.Execute(record_batch)
    for i, per_row_slices in enumerate(result):
      for slice_tuples in per_row_slices:
        slice_key = '_'.join(map('_'.join, slice_tuples))
        per_slice_indices[slice_key].add(i)

  yield (constants.DEFAULT_SLICE_KEY, record_batch)
  for slice_key, row_indices in per_slice_indices.items():
    yield (slice_key,
           table_util.RecordBatchTake(record_batch, pa.array(row_indices)))
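# The row-index bookkeeping in process() can be illustrated with a small,
# hand-built sketch. The `result` structure and slice keys below are invented
# to mimic the comment above; query.Execute is not modeled, and pyarrow's
# RecordBatch.take stands in for table_util.RecordBatchTake so the example is
# self-contained.
import collections
import pyarrow as pa

example_batch = pa.RecordBatch.from_arrays(
    [pa.array([["x"], ["y"], ["x"]])], ["feature"])
example_result = [[[('feature', 'x')]],
                  [[('feature', 'y')]],
                  [[('feature', 'x')]]]

example_indices = collections.defaultdict(set)
for i, per_row_slices in enumerate(example_result):
  for slice_tuples in per_row_slices:
    slice_key = '_'.join(map('_'.join, slice_tuples))
    example_indices[slice_key].add(i)

# example_indices == {'feature_x': {0, 2}, 'feature_y': {1}}; each index set
# then becomes one sliced record batch. sorted() is used only to make the
# sketch deterministic.
for slice_key, row_indices in example_indices.items():
  sliced = example_batch.take(pa.array(sorted(row_indices)))
  print(slice_key, sliced.num_rows)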
def feature_value_slicer(
    record_batch: pa.RecordBatch) -> Iterable[types.SlicedRecordBatch]:
  """A function that generates sliced record batches.

  The naive approach would be to iterate over each row, identify the slice
  keys for that row, and keep track of the row indices for each slice key,
  then generate an Arrow record batch per slice key from those indices. This
  would be expensive because the slice keys are identified for each row
  individually, and we would have to loop over the feature values (crossing
  them when slicing on multiple features). The current approach instead
  generates the slice keys for the whole batch by joining the parent indices
  of the individual features, and then groups the joined result by slice key
  to get the row indices corresponding to each slice.

  Args:
    record_batch: Arrow RecordBatch.

  Yields:
    Sliced record batch (slice_key, record_batch) where record_batch contains
    the rows corresponding to a slice.
  """
  per_feature_parent_indices = []
  for feature_name, values in six.iteritems(features):
    feature_array = record_batch.column(
        record_batch.schema.get_field_index(feature_name))
    flattened, value_parent_indices = arrow_util.flatten_nested(
        feature_array, True)
    non_missing_values = np.asarray(flattened)
    # Create dataframe with feature value and parent index.
    df = DataFrame({
        feature_name: non_missing_values,
        _PARENT_INDEX_COLUMN: value_parent_indices
    })
    df.drop_duplicates(inplace=True)
    # Filter based on slice values.
    if values is not None:
      df = df.loc[df[feature_name].isin(values)]
    per_feature_parent_indices.append(df)

  # Join dataframes based on parent indices.
  # Note that we want the parent indices per slice key to be sorted in the
  # merged dataframe. The individual dataframes have the parent indices in
  # sorted order. We use "inner" join type to preserve the order of the left
  # keys (also note that same parent index rows would be consecutive). Hence
  # we expect the merged dataframe to have sorted parent indices per
  # slice key.
  merged_df = functools.reduce(
      lambda base, update: pd.merge(  # pylint: disable=g-long-lambda
          base, update, how='inner', on=_PARENT_INDEX_COLUMN),
      per_feature_parent_indices)

  # Construct a new column in the merged dataframe with the slice keys.
  merged_df[_SLICE_KEY_COLUMN] = ''
  index = 0
  for col_name in sorted(merged_df.columns):
    if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
      continue
    slice_key_col = (_to_slice_key(col_name) + '_' +
                     merged_df[col_name].apply(_to_slice_key))
    if index == 0:
      merged_df[_SLICE_KEY_COLUMN] = slice_key_col
      index += 1
    else:
      merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

  # Since the parent indices are sorted per slice key, the groupby would
  # preserve the sorted order within each group.
  per_slice_parent_indices = merged_df.groupby(
      _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
  for slice_key, parent_indices in per_slice_parent_indices:
    yield (slice_key,
           table_util.RecordBatchTake(record_batch,
                                      pa.array(parent_indices.to_numpy())))
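# The core idea above -- joining per-feature (value, parent index) frames on
# the parent index to find the rows that match every slice feature -- can be
# checked in isolation. The feature names, values, and column names below are
# invented for illustration only.
import functools

import pandas as pd

# Hypothetical per-feature frames: each maps a feature value to the row
# (parent) index it came from, already deduplicated and in sorted row order.
df_a = pd.DataFrame({'country': ['US', 'CA', 'US'],
                     '__parent_index': [0, 1, 2]})
df_b = pd.DataFrame({'device': ['web', 'web'],
                     '__parent_index': [0, 2]})

# Inner-joining on the parent index keeps only rows that have a value for
# every slice feature, while preserving the sorted order of the left keys.
merged = functools.reduce(
    lambda base, update: pd.merge(base, update, how='inner',
                                  on='__parent_index'),
    [df_a, df_b])

# Build a composite slice key per row, then group by it to recover the row
# indices belonging to each slice, e.g. 'country_US_device_web' -> [0, 2].
merged['__slice_key'] = (
    'country_' + merged['country'] + '_device_' + merged['device'])
for key, indices in merged.groupby('__slice_key',
                                   sort=False)['__parent_index']:
  print(key, indices.to_numpy())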
def _compact_impl(
    self, accumulator: _SampleRecordBatchRowsAccumulator
) -> _SampleRecordBatchRowsAccumulator:
  """Compacts the accumulator.

  This compact selects sample rows from the accumulated record batches and
  merges them into one record batch, so the cache of record batches seen so
  far can be cleared. If the accumulator holds too few rows or record
  batches, nothing is compacted.

  The sampling is done by assigning each row in the record batch a random
  number. Then we choose the top-k of the random numbers to get a sample of
  size k.

  Args:
    accumulator: The _SampleRecordBatchRowsAccumulator to compact.

  Returns:
    A _SampleRecordBatchRowsAccumulator that contains a single merged record
    batch, or the original list of record batches if no compaction was
    needed.
  """
  self._combine_num_record_batches.update(len(accumulator.record_batches))

  # There is nothing to compact.
  if accumulator.curr_num_rows <= 1:
    return accumulator

  # There is no need to compact yet.
  if (len(accumulator.record_batches) <= 1 and
      accumulator.curr_num_rows <= self._sample_size):
    return accumulator

  self._num_compacts.inc(1)
  k = min(self._sample_size, accumulator.curr_num_rows)
  rand_ints = np.concatenate(accumulator.random_ints)

  # Find the value that is the breakpoint for the top-k.
  kth_value = np.partition(rand_ints, k - 1)[k - 1]

  # This mask will always have at least one True entry.
  equals_to_kth = (rand_ints == kth_value)

  # This mask will always have fewer than k True entries.
  less_than_kth = rand_ints < kth_value

  # Since there may be duplicate values, `equals_to_kth + less_than_kth` might
  # be greater than `k`. We need to keep track of how many to add, without
  # surpassing `k`.
  kth_to_add = k - np.sum(less_than_kth)

  # Preserve the random integers that we had assigned to each row.
  sample_random_ints = rand_ints[rand_ints <= kth_value][:k]

  beg = 0
  sample_indices = []
  for rb in accumulator.record_batches:
    size = rb.num_rows
    end = beg + size
    less_than_kth_indices = np.nonzero(less_than_kth[beg:end])[0]
    indices = less_than_kth_indices

    # Add indices of rows whose random value ties with the k-th value, without
    # letting the sample exceed k rows in total.
    if kth_to_add > 0:
      equals_to_kth_indices = np.nonzero(equals_to_kth[beg:end])[0]
      if equals_to_kth_indices.size > 0:
        if equals_to_kth_indices.size >= kth_to_add:
          indices = np.concatenate(
              [less_than_kth_indices, equals_to_kth_indices[:kth_to_add]])
          kth_to_add = 0
        else:
          indices = np.concatenate(
              [less_than_kth_indices, equals_to_kth_indices])
          kth_to_add -= equals_to_kth_indices.size

    sample_indices.append(indices)
    beg += size

  result = _SampleRecordBatchRowsAccumulator()

  # Take and merge the record batches, based on the sampled indices.
  rbs = []
  for rb, indices in zip(accumulator.record_batches, sample_indices):
    rbs.append(table_util.RecordBatchTake(rb, pa.array(indices)))
  compressed_rb = table_util.MergeRecordBatches(rbs)

  result.record_batches = [compressed_rb]
  result.curr_num_rows = compressed_rb.num_rows
  result.curr_byte_size = compressed_rb.nbytes
  result.random_ints = [sample_random_ints]

  self._combine_byte_size.update(result.curr_byte_size)
  return result
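# The top-k selection over the pooled random integers can be checked in
# isolation with a small NumPy sketch. The values are made up, and duplicates
# are included on purpose to exercise the `kth_to_add` bookkeeping described
# above; this is an illustration, not the library code.
import numpy as np

# Made-up random integers pooled across record batches; note the two ties at 7.
sketch_rand_ints = np.array([9, 7, 3, 7, 8, 1])
sketch_k = 4

# The k-th smallest value is the breakpoint for the sample.
sketch_kth_value = np.partition(sketch_rand_ints, sketch_k - 1)[sketch_k - 1]
# -> 7

sketch_less = sketch_rand_ints < sketch_kth_value    # 2 rows strictly below 7
sketch_equal = sketch_rand_ints == sketch_kth_value  # 2 rows tied at 7
sketch_to_add = sketch_k - np.sum(sketch_less)       # take 2 of the ties

# Rows strictly below the breakpoint are always kept; tied rows are taken only
# until the sample reaches size k, mirroring the per-batch loop above.
sketch_indices = np.concatenate(
    [np.nonzero(sketch_less)[0],
     np.nonzero(sketch_equal)[0][:sketch_to_add]])
assert sketch_indices.size == sketch_k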