def test_get_flattened_array_parent_indices(self):
  indices = array_util.GetFlattenedArrayParentIndices(
      pa.array([], type=pa.list_(pa.int32())))
  self.assertTrue(indices.equals(pa.array([], type=pa.int32())))
  indices = array_util.GetFlattenedArrayParentIndices(
      pa.array([[1.], [2.], [], [3.]]))
  self.assertTrue(indices.equals(pa.array([0, 1, 3], type=pa.int32())))

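# Illustrative sketch (not part of the code above): GetFlattenedArrayParentIndices
# returns, for each element of array.flatten(), the row of the original list
# array it came from. A rough numpy/pyarrow equivalent for a plain, unsliced
# list array without nulls might look like the hypothetical helper below (the
# real kernel also handles slices, nulls and large_list, which this ignores).
import numpy as np
import pyarrow as pa


def parent_indices_sketch(list_array: pa.ListArray) -> np.ndarray:
  offsets = np.asarray(list_array.offsets)  # len(list_array) + 1 offsets
  lengths = offsets[1:] - offsets[:-1]      # number of values in each row
  return np.repeat(np.arange(len(list_array)), lengths)


print(parent_indices_sketch(pa.array([[1.], [2.], [], [3.]])))  # [0 1 3]
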
def _recursion_helper(
    query_path: types.FeaturePath, array: pa.Array,
    example_indices: Optional[np.ndarray]
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Recursion helper."""
  if not query_path:
    return array, example_indices
  array_type = array.type
  if (not is_list_like(array_type) or
      not pa.types.is_struct(array_type.value_type)):
    raise KeyError('Cannot process query_path "{}" inside an array of type '
                   '{}. Expecting a (large_)list<struct<...>>.'.format(
                       query_path, array_type))
  flat_struct_array = array.flatten()
  flat_indices = None
  if example_indices is not None:
    flat_indices = example_indices[
        array_util.GetFlattenedArrayParentIndices(array).to_numpy()]
  step = query_path.steps()[0]
  try:
    child_array = flat_struct_array.field(step)
  except KeyError:
    raise KeyError('query_path step "{}" not in struct.'.format(step))
  relative_path = types.FeaturePath(query_path.steps()[1:])
  return _recursion_helper(relative_path, child_array, flat_indices)

def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Flattens all the list arrays nesting an array.

  If `array` is not list-like, it is returned as-is.

  Args:
    array: pa.Array to flatten.
    return_parent_indices: If True, also returns the parent indices array.

  Returns:
    A tuple. The first term is the flattened array. The second term is None
    if `return_parent_indices` is False; otherwise it is a parent indices
    array parallel to the flattened array: if parent_indices[i] = j, then
    flattened_array[i] belongs to the j-th element of the input array.
  """
  parent_indices = None
  while is_list_like(array.type):
    if return_parent_indices:
      cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
          array).to_numpy()
      if parent_indices is None:
        parent_indices = cur_parent_indices
      else:
        parent_indices = parent_indices[cur_parent_indices]
    array = array.flatten()
  # The array was not nested in the first place.
  if return_parent_indices and parent_indices is None:
    parent_indices = np.arange(len(array))
  return array, parent_indices

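# Usage sketch for flatten_nested above, on assumed data (requires the
# definition above and its is_list_like/array_util dependencies). For a
# doubly nested array, both list levels are stripped and the returned parent
# indices map each leaf value back to its outermost row:
import pyarrow as pa

nested = pa.array([[[1, 2]], [], [[3], [4, 5]]])  # list<list<int64>>, 3 rows
flat, parent_indices = flatten_nested(nested, return_parent_indices=True)
print(flat.to_pylist())  # [1, 2, 3, 4, 5]
print(parent_indices)    # [0 0 2 2 2]: values 3, 4 and 5 all come from row 2
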
def _recursion_helper(
    feature_path: types.FeaturePath, array: pa.Array,
    weights: Optional[np.ndarray]
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Recursion helper."""
  array_type = array.type
  if is_list_like(array_type) and pa.types.is_struct(array_type.value_type):
    if not enumerate_leaves_only:
      yield (feature_path, array, weights)
    flat_struct_array = array.flatten()
    flat_weights = None
    if weights is not None:
      flat_weights = weights[
          array_util.GetFlattenedArrayParentIndices(array).to_numpy()]
    for field in flat_struct_array.type:
      field_name = field.name
      # use "yield from" after PY 3.3.
      for e in _recursion_helper(feature_path.child(field_name),
                                 flat_struct_array.field(field_name),
                                 flat_weights):
        yield e
  else:
    yield (feature_path, array, weights)

def GetTensor(self, record_batch: pa.RecordBatch,
              produce_eager_tensors: bool) -> Any:
  values_array = record_batch.column(self._value_column_index)
  values_parent_indices = array_util.GetFlattenedArrayParentIndices(
      values_array)
  indices_arrays = [np.asarray(values_parent_indices)]
  for index_column_index in self._index_column_indices:
    indices_arrays.append(
        np.asarray(record_batch.column(index_column_index).flatten()))
  flat_values_array = values_array.flatten()
  if self._convert_to_binary_fn is not None:
    flat_values_array = self._convert_to_binary_fn(flat_values_array)
  values_np = np.asarray(flat_values_array)
  coo_np = np.empty(shape=(len(values_np), self._coo_size), dtype=np.int64)
  try:
    np.stack(indices_arrays, axis=1, out=coo_np)
  except ValueError as e:
    raise ValueError("Error constructing the COO for SparseTensor. "
                     "number of values: {}; "
                     "size of each index array: {}; "
                     "original error {}.".format(
                         len(values_np), [len(i) for i in indices_arrays], e))
  dense_shape = [len(record_batch)] + self._shape
  if produce_eager_tensors:
    return tf.sparse.SparseTensor(
        indices=tf.convert_to_tensor(coo_np),
        dense_shape=tf.convert_to_tensor(dense_shape, dtype=tf.int64),
        values=tf.convert_to_tensor(values_np))
  return tf.compat.v1.SparseTensorValue(
      indices=coo_np, dense_shape=dense_shape, values=values_np)

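# Toy sketch (assumed data, not part of the class above) of how the COO
# indices are assembled: for a 2-D SparseTensor, each value's row index is its
# parent index in the values column, and its column index comes from the
# flattened index column; stacking them yields one (row, col) pair per value.
import numpy as np

values_parent_indices = np.array([0, 0, 2])  # record each value belongs to
index_column = np.array([5, 7, 1])           # per-value index within the row
coo = np.stack([values_parent_indices, index_column], axis=1)
print(coo)  # [[0 5], [0 7], [2 1]]
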
def update(self, feature_array: pa.Array, values_quantiles_combiner: Any,
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial numeric statistics using the input value."""
  # np.max / np.min below cannot handle empty arrays. And there's nothing
  # we can collect in this case.
  if not feature_array:
    return

  flattened_value_array = feature_array.flatten()
  # Note: to_numpy will fail if flattened_value_array is empty.
  if not flattened_value_array:
    return
  values = np.asarray(flattened_value_array)
  nan_mask = np.isnan(values)
  self.num_nan += np.sum(nan_mask)
  non_nan_mask = ~nan_mask
  values_no_nan = values[non_nan_mask]

  # We do this check to avoid failing in np.min/max with empty array.
  if values_no_nan.size == 0:
    return
  # This is to avoid integer overflow when computing sum or sum of squares.
  values_no_nan_as_double = values_no_nan.astype(np.float64)
  self.sum += np.sum(values_no_nan_as_double)
  self.sum_of_squares += np.sum(
      values_no_nan_as_double * values_no_nan_as_double)
  # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
  # to numpy 1.16
  curr_min = np.min(values_no_nan)
  curr_max = np.max(values_no_nan)
  self.min = min(self.min, curr_min)
  self.max = max(self.max, curr_max)
  if curr_min == float('-inf') or curr_max == float('inf'):
    finite_values = values_no_nan[np.isfinite(values_no_nan)]
    if finite_values.size > 0:
      self.finite_min = min(self.finite_min, np.min(finite_values))
      self.finite_max = max(self.finite_max, np.max(finite_values))
  self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
  self.quantiles_summary = values_quantiles_combiner.add_input(
      self.quantiles_summary, [values_no_nan, np.ones_like(values_no_nan)])
  if weights is not None:
    value_parent_indices = np.asarray(
        array_util.GetFlattenedArrayParentIndices(feature_array))
    flat_weights = weights[value_parent_indices]
    flat_weights_no_nan = flat_weights[non_nan_mask]
    weighted_values = flat_weights_no_nan * values_no_nan
    self.weighted_sum += np.sum(weighted_values)
    self.weighted_sum_of_squares += np.sum(weighted_values * values_no_nan)
    self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
        self.weighted_quantiles_summary,
        [values_no_nan, flat_weights_no_nan])
    self.weighted_total_num_values += np.sum(flat_weights_no_nan)

def _flatten_and_impute(
    examples_table: pa.Table,
    categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
  """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
  num_rows = examples_table.num_rows
  result = {}
  for feature_column in examples_table.itercolumns():
    feature_path = types.FeaturePath([feature_column.name])
    # Assume we have only a single chunk.
    feature_array = feature_column.data.chunk(0)
    # to_pandas returns a readonly array. Create a copy as we will be imputing
    # the NaN values.
    non_missing_values = np.copy(
        arrow_util.primitive_array_to_numpy(feature_array.flatten()))
    non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
        array_util.GetFlattenedArrayParentIndices(feature_array))
    is_categorical_feature = feature_path in categorical_features
    result_dtype = non_missing_values.dtype
    if non_missing_parent_indices.size < num_rows and is_categorical_feature:
      result_dtype = np.object
    flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
    num_values = arrow_util.primitive_array_to_numpy(
        array_util.ListLengthsFromListArray(feature_array))
    missing_parent_indices = np.where(num_values == 0)[0]
    if feature_path in categorical_features:
      imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
    else:
      # Also impute any NaN values.
      nan_mask = np.isnan(non_missing_values)
      imputation_fill_value = sys.maxsize
      if not np.all(nan_mask):
        imputation_fill_value = non_missing_values[~nan_mask].max() * 10
      non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
    flattened_array[non_missing_parent_indices] = non_missing_values
    if missing_parent_indices.any():
      flattened_array[missing_parent_indices] = imputation_fill_value
    result[feature_path] = flattened_array
  return result

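# Toy illustration (assumed data) of the numeric imputation rule described in
# the docstring above: rows with no value receive 10 * max(observed values),
# an extreme but finite fill value.
import numpy as np

observed = np.array([0.5, 2.0, 3.0])    # flattened non-missing values
parent_indices = np.array([0, 1, 3])    # rows that actually had a value
num_rows = 4
dense = np.full(num_rows, observed.max() * 10)  # default to the fill value
dense[parent_indices] = observed
print(dense)  # [ 0.5  2.  30.   3. ]
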
def test_get_flattened_array_parent_indices(self, list_type_factory,
                                            parent_indices_type):
  indices = array_util.GetFlattenedArrayParentIndices(
      pa.array([], type=list_type_factory(pa.int32())))
  self.assertTrue(indices.equals(pa.array([], type=parent_indices_type)))

  indices = array_util.GetFlattenedArrayParentIndices(
      pa.array([[1.], [2.], [], [3., 4.]],
               type=list_type_factory(pa.float32())))
  self.assertTrue(
      indices.equals(pa.array([0, 1, 3, 3], type=parent_indices_type)))

  indices = array_util.GetFlattenedArrayParentIndices(
      pa.array([[1.], [2.], [], [3., 4.]],
               type=list_type_factory(pa.float32())).slice(1))
  self.assertTrue(
      indices.equals(pa.array([0, 2, 2], type=parent_indices_type)))

  indices = array_util.GetFlattenedArrayParentIndices(
      pa.array([list(range(1024))], type=list_type_factory(pa.int64())))
  self.assertTrue(
      indices.equals(pa.array([0] * 1024, type=parent_indices_type)))

def _get_example_value_presence(
    table: pa.Table, path: types.FeaturePath,
    boundaries: Optional[Iterable[float]]) -> Optional[pd.Series]:
  """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow table with the
  two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be
  pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

  If the array retrieved from get_array is null, this function returns None.

  Args:
    table: The table in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the
      array values.

  Returns:
    A Pandas Series containing distinct pairs of array values and example
    indices. The series values will be the array values, and the series index
    values will be the example indices.
  """
  arr, example_indices = arrow_util.get_array(
      table, path, return_example_indices=True)
  if pa.types.is_null(arr.type):
    return None

  arr_flat = arr.flatten()
  example_indices_flat = example_indices[
      array_util.GetFlattenedArrayParentIndices(arr).to_numpy()]
  if boundaries is not None:
    element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
    df = pd.DataFrame({
        'example_indices': example_indices_flat[element_indices],
        'values': bins
    })
  else:
    df = pd.DataFrame({
        'example_indices': example_indices_flat,
        'values': np.asarray(arr_flat)
    })
  df_unique = df.drop_duplicates()
  return df_unique.set_index('example_indices')['values']

def _RecursionHelper(row_indices, array):
  """Flattens `array` while maintaining the `row_indices`."""
  array_type = array.type
  if _IsListLike(array_type):
    parent_indices = np.asarray(
        array_util.GetFlattenedArrayParentIndices(array))
    _RecursionHelper(row_indices[parent_indices], array.flatten())
  elif pa.types.is_struct(array_type):
    for child in array.flatten():
      _RecursionHelper(row_indices, child)
  else:
    value_type = _GetValueType(array.type)
    dist_by_type = self._num_feature_values_dist_by_type[value_type]
    for num_values in np.bincount(row_indices, minlength=num_rows).tolist():
      dist_by_type.update(num_values)
      self._num_feature_values_dist.update(num_values)

def update(self,
           feature_path: types.FeaturePath,
           feature_array: pa.Array,
           feature_type: types.FeatureNameStatisticsType,
           make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial common statistics using the input value."""
  if self.type is None:
    self.type = feature_type  # pytype: disable=annotation-type-mismatch
  elif feature_type is not None and self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_path, self.type, feature_type))

  nest_level = arrow_util.get_nest_level(feature_array.type)
  if self.presence_and_valency_stats is None:
    self.presence_and_valency_stats = [
        _PresenceAndValencyStats(make_quantiles_sketch_fn)
        for _ in range(nest_level)
    ]
  elif nest_level != len(self.presence_and_valency_stats):
    raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
        feature_path, nest_level, len(self.presence_and_valency_stats)))

  # And there's nothing we can collect in this case.
  if not feature_array:
    return

  level = 0
  while arrow_util.is_list_like(feature_array.type):
    presence_mask = ~np.asarray(
        array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(np.bool)
    num_values = np.asarray(
        array_util.ListLengthsFromListArray(feature_array))
    num_values_not_none = num_values[presence_mask]
    self.presence_and_valency_stats[level].update(
        feature_array, presence_mask, num_values, num_values_not_none, weights)
    flattened = feature_array.flatten()
    if weights is not None:
      parent_indices = array_util.GetFlattenedArrayParentIndices(
          feature_array).to_numpy()
      weights = weights[parent_indices]
    feature_array = flattened
    level += 1

def add_input(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts],
    input_record_batch: pa.RecordBatch
) -> Dict[types.FeaturePath, _ValueCounts]:
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      weight_column=self._weight_feature,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    if feature_type is None:
      continue
    # if it's not a categorical feature nor a string feature, we don't bother
    # with topk stats.
    if (feature_path in self._categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = leaf_array.flatten()
      unweighted_counts = collections.Counter()
      # Compute unweighted counts.
      value_counts = array_util.ValueCounts(flattened_values)
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in six.moves.zip(values, counts):
        unweighted_counts[value] = count
      # Compute weighted counts if a weight feature is specified.
      weighted_counts = _WeightedCounter()
      if weights is not None:
        flattened_values_np = np.asarray(flattened_values)
        parent_indices = array_util.GetFlattenedArrayParentIndices(leaf_array)
        weighted_counts.weighted_update(flattened_values_np,
                                        weights[np.asarray(parent_indices)])
      if feature_path not in accumulator:
        accumulator[feature_path] = _ValueCounts(
            unweighted_counts=unweighted_counts,
            weighted_counts=weighted_counts)
      else:
        accumulator[feature_path].unweighted_counts.update(unweighted_counts)
        accumulator[feature_path].weighted_counts.update(weighted_counts)
  return accumulator

def _get_univalent_values_with_parent_indices(
    self, examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
  """Extracts univalent values for each feature along with parent indices."""
  result = {}
  for feature_column in examples_table.itercolumns():
    feature_name = feature_column.name
    if (self._features_needed is not None and
        feature_name not in self._features_needed):
      continue
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, feature_column.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                        statistics_pb2.FeatureNameStatistics.STRUCT):
      continue
    # Assume we have only a single chunk.
    assert feature_column.data.num_chunks == 1
    feat_arr = feature_column.data.chunk(0)
    value_lengths = arrow_util.primitive_array_to_numpy(
        array_util.ListLengthsFromListArray(feat_arr))
    univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
    # If there are no univalent values, continue to the next feature.
    if not univalent_parent_indices:
      continue
    non_missing_values = arrow_util.primitive_array_to_numpy(
        feat_arr.flatten())
    value_parent_indices = arrow_util.primitive_array_to_numpy(
        array_util.GetFlattenedArrayParentIndices(feat_arr))
    if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      non_nan_mask = ~np.isnan(non_missing_values)
      non_missing_values = non_missing_values[non_nan_mask]
      value_parent_indices = value_parent_indices[non_nan_mask]
    df = pd.DataFrame({
        feature_name: non_missing_values,
        'parent_index': value_parent_indices
    })
    # Only keep the univalent feature values.
    df = df[df['parent_index'].isin(univalent_parent_indices)]
    result[feature_name] = df
  return result

def _get_univalent_values_with_parent_indices(
    self, examples: pa.RecordBatch) -> Dict[types.FeatureName, pd.DataFrame]:
  """Extracts univalent values for each feature along with parent indices."""
  result = {}
  for feature_name, feat_arr in zip(examples.schema.names, examples.columns):
    if (self._features_needed is not None and
        feature_name not in self._features_needed):
      continue
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, feat_arr.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if feature_type in (None, statistics_pb2.FeatureNameStatistics.STRING,
                        statistics_pb2.FeatureNameStatistics.STRUCT):
      continue
    value_lengths = np.asarray(array_util.ListLengthsFromListArray(feat_arr))
    univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
    # If there are no univalent values, continue to the next feature.
    if not univalent_parent_indices:
      continue
    non_missing_values = np.asarray(feat_arr.flatten())
    value_parent_indices = np.asarray(
        array_util.GetFlattenedArrayParentIndices(feat_arr))
    if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      non_nan_mask = ~np.isnan(non_missing_values)
      non_missing_values = non_missing_values[non_nan_mask]
      value_parent_indices = value_parent_indices[non_nan_mask]
    df = pd.DataFrame({
        feature_name: non_missing_values,
        'parent_index': value_parent_indices
    })
    # Only keep the univalent feature values.
    df = df[df['parent_index'].isin(univalent_parent_indices)]
    result[feature_name] = df
  return result

def _to_topk_tuples(
    sliced_table: Tuple[types.SliceKey, pa.Table],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[types.SliceKey, FeaturePathTuple, Any],
                    Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from input tables."""
  slice_key, table = sliced_table

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      table,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    if pa.types.is_null(feature_array_type):
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array_type) ==
        statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = feature_array.flatten()
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        parent_indices = np.asarray(
            array_util.GetFlattenedArrayParentIndices(feature_array))
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(flattened_values_np,
                                                     weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)

def _get_example_value_presence(
    record_batch: pa.RecordBatch, path: types.FeaturePath,
    boundaries: Optional[Iterable[float]],
    weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
  """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch
  with the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be a
  DataFrame with a 'values' column equal to ['a', 'b', 'a'] and index
  [0, 0, 1].

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the
      array values.
    weight_column_name: Optionally, a weight column to return in addition to
      the value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices, along with the corresponding flattened example weights. The index
    will be the example indices, and the array values will be stored in a
    column named 'values'. If weight_column_name is provided, a second column
    named 'weights' will contain the weight of the example from which each
    value came.
  """
  arr, example_indices = arrow_util.get_array(
      record_batch, path, return_example_indices=True)
  if pa.types.is_null(arr.type):
    return None

  arr_flat = arr.flatten()
  is_binary_like = arrow_util.is_binary_like(arr_flat.type)
  assert boundaries is None or not is_binary_like, (
      'Boundaries can only be applied to numeric columns')
  if is_binary_like:
    # use dictionary_encode so we can use np.unique on object arrays
    dict_array = arr_flat.dictionary_encode()
    arr_flat = dict_array.indices
    arr_flat_dict = np.asarray(dict_array.dictionary)
  parent_indices = array_util.GetFlattenedArrayParentIndices(arr).to_numpy()
  example_indices_flat = example_indices[parent_indices]
  if boundaries is not None:
    element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
    rows = np.vstack([example_indices_flat[element_indices], bins])
  else:
    rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
  if not rows.size:
    return None
  # Deduplicate values which show up more than once in the same example. This
  # makes P(X=x|Y=y) in the standard lift definition behave as
  # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
  unique_rows = np.unique(rows, axis=1)
  example_indices = unique_rows[0, :]
  values = unique_rows[1, :]
  if is_binary_like:
    # Return binary-like values as a pd.Categorical wrapped in a Series. This
    # makes subsequent operations like pd.merge cheaper.
    values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
  columns = {'example_indices': example_indices, 'values': values}
  if weight_column_name:
    weights = arrow_util.get_weight_feature(record_batch, weight_column_name)
    columns['weights'] = np.asarray(weights)[example_indices]
  df = pd.DataFrame(columns)
  return df.set_index('example_indices')

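# Small sketch (assumed data) of the dictionary_encode trick used above:
# binary-like values are replaced by integer dictionary codes so np.unique can
# operate on a numeric array, and the codes are mapped back to values with
# pd.Categorical.from_codes at the end.
import numpy as np
import pandas as pd
import pyarrow as pa

arr_flat = pa.array(['a', 'b', 'a'])
dict_array = arr_flat.dictionary_encode()
codes = np.asarray(dict_array.indices)          # [0 1 0]
dictionary = np.asarray(dict_array.dictionary)  # ['a' 'b']
unique_codes = np.unique(codes)                 # [0 1]
print(pd.Categorical.from_codes(unique_codes, categories=dictionary))
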
def feature_value_slicer(table):
  """A function that generates sliced tables.

  The naive approach of doing this would be to iterate each row, identify
  slice keys for the row and keep track of index ranges for each slice key.
  And then generate an arrow table for each slice key based on the index
  ranges. This would be expensive as we are identifying the slice keys for
  each row individually and we would have to loop over the feature values
  including crossing them when we have to slice on multiple features. The
  current approach generates the slice keys for a batch by performing joins
  over indices of individual features. And then groups the joined table by
  slice key to get the row indices corresponding to a slice.

  Args:
    table: Arrow table.

  Yields:
    Sliced table (slice_key, Arrow table) where the table contains the rows
    corresponding to a slice.
  """
  per_feature_parent_indices = []
  for feature_name, values in six.iteritems(features):
    column = table.column(feature_name)
    # Assume we have a single chunk.
    feature_array = column.data.chunk(0)
    non_missing_values = arrow_util.primitive_array_to_numpy(
        feature_array.flatten())
    value_parent_indices = array_util.GetFlattenedArrayParentIndices(
        feature_array).to_numpy()
    # Create dataframe with feature value and parent index.
    df = pd.DataFrame({
        feature_name: non_missing_values,
        _PARENT_INDEX_COLUMN: value_parent_indices
    })
    df.drop_duplicates(inplace=True)
    # Filter based on slice values
    if values is not None:
      df = df.loc[df[feature_name].isin(values)]
    per_feature_parent_indices.append(df)

  # Join dataframes based on parent indices.
  # Note that we want the parent indices per slice key to be sorted in the
  # merged dataframe. The individual dataframes have the parent indices in
  # sorted order. We use "inner" join type to preserve the order of the left
  # keys (also note that same parent index rows would be consecutive). Hence
  # we expect the merged dataframe to have sorted parent indices per
  # slice key.
  merged_df = functools.reduce(
      lambda base, update: pd.merge(base, update, how='inner',  # pylint: disable=g-long-lambda
                                    on=_PARENT_INDEX_COLUMN),
      per_feature_parent_indices)

  # Construct a new column in the merged dataframe with the slice keys.
  merged_df[_SLICE_KEY_COLUMN] = ''
  index = 0
  for col_name in sorted(merged_df.columns):
    if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
      continue
    slice_key_col = (_to_slice_key(col_name) + '_' +
                     merged_df[col_name].apply(_to_slice_key))
    if index == 0:
      merged_df[_SLICE_KEY_COLUMN] = slice_key_col
      index += 1
    else:
      merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

  # Since the parent indices are sorted per slice key, the groupby would
  # preserve the sorted order within each group.
  per_slice_parent_indices = merged_df.groupby(
      _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
  for slice_key, parent_indices in per_slice_parent_indices:
    yield (slice_key,
           table_util.SliceTableByRowIndices(
               table, pa.array(parent_indices.to_numpy())))

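# Toy sketch (assumed feature names and data) of the join-based slicing
# described in the docstring above: per-feature (value, parent_index) frames
# are inner-joined on the parent index, and the joined feature columns are
# concatenated into a slice key per row.
import functools
import pandas as pd

per_feature = [
    pd.DataFrame({'country': ['US', 'CA', 'US'], 'parent_index': [0, 1, 2]}),
    pd.DataFrame({'lang': ['fr', 'en'], 'parent_index': [1, 2]}),
]
merged = functools.reduce(
    lambda base, update: pd.merge(base, update, how='inner',
                                  on='parent_index'),
    per_feature)
merged['slice_key'] = (
    'country_' + merged['country'] + '_lang_' + merged['lang'])
print(merged[['parent_index', 'slice_key']])
# parent_index 1 -> country_CA_lang_fr; parent_index 2 -> country_US_lang_en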