Example #1
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any], Union[int, Tuple[
        int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table

    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            table, weight_column=weight_feature, enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        if (feature_path in categorical_features
                or stats_util.get_feature_type_from_arrow_type(
                    feature_path, feature_array_type)
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values = feature_array.flatten()
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = arrow_util.primitive_array_to_numpy(
                    flattened_values)
                parent_indices = (arrow_util.primitive_array_to_numpy(
                    arrow_util.GetFlattenedArrayParentIndices(feature_array)))
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = arrow_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    yield ((slice_key, feature_path.steps(), value), count)
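# A minimal sketch of what a helper like _weighted_unique could compute; the
# real helper is internal to TFDV, so this is only an illustrative equivalent
# built from public numpy calls.
import numpy as np

def weighted_unique_sketch(values, weights):
    # np.unique returns the distinct values, an inverse index mapping each
    # element back to its distinct value, and the per-value counts.
    uniques, inverse, counts = np.unique(
        values, return_inverse=True, return_counts=True)
    # Sum the weights of all occurrences of each distinct value.
    weight_sums = np.bincount(inverse, weights=weights)
    return zip(uniques, counts, weight_sums)

example_values = np.array([b'a', b'b', b'a', b'c'], dtype=object)
example_weights = np.array([1.0, 2.0, 3.0, 4.0])
for value, count, weight in weighted_unique_sketch(example_values, example_weights):
    print(value, count, weight)  # b'a' 2 4.0 / b'b' 1 2.0 / b'c' 1 4.0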
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from the observed
  values, so that it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.float_info.max because a value that large can cause
  unexpected floating-point arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # The conversion to a numpy array may return a read-only buffer. Create a
        # copy, as we will be imputing the NaN values.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        is_categorical_feature = feature_path in categorical_features
        result_dtype = non_missing_values.dtype
        if non_missing_parent_indices.size < num_rows and is_categorical_feature:
            result_dtype = np.object
        flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]
        if feature_path in categorical_features:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            imputation_fill_value = sys.maxsize
            if not np.all(nan_mask):
                imputation_fill_value = non_missing_values[~nan_mask].max(
                ) * 10
            non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
        flattened_array[non_missing_parent_indices] = non_missing_values
        if missing_parent_indices.any():
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
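# A toy illustration of the imputation rule described in the docstring above,
# using assumed inputs rather than the TFDV API: numeric rows with no value
# are filled with 10 * max(observed values).
import numpy as np

num_rows = 4
non_missing_values = np.array([1.0, 5.0, 2.0])      # flattened present values
non_missing_parent_indices = np.array([0, 2, 3])    # rows that had a value
imputation_fill_value = non_missing_values.max() * 10  # 50.0, far from observed data
flattened_array = np.ndarray(shape=num_rows, dtype=non_missing_values.dtype)
flattened_array[non_missing_parent_indices] = non_missing_values
missing_parent_indices = np.setdiff1d(np.arange(num_rows), non_missing_parent_indices)
flattened_array[missing_parent_indices] = imputation_fill_value
print(flattened_array)  # [ 1. 50.  5.  2.]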
    def update(self,
               feature_path: types.FeaturePath,
               feature_array: pa.Array,
               feature_type: types.FeatureNameStatisticsType,
               num_values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial common statistics using the input value."""
        # All the values in this column are null, so we cannot deduce the type of
        # the feature. This is not an error, as this feature might have values in
        # other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_path, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        none_mask = arrow_util.primitive_array_to_numpy(
            arrow_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
                np.bool)

        self.num_non_missing += len(feature_array) - feature_array.null_count
        num_values_not_none = num_values[~none_mask]
        # We do this check to avoid failing in np.min/max with an empty array.
        if num_values_not_none.size == 0:
            return
        # Use np.maximum.reduce(num_values_not_none, initial=self.max_num_values)
        # once we upgrade to numpy 1.16
        self.max_num_values = max(np.max(num_values_not_none),
                                  self.max_num_values)
        self.min_num_values = min(np.min(num_values_not_none),
                                  self.min_num_values)
        self.total_num_values += np.sum(num_values_not_none)
        self.num_values_summary = num_values_quantiles_combiner.add_input(
            self.num_values_summary, [num_values_not_none])

        if weights is not None:
            if weights.size != num_values.size:
                raise ValueError('Weight feature must not be missing.')
            self.weighted_total_num_values += np.sum(num_values * weights)
            self.weighted_num_non_missing += np.sum(weights[~none_mask])
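# Rough public-API equivalents of the arrow_util helpers used above (which are
# internal to TFDV), written against a recent pyarrow; the exact helper
# behaviour is an assumption here.
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

feature_array = pa.array([[1, 2], None, [3]])
# Per-row value counts, with null rows counted as 0 values.
num_values = pc.fill_null(pc.list_value_length(feature_array), 0).to_numpy(
    zero_copy_only=False)
# Boolean mask of rows that are null (missing).
none_mask = pc.is_null(feature_array).to_numpy(zero_copy_only=False)

num_values_not_none = num_values[~none_mask]
print(num_values_not_none.max(), num_values_not_none.min())  # 2 1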
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        flattened_value_array = feature_array.flatten()
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = arrow_util.primitive_array_to_numpy(flattened_value_array)
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]
        # We do this check to avoid failing in np.min/max with an empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        self.min = min(self.min, np.min(values_no_nan))
        self.max = max(self.max, np.max(values_no_nan))
        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feature_array))
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
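# Why the cast to float64 above matters: summing large integers in their
# native dtype can silently wrap around, while float64 accumulates (with
# rounding) instead of overflowing. A minimal demonstration:
import numpy as np

values = np.array([2**62, 2**62], dtype=np.int64)
print(np.sum(values))                      # wraps around to a negative number
print(np.sum(values.astype(np.float64)))   # ~9.2e18, as expected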
    def testNumberArrayShouldShareBuffer(self):
        float_array = pa.array([1, 2, np.NaN], pa.float32())
        np_array = arrow_util.primitive_array_to_numpy(float_array)
        self.assertEqual(np_array.dtype, np.float32)
        self.assertEqual(np_array.shape, (3, ))
        # Check that they share the same buffer.
        self.assertEqual(np_array.ctypes.data,
                         float_array.buffers()[1].address)
    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                arrow_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
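# A toy version of the univalent-value filtering above, with assumed inputs
# rather than the TFDV API: keep only values whose parent row has exactly one
# value (value list length == 1).
import numpy as np
import pandas as pd

value_lengths = np.array([1, 2, 0, 1])          # per-row value counts
univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
df = pd.DataFrame({
    'feature_a': [10.0, 11.0, 12.0, 13.0],       # flattened feature values
    'parent_index': [0, 1, 1, 3],                # row each value came from
})
df = df[df['parent_index'].isin(univalent_parent_indices)]
print(df)  # keeps only the rows with parent indices 0 and 3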
    def add_input(
            self, accumulator: Dict[types.FeaturePath, _ValueCounts],
            input_table: pa.Table) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_table,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            # If it's neither a categorical feature nor a string feature, we don't
            # bother with top-k stats.
            if (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values = leaf_array.flatten()
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = arrow_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = arrow_util.primitive_array_to_numpy(
                        flattened_values)
                    parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                        leaf_array)
                    weighted_counts.weighted_update(
                        flattened_values_np,
                        weights[arrow_util.primitive_array_to_numpy(
                            parent_indices)])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
Example #8
    def add_input(self, accumulator: List[float],
                  examples_table: pa.Table) -> List[float]:
        accumulator[0] += examples_table.num_rows
        if self._weight_feature:
            weights_column = examples_table.column(self._weight_feature)
            for weight_array in weights_column.data.iterchunks():
                accumulator[1] += np.sum(
                    arrow_util.primitive_array_to_numpy(weight_array.flatten()))
        return accumulator
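# A toy walk-through of the accumulator above, written against a recent
# pyarrow: slot 0 counts examples, slot 1 sums an assumed weight column 'w'.
import numpy as np
import pyarrow as pa

table = pa.Table.from_arrays(
    [pa.array([[1], [2], [3]]), pa.array([[0.5], [2.0], [1.5]])],
    names=['f', 'w'])
accumulator = [0.0, 0.0]
accumulator[0] += table.num_rows
for weight_array in table.column('w').chunks:
    accumulator[1] += np.sum(np.asarray(weight_array.flatten()))
print(accumulator)  # [3.0, 4.0]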
Example #9
    def add_input(self, accumulator: _PartialTimeStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialTimeStats:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidated:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

            def _maybe_get_utf8(val):
                return stats_util.maybe_get_utf8(val) if isinstance(
                    val, bytes) else val

            values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            maybe_utf8 = np.vectorize(_maybe_get_utf8,
                                      otypes=[np.object])(values)
            if not maybe_utf8.all():
                accumulator.invalidated = True
                return accumulator
            accumulator.update(maybe_utf8, feature_type)
        elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
            values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            accumulator.update(values, feature_type)
        else:
            accumulator.invalidated = True

        return accumulator
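# A sketch of the bytes-to-text handling above, with a stand-in for
# stats_util.maybe_get_utf8 (assumed to return None for non-UTF-8 bytes).
import numpy as np

def maybe_get_utf8(value):
    try:
        return value.decode('utf-8')
    except UnicodeDecodeError:
        return None

values = np.array([b'2021-01-01', b'\xf0\x28\x8c\x28'], dtype=object)
maybe_utf8 = np.vectorize(
    lambda v: maybe_get_utf8(v) if isinstance(v, bytes) else v,
    otypes=[object])(values)
print(maybe_utf8)        # ['2021-01-01' None]
print(maybe_utf8.all())  # False -> this batch would invalidate the accumulator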
Example #10
    def add_input(self, accumulator: _PartialImageStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialImageStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        # Consider using memoryview to avoid copying after upgrading to
        # arrow 0.12. Note that this would involve modifying the subsequent logic
        # to iterate over the values in a loop.
        values = arrow_util.primitive_array_to_numpy(feature_array.flatten())
        accumulator.total_num_values += values.size
        image_formats = self._image_decoder.get_formats(values)
        valid_mask = ~pd.isnull(image_formats)
        valid_formats = image_formats[valid_mask]
        format_counts = np.unique(valid_formats, return_counts=True)
        for (image_format, count) in zip(*format_counts):
            accumulator.counter_by_format[image_format] += count
        unknown_count = image_formats.size - valid_formats.size
        if unknown_count > 0:
            accumulator.counter_by_format[''] += unknown_count

        if self._enable_size_stats:
            # Get image height and width.
            image_sizes = self._image_decoder.get_sizes(values[valid_mask])
            if image_sizes.any():
                max_sizes = np.max(image_sizes, axis=0)
                # Update the max image height/width with all image values.
                accumulator.max_height = max(accumulator.max_height,
                                             max_sizes[0])
                accumulator.max_width = max(accumulator.max_width,
                                            max_sizes[1])

        return accumulator
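# A toy version of the format-counting step above; image_formats stands in for
# what the assumed image decoder returns (a numpy object array with None for
# undecodable values).
import numpy as np
import pandas as pd

image_formats = np.array(['png', 'jpeg', None, 'png'], dtype=object)
valid_mask = ~pd.isnull(image_formats)
valid_formats = image_formats[valid_mask]
formats, counts = np.unique(valid_formats, return_counts=True)
print(dict(zip(formats.tolist(), counts.tolist())))  # {'jpeg': 1, 'png': 2}
print(image_formats.size - valid_formats.size)       # 1 value with unknown format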
    def update(self, feature_array: pa.Array) -> None:
        """Update the partial string statistics using the input value."""
        # Iterate through the value array and update the partial stats.
        flattened_values_array = feature_array.flatten()
        if pa.types.is_binary(
                flattened_values_array.type) or pa.types.is_unicode(
                    flattened_values_array.type):
            # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
            # with Python 3). To make sure we do cheaper integer arithmetic in
            # Python 2, we first convert it to int.
            self.total_bytes_length += int(
                arrow_util.GetBinaryArrayTotalByteSize(flattened_values_array))
        elif flattened_values_array:
            # We can only do flattened_values_array.to_numpy() when it's not empty.
            # This could be computed faster by taking log10 of the integer.
            def _len_after_conv(s):
                return len(str(s))

            self.total_bytes_length += np.sum(
                np.vectorize(_len_after_conv, otypes=[np.int32])
                (arrow_util.primitive_array_to_numpy(flattened_values_array)))
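# A rough public-API equivalent of GetBinaryArrayTotalByteSize (an internal
# TFDV helper), written against a recent pyarrow with compute functions.
import pyarrow as pa
import pyarrow.compute as pc

flattened_values = pa.array([b'ab', b'cde', b''])
total_bytes_length = pc.sum(pc.binary_length(flattened_values)).as_py()
print(total_bytes_length)  # 5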
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        def _is_non_utf8(value):
            return (isinstance(value, bytes)
                    and stats_util.maybe_get_utf8(value) is None)

        is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[np.bool])
        classify_vec = np.vectorize(self._classifier.classify,
                                    otypes=[np.bool])
        values = arrow_util.primitive_array_to_numpy(feature_array.flatten())
        if np.any(is_non_utf_vec(values)):
            accumulator.invalidate = True
            return accumulator
        accumulator.considered += values.size
        accumulator.matched += np.sum(classify_vec(values))
        return accumulator
    def testNumberArrayWithNone(self):
        float_array = pa.array([1.0, 2.0, None], pa.float64())
        np_array = arrow_util.primitive_array_to_numpy(float_array)
        self.assertEqual(np_array.dtype, np.float64)
        np.testing.assert_array_equal(np_array, [1.0, 2.0, np.NaN])
    def testStringArray(self):
        string_array = pa.array(["a", "b"], pa.utf8())
        np_array = arrow_util.primitive_array_to_numpy(string_array)
        self.assertEqual(np_array.dtype, np.object)
        self.assertEqual(np_array.shape, (2, ))
        np.testing.assert_array_equal(np_array, [u"a", u"b"])
Example #15
    def feature_value_slicer(table):
        """A function that generates sliced tables.

    The naive approach would be to iterate over each row, identify the slice
    keys for that row, keep track of index ranges for each slice key, and then
    generate an Arrow table for each slice key based on those index ranges.
    This would be expensive, as we would identify the slice keys for each row
    individually and would have to loop over (and cross) the feature values
    when slicing on multiple features. The current approach instead generates
    the slice keys for a batch by performing joins over the parent indices of
    the individual features, and then groups the joined table by slice key to
    get the row indices corresponding to each slice.

    Args:
      table: Arrow table.

    Yields:
      Sliced table (slice_key, Arrow table) where the table contains the rows
      corresponding to a slice.
    """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            column = table.column(feature_name)
            # Assume we have a single chunk.
            feature_array = column.data.chunk(0)
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            value_parent_indices = array_util.GetFlattenedArrayParentIndices(
                feature_array).to_numpy()
            # Create a dataframe with the feature values and parent indices.
            df = pd.DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   table_util.SliceTableByRowIndices(
                       table, pa.array(parent_indices.to_numpy())))
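# A toy run of the join-and-group-by idea described in the docstring above,
# with assumed feature values rather than the TFDV slicing API.
import functools
import pandas as pd

per_feature_parent_indices = [
    pd.DataFrame({'country': ['US', 'CA', 'US'], 'parent_index': [0, 1, 2]}),
    pd.DataFrame({'tier': ['a', 'a', 'b'], 'parent_index': [0, 1, 2]}),
]
merged_df = functools.reduce(
    lambda base, update: pd.merge(base, update, how='inner', on='parent_index'),
    per_feature_parent_indices)
merged_df['slice_key'] = (
    'country_' + merged_df['country'] + '_tier_' + merged_df['tier'])
for slice_key, parent_indices in merged_df.groupby(
        'slice_key', sort=False)['parent_index']:
    print(slice_key, parent_indices.tolist())
# country_US_tier_a [0]
# country_CA_tier_a [1]
# country_US_tier_b [2]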
Example #16
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
        input_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      input_table: An Arrow Table whose columns are features and rows are
        examples.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        component_feature_value_list_lengths = dict()
        component_feature_num_missing = dict()
        batch_example_count = input_table.num_rows
        # Do a single pass through the input table to determine the value list
        # lengths and number missing for every feature that is an index or value
        # feature in any sparse feature in the schema.
        for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
                input_table, weight_column=None, enumerate_leaves_only=True):
            if (feature_path in self._all_index_feature_paths
                    or feature_path in self._all_value_feature_paths):
                # If the column is a NullArray, skip it when populating the
                # component_feature_ dicts. Features that are missing from those dicts
                # are treated as entirely missing for the batch.
                if not pa.types.is_null(leaf_array.type):
                    component_feature_value_list_lengths[
                        feature_path] = arrow_util.primitive_array_to_numpy(
                            arrow_util.ListLengthsFromListArray(leaf_array))
                    component_feature_num_missing[
                        feature_path] = leaf_array.null_count

        # Now create a partial sparse feature stats object for each sparse feature
        # using the value list lengths and numbers missing information collected
        # above.
        for feature_path in self._sparse_feature_component_paths:
            value_feature_path = self._sparse_feature_component_paths[
                feature_path].value_feature
            index_feature_paths = self._sparse_feature_component_paths[
                feature_path].index_features
            missing_value_count = component_feature_num_missing.get(
                value_feature_path)
            # If this batch does not have the value feature at all,
            # missing_value_count is the number of examples in the batch.
            # Also populate the value list lengths for the value feature with all 0s
            # since a missing feature is considered to have a value list length of 0.
            if missing_value_count is None:
                missing_value_count = batch_example_count
                component_feature_value_list_lengths[
                    value_feature_path] = np.full(batch_example_count, 0)
            missing_index_counts = collections.Counter()
            min_length_diff = dict()
            max_length_diff = dict()
            for index_feature_path in index_feature_paths:
                missing_index_count = component_feature_num_missing.get(
                    index_feature_path)
                # If this batch does not have this index feature at all,
                # missing_index_count for that index feature is the number of
                # examples in the batch.
                # Also populate the value list lengths for the index feature with all 0s
                # since a missing feature is considered to have a value list length of
                # 0.
                if missing_index_count is None:
                    missing_index_counts[
                        index_feature_path] = batch_example_count
                    component_feature_value_list_lengths[
                        index_feature_path] = np.full(batch_example_count, 0)
                else:
                    missing_index_counts[
                        index_feature_path] = missing_index_count
                length_differences = np.subtract(
                    component_feature_value_list_lengths[index_feature_path],
                    component_feature_value_list_lengths[value_feature_path])
                min_length_diff[index_feature_path] = np.min(
                    length_differences)
                max_length_diff[index_feature_path] = np.max(
                    length_differences)

            stats_for_feature = _PartialSparseFeatureStats(
                missing_value_count, missing_index_counts, min_length_diff,
                max_length_diff)
            existing_stats_for_feature = accumulator.get(feature_path)
            if existing_stats_for_feature is None:
                accumulator[feature_path] = stats_for_feature
            else:
                accumulator[feature_path] += stats_for_feature
        return accumulator
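# A toy illustration of the length-difference computation above, using assumed
# per-example value list lengths rather than the TFDV API.
import numpy as np

index_feature_lengths = np.array([2, 0, 4])   # value list lengths of an index feature
value_feature_lengths = np.array([2, 1, 3])   # value list lengths of the value feature
length_differences = np.subtract(index_feature_lengths, value_feature_lengths)
print(length_differences.min(), length_differences.max())  # -1 1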