def add_input(
        self, accumulator: Dict[types.FeaturePath,
                                _PartialBasicStats], examples_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialBasicStats]:
        for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
                examples_table,
                weight_column=self._weight_feature,
                enumerate_leaves_only=False):
            stats_for_feature = accumulator.get(feature_path)
            if stats_for_feature is None:
                stats_for_feature = _PartialBasicStats(
                    self._weight_feature is not None)
                # Store empty summary.
                stats_for_feature.common_stats.num_values_summary = (
                    self._num_values_quantiles_combiner.create_accumulator())
                stats_for_feature.numeric_stats.quantiles_summary = (
                    self._values_quantiles_combiner.create_accumulator())
                accumulator[feature_path] = stats_for_feature

            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, feature_array.type)
            stats_for_feature.common_stats.update(
                feature_path, feature_array, feature_type,
                self._num_values_quantiles_combiner, weights)
            is_categorical_feature = feature_path in self._categorical_features
            if (is_categorical_feature or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                stats_for_feature.string_stats.update(feature_array)
            elif feature_type in (statistics_pb2.FeatureNameStatistics.INT,
                                  statistics_pb2.FeatureNameStatistics.FLOAT):
                stats_for_feature.numeric_stats.update(
                    feature_array, self._values_quantiles_combiner, weights)

        return accumulator
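
This add_input follows Beam's CombineFn contract: it folds one Arrow batch into a per-feature accumulator dict and returns the dict. A minimal sketch of the same fold pattern using only pyarrow, where _CountValuesCombiner and its count-only "statistic" are hypothetical stand-ins for TFDV's generator:

import pyarrow as pa


class _CountValuesCombiner(object):
    """Toy combiner: counts non-null leaf values per column (hypothetical)."""

    def add_input(self, accumulator, table):
        for name, column in zip(table.schema.names, table.columns):
            for chunk in column.chunks:
                # Unnest each list<T> chunk to its leaf values, then count
                # the non-null entries, mirroring the per-feature update
                # performed above.
                flattened = chunk.flatten()
                accumulator[name] = (accumulator.get(name, 0) +
                                     len(flattened) - flattened.null_count)
        return accumulator


table = pa.table({'f': [[1, 2], None, [3]]})
assert _CountValuesCombiner().add_input({}, table) == {'f': 3}
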
Example #2
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any], Union[int, Tuple[
        int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table

    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            table, weight_column=weight_feature, enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        if (feature_path in categorical_features
                or stats_util.get_feature_type_from_arrow_type(
                    feature_path, feature_array_type)
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values = feature_array.flatten()
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = arrow_util.primitive_array_to_numpy(
                    flattened_values)
                parent_indices = (arrow_util.primitive_array_to_numpy(
                    arrow_util.GetFlattenedArrayParentIndices(feature_array)))
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = arrow_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    yield ((slice_key, feature_path.steps(), value), count)
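
_weighted_unique is defined elsewhere in the module; from its call site it yields a (value, count, total weight) triple per distinct value. A numpy-only sketch of that contract (the name _weighted_unique_sketch is ours, and the exact ordering is an assumption):

import numpy as np


def _weighted_unique_sketch(values, weights):
    """Yields (value, count, sum of weights) per distinct value (a sketch)."""
    # np.unique returns the sorted distinct values, each input's position
    # within that sorted set, and per-value occurrence counts.
    unique_values, inverse, counts = np.unique(
        values, return_inverse=True, return_counts=True)
    # Accumulate every occurrence's weight into its value's bucket.
    weight_sums = np.zeros(len(unique_values), dtype=np.float64)
    np.add.at(weight_sums, inverse, weights)
    return zip(unique_values, counts, weight_sums)


values = np.array([b'a', b'b', b'a'])
weights = np.array([0.5, 1.0, 2.0])
assert list(_weighted_unique_sketch(values, weights)) == [
    (b'a', 2, 2.5), (b'b', 1, 1.0)]
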
def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are
  not supported by scikit-learn.

  All columns of STRUCT type are also dropped.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow table.
  """
  multivalent_features = schema_util.get_multivalent_features(schema)
  unsupported_columns = set()
  for f in multivalent_features:
    unsupported_columns.add(f.steps()[0])
  for column_name, column in zip(examples_table.schema.names,
                                 examples_table.itercolumns()):
    if (stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([column_name]),
        column.type) == statistics_pb2.FeatureNameStatistics.STRUCT):
      unsupported_columns.add(column_name)
  return examples_table.drop(unsupported_columns)
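
For illustration, dropping an unsupported column with stock pyarrow looks like this; the table contents are made up, and the real function derives the drop set from the schema and the Arrow types as above:

import pyarrow as pa

# 'f2' below is a STRUCT-typed column of the kind the function drops;
# 'f1' is a supported numeric list column.
table = pa.table({
    'f1': [[1.0], [2.0]],
    'f2': [{'a': 1}, {'a': 2}],
})
assert table.drop(['f2']).schema.names == ['f1']
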
    def add_input(self, accumulator, input_table):
        weight_ndarrays = []
        if self._weight_feature is not None:
            for a in input_table.column(
                    self._weight_feature).data.iterchunks():
                weight_array = arrow_util.FlattenListArray(a)
                if len(weight_array) != len(a):
                    raise ValueError(
                        'If weight is specified, then each example must have a weight '
                        'feature of length 1.')
                # to_numpy() can only be called against a non-empty arrow array.
                if weight_array:
                    weight_ndarrays.append(weight_array.to_numpy())
                else:
                    weight_ndarrays.append(
                        np.array([], dtype=weight_array.to_pandas_dtype()))

        for column in input_table.columns:
            feature_name = column.name
            if feature_name == self._weight_feature:
                continue
            unweighted_counts = collections.Counter()
            weighted_counts = _WeightedCounter()
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, column.type)
            if not (feature_name in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue

            for feature_array, weight_ndarray in six.moves.zip_longest(
                    column.data.iterchunks(), weight_ndarrays, fillvalue=None):
                flattened_values_array = arrow_util.FlattenListArray(
                    feature_array)
                # to_numpy() cannot be called if the array is empty.
                if not flattened_values_array:
                    continue
                if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
                    values_ndarray = flattened_values_array.to_pandas()
                else:
                    values_ndarray = flattened_values_array.to_numpy()
                value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                    feature_array).to_numpy()
                unweighted_counts.update(values_ndarray)
                if weight_ndarray is not None:
                    weight_per_value = weight_ndarray[value_parent_indices]
                    weighted_counts.weighted_update(values_ndarray,
                                                    weight_per_value)

            if feature_name not in accumulator:
                accumulator[feature_name] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_name].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_name].weighted_counts.update(
                    weighted_counts)
        return accumulator
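
_WeightedCounter is a TFDV-internal helper; judging by its use above, weighted_update accumulates the sum of weights per distinct value and update merges two counters. A dict-based sketch of that behavior (the class name and details are assumptions):

import collections

import numpy as np


class _WeightedCounterSketch(collections.defaultdict):
    """Maps each value to its accumulated weight (a stand-in sketch)."""

    def __init__(self):
        super(_WeightedCounterSketch, self).__init__(float)

    def weighted_update(self, values, weights):
        # Add each occurrence's weight to that value's running total.
        for value, weight in zip(values, weights):
            self[value] += weight

    def update(self, other):
        # Merge another counter by summing per-value weights.
        for value, weight in other.items():
            self[value] += weight


counter = _WeightedCounterSketch()
counter.weighted_update(np.array([b'a', b'b', b'a']),
                        np.array([1.0, 2.0, 3.0]))
assert counter[b'a'] == 4.0 and counter[b'b'] == 2.0
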
Example #5
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values which
        should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if feature_path not in self._valid_feature_paths:
            accumulator.invalidate = True
            return accumulator

        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator

        if feature_type not in self._feature_type_fns:
            accumulator.invalidate = True
            return accumulator

        feature_type_fn = self._feature_type_fns[feature_type]

        vocab = None
        rvocab = None
        if self._nld_vocabularies[feature_path]:
            vocab_name = self._nld_vocabularies[feature_path]
            vocab = self._vocabs[vocab_name]
            rvocab = self._rvocabs[vocab_name]

        excluded_string_tokens = self._nld_excluded_string_tokens[feature_path]
        excluded_int_tokens = self._nld_excluded_int_tokens[feature_path]
        oov_string_tokens = self._nld_oov_string_tokens[feature_path]
        int_tokens = self._nld_specified_int_tokens[feature_path]
        string_tokens = self._nld_specified_str_tokens[feature_path]
        sequence_length_excluded_int_tokens = (
            self._nld_sequence_length_excluded_int_tokens[feature_path])
        sequence_length_excluded_string_tokens = (
            self._nld_sequence_length_excluded_string_tokens[feature_path])

        # TODO(b/175875824): Benchmark and optimize performance.
        for row in feature_array.to_pylist():
            if row is not None:
                feature_type_fn(row, accumulator, excluded_string_tokens,
                                excluded_int_tokens, oov_string_tokens, vocab,
                                rvocab, int_tokens, string_tokens,
                                sequence_length_excluded_int_tokens,
                                sequence_length_excluded_string_tokens,
                                self._num_histogram_buckets)
        return accumulator
Example #6
    def add_input(self, accumulator, input_table):
        weight_column = (input_table.column(self._weight_feature)
                         if self._weight_feature else None)
        weight_array = weight_column.data.chunk(0) if weight_column else []
        if weight_array:
            flattened_weights = arrow_util.FlattenListArray(
                weight_array).to_numpy()

        for column in input_table.columns:
            feature_name = column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            feature_path = types.FeaturePath([feature_name])
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, column.type)
            # If it's neither a categorical feature nor a string feature, we
            # don't bother with top-k stats.
            if not (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue
            value_array = column.data.chunk(0)
            flattened_values = arrow_util.FlattenListArray(value_array)
            unweighted_counts = collections.Counter()
            # Compute unweighted counts.
            value_counts = arrow_util.ValueCounts(flattened_values)
            for value_count in value_counts:
                value_count = value_count.as_py()
                unweighted_counts[
                    value_count['values']] = value_count['counts']

            # Compute weighted counts if a weight feature is specified.
            weighted_counts = _WeightedCounter()
            if weight_array:
                if (pa.types.is_binary(flattened_values.type)
                        or pa.types.is_string(flattened_values.type)):
                    # No zero-copy conversion for binary/string types.
                    flattened_values_np = flattened_values.to_pandas()
                else:
                    flattened_values_np = flattened_values.to_numpy()
                indices = arrow_util.GetFlattenedArrayParentIndices(
                    value_array)
                weighted_counts.weighted_update(
                    flattened_values_np, flattened_weights[indices.to_numpy()])

            if feature_path not in accumulator:
                accumulator[feature_path] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_path].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_path].weighted_counts.update(
                    weighted_counts)
        return accumulator
Example #7
    def add_input(self, accumulator: _PartialImageStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialImageStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        # Consider using memoryview to avoid copying after upgrading to
        # arrow 0.12. Note that this would involve modifying the subsequent logic
        # to iterate over the values in a loop.
        values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
        accumulator.total_num_values += values.size
        image_formats = self._image_decoder.get_formats(values)
        valid_mask = ~pd.isnull(image_formats)
        valid_formats = image_formats[valid_mask]
        format_counts = np.unique(valid_formats, return_counts=True)
        for (image_format, count) in zip(*format_counts):
            accumulator.counter_by_format[image_format] += count
        unknown_count = image_formats.size - valid_formats.size
        if unknown_count > 0:
            accumulator.counter_by_format[''] += unknown_count

        if self._enable_size_stats:
            # Get image height and width.
            image_sizes = self._image_decoder.get_sizes(values[valid_mask])
            if image_sizes.any():
                max_sizes = np.max(image_sizes, axis=0)
                # Update the max image height/width with all image values.
                accumulator.max_height = max(accumulator.max_height,
                                             max_sizes[0])
                accumulator.max_width = max(accumulator.max_width,
                                            max_sizes[1])

        return accumulator
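
The _image_decoder above is TFDV's; purely to show the shape of get_formats' output (an object ndarray aligned with the input, with unrecognizable entries left as None, which pd.isnull then masks out), here is a toy stand-in that sniffs two magic-byte prefixes. It is illustrative only, not the real decoder:

import numpy as np

# Magic-byte prefixes for two common formats; a real decoder knows many more.
_MAGIC_PREFIXES = [(b'\x89PNG\r\n\x1a\n', 'png'), (b'\xff\xd8\xff', 'jpeg')]


def toy_get_formats(values):
    """Maps raw image bytes to a format name, or None if unrecognized."""
    def detect(value):
        for prefix, fmt in _MAGIC_PREFIXES:
            if isinstance(value, bytes) and value.startswith(prefix):
                return fmt
        return None
    return np.vectorize(detect, otypes=[object])(values)


formats = toy_get_formats(
    np.array([b'\xff\xd8\xff...', b'not an image'], dtype=object))
assert list(formats) == ['jpeg', None]
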
Example #8
  def add_input(
      self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
      input_record_batch: pa.RecordBatch
  ) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
    for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
        input_record_batch,
        example_weight_map=self._example_weight_map,
        enumerate_leaves_only=True):
      feature_type = stats_util.get_feature_type_from_arrow_type(
          feature_path, leaf_array.type)
      if self._should_run(feature_path, feature_type):
        self._update_combined_sketch_for_feature(
            feature_path, leaf_array, weights, accumulator)
    return accumulator
    def _remove_unsupported_feature_columns(
            self, examples: pa.RecordBatch,
            schema: schema_pb2.Schema) -> pa.RecordBatch:
        """Removes feature columns that contain unsupported values.

    All feature columns that are multivalent are dropped since they are
    not supported by scikit-learn.

    All columns of STRUCT type are also dropped.

    Args:
      examples: Arrow RecordBatch containing a batch of examples.
      schema: The schema for the data.

    Returns:
      Arrow RecordBatch.
    """
        columns = set(examples.schema.names)

        multivalent_features = schema_util.get_multivalent_features(schema)
        unsupported_columns = set()
        for f in multivalent_features:
            # Drop the column if it is present in the examples.
            if f.steps()[0] in columns:
                unsupported_columns.add(f.steps()[0])
        for column_name, column in zip(examples.schema.names,
                                       examples.columns):
            # Only 1-nested, non-struct arrays are supported.
            column_type = column.type
            if (arrow_util.get_nest_level(column_type) != 1
                    or stats_util.get_feature_type_from_arrow_type(
                        types.FeaturePath([column_name]), column_type)
                    == statistics_pb2.FeatureNameStatistics.STRUCT):
                unsupported_columns.add(column_name)
            # Drop columns that were not in the schema.
            if types.FeaturePath([column_name]) not in self._schema_features:
                unsupported_columns.add(column_name)

        supported_columns = []
        supported_column_names = []
        for column_name, column in zip(examples.schema.names,
                                       examples.columns):
            if column_name not in unsupported_columns:
                supported_columns.append(column)
                supported_column_names.append(column_name)

        return pa.RecordBatch.from_arrays(supported_columns,
                                          supported_column_names)
Example #10
  def add_input(
      self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
      input_record_batch: pa.RecordBatch
  ) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
    for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
        input_record_batch,
        example_weight_map=self._example_weight_map,
        enumerate_leaves_only=True):
      feature_type = stats_util.get_feature_type_from_arrow_type(
          feature_path, leaf_array.type)
      # Only compute top-k and unique stats for categorical and string
      # features.
      if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
           feature_path in self._categorical_features) or
          feature_type == statistics_pb2.FeatureNameStatistics.STRING):
        self._update_combined_sketch_for_feature(
            feature_path, leaf_array, weights, accumulator)
    return accumulator
Example #11
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            if feature_type is None:
                continue
            # If it's neither a categorical feature nor a string feature, we
            # don't bother with top-k stats.
            if (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values = leaf_array.flatten()
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = array_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    parent_indices = array_util.GetFlattenedArrayParentIndices(
                        leaf_array)
                    weighted_counts.weighted_update(
                        flattened_values_np,
                        weights[np.asarray(parent_indices)])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
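
GetFlattenedArrayParentIndices comes from tfx_bsl's array_util; assuming a reasonably recent pyarrow, pyarrow.compute.list_parent_indices computes the same mapping, and the gather weights[parent_indices] is what assigns each flattened value the weight of its parent example:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

# Two examples with weights 0.5 and 2.0; example 0 has two values.
leaf_array = pa.array([['a', 'b'], ['a']])
weights = np.array([0.5, 2.0])
# Index of the parent example for each flattened value: [0, 0, 1].
parent_indices = pc.list_parent_indices(leaf_array).to_numpy()
assert weights[parent_indices].tolist() == [0.5, 0.5, 2.0]
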
    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                arrow_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
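
ListLengthsFromListArray is another tfx_bsl helper; with stock pyarrow (assuming pyarrow.compute.list_value_length is available), the univalent-row selection above reduces to:

import pyarrow as pa
import pyarrow.compute as pc

# Three examples; only the first and last are univalent (exactly one value).
feat_arr = pa.array([[1.0], [2.0, 3.0], [4.0]])
value_lengths = pc.list_value_length(feat_arr).to_numpy()
univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
assert univalent_parent_indices == {0, 2}
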
Example #13
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
        int, Tuple[int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from the input."""
    slice_key, record_batch = sliced_record_batch

    has_any_weight = bool(example_weight_map.all_weight_features())
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            record_batch,
            example_weight_map=example_weight_map,
            enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array_type)
        if feature_path in bytes_features:
            continue
        if ((feature_type == statistics_pb2.FeatureNameStatistics.INT
             and feature_path in categorical_features) or feature_type
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values, parent_indices = arrow_util.flatten_nested(
                feature_array, weights is not None)
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = np.asarray(flattened_values)
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = flattened_values.value_counts()
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                if has_any_weight:
                    for value, count in zip(values, counts):
                        yield ((slice_key, feature_path.steps(), value),
                               (count, 1))
                else:
                    for value, count in zip(values, counts):
                        yield ((slice_key, feature_path.steps(), value), count)
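
Array.value_counts is standard pyarrow: it returns a StructArray with 'values' and 'counts' fields, which is exactly how the fast (unweighted) path above tallies top-k candidates. A self-contained demonstration:

import pyarrow as pa

# Flatten a list array, then tally distinct values; the null list row
# contributes nothing to the flattened values.
flattened = pa.array([['a', 'b'], ['a'], None]).flatten()
value_counts = flattened.value_counts()
values = value_counts.field('values').to_pylist()
counts = value_counts.field('counts').to_pylist()
assert dict(zip(values, counts)) == {'a': 2, 'b': 1}
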
Example #14
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                example_weight_map=self._example_weight_map,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            # If it's neither a categorical int feature nor a string feature,
            # we don't bother with top-k stats.
            if ((feature_type == statistics_pb2.FeatureNameStatistics.INT
                 and feature_path in self._categorical_features)
                    or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values, parent_indices = arrow_util.flatten_nested(
                    leaf_array, weights is not None)
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = flattened_values.value_counts()
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    weighted_counts.weighted_update(flattened_values_np,
                                                    weights[parent_indices])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
Example #15
    def add_input(self, accumulator, examples_table):
        weights = None
        if self._weight_feature:
            weights = (arrow_util.FlattenListArray(
                examples_table.column(
                    self._weight_feature).data.chunk(0)).to_numpy())
            if len(weights) != len(examples_table):
                raise ValueError('Expected exactly one weight per example.')

        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            feature_path = types.FeaturePath([feature_name])
            is_categorical_feature = feature_path in self._categorical_features

            # If we encounter this feature for the first time, create a
            # new partial basic stats.
            stats_for_feature = accumulator.get(feature_path)
            if stats_for_feature is None:
                stats_for_feature = _PartialBasicStats(
                    self._weight_feature is not None)
                # Store empty summary.
                stats_for_feature.common_stats.num_values_summary = (
                    self._num_values_quantiles_combiner.create_accumulator())
                stats_for_feature.numeric_stats.quantiles_summary = (
                    self._values_quantiles_combiner.create_accumulator())
                accumulator[feature_path] = stats_for_feature

            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, feature_column.type)
            stats_for_feature.common_stats.update(
                feature_column, feature_type,
                self._num_values_quantiles_combiner, weights)
            if (is_categorical_feature or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                stats_for_feature.string_stats.update(feature_column)
            elif feature_type is not None:
                stats_for_feature.numeric_stats.update(
                    feature_column, self._values_quantiles_combiner, weights)
        return accumulator
Example #16
    def add_input(self, accumulator: _PartialTimeStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialTimeStats:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidated:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

            def _maybe_get_utf8(val):
                return stats_util.maybe_get_utf8(val) if isinstance(
                    val, bytes) else val

            values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            maybe_utf8 = np.vectorize(_maybe_get_utf8,
                                      otypes=[object])(values)
            if not maybe_utf8.all():
                accumulator.invalidated = True
                return accumulator
            accumulator.update(maybe_utf8, feature_type)
        elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
            values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            accumulator.update(values, feature_type)
        else:
            accumulator.invalidated = True

        return accumulator
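
stats_util.maybe_get_utf8 is used above as "the decoded string, or None if the bytes are not valid UTF-8". A one-function sketch of that contract (the real helper may differ in details):

def maybe_get_utf8_sketch(value):
    """Returns value decoded as UTF-8, or None if it does not decode."""
    try:
        return value.decode('utf-8')
    except UnicodeDecodeError:
        return None


assert maybe_get_utf8_sketch(b'2021-01-01') == '2021-01-01'
assert maybe_get_utf8_sketch(b'\xff\xfe') is None
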
    def _get_univalent_values_with_parent_indices(
            self,
            examples: pa.RecordBatch) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_name, feat_arr in zip(examples.schema.names,
                                          examples.columns):
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feat_arr.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (None,
                                statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            value_lengths = np.asarray(
                array_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            flattened, value_parent_indices = arrow_util.flatten_nested(
                feat_arr, True)
            non_missing_values = np.asarray(flattened)
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        def _is_non_utf8(value):
            return (isinstance(value, bytes)
                    and stats_util.maybe_get_utf8(value) is None)

        is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
        classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
        values = np.asarray(
            arrow_util.flatten_nested(feature_array)[0].slice(
                0, _CROP_AT_VALUES))
        if np.any(is_non_utf_vec(values)):
            accumulator.invalidate = True
            return accumulator
        accumulator.considered += values.size
        accumulator.matched += np.sum(classify_vec(values))
        return accumulator
    def add_input(self, accumulator, input_column):
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      input_column: An arrow column representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidated:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            input_column.name, input_column.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

            def _maybe_get_utf8(val):
                return stats_util.maybe_get_utf8(val) if isinstance(
                    val, bytes) else val

            for feature_array in input_column.data.iterchunks():
                values = arrow_util.FlattenListArray(feature_array).to_pandas()
                maybe_utf8 = np.vectorize(_maybe_get_utf8,
                                          otypes=[object])(values)
                if not maybe_utf8.all():
                    accumulator.invalidated = True
                    return accumulator
                accumulator.update(maybe_utf8, feature_type)

        elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
            for feature_array in input_column.data.iterchunks():
                values = arrow_util.FlattenListArray(feature_array).to_pandas()
                accumulator.update(values, feature_type)

        else:
            accumulator.invalidated = True
        return accumulator
  def add_input(
      self, accumulator: Dict[types.FeaturePath, _PartialBasicStats],
      examples: pa.RecordBatch
      ) -> Dict[types.FeaturePath, _PartialBasicStats]:
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
        examples,
        example_weight_map=self._example_weight_map,
        enumerate_leaves_only=False):
      stats_for_feature = accumulator.get(feature_path)
      if stats_for_feature is None:
        stats_for_feature = _PartialBasicStats(
            weights is not None, self._make_quantiles_sketch_fn)
        accumulator[feature_path] = stats_for_feature

      feature_type = stats_util.get_feature_type_from_arrow_type(
          feature_path, feature_array.type)
      stats_for_feature.common_stats.update(feature_path,
                                            feature_array, feature_type,
                                            self._make_quantiles_sketch_fn,
                                            weights)
      # The user may make certain claims about a feature's data type
      # (e.g. _bytes_features imply string data type). However we should not
      # trust those claims because TFDV is also responsible for detecting
      # mismatching types. We collect stats according to the actual type, and
      # only when the actual type matches the claim do we collect the
      # type-specific stats (like for categorical int and bytes features).
      if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
        if feature_path in self._bytes_features:
          stats_for_feature.bytes_stats.update(feature_array)
        else:
          stats_for_feature.string_stats.update(feature_array)
      elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
        if feature_path in self._categorical_features:
          stats_for_feature.string_stats.update(feature_array)
        else:
          stats_for_feature.numeric_stats.update(feature_array, weights)
      elif feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
        stats_for_feature.numeric_stats.update(feature_array, weights)

    return accumulator
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
    int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    # Skip null columns.
    if feature_type is None:
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(
            flattened_values_np, weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
Example #22
    def add_input(self, accumulator, input_column):
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      input_column: An arrow column representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            input_column.name, input_column.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        def _is_non_utf8(value):
            return (isinstance(value, bytes)
                    and stats_util.maybe_get_utf8(value) is None)

        is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
        classify_vec = np.vectorize(self._classifier.classify,
                                    otypes=[bool])
        for feature_array in input_column.data.iterchunks():
            values = arrow_util.FlattenListArray(feature_array).to_pandas()
            if np.any(is_non_utf_vec(values)):
                accumulator.invalidate = True
                return accumulator
            accumulator.considered += values.size
            accumulator.matched += np.sum(classify_vec(values))
        return accumulator
    def add_input(self, accumulator, examples_table):
        weights_column = None
        if self._weight_feature:
            weights_column = examples_table.column(self._weight_feature)

        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            is_categorical_feature = feature_name in self._categorical_features

            # If we encounter this feature for the first time, create a
            # new partial basic stats.
            if feature_name not in accumulator:
                partial_stats = _PartialBasicStats(
                    self._weight_feature is not None)
                # Store empty summary.
                partial_stats.common_stats.num_values_summary = (
                    self._num_values_quantiles_combiner.create_accumulator())
                partial_stats.numeric_stats.quantiles_summary = (
                    self._values_quantiles_combiner.create_accumulator())
                accumulator[feature_name] = partial_stats

            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            accumulator[feature_name].common_stats.update(
                feature_column, feature_type,
                self._num_values_quantiles_combiner, weights_column)
            if (is_categorical_feature or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                accumulator[feature_name].string_stats.update(feature_column)
            elif feature_type is not None:
                accumulator[feature_name].numeric_stats.update(
                    feature_column, self._values_quantiles_combiner,
                    weights_column)
        return accumulator
Example #24
def _get_example_value_presence(
        record_batch: pa.RecordBatch, path: types.FeaturePath,
        boundaries: Optional[Sequence[float]],
        weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
    """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch with
  the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be
  pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the array
      values.
    weight_column_name: Optionally, a weight column to return in addition to the
      value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices, along with the corresponding flattened example weights. The index
    will be the example indices and the values will be stored in a column named
    'values'. If weight_column_name is provided, a second column named
    'weights' will contain the weight of the example from which each value
    came.
  """
    arr, example_indices = arrow_util.get_array(record_batch,
                                                path,
                                                return_example_indices=True)
    if stats_util.get_feature_type_from_arrow_type(path, arr.type) is None:
        return None

    arr_flat, parent_indices = arrow_util.flatten_nested(
        arr, return_parent_indices=True)
    is_binary_like = arrow_util.is_binary_like(arr_flat.type)
    assert boundaries is None or not is_binary_like, (
        'Boundaries can only be applied to numeric columns')
    if is_binary_like:
        # Use dictionary_encode so we can use np.unique on object arrays.
        dict_array = arr_flat.dictionary_encode()
        arr_flat = dict_array.indices
        arr_flat_dict = np.asarray(dict_array.dictionary)
    example_indices_flat = example_indices[parent_indices]
    if boundaries is not None:
        element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
        rows = np.vstack([example_indices_flat[element_indices], bins])
    else:
        rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
    if not rows.size:
        return None
    # Deduplicate values which show up more than once in the same example. This
    # makes P(X=x|Y=y) in the standard lift definition behave as
    # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
    unique_rows = np.unique(rows, axis=1)
    example_indices = unique_rows[0, :]
    values = unique_rows[1, :]
    if is_binary_like:
        # Return binary-like values as a pd.Categorical wrapped in a Series.
        # This makes subsequent operations like pd.merge cheaper.
        values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
    columns = {'example_indices': example_indices, 'values': values}
    if weight_column_name:
        weights = arrow_util.get_weight_feature(record_batch,
                                                weight_column_name)
        columns['weights'] = np.asarray(weights)[example_indices]
    df = pd.DataFrame(columns)
    return df.set_index('example_indices')
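
The deduplication step relies on np.unique(rows, axis=1) treating each (example index, value) column as one item. On a toy matrix matching the docstring example (values shown as strings here; the real code first dictionary-encodes binary-like values to integer codes):

import numpy as np

# Row 0 holds example indices, row 1 the values; example 0 contains 'a'
# twice. np.unique(..., axis=1) keeps one copy per distinct
# (example index, value) column, matching the docstring's pd.Series example.
rows = np.vstack([np.array([0, 0, 0, 1]), np.array(['a', 'a', 'b', 'a'])])
unique_rows = np.unique(rows, axis=1)
assert unique_rows[0, :].tolist() == ['0', '0', '1']
assert unique_rows[1, :].tolist() == ['a', 'b', 'a']
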