Example #1
  def estimate(self):
    # Converts the result struct array into a list of FeatureValueCounts.
    topk_unweighted = self._topk_unweighted.Estimate().to_pylist()
    topk_unweighted_counts = [top_k_uniques_stats_util.FeatureValueCount(
        pair["values"], pair["counts"]) for pair in topk_unweighted]
    topk_weighted = self._topk_weighted.Estimate().to_pylist()
    topk_weighted_counts = [top_k_uniques_stats_util.FeatureValueCount(
        pair["values"], pair["counts"]) for pair in topk_weighted]
    return _CombinedEstimate(
        self._distinct.Estimate(), topk_unweighted_counts, topk_weighted_counts)
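Each Estimate() call above appears to return a pyarrow StructArray (hence the to_pylist() call and the "values"/"counts" keys). A minimal standalone sketch of that conversion, with made-up values:

import pyarrow as pa

# Illustrative only: a struct array shaped like the sketch estimates above.
struct_array = pa.StructArray.from_arrays(
    [pa.array(["a", "b"]), pa.array([3, 1])], names=["values", "counts"])
print(struct_array.to_pylist())
# [{'values': 'a', 'counts': 3}, {'values': 'b', 'counts': 1}]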
Example #2
    def test_make_dataset_feature_stats_proto_topk_single(self):
        expected_result = text_format.Parse(
            """
        features {
          string_stats {
            top_values {
              value: "e"
              frequency: 20.0
            }
            top_values {
              value: "d"
              frequency: 20.0
            }
            top_values {
              value: "a"
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                label: "e"
                sample_count: 20.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "d"
                sample_count: 20.0
              }
            }
          }
          path {
            step: "fa"
          }
    }""", statistics_pb2.DatasetFeatureStatistics())

        value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10), ('b', 5)]
        value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in value_counts
        ]
        result = (top_k_uniques_stats_util.
                  make_dataset_feature_stats_proto_topk_single(
                      types.FeaturePath(['fa']).steps(),
                      value_count_list=value_count_list,
                      categorical_features=frozenset([
                          types.FeaturePath(['fa']),
                          types.FeaturePath(['fb'])
                      ]),
                      is_weighted_stats=False,
                      num_top_values=3,
                      frequency_threshold=1,
                      num_rank_histogram_buckets=2))
        test_util.assert_dataset_feature_stats_proto_equal(
            self, result, expected_result)
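Note that the first rank_histogram bucket above carries no low_rank or high_rank: proto text format omits fields left at their defaults, and rank 0 is the default. A standalone check with a hypothetical bucket:

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import statistics_pb2

# low_rank/high_rank stay unset (0), so they do not appear in the text form.
bucket = statistics_pb2.RankHistogram.Bucket(label="e", sample_count=20.0)
print(text_format.MessageToString(bucket))
# label: "e"
# sample_count: 20.0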
Example #3
    def test_make_feature_stats_proto_topk_uniques_unordered(self):
        expected_result = text_format.Parse(
            """
        path {
          step: 'fa'
        }
        type: INT
        string_stats {
          unique: 4
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
          }
    }""", statistics_pb2.FeatureNameStatistics())

        value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
        top_k_value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in value_counts
        ]
        result = (
            top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
                types.FeaturePath(['fa']),
                is_categorical=True,
                num_top_values=3,
                frequency_threshold=1,
                num_rank_histogram_buckets=2,
                num_unique=4,
                value_count_list=top_k_value_count_list))
        test_util.assert_feature_proto_equal(self, result, expected_result)
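The expected proto above pairs type: INT with string_stats: the call passes is_categorical=True, so the integer feature's top-k stats are reported over its values as strings. For contrast, a hypothetical call with the flag flipped (by analogy with the custom-stats example later on this page, where is_categorical=False goes with type: STRING) might look like:

result_string = (
    top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
        types.FeaturePath(['fa']),
        is_categorical=False,  # presumably reported as type: STRING
        num_top_values=3,
        frequency_threshold=1,
        num_rank_histogram_buckets=2,
        num_unique=4,
        value_count_list=top_k_value_count_list))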
Example #4
    def extract_output(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts]
    ) -> statistics_pb2.DatasetFeatureStatistics:

        result = statistics_pb2.DatasetFeatureStatistics()
        for feature_path, value_counts in accumulator.items():
            if not value_counts.unweighted_counts:
                assert not value_counts.weighted_counts
                continue
            feature_value_counts = [
                top_k_uniques_stats_util.FeatureValueCount(key, value)
                for key, value in value_counts.unweighted_counts.items()
            ]
            weighted_feature_value_counts = None
            if value_counts.weighted_counts:
                weighted_feature_value_counts = [
                    top_k_uniques_stats_util.FeatureValueCount(key, value)
                    for key, value in value_counts.weighted_counts.items()
                ]

            feature_stats_proto = (
                top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
                    feature_path=feature_path,
                    is_categorical=feature_path in self._categorical_features,
                    frequency_threshold=self._frequency_threshold,
                    weighted_frequency_threshold=self.
                    _weighted_frequency_threshold,
                    num_top_values=self._num_top_values,
                    num_rank_histogram_buckets=self.
                    _num_rank_histogram_buckets,
                    num_unique=len(feature_value_counts),
                    value_count_list=feature_value_counts,
                    weighted_value_count_list=weighted_feature_value_counts))

            new_feature_stats_proto = result.features.add()
            new_feature_stats_proto.CopyFrom(feature_stats_proto)
        return result
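extract_output() receives a Dict[types.FeaturePath, _ValueCounts]. The real _ValueCounts is defined elsewhere in the generator module; a minimal hypothetical stand-in exposing just the two fields read above is enough to exercise this loop:

import collections

from tensorflow_data_validation import types

# Hypothetical stand-in; the real _ValueCounts lives in the generator module.
_ValueCounts = collections.namedtuple(
    '_ValueCounts', ['unweighted_counts', 'weighted_counts'])

accumulator = {
    types.FeaturePath(['fa']): _ValueCounts(
        unweighted_counts={'a': 3, 'b': 1},
        weighted_counts={'a': 1.5, 'b': 0.5}),
}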
Example #5
  def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:

    def _sum_pairwise(
        iter_of_pairs: Iterator[Tuple[Union[int, float], Union[int, float]]]
    ) -> Tuple[Union[int, float], Union[int, float]]:
      """Computes sum of counts and weights."""
      # We take advantage of the fact that constructing an ndarray from a
      # list is much faster, as the length is known beforehand.
      if isinstance(iter_of_pairs, list):
        arr = np.array(
            iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
      else:
        arr = np.fromiter(
            iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
      return arr['c'].sum(), arr['w'].sum()

    has_any_weight = bool(self._example_weight_map.all_weight_features())
    if has_any_weight:
      sum_fn = _sum_pairwise
    else:
      # For non-weighted case, use sum combine fn over integers to allow Beam
      # to use Cython combiner.
      sum_fn = sum
    top_k_tuples_combined = (
        pcoll
        | 'ToTopKTuples' >> beam.FlatMap(
            _to_topk_tuples,
            bytes_features=self._bytes_features,
            categorical_features=self._categorical_features,
            example_weight_map=self._example_weight_map)
        | 'CombineCountsAndWeights' >> beam.CombinePerKey(sum_fn)
        | 'Rearrange' >> beam.MapTuple(lambda k, v: ((k[0], k[1]), (v, k[2]))))
    # (slice_key, feature_path_steps), (count_and_maybe_weight, value)

    top_k = top_k_tuples_combined
    if has_any_weight:
      top_k |= 'Unweighted_DropWeightsAndRearrange' >> beam.MapTuple(
          lambda k, v: (k, (v[0][0], v[1])))
      # (slice_key, feature_path_steps), (count, value)

    top_k = (
        top_k
        | 'Unweighted_TopK' >> beam.combiners.Top.PerKey(
            max(self._num_top_values, self._num_rank_histogram_buckets))
        | 'Unweighted_ToFeatureValueCount' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (k, [top_k_uniques_stats_util.FeatureValueCount(
                t[1], t[0]) for t in v])
            # pylint: enable=g-long-lambda
            )
        | 'Unweighted_ToProto' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (
                k[0],
                top_k_uniques_stats_util.
                make_dataset_feature_stats_proto_topk_single(
                    feature_path_tuple=k[1],
                    value_count_list=v,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=False,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._frequency_threshold,
                    num_rank_histogram_buckets=self._num_rank_histogram_buckets)
                )
            # pylint: enable=g-long-lambda
            ))
    # (slice_key, DatasetFeatureStatistics)

    uniques = (
        top_k_tuples_combined
        | 'Uniques_Keys' >> beam.Keys()
        | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement()
        | 'Uniques_ConvertToSingleFeatureStats' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (
                k[0],
                top_k_uniques_stats_util.
                make_dataset_feature_stats_proto_unique_single(
                    feature_path_tuple=k[1],
                    num_uniques=v,
                    categorical_features=self._categorical_features))
            # pylint: enable=g-long-lambda
            ))
    # (slice_key, DatasetFeatureStatistics)

    result_protos = [top_k, uniques]

    if has_any_weight:
      weighted_top_k = (
          top_k_tuples_combined
          | 'Weighted_DropCountsAndRearrange'
          >> beam.MapTuple(lambda k, v: (k, (v[0][1], v[1])))
          # (slice_key, feature), (weight, value)
          | 'Weighted_TopK' >> beam.combiners.Top.PerKey(
              max(self._num_top_values, self._num_rank_histogram_buckets))
          | 'Weighted_ToFeatureValueCount' >> beam.MapTuple(
              # pylint: disable=g-long-lambda
              lambda k, v: (k, [top_k_uniques_stats_util.FeatureValueCount(
                  t[1], t[0]) for t in v])
              # pylint: enable=g-long-lambda
              )
          | 'Weighted_ToProto' >> beam.MapTuple(
              # pylint: disable=g-long-lambda
              lambda k, v:
              (k[0],
               top_k_uniques_stats_util.
               make_dataset_feature_stats_proto_topk_single(
                   feature_path_tuple=k[1],
                   value_count_list=v,
                   categorical_features=self._categorical_features,
                   is_weighted_stats=True,
                   num_top_values=self._num_top_values,
                   frequency_threshold=self._weighted_frequency_threshold,
                   num_rank_histogram_buckets=self._num_rank_histogram_buckets
               ))
              # pylint: enable=g-long-lambda
              ))
      # (slice_key, DatasetFeatureStatistics)

      result_protos.append(weighted_top_k)

    return (result_protos
            | 'FlattenTopKUniquesFeatureStatsProtos' >> beam.Flatten())
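For orientation, here is a self-contained sketch (plain Beam, illustrative keys and values, unweighted path only) of the count-then-top-k shape that expand() builds above:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create' >> beam.Create([('fa', 'a'), ('fa', 'a'), ('fa', 'b')])
      # Pair each (feature, value) with a count of 1, keyed for combining.
      | 'ToKeyedOnes' >> beam.Map(lambda fv: ((fv[0], fv[1]), 1))
      | 'CountPerValue' >> beam.CombinePerKey(sum)
      # Re-key by feature so Top.PerKey ranks (count, value) tuples,
      # count first -- the same ordering trick used in expand().
      | 'Rearrange' >> beam.MapTuple(lambda k, c: (k[0], (c, k[1])))
      | 'TopK' >> beam.combiners.Top.PerKey(2)
      | 'Print' >> beam.Map(print))  # ('fa', [(2, 'a'), (1, 'b')])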
Example #6
    def test_make_feature_stats_proto_topk_uniques(self):
        expected_result = text_format.Parse(
            """
        path {
          step: "fa"
        }
        string_stats {
          unique: 5
          top_values {
            value: "a"
            frequency: 3.0
          }
          top_values {
            value: "e"
            frequency: 2.0
          }
          top_values {
            value: "d"
            frequency: 2.0
          }
          rank_histogram {
            buckets {
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "e"
              sample_count: 2.0
            }
          }
          weighted_string_stats {
            top_values {
              value: "e"
              frequency: 20.0
            }
            top_values {
              value: "d"
              frequency: 20.0
            }
            top_values {
              value: "a"
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                label: "e"
                sample_count: 20.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "d"
                sample_count: 20.0
              }
            }
          }
        }
        """, statistics_pb2.FeatureNameStatistics())

        unweighted_value_counts = [('a', 3), ('e', 2), ('d', 2), ('c', 2),
                                   ('b', 1)]
        weighted_value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10),
                                 ('b', 5)]
        top_k_value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in unweighted_value_counts
        ]
        top_k_value_count_list_weighted = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in weighted_value_counts
        ]
        result = (
            top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
                types.FeaturePath(['fa']),
                num_top_values=3,
                frequency_threshold=1,
                weighted_frequency_threshold=1.,
                num_rank_histogram_buckets=2,
                num_unique=5,
                value_count_list=top_k_value_count_list,
                weighted_value_count_list=top_k_value_count_list_weighted))
        test_util.assert_feature_proto_equal(self, result, expected_result)
    def test_make_feature_stats_proto_topk_uniques_custom_stats(self):
        expected_result = text_format.Parse(
            """
        path {
          step: "fa"
        }
        type: STRING
        custom_stats {
          name: "topk_sketch_rank_histogram"
          rank_histogram {
            buckets {
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "e"
              sample_count: 2.0
            }
          }
        }
        custom_stats {
          name: "weighted_topk_sketch_rank_histogram"
          rank_histogram {
              buckets {
                label: "e"
                sample_count: 20.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "d"
                sample_count: 20.0
              }
          }
        }
        custom_stats {
          name: "uniques_sketch_num_uniques"
          num: 5
        }
        """, statistics_pb2.FeatureNameStatistics())

        unweighted_value_counts = [('a', 3), ('e', 2), ('d', 2), ('c', 2),
                                   ('b', 1)]
        weighted_value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10),
                                 ('b', 5)]
        top_k_value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in unweighted_value_counts
        ]
        top_k_value_count_list_weighted = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in weighted_value_counts
        ]
        result = (
            top_k_uniques_stats_util.
            make_feature_stats_proto_topk_uniques_custom_stats(
                types.FeaturePath(['fa']),
                is_categorical=False,
                num_top_values=3,
                frequency_threshold=1,
                weighted_frequency_threshold=1.,
                num_rank_histogram_buckets=2,
                num_unique=5,
                value_count_list=top_k_value_count_list,
                weighted_value_count_list=top_k_value_count_list_weighted))
        test_util.assert_feature_proto_equal(self, result, expected_result)