def estimate(self):
  # Convert the result struct arrays into lists of FeatureValueCounts.
  topk_unweighted = self._topk_unweighted.Estimate().to_pylist()
  topk_unweighted_counts = [
      top_k_uniques_stats_util.FeatureValueCount(pair['values'],
                                                 pair['counts'])
      for pair in topk_unweighted
  ]
  topk_weighted = self._topk_weighted.Estimate().to_pylist()
  topk_weighted_counts = [
      top_k_uniques_stats_util.FeatureValueCount(pair['values'],
                                                 pair['counts'])
      for pair in topk_weighted
  ]
  return _CombinedEstimate(self._distinct.Estimate(), topk_unweighted_counts,
                           topk_weighted_counts)
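# Standalone sketch (not part of the module) of the conversion estimate()
# performs. Assumptions: each sketch's Estimate() yields an Arrow StructArray
# with "values" and "counts" fields, and FeatureValueCount is a
# (feature_value, count) namedtuple; both names below are stand-ins.
import collections

import pyarrow as pa

FeatureValueCount = collections.namedtuple('FeatureValueCount',
                                           ['feature_value', 'count'])

struct_array = pa.StructArray.from_arrays(
    [pa.array(['a', 'b']), pa.array([3, 1])], names=['values', 'counts'])
pairs = struct_array.to_pylist()  # [{'values': 'a', 'counts': 3}, ...]
value_counts = [
    FeatureValueCount(pair['values'], pair['counts']) for pair in pairs
]
assert value_counts[0] == ('a', 3)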
def test_make_dataset_feature_stats_proto_topk_single(self):
  expected_result = text_format.Parse(
      """
      features {
        string_stats {
          top_values {
            value: "e"
            frequency: 20.0
          }
          top_values {
            value: "d"
            frequency: 20.0
          }
          top_values {
            value: "a"
            frequency: 15.0
          }
          rank_histogram {
            buckets {
              label: "e"
              sample_count: 20.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "d"
              sample_count: 20.0
            }
          }
        }
        path {
          step: "fa"
        }
      }""", statistics_pb2.DatasetFeatureStatistics())
  value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10), ('b', 5)]
  value_count_list = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in value_counts
  ]
  result = (
      top_k_uniques_stats_util.make_dataset_feature_stats_proto_topk_single(
          types.FeaturePath(['fa']).steps(),
          value_count_list=value_count_list,
          categorical_features=frozenset(
              [types.FeaturePath(['fa']),
               types.FeaturePath(['fb'])]),
          is_weighted_stats=False,
          num_top_values=3,
          frequency_threshold=1,
          num_rank_histogram_buckets=2))
  test_util.assert_dataset_feature_stats_proto_equal(self, result,
                                                     expected_result)
def test_make_feature_stats_proto_topk_uniques_unordered(self):
  expected_result = text_format.Parse(
      """
      path {
        step: 'fa'
      }
      type: INT
      string_stats {
        unique: 4
        top_values {
          value: 'a'
          frequency: 4
        }
        top_values {
          value: 'c'
          frequency: 3
        }
        top_values {
          value: 'd'
          frequency: 2
        }
        rank_histogram {
          buckets {
            low_rank: 0
            high_rank: 0
            label: "a"
            sample_count: 4.0
          }
          buckets {
            low_rank: 1
            high_rank: 1
            label: "c"
            sample_count: 3.0
          }
        }
      }""", statistics_pb2.FeatureNameStatistics())
  value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
  top_k_value_count_list = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in value_counts
  ]
  result = (
      top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
          types.FeaturePath(['fa']),
          is_categorical=True,
          num_top_values=3,
          frequency_threshold=1,
          num_rank_histogram_buckets=2,
          num_unique=4,
          value_count_list=top_k_value_count_list))
  test_util.assert_feature_proto_equal(self, result, expected_result)
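# Standalone sketch, inferred from the expected protos in these tests (not an
# API guarantee): top-k selection appears to sort by count descending,
# breaking ties by feature value descending, so the input order of
# value_count_list does not matter ("unordered").
value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
ordered = sorted(value_counts, key=lambda vc: (vc[1], vc[0]), reverse=True)
# num_top_values=3 keeps the first three entries as top_values.
assert [value for value, _ in ordered[:3]] == ['a', 'c', 'd']
# num_rank_histogram_buckets=2 keeps the first two as histogram buckets.
assert ordered[:2] == [('a', 4), ('c', 3)]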
def extract_output(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts]
) -> statistics_pb2.DatasetFeatureStatistics:
  result = statistics_pb2.DatasetFeatureStatistics()
  for feature_path, value_counts in accumulator.items():
    if not value_counts.unweighted_counts:
      assert not value_counts.weighted_counts
      continue
    feature_value_counts = [
        top_k_uniques_stats_util.FeatureValueCount(key, value)
        for key, value in value_counts.unweighted_counts.items()
    ]
    weighted_feature_value_counts = None
    if value_counts.weighted_counts:
      weighted_feature_value_counts = [
          top_k_uniques_stats_util.FeatureValueCount(key, value)
          for key, value in value_counts.weighted_counts.items()
      ]
    feature_stats_proto = (
        top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
            feature_path=feature_path,
            is_categorical=feature_path in self._categorical_features,
            frequency_threshold=self._frequency_threshold,
            weighted_frequency_threshold=self._weighted_frequency_threshold,
            num_top_values=self._num_top_values,
            num_rank_histogram_buckets=self._num_rank_histogram_buckets,
            num_unique=len(feature_value_counts),
            value_count_list=feature_value_counts,
            weighted_value_count_list=weighted_feature_value_counts))
    new_feature_stats_proto = result.features.add()
    new_feature_stats_proto.CopyFrom(feature_stats_proto)
  return result
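# Standalone sketch of the invariant extract_output relies on, using a
# hypothetical _ValueCounts stand-in: a feature with no unweighted counts
# must also have no weighted counts, and is skipped entirely.
import collections

ValueCounts = collections.namedtuple(
    'ValueCounts', ['unweighted_counts', 'weighted_counts'])

accumulator = {
    'fa': ValueCounts({'a': 3, 'b': 1}, {'a': 4.5, 'b': 0.5}),
    'fb': ValueCounts({}, {}),  # feature never observed; skipped below
}
emitted = []
for path, vc in accumulator.items():
  if not vc.unweighted_counts:
    assert not vc.weighted_counts
    continue
  # num_unique is derived from the unweighted counts, not the weighted ones.
  emitted.append((path, len(vc.unweighted_counts)))
assert emitted == [('fa', 2)]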
def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:

  def _sum_pairwise(
      iter_of_pairs: Iterator[Tuple[Union[int, float], Union[int, float]]]
  ) -> Tuple[Union[int, float], Union[int, float]]:
    """Computes sums of counts and weights."""
    # Constructing a np array from a list is much faster than from an
    # iterator, as the length is known beforehand.
    if isinstance(iter_of_pairs, list):
      arr = np.array(
          iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
    else:
      arr = np.fromiter(
          iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
    return arr['c'].sum(), arr['w'].sum()

  has_any_weight = bool(self._example_weight_map.all_weight_features())
  if has_any_weight:
    sum_fn = _sum_pairwise
  else:
    # For the non-weighted case, use the builtin sum as the combine fn over
    # integers to allow Beam to use its Cython combiner.
    sum_fn = sum
  top_k_tuples_combined = (
      pcoll
      | 'ToTopKTuples' >> beam.FlatMap(
          _to_topk_tuples,
          bytes_features=self._bytes_features,
          categorical_features=self._categorical_features,
          example_weight_map=self._example_weight_map)
      | 'CombineCountsAndWeights' >> beam.CombinePerKey(sum_fn)
      | 'Rearrange' >> beam.MapTuple(lambda k, v: ((k[0], k[1]), (v, k[2]))))
  # (slice_key, feature_path_steps), (count_and_maybe_weight, value)

  top_k = top_k_tuples_combined
  if has_any_weight:
    top_k |= 'Unweighted_DropWeightsAndRearrange' >> beam.MapTuple(
        lambda k, v: (k, (v[0][0], v[1])))
    # (slice_key, feature_path_steps), (count, value)
  top_k = (
      top_k
      | 'Unweighted_TopK' >> beam.combiners.Top().PerKey(
          max(self._num_top_values, self._num_rank_histogram_buckets))
      | 'Unweighted_ToFeatureValueCount' >> beam.MapTuple(
          # pylint: disable=g-long-lambda
          lambda k, v: (k, [
              top_k_uniques_stats_util.FeatureValueCount(t[1], t[0])
              for t in v
          ])
          # pylint: enable=g-long-lambda
      )
      | 'Unweighted_ToProto' >> beam.MapTuple(
          # pylint: disable=g-long-lambda
          lambda k, v: (
              k[0],
              top_k_uniques_stats_util
              .make_dataset_feature_stats_proto_topk_single(
                  feature_path_tuple=k[1],
                  value_count_list=v,
                  categorical_features=self._categorical_features,
                  is_weighted_stats=False,
                  num_top_values=self._num_top_values,
                  frequency_threshold=self._frequency_threshold,
                  num_rank_histogram_buckets=self._num_rank_histogram_buckets))
          # pylint: enable=g-long-lambda
      ))
  # (slice_key, DatasetFeatureStatistics)

  uniques = (
      top_k_tuples_combined
      | 'Uniques_Keys' >> beam.Keys()
      | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement()
      | 'Uniques_ConvertToSingleFeatureStats' >> beam.MapTuple(
          # pylint: disable=g-long-lambda
          lambda k, v: (
              k[0],
              top_k_uniques_stats_util
              .make_dataset_feature_stats_proto_unique_single(
                  feature_path_tuple=k[1],
                  num_uniques=v,
                  categorical_features=self._categorical_features))
          # pylint: enable=g-long-lambda
      ))
  # (slice_key, DatasetFeatureStatistics)

  result_protos = [top_k, uniques]

  if has_any_weight:
    weighted_top_k = (
        top_k_tuples_combined
        | 'Weighted_DropCountsAndRearrange' >>
        beam.MapTuple(lambda k, v: (k, (v[0][1], v[1])))
        # (slice_key, feature), (weight, value)
        | 'Weighted_TopK' >> beam.combiners.Top().PerKey(
            max(self._num_top_values, self._num_rank_histogram_buckets))
        | 'Weighted_ToFeatureValueCount' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (k, [
                top_k_uniques_stats_util.FeatureValueCount(t[1], t[0])
                for t in v
            ])
            # pylint: enable=g-long-lambda
        )
        | 'Weighted_ToProto' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (
                k[0],
                top_k_uniques_stats_util
                .make_dataset_feature_stats_proto_topk_single(
                    feature_path_tuple=k[1],
                    value_count_list=v,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=True,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._weighted_frequency_threshold,
                    num_rank_histogram_buckets=(
                        self._num_rank_histogram_buckets)))
            # pylint: enable=g-long-lambda
        ))
    # (slice_key, DatasetFeatureStatistics)
    result_protos.append(weighted_top_k)

  return (result_protos
          | 'FlattenTopKUniquesFeatureStatsProtos' >> beam.Flatten())
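# Standalone sketch of _sum_pairwise's structured-array trick: (count, weight)
# pairs are summed field-wise, with np.array used for lists (the length is
# known up front) and np.fromiter for other iterators.
import numpy as np

pairs = [(2, 0.5), (3, 1.5), (1, 2.0)]
dtype = [('c', np.int64), ('w', np.float64)]

arr_from_list = np.array(pairs, dtype=dtype)
arr_from_iter = np.fromiter(iter(pairs), dtype=dtype)

assert (arr_from_list['c'].sum(), arr_from_list['w'].sum()) == (6, 4.0)
assert (arr_from_iter['c'].sum(), arr_from_iter['w'].sum()) == (6, 4.0)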
def test_make_feature_stats_proto_topk_uniques(self):
  expected_result = text_format.Parse(
      """
      path {
        step: "fa"
      }
      string_stats {
        unique: 5
        top_values {
          value: "a"
          frequency: 3.0
        }
        top_values {
          value: "e"
          frequency: 2.0
        }
        top_values {
          value: "d"
          frequency: 2.0
        }
        rank_histogram {
          buckets {
            label: "a"
            sample_count: 3.0
          }
          buckets {
            low_rank: 1
            high_rank: 1
            label: "e"
            sample_count: 2.0
          }
        }
        weighted_string_stats {
          top_values {
            value: "e"
            frequency: 20.0
          }
          top_values {
            value: "d"
            frequency: 20.0
          }
          top_values {
            value: "a"
            frequency: 15.0
          }
          rank_histogram {
            buckets {
              label: "e"
              sample_count: 20.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "d"
              sample_count: 20.0
            }
          }
        }
      }
      """, statistics_pb2.FeatureNameStatistics())
  unweighted_value_counts = [('a', 3), ('e', 2), ('d', 2), ('c', 2), ('b', 1)]
  weighted_value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10),
                           ('b', 5)]
  top_k_value_count_list = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in unweighted_value_counts
  ]
  top_k_value_count_list_weighted = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in weighted_value_counts
  ]
  result = (
      top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
          types.FeaturePath(['fa']),
          num_top_values=3,
          frequency_threshold=1,
          weighted_frequency_threshold=1.,
          num_rank_histogram_buckets=2,
          num_unique=5,
          value_count_list=top_k_value_count_list,
          weighted_value_count_list=top_k_value_count_list_weighted))
  test_util.assert_feature_proto_equal(self, result, expected_result)
def test_make_feature_stats_proto_topk_uniques_custom_stats(self):
  expected_result = text_format.Parse(
      """
      path {
        step: "fa"
      }
      type: STRING
      custom_stats {
        name: "topk_sketch_rank_histogram"
        rank_histogram {
          buckets {
            label: "a"
            sample_count: 3.0
          }
          buckets {
            low_rank: 1
            high_rank: 1
            label: "e"
            sample_count: 2.0
          }
        }
      }
      custom_stats {
        name: "weighted_topk_sketch_rank_histogram"
        rank_histogram {
          buckets {
            label: "e"
            sample_count: 20.0
          }
          buckets {
            low_rank: 1
            high_rank: 1
            label: "d"
            sample_count: 20.0
          }
        }
      }
      custom_stats {
        name: "uniques_sketch_num_uniques"
        num: 5
      }
      """, statistics_pb2.FeatureNameStatistics())
  unweighted_value_counts = [('a', 3), ('e', 2), ('d', 2), ('c', 2), ('b', 1)]
  weighted_value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10),
                           ('b', 5)]
  top_k_value_count_list = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in unweighted_value_counts
  ]
  top_k_value_count_list_weighted = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in weighted_value_counts
  ]
  result = (
      top_k_uniques_stats_util
      .make_feature_stats_proto_topk_uniques_custom_stats(
          types.FeaturePath(['fa']),
          is_categorical=False,
          num_top_values=3,
          frequency_threshold=1,
          weighted_frequency_threshold=1.,
          num_rank_histogram_buckets=2,
          num_unique=5,
          value_count_list=top_k_value_count_list,
          weighted_value_count_list=top_k_value_count_list_weighted))
  test_util.assert_feature_proto_equal(self, result, expected_result)