def _make_bytes_stats_proto(
        bytes_stats: _PartialBytesStats,
        total_num_values: int) -> statistics_pb2.BytesStatistics:
    """Build a BytesStatistics proto from accumulated partial bytes stats.

    Args:
        bytes_stats: Partial statistics providing `total_num_bytes`,
            `min_num_bytes` and `max_num_bytes`.
        total_num_values: Number of values the partial statistics cover; used
            as the denominator of the average.

    Returns:
        A BytesStatistics proto. When `total_num_values` is not positive the
        proto is returned with no field set and `bytes_stats` is not read.
    """
    proto = statistics_pb2.BytesStatistics()
    if total_num_values <= 0:
        # Nothing to average over, so hand back the default-valued proto.
        return proto

    mean_num_bytes = bytes_stats.total_num_bytes / total_num_values
    proto.avg_num_bytes = mean_num_bytes
    proto.min_num_bytes = bytes_stats.min_num_bytes
    proto.max_num_bytes = bytes_stats.max_num_bytes
    return proto
def get_dataset_feature_statistics(builder, split):
    """Calculate per-feature statistics and a schema for the specified split.

    Every example of `builder.as_dataset(split=split)` is visited once. For
    each feature the function counts the examples it appears in, records its
    shape and dtype (both must stay identical across examples) and, for
    floating / integer / bool features, tracks a running min and max.

    Args:
        builder: Object exposing `as_dataset(split=...)`. Iterating the result
            must yield dicts mapping feature names to values that provide
            `.shape`, `.dtype` and `.numpy()`.
        split: Split identifier, forwarded unchanged to `builder.as_dataset`.

    Returns:
        A `(statistics, schema)` tuple: a `DatasetFeatureStatistics` proto
        with the example count and one entry per feature, and a `Schema`
        proto with each feature's name, shape and type.

    Raises:
        AssertionError: If an example is not a dict, or if a feature's shape
            or dtype differs between examples.
    """
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # The schema is filled in to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Per-feature accumulators, keyed by feature name.
    feature_to_shape = {}
    feature_to_dtype = {}
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    for example in dataset:
        statistics.num_examples += 1

        assert isinstance(example, dict), (
            "Expected every example to be a dict, got %s" % type(example))

        for feature_name, feature_value in example.items():
            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_shape = feature_value.shape
            feature_dtype = feature_value.dtype

            # TODO(afrozm): What if shapes don't match? Populate ValueCount?
            # Add logic for that.

            # Record the shape on first sight, afterwards require a match.
            if feature_name not in feature_to_shape:
                feature_to_shape[feature_name] = feature_shape
            else:
                assert feature_to_shape[feature_name] == feature_shape, (
                    "Shape of feature %r changed between examples: %s vs %s"
                    % (feature_name, feature_to_shape[feature_name],
                       feature_shape))

            # Record the dtype on first sight, afterwards require a match.
            if feature_name not in feature_to_dtype:
                feature_to_dtype[feature_name] = feature_dtype
            else:
                assert feature_to_dtype[feature_name] == feature_dtype, (
                    "Dtype of feature %r changed between examples: %s vs %s"
                    % (feature_name, feature_to_dtype[feature_name],
                       feature_dtype))

            is_numeric = (
                feature_dtype.is_floating or feature_dtype.is_integer or
                feature_dtype.is_bool)
            if not is_numeric:
                continue

            feature_np = feature_value.numpy()
            # np.min / np.max raise ValueError on zero-size input, so an
            # empty value simply contributes nothing to the running range.
            if np.size(feature_np) == 0:
                continue

            feature_min = np.min(feature_np)
            feature_max = np.max(feature_np)

            # Set or update the min, max.
            if (feature_name not in feature_to_min or
                    feature_to_min[feature_name] > feature_min):
                feature_to_min[feature_name] = feature_min

            if (feature_name not in feature_to_max or
                    feature_to_max[feature_name] < feature_max):
                feature_to_max[feature_name] = feature_max

    # All examples have been processed; the accumulators must describe the
    # same set of features.
    assert feature_to_shape.keys() == feature_to_dtype.keys(), (
        "Shape and dtype bookkeeping disagree on the set of features")
    assert feature_to_shape.keys() == feature_to_num_examples.keys(), (
        "Shape and count bookkeeping disagree on the set of features")

    for feature_name, feature_shape in feature_to_shape.items():
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): What do we do for non fixed size shapes?
        # What to do for scalars?
        for dim in feature_shape.as_list():
            feature.shape.dim.add().size = dim

        feature.type = _FEATURE_TYPE_MAP.get(
            feature_to_dtype[feature_name], schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = (
            feature_to_num_examples[feature_name])
        common_statistics.num_missing = (
            statistics.num_examples - common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added
        # to the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type in (schema_pb2.INT, schema_pb2.FLOAT):
            numeric_statistics = statistics_pb2.NumericStatistics()
            # A range is only known if at least one non-empty numeric value
            # was seen; otherwise min/max are left unset instead of raising
            # KeyError. min and max are always recorded together.
            if feature_name in feature_to_min:
                # Convert numpy scalars (np.bool_, np.float32, ...) to plain
                # floats before assigning; NumericStatistics.min/max are
                # double fields per tensorflow_metadata's statistics.proto.
                numeric_statistics.min = float(feature_to_min[feature_name])
                numeric_statistics.max = float(feature_to_max[feature_name])
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
def get_dataset_feature_statistics(builder, split):
    """Calculate per-feature statistics and a schema for the specified split.

    The split is read through `dataset_utils.dataset_as_numpy`, so each
    example is a dict of numpy arrays or plain Python values. For each
    feature the function counts the examples it appears in and, for numeric
    and bool values, tracks a running min and max. Shapes and types for the
    schema come from the dataset's `output_shapes` / `output_types`.

    Args:
        builder: Object exposing `as_dataset(split=...)`; the returned dataset
            must provide `output_shapes` and `output_types` keyed by feature
            name.
        split: Split identifier, forwarded unchanged to `builder.as_dataset`.

    Returns:
        A `(statistics, schema)` tuple: a `DatasetFeatureStatistics` proto
        with the example count and one entry per non-nested feature, and a
        `Schema` proto with one entry per feature. Features whose shape is a
        nested structure get a schema entry carrying only their name and no
        statistics entry.

    Raises:
        AssertionError: If an example is not a dict.
    """
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # The schema is filled in to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Per-feature accumulators, keyed by feature name.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.dataset_as_numpy(dataset)
    for example in tqdm.tqdm(np_dataset, unit=" examples"):
        statistics.num_examples += 1

        assert isinstance(example, dict), (
            "Expected every example to be a dict, got %s" % type(example))

        for feature_name in sorted(example):
            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility in graph and eager mode, we can get PODs here
            # and everything may not be neatly wrapped up in numpy's ndarray.
            if isinstance(feature_np, np.ndarray):
                feature_dtype = feature_np.dtype.type
            else:
                feature_dtype = type(feature_np)

            # Plain Python bools are accepted alongside np.bool_ so that POD
            # values are classified the same way as their numpy counterparts.
            is_numeric = (np.issubdtype(feature_dtype, np.number) or
                          issubclass(feature_dtype, (np.bool_, bool)))

            # TODO(afrozm): What if shapes don't match? Populate ValueCount?
            # Add logic for that.

            # np.min / np.max raise ValueError on zero-size input, so an
            # empty value simply contributes nothing to the running range.
            if not is_numeric or np.size(feature_np) == 0:
                continue

            feature_min = np.min(feature_np)
            feature_max = np.max(feature_np)

            # Set or update the min, max.
            if (feature_name not in feature_to_min or
                    feature_to_min[feature_name] > feature_min):
                feature_to_min[feature_name] = feature_min

            if (feature_name not in feature_to_max or
                    feature_to_max[feature_name] < feature_max):
                feature_to_max[feature_name] = feature_max

    # All examples have been processed; now build the protos.
    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the
        # Schema proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto. Only `None` means
            # "unknown": a genuine size-0 dimension must stay 0.
            feature.shape.dim.add().size = dim if dim is not None else -1

        feature_type = output_types_dict[feature_name]
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = (
            feature_to_num_examples[feature_name])
        common_statistics.num_missing = (
            statistics.num_examples - common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added
        # to the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type in (schema_pb2.INT, schema_pb2.FLOAT):
            numeric_statistics = statistics_pb2.NumericStatistics()
            # A range is only known if at least one non-empty numeric value
            # was seen; otherwise min/max are left unset instead of raising
            # KeyError. min and max are always recorded together.
            if feature_name in feature_to_min:
                # Convert numpy scalars (np.bool_, np.float32, ...) to plain
                # floats before assigning; NumericStatistics.min/max are
                # double fields per tensorflow_metadata's statistics.proto.
                numeric_statistics.min = float(feature_to_min[feature_name])
                numeric_statistics.max = float(feature_to_max[feature_name])
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema