Пример #1
0
def _update_example_and_missing_count(
        stats: statistics_pb2.DatasetFeatureStatistics) -> None:
    """Fills in the dataset example counts and per-feature missing counts.

    The (weighted) number of examples is read from the custom stats of the
    placeholder feature found at ``_DUMMY_FEATURE_PATH``. That placeholder is
    removed from ``stats`` before the remaining features are updated in place.

    Args:
        stats: Dataset statistics to update in place. Nothing happens when it
            has no features.

    Raises:
        AssertionError: If a top-level feature reports more non-missing
            examples than the total number of examples.
    """
    if not stats.features:
        return

    placeholder = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
    total_examples = stats_util.get_custom_stats(placeholder,
                                                 _NUM_EXAMPLES_KEY)
    weighted_total = stats_util.get_custom_stats(placeholder,
                                                 _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(placeholder)
    has_weights = weighted_total != 0

    for feature in stats.features:
        # Features nested under a STRUCT feature get their num_missing from the
        # basic stats generator, because it is relative to the parent's value
        # count rather than to the number of examples.
        if len(feature.path.step) > 1:
            continue

        stats_kind = feature.WhichOneof('stats')
        if stats_kind is None:
            # No common stats exist for this feature (e.g. only custom stats
            # were generated for a sparse or weighted feature); leave it alone.
            continue

        common = getattr(feature, stats_kind).common_stats
        assert total_examples >= common.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(
                total_examples, common.num_non_missing,
                '.'.join(feature.path.step)))

        missing = int(total_examples - common.num_non_missing)
        common.num_missing = missing
        if common.presence_and_valency_stats:
            # Only the first (outermost) entry is updated.
            common.presence_and_valency_stats[0].num_missing = missing

        if not has_weights:
            continue
        weighted_missing = (
            weighted_total - common.weighted_common_stats.num_non_missing)
        common.weighted_common_stats.num_missing = weighted_missing
        if common.weighted_presence_and_valency_stats:
            common.weighted_presence_and_valency_stats[
                0].num_missing = weighted_missing

    stats.num_examples = int(total_examples)
    stats.weighted_num_examples = weighted_total
Пример #2
0
def lists_to_partitions(
    datasets: DatasetFeatureStatisticsList,
    schema: Schema,
    examples: types.Artifact,
    partitions: List[List[Text]],
) -> List[Partition]:
    """Builds one ``Partition`` per group of feature names.

    Each partition is named by joining its feature names with ``'__'`` and
    carries the statistics, schema entries and example splits restricted to
    those names.

    Args:
        datasets: Statistics of one or more datasets. For every dataset only
            the features whose ``feature_name`` is in the partition are kept.
        schema: Schema whose features, sparse features, weighted features and
            string/float/int domains are filtered by ``name``;
            ``default_environment`` is passed through unchanged.
        examples: Artifact providing ``uri`` and ``split_names``; one
            ``ExampleSplit`` is created per decoded split name.
        partitions: Groups of feature names, one group per partition.

    Returns:
        The partitions, in the same order as ``partitions``.
    """
    result: List[Partition] = []
    for p in partitions:
        name = '__'.join(p)
        # Set membership instead of scanning the list for every feature.
        wanted = frozenset(p)

        partition_datasets = []
        for dataset in datasets.datasets:
            # Select once; the same list feeds both the example count and the
            # features of the filtered dataset.
            selected = [
                f for f in dataset.features if feature_name(f) in wanted
            ]
            non_missing_counts = []
            for feature in selected:
                stats_kind = feature.WhichOneof('stats')
                if stats_kind is None:
                    # Feature without any stats oneof set: it has no
                    # common_stats to read, so it cannot contribute a count.
                    continue
                non_missing_counts.append(
                    getattr(feature, stats_kind).common_stats.num_non_missing)
            # NOTE(review): num_examples is approximated by the smallest
            # non-missing count among the selected features - confirm this is
            # the intended estimate.
            # default=0 keeps a dataset with no matching features from raising
            # ValueError on an empty sequence.
            partition_datasets.append(
                DatasetFeatureStatistics(
                    name=name,
                    num_examples=min(non_missing_counts, default=0),
                    weighted_num_examples=0,
                    features=selected,
                ))

        partition = Partition(
            name=name,
            statistics=DatasetFeatureStatisticsList(
                datasets=partition_datasets),
            example_splits=[
                ExampleSplit(split=split,
                             uri=os.path.join(examples.uri, split)) for split
                in artifact_utils.decode_split_names(examples.split_names)
            ],
            schema=Schema(
                feature=[f for f in schema.feature if f.name in wanted],
                sparse_feature=[
                    f for f in schema.sparse_feature if f.name in wanted
                ],
                weighted_feature=[
                    f for f in schema.weighted_feature if f.name in wanted
                ],
                string_domain=[
                    d for d in schema.string_domain if d.name in wanted
                ],
                float_domain=[
                    d for d in schema.float_domain if d.name in wanted
                ],
                int_domain=[d for d in schema.int_domain if d.name in wanted],
                default_environment=schema.default_environment))
        result.append(partition)
    return result
Пример #3
0
def _update_example_and_missing_count(
        stats: statistics_pb2.DatasetFeatureStatistics) -> None:
    """Updates example count of the dataset and missing count for all features.

    The (weighted) number of examples is read from the custom stats of the
    placeholder feature found at ``_DUMMY_FEATURE_PATH``. That placeholder is
    removed from ``stats`` before the remaining features are updated in place.

    Args:
        stats: Dataset statistics to update in place. Nothing happens when it
            has no features.

    Raises:
        AssertionError: If a top-level feature reports more non-missing
            examples than the total number of examples.
    """
    if not stats.features:
        return
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        # For features nested under a STRUCT feature, their num_missing is computed
        # in the basic stats generator (because their num_missing is relative to
        # their parent's value count).
        if len(feature_stats.path.step) > 1:
            continue
        which_oneof_stats = feature_stats.WhichOneof('stats')
        if which_oneof_stats is None:
            # No stats are populated for this feature. Writing through any
            # oneof member would create it, so leave the feature untouched.
            continue
        # Always go through the member that is actually set. Falling back to
        # string_stats for other cases (e.g. bytes_stats) would populate
        # string_stats and thereby replace the feature's real stats.
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        assert num_examples >= common_stats.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(
                num_examples, common_stats.num_non_missing,
                '.'.join(feature_stats.path.step)))
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        # A weighted example count of zero means there is nothing to derive a
        # weighted missing count from.
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)