Example #1
    def compute(
        self, examples_record_batch: pa.RecordBatch
    ) -> statistics_pb2.DatasetFeatureStatistics:
        """Computes MI and AMI between all valid features and labels.

        Args:
          examples_record_batch: Arrow record_batch containing a batch of examples.

        Returns:
          DatasetFeatureStatistics proto containing AMI and MI for each feature.

        Raises:
          ValueError: If label_feature does not exist in examples.
        """
        if self._label_feature_is_unique(examples_record_batch):
            result = {}
            for feature_name in examples_record_batch.schema.names:
                feature_path = types.FeaturePath([feature_name])
                if feature_path != self._label_feature:
                    result[feature_path] = {self._custom_stats_key: 0.0}
            return stats_util.make_dataset_feature_stats_proto(result)

        encoded_examples = _encode_examples(examples_record_batch,
                                            self._multivalent_features,
                                            self._categorical_features,
                                            self._features_to_ignore,
                                            self._max_encoding_length)
        if self._normalize_by_max:
            labels = encoded_examples[self._label_feature]
        else:
            labels = encoded_examples.pop(self._label_feature)
        mi_result = self._calculate_mi(encoded_examples, labels, self._seed)
        if self._normalize_by_max:
            mi_result = self._normalize_mi_values(mi_result)
        return stats_util.make_dataset_feature_stats_proto(mi_result)
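
For reference, the dict-to-proto call that each of these compute methods ends with can be exercised on its own. A minimal sketch, assuming TFDV's usual module layout; the feature path and stat names below are illustrative, not taken from the example above:

from tensorflow_data_validation import types
from tensorflow_data_validation.utils import stats_util

# Map each feature path to a dict of custom stat name -> value.
stats = {
    types.FeaturePath(['age']): {
        'adjusted_mutual_information': 0.12,
        'mutual_information': 0.34,
    },
}
# Returns a DatasetFeatureStatistics proto with one FeatureNameStatistics
# entry per feature; each dict entry becomes a custom_stats field.
proto = stats_util.make_dataset_feature_stats_proto(stats)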
Example #2
    def extract_output(self, accumulator):
        """Returns meta-statistics as a DatasetFeatureStatistics proto."""

        valid_stats_summary = _get_partitioned_statistics_summary(
            get_valid_statistics(accumulator.statistics,
                                 self._min_partitions_stat_presence))
        return stats_util.make_dataset_feature_stats_proto(valid_stats_summary)
Example #3
  def compute(self, examples_table: pa.Table
             ) -> statistics_pb2.DatasetFeatureStatistics:
    """Computes MI and AMI between all valid features and labels.

    Args:
      examples_table: Arrow table containing a batch of examples.

    Returns:
      DatasetFeatureStatistics proto containing AMI and MI for each valid
        feature in the dataset. Some features may be filtered out by
        _remove_unsupported_feature_columns if they are invalid. In this
        case, AMI and MI will not be calculated for the invalid features.

    Raises:
      ValueError: If label_feature contains unsupported data.
    """
    examples_table = _remove_unsupported_feature_columns(examples_table,
                                                         self._schema)

    flattened_examples = _flatten_and_impute(examples_table,
                                             self._categorical_features)
    if self._label_feature not in flattened_examples:
      raise ValueError("Label column contains unsupported data.")
    labels = flattened_examples.pop(self._label_feature)
    df = pd.DataFrame(flattened_examples)
    # Boolean list used to mark features as discrete for sk-learn MI computation
    discrete_feature_mask = self._convert_categorical_features_to_numeric(df)
    return stats_util.make_dataset_feature_stats_proto(
        self._calculate_mi(df, labels, discrete_feature_mask, seed=self._seed))
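
The _calculate_mi helper is not shown here. A plausible sketch of the scikit-learn call it presumably wraps, with the discrete-feature mask and seed passed through (mutual_info_regression would be the analogous call for a numeric label; this is an assumption, not the actual implementation):

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

df = pd.DataFrame({'f1': [0.1, 0.4, 0.9, 0.3], 'f2': [0, 1, 1, 0]})
labels = np.array([0, 1, 1, 0])
discrete_feature_mask = [False, True]  # f2 is categorical

# One MI estimate per feature column, in column order.
mi_per_feature = mutual_info_classif(
    df.values, labels,
    discrete_features=discrete_feature_mask,
    random_state=42)  # fixed seed for the nearest-neighbor estimator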
Example #4
 def test_make_dataset_feature_stats_proto(self):
     stats = {
         types.FeaturePath(['feature_1']): {
             'Mutual Information': 0.5,
             'Correlation': 0.1
         },
         types.FeaturePath(['feature_2']): {
             'Mutual Information': 0.8,
             'Correlation': 0.6
         }
     }
     expected = {
         types.FeaturePath(['feature_1']):
         text_format.Parse(
             """
         path {
           step: 'feature_1'
         }
         custom_stats {
           name: 'Correlation'
           num: 0.1
         }
         custom_stats {
           name: 'Mutual Information'
           num: 0.5
         }
        """, statistics_pb2.FeatureNameStatistics()),
         types.FeaturePath(['feature_2']):
         text_format.Parse(
             """
         path {
           step: 'feature_2'
         }
         custom_stats {
           name: 'Correlation'
           num: 0.6
         }
         custom_stats {
           name: 'Mutual Information'
           num: 0.8
         }
        """, statistics_pb2.FeatureNameStatistics())
     }
     actual = stats_util.make_dataset_feature_stats_proto(stats)
     self.assertEqual(len(actual.features), len(expected))
     for actual_feature_stats in actual.features:
         compare.assertProtoEqual(self,
                                  actual_feature_stats,
                                  expected[types.FeaturePath.from_proto(
                                      actual_feature_stats.path)],
                                  normalize_numbers=True)
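
The assertions above also imply how to read values back out of the returned proto. A small sketch, using the field names from the FeatureNameStatistics snippets in the test:

proto = stats_util.make_dataset_feature_stats_proto(stats)
for feature_stats in proto.features:
    for custom_stat in feature_stats.custom_stats:
        # path.step is a repeated string field, e.g. ['feature_1'].
        print(list(feature_stats.path.step), custom_stat.name, custom_stat.num)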
Example #5
 def test_make_dataset_feature_stats_proto(self):
     stats = {
         'feature_1': {
             'Mutual Information': 0.5,
             'Correlation': 0.1
         },
         'feature_2': {
             'Mutual Information': 0.8,
             'Correlation': 0.6
         }
     }
     expected = {
         'feature_1':
         text_format.Parse(
             """
         name: 'feature_1'
         custom_stats {
           name: 'Correlation'
           num: 0.1
         }
         custom_stats {
           name: 'Mutual Information'
           num: 0.5
         }
        """, statistics_pb2.FeatureNameStatistics()),
         'feature_2':
         text_format.Parse(
             """
         name: 'feature_2'
         custom_stats {
           name: 'Correlation'
           num: 0.6
         }
         custom_stats {
           name: 'Mutual Information'
           num: 0.8
         }
        """, statistics_pb2.FeatureNameStatistics())
     }
     actual = stats_util.make_dataset_feature_stats_proto(stats)
     self.assertEqual(len(actual.features), len(expected))
     for actual_feature_stats in actual.features:
         compare.assertProtoEqual(self,
                                  actual_feature_stats,
                                  expected[actual_feature_stats.name],
                                  normalize_numbers=True)
Example #6
    def compute(self, examples):
        """Computes MI and AMI between all valid features and labels.

        Args:
          examples: ExampleBatch containing the feature values for each feature.

        Returns:
          DatasetFeatureStatistics proto containing AMI and MI for each valid
            feature in the dataset. Some features may be filtered out by
            _remove_unsupported_feature_columns if they are invalid. In this
            case, AMI and MI will not be calculated for the invalid features.

        Raises:
          ValueError: If label_feature contains unsupported data.
        """
        if self._label_feature not in examples:
            raise ValueError("Label column does not exist.")

        _remove_unsupported_feature_columns(examples, self._schema)

        if self._label_feature not in examples:
            raise ValueError("Label column contains unsupported data.")

        flattened_examples = _flatten_examples(examples)
        # TODO(b/119414212): Use Ranklab struct feature to handle null values for MI
        imputed_examples = self._impute(flattened_examples)
        labels = imputed_examples.pop(self._label_feature)
        df = pd.DataFrame(imputed_examples)
        # Boolean list used to mark features as discrete for sk-learn MI computation
        discrete_feature_mask = self._convert_categorical_features_to_numeric(
            df)
        return stats_util.make_dataset_feature_stats_proto(
            self._calculate_mi(df,
                               labels,
                               discrete_feature_mask,
                               seed=self._seed))
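
_convert_categorical_features_to_numeric is likewise not shown. One common way to achieve the same effect, sketched here as an assumption rather than the TFDV implementation, is to factorize each categorical column in place and record which columns are discrete:

import pandas as pd

def convert_categorical_to_numeric(df, categorical_columns):
    """Encodes categorical columns as integer codes; returns the discrete mask."""
    discrete_feature_mask = []
    for column in df.columns:
        if column in categorical_columns:
            # Replace each distinct category value with an integer code.
            df[column] = pd.factorize(df[column])[0]
            discrete_feature_mask.append(True)
        else:
            discrete_feature_mask.append(False)
    return discrete_feature_mask

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1.0, 2.5, 3.0]})
mask = convert_categorical_to_numeric(df, {'color'})  # -> [True, False]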