    def test_validate_instance_global_only_anomaly_type(self):
        instance = {'annotated_enum': np.array(['D'])}
        # This schema has a presence.min_count > 1, which will generate an
        # anomaly of type FEATURE_TYPE_LOW_NUMBER_PRESENT when any single
        # example is validated using this schema. This test checks that this
        # anomaly type (which is not meaningful in per-example validation) is
        # not included in the Anomalies proto that validate_instance returns.
        schema = text_format.Parse(
            """
        string_domain {
          name: "MyAloneEnum"
          value: "A"
          value: "B"
          value: "C"
        }
        feature {
          name: "annotated_enum"
          value_count {
            min:1
            max:1
          }
          presence {
            min_count: 5
          }
          type: BYTES
          domain: "MyAloneEnum"
        }
        feature {
          name: "ignore_this"
          lifecycle_stage: DEPRECATED
          value_count {
            min:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())
        expected_anomalies = {
            'annotated_enum':
            text_format.Parse(
                """
            description: "Examples contain values missing from the schema: D "
              "(~100%). "
            severity: ERROR
            short_description: "Unexpected string values"
            reason {
              type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
              short_description: "Unexpected string values"
              description: "Examples contain values missing from the schema: D "
                "(~100%). "
            }
            """, anomalies_pb2.AnomalyInfo())
        }
        options = stats_options.StatsOptions(schema=schema)
        anomalies = validation_api.validate_instance(instance, options)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
Example #2
    def test_validate_stats_with_previous_stats(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 2
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats { num_non_missing: 2 num_missing: 0 max_num_values: 1 }
              rank_histogram {
                buckets { label: "a" sample_count: 1 }
                buckets { label: "b" sample_count: 1 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        previous_statistics = text_format.Parse(
            """
        datasets {
          num_examples: 4
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats { num_non_missing: 4 num_missing: 0 max_num_values: 1 }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        schema = text_format.Parse(
            """
        feature {
          name: "annotated_enum"
          type: BYTES
          domain: "annotated_enum"
          drift_comparator { infinity_norm { threshold: 0.01 } }
        }
        string_domain { name: "annotated_enum" value: "a" }
        """, schema_pb2.Schema())

        expected_anomalies = {
            'annotated_enum':
            text_format.Parse(self._annotated_enum_anomaly_info,
                              anomalies_pb2.AnomalyInfo())
        }
        # Validate the stats.
        anomalies = validation_api.validate_statistics(
            statistics, schema, previous_statistics=previous_statistics)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
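As a quick sanity check on the numbers used above (this is only arithmetic, not TFDV's code path): the infinity_norm drift comparator flags the feature when the L-infinity distance between the normalized value distributions of the current and previous statistics exceeds the schema's threshold, here 0.01. The expected proto itself, self._annotated_enum_anomaly_info, is defined elsewhere in the test module.

# Normalized value frequencies taken from the two rank histograms above.
current = {'a': 1 / 2, 'b': 1 / 2}    # 2 current examples
previous = {'a': 3 / 4, 'b': 1 / 4}   # 4 previous examples
linf = max(abs(current[v] - previous[v]) for v in current)
assert linf == 0.25 > 0.01  # above the 0.01 threshold, so a drift anomaly is expected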
    def test_validate_instance(self):
        instance = {'annotated_enum': np.array(['D'])}
        schema = text_format.Parse(
            """
        string_domain {
          name: "MyAloneEnum"
          value: "A"
          value: "B"
          value: "C"
        }
        feature {
          name: "annotated_enum"
          value_count {
            min:1
            max:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
          domain: "MyAloneEnum"
        }
        feature {
          name: "ignore_this"
          lifecycle_stage: DEPRECATED
          value_count {
            min:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())
        expected_anomalies = {
            'annotated_enum':
            text_format.Parse(
                """
            description: "Examples contain values missing from the schema: D "
              "(~100%). "
            severity: ERROR
            short_description: "Unexpected string values"
            reason {
              type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
              short_description: "Unexpected string values"
              description: "Examples contain values missing from the schema: D "
                "(~100%). "
            }
            """, anomalies_pb2.AnomalyInfo())
        }
        options = stats_options.StatsOptions(schema=schema)
        anomalies = validation_api.validate_instance(instance, options)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
    def test_validate_instance_environment(self):
        instance = {'feature': np.array(['A'])}
        schema = text_format.Parse(
            """
        default_environment: "TRAINING"
        default_environment: "SERVING"
        feature {
          name: "label"
          not_in_environment: "SERVING"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        feature {
          name: "feature"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        """, schema_pb2.Schema())
        options = stats_options.StatsOptions(schema=schema)

        # Validate the instance in TRAINING environment.
        expected_anomalies_training = {
            'label':
            text_format.Parse(
                """
            description: "Column is completely missing"
            severity: ERROR
            short_description: "Column dropped"
            reason {
              type: SCHEMA_MISSING_COLUMN
              short_description: "Column dropped"
              description: "Column is completely missing"
            }
            """, anomalies_pb2.AnomalyInfo())
        }
        anomalies_training = validation_api.validate_instance(
            instance, options, environment='TRAINING')
        self._assert_equal_anomalies(anomalies_training,
                                     expected_anomalies_training)

        # Validate the instance in SERVING environment.
        anomalies_serving = validation_api.validate_instance(
            instance, options, environment='SERVING')
        self._assert_equal_anomalies(anomalies_serving, {})
Example #5
    def test_validate_stats_with_environment(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 1000
          features {
            name: 'feature'
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1000
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        schema = text_format.Parse(
            """
        default_environment: "TRAINING"
        default_environment: "SERVING"
        feature {
          name: "label"
          not_in_environment: "SERVING"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        feature {
          name: "feature"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        """, schema_pb2.Schema())

        expected_anomalies_training = {
            'label':
            text_format.Parse(
                """
            description: "Column is completely missing"
            severity: ERROR
            short_description: "Column dropped"
            reason {
              type: SCHEMA_MISSING_COLUMN
              short_description: "Column dropped"
              description: "Column is completely missing"
            }
            """, anomalies_pb2.AnomalyInfo())
        }
        # Validate the stats in TRAINING environment.
        anomalies_training = validation_api.validate_statistics(
            statistics, schema, environment='TRAINING')
        self._assert_equal_anomalies(anomalies_training,
                                     expected_anomalies_training)

        # Validate the stats in SERVING environment.
        anomalies_serving = validation_api.validate_statistics(
            statistics, schema, environment='SERVING')
        self._assert_equal_anomalies(anomalies_serving, {})
Example #6
    def test_validate_stats_with_serving_stats(self):
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 1 }
                buckets { label: "b" sample_count: 2 }
                buckets { label: "c" sample_count: 7 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        serving_statistics = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
                buckets { label: "c" sample_count: 6 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        schema = text_format.Parse(
            """
        feature {
          name: 'bar'
          type: BYTES
          skew_comparator {
            infinity_norm { threshold: 0.1}
          }
        }""", schema_pb2.Schema())

        expected_anomalies = {
            'bar':
            text_format.Parse(self._bar_anomaly_info,
                              anomalies_pb2.AnomalyInfo())
        }
        # Validate the stats.
        anomalies = validation_api.validate_statistics(
            statistics, schema, serving_statistics=serving_statistics)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
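The skew comparator in this test works the same way as the drift comparator above, but compares training statistics against serving statistics. As a rough check of the figures in the two rank histograms (again, not TFDV's implementation), the L-infinity distance is 0.2, above the 0.1 threshold, so a skew anomaly for 'bar' (self._bar_anomaly_info, defined elsewhere in the test module) is expected.

# Normalized value frequencies from the training and serving rank histograms above.
training = {'a': 0.1, 'b': 0.2, 'c': 0.7}
serving = {'a': 0.3, 'b': 0.1, 'c': 0.6}
linf = max(abs(training[v] - serving[v]) for v in training)
assert abs(linf - 0.2) < 1e-9 and linf > 0.1  # exceeds the 0.1 threshold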
Example #7
    def test_validate_stats(self):
        schema = text_format.Parse(
            """
        string_domain {
          name: "MyAloneEnum"
          value: "A"
          value: "B"
          value: "C"
        }
        feature {
          name: "annotated_enum"
          value_count {
            min:1
            max:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
          domain: "MyAloneEnum"
        }
        feature {
          name: "ignore_this"
          lifecycle_stage: DEPRECATED
          value_count {
            min:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())
        statistics = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 3
                num_non_missing: 7
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
              rank_histogram {
                buckets {
                  label: "D"
                  sample_count: 1
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
        expected_anomalies = {
            'annotated_enum':
            text_format.Parse(
                """
      description: "Examples contain values missing from the schema: D (?). "
      severity: ERROR
      short_description: "Unexpected string values"
      reason {
        type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
        short_description: "Unexpected string values"
        description: "Examples contain values missing from the schema: D (?). "
      }
            """, anomalies_pb2.AnomalyInfo())
        }

        # Validate the stats.
        anomalies = validation_api.validate_statistics(statistics, schema)
        self._assert_equal_anomalies(anomalies, expected_anomalies)
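All of these methods come from a larger test module; the snippets themselves never show the imports, the enclosing test class, or the _assert_equal_anomalies helper they call. Below is a minimal sketch of that scaffolding. Two loud caveats: the module paths are assumptions based on the tensorflow_data_validation package layout and can differ across versions, and the helper is an illustrative stand-in, not the original implementation.

import unittest

import numpy as np
from google.protobuf import text_format

# Assumed module paths; adjust to your tensorflow_data_validation version.
from tensorflow_data_validation.api import validation_api
from tensorflow_data_validation.statistics import stats_options
from tensorflow_metadata.proto.v0 import anomalies_pb2
from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0 import statistics_pb2


class ValidationApiTest(unittest.TestCase):
    """Hypothetical test class holding the methods shown above."""

    def _assert_equal_anomalies(self, actual_anomalies, expected_anomalies):
        # Illustrative stand-in: compare the feature-name -> AnomalyInfo map in
        # the returned Anomalies proto against the expected protos.
        self.assertEqual(len(actual_anomalies.anomaly_info),
                         len(expected_anomalies))
        for feature_name, expected in expected_anomalies.items():
            self.assertIn(feature_name, actual_anomalies.anomaly_info)
            # Drop diff_regions (presentation-only detail) before comparing;
            # the original helper presumably normalizes such fields as well.
            actual_anomalies.anomaly_info[feature_name].ClearField('diff_regions')
            self.assertEqual(actual_anomalies.anomaly_info[feature_name],
                             expected)

    # ... test methods from the snippets above ...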