Exemplo n.º 1
0
 def test_remove_anomaly_types_removes_diff_regions(self):
     """Checks the result of removing one reason from a multi-reason anomaly.

     The input anomaly_info for "feature_1" carries two reasons and two
     diff_regions. After removing ENUM_TYPE_BYTES_NOT_STRING, the expected
     proto keeps only the ENUM_TYPE_UNEXPECTED_STRING_VALUES reason, has
     description and short_description matching that reason, and contains no
     diff_regions.
     """
     # Input fixture: the anomaly_info has multiple diff regions.
     input_proto_text = """
    anomaly_info {
      key: "feature_1"
      value {
           description: "Expected bytes but got string. Examples contain "
             "values missing from the schema."
           severity: ERROR
           short_description: "Multiple errors"
           diff_regions {
             removed {
               start: 1
               contents: "Test contents"
             }
           }
           diff_regions {
             added {
               start: 1
               contents: "Test contents"
             }
           }
           reason {
             type: ENUM_TYPE_BYTES_NOT_STRING
             short_description: "Bytes not string"
             description: "Expected bytes but got string."
           }
           reason {
             type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
             short_description: "Unexpected string values"
             description: "Examples contain values missing from the schema."
           }
       }
     }"""
     # Expected fixture: a single remaining reason and no diff regions.
     expected_proto_text = """
    anomaly_info {
      key: "feature_1"
      value {
           description: "Examples contain values missing from the schema."
           severity: ERROR
           short_description: "Unexpected string values"
           reason {
             type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
             short_description: "Unexpected string values"
             description: "Examples contain values missing from the schema."
           }
       }
     }"""
     types_to_drop = {anomalies_pb2.AnomalyInfo.ENUM_TYPE_BYTES_NOT_STRING}
     actual = text_format.Parse(input_proto_text, anomalies_pb2.Anomalies())
     expected = text_format.Parse(
         expected_proto_text, anomalies_pb2.Anomalies())
     # remove_anomaly_types edits `actual` in place.
     anomalies_util.remove_anomaly_types(actual, types_to_drop)
     compare.assertProtoEqual(self, actual, expected)
Exemplo n.º 2
0
 def test_remove_anomaly_types_does_not_change_proto(
         self, anomaly_types_to_remove, input_anomalies_proto_text):
     """Verifies cases where remove_anomaly_types leaves the proto unchanged.

     Args:
       anomaly_types_to_remove: Anomaly types handed to remove_anomaly_types.
       input_anomalies_proto_text: Text-format Anomalies proto used as input.
     """
     anomalies_under_test = anomalies_pb2.Anomalies()
     text_format.Parse(input_anomalies_proto_text, anomalies_under_test)
     # Snapshot the parsed input before the call so any in-place change to
     # `anomalies_under_test` shows up in the comparison below.
     untouched_snapshot = anomalies_pb2.Anomalies()
     untouched_snapshot.CopyFrom(anomalies_under_test)
     anomalies_util.remove_anomaly_types(
         anomalies_under_test, anomaly_types_to_remove)
     compare.assertProtoEqual(self, anomalies_under_test, untouched_snapshot)
Exemplo n.º 3
0
 def test_remove_anomaly_types_changes_proto(self, anomaly_types_to_remove,
                                             input_anomalies_proto_text,
                                             expected_anomalies_proto_text):
     """Verifies cases where remove_anomaly_types rewrites the proto.

     Args:
       anomaly_types_to_remove: Anomaly types handed to remove_anomaly_types.
       input_anomalies_proto_text: Text-format Anomalies proto used as input.
       expected_anomalies_proto_text: Text-format Anomalies proto the input
         must equal after the call.
     """
     anomalies_under_test = anomalies_pb2.Anomalies()
     text_format.Parse(input_anomalies_proto_text, anomalies_under_test)
     wanted = anomalies_pb2.Anomalies()
     text_format.Parse(expected_anomalies_proto_text, wanted)
     # The call edits `anomalies_under_test` in place.
     anomalies_util.remove_anomaly_types(
         anomalies_under_test, anomaly_types_to_remove)
     compare.assertProtoEqual(self, anomalies_under_test, wanted)
Exemplo n.º 4
0
def validate_instance(instance, options, environment=None):
  """Checks one example against the schema carried by `options`.

  Statistics are computed in memory over a one-element list holding
  `instance`, then validated against `options.schema`. When `environment` is
  given, the schema is filtered by it and the instance is validated against
  the filtered schema. Anomaly types listed in `_GLOBAL_ONLY_ANOMALY_TYPES`
  (those that do not apply on a per-example basis) are removed from the result
  before it is returned.

  Args:
    instance: A single example, as a dict mapping a feature name to a numpy
      array.
    options: `tfdv.StatsOptions` used for generating data statistics. It must
      contain a schema.
    environment: Optional string naming the validation environment. Must be
      one of the default environments specified in the schema. Environments
      express slight schema variations: for instance, a feature named 'LABEL'
      may be required for training (and should be validated) but be expected
      to be missing from serving. That is expressed by defining two distinct
      environments in the schema, ["SERVING", "TRAINING"], and associating
      'LABEL' only with environment "TRAINING".

  Returns:
    An Anomalies protocol buffer.

  Raises:
    ValueError: If `options` is not a StatsOptions object.
    ValueError: If `options` does not contain a schema.
  """
  if not isinstance(options, stats_options.StatsOptions):
    raise ValueError('options must be a StatsOptions object.')
  if options.schema is None:
    raise ValueError('options must include a schema.')

  # The statistics generator is given a list, so wrap the single example.
  single_example_stats = stats_impl.generate_statistics_in_memory(
      [instance], options)
  result = validate_statistics(single_example_stats, options.schema,
                               environment)
  if not result.anomaly_info:
    # Nothing was flagged, so there is nothing to filter.
    return result

  # Anomalies were found: drop the anomaly types that do not apply on a
  # per-example basis from the Anomalies proto.
  anomalies_util.remove_anomaly_types(result, _GLOBAL_ONLY_ANOMALY_TYPES)
  return result