예제 #1
0
    def test_empty_inputs(self):
        """Checks that empty inputs yield no skew results and no samples.

        Three pipelines are run one after another: one with an empty training
        collection, one with an empty serving collection, and one where both
        are empty. The same (empty) expectation is asserted for each.
        """
        train_protos, serve_protos, _ = get_test_input(
            include_skewed_features=True, include_close_floats=True)

        # Expect no skew results or sample in each case.
        nothing_expected = []

        scenarios = (
            ([], serve_protos),  # Empty training collection.
            (train_protos, []),  # Empty serving collection.
            ([], []),  # Empty training and serving collections.
        )
        for train_input, serve_input in scenarios:
            with beam.Pipeline() as p:
                train_pcoll = p | 'Create Training' >> beam.Create(
                    train_input)
                serve_pcoll = p | 'Create Serving' >> beam.Create(
                    serve_input)
                detector = feature_skew_detector.DetectFeatureSkewImpl(
                    [_IDENTIFIER1, _IDENTIFIER2], [_IGNORE_FEATURE],
                    sample_size=1)
                skew_result, skew_sample = (
                    (train_pcoll, serve_pcoll) | detector)
                util.assert_that(
                    skew_result,
                    test_util.make_skew_result_equal_fn(
                        self, nothing_expected),
                    'CheckSkewResult')
                util.assert_that(
                    skew_sample,
                    make_sample_equal_fn(self, 0, nothing_expected),
                    'CheckSkewSample')
예제 #2
0
    def test_telemetry(self):
        """Checks the counter for examples missing identifier features.

        Both examples share the first identifier feature, but only the
        training example carries the second one. The test expects the
        'examples_with_missing_identifier_features' counter to be committed
        with a value of 1.
        """
        shared_proto = tf.train.Example()
        shared_proto.features.feature[_IDENTIFIER1].int64_list.value.append(1)

        training_proto = tf.train.Example()
        training_proto.CopyFrom(shared_proto)
        serving_proto = tf.train.Example()
        serving_proto.CopyFrom(shared_proto)

        # Only the training side gets Identifier 2.
        training_proto.features.feature[
            _IDENTIFIER2].int64_list.value.append(2)

        pipeline = beam.Pipeline()
        training_pcoll = pipeline | 'Create Training' >> beam.Create(
            [training_proto])
        serving_pcoll = pipeline | 'Create Serving' >> beam.Create(
            [serving_proto])
        detector = feature_skew_detector.DetectFeatureSkewImpl(
            [_IDENTIFIER1, _IDENTIFIER2])
        # The transform outputs are not inspected; only metrics matter here.
        _, _ = (training_pcoll, serving_pcoll) | detector

        run_result = pipeline.run()
        run_result.wait_until_finish()
        pipeline_metrics = run_result.metrics()

        # Serving example does not have Identifier 2.
        name_filter = beam.metrics.metric.MetricsFilter().with_name(
            'examples_with_missing_identifier_features')
        counters = pipeline_metrics.query(name_filter)['counters']
        self.assertLen(counters, 1)
        self.assertEqual(counters[0].committed, 1)
예제 #3
0
    def test_detect_no_skew(self):
        """Checks the output when the inputs contain no skewed features.

        The only expected skew result is the 'no_skew' feature with matching
        counts, and the skew sample is asserted against an empty list.
        """
        train_protos, serve_protos, _ = get_test_input(
            include_skewed_features=False, include_close_floats=False)

        no_skew_text = """
        feature_name: 'no_skew'
        training_count: 2
        serving_count: 2
        match_count: 2
        diff_count: 0"""
        expected_result = [
            text_format.Parse(no_skew_text,
                              feature_skew_results_pb2.FeatureSkew()),
        ]

        with beam.Pipeline() as p:
            train_pcoll = p | 'Create Training' >> beam.Create(train_protos)
            serve_pcoll = p | 'Create Serving' >> beam.Create(serve_protos)
            detector = feature_skew_detector.DetectFeatureSkewImpl(
                [_IDENTIFIER1, _IDENTIFIER2], [_IGNORE_FEATURE],
                sample_size=2)
            skew_result, skew_sample = (train_pcoll, serve_pcoll) | detector
            util.assert_that(
                skew_result,
                test_util.make_skew_result_equal_fn(self, expected_result),
                'CheckSkewResult')
            util.assert_that(
                skew_sample,
                make_sample_equal_fn(self, 0, []),
                'CheckSkewSample')
예제 #4
0
 def expand(
     self, datasets: Tuple[beam.pvalue.PCollection, beam.pvalue.PCollection]
 ) -> Tuple[beam.pvalue.PCollection, beam.pvalue.PCollection]:
     """Applies the feature skew detection transform to a pair of inputs.

     Args:
       datasets: A tuple of two PCollections that is piped into
         DetectFeatureSkewImpl.

     Returns:
       Whatever DetectFeatureSkewImpl produces for the inputs; annotated as
       a tuple of two PCollections.
     """
     # Forward this transform's stored configuration positionally.
     detector = feature_skew_detector.DetectFeatureSkewImpl(
         self._identifier_features,
         self._features_to_ignore,
         self._sample_size,
         self._float_round_ndigits,
         self._allow_duplicate_identifiers)
     return datasets | 'DetectFeatureSkew' >> detector
예제 #5
0
 def test_no_identifier_features(self):
     """Checks that an empty identifier feature list raises ValueError."""
     train_protos, serve_protos, _ = get_test_input(
         include_skewed_features=False, include_close_floats=False)
     expect_error = self.assertRaisesRegex(
         ValueError, 'At least one feature name must be specified')
     # The error context wraps the pipeline context, as with nested `with`s.
     with expect_error, beam.Pipeline() as p:
         train_pcoll = p | 'Create Training' >> beam.Create(train_protos)
         serve_pcoll = p | 'Create Serving' >> beam.Create(serve_protos)
         _ = ((train_pcoll, serve_pcoll)
              | feature_skew_detector.DetectFeatureSkewImpl([]))
예제 #6
0
    def test_detect_feature_skew(self):
        """Checks the per-feature skew results for skewed test inputs.

        The test input includes skewed features and close floats; one
        FeatureSkew proto is expected for each of 'close_float', 'skewed',
        'training_only', 'serving_only' and 'no_skew'.
        """
        train_protos, serve_protos, _ = get_test_input(
            include_skewed_features=True, include_close_floats=True)

        # Text-format FeatureSkew protos, one per expected feature.
        expected_texts = [
            """
        feature_name: 'close_float'
        training_count: 2
        serving_count: 2
        mismatch_count: 2
        diff_count: 2""",
            """
        feature_name: 'skewed'
        training_count: 2
        serving_count: 2
        mismatch_count: 2
        diff_count: 2""",
            """
        feature_name: 'training_only'
        training_count: 2
        training_only: 2
        diff_count: 2""",
            """
        feature_name: 'serving_only'
        serving_count: 2
        serving_only: 2
        diff_count: 2""",
            """
        feature_name: 'no_skew'
        training_count: 2
        serving_count: 2
        match_count: 2
        diff_count: 0""",
        ]
        expected_result = [
            text_format.Parse(text, feature_skew_results_pb2.FeatureSkew())
            for text in expected_texts
        ]

        with beam.Pipeline() as p:
            train_pcoll = p | 'Create Training' >> beam.Create(train_protos)
            serve_pcoll = p | 'Create Serving' >> beam.Create(serve_protos)
            detector = feature_skew_detector.DetectFeatureSkewImpl(
                [_IDENTIFIER1, _IDENTIFIER2], [_IGNORE_FEATURE])
            skew_result, _ = (train_pcoll, serve_pcoll) | detector
            util.assert_that(
                skew_result,
                test_util.make_skew_result_equal_fn(self, expected_result))
예제 #7
0
    def test_obtain_skew_sample(self):
        """Checks the skew sample output against the known skew pairs.

        A sample size of 1 is requested and the sample output is verified
        with make_sample_equal_fn, using the skew pairs returned by
        get_test_input as the candidate samples.
        """
        train_protos, serve_protos, skew_pairs = get_test_input(
            include_skewed_features=True, include_close_floats=False)

        sample_size = 1
        with beam.Pipeline() as p:
            train_pcoll = p | 'Create Training' >> beam.Create(train_protos)
            serve_pcoll = p | 'Create Serving' >> beam.Create(serve_protos)
            detector = feature_skew_detector.DetectFeatureSkewImpl(
                [_IDENTIFIER1, _IDENTIFIER2], [_IGNORE_FEATURE],
                sample_size=sample_size)
            _, skew_sample = (train_pcoll, serve_pcoll) | detector
            util.assert_that(
                skew_sample,
                make_sample_equal_fn(self, sample_size, skew_pairs))
예제 #8
0
    def test_duplicate_identifiers_not_allowed_with_duplicates(self):
        """Checks that duplicate identifiers are skipped when disallowed.

        Two training examples share the identifier value 1. With
        allow_duplicate_identifiers=False the test expects no skew results
        and the 'skipped_duplicate_identifier' counter committed with a
        value of 1.
        """
        training_example_1 = text_format.Parse(
            """
        features {
          feature {
            key: "id"
            value { int64_list { value: 1 } }
          }
          feature {
            key: "val"
            value { int64_list { value: 100 } }
          }
        }
        """, tf.train.Example())
        training_example_2 = text_format.Parse(
            """
        features {
          feature {
            key: "id"
            value { int64_list { value: 1 } }
          }
          feature {
            key: "val"
            value { int64_list { value: 50 } }
          }
        }
        """, tf.train.Example())
        serving_example = text_format.Parse(
            """
        features {
          feature {
            key: "id"
            value { int64_list { value: 1 } }
          }
          feature {
            key: "val"
            value { int64_list { value: 100 } }
          }
          feature {
            key: "val2"
            value { int64_list { value: 100 } }
          }
        }
        """, tf.train.Example())

        # Build the pipeline without a `with` block. Leaving a
        # `with beam.Pipeline()` context already runs the pipeline, so
        # combining it with the explicit run() below would execute the
        # pipeline twice. A single explicit run gives access to the metrics.
        p = beam.Pipeline()
        training_examples = p | 'Create Training' >> beam.Create(
            [training_example_1, training_example_2])
        serving_examples = p | 'Create Serving' >> beam.Create(
            [serving_example])
        skew_result, _ = ((training_examples, serving_examples)
                          | feature_skew_detector.DetectFeatureSkewImpl(
                              ['id'], [],
                              allow_duplicate_identifiers=False))
        util.assert_that(skew_result,
                         test_util.make_skew_result_equal_fn(self, []))

        runner = p.run()
        runner.wait_until_finish()
        result_metrics = runner.metrics()
        actual_counter = result_metrics.query(
            beam.metrics.metric.MetricsFilter().with_name(
                'skipped_duplicate_identifier'))['counters']
        self.assertLen(actual_counter, 1)
        self.assertEqual(actual_counter[0].committed, 1)
예제 #9
0
 def test_duplicate_identifiers_allowed_with_duplicates(self):
     """Checks skew results when duplicate identifiers are allowed.

     Two training examples share the identifier value 1 and there is a
     single serving example with that identifier. With
     allow_duplicate_identifiers=True the test expects FeatureSkew results
     for 'val' and 'val2'.
     """

     def parse_example(text):
         return text_format.Parse(text, tf.train.Example())

     def parse_skew(text):
         return text_format.Parse(text,
                                  feature_skew_results_pb2.FeatureSkew())

     first_training_text = """
     features {
       feature {
         key: "id"
         value { int64_list { value: 1 } }
       }
       feature {
         key: "val"
         value { int64_list { value: 100 } }
       }
     }
     """
     second_training_text = """
     features {
       feature {
         key: "id"
         value { int64_list { value: 1 } }
       }
       feature {
         key: "val"
         value { int64_list { value: 50 } }
       }
     }
     """
     serving_text = """
     features {
       feature {
         key: "id"
         value { int64_list { value: 1 } }
       }
       feature {
         key: "val"
         value { int64_list { value: 100 } }
       }
       feature {
         key: "val2"
         value { int64_list { value: 100 } }
       }
     }
     """
     first_training = parse_example(first_training_text)
     second_training = parse_example(second_training_text)
     serving = parse_example(serving_text)

     val_skew_text = """
     feature_name: 'val'
     training_count: 2
     serving_count: 2
     match_count: 1
     mismatch_count: 1
     diff_count: 1"""
     val2_skew_text = """
     feature_name: 'val2'
     training_count: 0
     serving_count: 2
     serving_only: 2
     diff_count: 2"""
     expected_result = [parse_skew(val_skew_text), parse_skew(val2_skew_text)]

     with beam.Pipeline() as p:
         train_pcoll = p | 'Create Training' >> beam.Create(
             [first_training, second_training])
         serve_pcoll = p | 'Create Serving' >> beam.Create([serving])
         detector = feature_skew_detector.DetectFeatureSkewImpl(
             ['id'], [], allow_duplicate_identifiers=True)
         skew_result, _ = (train_pcoll, serve_pcoll) | detector
         util.assert_that(
             skew_result,
             test_util.make_skew_result_equal_fn(self, expected_result))
예제 #10
0
    def test_float_precision_configuration(self):
        """Checks how float_round_ndigits changes the 'close_float' result.

        The same inputs are run twice. Without float_round_ndigits the
        'close_float' feature is expected to be reported as mismatching;
        with float_round_ndigits=2 it is expected to be reported as matching.
        The expectations for all other features are shared by both runs.
        """

        def parse_skew(text):
            return text_format.Parse(text,
                                     feature_skew_results_pb2.FeatureSkew())

        train_protos, serve_protos, _ = get_test_input(
            include_skewed_features=True, include_close_floats=True)

        # Expectations that do not depend on the float rounding option.
        shared_texts = [
            """
        feature_name: 'skewed'
        training_count: 2
        serving_count: 2
        mismatch_count: 2
        diff_count: 2""",
            """
        feature_name: 'training_only'
        training_count: 2
        training_only: 2
        diff_count: 2""",
            """
        feature_name: 'serving_only'
        serving_count: 2
        serving_only: 2
        diff_count: 2""",
            """
        feature_name: 'no_skew'
        training_count: 2
        serving_count: 2
        match_count: 2""",
        ]
        shared_expected = [parse_skew(text) for text in shared_texts]

        unrounded_float_text = """
        feature_name: 'close_float'
        training_count: 2
        serving_count: 2
        mismatch_count: 2
        diff_count: 2"""
        expected_unrounded = shared_expected + [
            parse_skew(unrounded_float_text)
        ]

        # Do not set a float_round_ndigits.
        with beam.Pipeline() as p:
            train_pcoll = p | 'Create Training' >> beam.Create(train_protos)
            serve_pcoll = p | 'Create Serving' >> beam.Create(serve_protos)
            detector = feature_skew_detector.DetectFeatureSkewImpl(
                [_IDENTIFIER1, _IDENTIFIER2], [_IGNORE_FEATURE],
                sample_size=1)
            skew_result, _ = (train_pcoll, serve_pcoll) | detector
            util.assert_that(
                skew_result,
                test_util.make_skew_result_equal_fn(self, expected_unrounded))

        rounded_float_text = """
              feature_name: 'close_float'
              training_count: 2
              serving_count: 2
              match_count: 2
              """
        expected_rounded = shared_expected + [parse_skew(rounded_float_text)]

        # Set float_round_ndigits
        with beam.Pipeline() as p:
            train_pcoll = p | 'Create Training' >> beam.Create(train_protos)
            serve_pcoll = p | 'Create Serving' >> beam.Create(serve_protos)
            detector = feature_skew_detector.DetectFeatureSkewImpl(
                [_IDENTIFIER1, _IDENTIFIER2], [_IGNORE_FEATURE],
                sample_size=1,
                float_round_ndigits=2)
            skew_result, _ = (train_pcoll, serve_pcoll) | detector
            util.assert_that(
                skew_result,
                test_util.make_skew_result_equal_fn(self, expected_rounded))