Example #1
    def test_compute(self):
        multiclass_paired_metrics = metrics.MulticlassPairedMetricsImpl()

        indices = ['7f7f85', '345ac4', '3a3112', '88bcda']
        metas = [{'parentId': '345ac4'}, {}, {}, {'parentId': '3a3112'}]

        # No swaps.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [0, 1], [1, 0], [1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'],
                                  null_idx=0), indices, metas)
        testing_utils.assert_deep_almost_equal(self, result, {
            'mean_jsd': 0.0,
            'num_pairs': 2,
            'swap_rate': 0.0
        })

        # One swap.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'],
                                  null_idx=0), indices, metas)
        testing_utils.assert_deep_almost_equal(self, result, {
            'mean_jsd': 0.34657,
            'num_pairs': 2,
            'swap_rate': 0.5
        })

        # Two swaps.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [0, 1]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'],
                                  null_idx=0), indices, metas)
        testing_utils.assert_deep_almost_equal(self, result, {
            'mean_jsd': 0.69315,
            'num_pairs': 2,
            'swap_rate': 1.0
        })

        # Two swaps, no null index.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [0, 1]],
            types.CategoryLabel(), types.MulticlassPreds(vocab=['0', '1']),
            indices, metas)
        testing_utils.assert_deep_almost_equal(self, result, {
            'mean_jsd': 0.69315,
            'num_pairs': 2,
            'swap_rate': 1.0
        })

        # Empty predictions, indices, and meta.
        result = multiclass_paired_metrics.compute_with_metadata(
            [], [], types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'], null_idx=0), [], [])
        testing_utils.assert_deep_almost_equal(self, result, {})
Example #2
 def test_classification_cat(self):
   config = {
       'feature': 'cats',
   }
   result = self.pdp.run_with_metadata([self.dataset.indexed_examples[0]],
                                       self.class_model, self.dataset,
                                       config=config)
   expected = {'One': [0.49, 0.51], 'None': [0.99, 0.01]}
   testing_utils.assert_deep_almost_equal(self, result['probas'], expected)
Example #3
 def test_regression_cat(self):
   config = {
       'feature': 'cats',
   }
   result = self.pdp.run_with_metadata([self.dataset.indexed_examples[0]],
                                       self.reg_model, self.dataset,
                                       config=config)
   expected = {'One': 2.0, 'None': 1.0}
   testing_utils.assert_deep_almost_equal(self, result['score'], expected)
Example #4
 def test_regression_num(self):
   config = {
       'feature': 'num',
   }
   result = self.pdp.run_with_metadata([self.dataset.indexed_examples[0]],
                                       self.reg_model, self.dataset,
                                       config=config)
   expected = {1.0: 2.0, 2.0: 3.0, 3.0: 4.0, 4.0: 5.0, 5.0: 6.0, 6.0: 7.0,
               7.0: 8.0, 8.0: 9.0, 9.0: 10.0, 10.0: 11.0}
   testing_utils.assert_deep_almost_equal(self, result['score'], expected)
Example #5
 def test_multiple_inputs(self):
   config = {
       'feature': 'num',
   }
   result = self.pdp.run_with_metadata(self.dataset.indexed_examples[0:2],
                                       self.reg_model, self.dataset,
                                       config=config)
   expected = {1.0: 1.5, 2.0: 2.5, 3.0: 3.5, 4.0: 4.5, 5.0: 5.5, 6.0: 6.5,
               7.0: 7.5, 8.0: 8.5, 9.0: 9.5, 10.0: 10.5}
   testing_utils.assert_deep_almost_equal(self, result['score'], expected)
Example #6
  def test_tcav_sample_from_positive(self):
    # Tests the case where more concept examples are passed than non-concept
    # examples, so the concept set is sampled from the concept examples.

    random.seed(0)  # Sets seed since create_comparison_splits() uses random.

    # Basic test with dummy outputs from the model.
    examples = [
        {'sentence': 'a'},
        {'sentence': 'b'},
        {'sentence': 'c'},
        {'sentence': 'd'},
        {'sentence': 'e'},
        {'sentence': 'f'},
        {'sentence': 'g'},
        {'sentence': 'h'}]

    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                         indexed_examples=indexed_inputs)
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[3]['id'],
                            indexed_inputs[4]['id'],
                            indexed_inputs[7]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }
    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)

    self.assertLen(result, 1)
    expected = {
        'p_val': 0.80489,
        'random_mean': 0.53333,
        'result': {
            'score': 0.8,
            'cos_sim': [
                0.09527, -0.20442, 0.05141,
                0.14985, 0.06750, -0.28244,
                -0.11022, -0.14479
            ],
            'dot_prods': [
                152.48776, -335.64998, 82.99588,
                247.80113, 109.53684, -461.81805,
                -181.29095, -239.47817
            ],
            'accuracy': 1.0
        }
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])
Example #7
    def test_compute(self):
        multiclass_metrics = metrics.MulticlassMetricsImpl()

        # All correct predictions.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'], [[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
        testing_utils.assert_deep_almost_equal(self, result, {
            'accuracy': 1.0,
            'f1': 1.0,
            'precision': 1.0,
            'recall': 1.0
        })

        # Some incorrect predictions.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'],
            [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
        testing_utils.assert_deep_almost_equal(self, result, {
            'accuracy': 0.5,
            'f1': 0.57143,
            'precision': 0.5,
            'recall': 0.66667
        })

        # All incorrect predictions.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'],
            [[.1, .4, .5], [.2, .7, .1], [.1, 0, .9], [1, 0, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
        testing_utils.assert_deep_almost_equal(self, result, {
            'accuracy': 0.0,
            'f1': 0.0,
            'precision': 0.0,
            'recall': 0.0
        })

        # No null index.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'],
            [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2']))
        testing_utils.assert_deep_almost_equal(self, result, {'accuracy': 0.5})

        # Empty labels and predictions.
        result = multiclass_metrics.compute([], [], types.CategoryLabel(),
                                            types.MulticlassPreds(
                                                vocab=['0', '1', '2'],
                                                null_idx=0))
        testing_utils.assert_deep_almost_equal(self, result, {})
Example #8
  def test_tcav(self):
    random.seed(0)  # Sets seed since create_comparison_splits() uses random.

    # Basic test with dummy outputs from the model.
    examples = [
        {'sentence': 'a'},
        {'sentence': 'b'},
        {'sentence': 'c'},
        {'sentence': 'd'},
        {'sentence': 'e'},
        {'sentence': 'f'},
        {'sentence': 'g'},
        {'sentence': 'h'},
        {'sentence': 'i'}]

    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                         indexed_examples=indexed_inputs)
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[3]['id'],
                            indexed_inputs[7]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0,
        'dataset_name': 'test'
    }
    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)

    self.assertLen(result, 1)
    expected = {
        'p_val': 0.13311,
        'random_mean': 0.56667,
        'result': {
            'score': 0.33333,
            'cos_sim': [
                0.088691, -0.12179, 0.16013,
                0.24840, -0.09793, 0.05166,
                -0.21578, -0.06560, -0.14759
            ],
            'dot_prods': [
                189.085096, -266.36317, 344.350498,
                547.144949, -211.663965, 112.502439,
                -472.72066, -144.529598, -323.31888
            ],
            'accuracy': 0.66667
        }
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])
Example #9
  def test_class_num(self):
    config = {
        'feature': 'num',
    }
    result = self.pdp.run_with_metadata([self.dataset.indexed_examples[0]],
                                        self.class_model, self.dataset,
                                        config=config)

    expected = {1.0: [0.49, 0.51], 2.0: [0.48, 0.52], 3.0: [0.47, 0.53],
                4.0: [0.46, 0.54], 5.0: [0.45, 0.55], 6.0: [0.44, 0.56],
                7.0: [0.43, 0.57], 8.0: [0.42, 0.58], 9.0: [0.41, 0.59],
                10.0: [0.4, 0.6]}
    testing_utils.assert_deep_almost_equal(self, result['probas'], expected)
Example #10
    def test_compute(self):
        corpusbleu_metrics = metrics.CorpusBLEU()

        # All correct predictions.
        result = corpusbleu_metrics.compute(
            ['This is a test.', 'Test two', 'A third test example'],
            ['This is a test.', 'Test two', 'A third test example'],
            types.GeneratedText(), types.GeneratedText())
        testing_utils.assert_deep_almost_equal(self, result,
                                               {'corpus_bleu': 100.00000})

        # Some incorrect predictions.
        result = corpusbleu_metrics.compute(
            ['This is a test.', 'Test one', 'A third test'],
            ['This is a test.', 'Test two', 'A third test example'],
            types.GeneratedText(), types.GeneratedText())
        testing_utils.assert_deep_almost_equal(self, result,
                                               {'corpus_bleu': 68.037493})

        result = corpusbleu_metrics.compute(
            ['This is a test.', 'Test one', 'A third test'],
            ['these test.', 'Test two', 'A third test example'],
            types.GeneratedText(), types.GeneratedText())
        testing_utils.assert_deep_almost_equal(
            self, result, {'corpus_bleu': 29.508062388758525})

        # Empty labels and predictions.
        result = corpusbleu_metrics.compute([], [], types.GeneratedText(),
                                            types.GeneratedText())
        testing_utils.assert_deep_almost_equal(self, result, {})
Example #11
    def test_run_nn(self):
        examples = [
            {
                'segment': 'a'
            },
            {
                'segment': 'b'
            },
            {
                'segment': 'c'
            },
        ]
        indexed_inputs = [{
            'id': caching.input_hash(ex),
            'data': ex
        } for ex in examples]

        model = TestModelNearestNeighbors()
        dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                             indexed_examples=indexed_inputs)
        config = {
            'embedding_name': 'input_embs',
            'num_neighbors': 2,
        }
        result = self.nearest_neighbors.run_with_metadata([indexed_inputs[1]],
                                                          model,
                                                          dataset,
                                                          config=config)
        expected = {
            'nearest_neighbors': [{
                'id': '1',
                'nn_distance': 0.0
            }, {
                'id': '0',
                'nn_distance': 1.7320508075688772
            }]
        }

        self.assertLen(result, 1)
        testing_utils.assert_deep_almost_equal(self, expected, result[0])
Example #12
    def test_compute(self):
        regression_metrics = metrics.RegressionMetrics()

        # All correct predictions.
        result = regression_metrics.compute([1, 2, 3, 4], [1, 2, 3, 4],
                                            types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_deep_almost_equal(self, result, {
            'mse': 0,
            'pearsonr': 1.0,
            'spearmanr': 1.0
        })

        # Some incorrect predictions.
        result = regression_metrics.compute([1, 2, 3, 4], [1, 2, 5.5, 6.3],
                                            types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_deep_almost_equal(self, result, {
            'mse': 2.885,
            'pearsonr': 0.96566,
            'spearmanr': 1.0
        })

        # All incorrect predictions (and not monotonic).
        result = regression_metrics.compute([1, 2, 3, 4], [-5, -10, 5, 6],
                                            types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_deep_almost_equal(self, result, {
            'mse': 47.0,
            'pearsonr': 0.79559,
            'spearmanr': 0.799999
        })

        # Empty labels and predictions.
        result = regression_metrics.compute([], [], types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_deep_almost_equal(self, result, {})
Example #13
  def test_relative_tcav(self):
    # Tests passing in a negative set.

    random.seed(0)  # Sets seed since create_comparison_splits() uses random.

    # Basic test with dummy outputs from the model.
    examples = [
        {'sentence': 'happy'},  # 0
        {'sentence': 'sad'},  # 1
        {'sentence': 'good'},  # 2
        {'sentence': 'bad'},  # 3
        {'sentence': 'pretty'},  # 4
        {'sentence': 'ugly'},  # 5
        {'sentence': 'sweet'},  # 6
        {'sentence': 'bitter'},  # 7
        {'sentence': 'well'},  # 8
        {'sentence': 'poor'},  # 9
        {'sentence': 'compelling'},  # 10
        {'sentence': 'boring'},  # 11
        {'sentence': 'pleasing'},  # 12
        {'sentence': 'gross'},  # 13
        {'sentence': 'blue'},  # 14
        {'sentence': 'red'},  # 15
        {'sentence': 'flower'},  # 16
        {'sentence': 'bee'},  # 17
        {'sentence': 'snake'},  # 18
        {'sentence': 'windshield'},  # 19
        {'sentence': 'plant'},  # 20
        {'sentence': 'scary'},  # 21
        {'sentence': 'pencil'},  # 22
        {'sentence': 'hello'}  # 23
    ]

    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                         indexed_examples=indexed_inputs)

    # This first example doesn't have enough examples for statistical testing,
    # so the returned p-value is None.
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[4]['id']],
        'negative_set_ids': [indexed_inputs[1]['id'],
                             indexed_inputs[3]['id'],
                             indexed_inputs[5]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }

    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)

    self.assertLen(result, 1)
    expected = {
        'result': {
            'score': 1.0,
            'cos_sim': [
                0.9999999581246426, 0.049332143689572144, 0.8987945047547466,
                -0.41858423757857954, 0.6908297036543664, -0.5167857909664919,
                0.8423017503220364, -0.005793079244916016, 0.8334491603894322,
                -0.4054645113448612, 0.7616102123736647, -0.4578596155267783,
                0.8366905563807711, -0.27390786544756535, 0.7325538474066896,
                0.5190287630768531, 0.8145227936096425, 0.02005592868363552,
                -0.1143256029298114, -0.1221480700842533, 0.6852995739227957,
                0.3984620730733816, 0.5211149530112407, 0.5909723902471223
            ],
            'dot_prods': [
                1385.1480610241554, 69.95638452724207, 1239.4947646060161,
                -595.253135700978, 971.5880156862692, -725.0749813217176,
                1182.8641913758102, -8.149647641120662, 1146.5803071544124,
                -576.4043054391316, 1038.3510704649307, -648.097269442522,
                1154.4720122394317, -378.32103870822493, 1024.066390571124,
                738.6959135414066, 1139.7963358416857, 28.691395032352318,
                -167.37808507284706, -176.4474746971391, 959.5159619261449,
                562.8772536987927, 716.7270332848395, 840.7031847912738
            ],
            'accuracy': 0.5
        },
        'p_val': None,
        'random_mean': 0.9285714285714286,
        'split_size': 3,
        'num_runs': 1
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])

    # This example has enough inputs for two runs of size 3.
    config = {
        'concept_set_ids': [
            indexed_inputs[1]['id'], indexed_inputs[2]['id'],
            indexed_inputs[4]['id'], indexed_inputs[5]['id'],
            indexed_inputs[10]['id'], indexed_inputs[9]['id']
        ],
        'negative_set_ids': [
            indexed_inputs[0]['id'], indexed_inputs[3]['id'],
            indexed_inputs[12]['id'], indexed_inputs[6]['id'],
            indexed_inputs[7]['id'], indexed_inputs[8]['id']
        ],
        'class_to_explain': '0',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }

    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)
    self.assertLen(result, 1)
    expected = {
        'result': {
            'score': 0.0,
            'cos_sim': [
                0.2731987606830683, 0.427838045403812, 0.3166440584420665,
                -0.1358964965831398, 0.5616614702946262, -0.16511808390168164,
                -0.05103355252438478, -0.16945565920473257, 0.28148962348967155,
                -0.18169036476392003, 0.33244873698665106, -0.13316476546155087,
                0.15226772288202886, -0.05534469666649352, 0.2886150002073456,
                0.33888135113008555, 0.12875301375254147, 0.046908665182593096,
                -0.052445114502024985, 0.088858405172313, 0.219517174438115,
                0.35833013079793435, 0.2291162415605806, 0.3635686086637199
            ],
            'dot_prods': [
                452.17220644153525, 724.9460578876271, 521.776546745851,
                -230.9170522777958, 943.8754747127095, -276.8190148523963,
                -85.63511897570154, -284.8487792023684, 462.71830216201926,
                -308.62790255581496, 541.5830529968077, -225.2299308998058,
                251.04716264718752, -91.33998249705493, 482.0991668852444,
                576.3029773313335, 215.28329927312336, 80.18458502795752,
                -91.74640483442752, 153.37559992294862, 367.2562273288043,
                604.8378479001944, 376.53473821563625, 618.003311205616
            ],
            'accuracy': 0.5
        },
        'p_val': 0.42264973081037427,
        'random_mean': 0.0,
        'split_size': 3,
        'num_runs': 2
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])

    # This example has enough examples for three runs of size 3 and two runs of
    # size 5, and returns results with p-value < 0.05.
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[1]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[3]['id'],
                            indexed_inputs[4]['id'],
                            indexed_inputs[5]['id'],
                            indexed_inputs[6]['id'],
                            indexed_inputs[7]['id'],
                            indexed_inputs[8]['id'],
                            indexed_inputs[9]['id']],
        'negative_set_ids': [indexed_inputs[10]['id'],
                             indexed_inputs[11]['id'],
                             indexed_inputs[12]['id'],
                             indexed_inputs[13]['id'],
                             indexed_inputs[14]['id'],
                             indexed_inputs[15]['id'],
                             indexed_inputs[16]['id'],
                             indexed_inputs[17]['id'],
                             indexed_inputs[18]['id'],
                             indexed_inputs[19]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }

    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)
    self.assertLen(result, 1)
    expected = [{
        'result': {
            'score': 0.42857142857142855,
            'cos_sim': [
                -0.1107393877916321, -0.0993967046974328, -0.2214985917242054,
                0.08132588965575606, -0.3590211572508748, 0.18708109817461333,
                0.000724498781128839, 0.09700473783330398, -0.25015742815240055,
                0.16108236033785076, -0.10283274286140846, 0.0972663321478731,
                -0.05924679176256152, -0.048499696342091746,
                -0.4357117016074766, -0.593245752003111, -0.3645147796989344,
                -0.5507605083253673, -0.27914997949782694, -0.30908550968594417,
                -0.5584676299422896, -0.16983339994284577, -0.42587740852240746,
                -0.37482298817032594
            ],
            'dot_prods': [
                -261.4389298435066, -240.23776409902007, -520.6275907607769,
                197.11495117497446, -860.6035066083074, 447.3775519523981,
                1.7341104803878409, 232.59170976304426, -586.5576327736542,
                390.2961568516803, -238.95427152619726, 234.6617547723058,
                -139.3334215524385, -114.17392512371171, -1038.149036709951,
                -1439.0663895591745, -869.3828698612926, -1342.899780229334,
                -696.569760699206, -760.9907977738051, -1332.7284530349625,
                -408.90435403478875, -998.3360993150825, -908.8111404537224
            ],
            'accuracy': 0.75
        },
        'p_val': 0.04400624968940752,
        'random_mean': 0.9642857142857143,
        'split_size': 5,
        'num_runs': 2
    }]
    testing_utils.assert_deep_almost_equal(self, expected, result)
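
Note: every example above asserts with testing_utils.assert_deep_almost_equal, which compares nested dicts, lists, and scalars while tolerating small floating-point differences. The sketch below only illustrates that behavior and is not LIT's actual implementation; the real helper's signature and tolerance may differ.

import unittest


def assert_deep_almost_equal(test_case, actual, expected, places=5):
  # Sketch only: recursively compare nested structures, allowing small
  # floating-point error on float leaves.
  if isinstance(expected, dict):
    test_case.assertIsInstance(actual, dict)
    test_case.assertEqual(set(actual.keys()), set(expected.keys()))
    for key in expected:
      assert_deep_almost_equal(test_case, actual[key], expected[key], places)
  elif isinstance(expected, (list, tuple)):
    test_case.assertEqual(len(actual), len(expected))
    for got, want in zip(actual, expected):
      assert_deep_almost_equal(test_case, got, want, places)
  elif isinstance(expected, float):
    test_case.assertAlmostEqual(actual, expected, places=places)
  else:
    test_case.assertEqual(actual, expected)


class AssertDeepAlmostEqualSketchTest(unittest.TestCase):

  def test_nested_structures(self):
    result = {'score': [0.333331, 1.0], 'num_pairs': 2}
    expected = {'score': [0.33333, 1.0], 'num_pairs': 2}
    assert_deep_almost_equal(self, result, expected)


if __name__ == '__main__':
  unittest.main()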