Example #1
    def test_set_model_file_without_dict_file(self):
        """Check that moving a model without moving the dictionary raises the
        appropriate error.
        """
        # Download the model, then remove its dict file
        datapath = ParlaiParser().parse_args(print_args=False)['datapath']
        try:
            # remove any unittest models left over from previous runs
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
        testing_utils.download_unittest_models()

        zoo_path = 'models:unittest/seq2seq/model'
        model_path = modelzoo_path(datapath, zoo_path)
        os.remove(model_path + '.dict')
        # Test that eval model fails
        with self.assertRaises(RuntimeError):
            testing_utils.eval_model(
                dict(task='babi:task1k:1', model_file=model_path))
        try:
            # clean up the unittest models downloaded for this test
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
    def test_wd_specificity(self):
        """
        Checks the specificity-weighted decoding model produces correct results.
        """
        _, valid, _ = testing_utils.eval_model(
            {
                'model_file':
                'zoo:controllable_dialogue/convai2_finetuned_baseline',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 20,
                'beam_min_n_best': 10,
                'use_reply': 'model',
                'batchsize': 64,
                'num_examples': NUM_EXAMPLES,
                'weighted_decoding': NO_REPETITION + ',nidf:4',
            },
            skip_test=True,
        )

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 25.74, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1366, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 25.57, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1349, delta=0.0002)
    def test_convai2_finetuned_norepetition(self):
        """
        Checks the finetuned model with repetition blocking produces correct results.
        """
        _, valid, _ = testing_utils.eval_model(
            {
                'model_file':
                'zoo:controllable_dialogue/convai2_finetuned_baseline',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 20,
                'beam_min_n_best': 10,
                'use_reply': 'model',
                'batchsize': 64,
                'num_examples': NUM_EXAMPLES,
                'weighted_decoding': NO_REPETITION,
            },
            skip_test=True,
        )

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 26.66, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1389, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 25.83, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1375, delta=0.0002)
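
A note on the shared pattern: every example on this page calls testing_utils.eval_model, which runs evaluation for the supplied options and returns a (stdout, valid_report, test_report) tuple; passing skip_test=True or skip_valid=True skips the corresponding split. A minimal sketch of that call shape, using a task/model pair that appears elsewhere on this page (the surrounding import of the testing_utils helper is assumed):

stdout, valid, test = testing_utils.eval_model(
    dict(task='integration_tests', model='repeat_label'),
    skip_test=True,  # evaluate only the validation split
)
assert valid['exs'] > 0, 'no examples evaluated\nLOG:\n{}'.format(stdout)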

Example #4
    def test_transresnet(self):
        """Test pretrained model."""
        stdout, _, test = testing_utils.eval_model(MODEL_OPTIONS,
                                                   skip_valid=True)

        # Overall
        self.assertEqual(
            test['accuracy'], 0.3667,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout))
        self.assertEqual(
            test['hits@5'], 0.633,
            'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout))
        self.assertEqual(
            test['hits@10'], 0.767,
            'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout))

        # First round
        self.assertEqual(
            test['first_round']['hits@1/100'], 0.2,
            'test first round hits@1/100 = {}\nLOG:\n{}'.format(
                test['first_round']['hits@1/100'], stdout))

        # Second round
        self.assertEqual(
            test['second_round']['hits@1/100'], 0.5,
            'test second round hits@1/100 = {}\nLOG:\n{}'.format(
                test['second_round']['hits@1/100'], stdout))

        # Third round
        self.assertEqual(
            test['third_round+']['hits@1/100'], 0.4,
            'test third round hits@1/100 = {}\nLOG:\n{}'.format(
                test['third_round+']['hits@1/100'], stdout))

Example #5
    def test_output(self):
        """Test output of running eval_model"""
        parser = setup_args()
        parser.set_defaults(
            task='tasks.repeat:RepeatTeacher:10',
            model='repeat_label',
            datatype='valid',
            num_examples=5,
            display_examples=False,
        )

        opt = parser.parse_args(print_args=False)
        str_output, valid, test = testing_utils.eval_model(opt)
        self.assertGreater(len(str_output), 0, "Output is empty")

        # decode the output
        scores = str_output.split("\n---\n")
        for i in range(1, len(scores)):
            score = ast.literal_eval(scores[i])
            # check totals
            self.assertEqual(score['exs'], i,
                             "Total is incorrect")
            # accuracy should be one
            self.assertEqual(score['accuracy'], 1,
                             "accuracy != 1")

    def test_ct_question_bucket10(self):
        """
        Checks the question-controlled model (z=10 boost) produces correct results.
        """
        _, valid, _ = testing_utils.eval_model(
            {
                'model_file':
                'models:controllable_dialogue/control_questionb11e10',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 20,
                'beam_min_n_best': 10,
                'batchsize': 64,
                'use_reply': 'model',
                'num_examples': NUM_EXAMPLES,
                'weighted_decoding':
                'extrep_nonstopword:-1e20,intrep_nonstopword:-1e20',
                'set_controls': 'question:10',
                'beam_reorder': 'best_extrep2gram_qn',
            },
            skip_test=True)

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 31.27, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1400, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 30.26, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1304, delta=0.0002)

Example #7
    def test_backwards_compatibility(self):
        testing_utils.download_unittest_models()

        stdout, valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:multipass',
                model='seq2seq',
                model_file='zoo:unittest/seq2seq/model',
                dict_file='zoo:unittest/seq2seq/model.dict',
                no_cuda=True,
            ))

        self.assertLessEqual(
            valid['ppl'], 1.01,
            'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout))
        self.assertGreaterEqual(
            valid['accuracy'],
            0.999,
            'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            valid['f1'], 0.999,
            'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout))
        self.assertLessEqual(
            test['ppl'], 1.01,
            'test ppl = {}\nLOG:\n{}'.format(test['ppl'], stdout))
        self.assertGreaterEqual(
            test['accuracy'],
            0.999,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            test['f1'], 0.999,
            'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout))
    def test_wd_responsiveness(self):
        """
        Checks the responsiveness-weighted decoding model produces correct results.
        """
        _, valid, _ = testing_utils.eval_model(
            {
                'model_file':
                'models:controllable_dialogue/convai2_finetuned_baseline',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 20,
                'beam_min_n_best': 10,
                'use_reply': 'model',
                'batchsize': 64,
                'num_examples': NUM_EXAMPLES,
                'weighted_decoding': NO_REPETITION +
                ',intrep_2gram:-1e20,partnerrep_2gram:-1e20,lastuttsim:5',  # noqa: E501
            },
            skip_test=True)

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 26.16, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1399, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 25.47, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1369, delta=0.0002)
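
A note on the option formats used in the controllable-dialogue examples: weighted_decoding is a plain comma-separated string of feature:weight pairs (very large negative weights effectively block a feature, small positive weights softly boost it), and set_controls is a control:bucket pair such as 'question:7'. A small sketch of composing such a string, using only feature names that already appear in these examples; the NO_REPETITION constant is defined in the test module and is not reproduced here:

wd_features = [
    'extrep_nonstopword:-1e20',  # heavily penalize repeating non-stopwords across utterances
    'intrep_nonstopword:-1e20',  # heavily penalize repeating non-stopwords within a response
    'nidf:4',                    # boost more specific (higher-NIDF) words
]
weighted_decoding = ','.join(wd_features)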

Example #9
    def test_metrics_select(self):
        """Test output of running eval_model"""
        parser = setup_args()
        parser.set_defaults(
            task='integration_tests',
            model='repeat_label',
            datatype='valid',
            num_examples=5,
            display_examples=False,
            metrics='accuracy,rouge',
        )

        opt = parser.parse_args(print_args=False)
        str_output, valid, test = testing_utils.eval_model(opt)
        self.assertGreater(len(str_output), 0, "Output is empty")

        # decode the output
        scores = str_output.split("\n---\n")

        for i in range(1, len(scores)):
            score = ast.literal_eval(scores[i])
            # check totals
            self.assertEqual(score['exs'], i, "Total is incorrect")
            # accuracy should be one
            self.assertIn('accuracy', score,
                          "Accuracy is missing from selection")
            self.assertEqual(score['accuracy'], 1, "Accuracy != 1")
            self.assertIn('rouge-1', score,
                          "Rouge is missing from selection")
            self.assertEqual(score['rouge-1'], 1, 'rouge-1 != 1')
            self.assertEqual(score['rouge-2'], 1, 'rouge-2 != 1')
            self.assertEqual(score['rouge-L'], 1, 'rouge-L != 1')
    def test_ct_specificity_bucket7(self):
        """
        Checks the specificity-CT model (z=7) produces correct results.
        """
        _, valid, _ = testing_utils.eval_model(
            {
                'model_file':
                'models:controllable_dialogue/control_avgnidf10b10e',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 20,
                'beam_min_n_best': 10,
                'use_reply': 'model',
                'batchsize': 64,
                'num_examples': NUM_EXAMPLES,
                'weighted_decoding': NO_REPETITION,
                'set_controls': 'avg_nidf:7',
            },
            skip_test=True)

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 38.64, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1376, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 37.03, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1365, delta=0.0002)
    def test_ct_question_bucket7(self):
        """
        Checks the question-controlled model (z=7) produces correct results.
        """
        _, valid, _ = testing_utils.eval_model(
            {
                # b11e10 stands for 11 buckets, embedding size 10
                'model_file':
                'models:controllable_dialogue/control_questionb11e10',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 20,
                'beam_min_n_best': 10,
                'batchsize': 64,
                'use_reply': 'model',
                'num_examples': NUM_EXAMPLES,
                'weighted_decoding': NO_REPETITION,
                'set_controls': 'question:7',
            },
            skip_test=True)

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 31.04, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1362, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 29.22, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1336, delta=0.0002)

Example #12
    def test_multitasking_metrics(self):
        stdout, valid, test = testing_utils.eval_model({
            'task': 'integration_tests:candidate,'
                    'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
            'aggregate_micro': True,
        })

        task1_acc = valid['tasks']['integration_tests:candidate']['accuracy']
        task2_acc = valid['tasks']['integration_tests:multiturnCandidate'][
            'accuracy']
        total_acc = valid['accuracy']
        # task 2 is 4 times the size of task 1
        self.assertAlmostEqual(
            total_acc,
            (task1_acc + 4 * task2_acc) / 5,
            4,
            'Task accuracy is averaged incorrectly',
        )

        stdout, valid, test = testing_utils.eval_model({
            'task': 'integration_tests:candidate,'
                    'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
            'aggregate_micro': False,
        })
        task1_acc = valid['tasks']['integration_tests:candidate']['accuracy']
        task2_acc = valid['tasks']['integration_tests:multiturnCandidate'][
            'accuracy']
        total_acc = valid['accuracy']
        # metrics should be averaged equally across tasks
        self.assertAlmostEqual(
            total_acc,
            (task1_acc + task2_acc) / 2,
            4,
            'Task accuracy is averaged incorrectly',
        )
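
The two assertions above distinguish micro-averaging (example-weighted, with the second task carrying four times the weight of the first) from macro-averaging (each task weighted equally). A worked numeric illustration with hypothetical per-task accuracies, only to make the arithmetic behind the assertions concrete:

task1_acc, task2_acc = 0.50, 0.80  # hypothetical accuracies; task 2 has 4x the examples

# aggregate_micro=True: weight each task by its number of examples
micro = (task1_acc + 4 * task2_acc) / 5  # = 0.74

# aggregate_micro=False: unweighted mean over tasks
macro = (task1_acc + task2_acc) / 2      # = 0.65
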
    def test_end2end(self):
        stdout, valid, _ = testing_utils.eval_model(END2END_OPTIONS)
        self.assertEqual(
            valid['ppl'], 61.21,
            'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout))
        self.assertEqual(valid['f1'], 0.1717,
                         'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout))
        self.assertGreaterEqual(
            valid['know_acc'], 0.2201,
            'valid know_acc = {}\nLOG:\n{}'.format(valid['know_acc'], stdout))

Example #14
    def test_pretrained(self):
        stdout, _, test = testing_utils.eval_model(
            dict(task='squad:index', model_file='zoo:drqa/squad/model'))
        self.assertGreaterEqual(
            test['accuracy'],
            0.68,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            test['f1'], 0.78,
            'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout))

    def test_retrieval(self):
        stdout, _, test = testing_utils.eval_model(RETRIEVAL_OPTIONS)
        self.assertGreaterEqual(
            test['accuracy'], 0.86,
            'test acc = {}\nLOG:\n{}'.format(test['accuracy'], stdout))
        self.assertGreaterEqual(
            test['hits@5'], 0.98,
            'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout))
        self.assertGreaterEqual(
            test['hits@10'], 0.99,
            'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout))

Example #16
    def test_generator_backcomp(self):
        """
        Tests that the generator model files work over time.
        """
        testing_utils.download_unittest_models()

        stdout, valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:multipass',
                model='transformer/generator',
                model_file='models:unittest/transformer_generator2/model',
                dict_file='models:unittest/transformer_generator2/model.dict',
                rank_candidates=True,
                batch_size=64,
            ))

        self.assertGreaterEqual(
            valid['hits@1'],
            0.95,
            'valid hits@1 = {}\nLOG:\n{}'.format(valid['hits@1'], stdout),
        )
        self.assertLessEqual(
            valid['ppl'],
            1.01,
            'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout),
        )
        self.assertGreaterEqual(
            valid['accuracy'],
            .99,
            'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            valid['f1'], .99,
            'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout))
        self.assertGreaterEqual(
            test['hits@1'],
            0.95,
            'test hits@1 = {}\nLOG:\n{}'.format(test['hits@1'], stdout),
        )
        self.assertLessEqual(
            test['ppl'],
            1.01,
            'test ppl = {}\nLOG:\n{}'.format(test['ppl'], stdout),
        )
        self.assertGreaterEqual(
            test['accuracy'],
            .99,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            test['f1'], .99,
            'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout))

Example #17
    def test_convai2_finetuned_greedy(self):
        """
        Check the greedy model produces correct results.
        """
        _, valid, _ = testing_utils.eval_model({
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 1,
            'batchsize': 64,
        }, skip_test=True)

        self.assertAlmostEqual(valid['ppl'], 22.86, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1702, delta=0.0002)

Example #18
    def test_hogwild_eval(self):
        """Test eval with numthreads > 1 and batchsize in [1,2,3]."""
        opt = dict(task='tasks.repeat:RepeatTeacher:{}'.format(NUM_EXS),
                   model='repeat_label')
        for nt in NUM_THREADS_CHOICES:
            for bs in BATCHSIZE_CHOICES:
                opt['num_threads'] = nt
                opt['batchsize'] = bs

                stdout, valid, test = testing_utils.eval_model(opt)
                self.assertEqual(valid['exs'], NUM_EXS,
                                 'LOG:\n{}'.format(stdout))
                self.assertEqual(test['exs'], NUM_EXS,
                                 'LOG:\n{}'.format(stdout))

Example #19
    def test_transresnet(self):
        """Test pretrained model."""
        stdout, _, test = testing_utils.eval_model(MODEL_OPTIONS,
                                                   skip_valid=True)
        self.assertEqual(
            test['accuracy'], 0.4,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout))
        self.assertEqual(
            test['hits@5'], 0.9,
            'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout))
        self.assertEqual(
            test['hits@10'], 0.9,
            'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout))
        self.assertEqual(
            test['med_rank'], 2.0,
            'test med_rank = {}\nLOG:\n{}'.format(test['med_rank'], stdout))

    def test_released_model(self):
        """
        Check the pretrained model produces correct results.
        """
        _, _, test = testing_utils.eval_model(
            {
                'model_file': 'zoo:self_feeding/hh131k_hb60k_fb60k_st1k/model',
                'task': 'self_feeding:all',
                'batchsize': 20,
            },
            skip_valid=True,
        )

        self.assertAlmostEqual(test['dia_acc'], 0.506, delta=0.001)
        self.assertAlmostEqual(test['fee_acc'], 0.744, delta=0.001)
        self.assertAlmostEqual(test['sat_f1'], 0.8343, delta=0.0001)

Example #21
    def test_convai2_finetuned_beamsearch(self):
        """
        Check the beamsearch baseline produces correct results.
        """
        _, valid, _ = testing_utils.eval_model({
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'batchsize': 64,
            'num_examples': NUM_EXAMPLES,
        }, skip_test=True)

        if FAST_MODE:
            self.assertAlmostEqual(valid['ppl'], 23.54, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1575, delta=0.0002)
        else:
            self.assertAlmostEqual(valid['ppl'], 22.86, delta=0.1)
            self.assertAlmostEqual(valid['f1'], 0.1516, delta=0.0002)

Example #22
    def test_backcomp(self):
        """
        Tests that the transformer ranker model files continue to work over time.
        """
        testing_utils.download_unittest_models()

        stdout, valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:multipass',
                model='transformer/ranker',
                model_file='zoo:unittest/transformer_ranker/model',
                dict_file='zoo:unittest/transformer_ranker/model.dict',
                batch_size=64,
            )
        )

        self.assertGreaterEqual(
            valid['hits@1'],
            0.99,
            'valid hits@1 = {}\nLOG:\n{}'.format(valid['hits@1'], stdout),
        )
        self.assertGreaterEqual(
            valid['accuracy'],
            0.99,
            'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            valid['f1'], 0.99, 'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout)
        )
        self.assertGreaterEqual(
            test['hits@1'],
            0.99,
            'test hits@1 = {}\nLOG:\n{}'.format(test['hits@1'], stdout),
        )
        self.assertGreaterEqual(
            test['accuracy'],
            0.99,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            test['f1'], 0.99, 'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout)
        )