def test_set_model_file_without_dict_file(self):
    """Check that moving a model without moving the dictionary raises the
    appropriate error.
    """
    # Download model, move to a new location
    datapath = ParlaiParser().parse_args(print_args=False)['datapath']
    try:
        # remove unittest models if there before
        shutil.rmtree(os.path.join(datapath, 'models/unittest'))
    except FileNotFoundError:
        pass

    testing_utils.download_unittest_models()

    zoo_path = 'models:unittest/seq2seq/model'
    model_path = modelzoo_path(datapath, zoo_path)
    os.remove(model_path + '.dict')

    # Test that eval model fails
    with self.assertRaises(RuntimeError):
        testing_utils.eval_model(
            dict(task='babi:task1k:1', model_file=model_path)
        )

    try:
        # remove unittest models if there after
        shutil.rmtree(os.path.join(datapath, 'models/unittest'))
    except FileNotFoundError:
        pass
def test_wd_specificity(self):
    """
    Checks that the specificity-weighted decoding model produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'use_reply': 'model',
            'batchsize': 64,
            'num_examples': NUM_EXAMPLES,
            'weighted_decoding': NO_REPETITION + ',nidf:4',
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 25.74, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1366, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 25.57, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1349, delta=0.0002)
def test_convai2_finetuned_norepetition(self):
    """
    Checks that the finetuned model with repetition blocking produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'use_reply': 'model',
            'batchsize': 64,
            'num_examples': NUM_EXAMPLES,
            'weighted_decoding': NO_REPETITION,
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 26.66, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1389, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 25.83, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1375, delta=0.0002)
def test_transresnet(self):
    """Test pretrained model."""
    stdout, _, test = testing_utils.eval_model(MODEL_OPTIONS, skip_valid=True)

    # Overall
    self.assertEqual(
        test['accuracy'],
        0.3667,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertEqual(
        test['hits@5'],
        0.633,
        'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
    )
    self.assertEqual(
        test['hits@10'],
        0.767,
        'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
    )

    # First round
    self.assertEqual(
        test['first_round']['hits@1/100'],
        0.2,
        'test first round hits@1/100 = {}\nLOG:\n{}'.format(
            test['first_round']['hits@1/100'], stdout
        ),
    )

    # Second round
    self.assertEqual(
        test['second_round']['hits@1/100'],
        0.5,
        'test second round hits@1/100 = {}\nLOG:\n{}'.format(
            test['second_round']['hits@1/100'], stdout
        ),
    )

    # Third round
    self.assertEqual(
        test['third_round+']['hits@1/100'],
        0.4,
        'test third round hits@1/100 = {}\nLOG:\n{}'.format(
            test['third_round+']['hits@1/100'], stdout
        ),
    )
def test_output(self):
    """Test output of running eval_model."""
    parser = setup_args()
    parser.set_defaults(
        task='tasks.repeat:RepeatTeacher:10',
        model='repeat_label',
        datatype='valid',
        num_examples=5,
        display_examples=False,
    )
    opt = parser.parse_args(print_args=False)
    str_output, valid, test = testing_utils.eval_model(opt)
    self.assertGreater(len(str_output), 0, "Output is empty")

    # decode the output
    scores = str_output.split("\n---\n")
    for i in range(1, len(scores)):
        score = ast.literal_eval(scores[i])
        # check totals
        self.assertTrue(score['exs'] == i, "Total is incorrect")
        # accuracy should be one
        self.assertTrue(score['accuracy'] == 1, "accuracy != 1")
def test_ct_question_bucket10(self):
    """
    Checks that the question-controlled model (z=10 boost) produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'models:controllable_dialogue/control_questionb11e10',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'batchsize': 64,
            'use_reply': 'model',
            'num_examples': NUM_EXAMPLES,
            'weighted_decoding': 'extrep_nonstopword:-1e20,intrep_nonstopword:-1e20',
            'set_controls': 'question:10',
            'beam_reorder': 'best_extrep2gram_qn',
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 31.27, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1400, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 30.26, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1304, delta=0.0002)
def test_backwards_compatibility(self):
    testing_utils.download_unittest_models()
    stdout, valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multipass',
            model='seq2seq',
            model_file='zoo:unittest/seq2seq/model',
            dict_file='zoo:unittest/seq2seq/model.dict',
            no_cuda=True,
        )
    )
    self.assertLessEqual(
        valid['ppl'], 1.01, 'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout)
    )
    self.assertGreaterEqual(
        valid['accuracy'],
        0.999,
        'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        valid['f1'], 0.999, 'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout)
    )
    self.assertLessEqual(
        test['ppl'], 1.01, 'test ppl = {}\nLOG:\n{}'.format(test['ppl'], stdout)
    )
    self.assertGreaterEqual(
        test['accuracy'],
        0.999,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['f1'], 0.999, 'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout)
    )
def test_wd_responsiveness(self):
    """
    Checks that the responsiveness-weighted decoding model produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'models:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'use_reply': 'model',
            'batchsize': 64,
            'num_examples': NUM_EXAMPLES,
            'weighted_decoding': NO_REPETITION
            + ',intrep_2gram:-1e20,partnerrep_2gram:-1e20,lastuttsim:5',  # noqa: E501
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 26.16, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1399, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 25.47, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1369, delta=0.0002)
def test_metrics_select(self):
    """Test output of running eval_model with a metrics selection."""
    parser = setup_args()
    parser.set_defaults(
        task='integration_tests',
        model='repeat_label',
        datatype='valid',
        num_examples=5,
        display_examples=False,
        metrics='accuracy,rouge',
    )
    opt = parser.parse_args(print_args=False)
    str_output, valid, test = testing_utils.eval_model(opt)
    self.assertGreater(len(str_output), 0, "Output is empty")

    # decode the output
    scores = str_output.split("\n---\n")
    for i in range(1, len(scores)):
        score = ast.literal_eval(scores[i])
        # check totals
        self.assertEqual(score['exs'], i, "Total is incorrect")
        # accuracy should be one
        self.assertIn('accuracy', score, "Accuracy is missing from selection")
        self.assertEqual(score['accuracy'], 1, "Accuracy != 1")
        self.assertIn('rouge-1', score, "Rouge is missing from selection")
        self.assertEqual(score['rouge-1'], 1, 'rouge-1 != 1')
        self.assertEqual(score['rouge-2'], 1, 'rouge-2 != 1')
        self.assertEqual(score['rouge-L'], 1, 'rouge-L != 1')
def test_ct_specificity_bucket7(self):
    """
    Checks that the specificity-CT model (z=7) produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'models:controllable_dialogue/control_avgnidf10b10e',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'use_reply': 'model',
            'batchsize': 64,
            'num_examples': NUM_EXAMPLES,
            'weighted_decoding': NO_REPETITION,
            'set_controls': 'avg_nidf:7',
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 38.64, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1376, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 37.03, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1365, delta=0.0002)
def test_ct_question_bucket7(self):
    """
    Checks that the question-controlled model (z=7) produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            # b11e10 stands for 11 buckets, embedding size 10
            'model_file': 'models:controllable_dialogue/control_questionb11e10',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'batchsize': 64,
            'use_reply': 'model',
            'num_examples': NUM_EXAMPLES,
            'weighted_decoding': NO_REPETITION,
            'set_controls': 'question:7',
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 31.04, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1362, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 29.22, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1336, delta=0.0002)
def test_multitasking_metrics(self):
    stdout, valid, test = testing_utils.eval_model(
        {
            'task': 'integration_tests:candidate,'
                    'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
            'aggregate_micro': True,
        }
    )
    task1_acc = valid['tasks']['integration_tests:candidate']['accuracy']
    task2_acc = valid['tasks']['integration_tests:multiturnCandidate']['accuracy']
    total_acc = valid['accuracy']
    # task 2 is 4 times the size of task 1
    self.assertAlmostEqual(
        total_acc,
        (task1_acc + 4 * task2_acc) / 5,
        4,
        'Task accuracy is averaged incorrectly',
    )

    stdout, valid, test = testing_utils.eval_model(
        {
            'task': 'integration_tests:candidate,'
                    'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
            'aggregate_micro': False,
        }
    )
    task1_acc = valid['tasks']['integration_tests:candidate']['accuracy']
    task2_acc = valid['tasks']['integration_tests:multiturnCandidate']['accuracy']
    total_acc = valid['accuracy']
    # metrics should be averaged equally across tasks
    self.assertAlmostEqual(
        total_acc,
        (task1_acc + task2_acc) / 2,
        4,
        'Task accuracy is averaged incorrectly',
    )
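# Illustrative sketch (not part of the original test suite): the aggregation
# arithmetic that test_multitasking_metrics above relies on. With
# aggregate_micro=True the combined accuracy is weighted by per-task example
# counts (here task 2 is assumed to have 4x the examples of task 1), while
# aggregate_micro=False averages the per-task accuracies directly. The helper
# name below is hypothetical and exists only to show the two formulas.
def _combined_accuracy(task_accs, task_sizes, micro):
    """Return the example-weighted (micro) or unweighted (macro) mean accuracy."""
    if micro:
        total = sum(task_sizes)
        return sum(acc * n for acc, n in zip(task_accs, task_sizes)) / total
    return sum(task_accs) / len(task_accs)


# e.g. _combined_accuracy([0.8, 0.6], [100, 400], micro=True)
#      == (0.8 * 100 + 0.6 * 400) / 500 == 0.64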
def test_end2end(self):
    stdout, valid, _ = testing_utils.eval_model(END2END_OPTIONS)
    self.assertEqual(
        valid['ppl'], 61.21, 'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout)
    )
    self.assertEqual(
        valid['f1'], 0.1717, 'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout)
    )
    self.assertGreaterEqual(
        valid['know_acc'],
        0.2201,
        'valid know_acc = {}\nLOG:\n{}'.format(valid['know_acc'], stdout),
    )
def test_pretrained(self):
    stdout, _, test = testing_utils.eval_model(
        dict(task='squad:index', model_file='zoo:drqa/squad/model')
    )
    self.assertGreaterEqual(
        test['accuracy'],
        0.68,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['f1'], 0.78, 'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout)
    )
def test_retrieval(self):
    stdout, _, test = testing_utils.eval_model(RETRIEVAL_OPTIONS)
    self.assertGreaterEqual(
        test['accuracy'],
        0.86,
        'test acc = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['hits@5'],
        0.98,
        'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
    )
    self.assertGreaterEqual(
        test['hits@10'],
        0.99,
        'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
    )
def test_generator_backcomp(self):
    """
    Tests that the generator model files work over time.
    """
    testing_utils.download_unittest_models()
    stdout, valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multipass',
            model='transformer/generator',
            model_file='models:unittest/transformer_generator2/model',
            dict_file='models:unittest/transformer_generator2/model.dict',
            rank_candidates=True,
            batch_size=64,
        )
    )
    self.assertGreaterEqual(
        valid['hits@1'],
        0.95,
        'valid hits@1 = {}\nLOG:\n{}'.format(valid['hits@1'], stdout),
    )
    self.assertLessEqual(
        valid['ppl'], 1.01, 'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout)
    )
    self.assertGreaterEqual(
        valid['accuracy'],
        0.99,
        'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        valid['f1'], 0.99, 'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout)
    )
    self.assertGreaterEqual(
        test['hits@1'],
        0.95,
        'test hits@1 = {}\nLOG:\n{}'.format(test['hits@1'], stdout),
    )
    self.assertLessEqual(
        test['ppl'], 1.01, 'test ppl = {}\nLOG:\n{}'.format(test['ppl'], stdout)
    )
    self.assertGreaterEqual(
        test['accuracy'],
        0.99,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['f1'], 0.99, 'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout)
    )
def test_convai2_finetuned_greedy(self):
    """
    Check that the greedy model produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 1,
            'batchsize': 64,
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['ppl'], 22.86, delta=0.1)
    self.assertAlmostEqual(valid['f1'], 0.1702, delta=0.0002)
def test_hogwild_eval(self):
    """Test eval with numthreads > 1 and batchsize in [1, 2, 3]."""
    opt = dict(
        task='tasks.repeat:RepeatTeacher:{}'.format(NUM_EXS), model='repeat_label'
    )
    for nt in NUM_THREADS_CHOICES:
        for bs in BATCHSIZE_CHOICES:
            opt['num_threads'] = nt
            opt['batchsize'] = bs
            stdout, valid, test = testing_utils.eval_model(opt)
            self.assertEqual(valid['exs'], NUM_EXS, 'LOG:\n{}'.format(stdout))
            self.assertEqual(test['exs'], NUM_EXS, 'LOG:\n{}'.format(stdout))
def test_transresnet(self):
    """Test pretrained model."""
    stdout, _, test = testing_utils.eval_model(MODEL_OPTIONS, skip_valid=True)
    self.assertEqual(
        test['accuracy'],
        0.4,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertEqual(
        test['hits@5'],
        0.9,
        'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
    )
    self.assertEqual(
        test['hits@10'],
        0.9,
        'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
    )
    self.assertEqual(
        test['med_rank'],
        2.0,
        'test med_rank = {}\nLOG:\n{}'.format(test['med_rank'], stdout),
    )
def test_released_model(self):
    """
    Check that the pretrained model produces correct results.
    """
    _, _, test = testing_utils.eval_model(
        {
            'model_file': 'zoo:self_feeding/hh131k_hb60k_fb60k_st1k/model',
            'task': 'self_feeding:all',
            'batchsize': 20,
        },
        skip_valid=True,
    )
    self.assertAlmostEqual(test['dia_acc'], 0.506, delta=0.001)
    self.assertAlmostEqual(test['fee_acc'], 0.744, delta=0.001)
    self.assertAlmostEqual(test['sat_f1'], 0.8343, delta=0.0001)
def test_convai2_finetuned_beamsearch(self):
    """
    Check that the beam search baseline produces correct results.
    """
    _, valid, _ = testing_utils.eval_model(
        {
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 20,
            'beam_min_n_best': 10,
            'batchsize': 64,
            'num_examples': NUM_EXAMPLES,
        },
        skip_test=True,
    )
    if FAST_MODE:
        self.assertAlmostEqual(valid['ppl'], 23.54, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1575, delta=0.0002)
    else:
        self.assertAlmostEqual(valid['ppl'], 22.86, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1516, delta=0.0002)
def test_backcomp(self):
    """
    Tests that the transformer ranker model files continue to work over time.
    """
    testing_utils.download_unittest_models()
    stdout, valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multipass',
            model='transformer/ranker',
            model_file='zoo:unittest/transformer_ranker/model',
            dict_file='zoo:unittest/transformer_ranker/model.dict',
            batch_size=64,
        )
    )
    self.assertGreaterEqual(
        valid['hits@1'],
        0.99,
        'valid hits@1 = {}\nLOG:\n{}'.format(valid['hits@1'], stdout),
    )
    self.assertGreaterEqual(
        valid['accuracy'],
        0.99,
        'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        valid['f1'], 0.99, 'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout)
    )
    self.assertGreaterEqual(
        test['hits@1'],
        0.99,
        'test hits@1 = {}\nLOG:\n{}'.format(test['hits@1'], stdout),
    )
    self.assertGreaterEqual(
        test['accuracy'],
        0.99,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['f1'], 0.99, 'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout)
    )