def test_no_text(self):
    """
    Check that a jsonl entry whose first turn lacks a "text" field raises.

    The first message in the dialog carries only an "id", so displaying the
    data should fail with an AttributeError when the teacher reads its text.
    """
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "badfile.jsonl")
        with PathManager.open(fp, "w") as f:
            f.write(
                '{"dialog": [[{"id": "speaker1"}, {"text": "Hello.", "id": "speaker2"}]]}\n'
            )
        # Use the jsonfile teacher's own option keys, consistent with the
        # other jsonfile tests. The original passed 'fromfile_datapath' and
        # 'display_verbose', which the 'jsonfile' task does not read, so the
        # teacher was never actually pointed at the bad file.
        opt = {'task': 'jsonfile', 'jsonfile_datapath': fp, 'verbose': True}
        with self.assertRaises(AttributeError):
            testing_utils.display_data(opt)
def test_good_fileformat(self):
    """
    Verify that a correctly formatted jsonl file loads without error.
    """
    with testing_utils.tempdir() as tmpdir:
        path = os.path.join(tmpdir, "goodfile.jsonl")
        record = (
            '{"dialog": [[{"text": "Hi.", "id": "speaker1"}, '
            '{"text": "Hello.", "id": "speaker2"}]]}\n'
        )
        with PathManager.open(path, "w") as f:
            f.write(record)
        # displaying the data should simply succeed
        testing_utils.display_data(
            {'task': 'jsonfile', 'jsonfile_datapath': path, 'verbose': True}
        )
def test_secondspeaker_label(self):
    """
    With label_turns='secondspeaker', the second speaker's turn is the label.

    Writes a one-episode jsonl file and checks that the first speaker's text
    becomes the input and the second speaker's text becomes the label.
    """
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "goodfile.jsonl")
        with PathManager.open(fp, "w") as f:
            f.write(
                '{"dialog": [[{"text": "Hi.", "id": "speaker1"}, {"text": "Hello.", "id": "speaker2"}]]}\n'
            )
        opt = {
            'task': 'jsonfile',
            'jsonfile_datapath': fp,
            'verbose': True,
            'label_turns': 'secondspeaker',
        }
        train_out, valid_out, test_out = testing_utils.display_data(opt)
        # Split once and filter by field name. The original comprehensions
        # also tested `if l in train_out`, which is vacuously true for every
        # line produced by train_out.split('\n') and has been dropped.
        lines = train_out.split('\n')
        texts = [l.split(':', 1)[-1].strip() for l in lines if 'text' in l]
        labels = [l.split(':', 1)[-1].strip() for l in lines if 'labels' in l]
        self.assertEqual(texts[0], 'Hi.')
        self.assertEqual(labels[0], 'Hello.')
def _test_display_output(self, opt):
    """
    Run display_data and check each split's labels are present and unique.

    :param opt: options dict passed through to testing_utils.display_data.

    The original signature omitted ``self`` even though the body calls
    ``self.assertGreater``/``self.assertEqual``, so invoking it as a method
    raised a TypeError; ``self`` is now the first parameter.
    """
    output = testing_utils.display_data(opt)
    # train uses [labels: ...]; valid/test use [eval_labels: ...]
    train_labels = re.findall(r"\[labels: .*\]", output[0])
    valid_labels = re.findall(r"\[eval_labels: .*\]", output[1])
    test_labels = re.findall(r"\[eval_labels: .*\]", output[2])
    for i, lbls in enumerate([train_labels, valid_labels, test_labels]):
        # at least one label must be printed per split
        self.assertGreater(len(lbls), 0, 'DisplayData failed')
        # labels within a split must all be distinct
        self.assertEqual(len(lbls), len(set(lbls)), output[i])
def test_output(self):
    """
    Check that display_data runs to completion on a babi task.
    """
    opt = {'num_examples': 1, 'task': 'babi:task1k:1'}
    str_output = testing_utils.display_data(opt)[0]
    # non-empty output, correct task header, and the end-of-example marker
    self.assertGreater(len(str_output), 0, "Output is empty")
    self.assertIn("[babi:task1k:1]:", str_output, "Babi task did not print")
    self.assertIn("~~", str_output, "Example output did not complete")
def _run_display_output(self, opt: Dict[str, Union[str, bool]], ep_ex_counts: List[Tuple[int, int]]):
    """
    Run display_data and verify the episode/example summary of each split.

    :param opt: options passed to testing_utils.display_data.
    :param ep_ex_counts: expected (episodes, examples) per split, in
        train/valid/test order.
    """
    outputs = testing_utils.display_data(opt)
    # the summary line is the second-to-last line of each split's output
    summaries = [text.split('\n')[-2] for text in outputs]
    for idx, ((n_eps, n_exs), summary) in enumerate(zip(ep_ex_counts, summaries)):
        expected = f'[ loaded {n_eps} episodes with a total of {n_exs} examples ]'
        self.assertEqual(summary, expected, outputs[idx])
def test_one_episode(self):
    """
    Check the 'long episode' DEBUG warning from the fromfile teacher.

    First: 1000 examples with no episode_done flag form one huge episode,
    which should emit a 'long episode' debug message. Second: the same data
    with episode_done:True on every example should NOT emit the message, so
    the inner assertion fails (asserted via assertRaises on failureException).

    The original body contained a leftover debug ``print`` of the captured
    log output; it has been removed.
    """
    with testing_utils.tempdir() as tmpdir:
        # no episode_done -> a single 1000-turn episode
        fp = os.path.join(tmpdir, "badfile.txt")
        with PathManager.open(fp, "w") as f:
            for _ in range(1000):
                f.write('id:test_file\ttext:placeholder\tlabels:placeholder\n\n')
        opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'verbose': True}
        with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
            testing_utils.display_data(opt)
        assert any('long episode' in l for l in cm.output)

        # invert the logic of the assertion
        with self.assertRaises(self.failureException):
            fp = os.path.join(tmpdir, "goodfile.txt")
            with PathManager.open(fp, "w") as f:
                for _ in range(1000):
                    f.write(
                        'id:test_file\ttext:placeholder\tlabels:placeholder\tepisode_done:True\n\n'
                    )
            opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'verbose': True}
            with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
                testing_utils.display_data(opt)
            assert any('long episode' in l for l in cm.output)
def test_display_data_with_prepend_gold(self):
    """
    Check that knowledge span tokens in the output are correctly paired.

    Every KNOWLEDGE_TOKEN must open a span that is not already open, and
    every END_KNOWLEDGE_TOKEN must close an open span; no span may be left
    open at the end of an output.
    """
    parser = setup_args()
    opt = parser.parse_args(
        ['--task', 'wizard_of_internet:WizardDialogGoldKnowledgeTeacher']
    )
    for out_type in display_data(opt):
        in_span = False
        tokens = [w.strip() for w in out_type.split() if w.strip()]
        for token in tokens:
            if token == CONST.KNOWLEDGE_TOKEN:
                # an opening token must not appear inside an open span
                self.assertFalse(in_span)
                in_span = True
            elif token == CONST.END_KNOWLEDGE_TOKEN:
                # a closing token must match an open span
                self.assertTrue(in_span)
                in_span = False
        # the span must be closed by the end of the output
        self.assertFalse(in_span)
def test_dataset_integrity(self):
    """
    Check that the self_feeding dataset loads with the expected contents.

    Verifies a known train utterance plus the episode/example counts of the
    train, valid, and test splits.
    """
    train_output, valid_output, test_output = testing_utils.display_data(
        {'task': 'self_feeding:all'})
    # check train data
    self.assertIn("i am spending time with my 4 sisters", train_output)
    self.assertIn('193777 episodes with a total of 193777 examples', train_output)
    # check valid data
    self.assertIn('3500 examples', valid_output)
    # check test data
    self.assertIn('7801 examples', test_output)
def test_display_data(self):
    """
    Smoke test: the wizard_of_internet task parses and displays.
    """
    opt = setup_args().parse_args(['--task', 'wizard_of_internet'])
    display_data(opt)