def write_conversations_format(self, outfile, world):
    """
    Write the accumulated logs to ``outfile`` in Conversations format.

    :param outfile: path prefix the conversations are saved under.
    :param world: world whose ``opt`` configures the save (including the
        self-chat flag).
    """
    is_self_chat = world.opt.get('selfchat_task', False)
    Conversations.save_conversations(
        self._logs, outfile, world.opt, self_chat=is_self_chat
    )
def write_conversations_format(self, outfile, world):
    """
    Save the accumulated world logs to ``outfile`` in Conversations format.

    Emits an info log with the destination path before writing.
    """
    logging.info(f'Saving log to {outfile} in Conversations format')
    self_chat_flag = world.opt.get('selfchat_task', False)
    Conversations.save_conversations(
        self._logs, outfile, world.opt, self_chat=self_chat_flag
    )
def test_world_logging(self):
    """
    Test world logging under dynamic batching.

    Runs eval on a small generator model, then checks that every logged
    episode has all of its turns and that each episode's turns share the
    same dynamic batch index.
    """
    with testing_utils.tempdir() as tmpdir:
        save_report = os.path.join(tmpdir, 'report')
        testing_utils.eval_model(
            dict(
                model_file='zoo:unittest/transformer_generator2/model',
                task='integration_tests:multiturn_candidate',
                save_world_logs=True,
                report_filename=save_report,
                truncate=1024,
                dynamic_batching='full',
                batchsize=4,
            )
        )
        convo_fle = (
            str(save_report) + '_integration_tests:multiturn_candidate_replies.jsonl'
        )
        convos = Conversations(convo_fle)
        for convo in convos:
            # each episode is 4 turns; teacher + model acts -> 8 entries
            # NOTE: assertEquals is a deprecated alias removed in Python 3.12;
            # use assertEqual instead.
            self.assertEqual(len(convo), 2 * 4)
            # now assert that they are all from the same dynamic batch index
            dyn_batch_idx = convo[0]['dyn_batch_idx']
            for i, turn in enumerate(convo):
                if i % 2 == 0 and i > 0:
                    # we log the batch index in the teacher acts only
                    self.assertEqual(dyn_batch_idx, turn['dyn_batch_idx'])
def test_world_logging_buffersize(self):
    """
    Test world logging with dynamic batching.

    Checks when the number of examples exceeds the buffersize.
    """
    with testing_utils.tempdir() as tmpdir:
        save_report = os.path.join(tmpdir, 'report')
        testing_utils.eval_model(
            dict(
                model_file='zoo:unittest/transformer_generator2/model',
                task='integration_tests:RepeatTeacher:2000',
                world_logs=save_report + '.jsonl',
                report_filename=save_report,
                truncate=1024,
                dynamic_batching='full',
                batchsize=4,
            ),
            valid_datatype='train:evalmode',
            skip_test=True,
        )
        convo_fle = str(save_report) + '.jsonl'
        convos = Conversations(convo_fle)
        # we expect there to be 2000 episodes logged in the convos.
        # assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual instead.
        self.assertEqual(len(convos), 2000)
def load_from_chunk(self, chunk_idx: int) -> List[ChunkOutput]:
    """
    Given the chunk index, load examples from that chunk.

    Return a list of tuples. The function `_create_message` will take these
    tuples to form the Message object that is returned by the teacher.
    """
    conversations = Conversations(_path(self.opt, self.data_files[chunk_idx]))
    examples = []
    for conversation in conversations:
        # drop any injected context turns before pairing up utterances
        real_turns = [t for t in conversation.turns if t.get('id') != 'context']
        # even-indexed turns are prompts, odd-indexed turns are responses
        episode = self._get_ep_from_turns(real_turns[::2], real_turns[1::2])
        if episode:
            examples.extend(episode)
    return examples
def _build_pairings_file(self):
    """
    Build and save pairings to pairings file.
    """
    with open(self.onboarding_path) as f:
        onboarding_pairs = [json.loads(line) for line in f]
    pairings_filepath = self._get_vs_path('pairings_files')
    self._print_progress(f'building pairings file, saving at {pairings_filepath}')
    conversations = {
        cid: Conversations(self.chat_files[cid]) for cid in self.config_ids
    }
    # Write the onboarding convo pairs ahead of the conversation pairs
    all_pairs = onboarding_pairs + self._build_conversation_pairs(conversations)
    with open(pairings_filepath, 'w') as f:
        for pair in all_pairs:
            f.write(json.dumps(pair) + "\n")
def test_conversations(self):
    """
    Round-trip test for Conversations: save, reload, and verify metadata,
    speakers, opt, kwargs, and printed output.
    """
    first_episode = [
        [
            {'id': 'Emily', 'text': 'Hello, do you like this test?'},
            {'id': 'Stephen', 'text': 'Why yes! I love this test!'},
        ],
        [
            {'id': 'Emily', 'text': 'So will you stamp this diff?'},
            {'id': 'Stephen', 'text': 'Yes, I will do it right now!'},
        ],
    ]
    second_episode = [
        [
            {
                'id': 'A',
                'text': 'Somebody once told me the world is gonna roll me',
            },
            {'id': 'B', 'text': 'I aint the sharpest tool in the shed'},
        ],
        [
            {
                'id': 'A',
                'text': 'She was looking kind of dumb with her finger and her thumb',
            },
            {'id': 'B', 'text': 'In the shape of an L on her forehead'},
        ],
    ]
    act_list = [first_episode, second_episode]
    self.opt = {'A': 'B', 'C': 'D', 'E': 'F'}
    self.convo_datapath = os.path.join(self.datapath, 'convo1')
    Conversations.save_conversations(
        act_list,
        self.convo_datapath,
        self.opt,
        self_chat=False,
        other_info='Blah blah blah',
    )
    assert os.path.exists(self.convo_datapath + '.jsonl')
    assert os.path.exists(self.convo_datapath + '.metadata')

    convos = Conversations(self.convo_datapath + '.jsonl')

    # test conversations loaded
    self.assertEqual(convos.num_conversations, 2)

    # test speakers saved
    self.assertEqual(set(convos.metadata.speakers), {'Stephen', 'Emily', 'A', 'B'})

    # test opt saved
    for key in ['A', 'C', 'E']:
        self.assertEqual(self.opt[key], convos.metadata.opt[key])

    # test kwargs
    self.assertEqual({'other_info': 'Blah blah blah'}, convos.metadata.extra_data)

    # test reading conversations
    with testing_utils.capture_output() as out:
        convos.read_conv_idx(0)
    str_version = (
        'Emily: Hello, do you like this test?\n'
        'Stephen: Why yes! I love this test!\n'
        'Emily: So will you stamp this diff?\n'
        'Stephen: Yes, I will do it right now!\n'
    )
    self.assertIn(str_version, out.getvalue())
def run(self):
    """
    Clean a saved Conversations file of a TOD (task-oriented dialogue) run.

    Reads conversations from ``source_file``, keeps only turns from the
    speakers listed in ``included_speakers``, optionally appends per-agent
    suffixes to speaker ids, attaches per-conversation task-success context
    pulled from the report at ``report_path``, and writes the cleaned
    conversations plus copied-over metadata to a new ``.jsonl`` file.
    """
    opt = self.opt
    # agent_suffixes comes as a flat [agent, suffix, agent, suffix, ...]
    # list, so its length must be even
    if int(len(self.opt["agent_suffixes"])) % 2 != 0:
        raise RuntimeError("Agent suffix input should be even")
    suffixes = {}
    for i in range(int(len(self.opt["agent_suffixes"]) / 2)):
        agent = self.opt["agent_suffixes"][2 * i]
        suffix = self.opt["agent_suffixes"][2 * i + 1]
        suffixes[agent] = suffix
    # per-conversation metrics; index i is assumed to line up with the
    # i-th conversation in the source file — TODO confirm with the writer
    with PathManager.open(opt["report_path"]) as r:
        report = json.load(r)["report"]
    tod_metrics = report["tod_metrics"]
    if opt["num_conversations"] > -1:
        tod_metrics = tod_metrics[: opt["num_conversations"]]
    source = self.opt["source_file"].replace(".jsonl", "")
    if self.opt["out_file"]:
        out = self.opt["out_file"]
    else:
        if (
            "conversations" in source
        ):  # just to make sure we don't overwrite anything...
            out = source.replace("conversations", "cleaned_conversations")
        else:
            out = "cleaned_" + source
    speakers = []
    with PathManager.open(out + ".jsonl", "w") as f:
        conversations = Conversations(source + ".jsonl")
        for i, conversation in enumerate(conversations):
            if opt["num_conversations"] >= 0 and i >= opt["num_conversations"]:
                break
            cleaned_dialog = []
            for parlay_round in conversation.episode["dialog"]:
                cleaned_parlay_round = []
                for turn in parlay_round:
                    turn_type = self._get_turn_type(turn)
                    if turn_type in self.opt["included_speakers"]:
                        # rename the speaker in-place before recording it
                        if turn_type in suffixes:
                            turn["id"] += suffixes[turn_type]
                        if turn["id"] not in speakers:
                            speakers.append(turn["id"])
                        cleaned_parlay_round.append(turn)
                # drop rounds that lost all their turns to filtering
                if len(cleaned_parlay_round) > 0:
                    cleaned_dialog.append(cleaned_parlay_round)
            convo = {}
            convo["dialog"] = cleaned_dialog
            convo["metadata_path"] = Metadata._get_path(out)
            convo["context"] = [
                {
                    "synthetic_task_success": tod_metrics[i][
                        "synthetic_task_success"
                    ],
                    "goal_text": tod_metrics[i]["goal"]["text"],
                }
            ]
            json_convo = json.dumps(convo)
            f.write(json_convo + "\n")
    # carry the original metadata over, but with the filtered speaker list
    old_meta = Metadata(source + ".jsonl")
    Metadata.save_metadata(
        out, old_meta.opt, old_meta.self_chat, speakers, **old_meta.extra_data
    )
def test_conversations(self):
    """
    Round-trip test for Conversations: save, reload, then verify counts,
    speakers, opt, kwargs, log output, and indexed turn access.
    """
    episode_one = [
        [
            {'id': 'Emily', 'text': 'Hello, do you like this test?'},
            {'id': 'Stephen', 'text': 'Why yes! I love this test!'},
        ],
        [
            {'id': 'Emily', 'text': 'So will you stamp this diff?'},
            {'id': 'Stephen', 'text': 'Yes, I will do it right now!'},
        ],
    ]
    episode_two = [
        [
            {
                'id': 'A',
                'text': 'Somebody once told me the world is gonna roll me',
            },
            {'id': 'B', 'text': 'I aint the sharpest tool in the shed'},
        ],
        [
            {
                'id': 'A',
                'text': 'She was looking kind of dumb with her finger and her thumb',
            },
            {'id': 'B', 'text': 'In the shape of an L on her forehead'},
        ],
    ]
    act_list = [episode_one, episode_two]
    self.opt = {'A': 'B', 'C': 'D', 'E': 'F'}
    self.convo_datapath = os.path.join(self.datapath, 'convo1')
    Conversations.save_conversations(
        act_list,
        self.convo_datapath,
        self.opt,
        self_chat=False,
        other_info='Blah blah blah',
    )
    assert os.path.exists(self.convo_datapath + '.jsonl')
    assert os.path.exists(self.convo_datapath + '.metadata')

    convos = Conversations(self.convo_datapath + '.jsonl')

    # test conversations loaded
    self.assertEqual(len(convos), 2)

    # test speakers saved
    self.assertEqual(set(convos.metadata.speakers), {'Stephen', 'Emily', 'A', 'B'})

    # test opt saved
    for key in ['A', 'C', 'E']:
        self.assertEqual(self.opt[key], convos.metadata.opt[key])

    # test kwargs
    self.assertEqual({'other_info': 'Blah blah blah'}, convos.metadata.extra_data)

    # test reading conversations
    with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
        convos.read_conv_idx(0)
    str_version = (
        'Emily: Hello, do you like this test?\n'
        'Stephen: Why yes! I love this test!\n'
        'Emily: So will you stamp this diff?\n'
        'Stephen: Yes, I will do it right now!\n'
    )
    self.assertIn(str_version, "\n".join(cm.output))

    # test getting a specific turn
    first = convos[0]  # Conversation
    self.assertEqual(first[0].id, 'Emily')
    self.assertEqual(first[3].text, 'Yes, I will do it right now!')
def dump_data(opt):
    """
    Dump task data to ACUTE-Eval.

    Iterates episodes of the configured task via a RepeatLabelAgent, pairs
    each teacher text with a randomly chosen label, and saves the resulting
    dialogues to ``opt['outfile']`` (or a temp file) in Conversations format.
    """
    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)
    task = opt.get('task')
    speaker_0_id = opt.get('speaker_0_id') or f'{task}_as_human'
    speaker_1_id = opt.get('speaker_1_id') or f'{task}_as_model'
    if opt['outfile'] is None:
        outfile = tempfile.mkstemp(
            prefix='{}_{}_'.format(opt['task'], opt['datatype']), suffix='.txt'
        )[1]
    else:
        outfile = opt['outfile']
    num_episodes = (
        world.num_episodes()
        if opt['num_episodes'] == -1
        else min(opt['num_episodes'], world.num_episodes())
    )
    log_timer = TimeLogger()

    print(f'[ starting to convert, saving output to {outfile} ]')
    dialogues = []
    for _ in range(num_episodes):
        episode = []
        episode_done = False
        while not episode_done:
            world.parley()
            acts = world.get_acts()
            text = acts[0].get('text')
            split_text = text.split('\n')
            # BUGFIX: the previous form
            #   acts[0].get('labels', acts[0].pop('eval_labels', None))
            # evaluated the pop() default eagerly, removing 'eval_labels'
            # from the act even when 'labels' was present. Evaluate the
            # fallback lazily instead.
            labels = acts[0].get('labels')
            if labels is None:
                labels = acts[0].pop('eval_labels', None)
            label = random.choice(labels)
            if not episode and opt.get('prepended_context'):
                # first turn: split leading context lines into their own turn
                context = split_text[:-1]
                text = split_text[-1]
                context_turn = [
                    {'text': context, 'episode_done': False, 'id': 'context'}
                    for _ in range(2)
                ]
                episode.append(context_turn)
            turn = [
                {'text': text, 'episode_done': False, 'id': speaker_0_id},
                {'text': label, 'episode_done': False, 'id': speaker_1_id},
            ]
            episode.append(turn)
            if acts[0].get('episode_done', False):
                episode[-1][-1]['episode_done'] = True
                episode_done = True
        dialogues.append(episode)

        if log_timer.time() > opt['log_every_n_secs']:
            text, _log = log_timer.log(world.total_parleys, world.num_examples())
            print(text)

        if world.epoch_done():
            break

    Conversations.save_conversations(dialogues, outfile, opt)