def main(config):
    """
    Creates .stitched files from .suggested files.

    input: a .suggested file of logs (in ParlaiDialog format) from Mturk task 2,
    each of which starts with an initial prompt or topic request, and ends with a
    y that corresponds to the y_exp given in the previous turn
    output: a .stitched file (in self-feeding format) with the original mistake by
    the bot replaced with the mturked y (based on y_exp)
    """
    episodes = list(extract_parlai_episodes(config['infile']))
    examples = []
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for turn, parley in enumerate(episode):
            if turn == 0:
                # Don't include the topic request
                history.append(parley.response)
            elif turn == last - 2:
                # third to last was mistake and negative feedback
                pass
            elif turn == last - 1:
                # penultimate turn was suggestion request and y_exp
                pass
            elif turn == last:
                # ultimate turn was verbatim request and y
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.response,  # y
                    )
                )
            else:
                # normal turn; just add to history
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
def _setup_data(self, path):  # Make private method for ParlAIDialogTeacher
    """
    Reads data in the fbdialog format.

    Returns ``((x,y,r,c), new_episode?)`` tuples.

    :param path: path to a file of json-encoded parleys, one per line
        (self-feeding format). Populates ``self.episodes`` (one single-turn
        episode per parley) and ``self.num_exs``.
    """
    print("[ Loading Self-Feeding text data:" + path + "]")
    self.episodes = []
    self.num_exs = 0
    self.max_train = self.opt.get('max_train', 0)
    with PathManager.open(path, 'r') as f:
        # Stream line by line instead of f.readlines(): avoids materializing
        # the entire (potentially large) file in memory at once.
        for line in f:
            if self.max_train and self.num_exs >= self.max_train:
                break
            parley = json.loads(line)
            # NOTE: History is trimmed here, not by TorchAgent (except in
            # interactive mode)
            if self.opt['history_size'] == 0:
                parley['context'] = '__null__'
            elif self.opt['history_size'] > 0:
                # Split on speaker tokens (__p1__/__p2__), keep only the most
                # recent `history_size` utterances, then re-add the tokens.
                utterances = re.split(r'__p\d__', parley['context'])[1:]
                trimmed = utterances[-(self.opt['history_size']):]
                parley['context'] = add_person_tokens(trimmed, last_speaker=1)
            # WARNING: STRIPPING AWAY MEMORIES
            parley['memories'] = []
            episode = {
                'text': parley['context'],
                'labels': [parley['response']],
                'label_candidates': parley.get('candidates', []),
                'reward': parley.get('reward', 0),
                'episode_done': True,
            }
            # Convert integer labels (e.g., polarization dataset) to strings
            episode['labels'] = [str(label) for label in episode['labels']]
            self.num_exs += 1
            self.episodes.append([episode])
def main(config):
    """
    Creates .identity files from .sliced files.

    input: a .sliced file of logs (in ParlaiDialog format) from Mturk task 1,
    each of which starts with an initial prompt or topic request, and ends with
    a y_exp
    output: an .identity file (in self-feeding format) with y_exps used as
    though they were ys
    """
    episodes = list(extract_parlai_episodes(config['infile']))
    examples = []
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for turn, parley in enumerate(episode):
            if turn == 0:
                # Don't include the topic request
                history.append(parley.response)
            elif turn == last - 1:
                # penultimate turn was mistake and negative feedback
                pass
            elif turn == last:
                # ultimate turn was correction request and y_exp
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.response,  # y_exp
                    )
                )
            else:
                # normal turn; just add to history
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
def main(config):
    """
    Creates .unfiltered files from .sliced files.

    input: a .sliced file of logs (in ParlaiDialog format) from Mturk task 1,
    each of which starts with an initial prompt or topic request, and ends with
    a y_exp
    output: a .unfiltered file (in self-feeding format) with every utterance
    output by bot used as a label (i.e., act as though the bot was a human and
    we want to train in a normal supervised way).
    """
    episodes = list(extract_parlai_episodes(config['infile']))
    examples = []
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for turn, parley in enumerate(episode):
            if turn == 0:
                # Don't include the topic request
                history.append(parley.response)
            elif turn == last:
                # ultimate turn was correction request and explanation
                pass
            else:
                # Every bot utterance becomes a label; then extend the history.
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.context,  # What the bot said
                    )
                )
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
def main(opt):
    """
    Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
    turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
    positive/negative example
    """
    examples = []
    # Ratings treated as positive/negative; the two sets must not overlap.
    positives = opt['positives'].split(',')
    negatives = opt['negatives'].split(',')
    assert len(set(positives).intersection(set(negatives))) == 0

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            context = parley.context
            if context.startswith(INITIAL_PROMPT):
                # Conversation prompt, first utterance
                # Begin history
                history = [parley.response]
            elif context.startswith(EXP_REQUEST):
                # Asked for y_exp, got y_exp
                # Messed up, so blast history
                examples.append(
                    Parley(
                        context=add_person_tokens(history[:-2], last_speaker=1),
                        response=parley.response,  # y_exp
                    )
                )
                history = []
            elif context.startswith(NEWTOPIC):
                # Asked for new topic, got a first utterance
                # Begin new history
                history = [parley.response]
            elif context.startswith(RAT_REQUEST):
                # Asked for rating, got one-word rating
                # Nothing to update in history
                pass
            elif CONTINUE in context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(context[context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # normal turn: maintain the history
                history.append(context)
                history.append(parley.response)

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
def main(opt):
    """
    Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
    turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
    positive/negative example
    """
    examples = []
    positives = opt['positives'].split(',')
    negatives = opt['negatives'].split(',')
    assert len(set(positives).intersection(set(negatives))) == 0

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            if parley.context.startswith(INITIAL_PROMPT):
                # Conversation prompt, first utterance
                # Begin history
                history = [parley.response]
            elif parley.context.startswith(EXP_REQUEST):
                # Asked for y_exp, got y_exp
                # Messed up, so blast history
                history = []
            elif parley.context.startswith(NEWTOPIC):
                # Asked for new topic, got a first utterance
                # Begin new history
                history = [parley.response]
            elif parley.context.startswith(RAT_REQUEST):
                # Concatenate history and add speaker tokens as necessary
                # history_size refers to the total number of utterances
                # (history_size == 0 means predict sentiment from '__null__')
                # response that's being classified (so if history_size == 0 then
                # classify based only on the response w/o any extra context).
                # Note that the response being classified should always be preceded by
                # __p1__ (the human), not __p2__ (the bot).
                histsz = opt['history_size']
                if histsz < 0:
                    utterances = history
                elif histsz == 0:
                    utterances = ['__null__']
                else:
                    utterances = history[-histsz:]
                context = add_person_tokens(utterances, last_speaker=1)

                # Map the one-word rating to a +1/-1 label; unrecognized
                # ratings (label 0) produce no example.
                if parley.response in positives:
                    label = 1
                elif parley.response in negatives:
                    label = -1
                else:
                    label = 0
                if label:
                    examples.append(Parley(context, label))
            elif CONTINUE in parley.context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(parley.context[parley.context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                history.append(parley.context)
                history.append(parley.response)

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
def main(opt):
    """
    Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
    turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
    positive/negative example
    """
    examples = []
    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            if parley.context.startswith((INITIAL_PROMPT, NEWTOPIC)):
                # a prompt, first utterance
                # Begin history
                history = [parley.response]
                # NOTE: we now allow these one-utterance episodes to be examples
            elif parley.context.startswith((EXP_REQUEST, RAT_REQUEST)):
                asked_for_exp = parley.context.startswith(EXP_REQUEST)
                # If 'filter_accusation' is on and the last example added was a human,
                # toss the previous example, which is when the human expressed
                # dissatisfaction
                if (opt['mode'] == 'human'
                        and opt['filter_accusation']
                        and asked_for_exp
                        and examples):
                    examples.pop()
                # If 'filter_mistake' is on and the last example in the queue was a bot,
                # toss it too, since that's when the bot messed up
                if (opt['mode'] == 'bot'
                        and opt['filter_mistake']
                        and asked_for_exp
                        and examples):
                    examples.pop()
                # Asked for y_exp or rating, got it
                # Messed up, so blast history
                history = []
                continue
            elif CONTINUE in parley.context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(parley.context[parley.context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # normal turn: maintain the history
                history.append(parley.context)
                history.append(parley.response)

            if opt['mode'] == 'bot' and len(history) >= 2:
                if len(history) == 2:
                    example = Parley(context='__null__', response=history[0])
                else:
                    example = Parley(
                        context=add_person_tokens(history[:-2], last_speaker=1),
                        response=history[-2],  # What the bot said
                    )
                examples.append(example)

            if opt['mode'] == 'human':
                if len(history) == 1:
                    example = Parley(context='__null__', response=history[0])
                else:
                    example = Parley(
                        # this is not technically true:
                        # the last speaker was the bot (__p2__),
                        # not the human (__p1__), but in all our data, __p1__ is always
                        # the speaking partner of the learner
                        context=add_person_tokens(history[:-1], last_speaker=1),
                        response=history[-1],  # What the bot said
                    )
                examples.append(example)

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} examples out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")