def main(config):
    """Creates input files for y_exp mturk task from conversation/rating mturk task.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
        turn-by-turn quality ratings 1-5
    output: file of logs (in ParlaiDialog format) sliced up to begin at the
        start of an episode or following a new topic request, and ending with
        a y_exp
    """
    new_episodes = []
    # Materialize the generator directly; list(...) over a copying comprehension.
    old_episodes = list(extract_parlai_episodes(config['infile']))
    for episode in old_episodes:
        for parley in episode:
            # A fresh slice starts at the conversation prompt or right after
            # a new-topic request.
            if any(
                parley.context.startswith(x)
                for x in (NEW_TOPIC_REQUEST.lower(), INITIAL_PROMPT.lower())
            ):
                new_episode = []
            # NOTE(review): if an episode's first parley is neither a prompt
            # nor a new-topic turn, `new_episode` is unbound (or stale from the
            # previous episode) here — assumes logs always open with
            # INITIAL_PROMPT; confirm against the Mturk task 1 log format.
            new_episode.append(parley)
            # A slice ends when the bot asks for a suggestion (the y_exp turn);
            # only then is the accumulated slice kept.
            if parley.context.startswith(SUGGESTION_REQUEST.lower()):
                new_episodes.append(new_episode)

    # Create parlai dialog file for easy viewing
    with open(config['outfile'], 'w') as f:
        for episode in new_episodes:
            num_parleys = len(episode)
            for i, parley in enumerate(episode):
                # Mark the final parley so the episode terminates correctly
                # in ParlaiDialog format.
                if i == num_parleys - 1:
                    parley.episode_done = True
                f.write(f"{i}\t{parley.to_parlai()}\n")

    print(
        f"Extracted {len(new_episodes)} episodes out of {len(old_episodes)} "
        f"original episodes and wrote them to {config['outfile']}."
    )
def main(config):
    """Extracts training data for the negative response classifier (NRC) from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with
        turn-by-turn quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
        positive/negative example
    """
    examples = []
    positives = config['positives'].split(',')
    negatives = config['negatives'].split(',')
    # `assert` is stripped under `python -O`; validate config explicitly so
    # overlapping rating labels always fail loudly.
    if set(positives) & set(negatives):
        raise ValueError("'positives' and 'negatives' must not share any ratings")
    # Build membership sets once, outside the loop, instead of concatenating
    # two lists and doing a linear scan on every parley.
    positive_set = set(positives)
    rated = positive_set | set(negatives)

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(config['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1

            # Update history (not including stock control flow responses)
            if parley.context.startswith(INITIAL_PROMPT.lower()):
                # Conversation prompt, first utterance
                history = [parley.response]
            elif parley.context.startswith(SUGGESTION_REQUEST.lower()):
                # Asked for y_exp, got y_exp
                pass
            elif parley.context.startswith(NEW_TOPIC_REQUEST.lower()):
                # Asked for new topic, got a first utterance
                history = [parley.response]
            else:
                history.append(parley.context)
                history.append(parley.response)

            # Only create a new example if this parley's rating is relevant
            if parley.reward in rated:
                # Concatenate history and add speaker tokens as necessary.
                # history_size refers to the total number of utterances
                # (history_size == 0 means predict sentiment from '__null__')
                # preceding the response that's being classified (so if
                # history_size == 0 then classify based only on the response
                # w/o any extra context). Note that the response being
                # classified should always be preceded by __p1__ (the human),
                # not __p2__ (the bot).
                if config['history_size'] < 0:
                    utterances = history
                elif config['history_size'] == 0:
                    utterances = ['__null__']
                else:
                    utterances = history[-config['history_size']:]
                context = add_person_tokens(utterances, last_speaker=1)
                label = 1 if parley.reward in positive_set else -1
                examples.append(Parley(context, label))

    # One JSON object per line (self-feeding format).
    with open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
        f"({num_parleys} parleys) and wrote them to {config['outfile']} with "
        f"histsz == {config['history_size']}."
    )