def _write_examples(filename, examples):
    """Write each example to `filename` as one JSON object per line."""
    with open(filename, 'w') as outfile:
        outfile.writelines(json.dumps(ex.to_dict()) + '\n' for ex in examples)


def main(opt):
    """
    Converts a Fbdialog file of episodes into two self-feeding files (split by topic)

    All conversations including a word in the provided topic's bag of words will
    be separated from conversations without those words.

    Keys read from `opt`:
        infile: path handed to extract_fb_episodes.
        outfile: base path; the two output names are derived from it by
            inserting "_<unit>_<topic>" and "_<unit>_no<topic>" before the
            extension, where <unit> and <topic> are the first three characters
            of opt['min_unit'] and TOPIC_NAME.
        min_unit: 'episode' to classify a whole episode at once, or 'example'
            to classify every example individually.
        history_size: forwarded to episode_to_examples.
        shuffle: when truthy, both example lists are shuffled before writing.

    Raises:
        ValueError: if opt['min_unit'] is neither 'episode' nor 'example'.
    """
    min_unit = opt['min_unit']
    # An unknown unit used to classify nothing and then die on the percentage
    # computation below; fail early with a message that names the real problem.
    if min_unit not in ('episode', 'example'):
        raise ValueError(
            f"Unsupported min_unit {min_unit!r}; expected 'episode' or 'example'")

    on_topic_exs = []
    off_topic_exs = []
    num_episodes = 0
    for episode in extract_fb_episodes(opt['infile']):
        num_episodes += 1
        if min_unit == 'episode':
            # The whole episode goes to a single side.
            bucket = (on_topic_exs if includes_topic(episode, TOPIC)
                      else off_topic_exs)
            bucket.extend(episode_to_examples(episode, opt['history_size']))
        else:
            # Each example is judged on its own, wrapped in a one-item list
            # because includes_topic is called with a list in episode mode too.
            for example in episode_to_examples(episode, opt['history_size']):
                if includes_topic([example], TOPIC):
                    on_topic_exs.append(example)
                else:
                    off_topic_exs.append(example)

    if opt['shuffle']:
        random.shuffle(on_topic_exs)
        random.shuffle(off_topic_exs)

    total = len(on_topic_exs) + len(off_topic_exs)
    # Guard against an input that produced no examples at all.
    on_pct = len(on_topic_exs) / total if total else 0.0
    print(f"Separated {total} examples (from {num_episodes} episodes) into "
          f"{len(off_topic_exs)} off-topic and {len(on_topic_exs)} "
          f"({on_pct * 100:.1f}%) on-topic")

    outfile_base, outfile_ext = os.path.splitext(opt['outfile'])
    unit_prefix = min_unit[:3]
    topic_prefix = TOPIC_NAME[:3]

    on_topic_filename = f"{outfile_base}_{unit_prefix}_{topic_prefix}{outfile_ext}"
    _write_examples(on_topic_filename, on_topic_exs)

    off_topic_filename = f"{outfile_base}_{unit_prefix}_no{topic_prefix}{outfile_ext}"
    _write_examples(off_topic_filename, off_topic_exs)
def main(config):
    """Converts a Fbdialog file of episodes into a json file of Parley examples.

    Reads episodes from config['infile'], expands each one into examples with
    episode_to_examples (using config['history_size']), and writes every
    example's to_dict() result to config['outfile'] as one JSON object per line.
    """
    # Collect everything first; the output file is only opened once the whole
    # input has been converted.
    parley_examples = [
        example
        for episode in extract_fb_episodes(config['infile'])
        for example in episode_to_examples(episode, config['history_size'])
    ]

    with open(config['outfile'], 'w') as sink:
        sink.writelines(
            json.dumps(example.to_dict()) + '\n' for example in parley_examples
        )