def generate_examples(description, examples_path, max_examples, remove_fail, max_turns):
    '''
    Simulate dialogues between the two configured agents and dump them to JSON.

    description: unused in this body -- presumably a label for the run; TODO confirm.
    examples_path: output path, overwritten with a JSON list of example dicts.
    max_examples: number of simulations to attempt.
    remove_fail: if True, dialogues with reward 0 are not saved.
    max_turns: cap on dialogue length passed to the controller.

    Relies on module-level names: num_examples, scenario_db, agents, summary_map,
    Controller, logstats, json (none are defined in this function).
    '''
    global num_examples
    examples = []
    num_failed = 0
    for i in range(max_examples):
        # Cycle deterministically through scenarios via the global counter.
        scenario = scenario_db.scenarios_list[num_examples % len(scenario_db.scenarios_list)]
        sessions = [agents[0].new_session(0, scenario.kbs[0]),
                    agents[1].new_session(1, scenario.kbs[1])]
        controller = Controller(scenario, sessions)
        ex = controller.simulate(max_turns)
        # reward == 0 marks an unsuccessful dialogue.
        if ex.outcome['reward'] == 0:
            num_failed += 1
            # A failed + removed dialogue does not advance num_examples,
            # so the same scenario is retried on the next iteration.
            if remove_fail:
                continue
        examples.append(ex)
        num_examples += 1
        logstats.update_summary_map(summary_map, {'length': len(ex.events)})
    with open(examples_path, 'w') as out:
        print >> out, json.dumps([e.to_dict() for e in examples])
    print 'number of failed dialogues:', num_failed
    logstats.add('length', summary_map['length']['mean'])
def update_entity_stats(self, summary_map, batch_preds, batch_targets, prefix=''):
    '''
    Accumulate entity precision/recall counts into summary_map.

    For each (preds, targets) turn pair, records under prefixed keys:
      pos_target: number of distinct target entities,
      pos_pred:   number of distinct predicted entities,
      tp:         true positives (target entities also predicted).
    Turns whose targets are None (padding) are skipped.
    '''
    def get_entity(x):
        return [e for e in x if is_entity(e)]

    pos_target = prefix + 'pos_target'
    pos_pred = prefix + 'pos_pred'
    tp = prefix + 'tp'
    for preds, targets in izip(batch_preds, batch_targets):
        # None targets means that this is a padded turn.
        # BUG FIX: the original did `recalls.append(None)` here, but no
        # `recalls` name exists in scope (NameError); the intent per the
        # comment is simply to skip padded turns.
        if targets is None:
            continue
        preds = set(get_entity(preds))
        targets = set(get_entity(targets))
        # Don't record cases where no entity is presented
        if len(targets) > 0:
            logstats.update_summary_map(summary_map, {
                pos_target: len(targets),
                pos_pred: len(preds)
            })
            logstats.update_summary_map(
                summary_map, {tp: sum([1 if e in preds else 0 for e in targets])})
def __init__(self):
    '''Create a summary map with every tracked counter initialized to 0.'''
    self.summary_map = {}
    for stat_name in ('undecided', 'fact', 'single_fact', 'joint_fact',
                      'coref', 'correct_single', 'correct_joint',
                      'correct_joint_ent', 'repeated', 'same_col'):
        logstats.update_summary_map(self.summary_map, {stat_name: 0})
def eval_joint(self, kb, span):
    '''
    Check a "joint fact" claim: span = (num, ent1, _, ent2) asserts that
    `num` KB items contain both ent1 and ent2. Updates self.summary_map.
    Repeated entities and same-column pairs are counted and skipped.
    '''
    #print 'eval_joint:', span
    logstats.update_summary_map(self.summary_map, {'joint_fact': 1})
    num, ent1, _, ent2 = span
    # Take the canonical entity form (index 1), as in eval_single.
    ent1 = ent1[1]
    ent2 = ent2[1]
    if ent1 == ent2:
        #print 'repeated'
        logstats.update_summary_map(self.summary_map, {'repeated': 1})
        return
    # Same type, i.e. in the same column
    if ent1[1] == ent2[1]:
        #print 'same column'
        logstats.update_summary_map(self.summary_map, {'same_col': 1})
        return
    num = self.str_to_num(num)
    count = 0
    for i, item in enumerate(kb.items):
        entities = [entity for entity in self.item_entities(item)]
        if ent1 in entities and ent2 in entities:
            count += 1
            #print 'correct joint ent'
            # NOTE(review): counted once per item containing both entities;
            # this placement inside the loop is inferred from statement
            # order in the source -- confirm against the original layout.
            logstats.update_summary_map(self.summary_map, {'correct_joint_ent': 1})
    if count == num:
        #print 'correct joint'
        logstats.update_summary_map(self.summary_map, {'correct_joint': 1})
def generate_examples(description, examples_path, max_examples, remove_fail, max_turns):
    '''
    Simulate dialogues between the two configured agents, dump them to JSON,
    and optionally report averaged fact-check results for bot-bot chats.

    NOTE(review): near-duplicate of the other generate_examples definition in
    this source; this variant uses Controller.get_controller and adds the
    args.fact_check reporting tail -- confirm which one is current.

    Relies on module-level names: num_examples, scenario_db, agents,
    summary_map, args, Controller, logstats, json.
    '''
    global num_examples
    examples = []
    num_failed = 0
    for i in range(max_examples):
        # Cycle deterministically through scenarios via the global counter.
        scenario = scenario_db.scenarios_list[num_examples % len(scenario_db.scenarios_list)]
        sessions = [agents[0].new_session(0, scenario.kbs[0]),
                    agents[1].new_session(1, scenario.kbs[1])]
        controller = Controller.get_controller(scenario, sessions)
        ex = controller.simulate(max_turns)
        # reward == 0 marks an unsuccessful dialogue.
        if ex.outcome['reward'] == 0:
            num_failed += 1
            # A failed + removed dialogue does not advance num_examples,
            # so the same scenario is retried on the next iteration.
            if remove_fail:
                continue
        examples.append(ex)
        num_examples += 1
        logstats.update_summary_map(summary_map, {'length': len(ex.events)})
    with open(examples_path, 'w') as out:
        print >> out, json.dumps([e.to_dict() for e in examples])
    print 'number of failed dialogues:', num_failed
    logstats.add('length', summary_map['length']['mean'])
    if args.fact_check:
        # Only meaningful when both agents are the same type and expose an
        # env with a fact-check evaluator; averages the two reports.
        if args.agents[0] == args.agents[1] and hasattr(agents[0], 'env'):
            results0 = agents[0].env.evaluator.report()
            results1 = agents[1].env.evaluator.report()
            results = {k: (results0[k] + results1[k]) / 2. for k in results0}
            logstats.add('bot_chat', results)
def get_turns_vs_completed(all_chats):
    '''Return {num_turns: number of completed chats with that many turns}.'''
    turns_to_stats = defaultdict(dict)
    for chat in all_chats:
        # Chats without an outcome are ignored entirely.
        if chat["outcome"] is None:
            continue
        turns = len(chat['events'])
        completed = 1 if chat["outcome"]["reward"] == 1 else 0
        logstats.update_summary_map(turns_to_stats[turns], {'complete': completed})
    return {turns: stats['complete']['sum']
            for turns, stats in turns_to_stats.iteritems()}
def get_select_vs_completed(all_chats):
    '''Return {num_select_actions: number of completed chats with that count}.'''
    selects_to_stats = defaultdict(dict)
    for chat in all_chats:
        # Chats without an outcome are ignored entirely.
        if chat["outcome"] is None:
            continue
        chat_events = [Event.from_dict(e) for e in chat["events"]]
        n_select = sum(1 for e in chat_events if e.action == 'select')
        completed = 1 if chat["outcome"]["reward"] == 1 else 0
        logstats.update_summary_map(selects_to_stats[n_select], {'complete': completed})
    return {n: stats['complete']['sum']
            for n, stats in selects_to_stats.iteritems()}
def eval_single(self, kb, span):
    '''
    Check a "single fact" claim: span = (num, ent) asserts that exactly
    `num` KB items mention the entity. Updates self.summary_map.
    '''
    #print 'eval_single:', span
    logstats.update_summary_map(self.summary_map, {'single_fact': 1})
    num, ent = span
    ent = ent[1]  # take the canonical form
    num = self.str_to_num(num)
    # Number of KB items whose entity list mentions the claimed entity.
    matches = sum(1 for item in kb.items
                  for entity in self.item_entities(item)
                  if entity == ent)
    if num == matches:
        #print 'correct single'
        logstats.update_summary_map(self.summary_map, {'correct_single': 1})
def update_selection_stats(self, summary_map, scores, targets, prefix=''):
    '''
    Accumulate node-selection precision/recall counts into summary_map.

    scores: (batch, num_nodes) array; a node is predicted when score > 0.
    targets: ground-truth labels; a node is positive when label == 1.
    NOTE: targets come from the ground-truth response and may contain new
    entities beyond the graph's nodes (the lexicon can "create" entities),
    so targets is truncated to the scored node count before comparing.
    '''
    n_nodes = scores.shape[1]
    trimmed = targets[:, :n_nodes]
    predicted_pos = scores > 0
    actual_pos = trimmed == 1
    true_pos = np.sum(predicted_pos & actual_pos)
    logstats.update_summary_map(summary_map, {
        prefix + 'tp': true_pos,
        prefix + 'pos_pred': np.sum(predicted_pos),
        prefix + 'pos_target': np.sum(actual_pos),
    })
def get_stats(chat, agent_id, preprocessor):
    '''
    Compute per-agent surface statistics from one raw chat transcript.

    Returns a dict with total counts (num_select, num_utterance, num_entity,
    SA_inform/SA_ask/SA_answer), mean utterance length (utterance_len), and
    vocabulary size for the events of agent_id only.
    '''
    ex = Example.from_dict(None, chat)
    kbs = ex.scenario.kbs
    mentioned_entities = set()
    stats = {}
    vocab = set()
    for i, event in enumerate(ex.events):
        # Only look at the requested agent's events.
        if agent_id != event.agent:
            continue
        if event.action == 'select':
            utterance = []
            logstats.update_summary_map(stats, {'num_select': 1})
        elif event.action == 'message':
            utterance = preprocessor.process_event(event, kbs[event.agent], mentioned_entities)
            # Skip empty utterances
            if not utterance:
                continue
            else:
                utterance = utterance[0]
        # NOTE(review): actions other than select/message fall through to
        # here with a stale `utterance` from a previous event -- confirm
        # that inputs only ever contain these two actions.
        for token in utterance:
            if is_entity(token):
                logstats.update_summary_map(stats, {'num_entity': 1})
                # Canonical form is fed back into process_event on later events.
                mentioned_entities.add(token[1][0])
            else:
                vocab.add(token)
        logstats.update_summary_map(stats, {'utterance_len': len(utterance)})
        # Fresh defaultdict: speech-act histogram is discarded here,
        # only the act label itself is used.
        speech_act = get_speech_act(defaultdict(int), event, utterance)
        if speech_act[0] in ('inform', 'ask', 'answer'):
            logstats.update_summary_map(stats, {'SA_' + speech_act[0]: 1})
        logstats.update_summary_map(stats, {'num_utterance': 1})
    # Flatten the summary maps into plain numbers.
    new_stats = {}
    for k in stats:
        if k in ('num_select', 'num_utterance', 'num_entity'):
            new_stats[k] = stats[k]['sum']
        elif k in ('utterance_len', ):
            new_stats[k] = stats[k]['mean']
        elif k.startswith('SA_'):
            new_stats[k] = stats[k]['sum']
    new_stats['vocab_size'] = len(vocab)
    return new_stats
def get_cross_talk(all_chats):
    '''
    Estimate how often a reply was started before the previous message was
    sent ("cross talk") over completed chats, and print mean typing speed
    (characters per second) as a side effect.

    Returns the mean cross-talk rate, or -1 when no chat had usable
    start_time information.
    '''
    summary_map = {}
    is_null = lambda x: x is None or x == 'null'
    count = 0  # NOTE(review): never used below -- looks like leftover code.
    def is_valid(event):
        # start_time must be present and strictly before the send time.
        # NOTE(review): this comparison happens before float() conversion --
        # confirm the fields compare correctly in their raw form.
        if is_null(event.start_time) or event.start_time >= event.time:
            return False
        return True
    for chat in all_chats:
        # Only completed chats are considered.
        if chat["outcome"] is not None and chat["outcome"]["reward"] == 1:
            events = [Event.from_dict(e) for e in chat["events"]]
            # Walk consecutive message pairs (event1 precedes event2).
            for event1, event2 in izip(events, events[1:]):
                # start_time is not available
                if not is_valid(event2):
                    continue
                sent_time = float(event1.time)
                start_time = float(event2.start_time)
                # Cross talk: the reply was started before the previous
                # message was sent.
                cross_talk = 1 if start_time < sent_time else 0
                logstats.update_summary_map(summary_map, {'cross_talk': cross_talk})
                if is_valid(event1):
                    # Typing speed of the earlier message.
                    # NOTE(review): nesting inside the pair loop is inferred
                    # from statement order -- confirm against original layout.
                    typing_time = float(event1.time) - float(event1.start_time)
                    assert typing_time > 0
                    msg_len = len(event1.data)
                    logstats.update_summary_map(
                        summary_map, {'char_per_sec': msg_len / typing_time})
    try:
        print 'Char/Sec:', summary_map['char_per_sec']['mean']
    except KeyError:
        # No event had valid typing-time data.
        pass
    try:
        return summary_map['cross_talk']['mean']
    # Cross talk only available for chats with start_time
    except KeyError:
        return -1
def check_fact(summary_map, tokens, kb):
    '''
    Simple fact checker: each utterance is converted to a list of numbers and
    entities and we assume that the number describes the following entities,
    which will cause some false negatives.
    '''
    claims = []
    total_items = len(kb.items)
    for token in tokens:
        if is_entity(token):
            # Attach the entity (canonical form) to the most recent number,
            # if any number has been seen.
            if claims:
                claims[-1][1].append(token[1][0])
        else:
            value = to_number(token, total_items)
            if value:
                claims.append((value, []))
    # Score each (number, entities) claim against the KB.
    for claimed_count, ents in claims:
        if ents:
            verdict = 1 if claimed_count == count_kb_entity(kb, ents) else 0
            logstats.update_summary_map(summary_map, {'correct': verdict})
def get_dialog_stats(summary_map, utterance_counts, dialog):
    '''
    Count number of entities and attributes per dialogue.

    dialog: list of (agent, speech_act, entities, utterance) tuples.
    Also appends repeated-entity utterances to the module-level `examples`
    collection and updates consecutive-utterance bigram counts in
    utterance_counts.
    '''
    num_entities = 0
    all_ents = set()
    for agent, act, ents, utterance in dialog:
        num_ents = len(ents)
        num_types = len(set(ents))
        num_entities += num_ents
        all_ents.update(ents)
        # Per-utterance ratios, computed only over utterances that
        # mention at least one entity.
        if num_ents > 0:
            logstats.update_summary_map(summary_map, {
                'multi_entity_per_entity_utterance': 1 if num_types > 1 else 0
            })
            logstats.update_summary_map(
                summary_map, {
                    'repeated_entity_per_entity_utterance': 1 if num_ents > num_types else 0
                })
            if num_ents > num_types:
                # `examples` is a module-level collection, not defined here.
                examples['repeated_entity_per_entity_utterance'].append(
                    utterance)
    # Per-dialogue totals; e[1] taken as the attribute/type component.
    logstats.update_summary_map(
        summary_map, {
            'num_entity_per_dialog': num_entities,
            'num_entity_type_per_dialog': len(all_ents),
            'num_attr_type_per_dialog': len(set([e[1] for e in all_ents]))
        })
    # Count bigrams over consecutive entity-abstracted utterances.
    dialog = abstract_entity(dialog)
    utterances = get_utterance(dialog)
    for a, b in izip(utterances, utterances[1:]):
        utterance_counts[a][b] += 1
def update_summary(self, summary_map, bleu_scores):
    '''Fold every non-None BLEU score into the running summary.'''
    for score in bleu_scores:
        # None means no entity in this utterance
        if score is None:
            continue
        logstats.update_summary_map(summary_map, {'bleu': score})
def inc_undecided(self):
    '''Increment the 'undecided' counter in self.summary_map.'''
    logstats.update_summary_map(self.summary_map, dict(undecided=1))
def inc_fact(self):
    '''Increment the 'fact' counter in self.summary_map.'''
    logstats.update_summary_map(self.summary_map, dict(fact=1))
def analyze_strategy(all_chats, scenario_db, preprocessor, text_output, lm):
    '''
    Aggregate strategy statistics over a set of chats (incomplete dialogues
    are skipped).

    all_chats: raw chat dicts, parsed via Example.from_dict.
    scenario_db: scenario lookup used when reconstructing examples.
    preprocessor: tokenizes/links message events.
    text_output: if not None, path to dump entity-abstracted utterances to.
    lm: optional language model; when given, utterances are scored with it.

    Returns a dict of aggregated statistics (speech acts, KB strategy,
    dialog stats, LM score, n-gram counts, templates, fact-check accuracy,
    entity-mention stats, etc.).
    '''
    fout = open(text_output, 'w') if text_output is not None else None
    speech_act_summary_map = defaultdict(int)
    kb_strategy_summary_map = {}
    dialog_summary_map = {}
    fact_summary_map = {}
    utterance_counts = defaultdict(lambda: defaultdict(int))
    ngram_counts = defaultdict(lambda: defaultdict(int))
    template_summary_map = {'total': 0.}
    speech_act_sequence_summary_map = {'total': 0.}
    alpha_stats = []
    num_items_stats = []
    # NOTE(review): num_attrs_mentioned and most_mentioned_attrs are
    # accumulated below but never returned -- confirm whether intentional.
    num_attrs_mentioned = 0.
    most_mentioned_attrs = 0.
    entity_mention_summary_map = {}
    total_events = 0
    total_dialogues = 0.
    lm_summary_map = {}
    for raw in all_chats:
        ex = Example.from_dict(scenario_db, raw)
        kbs = ex.scenario.kbs
        if ex.outcome is None or ex.outcome["reward"] == 0:
            continue  # skip incomplete dialogues
        total_dialogues += 1.
        # dialog collects (agent, speech_act, entities, utterance) per event.
        dialog = []
        mentioned_entities = set()
        for i, event in enumerate(ex.events):
            if event.action == 'select':
                utterance = []
            elif event.action == 'message':
                utterance = preprocessor.process_event(event, kbs[event.agent], mentioned_entities)
                # Skip empty utterances
                if not utterance:
                    continue
                else:
                    utterance = utterance[0]
                    # Record canonical entity forms for later coref linking.
                    for token in utterance:
                        if is_entity(token):
                            mentioned_entities.add(token[1][0])
                    logstats.update_summary_map(
                        dialog_summary_map, {'utterance_length': len(utterance)})
                    check_fact(fact_summary_map, utterance, kbs[event.agent])
                    if lm:
                        logstats.update_summary_map(lm_summary_map, {
                            'score': lm.score(' '.join(entity_to_type(utterance)))
                        })
                    update_ngram_counts(ngram_counts, utterance)
                    if fout:
                        fout.write('%s\n' % (' '.join(entity_to_type(utterance))))
            else:
                raise ValueError('Unknown event action %s.'
                                 % event.action)
            # Per-event bookkeeping shared by select and message events.
            total_events += 1
            speech_act = get_speech_act(speech_act_summary_map, event, utterance)
            get_linguistic_template(template_summary_map, utterance)
            entities = [x[1] for x in utterance if is_entity(x)]
            dialog.append((event.agent, speech_act, entities, utterance))
        # Per-dialogue aggregation.
        get_dialog_stats(dialog_summary_map, utterance_counts, dialog)
        get_speech_act_histograms(speech_act_sequence_summary_map, dialog)
        get_entity_mention(entity_mention_summary_map, dialog, kbs)
        orders, mentioned_attrs, most_mentioned_label = get_kb_strategy(
            kbs, dialog)
        orders = tuple(orders)
        most_mentioned_attrs += alpha_labels_to_values[most_mentioned_label]
        # Histogram of attribute-mention orders, keyed by order length.
        if len(orders) not in kb_strategy_summary_map.keys():
            kb_strategy_summary_map[len(orders)] = {}
        if orders not in kb_strategy_summary_map[len(orders)].keys():
            kb_strategy_summary_map[len(orders)][orders] = 0.0
        kb_strategy_summary_map[len(orders)][tuple(orders)] += 1.0
        alphas = ex.scenario.alphas
        # NOTE(review): len()/len() is integer (floor) division under
        # Python 2 -- confirm a float ratio was not intended.
        num_attrs_mentioned += len(orders) / len(alphas)
        first_mentioned_label = NO_ALPHA_MENTION
        if len(orders) > 0:
            first_mentioned_label = orders[0]
        if len(mentioned_attrs) > 0:
            first_mentioned_type, first_mentioned_attr, first_agent = mentioned_attrs[
                0]
            update_item_stats(num_items_stats, first_mentioned_type,
                              first_mentioned_attr, kbs[first_agent])
            if first_mentioned_label != NO_ALPHA_MENTION:
                update_alpha_stats(alpha_stats, kbs[first_agent],
                                   first_mentioned_label)
            # print "First mentioned attribute alpha:", first_mentioned, alpha_labels_to_values[first_mentioned]
    if fout:
        fout.close()
    # Summarize stats
    total = float(total_events)
    kb_strategy_totals = {
        k1: sum(v2 for v2 in v1.values())
        for k1, v1 in kb_strategy_summary_map.items()
    }
    dialog_stats = {
        k: dialog_summary_map[k]['mean']
        for k in dialog_summary_map
    }
    dialog_stats['entity_type_token_ratio'] = dialog_summary_map[
        'num_entity_type_per_dialog']['sum'] / float(
            dialog_summary_map['num_entity_per_dialog']['sum'])
    # Unigram keys are 1-tuples; unwrap them for the vocab/entropy stats.
    unigram_counts = {k[0]: v for k, v in ngram_counts[1].iteritems()}
    dialog_stats['vocab_size'] = len(unigram_counts)
    dialog_stats['unigram_entropy'] = count_to_entropy(unigram_counts)
    # Fraction of events whose speech-act key has more than one element.
    multi_speech_act = sum([
        speech_act_summary_map[k] for k in speech_act_summary_map if len(k) > 1
    ]) / total
    return {
        'speech_act': {
            k: speech_act_summary_map[k] / total
            for k in speech_act_summary_map.keys()
        },
        'kb_strategy': {
            k1: {
                ", ".join(k2): v2 / kb_strategy_totals[k1]
                for k2, v2 in v1.items()
            }
            for k1, v1 in kb_strategy_summary_map.items()
        },
        'dialog_stats': dialog_stats,
        'lm_score': -1 if not lm else lm_summary_map['score']['mean'],
        'utterance_counts': utterance_counts,
        'ngram_counts': ngram_counts,
        'linguistic_templates': template_summary_map,
        'speech_act_sequences': speech_act_sequence_summary_map,
        'correct': fact_summary_map['correct']['mean'],
        'entity_mention': {
            k: np.mean(v)
            for k, v in entity_mention_summary_map['first'].iteritems()
        },
        'multi_speech_act': multi_speech_act,
        'alpha_stats': alpha_stats,
        'num_items_stats': num_items_stats
    }
def inc_coref(self):
    '''Increment the 'coref' counter in self.summary_map.'''
    logstats.update_summary_map(self.summary_map, dict(coref=1))