def _init_training(self, das_file, ttree_file, data_portion):
    """Prepare all data structures needed for training.

    Loads DAs and t-trees from the given files, keeps the first
    `data_portion` share of them (plus one empty tree/DA pair), builds the
    input matrix `self.X` and the output matrix `self.y`, records the I/O
    shapes and finally creates the neural network.
    """
    # load the raw data
    log_info('Reading DAs from ' + das_file + '...')
    all_das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    all_trees = trees_from_doc(read_ttrees(ttree_file), self.language, self.selector)

    # keep just the requested portion of the data
    train_size = int(round(data_portion * len(all_trees)))
    self.train_trees = all_trees[:train_size]
    self.train_das = all_das[:train_size]

    # one extra instance: empty tree paired with an empty DA
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    blank_da = DialogueAct()
    blank_da.parse('inform()')
    self.train_das.append(blank_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # input side: embeddings if configured, 1-hot features otherwise
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(t)
                           for t in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        tree_dicts = [self.tree_feats.get_features(t, {})
                      for t in self.train_trees]
        self.X = self.tree_vect.fit_transform(tree_dicts)

    # output side: 1-hot features over DA types and slot-value pairs
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    da_dicts = [self.da_feats.get_features(None, {'da': act})
                for act in self.train_das]
    self.y = self.da_vect.fit_transform(da_dicts)

    # I/O shapes for the network
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # build the classifier itself
    self._init_neural_network()
def answer_confirm(self, state, res0, slots_to_confirm):
    """Build the system reply to a user confirmation request.

    Every slot in `slots_to_confirm` is checked against the record `res0`.
    The returned DialogueAct is, in order of priority: `noclue(...)` items
    when the record lacks some slot, `affirm()` / `confirm(...)` when nothing
    conflicts, or `negate()` extended with `inform(...)` strings for the
    conflicting slots.

    NOTE(review): the `len(slot_value) > 1` check is assumed to be nested in
    the conflict branch; with that nesting the `confirm(...)` reply cannot be
    reached. Confirm against the intended layout.
    """
    conflicts = []
    selects = []
    noclues = []
    for slot, slot_value in slots_to_confirm.items():
        # clear the stored confirmation values for this slot
        state.user_state_confirm[slot] = []

        if not slot in res0:
            # the record has no entry for this slot
            noclues += [slot]
        elif not res0[slot].lower() in slot_value:
            # lower-cased record value is not among the values being confirmed
            conflicts += [slot]
            if len(slot_value) > 1:
                selects += [(slot, res0[slot], )]

    if len(noclues) > 0:
        return DialogueAct(
            "&".join(["noclue(%s)" % n for n in noclues]))
    elif len(conflicts) == 0:
        if len(selects) == 0:
            return DialogueAct("affirm()")
        else:
            return DialogueAct(
                "&".join(["confirm(%s=%s)" % (n, v, ) \
                          for n, v in selects]))
    else:
        # at least one conflict: negate and inform about the record's values
        conf_acts = []
        for conflict in conflicts:
            conf_acts += ["inform(%s='%s')" % (conflict, res0[conflict], )]
        res = DialogueAct("negate()")
        # XXX I understood sorting was not desired. Substituted with
        # the non-sorting version.
        # res.merge(DialogueAct("&".join(conf_acts)))
        # NOTE(review): conf_acts holds plain strings, not DialogueActItem
        # objects -- verify that DialogueAct.extend accepts them.
        res.extend(conf_acts)
        return res
def filter(self, in_da):
    """Keep only the hypotheses the policy can handle.

    Walks the scored input dialogue acts, drops items whose value or slot
    name is unknown to the policy or whose type is not supported, and keeps
    the reduced dialogue act when it is non-empty and scored at least 0.3.
    Returns a new DialogueActNBList.
    """
    value_checked = ["inform", "request"]
    slot_checked = ["inform", "request", "confirm"]
    accepted = ["inform", "request", "other", "confirm", "reqalts", "bye",
                "restart"]

    kept = DialogueActNBList()
    for hyp in in_da:
        reduced = DialogueAct()
        for dai in hyp[1]:
            # value must be known to the policy (when there is one)
            if (dai.dat in value_checked and dai.value is not None
                    and dai.value not in self.policy.values):
                continue
            # slot name must be known to the policy
            if dai.dat in slot_checked and dai.name not in self.policy.slots:
                continue
            # (an ontology regexp check on the value is currently disabled)
            if dai.dat in accepted:
                reduced.append(dai)

        # hypotheses scored below 0.3 are not considered
        if hyp[0] >= 0.3 and len(reduced) > 0:
            kept.add(hyp[0], reduced)

    return kept
def test_parse_X(self):
    # Trains a tiny DAINNClassifier on four utterances and checks that
    # parsing an n-best list gives non-zero probability to the weather task
    # and to the "now" time.
    from alex.components.slu.dainnclassifier import DAINNClassifier

    np.random.seed(0)

    cldb = CategoryLabelDatabase()

    class db:
        database = {
            "task": {
                "find_connection": ["najít spojení", "najít spoj", "zjistit spojení",
                                    "zjistit spoj", "hledám spojení", 'spojení', 'spoj',
                                    ],
                "find_platform": ["najít nástupiště", "zjistit nástupiště", ],
                'weather': ['pocasi', 'jak bude', ],
            },
            "number": {
                "1": ["jednu"]
            },
            "time": {
                "now": ["nyní", "teď", "teďka", "hned", "nejbližší", "v tuto chvíli", "co nejdřív"],
            },
        }

    cldb.load(db_mod=db)

    preprocessing = SLUPreprocessing(cldb)
    clf = DAINNClassifier(cldb, preprocessing, features_size=4)

    # training data: DA and utterance share the same key
    das = {
        '1': DialogueAct('inform(task=weather)'),
        '2': DialogueAct('inform(time=now)'),
        '3': DialogueAct('inform(task=weather)'),
        '4': DialogueAct('inform(task=connection)'),
    }
    utterances = {
        '1': Utterance('pocasi pocasi pocasi pocasi pocasi'),
        '2': Utterance('hned ted nyni hned ted nyni'),
        '3': Utterance('jak bude jak bude jak bude jak bude'),
        '4': Utterance('kdy a odkat mi to jede'),
    }

    # train
    clf.extract_classifiers(das, utterances, verbose=False)
    clf.prune_classifiers(min_classifier_count=0)
    clf.gen_classifiers_data(min_pos_feature_count=0,
                             min_neg_feature_count=0,
                             verbose2=False)
    clf.train(inverse_regularisation=1e1, verbose=False)

    # parse an n-best list of hypotheses
    utterance_list = UtteranceNBList()
    hypotheses = ((0.7, 'pocasi'), (0.7, 'jak bude pocasi'),
                  (0.2, 'hned'), (0.2, 'hned'))
    for score, text in hypotheses:
        utterance_list.add(score, Utterance(text))

    da_confnet = clf.parse_X(utterance_list, verbose=False)

    weather_prob = da_confnet.get_prob(DialogueActItem(dai='inform(task=weather)'))
    self.assertTrue(weather_prob != 0.0)
    now_prob = da_confnet.get_prob(DialogueActItem(dai='inform(time=now)'))
    self.assertTrue(now_prob != 0.0)
def read_das(da_file):
    """Load dialogue acts from `da_file`, which holds one DA per line.

    Returns the parsed DialogueAct objects as a list, in file order.
    """
    def parse_line(text):
        act = DialogueAct()
        act.parse(text)
        return act

    with file_stream(da_file) as stream:
        return [parse_line(text) for text in stream]
def _zero_act_return(self):
    """Return a one-element list holding a DA that informs all fixed slots.

    The fixed slot/value pairs are read from the metadata of the current
    goal. `self.slot_level_used` is raised from 0 to 1 as a side effect.
    """
    answer = DialogueAct()
    goal_description = self.metadata['goals'][self.goal_id]
    for slot, value in goal_description['fixed_slots']:
        answer.append(DialogueActItem('inform', slot, value))

    if self.slot_level_used == 0:
        self.slot_level_used = 1
    return [answer]
def test_dialog(self):
    # Drives a rule-based DM built on the test ontology/database through a
    # short dialogue and checks the type of every system reply.
    # Uses assertEqual: the assertEquals alias is deprecated.
    ontology_file = script_path(__file__, 'test_ruledm_data', 'ontology.cfg')
    db_file = script_path(__file__, 'test_ruledm_data', 'data.txt')

    class TRuleDMPolicy(DRuleDMPolicy):
        db_cls = CamInfoDb

    class TRuleDM(RuleDM):
        policy_cls = TRuleDMPolicy

    dm = TRuleDM({
        'DM': {
            'ontology': ontology_file,
            'TRuleDM': {
                'db_cfg': db_file,
                'provide_code': False,
                'code_submit_url': None
            },
            'dialogue_state': {
                'type': DRuleDS,
            },
            'dialogue_policy': {
                'type': TRuleDMPolicy,
            },
        },
        'Logging': {
            'system_logger': DummyLogger()
        }
    })

    # without any user input the DM keeps greeting
    turn_init = dm.da_out()
    self.assertEqual(turn_init.has_dat("hello"), True)

    turn_init = dm.da_out()
    self.assertEqual(turn_init.has_dat("hello"), True)

    dm.da_in([(1.0, DialogueAct("inform(food=chinese)"))])
    turn_1 = dm.da_out()
    self.assertEqual(turn_1.has_dat("inform"), True)

    # an unknown act type must yield notunderstood
    dm.da_in([(1.0, DialogueAct("asdf(fdsa=asdf)"))])
    turn_2 = dm.da_out()
    self.assertEqual(turn_2.has_dat("notunderstood"), True)

    dm.da_in([(1.0, DialogueAct("inform(price=cheap)"))])
    turn_3 = dm.da_out()
    self.assertEqual(turn_3.has_dat("nomatch"), True)

    dm.da_in([(1.0, DialogueAct("bye()"))])
    turn_bye = dm.da_out()
    self.assertEqual(turn_bye.has_dat("bye"), True)
def parse_input_da(self, l):
    """Convert a text line into a (probability, DialogueAct) pair.

    The input text must have the following form:

        [prob] the dialogue act

    The probability is optional and defaults to 1.0. When the text before
    the first space cannot be read as a float, the whole input is taken as
    the dialogue act.

    Raises SemHubException when the dialogue act cannot be parsed.
    """
    prob = 1.0
    da = l

    ri = l.find(" ")
    if ri != -1:
        try:
            prob = float(l[:ri])
        except ValueError:
            # The first part is not a number, so all of the input is a DA.
            pass
        else:
            da = l[ri + 1:]

    try:
        da = DialogueAct(da)
    except (DialogueActException, DialogueActItemException):
        # `da` is still the unparsed text here; include it in the message
        raise SemHubException("Invalid dialogue act: %s" % da)

    return prob, da
def say_query(self, state):
    """Return a DA with one `want(slot=value)` item per query constraint.

    The query is obtained from `self.build_query(state)`.
    """
    query = self.build_query(state)
    wants = []
    for slot, value in query.items():
        wants.append("want(%s=%s)" % (slot, value))
    return DialogueAct("&".join(wants))
def test_tecto_template_nlg(self):
    # Generates text for every DA in DAS and compares it with the matching
    # entry of TEXTS.
    cfg = Config.load_configs(config=CONFIG_DICT, use_default=False,
                              log=False)
    nlg = TectoTemplateNLG(cfg)

    for da_str, expected in zip(DAS, TEXTS):
        da = DialogueAct(da_str)
        actual = nlg.generate(da)

        # human-readable report of this case (built but not printed)
        report = [
            "",
            "Input DA:",
            unicode(da),
            "",
            "Correct text:",
            expected,
            "",
            "Generated text:",
            actual,
            "",
        ]

        self.assertEqual(expected, actual)
def main(input_file, filter_threshold):
    """Convert a tab-separated utterance list into CSV task data on stdout.

    Each input line must hold three tab-separated fields: occurrence count,
    utterance and dialogue act. Lines that are malformed, occur fewer than
    `filter_threshold` times, or consist only of slot placeholders are
    skipped; the rest go through `process_utt`. Progress is logged to stderr
    and the collected rows are written to stdout as tab-delimited UTF-8 CSV.
    """
    data = [['type', 'abstr_utt', 'abstr_da', 'utt', 'da']]  # output headers
    with codecs.open(input_file, "r", 'UTF-8') as fh:
        for line in fh:
            line = line.strip()
            print >> sys.stderr, 'Processing: ', line
            fields = line.split('\t')
            if len(fields) != 3:
                # malformed line: skip it instead of crashing on unpacking
                print >> sys.stderr, 'Invalid input format, skipping'
                continue
            occ_num, utt, da = fields
            da = DialogueAct(da_str=da)
            occ_num = int(occ_num)

            if occ_num < filter_threshold:
                print >> sys.stderr, 'Input "%s" has only %d occurrences, skipping' % (
                    utt, occ_num)
                continue
            if re.match(r'^(\*[A-Z_]+)(\s+\*[A-Z_]+)*$', utt):
                print >> sys.stderr, 'Input "%s" only contains slots, skipping' % utt
                continue

            try:
                ret = process_utt(utt, da)
                print >> sys.stderr, 'Result:', "\n".join(
                    ["\t".join(i) for i in ret])
                print >> sys.stderr, ''
                data.extend(ret)
            except NotImplementedError as e:
                print >> sys.stderr, 'Error:', e

    with codecs.getwriter('utf-8')(sys.stdout) as fh:
        csvwrite = csv.writer(fh, delimiter=b"\t")
        for line in data:
            csvwrite.writerow(line)
def epilogue_final_code(self):
    """Register a final code on the server and tell it to the user.

    Codes are taken from the front of `self.codes` and put back at its end
    for reuse. Up to 10 codes are tried until the server answers with
    `response == 'success'`. The resulting `say(text=...)` act is logged to
    the session logger and sent to the hub, and `self.final_code_given` is
    set.

    The original loop condition mixed `and`/`or` without parentheses, so the
    attempt limit never stopped the loop and a success on the last attempt
    was reported as a failure; both are fixed here.
    """
    max_attempts = 10
    url_template = self.cfg['DM']['epilogue']['final_code_url']
    system_logger = self.cfg['Logging']['system_logger']

    code = None
    stored = False
    attempts = 0
    # store a code on the server (try several times if not successful)
    while attempts < max_attempts and not stored:
        code = self.codes.popleft()
        self.codes.append(code)  # put the code back to the end of the queue for reuse
        attempts += 1
        # pull the URL
        url = url_template.format(code=code, logdir=system_logger.get_session_dir_name())
        data = urllib2.urlopen(url).read()
        data = json.loads(data, encoding='UTF-8')
        stored = isinstance(data, dict) and data.get('response') == 'success'

    if not stored:
        # This shouldn't happen
        text = 'I am sorry. A valid code could not be generated'
    else:
        # spell the code character by character and repeat it three times
        text = [c for c in code]
        text = ", ".join(text)
        text = self.cfg['DM']['epilogue']['final_code_text'].format(code=text)
        text = [text, ] * 3
        text = self.cfg['DM']['epilogue']['final_code_text_repeat'].join(text)

    da = DialogueAct('say(text="{text}")'.format(text=text))
    self.cfg['Logging']['session_logger'].dialogue_act("system", da)
    self.commands.send(DMDA(da, 'DM', 'HUB'))
    self.final_code_given = True
def say_inform(self, state, rec):
    """Return a DA informing about the record's name and one more slot.

    The extra slot is the interesting slot for `state`, falling back to
    `self.slots[0]`; when the record lacks that slot, the record's first key
    is used instead.
    """
    extra_slot = self.get_interesting_slot(state) or self.slots[0]
    if extra_slot not in rec:
        extra_slot = rec.keys()[0]

    name_slot = 'name'
    da_str = "inform(%s='%s')&inform(%s='%s')" % (
        name_slot, rec[name_slot], extra_slot, rec[extra_slot])
    return DialogueAct(da_str)
def test_swapping_merge_normalise(self):
    # merge().normalise() and normalise().merge() must give equal n-best
    # lists when applied to two copies of the same list.
    merged_first = DialogueActNBList()
    merged_first.add(0.7, DialogueAct("hello()"))
    merged_first.add(0.2, DialogueAct("bye()"))
    normalised_first = deepcopy(merged_first)

    merged_first.merge().normalise()
    normalised_first.normalise().merge()

    # human-readable report (built but not printed)
    report = [
        "",
        "Using merge().normalise():",
        unicode(merged_first),
        "",
        "Using normalise().merge():",
        unicode(normalised_first),
        "",
    ]

    self.assertEqual(merged_first, normalised_first)
def compose_utterance_greedy(self, da):
    """\
    Build an utterance by greedily covering the DA with templates.

    At each position the longest sub-DA (at most
    self.compose_greedy_lookahead items) is tried first, looking for an
    exact template and then for a generic one; shorter sub-DAs follow.
    When nothing matches, the single item is stringified as a fallback.

    Returns the composed utterance.
    """
    pieces = []
    pos = 0
    while pos < len(da):
        found_utt = None
        found_len = None

        # longest candidate first, down to a single item
        for span in xrange(self.compose_greedy_lookahead, 0, -1):
            sub_da = DialogueAct()
            sub_da.extend(da[pos:pos + span])
            try:
                # exact template match
                candidate = self.random_select(self.templates[unicode(sub_da)])
            except KeyError:
                # relaxed (generic) match
                slot_values = sub_da.get_slots_and_values()
                try:
                    candidate = self.match_and_fill_generic(sub_da, slot_values)
                except TemplateNLGException:
                    # nothing for this length: try a shorter one
                    continue
            found_utt = candidate
            found_len = span
            break

        if found_utt is None:
            # dummy backoff: the item's own string form
            found_utt = unicode(da[pos])
            found_len = 1

        pieces.append(found_utt)
        pos += found_len

    return ' '.join(pieces)
def test_random_dialogues(user):
    """Run 100 random dialogues against the user simulator `user`.

    For every dialogue the system acts listed in the goal's metadata are
    built with sampled slots/values, fed to the simulator, and both sides
    are printed. Raises RuntimeError when the simulator's first returned DA
    is empty.
    """
    metadata = get_metadata()
    for i in range(100):
        print '=======================Dialogue %i============================'%(i+1)
        user.new_dialogue()
        print 'Goal:', user.goal
        print '-'*60
        goal_des = metadata['goals'][user.goal['task']]
        ordered_acts = goal_des['acts']
        slots = goal_des['slots']
        for acts in ordered_acts:
            # one system DA per entry; items are joined by '&'
            da = DialogueAct()
            for act in acts.split('&'):
                act_des = metadata['act_definitions'][act]
                slot = None
                if act_des['slot_included']:
                    slot = sample_from_list(slots)
                value = None
                if act_des['value_included']:
                    if slot not in user.goal.keys():
                        # try an equivalent slot that the goal does contain
                        for s in get_equivalent_slots(goal_des, slot):
                            if s in user.goal.keys():
                                slot = s
                                break
                    if slot in user.goal.keys():
                        # use the goal's value or the literal 'lct', 50:50
                        if sample_a_prob(0.5):
                            value = user.goal[slot]
                        else:
                            value = 'lct'
                    else:
                        value = 'lct'

                item = DialogueActItem(act, slot, value)
                da.append(item)
            print 'sys_da:\t\t', da
            user.da_in(da)
            da = user.da_out()
            print 'user_da:\t', da[0]
            if len(da[0])==0:
                raise RuntimeError('User simulator doesnt reply anything!!')
                # NOTE(review): placement assumed; here the debugger call is
                # unreachable after the raise -- confirm the intended position.
                pdb.set_trace()
def test_merge_slu_nblists_full_nbest_lists(self): # make sure the alex.components.slu.da.merge_slu_nblists merges nblists correctly nblist1 = DialogueActNBList() nblist1.add(0.7, DialogueAct("hello()")) nblist1.add(0.2, DialogueAct("bye()")) nblist1.merge().normalise() # nblist1.normalise() nblist2 = DialogueActNBList() nblist2.add(0.6, DialogueAct("hello()")) nblist2.add(0.3, DialogueAct("restart()")) nblist2.merge().normalise() # nblist2.normalise() nblists = [[0.7, nblist1], [0.3, nblist2]] merged_nblists = merge_slu_nblists(nblists) correct_merged_nblists = DialogueActNBList() correct_merged_nblists.add(0.7 * 0.7, DialogueAct("hello()")) correct_merged_nblists.add(0.7 * 0.2, DialogueAct("bye()")) correct_merged_nblists.add(0.7 * 0.1, DialogueAct("other()")) correct_merged_nblists.add(0.3 * 0.6, DialogueAct("hello()")) correct_merged_nblists.add(0.3 * 0.3, DialogueAct("restart()")) correct_merged_nblists.add(0.3 * 0.1, DialogueAct("other()")) correct_merged_nblists.merge().normalise() # correct_merged_nblists.normalise() s = [] s.append("") s.append("Merged nblists:") s.append(unicode(merged_nblists)) s.append("") s.append("Correct merged results:") s.append(unicode(correct_merged_nblists)) s.append("") print '\n'.join(s) self.assertEqual(unicode(merged_nblists), unicode(correct_merged_nblists))
def say_slots(self, state, res0, slots_to_say):
    """Return a DA informing about each slot in `slots_to_say`.

    Values come from the record `res0` ('dontknow' when missing). Every
    slot is also flagged True in `state.user_state_request`.
    """
    informs = []
    for slot in slots_to_say:
        value = res0.get(slot, 'dontknow')
        informs.append("inform(%s='%s')" % (slot, value))
        state.user_state_request[slot] = True

    return DialogueAct("&".join(informs))
def test_reply(user):
    """Send one `implconfirm` + `request(to_stop)` DA to the simulator.

    Starts a new dialogue on `user`, prints the goal, the system DA and the
    first DA returned by the simulator.
    """
    user.new_dialogue()
    print 'GOAL', user.goal
    act_type = 'implconfirm'
    slots = ['from_stop', 'from_street', 'from_city']
    act_slot = None
    act_value = None
    # pick the first of the candidate slots that the goal contains
    for slot in slots:
        if slot in user.goal.keys():
            act_slot = slot
            act_value = user.goal[slot]
            break

    # NOTE(review): placement assumed to be after the loop, which overrides
    # the value taken from the goal -- confirm this is intended.
    act_value='abc'

    da = DialogueAct()
    item = DialogueActItem(act_type, act_slot, act_value)
    da.append(item)
    item = DialogueActItem('request', 'to_stop')
    da.append(item)

    print 'sys_da:', da
    user.da_in(da)
    dao = user.da_out()
    print 'user_da:', dao[0]
def _build_da_nbest_list(self, i, da, prob):
    """Recursively enumerate all combinations of the sampled DA items.

    `self._sampled_da_items[i]` holds a pair (items, probs). For each item
    at level `i` the partial DA `da` is copied, extended by that item and
    passed to level `i + 1` with the probabilities multiplied. Once all
    levels are consumed, the DA is added to `self._nbest_list`.
    """
    if i >= len(self._sampled_da_items):
        # every level consumed: store the finished hypothesis
        self._nbest_list.add(da, prob)
        return

    da_items, probs = self._sampled_da_items[i]
    for idx, dai in enumerate(da_items):
        extended = DialogueAct()
        if da is not None:
            extended.extend(da)
        extended.append(dai)

        if da is None:
            # first item of the hypothesis: start from its own probability
            next_prob = probs[idx]
        else:
            # TODO check the equation and fix it when there are more types
            # of confusion
            next_prob = prob * probs[idx]
        self._build_da_nbest_list(i + 1, extended, next_prob)
def test_get_da_nblist(self):
    # The best DA of the n-best list derived from a confusion network must
    # contain exactly the highly probable items.
    def best_da_for(scored_items):
        dacn = DialogueActConfusionNetwork()
        for prob, dai_str in scored_items:
            dacn.add(prob, DialogueActItem(dai=dai_str))
        nblist = dacn.get_da_nblist()
        return nblist.get_best_da()

    # Simple case with one good hypothesis.
    best_da = best_da_for([
        (0.05, 'inform(food=chinese)'),
        (0.9, 'inform(food=czech)'),
        (0.05, 'inform(food=russian)'),
    ])
    expected_da = DialogueAct(da_str='inform(food=czech)')
    self.assertEqual(best_da, expected_da)

    # More good hypotheses
    best_da = best_da_for([
        (0.05, 'inform(food=chinese)'),
        (0.9, 'inform(food=czech)'),
        (0.9, 'inform(food=russian)'),
    ])
    expected_da = DialogueAct(
        da_str='inform(food=czech)&inform(food=russian)')
    self.assertEqual(best_da, expected_da)
def compose_utterance_single(self, da):
    """\
    Build an utterance by realising every dialogue act item on its own.

    For each item an exact template is tried first, then a generic one;
    when neither exists the item's string form is used.

    Returns the composed utterance.
    """
    def realise(dai):
        try:
            # exact template match
            return self.random_select(self.templates[unicode(dai)])
        except KeyError:
            # relaxed match on a one-item dialogue act
            single = DialogueAct()
            single.append(dai)
            slot_values = single.get_slots_and_values()
            try:
                return self.match_and_fill_generic(single, slot_values)
            except TemplateNLGException:
                return unicode(dai)

    pieces = []
    for dai in da:
        pieces.append(realise(dai))
    return ' '.join(pieces)
def _build_da_nbest_list(self, i, da, prob):
    '''Enumerate every combination of the DialogueActItems and probs saved
    in self._sampled_da_items and add each to self._nbest_list.

    Level `i` of self._sampled_da_items is a pair (items, probs); the
    partial DA `da` is copied and extended by one item per level.
    Currently, not being used.
    '''
    if i >= len(self._sampled_da_items):
        # every level consumed: store the finished hypothesis
        self._nbest_list.add(da, prob)
        return

    da_items, probs = self._sampled_da_items[i]
    for idx, dai in enumerate(da_items):
        extended = DialogueAct()
        if da is not None:
            extended.extend(da)
        extended.append(dai)

        if da is None:
            # first item of the hypothesis: start from its own probability
            next_prob = probs[idx]
        else:
            # TODO check the equation and fix it when there are more types
            # of confusion
            next_prob = prob * probs[idx]
        self._build_da_nbest_list(i + 1, extended, next_prob)
def epilogue_final_code(self):
    """Take a code, register it via the configured URL and say it.

    The code is popped from `self.codes`, spelled character by character,
    wrapped in the configured text and repeated three times. The resulting
    `say(text=...)` act is logged, sent to the hub, and
    `self.final_code_given` is set.
    """
    code = self.codes.pop()
    epilogue_cfg = self.cfg['DM']['epilogue']

    # pull the url
    urllib.urlopen(epilogue_cfg['final_code_url'].format(code = code))

    spelled = ", ".join([char for char in code])
    sentence = self.cfg['DM']['epilogue']['final_code_text'].format(code = spelled)
    separator = self.cfg['DM']['epilogue']['final_code_text_repeat']
    text = separator.join([sentence, sentence, sentence])

    da = DialogueAct('say(text="{text}")'.format(text=text))
    self.cfg['Logging']['session_logger'].dialogue_act("system", da)
    self.commands.send(DMDA(da, 'DM', 'HUB'))
    self.final_code_given = True
def _get_answer_da(self, da_in):
    '''Answer a system dialogue act.

    For every act type found in the metadata of `da_in`, a reply definition
    is sampled from self.metadata['reply_system_acts'], answer items are
    built and (possibly with a sampled probability) added to the output DA.
    A per-act, per-slot counter in self.patience_history is updated along
    the way; once it exceeds self.patience_level the reply may be replaced
    by the configured out-of-patience act.

    NOTE(review): block nesting below was reconstructed -- confirm it.
    '''
    da_out = DialogueAct()
    out_of_patience=False

    reply_sys_acts = self.metadata['reply_system_acts']
    da_metadata = self._get_dialogue_act_metadata(da_in)
    for act_in in da_metadata.keys():
        #debug_print('------Handling the sys_act' + act_in)
        #print '------Handling the sys_act', act_in
        reply = reply_sys_acts[act_in]
        if isinstance(reply, dict):# this act has a different definition for each goal
            reply = reply[self.goal_id]
        answer = self._sample_element_from_list_dict(reply)
        if 'ordered_return_acts' in answer:# process the answers in order and stop at the first applicable one
            # NOTE(review): da_items is only assigned inside this loop, so an
            # empty 'ordered_return_acts' list leaves it unset or stale.
            for solution in answer['ordered_return_acts']:
                case = self._sample_element_from_list_dict(solution)
                da_items = self._build_one_answer(da_metadata[act_in], case, True)
                if len(da_items)>0:
                    answer = case# used below to filter acts with the add_to_da_prob property
                    break
        else:
            da_items = self._build_one_answer(da_metadata[act_in], answer)

        for item in da_items:# an item may or may not be added to da_out (e.g. impl_confirm)
            act_out_des = self._get_act_out_description(item.dat, answer)
            if 'add_to_da_prob' in act_out_des.keys():
                if sample_a_prob(act_out_des['add_to_da_prob']) and item not in da_out:
                    da_out.append(item)
            else:
                if item not in da_out:
                    da_out.append(item)
            #-------update patience history
            if item.name is not None:# item has a slot: count how often this system act was answered for it
                if act_in not in self.patience_history.keys():
                    self.patience_history[act_in] = {}
                if item.name not in self.patience_history[act_in]:
                    self.patience_history[act_in][item.name]=1
                else:
                    self.patience_history[act_in][item.name]+=1
                if self.patience_level>=1 and self.patience_history[act_in][item.name]>self.patience_level:
                    out_of_patience = True
                    break#only break the inner loop
        #da_out.extend(da_items)
    if out_of_patience:
        # replace the reply with the out-of-patience act in about half the cases
        if random.random()>0.5:
            da_out = DialogueAct(self.config['out_of_patience_act'])
            print '!!!!ANGRY...'
        else:
            print '!!Almost ANGRY...'
    return da_out
def load_templates(self, file_name):
    """\
    Read NLG templates from a Python source file.

    The file must define the variable 'templates': a dictionary mapping
    stringified dialogue acts to (lists of) templates. Keys are normalised
    through DialogueAct; a second table indexed by the generic form of each
    dialogue act is filled as well.

    Raises TemplateNLGException when anything goes wrong while loading.
    """
    try:
        loaded = load_as_module(file_name, force=True).templates

        exact = {}    # normalised templates
        generic = {}  # generalised templates
        self.templates = exact
        self.gtemplates = generic

        for da_str, texts in loaded.iteritems():
            da = DialogueAct(da_str)
            exact[unicode(da)] = texts
            generic[unicode(self.get_generic_da(da))] = (da, texts)

    except Exception as e:
        raise TemplateNLGException('No templates loaded from %s -- %s!' % (file_name, e))
def test_template_nlg(self):
    # The sorted DA affirm() & inform(num_transfers="2") must be realised
    # as the expected Czech sentence.
    nlg = TemplateNLG(self.cfg)

    da = DialogueAct('affirm()&inform(num_transfers="2")').sort()
    expected = u"Ano. Na dané trase jsou dva přestupy."
    actual = nlg.generate(da)

    # human-readable report (built but not printed)
    report = [
        "",
        "Input DA:",
        unicode(da),
        "",
        "Correct text:",
        unicode(expected),
        "",
        "Generated text:",
        unicode(actual),
        "",
    ]

    self.assertEqual(unicode(expected), unicode(actual))
def test_template_nlg_r(self):
    # The sorted DA affirm() & inform(from_stop=Sparta) must be realised
    # as the expected Czech sentence.
    nlg = TemplateNLG(self.cfg)

    da = DialogueAct('affirm()&inform(from_stop=Sparta)').sort()
    expected = "Ano, jede to ze zastávky Sparta."
    actual = nlg.generate(da)

    # human-readable report (built but not printed)
    report = [
        "",
        "Input DA:",
        unicode(da),
        "",
        "Correct text:",
        unicode(expected),
        "",
        "Generated text:",
        unicode(actual),
        "",
    ]

    self.assertEqual(unicode(expected), unicode(actual))
def answer_confirm(self, state, res0, slots_to_confirm):
    """Build the system reply to a user confirmation request.

    Every slot in `slots_to_confirm` is checked against the record `res0`.
    The returned DialogueAct is, in order of priority: `noclue(...)` items
    when the record lacks some slot, `affirm()` / `confirm(...)` when nothing
    conflicts, or `negate()` extended with `inform(...)` strings for the
    conflicting slots.

    NOTE(review): the `len(slot_value) > 1` check is assumed to be nested in
    the conflict branch; with that nesting the `confirm(...)` reply cannot be
    reached. Confirm against the intended layout.
    """
    conflicts = []
    selects = []
    noclues = []
    for slot, slot_value in slots_to_confirm.items():
        # clear the stored confirmation values for this slot
        state.user_state_confirm[slot] = []

        if not slot in res0:
            # the record has no entry for this slot
            noclues += [slot]
        elif not res0[slot].lower() in slot_value:
            # lower-cased record value is not among the values being confirmed
            conflicts += [slot]
            if len(slot_value) > 1:
                selects += [(
                    slot,
                    res0[slot], )]

    if len(noclues) > 0:
        return DialogueAct("&".join(["noclue(%s)" % n for n in noclues]))
    elif len(conflicts) == 0:
        if len(selects) == 0:
            return DialogueAct("affirm()")
        else:
            return DialogueAct(
                "&".join(["confirm(%s=%s)" % (n, v, ) \
                          for n, v in selects]))
    else:
        # at least one conflict: negate and inform about the record's values
        conf_acts = []
        for conflict in conflicts:
            conf_acts += [
                "inform(%s='%s')" % (
                    conflict,
                    res0[conflict], )
            ]
        res = DialogueAct("negate()")
        # XXX I understood sorting was not desired. Substituted with
        # the non-sorting version.
        # res.merge(DialogueAct("&".join(conf_acts)))
        # NOTE(review): conf_acts holds plain strings, not DialogueActItem
        # objects -- verify that DialogueAct.extend accepts them.
        res.extend(conf_acts)
        return res
def test_template_nlg_r(self): cfg = self.cfg nlg = TemplateNLG(cfg) da = DialogueAct('affirm()&inform(pricerange="cheap")&inform(task="find")').sort() correct_text = "Ok, you are looking for something in the cheap price range." generated_text = nlg.generate(da) s = [] s.append("") s.append("Input DA:") s.append(unicode(da)) s.append("") s.append("Correct text:") s.append(unicode(correct_text)) s.append("") s.append("Generated text:") s.append(unicode(generated_text)) s.append("") print '\n'.join(s) self.assertEqual(unicode(correct_text), unicode(generated_text))
def parse_1_best(self, obs, verbose=False, *args, **kwargs):
    """Parse an utterance into a dialogue act.

    The utterance is read from obs['utt']. When its string form is a key of
    self.utt2da, the stored dialogue act is returned directly with
    probability 1.0. Otherwise the utterance is normalised and abstracted
    and the individual parse_* handlers fill the confusion network.

    NOTE(review): block nesting below was reconstructed; in particular
    parse_meta is assumed to run only when no non-speech event was found.

    :rtype DialogueActConfusionNetwork
    """
    utterance = obs['utt']

    if isinstance(utterance, UtteranceHyp):
        # Parse just the utterance and ignore the confidence score.
        utterance = utterance.utterance

    if verbose:
        print 'Parsing utterance "{utt}".'.format(utt=utterance)

    res_cn = DialogueActConfusionNetwork()

    # shortcut: direct utterance -> DA mapping
    dict_da = self.utt2da.get(unicode(utterance), None)
    if dict_da:
        for dai in DialogueAct(dict_da):
            res_cn.add(1.0, dai)
        return res_cn

    utterance = self.preprocessing.normalise_utterance(utterance)
    abutterance, category_labels = self.abstract_utterance(utterance)

    if verbose:
        print 'After preprocessing: "{utt}".'.format(utt=abutterance)
        print category_labels

    self.parse_non_speech_events(utterance, res_cn)

    # remove the non-speech markers from both forms of the utterance
    utterance = utterance.replace_all(['_noise_'], '').replace_all(
        ['_laugh_'], '').replace_all(['_ehm_hmm_'], '').replace_all(['_inhale_'], '')
    abutterance = abutterance.replace_all(['_noise_'], '').replace_all(
        ['_laugh_'], '').replace_all(['_ehm_hmm_'], '').replace_all(['_inhale_'], '')

    abutterance = self.handle_false_abstractions(abutterance)
    # these three labels are added unconditionally, so their parsers below
    # always run
    category_labels.add('CITY')
    category_labels.add('VEHICLE')
    category_labels.add('NUMBER')

    if len(res_cn) == 0:
        if 'STOP' in category_labels:
            self.parse_stop(abutterance, res_cn)
        if 'CITY' in category_labels:
            self.parse_city(abutterance, res_cn)
        if 'NUMBER' in category_labels:
            # called without res_cn, unlike the other handlers;
            # presumably it rewrites abutterance in place -- TODO confirm
            self.parse_number(abutterance)
            if any([word.startswith("TIME") for word in abutterance]):
                category_labels.add('TIME')
        if 'TIME' in category_labels:
            self.parse_time(abutterance, res_cn)
        if 'DATE_REL' in category_labels:
            self.parse_date_rel(abutterance, res_cn)
        if 'AMPM' in category_labels:
            self.parse_ampm(abutterance, res_cn)
        if 'VEHICLE' in category_labels:
            self.parse_vehicle(abutterance, res_cn)
        if 'TASK' in category_labels:
            self.parse_task(abutterance, res_cn)

        self.parse_meta(utterance, res_cn)

    res_cn.merge()

    return res_cn
def generate_task():
    """Generate one random task as a list of DialogueActs.

    The first DA asks for a connection between two distinct random stops.
    Then 4 or 5 of the 6 entries of QUESTIONS are sampled and added in
    index order; the first one that changes the query ('alternative',
    'vehicle', 'time' or 'to_stop') starts a new DA. A 'to_stop' question
    gets a random stop different from both original stops.

    The items taken from QUESTIONS are deep-copied before use, so the
    module-level QUESTIONS is never modified and tasks do not share items.
    """
    import copy  # local import: needed only to protect the shared QUESTIONS

    task = []
    da = DialogueAct()
    # indicate that we're looking for connection
    da.append(DialogueActItem('inform', 'task', 'find_connection'))

    # get two distinct stops
    from_stop = random.choice(STOPS)
    to_stop = from_stop
    while to_stop == from_stop:
        to_stop = random.choice(STOPS)
    da.append(DialogueActItem('inform', 'from_stop', from_stop))
    da.append(DialogueActItem('inform', 'to_stop', to_stop))
    task.append(da)

    # generate random subsequent questions
    questions = random.sample(range(6), random.randint(5, 6) - len(task))

    query_change = False
    da = DialogueAct()
    for question in sorted(questions):
        # work on a private copy: the value may be rewritten below
        dais = copy.deepcopy(QUESTIONS[question])

        if dais[0].name in ['alternative', 'vehicle', 'time', 'to_stop'] and not query_change:
            # first query-changing question: close the current DA
            query_change = True
            task.append(da)
            da = DialogueAct()

        if dais[0].name == 'to_stop':
            new_to_stop = random.choice(STOPS)
            while new_to_stop == from_stop or new_to_stop == to_stop:
                new_to_stop = random.choice(STOPS)
            dais[0].value = new_to_stop

        da.extend(dais)

    task.append(da)
    return task
def main(args):
    """Convert a tab-separated utterance list into CSV task data on stdout.

    Reads `args.input_file` (occurrence count, utterance, DA per line),
    filters lines by `args.filter_threshold` and by slot-only content, runs
    `process_utt` on the rest and collects the resulting lines. When
    `args.load_finished` is given, already finished lines are loaded and
    either reused or (with `args.skip_finished`) skipped. Duplicate
    signatures are written only once. Progress and final statistics go to
    stderr.
    """
    data = []
    good_toks, good_types = 0, 0  # good contexts, useful for tasks
    fthr_toks, fthr_types = 0, 0  # filtered because of threshold
    fslt_toks, fslt_types = 0, 0  # filtered as they only contain slots
    frep_toks, frep_types = 0, 0  # filtered because no reply can be generated
    finished = {}

    with codecs.open(args.input_file, "r", 'UTF-8') as fh:
        for line in fh:
            print >> sys.stderr, 'Processing: ', line.strip()
            fields = line.strip().split('\t')
            # stripping may eat empty leading/trailing fields, so the number
            # of fields is checked as well as the number of tabs
            if line.count("\t") != 2 or len(fields) != 3:
                print >> sys.stderr, 'Invalid input format, skipping'
                continue
            occ_num, utt, da = fields
            da = DialogueAct(da_str=da)
            occ_num = int(occ_num)

            if occ_num < args.filter_threshold:
                print >> sys.stderr, 'Input "%s" has only %d occurrences, skipping' % (
                    utt, occ_num)
                fthr_toks += occ_num
                fthr_types += 1
                continue
            if re.match(r'^(\*[A-Z_]+)(\s+\*[A-Z_]+)*$', utt):
                print >> sys.stderr, 'Input "%s" only contains slots, skipping' % utt
                fslt_toks += occ_num
                fslt_types += 1
                continue

            try:
                ret = process_utt(utt, da)
                if not ret:
                    frep_toks += occ_num
                    frep_types += 1
                else:
                    good_toks += occ_num
                    good_types += 1

                print >> sys.stderr, 'Result:', "\n".join(
                    unicode(line) for line in ret)
                print >> sys.stderr, ''
                if args.occ_nums:
                    for ret_line in ret:
                        ret_line.occ_num = occ_num
                data.extend(ret)
            except NotImplementedError as e:
                frep_toks += occ_num
                frep_types += 1
                print >> sys.stderr, 'Error:', e

    if args.load_finished:
        with codecs.open(args.load_finished, "r", 'UTF-8') as fh:
            csvread = csv.reader(fh, delimiter=str(args.finished_csv_delim),
                                 quotechar=b'"')
            columns = DataLine.get_columns_from_header(csvread.next())
            for row in csvread:
                finished_line = DataLine.from_csv_line(row, columns)
                finished[finished_line.signature] = finished_line

    written = {}
    with codecs.getwriter('utf-8')(sys.stdout) as fh:
        # starting with the header
        csvwrite = csv.writer(fh, delimiter=b"\t", lineterminator="\n")
        csvwrite.writerow(DataLine.get_headers(args.occ_nums))
        for line in data:
            if line.signature in written:
                # some lines may be duplicate, skip them
                print >> sys.stderr, 'Duplicate line:', line.signature
                continue
            # skip finished results (if they are loaded and if they should be skipped)
            if line.signature in finished:
                if finished[line.signature].slots != line.slots:
                    # print the parts separately; wrapping them in
                    # parentheses printed a tuple repr under Python 2
                    print >> sys.stderr, 'Slots changed for', line.signature, '-- ignoring finished.'
                    csvwrite.writerow(line.as_tuple(args.occ_nums))
                elif not args.skip_finished:
                    finished[line.signature].occ_num = line.occ_num
                    csvwrite.writerow(finished[line.signature].as_tuple(
                        args.occ_nums))
            # default case: not found among finished
            else:
                csvwrite.writerow(line.as_tuple(args.occ_nums))
            written[line.signature] = line

    print >> sys.stderr, (
        "\n\nGood: %d / %d\nThreshold: %d / %d\nSlots: %d / %d\nReply: %d / %d" %
        (good_toks, good_types, fthr_toks, fthr_types, fslt_toks, fslt_types,
         frep_toks, frep_types))
def say_notunderstood(self):
    """Return the `notunderstood()` dialogue act."""
    act_str = "notunderstood()"
    return DialogueAct(act_str)
def generate_task():
    """Generate one random task as a list of DialogueActs.

    The first DA asks for a connection between two distinct random stops.
    Then 4 or 5 of the 6 entries of QUESTIONS are sampled and added in
    index order; the first one that changes the query ('alternative',
    'vehicle', 'time' or 'to_stop') starts a new DA. A 'to_stop' question
    gets a random stop different from both original stops.

    The items taken from QUESTIONS are deep-copied before use, so the
    module-level QUESTIONS is never modified and tasks do not share items.
    """
    import copy  # local import: needed only to protect the shared QUESTIONS

    task = []
    da = DialogueAct()
    # indicate that we're looking for connection
    da.append(DialogueActItem('inform', 'task', 'find_connection'))

    # get two distinct stops
    from_stop = random.choice(STOPS)
    to_stop = from_stop
    while to_stop == from_stop:
        to_stop = random.choice(STOPS)
    da.append(DialogueActItem('inform', 'from_stop', from_stop))
    da.append(DialogueActItem('inform', 'to_stop', to_stop))
    task.append(da)

    # generate random subsequent questions
    questions = random.sample(range(6), random.randint(5, 6) - len(task))

    query_change = False
    da = DialogueAct()
    for question in sorted(questions):
        # work on a private copy: the value may be rewritten below
        dais = copy.deepcopy(QUESTIONS[question])

        if dais[0].name in ['alternative', 'vehicle', 'time', 'to_stop'
                            ] and not query_change:
            # first query-changing question: close the current DA
            query_change = True
            task.append(da)
            da = DialogueAct()

        if dais[0].name == 'to_stop':
            new_to_stop = random.choice(STOPS)
            while new_to_stop == from_stop or new_to_stop == to_stop:
                new_to_stop = random.choice(STOPS)
            dais[0].value = new_to_stop

        da.extend(dais)

    task.append(da)
    return task
def get_da(self, dialogue_state):
    """Choose the next system dialogue act for the given dialogue state.

    The first matching rule wins:
      1. no system act produced yet -> hello()&thankyou()
      2. slot "ludait" is "bye" -> bye()
      3. "ludait" is "restart" -> the state is restarted, restart()&hello()
      4. "ludait" is "repeat" -> the previous system act is reused
      5. "ludait" is "reqalts" -> deny(alternatives=true)
      6. slots are being requested -> inform about each of them
      7. slots are being confirmed -> affirm/negate each of them
      8. slots were provided but not yet implicitly confirmed -> affirm and
         inform about them
      9. otherwise -> reqmore()

    The handled "ludait", "rh_<slot>" and "ch_<slot>" entries of
    `dialogue_state.slots` are reset to "none". The chosen act is stored in
    `self.last_system_dialogue_act`, appended to `self.das` and returned.
    """
    # all slots being requested by the user
    requested_slots = dialogue_state.get_slots_being_requested()
    # all slots being confirmed by the user
    confirmed_slots = dialogue_state.get_slots_being_confirmed()
    # all slots which had been supplied by the user but have not been
    # implicitly confirmed
    non_informed_slots = dialogue_state.get_slots_being_noninformed()

    if len(self.das) == 0:
        # NLG("Thank you for calling. How may I help you?")
        self.last_system_dialogue_act = DialogueAct("hello()&thankyou()")
        dialogue_state.slots["ludait"] = "none"
    elif dialogue_state.slots["ludait"] == "bye":
        # NLG("Goodbye.")
        self.last_system_dialogue_act = DialogueAct("bye()")
        dialogue_state.slots["ludait"] = "none"
    elif dialogue_state.slots["ludait"] == "restart":
        # NLG("Let's start again from scratch. How may I help you?")
        dialogue_state.restart()
        self.last_system_dialogue_act = DialogueAct("restart()&hello()")
        dialogue_state.slots["ludait"] = "none"
    elif dialogue_state.slots["ludait"] == "repeat":
        # NLG - use the last dialogue act
        dialogue_state.slots["ludait"] = "none"
    elif dialogue_state.slots["ludait"] == "reqalts":
        # NLG("There is nothing else in the database.")
        # the closing parenthesis was missing in the dialogue act string
        self.last_system_dialogue_act = DialogueAct(
            "deny(alternatives=true)")
        dialogue_state.slots["ludait"] = "none"
    elif requested_slots:
        # inform about all requested slots
        self.last_system_dialogue_act = DialogueAct()
        for slot in requested_slots:
            dai = DialogueActItem("inform", slot, requested_slots[slot])
            self.last_system_dialogue_act.append(dai)
            dialogue_state.slots["rh_" + slot] = "none"
    elif confirmed_slots:
        # inform about all slots being confirmed by the user
        self.last_system_dialogue_act = DialogueAct()
        for slot in confirmed_slots:
            if confirmed_slots[slot] == dialogue_state.slots[slot]:
                # it is as user expected
                self.last_system_dialogue_act.append(
                    DialogueActItem("affirm"))
                dai = DialogueActItem("inform", slot,
                                      dialogue_state.slots[slot])
                self.last_system_dialogue_act.append(dai)
            else:
                # it is something else to what user expected
                self.last_system_dialogue_act.append(
                    DialogueActItem("negate"))
                dai = DialogueActItem("deny", slot,
                                      dialogue_state.slots["ch_" + slot])
                self.last_system_dialogue_act.append(dai)
                dai = DialogueActItem("inform", slot,
                                      dialogue_state.slots[slot])
                self.last_system_dialogue_act.append(dai)

            # the confirmation request for this slot has been handled
            dialogue_state.slots["ch_" + slot] = "none"
    elif non_informed_slots:
        # implicitly confirm all slots provided but not yet implicitly
        # confirmed
        self.last_system_dialogue_act = DialogueAct()
        self.last_system_dialogue_act.append(DialogueActItem("affirm"))
        for slot in non_informed_slots:
            dai = DialogueActItem("inform", slot, non_informed_slots[slot])
            self.last_system_dialogue_act.append(dai)
    else:
        # NLG("Can I help you with anything else?")
        self.last_system_dialogue_act = DialogueAct("reqmore()")
        dialogue_state.slots["ludait"] = "none"

    # record the system dialogue acts
    self.das.append(self.last_system_dialogue_act)
    return self.last_system_dialogue_act
def epilogue_final_question(self):
    """Say the configured final question.

    Wraps the text from cfg['DM']['epilogue']['final_question'] in a `say`
    dialogue act, records it with the session logger as a system act and
    sends it through `self.commands` as a DMDA message ('DM', 'HUB').
    """
    question_text = self.cfg['DM']['epilogue']['final_question']
    say_da = DialogueAct('say(text="{text}")'.format(text=question_text))

    session_logger = self.cfg['Logging']['session_logger']
    session_logger.dialogue_act("system", say_da)

    message = DMDA(say_da, 'DM', 'HUB')
    self.commands.send(message)
def epilogue_final_apology(self):
    """Say the apology used when the minimum number of turns was not reached.

    The text comes from
    cfg['DM']['epilogue']['final_code_text_min_turn_count_not_reached']. It is
    wrapped in a `say` dialogue act, recorded with the session logger as a
    system act and sent through `self.commands` as a DMDA message
    ('DM', 'HUB').
    """
    epilogue_cfg = self.cfg['DM']['epilogue']
    apology_da = DialogueAct('say(text="{text}")'.format(
        text=epilogue_cfg['final_code_text_min_turn_count_not_reached']))

    self.cfg['Logging']['session_logger'].dialogue_act("system", apology_da)
    self.commands.send(DMDA(apology_da, 'DM', 'HUB'))
def _init_training(self, das, trees, data_portion):
    """Prepare everything needed before training starts.

    Takes over (or loads) the training DAs and trees, cuts them down to the
    requested portion, builds the input and output feature representations,
    converts the training data accordingly and sets up the classification
    neural network including its variables.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # DAs: load them from a file unless a list was passed in directly
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)

    # trees: a list is used as it is (flattened first in the token-based
    # modes); anything else is treated as a file name and read according to
    # the current mode
    if isinstance(trees, list):
        if self.mode in ['tokens', 'tagged_lemmas']:
            with_tags = self.mode == 'tagged_lemmas'
            trees = self._tokens_to_flat_trees(trees, use_tags=with_tags)
    else:
        log_info('Reading t-trees from ' + trees + '...')
        doc = read_ttrees(trees)
        if self.mode == 'tokens':
            trees = self._tokens_to_flat_trees(
                tokens_from_doc(doc, self.language, self.selector))
        elif self.mode == 'tagged_lemmas':
            trees = self._tokens_to_flat_trees(
                tagged_lemmas_from_doc(doc, self.language, self.selector),
                use_tags=True)
        else:
            trees = trees_from_doc(doc, self.language, self.selector)

    # keep only the requested portion of the data
    num_instances = int(round(data_portion * len(trees)))
    self.train_trees = trees[:num_instances]
    self.train_das = das[:num_instances]

    # DAs may come as (context, DA) pairs -- the contexts are dropped
    if isinstance(self.train_das[0], tuple):
        self.train_das = [pair_da for (_, pair_da) in self.train_das]

    # one extra instance pairing an empty tree with an empty DA
    # (i.e. forbid the network to keep any of its outputs "always-on")
    self.train_trees.append(TreeData())
    blank_da = DialogueAct()
    blank_da.parse('inform()')
    self.train_das.append(blank_da)
    num_instances += 1

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % num_instances)

    # input side: embeddings if configured, feature vectors otherwise
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        embedded = [self.tree_embs.get_embeddings(tree)
                    for tree in self.train_trees]
        self.X = np.array(embedded)
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        tree_feat_dicts = [self.tree_feats.get_features(tree, {})
                           for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(tree_feat_dicts)

    # output side: features extracted from the DAs
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    da_feat_dicts = [self.da_feats.get_features(None, {'da': da})
                     for da in self.train_das]
    self.y = self.da_vect.fit_transform(da_feat_dicts)

    # network input/output dimensions
    if self.tree_embs:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    else:
        self.input_shape = list(self.X[0].shape)
    self.num_outputs = len(self.da_vect.get_feature_names())

    # build the classifier, then initialize its variables
    self._init_neural_network()
    self.session.run(tf.initialize_all_variables())
def _init_training(self, das, trees, data_portion):
    """Initialize training.

    Store input data, initialize 1-hot feature representations for input and output and
    transform training data accordingly, initialize the classification neural network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    # (the file names are the `das` / `trees` parameters themselves; the
    # previously used names `das_file` / `ttree_file` are not defined here)
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.use_tokens:
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.use_tokens:
        # a list passed in directly is flattened in token mode
        trees = self._tokens_to_flat_trees(trees)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    if isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DialogueAct()
    empty_da.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree)
                           for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {})
                  for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da})
              for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.initialize_all_variables())
def process_pending_commands(self):
    """Process all pending commands.

    Available commands:
      stop() - stop processing and exit the process
      flush() - flush input buffers.
        Now it only flushes the input connection.

    The code below additionally reacts to the commands named
    'print_log_dir', 'new_dialogue', 'end_dialogue' and 'timeout'.

    Return True if the process should terminate.
    """

    # handle commands for as long as the command connection has data waiting
    while self.commands.poll():
        command = self.commands.recv()

        if self.cfg['DM']['debug']:
            self.cfg['Logging']['system_logger'].debug(command)

        # objects that are not Command instances are ignored
        if isinstance(command, Command):
            #Thanh:
            if command.parsed['__name__'] == 'print_log_dir':
                print '===***===session-log-dir:', command.source

            if command.parsed['__name__'] == 'stop':
                return True

            if command.parsed['__name__'] == 'flush':
                # discard all data in in input buffers
                while self.slu_hypotheses_in.poll():
                    data_in = self.slu_hypotheses_in.recv()

                self.dm.end_dialogue()

                # acknowledge the flush
                self.commands.send(Command("flushed()", 'DM', 'HUB'))

                return False

            #if command.parsed['__name__'] == 'prepare_new_dialogue':
                #self.dm.new_dialogue()

            if command.parsed['__name__'] == 'new_dialogue':
                self.dm.new_dialogue()#thanh change???
                self.epilogue_state = None

                self.cfg['Logging']['session_logger'].turn("system")
                self.dm.log_state()

                # I should generate the first DM output
                da = self.dm.da_out()

                if self.cfg['DM']['debug']:
                    s = []
                    s.append("DM Output")
                    s.append("-"*60)
                    s.append(unicode(da))
                    s.append("")
                    s = '\n'.join(s)
                    self.cfg['Logging']['system_logger'].debug(s)

                self.cfg['Logging']['session_logger'].dialogue_act("system", da)

                self.commands.send(DMDA(da, 'DM', 'HUB'))

                return False

            if command.parsed['__name__'] == 'end_dialogue':
                self.dm.end_dialogue()
                return False

            if command.parsed['__name__'] == 'timeout':
                # check whether there is a looong silence
                # if yes then inform the DM

                silence_time = command.parsed['silence_time']

                # pass the silence to the DM as a silence(time=...) item
                # added to a confusion network with weight 1.0
                cn = DialogueActConfusionNetwork()
                cn.add(1.0, DialogueActItem('silence','time', silence_time))

                # process the input DA
                self.dm.da_in(cn)

                self.cfg['Logging']['session_logger'].turn("system")
                self.dm.log_state()

                print '----Time out: ', self.epilogue_state, silence_time
                # NOTE: the following bare string literal holds disabled code;
                # as an expression statement it has no effect
                '''Thanh
                if self.epilogue_state == 'give_code': # an cant_apply act have been chosen
                    self.cfg['Logging']['session_logger'].dialogue_act("system", self.epilogue_da)
                    self.commands.send(DMDA(self.epilogue_da, 'DM', 'HUB'))
                    self.commands.send(Command('hangup()', 'DM', 'HUB'))
                    return False
                #'''
                # an epilogue is in progress and the silence exceeded 5.0
                if self.epilogue_state and float(silence_time) > 5.0:
                    if self.epilogue_state == 'final_question': # and self.final_question_repeated<16:
                        # ask again with a fixed text
                        da = DialogueAct('say(text="{text}")'.format(text="Sorry, did you get the correct information?"))
                        #self.final_question_repeated += 1
                        self.cfg['Logging']['session_logger'].dialogue_act("system", da)
                        self.commands.send(DMDA(da, 'DM', 'HUB'))
                    else:
                        # a user was silent for too long, therefore hung up
                        self.cfg['Logging']['session_logger'].dialogue_act("system", self.epilogue_da)
                        self.commands.send(DMDA(self.epilogue_da, 'DM', 'HUB'))
                        self.commands.send(Command('hangup()', 'DM', 'HUB'))
                else:
                    # otherwise let the DM produce its regular output
                    da = self.dm.da_out()

                    if self.cfg['DM']['debug']:
                        s = []
                        s.append("DM Output")
                        s.append("-"*60)
                        s.append(unicode(da))
                        s.append("")
                        s = '\n'.join(s)
                        self.cfg['Logging']['system_logger'].debug(s)

                    self.cfg['Logging']['session_logger'].dialogue_act("system", da)
                    self.commands.send(DMDA(da, 'DM', 'HUB'))

                    # a DM output containing a "bye" act also triggers a hangup
                    if da.has_dat("bye"):
                        self.commands.send(Command('hangup()', 'DM', 'HUB'))

                return False

    return False