def initialize_episode(self):
    self.episode_count += 1
    if self.training and self.episode_count % self.batch_size == 0:
        self.num_updates += 1
        if self.num_updates > self.pol_start and self.num_updates % ANNEAL == 0:
            self.anneal_lr()
        if self.num_updates < self.pol_start:
            loss = self.update(regime='SL')
        else:
            loss = self.update(regime='RL')
        if self.num_updates % DISPF == 0:
            self._print_progress(loss)
        if self.num_updates % SAVEF == 0:
            self.save_model(dialog_config.MODEL_PATH + self._name)

    self.state = {}
    # pickle round-trip = fast deep copy of the database
    self.state['database'] = pkl.loads(pkl.dumps(self.database, -1))
    self.state['prevact'] = 'begin@begin'
    self.state['inform_slots'] = self._init_beliefs()
    self.state['turn'] = 0
    self.state['num_requests'] = {s: 0 for s in self.state['database'].slots}
    self.state['slot_tracker'] = set()
    self.state['dont_care'] = set()
    self.state['init_entropy'] = {}
    for s in dialog_config.inform_slots:
        s_p = self.state['inform_slots'][s] / self.state['inform_slots'][s].sum()
        self.state['init_entropy'][s] = tools.entropy_p(s_p)
    self.state['inputs'] = []
    self.state['actions'] = []
    self.state['rewards'] = []
    self.state['pol_state'] = np.zeros((1, self.n_hid)).astype('float32')
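# `tools.entropy_p` is used throughout but not shown in this file. A minimal
# sketch of what it presumably computes -- the Shannon entropy of an
# already-normalized probability vector (behavior inferred from usage, not
# confirmed; the real helper lives in the `tools` module):
def entropy_p(p):
    # H(p) = -sum_i p_i * log(p_i), with 0*log(0) treated as 0
    p = np.asarray(p, dtype='float64')
    nz = p > 0.
    return float(-(p[nz] * np.log(p[nz])).sum())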
def next(self, user_action, verbose=False):
    self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
    self.state['turn'] += 1

    db_status, db_index = self._check_db()
    N_db = len(db_index)
    H_slots = {}
    for s in dialog_config.inform_slots:
        s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
        H_slots[s] = tools.entropy_p(s_p)

    p_vector = np.zeros((self.in_size,)).astype('float32')
    if self.inputtype=='entropy':
        for i,s in enumerate(dialog_config.inform_slots):
            if s in H_slots:
                p_vector[i] = H_slots[s]
            p_vector[i+len(dialog_config.inform_slots)] = 1. if s in self.state['dont_care'] else 0.
        if self.state['turn']>1:
            pr_act = self.state['prevact'].split('@')
            act_id = dialog_config.inform_slots.index(pr_act[1])
            p_vector[2*len(dialog_config.inform_slots)+act_id] = 1.
        #p_vector[-1] = N_db/self.state['database'].N
        # one-hot encode small result-set sizes in the last six slots
        if N_db<=5:
            p_vector[N_db-6] = 1.
        else:
            p_vector[-1] = 1.
    else:
        p_slots = self._dict2vec(self.state['inform_slots'])
        p_vector[:p_slots.shape[0]] = p_slots
        if self.state['turn']>1:
            pr_act = self.state['prevact'].split('@')
            act_id = dialog_config.inform_slots.index(pr_act[1])
            p_vector[p_slots.shape[0]+act_id] = 1.
        db_i_vector = np.zeros((self.database.N,)).astype('float32')
        db_i_vector[db_index] = 1.
        p_vector[-self.database.N:] = db_i_vector
    p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
    p_vector = standardize(p_vector)

    if self.training and self.num_updates<self.pol_start:
        # act on policy but train on expert
        pp = np.zeros((len(dialog_config.inform_slots)+1,))
        for i,s in enumerate(dialog_config.inform_slots):
            pp[i] = H_slots[s]
        pp[-1] = N_db
        _, action = self._rule_act(pp, db_index)
        act, _, p_out = self._prob_act(p_vector, db_index, mode='sample')
    else:
        if self.training:
            act, action, p_out = self._prob_act(p_vector, db_index, mode='sample')
        else:
            act, action, p_out = self._prob_act(p_vector, db_index, mode='max')

    self.state['inputs'].append(p_vector[0,0,:])
    self.state['actions'].append(action)
    self.state['rewards'].append(user_action['reward'])
    self.state['pol_state'] = p_out

    act['posterior'] = np.zeros((len(self.database.labels),))
    if len(db_index)>0:
        act['posterior'][db_index] = 1./len(db_index)
    else:
        # fill uniformly over all entities, keeping the posterior a vector
        act['posterior'][:] = 1./len(self.database.labels)
    return act
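# `standardize` is applied to every policy input above but defined elsewhere
# in the repo. A plausible minimal version -- zero-mean, unit-variance scaling
# over the trailing feature axis (the exact definition is an assumption):
def standardize(x, eps=1e-8):
    # x has shape (1, 1, in_size); normalize the feature vector in place-free style
    mu = x.mean(axis=-1, keepdims=True)
    sd = x.std(axis=-1, keepdims=True)
    return (x - mu) / (sd + eps)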
def next(self, user_action, verbose=False):
    self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
    self.state['turn'] += 1

    db_probs = self._check_db()
    H_db = tools.entropy_p(db_probs)
    H_slots = calc_entropies(self.state['inform_slots'], db_probs, self.state['database'])

    p_vector = np.zeros((self.in_size,)).astype('float32')
    if self.input_type == 'entropy':
        for i, s in enumerate(dialog_config.inform_slots):
            if s in H_slots:
                p_vector[i] = H_slots[s]
            p_vector[i+len(dialog_config.inform_slots)] = 1. if s in self.state['dont_care'] else 0.
        if self.state['turn'] > 1:
            pr_act = self.state['prevact'].split('@')
            act_id = dialog_config.inform_slots.index(pr_act[1])
            p_vector[2*len(dialog_config.inform_slots)+act_id] = 1.
        p_vector[-1] = H_db
    else:
        p_slots = self._dict2vec(self.state['inform_slots'])
        p_vector[:p_slots.shape[0]] = p_slots
        if self.state['turn'] > 1:
            pr_act = self.state['prevact'].split('@')
            act_id = dialog_config.inform_slots.index(pr_act[1])
            p_vector[p_slots.shape[0]+act_id] = 1.
        p_vector[-self.database.N:] = db_probs
    p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
    p_vector = standardize(p_vector)

    if self.training and self.num_updates < self.pol_start:
        # act on policy but train on expert
        pp = np.zeros((len(dialog_config.inform_slots)+1,))
        for i, s in enumerate(dialog_config.inform_slots):
            pp[i] = H_slots[s]
        pp[-1] = H_db
        _, action = self._rule_act(pp, db_probs)
        act, _, p_out = self._prob_act(p_vector, db_probs, mode='sample')
    else:
        if self.training:
            act, action, p_out = self._prob_act(p_vector, db_probs, mode='sample')
        else:
            act, action, p_out = self._prob_act(p_vector, db_probs, mode='max')

    self.state['inputs'].append(p_vector[0,0,:])
    self.state['actions'].append(action)
    self.state['rewards'].append(user_action['reward'])
    self.state['pol_state'] = p_out

    act['posterior'] = db_probs
    return act
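# The non-entropy branch above relies on `_dict2vec` to flatten the per-slot
# belief distributions into one feature vector. A hypothetical sketch (the
# real method lives on the agent; per-slot normalization is an assumption):
def _dict2vec(self, inform_slots):
    vecs = [inform_slots[s] / inform_slots[s].sum()   # normalize each slot belief
            for s in dialog_config.inform_slots]      # fixed slot order
    return np.concatenate(vecs).astype('float32')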
def next(self, user_action, verbose=False):
    self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
    self.state['turn'] += 1

    act = {}
    act['diaact'] = 'UNK'
    act['request_slots'] = {}
    act['target'] = []

    db_status, db_index = self._check_db()
    H_slots = {}
    for s in dialog_config.inform_slots:
        s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
        H_slots[s] = tools.entropy_p(s_p)
    sorted_entropies = sorted(H_slots.items(), key=operator.itemgetter(1), reverse=True)
    if verbose:
        print 'Agent slot belief entropies - '
        print ' '.join(['%s:%.2f' %(k,v) for k,v in H_slots.iteritems()])

    if not db_status:
        # no match, some error, re-ask some slot
        act['diaact'] = 'request'
        request_slot = random.choice(self.state['inform_slots'].keys())
        act['request_slots'][request_slot] = 'UNK'
        self.state['prevact'] = 'request@%s' %request_slot
        self.state['num_requests'][request_slot] += 1
    elif len(db_status)==1:
        act['diaact'] = 'inform'
        act['target'] = self._inform(db_index)
        self.state['prevact'] = 'inform@inform'
    else:
        req = False
        for (s,h) in sorted_entropies:
            if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                    self.state['num_requests'][s] >= self.max_req:
                continue
            act['diaact'] = 'request'
            act['request_slots'][s] = 'UNK'
            self.state['prevact'] = 'request@%s' %s
            self.state['num_requests'][s] += 1
            req = True
            break
        if not req:
            # agent confident about all slots, inform
            act['diaact'] = 'inform'
            act['target'] = self._inform(db_index)
            self.state['prevact'] = 'inform@inform'

    act['posterior'] = np.zeros((len(self.database.labels),))
    act['posterior'][db_index] = 1./len(db_index)
    return act
def next(self, user_action, verbose=False):
    self.state['turn'] += 1

    p_vector = np.zeros((self.in_size,)).astype('float32')
    p_vector[:self.feat_extractor.n] = self.feat_extractor.featurize(user_action['nl_sentence'])
    if self.state['turn']>1:
        pr_act = self.state['prevact'].split('@')
        assert pr_act[0]!='inform', 'Agent called after informing!'
        act_id = dialog_config.inform_slots.index(pr_act[1])
        p_vector[self.feat_extractor.n+act_id] = 1
    p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
    p_vector = standardize(p_vector)

    p_targets = []
    phi_targets = []
    if self.training and self.num_updates<self.pol_start:
        self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
        db_probs = self._check_db()
        H_db = tools.entropy_p(db_probs)
        H_slots = calc_entropies(self.state['inform_slots'], db_probs, self.state['database'])
        # act on policy but train on expert
        pp = np.zeros((len(dialog_config.inform_slots)+1,))
        for i,s in enumerate(dialog_config.inform_slots):
            pp[i] = H_slots[s]
        pp[-1] = H_db
        pp = np.expand_dims(np.expand_dims(pp, axis=0), axis=0)
        _, action = self._rule_act(pp, db_probs)
        act, _, p_out, hid_out, p_db = self._prob_act(p_vector, mode='sample')
        for s in dialog_config.inform_slots:
            p_s = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
            p_targets.append(p_s)
            if s in self.state['dont_care']:
                phi_targets.append(np.ones((1,)).astype('float32'))
            else:
                phi_targets.append(np.zeros((1,)).astype('float32'))
    else:
        if self.training:
            act, action, p_out, hid_out, db_probs = self._prob_act(p_vector, mode='sample')
        else:
            act, action, p_out, hid_out, db_probs = self._prob_act(p_vector, mode='max')

    self._state_update(act, p_vector, action, user_action['reward'], p_out, hid_out,
                       p_targets, phi_targets)
    act['posterior'] = db_probs
    return act
def calc_entropies(state, q, db):
    entropies = {}
    for s,c in state.iteritems():
        if s not in db.slots:
            entropies[s] = 0.
        else:
            p = (db.ids[s]*q).sum(axis=1)
            u = db.priors[s]*q[db.unks[s]].sum()
            c_tilde = p+u
            c_tilde = c_tilde/c_tilde.sum()
            entropies[s] = tools.entropy_p(c_tilde)
    return entropies
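# A toy usage of calc_entropies, with a mock database whose field names follow
# the function above (the concrete shapes are assumptions inferred from the
# indexing, and `tools.entropy_p` is the helper sketched earlier). Each slot
# value's probability is the mass of rows carrying that value, plus the mass
# of rows with a missing value redistributed through the slot's prior:
class _ToyDB(object):
    slots = ['cuisine']
    ids = {'cuisine': np.array([[1., 0., 0.],    # row 0 has value 0
                                [0., 1., 0.]])}  # row 1 has value 1; row 2 unknown
    unks = {'cuisine': np.array([2])}            # rows where 'cuisine' is missing
    priors = {'cuisine': np.array([0.5, 0.5])}   # prior over the two slot values

q = np.array([0.5, 0.3, 0.2])                    # posterior over the 3 rows
state = {'cuisine': None}                        # only the keys are used here
print(calc_entropies(state, q, _ToyDB()))        # -> {'cuisine': ~0.67 nats}, from p=[0.6, 0.4]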
def next(self, user_action, verbose=False):
    self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
    self.state['turn'] += 1

    act = {}
    act['diaact'] = 'UNK'
    act['request_slots'] = {}
    act['target'] = []

    db_probs = self._check_db()
    H_slots = {}
    for s in dialog_config.inform_slots:
        s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
        H_slots[s] = tools.entropy_p(s_p)
    if verbose:
        print 'Agent slot belief entropies - '
        print ' '.join(['%s:%.2f' %(k,v) for k,v in H_slots.iteritems()])
    sorted_entropies = sorted(H_slots.items(), key=operator.itemgetter(1), reverse=True)

    req = False
    for (s,h) in sorted_entropies:
        if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                self.state['num_requests'][s] >= self.max_req:
            continue
        act['diaact'] = 'request'
        act['request_slots'][s] = 'UNK'
        self.state['prevact'] = 'request@%s' %s
        self.state['num_requests'][s] += 1
        req = True
        break
    if not req:
        # agent confident about all slots, inform
        act['diaact'] = 'inform'
        act['target'] = self._inform(db_probs)
        self.state['prevact'] = 'inform@inform'

    act['probs'] = [np.concatenate([self.state['inform_slots'][s]/self.state['inform_slots'][s].sum(),
                                    np.asarray([float(self.state['database'].inv_counts[s][-1]) /
                                                self.state['database'].N])])
                    for s in dialog_config.inform_slots]
    act['phis'] = [1. if s in self.state['dont_care'] else 0. for s in dialog_config.inform_slots]
    act['posterior'] = db_probs
    return act
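# `_inform` above returns the agent's ranked guess over database rows. A
# plausible sketch for the soft-KB variant (an assumption -- the real method
# may break ties randomly or truncate the ranking):
def _inform(self, db_probs):
    # entity indices ordered by decreasing posterior probability
    return np.argsort(-np.asarray(db_probs)).tolist()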
def initialize_episode(self):
    self.state = {}
    self.state['database'] = pkl.loads(pkl.dumps(self.database,-1))
    self.state['prevact'] = 'begin@begin'
    self.state['inform_slots'] = self._init_beliefs()
    self.state['turn'] = 0
    self.state['init_entropy'] = {}
    for s in dialog_config.inform_slots:
        s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
        self.state['init_entropy'][s] = tools.entropy_p(s_p)
    self.state['num_requests'] = {s: 0 for s in self.state['inform_slots'].keys()}
    self.state['slot_tracker'] = set()
    self.state['dont_care'] = set()
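# `_init_beliefs` seeds the per-slot beliefs whose initial entropies are
# cached above. A hypothetical version, assuming the database exposes raw
# per-slot value counts (the attribute name `counts` is an assumption):
def _init_beliefs(self):
    # unnormalized counts; callers normalize with b / b.sum() as needed
    return {s: np.asarray(self.database.counts[s], dtype='float32').copy()
            for s in dialog_config.inform_slots}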
def next(self, user_action, verbose=False):
    self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
    self.state['turn'] += 1

    act = {}
    act['diaact'] = 'UNK'
    act['request_slots'] = {}
    act['target'] = []

    db_probs = self._check_db()
    H_db = tools.entropy_p(db_probs)
    H_slots = calc_entropies(self.state['inform_slots'], db_probs, self.state['database'])
    if verbose:
        print 'Agent DB entropy = ', H_db
        print 'Agent slot belief entropies - '
        print ' '.join(['%s:%.2f' %(k,v) for k,v in H_slots.iteritems()])

    if H_db < self.tr:
        # agent reasonably confident about the target entity, inform
        act['diaact'] = 'inform'
        act['target'] = self._inform(db_probs)
    else:
        sorted_entropies = sorted(H_slots.items(), key=operator.itemgetter(1), reverse=True)
        req = False
        for (s,h) in sorted_entropies:
            if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                    self.state['num_requests'][s] >= self.max_req:
                continue
            act['diaact'] = 'request'
            act['request_slots'][s] = 'UNK'
            self.state['prevact'] = 'request@%s' %s
            self.state['num_requests'][s] += 1
            req = True
            break
        if not req:
            # agent confident about all slots, inform
            act['diaact'] = 'inform'
            act['target'] = self._inform(db_probs)
            self.state['prevact'] = 'inform@inform'

    act['probs'] = [np.concatenate([self.state['inform_slots'][s]/self.state['inform_slots'][s].sum(),
                                    np.asarray([float(self.state['database'].inv_counts[s][-1]) /
                                                self.state['database'].N])])
                    for s in dialog_config.inform_slots]
    act['phis'] = [1. if s in self.state['dont_care'] else 0. for s in dialog_config.inform_slots]
    act['posterior'] = db_probs
    return act
def calc_entropies(state, q, db):
    '''
    Entropy computation used during SL -- note it differs from the RL variant!
    :param state: current belief state over slots
    :param q: table posterior, shape (N,)
    :param db: database
    :return: entropy of each slot
    '''
    entropies = {}
    for s,c in state.iteritems():
        if s not in db.slots:
            entropies[s] = 0.
        else:
            p = (db.ids[s]*q).sum(axis=1)
            u = db.priors[s]*q[db.unks[s]].sum()
            c_tilde = p+u
            c_tilde = c_tilde/c_tilde.sum()
            entropies[s] = tools.entropy_p(c_tilde)
    return entropies
def next(self, user_action, verbose=False):
    '''
    Get the next action based on rules.
    :param user_action: the new state after the user's input
    :param verbose: whether to print logs as the model runs (chatty mode)
    :return: the action dict carries extra fields, including diaact,
             request_slots, target, p and q, etc.
    '''
    self.state['turn'] += 1
    # TODO: after switching to embeddings this block must be rewritten entirely,
    # mainly the in_size variable; other surprises may come up.
    # TODO: results are much worse after switching to embeddings; should try more models.
    p_vector, seq_len = self.feat_extractor.featurize(user_action['nl_sentence'])
    p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
    p_vector = standardize(p_vector)
    # p_vector = np.zeros((self.in_size,)).astype('float32')  # (|Grams|+|Slots|,)
    # p_vector[:self.feat_extractor.n] = self.feat_extractor.featurize(user_action['nl_sentence'])
    # if self.state['turn']>1:
    #     pr_act = self.state['prevact'].split('@')
    #     assert pr_act[0]!='inform', 'Agent called after informing!'
    #     act_id = dialog_config.inform_slots.index(pr_act[1])
    #     p_vector[self.feat_extractor.n+act_id] = 1
    # p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)  # (1, 1, |Grams|+|Slots|)
    # p_vector = standardize(p_vector)

    p_targets = []
    phi_targets = []
    if self.training and self.num_updates < self.pol_start:
        self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
        db_probs = self._check_db()
        H_db = tools.entropy_p(db_probs)
        H_slots = calc_entropies(self.state['inform_slots'], db_probs, self.state['database'])
        # act on policy but train on expert
        pp = np.zeros((len(dialog_config.inform_slots)+1,))
        for i, s in enumerate(dialog_config.inform_slots):
            pp[i] = H_slots[s]
        pp[-1] = H_db
        pp = np.expand_dims(np.expand_dims(pp, axis=0), axis=0)  # (1, 1, |Slots|+1)
        _, action = self._rule_act(pp, db_probs)
        act, _, p_out, hid_out, p_db = self._prob_act(p_vector, mode='sample')
        for s in dialog_config.inform_slots:
            p_s = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
            p_targets.append(p_s)
            if s in self.state['dont_care']:
                phi_targets.append(np.ones((1,)).astype('float32'))
            else:
                phi_targets.append(np.zeros((1,)).astype('float32'))
    else:
        if self.training:
            act, action, p_out, hid_out, db_probs = self._prob_act(p_vector, mode='sample')
        else:
            act, action, p_out, hid_out, db_probs = self._prob_act(p_vector, mode='max')

    # TODO: seq_len is threaded through here; note where it comes from.
    self._state_update(act, p_vector, action, user_action['reward'], p_out, hid_out,
                       p_targets, phi_targets, seq_len)
    act['posterior'] = db_probs
    return act
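# The commented-out path above expects a fixed-length bag-of-n-grams
# featurizer exposing `n` output dimensions, in contrast to the embedding
# extractor that returns a padded sequence plus its length. A minimal sketch
# of that interface (vocabulary handling is an assumption):
class NGramFeaturizer(object):
    def __init__(self, vocab):
        self.grams = {g: i for i, g in enumerate(vocab)}  # n-gram -> index
        self.n = len(self.grams)

    def featurize(self, sentence):
        v = np.zeros((self.n,), dtype='float32')
        tokens = sentence.lower().split()
        # count unigrams and bigrams that appear in the vocabulary
        for g in tokens + [' '.join(b) for b in zip(tokens, tokens[1:])]:
            if g in self.grams:
                v[self.grams[g]] += 1.
        return v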