def convertDIPStateAction(self, state, action):
    '''
    Converts a state/action pair into the flat DIP belief vector used by the
    network, together with the unchanged action index. Terminal states are
    mapped to a zero vector of the expected DIP feature length.
    '''
    if isinstance(state, TerminalState):
        return [0] * 89, action
    else:
        dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
        action_name = self.actions.action_names[action]
        # Route the action to the slot it names, or to the slot-independent
        # 'general' features if it names none.
        act_slot = 'general'
        for slot in dip_state.slots:
            if slot in action_name:
                act_slot = slot
        flat_belief = dip_state.get_beliefStateVec(act_slot)
        self.prev_state_check = flat_belief
        return flat_belief, action
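# The slot routing above is plain substring matching between the action name
# and the slot names. A minimal standalone sketch of that pattern (the names
# and example data below are illustrative, not taken from the class):

def _match_act_slot(action_name, slots):
    # Return the first slot whose name appears in the action name, falling
    # back to the slot-independent 'general' DIP features.
    for slot in slots:
        if slot in action_name:
            return slot
    return 'general'

# e.g. _match_act_slot('request_area', ['area', 'food']) -> 'area',
# while 'reqmore' names no slot, so it maps to 'general'.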
def nextAction(self, beliefstate):
    '''
    Select the next action.

    :param beliefstate: the current belief state
    :returns: (int) next summary action
    '''
    # Compute the main (master) belief, optionally augmented with inverse
    # action-frequency features.
    af = None
    if self.actfreq_ds:
        af = 1. / (1 + np.concatenate((self.si_freq, self.sd_freq)))
    if self.features == 'learned' or self.features == 'rnn':
        dipstate = padded_state(beliefstate, domainString=self.domainString, action_freq=af)
    else:
        dipstate = DIP_state(beliefstate, domainString=self.domainString, action_freq=af)
    dipstatevec = dipstate.get_beliefStateVec('general')

    # Make the high-level decision with the master policy.
    master_Q_values = self.master_policy.nextAction(dipstatevec)
    non_exec = self.summaryaction.getNonExecutable(
        beliefstate.domainStates[beliefstate.currentdomain], self.lastSystemAction)
    masks = get_feudal_masks(non_exec, dipstate.slots, self.slot_independent_actions,
                             self.slot_specific_actions)
    master_Q_values = np.add(master_Q_values, masks['master'])
    if self.is_training and self.correction_factor != 0:
        # Down-weight the Q-values of frequently chosen master actions.
        correction = 1 - self.master_freq / sum(self.master_freq)
        master_Q_values *= correction
    if self.sample_master and not self.is_training:
        probs = master_Q_values[:-1]
        probs[probs < 0] = 0  # clip negative Q-values before normalising
        probs /= sum(probs)
        master_decision = np.random.choice([0, 1], p=probs)
    else:
        master_decision = np.argmax(master_Q_values)
    # Avoid long runs of give_info decisions at evaluation time.
    if master_decision == 0 and self.gi_dec_inrow == 4 and self.correct_master and not self.is_training:
        master_decision = 1
    self.master_freq[master_decision] += 1
    if not self.is_training:
        self.master_dec_count[master_decision] += 1
        if np.sum(self.master_dec_count) % 1000 == 0:
            logger.results('master action frequencies = {}'.format(
                np.array(self.master_dec_count) / np.sum(self.master_dec_count)))  # TODO: change to debug
    self.prev_master_act = master_decision
    self.prev_master_belief = dipstatevec

    if master_decision == 0:
        self.gi_dec_inrow += 1.
        # Drop to the give_info (slot-independent) policy.
        self.prev_sub_policy = 0
        child_Q_values = self.give_info_policy.nextAction(dipstatevec)
        child_Q_values = np.add(child_Q_values, masks['give_info'])
        child_decision = np.argmax(child_Q_values)
        summaryAct = self.slot_independent_actions[child_decision]
        self.prev_child_act = child_decision
        self.prev_child_belief = dipstatevec
        self.si_freq[child_decision] += 1
    elif master_decision == 1:
        self.gi_dec_inrow = 0
        # Drop to the request_info (slot-specific) policy: score every slot
        # and keep the best (slot, action) pair.
        self.prev_sub_policy = 1
        slot_Qs = {}
        best_action = ('slot', 'action', -np.inf)
        for slot in dipstate.slots:
            dipstatevec = dipstate.get_beliefStateVec(slot)
            slot_Qs[slot] = self.request_info_policy.nextAction(dipstatevec)
            slot_Qs[slot] = np.add(slot_Qs[slot], masks['req_info'][slot])
            slot_max_Q = np.max(slot_Qs[slot])
            if slot_max_Q > best_action[2]:
                best_action = (slot, np.argmax(slot_Qs[slot]), slot_max_Q)
        summaryAct = self.slot_specific_actions[best_action[1]] + '_' + best_action[0]
        if 'reqmore' in summaryAct:
            summaryAct = 'reqmore'
        self.prev_child_act = best_action[1]
        self.prev_child_belief = dipstate.get_beliefStateVec(best_action[0])
        self.sd_freq[best_action[1]] += 1

    self.action_freq[self.actions.action_names.index(summaryAct)] += 1
    beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
    masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction)
    nextaIdex = self.full_action_list.index(summaryAct)
    return masterAct, nextaIdex
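# All three policies above share one masking convention: the masks hold 0.0
# for admissible actions and a very large negative value (effectively -inf)
# for the rest, so adding a mask to the Q-values before the argmax removes
# inadmissible actions from consideration. A minimal standalone sketch of
# that pattern (illustrative names only, not part of the class):

import numpy as np

def _masked_argmax(q_values, mask):
    # mask holds 0.0 for admissible actions and -np.inf for the rest.
    return int(np.argmax(np.add(q_values, mask)))

# e.g. _masked_argmax([0.2, 0.9, 0.5], [0.0, -np.inf, 0.0]) -> 2,
# since the higher-scoring action 1 is masked out.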
def nextAction(self, beliefstate):
    '''
    Select the next action.

    :param beliefstate: the current belief state
    :returns: (int) next summary action
    '''
    if self.architecture != 'dip2':
        beliefVec = flatten_belief(beliefstate, self.domainUtil)
    else:
        dip_state = DIP_state(beliefstate.domainStates[beliefstate.currentdomain],
                              self.domainString)
    execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction)

    if self.exploration_type == 'e-greedy':
        # epsilon-greedy: with probability epsilon pick a random admissible
        # action, otherwise take the argmax of the masked Q-values
        if self.is_training and utils.Settings.random.rand() < self.epsilon:
            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
            random.shuffle(admissible)
            nextaIdex = admissible[0]
        else:
            if self.architecture != 'dip' and self.architecture != 'dip2':
                action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))
                admissible = np.add(action_Q, np.array(execMask))
                logger.info('action Q...')
                nextaIdex = np.argmax(admissible)
                # track the current max Q in self.episode_ave_max_q
                self.episode_ave_max_q.append(np.max(admissible))
            elif self.architecture == 'dip2':
                # score each action with the DIP belief vector of the slot it
                # names ('general' if it names none)
                admissible = []
                for idx, v in enumerate(execMask):
                    action_name = self.actions.action_names[idx]
                    act_slot = 'general'
                    for slot in dip_state.slots:
                        if slot in action_name:
                            act_slot = slot
                    beliefVec = dip_state.get_beliefStateVec(act_slot)
                    action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))
                    if v == 0:
                        admissible.append(action_Q[0][idx])
                    else:
                        admissible.append(v)
                nextaIdex = np.argmax(admissible)
                self.episode_ave_max_q.append(np.max(admissible))
            else:  # 'dip' architecture
                # query the network once per admissible action with a
                # one-hot action encoding
                admissible = []
                for idx, v in enumerate(execMask):
                    if v > -sys.maxint:
                        Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
                        Qidx = self.dqn.predict_dip(
                            np.reshape(beliefVec, (1, len(beliefVec))), Action_idx)
                        admissible.append(Qidx[0])
                    else:
                        admissible.append(-sys.maxint)
                logger.info('action Q...')
                nextaIdex = np.argmax(admissible)
                self.episode_ave_max_q.append(np.max(admissible))
    elif self.exploration_type == 'Boltzman':
        # softmax (Boltzmann) exploration; self.epsilon serves as temperature
        if not self.is_training:
            self.epsilon = 0.001
        action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))
        # force the Q-values of inadmissible actions to -inf
        action_Q_admissible = np.add(action_Q, np.array(execMask))
        action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
        logger.info('action Q...')
        logger.info('action prob...')
        # sample an action index directly from the softmax distribution
        nextaIdex = np.random.choice(len(action_prob[0]), p=action_prob[0])
    self.stats[nextaIdex] += 1
    summaryAct = self.action_names[nextaIdex]
    beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
    masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction)
    return masterAct, nextaIdex
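# For reference, a self-contained sketch of the Boltzmann step used above
# (illustrative helper, not part of the class): mask the Q-values, divide by
# a temperature, take a numerically stable softmax, and sample an action
# index from the resulting distribution.

import numpy as np

def _boltzmann_sample(q_values, exec_mask, temperature):
    q = np.add(q_values, exec_mask)   # inadmissible actions -> very negative
    q = q - np.max(q)                 # stabilise the exponentials
    probs = np.exp(q / temperature)
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))

# e.g. _boltzmann_sample([0.2, 0.9], [0.0, 0.0], temperature=1.0) samples
# action 1 roughly twice as often as action 0.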