def nextAction(self, beliefstate):
    '''
    Select the next summary action.

    :param beliefstate: current belief state
    :returns: (masterAct, nextaIdex) master action and summary action index
    '''
    if self.architecture != 'dip2':
        beliefVec = flatten_belief(beliefstate, self.domainUtil)
    else:
        dip_state = DIP_state(
            beliefstate.domainStates[beliefstate.currentdomain],
            self.domainString)
    execMask = self.summaryaction.getExecutableMask(
        beliefstate, self.lastSystemAction)

    if self.exploration_type == 'e-greedy':
        # epsilon greedy: with probability epsilon, pick a random
        # admissible action (mask value 0.0 means admissible)
        if self.is_training and utils.Settings.random.rand() < self.epsilon:
            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
            random.shuffle(admissible)
            nextaIdex = admissible[0]
        else:
            if self.architecture != 'dip' and self.architecture != 'dip2':
                action_Q = self.dqn.predict(
                    np.reshape(beliefVec, (1, len(beliefVec))))
                # adding the mask pushes inadmissible actions to -inf
                admissible = np.add(action_Q, np.array(execMask))
                logger.info('action Q...')
                nextaIdex = np.argmax(admissible)
                # record current max Q for the episode average
                self.episode_ave_max_q.append(np.max(admissible))
            elif self.architecture == 'dip2':
                # score each action with the belief vector of the slot it
                # mentions, falling back to the 'general' belief vector
                admissible = []
                for idx, v in enumerate(execMask):
                    action_name = self.actions.action_names[idx]
                    act_slot = 'general'
                    for slot in dip_state.slots:
                        if slot in action_name:
                            act_slot = slot
                    beliefVec = dip_state.get_beliefStateVec(act_slot)
                    action_Q = self.dqn.predict(
                        np.reshape(beliefVec, (1, len(beliefVec))))
                    if v == 0:
                        admissible.append(action_Q[0][idx])
                    else:
                        admissible.append(v)
                nextaIdex = np.argmax(admissible)
                self.episode_ave_max_q.append(np.max(admissible))
            else:
                # dip: query Q separately for each admissible action,
                # encoded as a one-hot action vector
                admissible = []
                for idx, v in enumerate(execMask):
                    if v > -sys.maxint:
                        Action_idx = np.eye(self.action_dim,
                                            self.action_dim)[[idx]]
                        Qidx = self.dqn.predict_dip(
                            np.reshape(beliefVec, (1, len(beliefVec))),
                            Action_idx)
                        admissible.append(Qidx[0])
                    else:
                        admissible.append(-sys.maxint)
                logger.info('action Q...')
                nextaIdex = np.argmax(admissible)
                # record current max Q for the episode average
                self.episode_ave_max_q.append(np.max(admissible))

    elif self.exploration_type == 'Boltzman':
        # softmax exploration; self.epsilon serves as the temperature
        if not self.is_training:
            self.epsilon = 0.001
        action_Q = self.dqn.predict(
            np.reshape(beliefVec, (1, len(beliefVec))))
        # force Q of inadmissible actions to -inf before the softmax
        action_Q_admissible = np.add(action_Q, np.array(execMask))
        action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
        logger.info('action Q...')
        logger.info('action prob...')
        # sample an action from the softmax distribution, then recover
        # the index of the sampled probability
        sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
        nextaIdex = np.argmax(action_prob[0] == sampled_prob)

    self.stats[nextaIdex] += 1
    summaryAct = self.action_names[nextaIdex]
    beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
    masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                           self.lastSystemAction)
    return masterAct, nextaIdex
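
# A minimal, self-contained sketch of the two exploration rules used above,
# with hypothetical q_values/exec_mask inputs (q_values is a 1-D array here;
# exec_mask holds 0.0 for admissible actions and a large negative value for
# inadmissible ones). Illustration only; these names and signatures are
# assumptions, not part of the policy class.
import numpy as np

def masked_epsilon_greedy(q_values, exec_mask, epsilon, rng=np.random):
    '''With probability epsilon pick a random admissible action,
    otherwise take the argmax of the masked Q-values.'''
    if rng.rand() < epsilon:
        admissible = [i for i, m in enumerate(exec_mask) if m == 0.0]
        return int(rng.choice(admissible))
    return int(np.argmax(np.add(q_values, exec_mask)))

def masked_boltzmann(q_values, exec_mask, temperature, rng=np.random):
    '''Sample an action from a softmax over the masked Q-values;
    the mask drives inadmissible actions' probabilities to ~0.'''
    scores = np.add(q_values, exec_mask) / temperature
    exp_s = np.exp(scores - np.max(scores))  # numerically stable softmax
    probs = exp_s / exp_s.sum()
    return int(rng.choice(len(probs), p=probs))
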
def nextAction(self, beliefstate):
    '''
    Select the next summary action.

    :param beliefstate: current belief state
    :returns: (masterAct, nextaIdex) master action and summary action index
    '''
    beliefVec = flatten_belief(beliefstate, self.domainUtil)
    execMask = self.summaryaction.getExecutableMask(
        beliefstate, self.lastSystemAction)

    if self.exploration_type == 'e-greedy':
        action_prob = self.enac.predict_policy(
            np.reshape(beliefVec, (1, len(beliefVec))))
        # adding the mask pushes inadmissible actions to -inf
        admissible = np.add(action_prob, np.array(execMask))
        greedyNextaIdex = np.argmax(admissible)

        # epsilon greedy
        if self.is_training and utils.Settings.random.rand() < self.epsilon:
            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
            random.shuffle(admissible)
            nextaIdex = admissible[0]

            # importance sampling: probability that the epsilon-greedy
            # behaviour policy chose this action
            if nextaIdex == greedyNextaIdex:
                self.mu_prob = self.epsilon / float(
                    self.action_dim) + 1 - self.epsilon
            else:
                self.mu_prob = self.epsilon / float(self.action_dim)
        else:
            nextaIdex = greedyNextaIdex
            # record current max for the episode average
            self.episode_ave_max_q.append(np.max(admissible))

            # importance sampling: the greedy action was taken
            self.mu_prob = self.epsilon / float(
                self.action_dim) + 1 - self.epsilon

    elif self.exploration_type == 'Boltzman':
        # softmax exploration; self.epsilon serves as the temperature
        if not self.is_training:
            self.epsilon = 0.001
        action_prob = self.enac.predict_policy(
            np.reshape(beliefVec, (1, len(beliefVec))))
        # force scores of inadmissible actions to -inf before the softmax
        action_Q_admissible = np.add(action_prob, np.array(execMask))
        action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
        logger.info('action Q...')
        logger.info('action prob...')
        # sample an action from the softmax distribution, then recover
        # the index of the sampled probability
        sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
        nextaIdex = np.argmax(action_prob[0] == sampled_prob)

    self.stats[nextaIdex] += 1
    summaryAct = self.summaryaction.action_names[nextaIdex]
    beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
    masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                           self.lastSystemAction)
    return masterAct, nextaIdex
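
# A minimal sketch of the behaviour-policy probability stored in
# self.mu_prob above: under epsilon-greedy over action_dim actions, the
# greedy action is taken with probability 1 - epsilon + epsilon/action_dim
# and every other action with probability epsilon/action_dim. Hypothetical
# helper for illustration only, not part of the policy class.
def behaviour_policy_prob(chosen_idx, greedy_idx, epsilon, action_dim):
    '''Probability of an epsilon-greedy behaviour policy picking chosen_idx.'''
    uniform = epsilon / float(action_dim)
    if chosen_idx == greedy_idx:
        return uniform + (1.0 - epsilon)
    return uniform
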