Example #1
0
    def convertDIPStateAction(self, state, action):
        '''
        Convert a belief state and a summary action into a DIP belief-state
        vector paired with the (unchanged) action index.
        '''
        if isinstance(state, TerminalState):
            # Terminal states carry no belief content, so return a zero
            # vector of the fixed DIP feature length.
            return [0] * 89, action

        else:
            dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
            action_name = self.actions.action_names[action]
            # Find which slot (if any) the action name refers to; fall back
            # to the domain-general feature set otherwise.
            act_slot = 'general'
            for slot in dip_state.slots:
                if slot in action_name:
                    act_slot = slot
            flat_belief = dip_state.get_beliefStateVec(act_slot)
            self.prev_state_check = flat_belief

            return flat_belief, action
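
The act-to-slot mapping above relies on a plain substring match between each slot name and the summary-action name, defaulting to the domain-general feature set. A minimal, self-contained sketch of that lookup (the function name and the slot/action strings below are illustrative, not taken from the source):

def extract_act_slot(action_name, slots):
    # Return the slot referenced by a summary-action name, or 'general' if
    # none matches. As in the loop above, the last matching slot wins.
    act_slot = 'general'
    for slot in slots:
        if slot in action_name:
            act_slot = slot
    return act_slot

# Illustrative usage with made-up slot and action names:
print(extract_act_slot('request_area', ['food', 'area', 'pricerange']))  # -> 'area'
print(extract_act_slot('reqmore', ['food', 'area']))                     # -> 'general'
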
Example #2
0
    def nextAction(self, beliefstate):
        '''
        Select the next action.

        :param beliefstate: current belief state
        :returns: the master act and the index of the chosen summary action
        '''
        # compute main belief
        af = None
        if self.actfreq_ds:
            #af = 1./(1 + self.action_freq)
            af = 1. / (1 + np.concatenate((self.si_freq, self.sd_freq)))
        if self.features == 'learned' or self.features == 'rnn':
            dipstate = padded_state(beliefstate,
                                    domainString=self.domainString,
                                    action_freq=af)
        else:
            dipstate = DIP_state(beliefstate,
                                 domainString=self.domainString,
                                 action_freq=af)
        dipstatevec = dipstate.get_beliefStateVec('general')
        # Make decision on main policy
        master_Q_values = self.master_policy.nextAction(dipstatevec)
        non_exec = self.summaryaction.getNonExecutable(
            beliefstate.domainStates[beliefstate.currentdomain],
            self.lastSystemAction)
        masks = get_feudal_masks(non_exec, dipstate.slots,
                                 self.slot_independent_actions,
                                 self.slot_specific_actions)
        master_Q_values = np.add(master_Q_values, masks['master'])
        if self.is_training and self.correction_factor != 0:
            correction = (1 - self.master_freq / sum(self.master_freq))
            master_Q_values *= correction
        if self.sample_master and not self.is_training:
            probs = master_Q_values[:-1]
            probs[probs < 0] = 0
            probs /= sum(probs)
            master_decision = np.random.choice([0, 1], p=probs)
            #print master_decision
        else:
            master_decision = np.argmax(master_Q_values)
        if master_decision == 0 and self.gi_dec_inrow == 4 and self.correct_master and not self.is_training:
            master_decision = 1
        self.master_freq[master_decision] += 1
        if not self.is_training:
            self.master_dec_count[master_decision] += 1
            if np.sum(self.master_dec_count) % 1000 == 0:
                logger.results('master action frequencies = {}'.format(
                    list(self.master_dec_count) /
                    np.sum(self.master_dec_count)))  #TODO: change to debug
        #print 'master Q:', master_Q_values, 'master decision:', master_decision
        self.prev_master_act = master_decision
        self.prev_master_belief = dipstatevec
        if master_decision == 0:
            self.gi_dec_inrow += 1.
            # drop to give_info policy
            self.prev_sub_policy = 0
            child_Q_values = self.give_info_policy.nextAction(dipstatevec)
            child_Q_values = np.add(child_Q_values, masks['give_info'])
            child_decision = np.argmax(child_Q_values)
            summaryAct = self.slot_independent_actions[child_decision]
            self.prev_child_act = child_decision
            self.prev_child_belief = dipstatevec
            #print 'give info Q:', child_Q_values, 'give info decision:', summaryAct
            self.si_freq[child_decision] += 1

        elif master_decision == 1:
            self.gi_dec_inrow = 0
            # drop to request_info policy
            self.prev_sub_policy = 1
            slot_Qs = {}
            best_action = ('slot', 'action', -np.inf)
            for slot in dipstate.slots:
                dipstatevec = dipstate.get_beliefStateVec(slot)
                slot_Qs[slot] = self.request_info_policy.nextAction(
                    dipstatevec)
                slot_Qs[slot] = np.add(slot_Qs[slot], masks['req_info'][slot])
                slot_max_Q = np.max(slot_Qs[slot])
                if slot_max_Q > best_action[2]:
                    best_action = (slot, np.argmax(slot_Qs[slot]), slot_max_Q)
            summaryAct = self.slot_specific_actions[
                best_action[1]] + '_' + best_action[0]
            if 'reqmore' in summaryAct:
                summaryAct = 'reqmore'
            self.prev_child_act = best_action[1]
            self.prev_child_belief = dipstate.get_beliefStateVec(
                best_action[0])
            self.sd_freq[best_action[1]] += 1
            #print 'req info Q:', [slot_Qs[s] for s in slot_Qs], 'req info decision:', summaryAct

        self.action_freq[self.actions.action_names.index(summaryAct)] += 1
        #print  1./(1+self.action_freq)
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        nextaIdex = self.full_action_list.index(summaryAct)
        return masterAct, nextaIdex
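
Two steps in the master-decision logic above are easy to misread: the training-time frequency correction that scales down over-used master decisions, and the test-time sampling variant that turns clipped Q-values into a categorical distribution over the two sub-policies. A small, self-contained numpy sketch of both steps with made-up values (the variable names mirror the code above, but none of the numbers come from the source):

import numpy as np

# Made-up Q-values for the master policy; the last entry is masked out (-inf).
master_Q = np.array([0.7, 0.4, -np.inf])
# Made-up counts of how often each master decision has been taken so far.
master_freq = np.array([30., 10., 0.])

# Training-time correction: decisions taken more often are scaled down.
corrected_Q = master_Q * (1 - master_freq / master_freq.sum())

# Test-time sampling variant (sample_master): drop the masked entry, clip
# negatives to zero, renormalise, and sample which sub-policy to use.
probs = corrected_Q[:-1].copy()
probs[probs < 0] = 0
probs /= probs.sum()
master_decision = np.random.choice([0, 1], p=probs)

In the code above these two branches are exclusive: the correction only applies while training and the sampling path only while evaluating with sample_master set; they are combined here purely for illustration.
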
Example #3
0
    def nextAction(self, beliefstate):
        '''
        Select the next action.

        :param beliefstate: current belief state
        :returns: the master act and the index of the chosen summary action
        '''
        if self.architecture != 'dip2':
            beliefVec = flatten_belief(beliefstate, self.domainUtil)
        else:
            dip_state = DIP_state(
                beliefstate.domainStates[beliefstate.currentdomain],
                self.domainString)
        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)

        if self.exploration_type == 'e-greedy':
            # epsilon greedy
            if self.is_training and utils.Settings.random.rand() < self.epsilon:
                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
                random.shuffle(admissible)
                nextaIdex = admissible[0]
            else:
                if self.architecture != 'dip' and self.architecture != 'dip2':
                    action_Q = self.dqn.predict(
                        np.reshape(
                            beliefVec,
                            (1, len(beliefVec))))  # + (1. / (1. + i + j))
                    admissible = np.add(action_Q, np.array(execMask))
                    logger.info('action Q...')
                    #print admissible.shape
                    #print admissible
                    nextaIdex = np.argmax(admissible)

                    # add current max Q to self.episode_ave_max_q
                    #print 'current maxQ', np.max(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))
                elif self.architecture == 'dip2':
                    admissible = []
                    for idx, v in enumerate(execMask):
                        action_name = self.actions.action_names[idx]
                        act_slot = 'general'
                        for slot in dip_state.slots:
                            if slot in action_name:
                                act_slot = slot
                        beliefVec = dip_state.get_beliefStateVec(act_slot)
                        action_Q = self.dqn.predict(
                            np.reshape(
                                beliefVec,
                                (1, len(beliefVec))))  # + (1. / (1. + i + j))
                        if v == 0:
                            admissible.append(action_Q[0][idx])
                        else:
                            admissible.append(v)
                    nextaIdex = np.argmax(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))

                else:
                    admissible = []
                    for idx, v in enumerate(execMask):
                        if v > -sys.maxsize:
                            Action_idx = np.eye(self.action_dim,
                                                self.action_dim)[[idx]]
                            Qidx = self.dqn.predict_dip(
                                np.reshape(beliefVec, (1, len(beliefVec))),
                                Action_idx)
                            #print 'argmax Q',Qidx[0]
                            admissible.append(Qidx[0])
                        else:
                            admissible.append(-sys.maxsize)
                    # action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
                    # admissible = np.add(action_Q, np.array(execMask))
                    logger.info('action Q...')
                    #print admissible
                    nextaIdex = np.argmax(admissible)

                    # add current max Q to self.episode_ave_max_q
                    #print 'current maxQ', np.max(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))

        elif self.exploration_type == 'Boltzman':
            # softmax
            if not self.is_training:
                self.epsilon = 0.001
            # self.epsilon here is served as temperature
            action_Q = self.dqn.predict(
                np.reshape(beliefVec,
                           (1, len(beliefVec))))  # + (1. / (1. + i + j))
            action_Q_admissible = np.add(action_Q, np.array(
                execMask))  # enforce Q of inadmissible actions to be -inf

            action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
            logger.info('action Q...')
            #print action_Q_admissible
            logger.info('action prob...')
            #print action_prob
            sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
            nextaIdex = np.argmax(action_prob[0] == sampled_prob)

        self.stats[nextaIdex] += 1
        summaryAct = self.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        return masterAct, nextaIdex
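
The Boltzmann branch above samples a probability value and then recovers its index with an equality argmax, which breaks ties towards the first matching action. A self-contained sketch of the same softmax-with-temperature sampling that draws an action index directly (a local softmax is used here instead of drlutils.softmax, and all names and values are illustrative):

import numpy as np

def boltzmann_sample(action_Q, exec_mask, temperature):
    # exec_mask follows the convention above: 0 for admissible actions,
    # -inf (or a very large negative number) for inadmissible ones.
    masked_Q = np.asarray(action_Q, dtype=float) + np.asarray(exec_mask, dtype=float)
    z = masked_Q / temperature
    z -= np.max(z)                 # subtract the max for numerical stability
    probs = np.exp(z)
    probs /= probs.sum()
    return np.random.choice(len(probs), p=probs)

# Illustrative call with made-up Q-values; the last action is masked out.
nextaIdex = boltzmann_sample([1.2, 0.3, 0.9], [0.0, 0.0, -np.inf], temperature=0.05)
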