Example #1
    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            self.logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
            return

        # normalising total return to -1~1
        reward /= 20.0

        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())

        if self.replay_type == 'vanilla':
            self.episodes[domainInControl].record(
                state=terminal_state,
                state_ori=TerminalState(),
                action=terminal_action,
                reward=reward,
                terminal=True)
        elif self.replay_type == 'prioritized':
            self.episodes[domainInControl].record(
                state=terminal_state,
                state_ori=TerminalState(),
                action=terminal_action,
                reward=reward,
                Q_s_t_a_t_=0.0,
                gamma_Q_s_tplu1_maxa_=0.0,
                uniform=False,
                terminal=True)
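
For context, the record(...) calls in these examples assume an episode buffer object held in self.episodes[domain]. The actual PyDial buffer classes are not shown here; the following is only a minimal sketch of the vanilla interface those calls rely on, and the class name and internal list are assumptions rather than the real implementation.

class EpisodeBuffer(object):
    """Minimal sketch of the vanilla episode-buffer interface assumed above."""

    def __init__(self):
        self.transitions = []

    def record(self, state, state_ori, action, reward, terminal=False):
        # Store one transition; terminal=True marks the end of the dialogue.
        self.transitions.append({
            'state': state,
            'state_ori': state_ori,
            'action': action,
            'reward': reward,
            'terminal': terminal,
        })

    def size(self):
        # Number of transitions recorded so far.
        return len(self.transitions)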
Example #2
    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            logger.warning(
                "record attempted to be finalized for domain where nothing has been recorded before"
            )
            return

        # normalising total return to -1~1
        reward /= 20.0
        terminal_state, terminal_action = self.convertStateAction(
            TerminalState(), TerminalAction())

        if self.replay_type == 'vanilla':
            self.episodes[domainInControl].record(
                state=terminal_state,
                state_ori=TerminalState(),
                action=terminal_action,
                reward=reward,
                terminal=True)
        elif self.replay_type == 'prioritized':
            # Heuristically assign 0.0 to Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_;
            # they are not used for the terminal record, so the values do not matter.
            if True:
                # if self.samplecount >= self.capacity:
                self.episodes[domainInControl].record(
                    state=terminal_state,
                    state_ori=TerminalState(),
                    action=terminal_action,
                    reward=reward,
                    Q_s_t_a_t_=0.0,
                    gamma_Q_s_tplu1_maxa_=0.0,
                    uniform=False,
                    terminal=True)
            else:
                self.episodes[domainInControl].record(
                    state=terminal_state,
                    state_ori=TerminalState(),
                    action=terminal_action,
                    reward=reward,
                    Q_s_t_a_t_=0.0,
                    gamma_Q_s_tplu1_maxa_=0.0,
                    uniform=True,
                    terminal=True)
        return
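
The prioritized branch passes Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_ as 0.0 because, as the comment notes, they are not needed for the terminal record. For non-terminal steps a prioritized buffer would typically turn these two quantities into an initial TD-error priority. The helper below is only a sketch of that idea; its name and internals are illustrative assumptions, not PyDial's buffer code.

def initial_priority(reward, Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_,
                     uniform=False, epsilon=1e-6):
    # Hypothetical sketch: derive an insertion priority from the arguments
    # that record() receives. uniform=True falls back to a constant priority,
    # i.e. uniform sampling.
    if uniform:
        return 1.0
    # |TD error| = |r + gamma * max_a' Q(s', a') - Q(s, a)|, plus a small
    # epsilon so that zero-error transitions can still be sampled.
    td_error = reward + gamma_Q_s_tplu1_maxa_ - Q_s_t_a_t_
    return abs(td_error) + epsilon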
Example #3
    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            logger.warning(
                "record attempted to be finalized for domain where nothing has been recorded before"
            )
            return

        # normalising total return to -1~1
        reward /= 20.0

        terminal_state, terminal_action = self.convertStateAction(
            TerminalState(), TerminalAction())
        value = 0.0  # no effect on experience replay

        def calculate_discountR_advantage(r_episode, v_episode):
            # Here we take the rewards and values from the rollout, and use
            # them to generate the advantage and discounted returns.
            # The advantage function uses "Generalized Advantage Estimation".
            bootstrap_value = 0.0
            self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
            discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
            self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
            advantage = (r_episode + self.gamma * self.v_episode_plus[1:]
                         - self.v_episode_plus[:-1])
            advantage = discount(advantage, self.gamma)
            return discounted_r_episode, advantage

        if self.replay_type == 'vanilla':
            self.episodes[domainInControl].record(
                state=terminal_state,
                state_ori=TerminalState(),
                action=terminal_action,
                reward=reward,
                value=value,
                terminal=True,
                distribution=None)
        elif self.replay_type == 'prioritized':
            episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(
                state=terminal_state,
                state_ori=TerminalState(),
                action=terminal_action,
                reward=reward,
                value=value)

            # TD_error is the list of TD errors for the current episode; its
            # mean absolute value becomes the episodic priority.
            _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
            episodic_TD = np.mean(np.absolute(TD_error))
            print('episodic_TD %s' % episodic_TD)
            self.episodes[domainInControl].insertPriority(episodic_TD)

        return
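
calculate_discountR_advantage depends on a discount helper that is not shown in this snippet. In A3C/GAE-style code it is usually the standard discounted cumulative sum; the version below, assumed here to match, is the common scipy.signal.lfilter implementation.

import numpy as np
import scipy.signal

def discount(x, gamma):
    # Discounted cumulative sum along the episode:
    #   y[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

# e.g. discount(np.asarray([1.0, 1.0, 1.0]), 0.9) -> [2.71, 1.9, 1.0]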
Example #4
    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            logger.warning(
                "record attempted to be finalized for domain where nothing has been recorded before"
            )
            return

        # normalising total return to -1~1
        reward /= 20.0

        terminal_state, terminal_action = self.convertStateAction(
            TerminalState(), TerminalAction())

        if self.replay_type == 'vanilla':
            self.episodes[domainInControl].record(
                state=terminal_state,
                state_ori=TerminalState(),
                action=terminal_action,
                reward=reward,
                terminal=True)
        elif self.replay_type == 'prioritized':
            # Heuristically assign 0.0 to Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_;
            # they are not used for the terminal record, so the values do not matter.
            if True:
                # if self.samplecount >= self.capacity:
                self.episodes[domainInControl].record(
                    state=terminal_state,
                    state_ori=TerminalState(),
                    action=terminal_action,
                    reward=reward,
                    Q_s_t_a_t_=0.0,
                    gamma_Q_s_tplu1_maxa_=0.0,
                    uniform=False,
                    terminal=True)
            else:
                self.episodes[domainInControl].record(
                    state=terminal_state,
                    state_ori=TerminalState(),
                    action=terminal_action,
                    reward=reward,
                    Q_s_t_a_t_=0.0,
                    gamma_Q_s_tplu1_maxa_=0.0,
                    uniform=True,
                    terminal=True)
        return
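
The reward /= 20.0 line that appears in these examples rescales the per-dialogue return into roughly the [-1, 1] range before it is stored. The arithmetic below assumes the common PyDial reward setup of +20 on success and -1 per turn; treat those numbers as an assumption, since the actual values come from the configuration.

# Worked example of the reward /= 20.0 normalisation, assuming the common
# PyDial scheme of +20 on success and -1 per turn (configuration-dependent).
success_reward = 20.0
turn_penalty = -1.0

# A successful 5-turn dialogue:
total_return = success_reward + 5 * turn_penalty   # 15.0
print(total_return / 20.0)                          # 0.75

# A failed 20-turn dialogue:
total_return = 0.0 + 20 * turn_penalty              # -20.0
print(total_return / 20.0)                          # -1.0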
Example #5
    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            logger.warning(
                "record attempted to be finalized for domain where nothing has been recorded before"
            )
            return

        # # normalising total return to -1~1
        # reward /= 20.0

        terminal_state, terminal_action = self.convertStateAction(
            TerminalState(), TerminalAction())

        self.mem_last_state = self.mem_cur_state
        self.mem_last_action = self.mem_cur_action
        self.mem_last_mask = self.mem_cur_mask
        self.mem_cur_state = np.vstack(
            [np.expand_dims(x, 0) for x in [terminal_state]])
        self.mem_cur_action = None
        self.mem_cur_mask = torch.zeros(self.action_dim).type(FloatTensor)

        state = self.mem_last_state
        action = self.mem_last_action
        next_state = self.mem_cur_state
        terminal = True

        if state is not None:
            self.trans_mem.append(
                self.trans(
                    torch.from_numpy(state).type(FloatTensor),  # state
                    action,  # action
                    torch.from_numpy(next_state).type(
                        FloatTensor),  # next state
                    torch.from_numpy(reward).type(FloatTensor),  # reward
                    terminal,  # terminal
                    self.mem_last_mask,  # action mask
                    self.mem_cur_mask))  # next action mask

            # randomly produce a preference for calculating priority
            # preference = self.w_kept
            preference = torch.randn(self.model_.reward_size)
            preference = (torch.abs(preference) /
                          torch.norm(preference, p=1)).type(FloatTensor)

            state = torch.from_numpy(state).type(FloatTensor)

            _, q = self.model_(
                Variable(state, requires_grad=False),
                Variable(preference.unsqueeze(0), requires_grad=False))

            q = q.data[0, action]

            if self.algorithm == 'naive':
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False))
                    hq = hq.data[0]
                    p = abs(wr + self.gamma * hq - q)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    p = abs(wr - q)
            elif self.algorithm == 'envelope':
                wq = preference.dot(q)
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False))
                    hq = hq.data[0]
                    whq = preference.dot(hq)
                    p = abs(wr + self.gamma * whq - wq)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    # if self.homotopy:
                    #     self.beta += self.beta_delta
                    #     self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta
                    p = abs(wr - wq)

            p += 1e-5

            self.priority_mem.append(p)
            if len(self.trans_mem) > self.mem_size:
                self.trans_mem.popleft()
                self.priority_mem.popleft()
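
Example #5 appends each transition to trans_mem together with a scalar priority in priority_mem. How those priorities are consumed is not shown here; the sketch below assumes a standard proportional prioritized sampler, and the function name and the alpha parameter are illustrative rather than taken from the actual policy.

import numpy as np

def sample_indices(priority_mem, batch_size, alpha=0.6):
    # Sketch of proportional prioritized sampling over trans_mem:
    # p_i is proportional to priority_i ** alpha (priorities are assumed
    # scalar, e.g. |TD error| + 1e-5 as computed above).
    priorities = np.asarray([float(p) for p in priority_mem], dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()
    return np.random.choice(len(priority_mem), size=batch_size, p=probs)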