Example #1
    def append(self, step, s, a, n_s, r, d):
        # Intrinsic reward for the low-level policy, measured against the current subgoal
        self.sr = self.low_reward(s, self.sg, n_s)

        # Low Replay Buffer
        self.replay_buffer_low.append(s, self.sg, a, n_s, self.n_sg, self.sr,
                                      float(d))

        # High Replay Buffer
        # buf layout: [start state, final goal, subgoal, accumulated reward,
        #              next state, done, state sequence, low-level action sequence]
        if _is_update(step, self.buffer_freq, rem=1):
            # Flush the accumulated high-level transition once a full segment has been collected
            if len(self.buf[6]) == self.buffer_freq:
                self.buf[4] = s
                self.buf[5] = float(d)
                self.replay_buffer_high.append(state=self.buf[0],
                                               goal=self.buf[1],
                                               action=self.buf[2],
                                               n_state=self.buf[4],
                                               reward=self.buf[3],
                                               done=self.buf[5],
                                               state_arr=np.array(self.buf[6]),
                                               action_arr=np.array(
                                                   self.buf[7]))
            # Start a new high-level transition from the current state and subgoal
            self.buf = [s, self.fg, self.sg, 0, None, None, [], []]

        # Accumulate the scaled environment reward and record the low-level
        # state/action trajectory for the pending high-level transition
        self.buf[3] += self.reward_scaling * r
        self.buf[6].append(s)
        self.buf[7].append(a)
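Every snippet here gates its work on a helper `_is_update` whose definition is not shown. A minimal sketch, assuming it simply fires once every `freq` steps at an optional remainder offset `rem`; the actual helper may differ:

def _is_update(step, freq, rem=0):
    # True whenever step % freq == rem, except at step 0
    return step != 0 and step % freq == rem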
Example #2
    def log(self, global_step, data):
        losses, td_errors = data[0], data[1]

        # Write loss and TD-error scalars once training has started
        if (global_step >= self.args.start_training_steps
                and _is_update(global_step, self.args.writer_freq)):
            for k, v in losses.items():
                self.logger.write('loss/%s'%(k), v, global_step)

            for k, v in td_errors.items():
                self.logger.write('td_error/%s'%(k), v, global_step)
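The `self.logger.write(tag, value, step)` calls in these snippets suggest a thin wrapper around a scalar writer. A minimal sketch, assuming TensorBoard's SummaryWriter underneath (an assumption; the project's actual Logger may differ):

from torch.utils.tensorboard import SummaryWriter

class Logger:
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)

    def write(self, tag, value, step):
        # Record a scalar value under `tag` at the given global step
        self.writer.add_scalar(tag, value, step)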
Example #3
    def end_episode(self, episode, logger=None):
        if logger:
            # Log the episode's accumulated intrinsic reward
            logger.write('reward/Intrinsic Reward', self.episode_subreward,
                         episode)

            # Save Model
            if _is_update(episode, self.model_save_freq):
                self.save(episode=episode)

        # Reset the per-episode intrinsic-reward accumulators and the high-level transition buffer
        self.episode_subreward = 0
        self.sr = 0
        self.buf = [None, None, None, 0, None, None, [], []]
Example #4
    def evaluate(self, e):
        # Periodically evaluate the current policy and print summary statistics
        if _is_update(e, self.args.print_freq):
            # Evaluate a copy of the agent so evaluation does not disturb training state
            agent = copy.deepcopy(self.agent)
            rewards, success_rate = agent.evaluate_policy(self.env)
            self.logger.write('Success Rate', success_rate, e)

            print('episode:{episode:05d}, mean:{mean:.2f}, std:{std:.2f}, median:{median:.2f}, success:{success:.2f}'.format(
                    episode=e,
                    mean=np.mean(rewards),
                    std=np.std(rewards),
                    median=np.median(rewards),
                    success=success_rate))
Example #5
    def end_episode(self, episode, logger=None):
        if logger:
            # Periodically checkpoint the model
            if _is_update(episode, self.model_save_freq):
                self.save(episode=episode)
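Taken together, these hooks slot into an ordinary training loop. A rough sketch of how they might be wired up, assuming a classic Gym-style env and hypothetical `agent.choose_action` / `agent.train` calls that do not appear in the snippets above:

def run_training(env, agent, trainer, logger, num_episodes):
    # Hypothetical glue code showing where each hook above would be called
    global_step = 0
    for episode in range(num_episodes):
        s, done = env.reset(), False
        while not done:
            a = agent.choose_action(s)                         # hypothetical action selector
            n_s, r, done, _ = env.step(a)
            agent.append(global_step, s, a, n_s, r, done)      # Example #1
            losses, td_errors = agent.train()                  # hypothetical update call
            trainer.log(global_step, (losses, td_errors))      # Example #2
            s = n_s
            global_step += 1
        agent.end_episode(episode, logger)                     # Examples #3 and #5
        trainer.evaluate(episode)                              # Example #4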