Example #1
    def test_per_nstep(self):
        """
        PrioritizedReplayBuffer.on_episode_end() ignores Exception

        Ref: https://gitlab.com/ymd_h/cpprb/-/issues/111
        """

        rb = PrioritizedReplayBuffer(32, {
            "rew": {},
            "done": {}
        },
                                     Nstep={
                                         "size": 4,
                                         "rew": "rew",
                                         "gamma": 0.5
                                     })

        for _ in range(10):
            rb.add(rew=0.5, done=0.0)

        rb.add(rew=0.5, done=1.0)
        rb.on_episode_end()

        s = rb.sample(16)

        self.assertIn("discounts", s)
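The Nstep block above makes the buffer accumulate truncated n-step returns and expose a "discounts" column when sampling. As a hand-worked sketch of that arithmetic under the usual n-step convention (illustrative only, not taken from the cpprb documentation), with rew=0.5 at every step, gamma=0.5 and size=4:

gamma, size, rew = 0.5, 4, 0.5
n_step_return = sum(rew * gamma**k for k in range(size))  # 0.5 + 0.25 + 0.125 + 0.0625 = 0.9375
bootstrap_discount = gamma**size                          # 0.0625, the factor applied to the bootstrap value
print(n_step_return, bootstrap_discount)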
Example #2
File: v8.py, Project: ymd-h/cpprb
    def test_sample(self):
        buffer_size = 500
        obs_shape = (84, 84, 3)
        act_dim = 4

        rb = PrioritizedReplayBuffer(buffer_size, {
            "obs": {
                "shape": obs_shape
            },
            "act": {
                "shape": act_dim
            },
            "rew": {},
            "done": {}
        },
                                     next_of="obs")

        obs = np.zeros(obs_shape)
        act = np.ones(act_dim)
        rew = 1
        done = 0

        rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)

        ps = 1.5

        rb.add(obs=obs,
               act=act,
               rew=rew,
               next_obs=obs,
               done=done,
               priorities=ps)

        self.assertAlmostEqual(rb.get_max_priority(), 1.5)

        obs = np.stack((obs, obs))
        act = np.stack((act, act))
        rew = (1, 0)
        done = (0.0, 1.0)

        rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)

        ps = (0.2, 0.4)
        rb.add(obs=obs,
               act=act,
               rew=rew,
               next_obs=obs,
               done=done,
               priorities=ps)

        sample = rb.sample(64)

        w = sample["weights"]
        i = sample["indexes"]

        rb.update_priorities(i, w * w)
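In this test the priorities passed to update_priorities are arbitrary (the squared importance weights); in a real training loop the conventional choice is the absolute TD error plus a small constant. A minimal sketch of that common pattern, with a random stand-in for the TD errors (not anything mandated by cpprb):

import numpy as np

s = rb.sample(64)
td_error = np.random.rand(64)                        # stand-in for |Q_target - Q| from a learner
rb.update_priorities(s["indexes"], np.abs(td_error) + 1e-6)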
Example #3
    def test_PrioritizedReplayBuffer_with_single_step_with_priorities(self):
        buffer_size = 256
        obs_shape = (3, 4)
        batch_size = 10

        rb = PrioritizedReplayBuffer(buffer_size,
                                     {"obs": {
                                         "shape": obs_shape
                                     }})

        v = {"obs": np.ones(shape=obs_shape), "priorities": 0.5}

        rb.add(**v)

        rb.sample(batch_size)

        for _ in range(100):
            rb.add(**v)

        rb.sample(batch_size)
Example #4
    def test_PrioritizedReplayBuffer_with_multiple_steps(self):
        buffer_size = 256
        obs_shape = (3, 4)
        step_size = 32
        batch_size = 10

        rb = PrioritizedReplayBuffer(buffer_size,
                                     {"obs": {
                                         "shape": obs_shape
                                     }})

        v = {"obs": np.ones(shape=(step_size, *obs_shape))}

        rb.add(**v)

        rb.sample(batch_size)

        for _ in range(100):
            rb.add(**v)

        rb.sample(batch_size)
Example #5
File: issue.py, Project: ymd-h/cpprb
    def test_read_only_priority(self):
        buffer_size = 100
        batch_size = 32

        env_dict = {"done": {}}

        done = np.zeros(2)
        ps = np.ones_like(done)
        ps.setflags(write=False)

        rb = PrioritizedReplayBuffer(buffer_size, env_dict)
        rb.add(done=done, priority=ps)

        sample = rb.sample(batch_size)
        ps2 = sample["weights"]
        ps2.setflags(write=False)

        rb.update_priorities(sample["indexes"], ps2)
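This appears to be a regression test ensuring that non-writeable arrays are accepted by add() and update_priorities(). Read-only arrays commonly show up when buffers are shared between processes or returned by libraries that lock them; a tiny illustration of how such an array behaves (independent of cpprb):

import numpy as np

ps = np.ones(2)
ps.setflags(write=False)
try:
    ps[0] = 2.0
except ValueError as err:
    print(err)  # assignment destination is read-only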
Example #6
    # Directly access internal PrioritizedBuffer,
    # since ChainerRL/PrioritizedReplayBuffer has no API to set priority.
    cprb.memory.append([{
        "state": o,
        "action": a,
        "reward": r,
        "next_state": o,
        "is_state_terminal": d
    }],
                       priority=p)

perfplot.plot(
    time_unit="ms",
    setup=lambda n: n,
    kernels=[
        lambda n: bprb.sample(n, beta=beta),
        lambda n: rprb.sample(n, beta=beta),
        sample_c(cprb), lambda n: prb.sample(n, beta=beta)
    ],
    labels=["OpenAI/Baselines", "Ray/RLlib", "Chainer/ChainerRL", "cpprb"],
    n_range=[2**n for n in range(1, 9)],
    xlabel="Batch size",
    logx=False,
    logy=False,
    equality_check=None)
plt.title("Prioritized Replay Buffer Sample Speed")
plt.savefig("PrioritizedReplayBuffer_sample.png",
            transparent=True,
            bbox_inches="tight")
plt.close()
Example #7
class RainbowAgent:
    """Agent interacting with environment.
    
    Attribute:
        env (gym.Env): openAI Gym environment
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including 
                           state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """

    def __init__(
        self, 
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10, 
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",

    ):
        """Initialization.
        
        Args:
            env (gym.Env): openAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        
        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        # NoisyNet: All attributes related to epsilon are removed

        # Produce a unique run timestamp: day_month_hour_minute_second
        t = time.localtime(time.time())
        run_timestamp = "_".join(str(t[i]) for i in (2, 1, 3, 4, 5))

        # Write scalars (viewable in TensorBoard) under "runLogs/<timestamp>"
        self.writer = SummaryWriter("runLogs/" + run_timestamp)


        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        print(self.device)
        
        # PER
        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha    
        )
        
        # memory for N-step Learning
        self.use_n_step = n_step > 1
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )
            
        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(
            self.v_min, self.v_max, self.atom_size
        ).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()
        
        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=0.0001)

        # transition to store in memory
        self.transition = list()
        
        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        # self.tensorboard = RainbowTensorBoard(
        #     log_dir="single_joint_logs/{}-{}".format(
        #         model_name,
        #         datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
        #     )
        # )
        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score 
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p


    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(
            torch.FloatTensor(state).to(self.device)
        ).argmax()
        selected_action = selected_action.detach().cpu().numpy()
        
        if not self.is_test:

            self.transition = [state, selected_action]
        

        return selected_action


    def step(self, action: np.ndarray, score: int) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action,score)

        if not self.is_test:
            self.transition += [reward, next_state, done]
            
            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], self.transition)
                    )
                )
                one_step_transition = [v[idx] for _, v in self.memory_n.get_all_transitions().items()] if idx is not None else None

            # 1-step transition
            else:
                one_step_transition = self.transition

            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition)
                    )
                )
    
        return next_state, reward, done


    def update_model(self, frame_idx: int) -> float:
        """Update the model by gradient descent.

        Shapes (batch_size=128): elementwise_loss is [128], loss is a scalar,
        weights is [128, 1].
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = torch.FloatTensor(
            samples["weights"].reshape(-1, 1)
        ).to(self.device)
        indices = samples["indexes"]
        #rospy.loginfo(samples.keys())
        #rospy.loginfo(weights.shape)
        #rospy.loginfo(indices.shape())

        #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time())))
        
        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)
        
        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)
        
        self.writer.add_scalar('update_model/Lossv0', loss.detach().item(), frame_idx)
        
        # N-step Learning loss
        # We combine the 1-step loss and the n-step loss to reduce variance;
        # the original Rainbow employs the n-step loss only.
        if self.use_n_step:
            gamma = self.gamma ** self.n_step
            samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()}
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss
            
            #rospy.loginfo(elementwise_loss_n_loss.shape)
            #rospy.loginfo(elementwise_loss.shape)

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        
        rospy.loginfo(f"{elementwise_loss}")
        self.optimizer.zero_grad()
        self.writer.add_scalar('update_model/Lossv1', loss.detach().item(), frame_idx)
        #From pytorch doc: backward() Computes the gradient of current tensor w.r.t. graph leaves.
        #self.writer.add_image("loss gradient before", loss, frame_idx)
        loss.backward()
        #self.writer.add_image("loss gradient after", loss, frame_idx)
        self.writer.add_scalar('update_model/Lossv2', loss.detach().item(), frame_idx)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()
        
        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)
        
        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()
        
        #rospy.loginfo("second")
        #rospy.loginfo(loss.shape)

        #rospy.loginfo("loss dimension = " + loss.ndim()  )   
        #rospy.loginfo("loss = " + str(loss.detach().item()) + "type = " + str(type(loss.detach().item())  )   )   
        self.writer.add_scalar('update_model/Loss', loss.detach().item(), frame_idx)
        return loss.detach().item()


    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False
        
        state = self.env.reset()
        update_cnt = 0
        losses = []
        scores = []
        score = 0

        for frame_idx in tqdm(range(1, num_frames + 1)):

            action = self.select_action(state)
            next_state, reward, done = self.step(action,score)

            state = next_state
            score += reward
            
            # NoisyNet: removed decrease of epsilon
            
            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if episode ends
            if done:
                #rospy.loginfo("logging for done")
                self.writer.add_scalar('train/score', score, frame_idx)
                self.writer.add_scalar('train/final_epsilon', state[6], frame_idx)
                self.writer.add_scalar('train/epsilon_p', state[7], frame_idx)
                state = self.env.reset()
                scores.append(score)
                score = 0

            # if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                #frame_id given as argument for logging by self.writer. 
                #rospy.loginfo("frame_idx= " + str(frame_idx) + "type = " + str(type(frame_idx)))
                loss = self.update_model(frame_idx)

                losses.append(loss)
                update_cnt += 1
                
                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update(loss)

        self.env.close()


    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True
        
        state = self.env.reset()
        done = False
        score = 0
        
        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward
        
        print("score: ", score)
        self.env.close()
        
        return frames


    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> torch.Tensor:
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(np.array(samples["act"]).reshape(-1)).to(device)  # flatten (batch, 1) -> (batch,) for indexing
        reward = torch.FloatTensor(np.array(samples["rew"]).reshape(-1, 1)).to(device)
        done = torch.FloatTensor(np.array(samples["done"]).reshape(-1, 1)).to(device)
        
        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (
                torch.linspace(
                    0, (self.batch_size - 1) * self.atom_size, self.batch_size
                ).long()
                .unsqueeze(1)
                .expand(self.batch_size, self.atom_size)
                .to(self.device)
            )

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(
                0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)
            )
            proj_dist.view(-1).index_add_(
                0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)
            )
            print(f"Next Action : {next_action}\n Next Dist : {next_dist}\n")

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)
        print(f"Proj Dist : {proj_dist}\n Dist : {dist}\n Log_p : {log_p}\n")
        if torch.isnan(elementwise_loss).any():
            exit()

        return elementwise_loss


    def _target_hard_update(self,loss):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time())))

        torch.save({
            'model_state_dict': self.dqn.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
            }, str("checkpoints/checkpoint_"+str(time.time())))
Example #8
    if np.random.rand() < egreedy:
        action = env.action_space.sample()
    else:
        Q = tf.squeeze(model(observation.reshape(1, -1)))
        action = np.argmax(Q)

    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
           act=action,
           rew=reward,
           next_obs=next_observation,
           done=done)
    observation = next_observation

    if prioritized:
        sample = rb.sample(batch_size, beta)
        beta += beta_step
    else:
        sample = rb.sample(batch_size)

    weights = sample["weights"].ravel() if prioritized else tf.constant(1.0)

    with tf.GradientTape() as tape:
        tape.watch(model.trainable_weights)
        Q = Q_func(model, tf.constant(sample["obs"]),
                   tf.constant(sample["act"].ravel()),
                   tf.constant(env.action_space.n))
        target_Q = target_func(model, target_model,
                               tf.constant(sample['next_obs']),
                               tf.constant(sample["rew"].ravel()),
                               tf.constant(sample["done"].ravel()), discount,
Example #9
    r = np.random.rand(1)
    d = np.random.randint(2)  # [0,2) == 0 or 1
    p = np.random.rand(1)

    client.insert([o, a, r, o, d], priorities={"PrioritizedReplayBuffer": p})

    prb.add(obs=o, act=a, rew=r, next_obs=o, done=d, priority=p)

perfplot.plot(time_unit="ms",
              setup=lambda n: n,
              kernels=[
                  sample_client(client, "PrioritizedReplayBuffer"),
                  sample_tf_client(tf_client, "PrioritizedReplayBuffer"),
                  sample_tf_client_dataset(tf_client,
                                           "PrioritizedReplayBuffer"),
                  lambda n: prb.sample(n, beta=beta)
              ],
              labels=[
                  "DeepMind/Reverb: Client.sample",
                  "DeepMind/Reverb: TFClient.sample",
                  "DeepMind/Reverb: TFClient.dataset", "cpprb"
              ],
              n_range=[2**n for n in range(1, 9)],
              xlabel="Batch size",
              logx=False,
              logy=False,
              equality_check=None)
plt.title("Prioritized Replay Buffer Sample Speed")
plt.savefig("PrioritizedReplayBuffer_sample2.png",
            transparent=True,
            bbox_inches="tight")
Example #10
class Agent:
    def __init__(self,
                 lr,
                 state_shape,
                 num_actions,
                 batch_size,
                 max_mem_size=100000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)
        self.importance_exp = Lerper(start=0.4, end=1.0, num_steps=100000)

        self.priority_exp = 0.6
        self.memory = PrioritizedReplayBuffer(max_mem_size, {
            "obs": {
                "shape": state_shape
            },
            "act": {
                "shape": 1
            },
            "rew": {},
            "next_obs": {
                "shape": state_shape
            },
            "done": {
                "shape": 1
            }
        },
                                              alpha=self.priority_exp)

        self.net = Network(lr, state_shape, num_actions)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.net.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state,
                        act=action,
                        rew=reward,
                        next_obs=next_state,
                        done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size,
                                   self.importance_exp.value())

        states = torch.tensor(batch["obs"]).to(self.net.device)
        actions = torch.tensor(batch["act"],
                               dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.net.device).T[0]
        states_ = torch.tensor(batch["next_obs"]).to(self.net.device)
        dones = torch.tensor(batch["done"],
                             dtype=torch.bool).to(self.net.device).T[0]
        weights = torch.tensor(batch["weights"]).to(self.net.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = ((td**2.0) * weights).mean()
        loss.backward()
        self.net.optimizer.step()

        new_priorities = (td.abs()).detach().cpu()
        self.memory.update_priorities(batch["indexes"], new_priorities)

        self.epsilon.step()
        self.importance_exp.step()
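A hypothetical driver loop for the Agent above; the environment, the hyperparameters and the classic 4-tuple gym step API are assumptions, and Network and Lerper are helpers defined elsewhere in the same project:

import gym

env = gym.make("CartPole-v1")
agent = Agent(lr=1e-3,
              state_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              batch_size=64)

obs = env.reset()
for _ in range(10_000):
    act = agent.choose_action(obs)
    next_obs, rew, done, _ = env.step(act)
    agent.store_memory(obs, act, rew, next_obs, done)
    agent.learn()
    obs = env.reset() if done else next_obs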
Example #11
                    if int(mean) > best_reward:
                        best_reward = int(mean)
                        torch.save(net.state_dict(), f=os.path.join(
                            folder, sub_folder, str(best_reward) + '.dat'))
                    if mean > params.solve_rewards:
                        print('Solved in {}!'.format(datetime.now()-st))
                        if args.play:
                            utils.play(params.env, agent, wait=0.01)
                        break

            if frame < params.init_replay:
                continue

            if frame % args.envs == 0:
                if args.priority:
                    batch = buffer.sample(params.batch_size, BETA)
                    BETA = min(BETA + frame * step, TGT_BETA)
                else:
                    batch = buffer.sample(params.batch_size)

                optimizer.zero_grad()
                if args.priority:
                    loss_v, batch_prios, batch_indexes = calc_loss_dqn(batch, net,
                                                                       tgt_net, params.gamma, device, True)
                else:
                    loss_v = calc_loss_dqn(
                        batch, net, tgt_net, params.gamma**args.steps, device=device)
                loss_v.backward()
                optimizer.step()
                if args.priority:
                    buffer.update_priorities(batch_indexes, batch_prios)
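The beta schedule above depends on externally defined BETA, step and TGT_BETA. A common alternative, shown here purely as an assumption rather than anything from this project, anneals beta linearly from its starting value to 1 over the whole run:

BETA_START, TGT_BETA, TOTAL_FRAMES = 0.4, 1.0, 1_000_000

def beta_by_frame(frame):
    # Linear annealing; reaches TGT_BETA at the end of training
    return min(TGT_BETA, BETA_START + frame * (TGT_BETA - BETA_START) / TOTAL_FRAMES)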
Example #12
class ReplayMemory():
    def __init__(self, args, capacity, env):
        # Initial importance sampling weight β, annealed to 1 over course of training
        self.priority_weight = args.priority_weight
        self.n = args.multi_step
        self.device = args.device
        if args.mmap:
            os.makedirs('memories/', exist_ok=True)
            mmap_prefix = 'memories/mm'
        else:
            mmap_prefix = None
        self.buffer = PrioritizedReplayBuffer(
            capacity,
            {
                "obs": {
                    "shape": env.observation_space.shape,
                    "dtype": env.observation_space.dtype
                },
                "next_obs": {
                    "shape": env.observation_space.shape,
                    "dtype": env.observation_space.dtype
                },
                "act": {
                    "shape": 1,
                    "dtype": env.action_space.dtype
                },
                "rew": {
                    "dtype": np.float32
                },
                "done": {
                    "dtype": np.uint8
                },
            },
            Nstep={
                "size": self.n,
                "gamma": args.discount,
                "rew": "rew",
                "next": "next_obs",
            },
            mmap_prefix=mmap_prefix,
            alpha=args.priority_exponent,
            # next_of="obs",
            # stack_compress="obs",
        )

    def append(self, state, next_state, action, reward, done):
        self.buffer.add(
            **{
                "obs": state,
                "next_obs": next_state,
                "act": action,
                "rew": reward,
                "done": done,
            })

    def sample(self, size):
        s = self.buffer.sample(size, self.priority_weight)
        s['indexes'] = s['indexes'].astype(np.int32)
        return torchify((s['indexes'], torch.int32), (s['obs'], torch.float32),
                        (np.squeeze(s['act'], 1), torch.long),
                        (np.squeeze(s['rew'], 1), torch.float32),
                        (s['next_obs'], torch.float32),
                        (s['done'], torch.bool), (s['weights'], torch.float32),
                        device=self.device)

    def update_priorities(self, indexes, new_priorities):
        indexes = indexes.cpu().numpy()
        self.buffer.update_priorities(indexes, new_priorities)
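torchify is a project-specific helper that is not shown in this excerpt; a plausible sketch of what it might look like (purely an assumption, added for readability) is:

import torch

def torchify(*pairs, device="cpu"):
    """Hypothetical helper: convert (array, dtype) pairs into tensors on `device`."""
    return tuple(torch.as_tensor(arr, dtype=dtype, device=device) for arr, dtype in pairs)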
Example #13
class RainbowAgent:
    """
    Rainbow Agent interacting with environment.
    
    Attribute:
        env (gym.Env): openAI Gym environment (connected to Gazebo node)
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including 
            state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """
    def __init__(
        self,
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10, 
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",
    ):
        """
        Initialization.

        Args:
            env_client (GymEnvClient): ROS client to an openAI Gym environment server
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma

        # Selecting computing device
        physical_devices = tf.config.list_physical_devices('GPU') 
        n_gpu = len(physical_devices)
        rospy.loginfo("Number of GPU detected : " + str(n_gpu))
        if n_gpu > 0:
            rospy.loginfo("Switching to single GPU mode : /device:GPU:0")
            self.used_device = "/device:GPU:0"
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
        else:
            rospy.loginfo("No GPU detected. Switching to single CPU mode : /device:CPU:0")
            self.used_device = "/device:CPU:0"

        # PER
        # memory for 1-step learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha    
        )

        # memory for N-step learning
        self.use_n_step = n_step > 1
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = tf.linspace(self.v_min, self.v_max, self.atom_size, name="support")

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support, name="dqn"
        )
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support, name="dqn_target"
        )

        # optimizer
        self.optimizer = Adam(
            learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name='AdamOptimizer'
        )

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        self.tensorboard = RainbowTensorBoard(
            log_dir="single_joint_logs/{}-{}".format(
                model_name,
                datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
            )
        )
        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score 
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p

        #TODO 
        # model checkpoint object
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.dqn_target)
        self.checkpoint_manager = tf.train.CheckpointManager(
            self.checkpoint, directory="single_joint_ckpts", max_to_keep=5
        )


    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = tf.math.argmax(self.dqn(
            tf.constant(state.reshape(1, state.shape[0]), dtype=tf.float32)
        ), axis=-1, name="argmax_selected_action")
        
        # Convert to numpy ndarray datatype
        selected_action = selected_action.numpy()

        if not self.is_test:
            self.transition = [state, selected_action]
        
        return selected_action


    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """
        Take an action and return the response of the env.
        """
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, done]

            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], self.transition)
                    )
                )
                one_step_transition = [v[idx] for _, v in self.memory_n.get_all_transitions().items()] if idx is not None else None

            # 1-step transition
            else:
                one_step_transition = self.transition
            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition)
                    )
                )
        return next_state, reward, done


    def update_model(self) -> np.ndarray:
        """Update the model by gradient descent."""
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = tf.constant(
            samples["weights"].reshape(-1, 1),
            dtype=tf.float32,
            name="update_model_weights"
        )
        indices = samples["indexes"]

        with tf.GradientTape() as tape:
            # 1-step Learning loss (computed inside the tape so gradients can
            # flow back through the online network)
            elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

            # PER: importance sampling before averaging
            loss = tf.math.reduce_mean(elementwise_loss * weights)

            # N-step Learning loss
            # We combine the 1-step loss and the n-step loss so as to reduce variance.
            if self.use_n_step:
                gamma = self.gamma ** self.n_step
                samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()}
                elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
                elementwise_loss += elementwise_loss_n_loss

                # PER: importance sampling before averaging
                loss = tf.math.reduce_mean(elementwise_loss * weights)
        
        dqn_variables = self.dqn.trainable_variables
        gradients = tape.gradient(loss, dqn_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
        self.optimizer.apply_gradients(zip(gradients, dqn_variables))

        # PER: update priorities
        loss_for_prior = elementwise_loss.numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.numpy().ravel()


    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        scores = deque(maxlen=self.convergence_window)
        joint_epsilon = deque(maxlen=self.convergence_window)
        joint_epsilon_p = deque(maxlen=self.convergence_window_epsilon_p)
        score = 0 # cumulated reward
        episode_length = 0
        episode_cnt = 0

        for frame_idx in tqdm(range(1, num_frames + 1), file=tqdm_out):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)
            state = next_state
            score += reward
            episode_length += 1

            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            print("epsilon_p is {}".format(state[7]))
            print("epsilon is {}".format(state[6]))

            if done:
                print("done")
                # to be used for convergence criterion
                scores.append(score) 
                joint_epsilon.append(state[6])
                joint_epsilon_p.append(state[7])
                self.tensorboard.update_stats(
                    score={
                        "data": score,
                        "desc": "Score (or cumulated rewards) for an episode - episode index on x-axis."
                    },
                    episode_length={
                        "data": episode_length,
                        "desc": "Episode length (in frames)"
                    },
                    final_epsilon={
                        "data": state[6],
                        "desc": "Value of epsilon = abs(theta_ld - theta_l) at the last frame of an episode"
                    },
                    final_epsilon_p={
                        "data": state[7],
                        "desc": "Value of d(epsilon)/dt at the last frame of an episode"
                    }
                )
                # Reset only after the terminal state has been logged
                state = self.env.reset()
                score = 0
                episode_length = 0
                episode_cnt += 1

                # check convergence criterion
                converged = bool(
                    len(scores) == self.convergence_window and # be sure the score buffer is full
                    len(joint_epsilon) == self.convergence_window and # same for epsilon buffer
                    len(joint_epsilon_p) == self.convergence_window and # same for epsilon_p buffer
                    mean(scores) > self.convergence_avg_score and 
                    mean(joint_epsilon) < self.convergence_avg_epsilon and
                    mean(joint_epsilon_p) < self.convergence_avg_epsilon_p
                )
                if converged:
                    rospy.loginfo("Ran {} episodes. Solved after {} trials".format(episode_cnt, frame_idx))
                    return

            #  if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                loss = self.update_model()
                # plotting loss every frame
                self.tensorboard.update_stats(
                    loss={
                        "data": loss[0],
                        "desc": "Loss value."
                    }
                )
                update_cnt += 1
                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()
                    # checkpointing of target model (only if the loss decrease)
                    self.checkpoint_manager.save()


        self.env.close()


    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True
        
        state = self.env.reset()
        done = False
        score = 0
        
        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward
        
        rospy.loginfo("score: ", score)
        self.env.close()
        
        return frames


    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> tf.Tensor:
        with tf.device(self.used_device):
            state = tf.constant(samples["obs"], dtype=tf.float32)
            next_state = tf.constant(samples["next_obs"], dtype=tf.float32)
            action = tf.constant(samples["act"], dtype=tf.float32)
            reward = tf.reshape(tf.constant(samples["rew"], dtype=tf.float32), [-1, 1])
            done = tf.reshape(tf.constant(samples["done"], dtype=tf.float32), [-1, 1])

            # Categorical DQN algorithm
            delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

            # Double DQN
            next_action = tf.math.argmax(self.dqn(next_state), axis=1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = tf.gather_nd(
                next_dist,
                [[i, next_action.numpy()[i]] for i in range(self.batch_size)]
            )

            t_z = reward + (1 - done) * gamma * self.support
            t_z = tf.clip_by_value(t_z, clip_value_min=self.v_min, clip_value_max=self.v_max)
            b = tf.dtypes.cast((t_z - self.v_min) / delta_z, tf.float64)
            l = tf.dtypes.cast(tf.math.floor(b), tf.float64)
            u = tf.dtypes.cast(tf.math.ceil(b), tf.float64)

            offset = (
                tf.broadcast_to(
                    tf.expand_dims(
                        tf.dtypes.cast(
                            tf.linspace(0, (self.batch_size - 1) * self.atom_size, self.batch_size),
                            tf.float64
                        ),
                        axis=1
                    ),
                    [self.batch_size, self.atom_size]
                )
            )

            proj_dist = tf.zeros(tf.shape(next_dist), tf.float64)
            # casting
            next_dist = tf.dtypes.cast(next_dist, tf.float64)

            proj_dist = tf.tensor_scatter_nd_add(
                tf.reshape(proj_dist, [-1]), # input tensor
                tf.reshape(tf.dtypes.cast(l + offset, tf.int64), [-1, 1]), # indices
                tf.reshape((next_dist * (u - b)), [-1]) # updates
            )

            proj_dist = tf.tensor_scatter_nd_add(
                proj_dist,
                tf.reshape(tf.dtypes.cast(u + offset, tf.int64), [-1, 1]), # indices
                tf.reshape((next_dist * (b - l)), [-1]) # updates
            )
            proj_dist = tf.reshape(proj_dist, [self.batch_size, self.atom_size])

        dist = self.dqn.dist(state)
        #log_p = tf.math.log(dist[range(self.batch_size), action])
        log_p = tf.dtypes.cast(
            tf.math.log(
                tf.gather_nd(
                    dist,
                    [[i, tf.dtypes.cast(tf.reshape(action, [-1]), tf.int32).numpy()[i]] for i in range(self.batch_size)]
                )
            ),
            tf.float64
        )
        elementwise_loss = tf.math.reduce_sum(-(proj_dist * log_p), axis=1)

        return tf.dtypes.cast(elementwise_loss, tf.float32)


    def _target_hard_update(self):
        """Hard update: target <- local."""
        tf.saved_model.save(self.dqn, "single_joint_dqn")
        self.dqn_target = tf.saved_model.load("single_joint_dqn")
Example #14
class DQNAgent:
    def __init__(self):
        # other hyperparameters
        self.save_graph = True
        self.isTraining = True
        self.keepTraining = False
        self.play = False
        self.render = False
        self.save_model = True
        self.load_model = False
        self.random = False
        self.dueling = True
        # epsilon greedy exploration
        self.initial_epsilon = 1.0
        self.epsilon = self.initial_epsilon
        self.min_epsilon = 0.01
        self.linear_annealed = (self.initial_epsilon - self.min_epsilon) / 2000
        self.decay_rate = 0.995

        # check the hyperparameters
        if self.random:
            self.play = False
            self.isTraining = False
        if self.play:
            self.render = True
            self.save_model = False
            self.load_model = True
            self.isTraining = False
            self.keepTraining = False
        if self.keepTraining:
            self.epsilon = self.min_epsilon
            self.load_model = True
        # fixed q value - two networks
        self.learning_rate = 0.0001
        self.fixed_q_value_steps = 100
        self.target_network_counter = 0

        # n-step learning
        self.n_step = 3
        self.n_step_buffer = deque(maxlen=self.n_step)

        # experience replay used SumTree
        # combine agent and PER
        self.batch_size = 64
        self.gamma = 0.9
        self.replay_start_size = 320
        self.PER_e = 0.01  # epsilon: p_i = |delta_i| + epsilon, so zero-error transitions can still be sampled
        self.PER_a = 0.6  # alpha: P(i) = p_i ** a / sum_k(p_k ** a)
        self.PER_b = 0.4
        self.PER_b_increment = 0.005
        self.absolute_error_upper = 1.  # clipped error
        self.experience_number = 0

        env_dict = {
            "obs": {
                "shape": (state_size, )
            },
            "act": {},
            "rew": {},
            "next_obs": {
                "shape": (state_size, )
            },
            "done": {}
        }
        self.experience_replay = PrioritizedReplayBuffer(memory_size,
                                                         env_dict=env_dict,
                                                         alpha=self.PER_a,
                                                         eps=self.PER_e)

        # initially, p1=1 total_priority=1,so P(1)=1,w1=batchsize**beta

        if self.load_model:
            self.model = keras.models.load_model('cartpole_nstep.h5')
            self.target_model = keras.models.load_model('cartpole_nstep.h5')
        else:
            self.model = self.create_model()
            self.target_model = self.create_model()

    # n-step learning, get the truncated n-step return
    def get_n_step_info(self, n_step_buffer, gamma):
        """Return n step reward, next state, and done."""
        # info of the last transition
        reward, next_state, done = n_step_buffer[-1][-3:]

        for transition in reversed(list(n_step_buffer)[:-1]):
            r, n_s, d = transition[-3:]
            reward = r + gamma * reward * (1 - d)
            next_state, done = (n_s, d) if d else (next_state, done)

        return reward, next_state, done
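    # Worked example for get_n_step_info above (illustrative values, not from the
    # project): with gamma = 0.9 and a full 3-step buffer whose transitions end in
    # (r, next_s, d) = (1, s1, 0), (1, s2, 0), (1, s3, 0), the reversed fold yields
    # reward = 1 + 0.9 * (1 + 0.9 * 1) = 2.71 with next_state = s3 and done = 0;
    # a terminal flag in an intermediate transition truncates the return there.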

    def store(self, experience):
        self.n_step_buffer.append(experience)
        if len(self.n_step_buffer) == self.n_step:
            reward, next_state, done = self.get_n_step_info(
                self.n_step_buffer, self.gamma)
            state, action = self.n_step_buffer[0][:2]
            self.experience_replay.add(obs=state,
                                       act=action,
                                       rew=reward,
                                       next_obs=next_state,
                                       done=done)

    def create_model(self):
        inputs = tf.keras.Input(shape=(state_size, ))
        fc1 = tf.keras.layers.Dense(128, activation='relu')(inputs)
        fc2 = tf.keras.layers.Dense(128, activation='relu')(fc1)
        advantage_output = tf.keras.layers.Dense(action_size,
                                                 activation='linear')(fc2)

        value_out = tf.keras.layers.Dense(1, activation='linear')(fc2)
        norm_advantage_output = tf.keras.layers.Lambda(
            lambda x: x - tf.reduce_mean(x))(advantage_output)
        outputs = tf.keras.layers.Add()([value_out, norm_advantage_output])
        model = tf.keras.Model(inputs, outputs)

        model.compile(optimizer=tf.keras.optimizers.Adam(self.learning_rate),
                      loss=tf.keras.losses.MeanSquaredError(),
                      metrics=['accuracy'])
        model.summary()
        return model

    def train(self):
        if self.experience_replay.get_stored_size() > self.batch_size:
            samples = self.experience_replay.sample(self.batch_size)
            td_errors, loss = self._train_body(samples)
            self.experience_replay.update_priorities(samples["indexes"],
                                                     td_errors.numpy() + 1e-6)

    @tf.function
    def _train_body(self, samples):
        with tf.GradientTape() as tape:
            td_errors = self._compute_td_error_body(samples["obs"],
                                                    samples["act"],
                                                    samples["rew"],
                                                    samples["next_obs"],
                                                    samples["done"])
            loss = tf.reduce_mean(
                tf.square(td_errors))  # Huber loss did not seem to help here
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))
        return td_errors, loss

    @tf.function
    def _compute_td_error_body(self, states, actions, rewards, next_states,
                               dones):
        rewards = tf.cast(tf.squeeze(rewards), dtype=tf.float32)
        dones = tf.cast(tf.squeeze(dones), dtype=tf.bool)
        actions = tf.cast(actions, dtype=tf.int32)  # (batch_size, 1)
        batch_size_range = tf.expand_dims(tf.range(self.batch_size),
                                          axis=1)  # (batch_size, 1)

        # get current q value
        current_q_indexes = tf.concat(values=(batch_size_range, actions),
                                      axis=1)  # (batch_size, 2)
        current_q = tf.gather_nd(self.model(states),
                                 current_q_indexes)  # (batch_size, )

        # get target q value using double dqn
        max_next_q_indexes = tf.argmax(self.model(next_states),
                                       axis=1,
                                       output_type=tf.int32)  # (batch_size, )
        indexes = tf.concat(values=(batch_size_range,
                                    tf.expand_dims(max_next_q_indexes,
                                                   axis=1)),
                            axis=1)  # (batch_size, 2)
        target_q = tf.gather_nd(self.target_model(next_states),
                                indexes)  # (batch_size, )

        target_q = tf.where(dones, rewards,
                            rewards + self.gamma * target_q)  # (batch_size, )
        # tf.stop_gradient() keeps the target Q values out of backpropagation
        # (gradients are only applied to self.model here, so its effect is limited)
        td_errors = tf.abs(current_q - tf.stop_gradient(target_q))
        return td_errors

    def select_action(self, state):
        self.target_network_counter += 1
        if self.target_network_counter % self.fixed_q_value_steps == 0:
            self.target_model.set_weights(self.model.get_weights())
        self.epsilon = max(self.epsilon - self.linear_annealed,
                           self.min_epsilon)
        if np.random.sample() <= self.epsilon:
            return np.random.randint(action_size)
        return self._get_action_body(state).numpy()

    @tf.function
    def _get_action_body(self, state):
        state = tf.expand_dims(state, axis=0)
        qvalues = self.model(state)[0]
        return tf.argmax(qvalues)
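state_size, action_size and memory_size are module-level globals in the original script and are not shown here. A hypothetical driver loop for the agent above (all names and values below are assumptions, using the classic 4-tuple gym step API):

import gym

env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
memory_size = 2 ** 14

agent = DQNAgent()
state = env.reset()
for _ in range(50_000):
    action = agent.select_action(state)
    next_state, reward, done, _ = env.step(action)
    agent.store((state, action, reward, next_state, done))
    agent.train()
    state = env.reset() if done else next_state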
Example #15
File: dqn-lap.py, Project: GTrunSec/cpprb
    if np.random.rand() < egreedy:
        action = env.action_space.sample()
    else:
        Q = tf.squeeze(model(observation.reshape(1,-1)))
        action = np.argmax(Q)

    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
           act=action,
           rew=reward,
           next_obs=next_observation,
           done=done)
    observation = next_observation

    sample = rb.sample(batch_size,beta=0.0)

    with tf.GradientTape() as tape:
        tape.watch(model.trainable_weights)
        Q =  Q_func(model,
                    tf.constant(sample["obs"]),
                    tf.constant(sample["act"].ravel()),
                    tf.constant(env.action_space.n))
        target_Q = target_func(model,target_model,
                               tf.constant(sample['next_obs']),
                               tf.constant(sample["rew"].ravel()),
                               tf.constant(sample["done"].ravel()),
                               discount,
                               tf.constant(env.action_space.n))
        absTD = tf.math.abs(target_Q - Q)
        loss = tf.reduce_mean(loss_func(absTD))
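The excerpt ends inside the GradientTape block. The closing steps of such an update would typically apply the gradients and feed the absolute TD errors back as priorities; the lines below are a plausible continuation written as an assumption in the style of the other cpprb DQN examples, not the actual contents of dqn-lap.py (the optimizer name and the exact priority transform are guesses):

    grad = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grad, model.trainable_weights))
    rb.update_priorities(sample["indexes"], absTD.numpy().ravel() + 1e-6)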
Example #16
class ReplayBuffer():
    def __init__(self, args):

        #self.memory = deque(maxlen=args.buffer_size)
        self.memory = PrioritizedReplayBuffer(
            args.buffer_size, {
                "obs": {
                    "shape": (64, 64, 6)
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (64, 64, 6)
                },
                "terminal": {}
            })
        #self.priority = deque(maxlen=args.buffer_size)
        self.length = 0
        self.args = args

    def load_queues(self, queues, q_network, target_network, lock, args):
        for q in queues:

            for i in range(int(q.qsize())):

                # Read from the queue
                # The critical section begins
                lock.acquire()
                data = queue_to_data(q.get())
                lock.release()

                # Convert to numpy for storage
                state = data[0].numpy()
                action = data[1].numpy()
                reward = data[2].numpy()
                next_state = data[3].numpy()
                terminal = data[4].numpy()

                #data_np = (state,action,reward,next_state,terminal)

                # Push to the buffer
                #self.memory.append(data_np)

                self.memory.add(obs=state,
                                act=action,
                                rew=reward,
                                next_obs=next_state,
                                terminal=terminal)

                self.length = min(self.args.buffer_size, self.length + 1)

    def prepare_batch(self, target_network, q_network):

        batch_size = min(self.length, self.args.batch_size)

        sample = self.memory.sample(batch_size)

        s = t.tensor(sample['obs'])
        a = t.tensor(sample['act'])
        r = t.tensor(sample['rew'])
        ns = t.tensor(sample['next_obs'])
        term = t.tensor(sample['terminal'])

        states = s.permute(0, 3, 1, 2).to(Device.get_device())
        actions = a.type(t.int64).to(Device.get_device())
        rewards = r.to(Device.get_device())
        next_states = ns.permute(0, 3, 1, 2).to(Device.get_device())
        terminals = term.to(Device.get_device())

        indexes = sample["indexes"]

        with t.no_grad():

            # Standard DQN target: mask out bootstrapping on terminal transitions
            # and take the per-sample max over next-state action values
            target = rewards + (1 - terminals) * self.args.gamma * target_network(
                next_states).max(dim=1, keepdim=True)[0]
            predicted = q_network(states).gather(1, actions)

        new_priorities = f.smooth_l1_loss(predicted, target,
                                          reduction='none').cpu().numpy()
        new_priorities[new_priorities < 1] = 1

        self.memory.update_priorities(indexes, new_priorities)

        return states, actions, rewards, next_states, terminals

    def __len__(self):

        return self.length