Example #1
    def __init__(self,
                 lr,
                 state_shape,
                 num_actions,
                 batch_size,
                 max_mem_size=100000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)
        self.importance_exp = Lerper(start=0.4, end=1.0, num_steps=100000)

        self.priority_exp = 0.6
        self.memory = PrioritizedReplayBuffer(max_mem_size, {
            "obs": {
                "shape": state_shape
            },
            "act": {
                "shape": 1
            },
            "rew": {},
            "next_obs": {
                "shape": state_shape
            },
            "done": {
                "shape": 1
            }
        },
                                              alpha=self.priority_exp)

        self.net = Network(lr, state_shape, num_actions)
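
Example #1 shows only the constructor. For context, the other half of cpprb's PER cycle, sampling with an importance-sampling exponent and writing updated priorities back, looks roughly like the minimal sketch below; the buffer layout mirrors the example, and the TD errors are placeholders rather than part of the original code.

import numpy as np
from cpprb import PrioritizedReplayBuffer

buf = PrioritizedReplayBuffer(1000,
                              {"obs": {"shape": (4,)},
                               "act": {"shape": 1},
                               "rew": {},
                               "next_obs": {"shape": (4,)},
                               "done": {"shape": 1}},
                              alpha=0.6)
buf.add(obs=np.zeros(4), act=0, rew=1.0, next_obs=np.zeros(4), done=0)

batch = buf.sample(32, beta=0.4)      # also returns "weights" and "indexes"
td_error = np.random.rand(32)         # placeholder for |TD error|
buf.update_priorities(batch["indexes"], td_error + 1e-6)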
Example #2
def get_replay_buffer(policy,
                      env,
                      use_prioritized_rb=False,
                      use_nstep_rb=False,
                      n_step=1,
                      size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }
        return PrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }
        return ReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
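
get_default_rb_dict is not shown in this snippet. Judging from how kwargs is used above (a "size" entry plus an "env_dict" mapping that is eventually unpacked into cpprb's buffers), it presumably returns something shaped like the hypothetical sketch below; this is an inference from usage, not the project's actual implementation (get_space_size and np come from the same module as the snippet).

def get_default_rb_dict_sketch(size, env):
    # Hypothetical stand-in, inferred from how kwargs is consumed above
    return {
        "size": size,
        "default_dtype": np.float32,  # assumption
        "env_dict": {
            "obs": {"shape": get_space_size(env.observation_space)},
            "next_obs": {"shape": get_space_size(env.observation_space)},
            "act": {"shape": get_space_size(env.action_space)},
            "rew": {},
            "done": {},
        },
    }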
Example #3
    def test_per_nstep(self):
        """
        PrioritizedReplayBuffer.on_episode_end() ignores Exception

        Ref: https://gitlab.com/ymd_h/cpprb/-/issues/111
        """

        rb = PrioritizedReplayBuffer(32, {
            "rew": {},
            "done": {}
        },
                                     Nstep={
                                         "size": 4,
                                         "rew": "rew",
                                         "gamma": 0.5
                                     })

        for _ in range(10):
            rb.add(rew=0.5, done=0.0)

        rb.add(rew=0.5, done=1.0)
        rb.on_episode_end()

        s = rb.sample(16)

        self.assertIn("discounts", s)
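
The "discounts" entry asserted above appears because the Nstep configuration includes "gamma": each sampled transition carries its own accumulated discount factor (gamma raised to the actual number of steps). A minimal sketch of consuming it in an n-step TD target follows; the Q values are placeholders, not part of the test.

import numpy as np
from cpprb import PrioritizedReplayBuffer

rb = PrioritizedReplayBuffer(32, {"rew": {}, "done": {}},
                             Nstep={"size": 4, "rew": "rew", "gamma": 0.5})
for _ in range(10):
    rb.add(rew=0.5, done=0.0)
rb.add(rew=0.5, done=1.0)
rb.on_episode_end()

s = rb.sample(16)
q_next = np.zeros(16)  # placeholder for max_a Q(next_obs, a)
target = (s["rew"].ravel()
          + s["discounts"].ravel() * (1.0 - s["done"].ravel()) * q_next)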
Example #4
    def test_save_cache_with_stack_compress(self):
        rb = PrioritizedReplayBuffer(32,
                                     env_dict={
                                         'done': {
                                             'dtype': 'bool'
                                         },
                                         'a': {
                                             'shape': (3)
                                         }
                                     },
                                     stack_compress='a')

        a = np.array([0, 1, 2])
        for i in range(3):
            done = i == 2
            rb.add(a=a, done=done)
            if done:
                rb.on_episode_end()
            a += 1
        rb.add(a=np.ones(3), done=False)

        a_ = rb.get_all_transitions()["a"]

        np.testing.assert_allclose(
            a_,
            np.asarray([[0., 1., 2.], [1., 2., 3.], [2., 3., 4.], [1., 1., 1.]]))
Example #5
    def test_mp_update_priority(self):
        buffer_size = 256
        add_size = 200

        rb = PrioritizedReplayBuffer(buffer_size,{"obs": {"dtype": int}})

        self.assertEqual(rb.get_next_index(),0)
        self.assertEqual(rb.get_stored_size(),0)

        p = Process(target=add_args,args=[rb,
                                          [{"obs": i, "priorities": 0}
                                           for i in range(add_size)]])
        p.start()
        p.join()

        self.assertEqual(rb.get_next_index(),add_size % buffer_size)
        self.assertEqual(rb.get_stored_size(),min(add_size,buffer_size))

        s = rb.sample(1,beta=1.0)
        one_hot = s["indexes"][0]

        rb.update_priorities([one_hot],[1e+8])

        self.assertEqual(rb.get_next_index(),add_size % buffer_size)
        self.assertEqual(rb.get_stored_size(),min(add_size,buffer_size))


        s = rb.sample(100,beta=1.0)

        self.assertTrue((s["obs"] >= 0).all())
        self.assertTrue((s["obs"] < add_size).all())

        u, counts = np.unique(s["obs"],return_counts=True)
        self.assertEqual(u[counts.argmax()],one_hot)
Example #6
def get_replay_buffer(policy,
                      env,
                      use_prioritized_rb=False,
                      use_nstep_rb=False,
                      n_step=1,
                      size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs['size'] = size

    # TODO(sff1019): Add on-policy behaviour
    # TODO(sff1019): Add N-step prioritized

    if len(obs_shape) == 3:
        kwargs['env_dict']['obs']['dtype'] = np.ubyte
        kwargs['env_dict']['next_obs']['dtype'] = np.ubyte

    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
Example #7
    def __init__(self, args):

        #self.memory = deque(maxlen=args.buffer_size)
        self.memory = PrioritizedReplayBuffer(
            args.buffer_size, {
                "obs": {
                    "shape": (64, 64, 6)
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (64, 64, 6)
                },
                "terminal": {}
            })
        #self.priority = deque(maxlen=args.buffer_size)
        self.length = 0
        self.args = args
Example #8
 def __init__(self, args, capacity, env):
     # Initial importance sampling weight β, annealed to 1 over course of training
     self.priority_weight = args.priority_weight
     self.n = args.multi_step
     self.device = args.device
     if args.mmap:
         os.makedirs('memories/', exist_ok=True)
         mmap_prefix = 'memories/mm'
     else:
         mmap_prefix = None
     self.buffer = PrioritizedReplayBuffer(
         capacity,
         {
             "obs": {
                 "shape": env.observation_space.shape,
                 "dtype": env.observation_space.dtype
             },
             "next_obs": {
                 "shape": env.observation_space.shape,
                 "dtype": env.observation_space.dtype
             },
             "act": {
                 "shape": 1,
                 "dtype": env.action_space.dtype
             },
             "rew": {
                 "dtype": np.float32
             },
             "done": {
                 "dtype": np.uint8
             },
         },
         Nstep={
             "size": self.n,
             "gamma": args.discount,
             "rew": "rew",
             "next": "next_obs",
         },
         mmap_prefix=mmap_prefix,
         alpha=args.priority_exponent,
         # next_of="obs",
         # stack_compress="obs",
     )
Example #9
    def test_PrioritizedReplayBuffer_with_single_step_with_priorities(self):
        buffer_size = 256
        obs_shape = (3, 4)
        batch_size = 10

        rb = PrioritizedReplayBuffer(buffer_size,
                                     {"obs": {
                                         "shape": obs_shape
                                     }})

        v = {"obs": np.ones(shape=obs_shape), "priorities": 0.5}

        rb.add(**v)

        rb.sample(batch_size)

        for _ in range(100):
            rb.add(**v)

        rb.sample(batch_size)
Example #10
    def test_PrioritizedReplayBuffer_with_multiple_steps(self):
        buffer_size = 256
        obs_shape = (3, 4)
        step_size = 32
        batch_size = 10

        rb = PrioritizedReplayBuffer(buffer_size,
                                     {"obs": {
                                         "shape": obs_shape
                                     }})

        v = {"obs": np.ones(shape=(step_size, *obs_shape))}

        rb.add(**v)

        rb.sample(batch_size)

        for _ in range(100):
            rb.add(**v)

        rb.sample(batch_size)
Example #11
    def test_cpdef_super(self):
        buffer_size = 256
        obs_dim = 15
        act_dim = 3

        prb = PrioritizedReplayBuffer(
            buffer_size, {
                "obs": {
                    "shape": obs_dim
                },
                "act": {
                    "shape": act_dim
                },
                "rew": {},
                "next_obs": {
                    "shape": obs_dim
                },
                "done": {}
            })

        prb.clear()
Example #12
File: v8.py Project: ymd-h/cpprb
 def test_prioritized_nstep(self):
     rb = PrioritizedReplayBuffer(32, {
         "obs": {
             "shape": (16, 16)
         },
         'rew': {},
         'done': {}
     },
                                  next_of="obs",
                                  stack_compress="obs",
                                  Nstep={
                                      "size": 4,
                                      "rew": "rew"
                                  })
     self.assertIs(
         rb.add(obs=(np.ones((16, 16))),
                next_obs=(np.ones((16, 16))),
                rew=1,
                done=0), None)
     self.assertIs(
         rb.add(obs=(np.ones((16, 16))),
                next_obs=(np.ones((16, 16))),
                rew=1,
                done=0), None)
     self.assertIs(
         rb.add(obs=(np.ones((16, 16))),
                next_obs=(np.ones((16, 16))),
                rew=1,
                done=0), None)
     self.assertEqual(
         rb.add(obs=(np.ones((16, 16))),
                next_obs=(np.ones((16, 16))),
                rew=1,
                done=0), 0)
Example #13
    def test_buffer_size(self):
        buffer_size = 1000
        obs_dim = 3
        act_dim = 1

        rb = ReplayBuffer(
            buffer_size, {
                "obs": {
                    "shape": obs_dim
                },
                "act": {
                    "shape": act_dim
                },
                "rew": {},
                "next_obs": {
                    "shape": obs_dim
                },
                "done": {}
            })
        prb = PrioritizedReplayBuffer(
            buffer_size, {
                "obs": {
                    "shape": obs_dim
                },
                "act": {
                    "shape": act_dim
                },
                "rew": {},
                "next_obs": {
                    "shape": obs_dim
                },
                "done": {}
            })

        self.assertEqual(1000, rb.get_buffer_size())
        self.assertEqual(1000, prb.get_buffer_size())

        rb._encode_sample([i for i in range(1000)])
Example #14
    def test_multi_processing(self):
        buffer_size = 256

        rb = PrioritizedReplayBuffer(buffer_size,{"obs": {"dtype": int}})

        self.assertEqual(rb.get_next_index(),0)
        self.assertEqual(rb.get_stored_size(),0)

        p = Process(target=add_args,args=[rb,
                                          [{"obs": i, "priority": 0.5}
                                           for i in range(10)]])
        p.start()
        p.join()

        self.assertEqual(rb.get_next_index(),10)
        self.assertEqual(rb.get_stored_size(),10)

        s = rb.get_all_transitions()
        np.testing.assert_allclose(s["obs"].ravel(),np.arange(10,dtype=int))
Example #15
File: issue.py Project: ymd-h/cpprb
    def test_read_only_priority(self):
        buffer_size = 100
        batch_size = 32

        env_dict = {"done": {}}

        done = np.zeros(2)
        ps = np.ones_like(done)
        ps.setflags(write=False)

        rb = PrioritizedReplayBuffer(buffer_size, env_dict)
        rb.add(done=done, priority=ps)

        sample = rb.sample(batch_size)
        ps2 = sample["weights"]
        ps2.setflags(write=False)

        rb.update_priorities(sample["indexes"], ps2)
Example #16
def get_replay_buffer(obs_shape,
                      action_dim,
                      buffer_size=int(1e6),
                      use_prioritized=False):
    env_dict = {
        "obs": {
            "shape": obs_shape
        },
        "act": {
            "shape": action_dim
        },
        "rew": {},
        "next_obs": {
            "shape": obs_shape
        },
        "done": {}
    }
    rb = PrioritizedReplayBuffer(buffer_size, env_dict) if use_prioritized \
        else ReplayBuffer(buffer_size, env_dict)

    return rb
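
A short usage sketch of the factory above; the shapes and the dummy TD errors are made up for illustration.

import numpy as np

rb = get_replay_buffer(obs_shape=(4,), action_dim=2, use_prioritized=True)
rb.add(obs=np.zeros(4), act=np.zeros(2), rew=1.0, next_obs=np.zeros(4), done=0)

batch = rb.sample(32, beta=0.4)
rb.update_priorities(batch["indexes"], np.random.rand(32) + 1e-6)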
Example #17
 def test_per_train(self):
     """
     Run train function with PER
     """
     rb = PrioritizedReplayBuffer(
         32, {
             "obs": {
                 "shape": (3, )
             },
             "act": {},
             "rew": {},
             "next_obs": {
                 "shape": (3, )
             },
             "done": {}
         })
     train(rb,
           self.env,
           lambda obs, step, episode, is_warmup: 1.0,
           lambda kwargs, step, episode: 0.5,
           max_steps=10)
Example #18
    def test_sample(self):
        buffer_size = 500
        obs_shape = (84,84,3)
        act_dim = 4

        rb = PrioritizedReplayBuffer(buffer_size,{"obs": {"shape": obs_shape},
                                                  "act": {"shape": act_dim},
                                                  "rew": {},
                                                  "done": {}})

        obs = np.zeros(obs_shape)
        act = np.ones(act_dim)
        rew = 1
        done = 0

        rb.add(obs=obs,act=act,rew=rew,done=done)

        ps = 1.5

        rb.add(obs=obs,act=act,rew=rew,done=done,priorities=ps)

        self.assertAlmostEqual(rb.get_max_priority(),1.5)

        obs = np.stack((obs,obs))
        act = np.stack((act,act))
        rew = (1,0)
        done = (0.0,1.0)

        rb.add(obs=obs,act=act,rew=rew,done=done)

        ps = (0.2,0.4)
        rb.add(obs=obs,act=act,rew=rew,done=done,priorities=ps)

        sample = rb.sample(64)

        w = sample["weights"]
        i = sample["indexes"]

        rb.update_priorities(i,w*w)
Example #19
    def test_add(self):
        buffer_size = 500
        obs_shape = (84,84,3)
        act_dim = 10

        rb = PrioritizedReplayBuffer(buffer_size,{"obs": {"shape": obs_shape},
                                                  "act": {"shape": act_dim},
                                                  "rew": {},
                                                  "done": {}})

        obs = np.zeros(obs_shape)
        act = np.ones(act_dim)
        rew = 1
        done = 0

        rb.add(obs=obs,act=act,rew=rew,done=done)

        ps = 1.5

        rb.add(obs=obs,act=act,rew=rew,done=done,priorities=ps)

        self.assertAlmostEqual(rb.get_max_priority(),1.5)

        obs = np.stack((obs,obs))
        act = np.stack((act,act))
        rew = (1,0)
        done = (0.0,1.0)

        rb.add(obs=obs,act=act,rew=rew,done=done)

        ps = (0.2,0.4)
        rb.add(obs=obs,act=act,rew=rew,done=done,priorities=ps)


        rb.clear()
        self.assertEqual(rb.get_next_index(),0)
        self.assertEqual(rb.get_stored_size(),0)
Example #20
def get_replay_buffer(policy, env, use_prioritized_rb, use_nstep_rb, n_step):
    if policy is None or env is None:
        return None

    kwargs = {
        "obs_shape": get_space_size(env.observation_space),
        "act_dim": get_space_size(env.action_space),
        "size": policy.update_interval
    }

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        return ReplayBuffer(**kwargs)

    # off-policy policy
    kwargs["size"] = policy.memory_capacity

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepPrioritizedReplayBuffer(**kwargs)

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepReplayBuffer(**kwargs)

    if isinstance(kwargs["act_dim"], tuple):
        kwargs["act_dim"] = kwargs["act_dim"][0]
    return ReplayBuffer(**kwargs)
Example #21
    def test_per_without_TD(self):
        """
        Run train function with PER without TD

        Raises TypeError
        """
        rb = PrioritizedReplayBuffer(
            32, {
                "obs": {
                    "shape": (3, )
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (3, )
                },
                "done": {}
            })
        with self.assertRaises(TypeError):
            train(rb,
                  self.env,
                  lambda obs, step, episode, is_warmup: 1.0,
                  lambda kwargs, step, episode: None,
                  max_steps=10)
Example #22
class RainbowAgent:
    """Agent interacting with environment.
    
    Attributes:
        env (gym.Env): OpenAI Gym environment
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including 
                           state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """

    def __init__(
        self, 
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10, 
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",

    ):
        """Initialization.
        
        Args:
            env (gym.Env): OpenAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        
        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        # NoisyNet: All attributes related to epsilon are removed

        # produces a unique timestamp for each run (day_month_hour_minute_second)
        run_timestamp = str(
            # day and month
            str(time.localtime(time.time())[2]) + "_" +
            str(time.localtime(time.time())[1]) + "_" +
            # hour, minute and second
            str(time.localtime(time.time())[3]) + "_" +
            str(time.localtime(time.time())[4]) + "_" +
            str(time.localtime(time.time())[5])
        )

        #Will write scalars that can be visualized using tensorboard in the directory "runLogs/timestamp"
        self.writer = SummaryWriter("runLogs/" + run_timestamp)


        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        print(self.device)
        
        # PER
        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha    
        )
        
        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )
            
        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(
            self.v_min, self.v_max, self.atom_size
        ).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()
        
        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(),0.0001)

        # transition to store in memory
        self.transition = list()
        
        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        # self.tensorboard = RainbowTensorBoard(
        #     log_dir="single_joint_logs/{}-{}".format(
        #         model_name,
        #         datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
        #     )
        # )
        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score 
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p


    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(
            torch.FloatTensor(state).to(self.device)
        ).argmax()
        selected_action = selected_action.detach().cpu().numpy()
        
        if not self.is_test:

            self.transition = [state, selected_action]
        

        return selected_action


    def step(self, action: np.ndarray, score:int) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action,score)

        if not self.is_test:
            self.transition += [reward, next_state, done]
            
            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], self.transition)
                    )
                )
                one_step_transition = [v[idx] for _, v in self.memory_n.get_all_transitions().items()] if idx is not None else None

            # 1-step transition
            else:
                one_step_transition = self.transition

            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition)
                    )
                )
    
        return next_state, reward, done


    def update_model(self, frame_idx: int) -> torch.Tensor:
        """Update the model by gradient descent.
        shape of elementwise_loss = [128, 51]
        shape of loss = [] (scalar)
        shape of weights = [128, 1]
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = torch.FloatTensor(
            samples["weights"].reshape(-1, 1)
        ).to(self.device)
        indices = samples["indexes"]
        #rospy.loginfo(samples.keys())
        #rospy.loginfo(weights.shape)
        #rospy.loginfo(indices.shape())

        #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time())))
        
        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)
        
        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)
        
        self.writer.add_scalar('update_model/Lossv0', loss.detach().item(),frame_idx )
        
        # N-step Learning loss
        # we are gonna combine 1-step loss and n-step loss so as to
        # prevent high-variance. The original rainbow employs n-step loss only.
        if self.use_n_step:
            gamma = self.gamma ** self.n_step
            samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()}
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss
            
            #rospy.loginfo(elementwise_loss_n_loss.shape)
            #rospy.loginfo(elementwise_loss.shape)

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        
        rospy.loginfo(
            f"{elementwise_loss}"
            )
        self.optimizer.zero_grad()
        self.writer.add_scalar('update_model/Lossv1', loss.detach().item(),frame_idx )
        #From pytorch doc: backward() Computes the gradient of current tensor w.r.t. graph leaves.
        #self.writer.add_image("loss gradient before", loss, frame_idx)
        loss.backward()
        #self.writer.add_image("loss gradient after", loss, frame_idx)
        self.writer.add_scalar('update_model/Lossv2', loss.detach().item(),frame_idx )
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()
        
        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)
        
        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()
        
        #rospy.loginfo("second")
        #rospy.loginfo(loss.shape)

        #rospy.loginfo("loss dimension = " + loss.ndim()  )   
        #rospy.loginfo("loss = " + str(loss.detach().item()) + "type = " + str(type(loss.detach().item())  )   )   
        self.writer.add_scalar('update_model/Loss', loss.detach().item(),frame_idx )
        return loss.detach().item()


    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False
        
        state = self.env.reset()
        update_cnt = 0
        losses = []
        scores = []
        score = 0

        for frame_idx in tqdm(range(1, num_frames + 1)):

            action = self.select_action(state)
            next_state, reward, done = self.step(action,score)

            state = next_state
            score += reward
            
            # NoisyNet: removed decrease of epsilon
            
            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if episode ends
            if done:
                #rospy.loginfo("logging for done")
                self.writer.add_scalar('train/score', score, frame_idx)
                self.writer.add_scalar('train/final_epsilon', state[6], frame_idx)
                self.writer.add_scalar('train/epsilon_p', state[7], frame_idx)
                state = self.env.reset()
                scores.append(score)
                score = 0

            # if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                #frame_id given as argument for logging by self.writer. 
                #rospy.loginfo("frame_idx= " + str(frame_idx) + "type = " + str(type(frame_idx)))
                loss = self.update_model(frame_idx)

                losses.append(loss)
                update_cnt += 1
                
                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update(loss)

        self.env.close()


    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True
        
        state = self.env.reset()
        done = False
        score = 0
        
        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward
        
        print("score: ", score)
        self.env.close()
        
        return frames


    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> torch.Tensor:
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["act"]).to(device)
        reward = torch.FloatTensor(np.array(samples["rew"]).reshape(-1, 1)).to(device)
        done = torch.FloatTensor(np.array(samples["done"]).reshape(-1, 1)).to(device)
        
        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (
                torch.linspace(
                    0, (self.batch_size - 1) * self.atom_size, self.batch_size
                ).long()
                .unsqueeze(1)
                .expand(self.batch_size, self.atom_size)
                .to(self.device)
            )

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(
                0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)
            )
            proj_dist.view(-1).index_add_(
                0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)
            )
            print(f"Next Action : {next_action}\n Next Dist : {next_dist}\n")

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)
        print(f"Proj Dist : {proj_dist}\n Dist : {dist}\n Log_p : {log_p}\n")
        if torch.isnan(elementwise_loss[0][0]):
            exit()

        return elementwise_loss


    def _target_hard_update(self,loss):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time())))

        torch.save({
            'model_state_dict': self.dqn.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
            }, str("checkpoints/checkpoint_"+str(time.time())))
Example #23
nstep = 3
# nstep = False

if nstep:
    Nstep = {"size": nstep, "rew": "rew", "next": "next_obs"}
    discount = tf.constant(gamma**nstep)
else:
    Nstep = None
    discount = tf.constant(gamma)

# Prioritized Experience Replay: https://arxiv.org/abs/1511.05952
# See https://ymd_h.gitlab.io/cpprb/features/per/
prioritized = True

if prioritized:
    rb = PrioritizedReplayBuffer(buffer_size, env_dict, Nstep=Nstep)

    # Beta linear annealing
    beta = 0.4
    beta_step = (1 - beta) / N_iteration
else:
    rb = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)


@tf.function
def Q_func(model, obs, act, act_shape):
    return tf.reduce_sum(model(obs) * tf.one_hot(act, depth=act_shape), axis=1)


@tf.function
def DQN_target_func(model, target, next_obs, rew, done, gamma, act_shape):
Example #24
File: benchmark.py Project: GTrunSec/cpprb
    },
    "act": {
        "shape": act_shape
    },
    "next_obs": {
        "shape": obs_shape
    },
    "rew": {},
    "done": {}
}

# Initialize Replay Buffer
rb = RB(buffer_size, env_dict)

# Initialize Prioritized Replay Buffer
prb = PRB(buffer_size, env_dict, alpha=alpha)

# Initalize Reverb Server
server = reverb.Server(tables=[
    reverb.Table(name='ReplayBuffer',
                 sampler=reverb.selectors.Uniform(),
                 remover=reverb.selectors.Fifo(),
                 max_size=buffer_size,
                 rate_limiter=reverb.rate_limiters.MinSize(1)),
    reverb.Table(name='PrioritizedReplayBuffer',
                 sampler=reverb.selectors.Prioritized(alpha),
                 remover=reverb.selectors.Fifo(),
                 max_size=buffer_size,
                 rate_limiter=reverb.rate_limiters.MinSize(1))
])
Example #25
    eps_tracker = ptan.actions.EpsilonTracker(
        selector, params.eps_start, params.eps_final, params.eps_frames*args.envs)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, params.gamma, steps_count=args.steps)

    env_dict = {'state': {'shape': shape, 'dtype': np.uint8},
                'action': {'dtype': np.int8},
                'reward': {},
                'last_state': {'shape': shape, 'dtype': np.uint8},
                'done': {'dtype': bool}
                }

    step = (TGT_BETA - BETA)/END_BETA_FRAME

    buffer = PrioritizedReplayBuffer(params.buffer_size, env_dict) if args.priority else \
        ReplayBuffer(params.buffer_size, env_dict=env_dict)

    folder, sub_folder, log_dir = utils.writerDir(envs[0], args.steps)
    comment = "".join(
        [envs[0].game, '_', str(args.steps), '_', str(args.envs)])
    writer = SummaryWriter(comment=comment)

    optimizer = torch.optim.Adam(net.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.75, patience=20000,
                                                           cooldown=20000, verbose=True, min_lr=params.min_lr)
    mean = None
    best_reward = -float('inf')
    st = datetime.now()

    print(net)
Example #26
class ReplayMemory():
    def __init__(self, args, capacity, env):
        # Initial importance sampling weight β, annealed to 1 over course of training
        self.priority_weight = args.priority_weight
        self.n = args.multi_step
        self.device = args.device
        if args.mmap:
            os.makedirs('memories/', exist_ok=True)
            mmap_prefix = 'memories/mm'
        else:
            mmap_prefix = None
        self.buffer = PrioritizedReplayBuffer(
            capacity,
            {
                "obs": {
                    "shape": env.observation_space.shape,
                    "dtype": env.observation_space.dtype
                },
                "next_obs": {
                    "shape": env.observation_space.shape,
                    "dtype": env.observation_space.dtype
                },
                "act": {
                    "shape": 1,
                    "dtype": env.action_space.dtype
                },
                "rew": {
                    "dtype": np.float32
                },
                "done": {
                    "dtype": np.uint8
                },
            },
            Nstep={
                "size": self.n,
                "gamma": args.discount,
                "rew": "rew",
                "next": "next_obs",
            },
            mmap_prefix=mmap_prefix,
            alpha=args.priority_exponent,
            # next_of="obs",
            # stack_compress="obs",
        )

    def append(self, state, next_state, action, reward, done):
        self.buffer.add(
            **{
                "obs": state,
                "next_obs": next_state,
                "act": action,
                "rew": reward,
                "done": done,
            })

    def sample(self, size):
        s = self.buffer.sample(size, self.priority_weight)
        s['indexes'] = s['indexes'].astype(np.int32)
        return torchify((s['indexes'], torch.int32), (s['obs'], torch.float32),
                        (np.squeeze(s['act'], 1), torch.long),
                        (np.squeeze(s['rew'], 1), torch.float32),
                        (s['next_obs'], torch.float32),
                        (s['done'], torch.bool), (s['weights'], torch.float32),
                        device=self.device)

    def update_priorities(self, indexes, new_priorities):
        indexes = indexes.cpu().numpy()
        self.buffer.update_priorities(indexes, new_priorities)
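
A minimal construction-and-append sketch for the ReplayMemory above. The args fields and the CartPole environment are assumptions for illustration (classic Gym API), and torchify used in sample() is defined elsewhere in the original project.

from types import SimpleNamespace
import gym

args = SimpleNamespace(priority_weight=0.4, multi_step=3, device="cpu",
                       mmap=False, discount=0.99, priority_exponent=0.5)
env = gym.make("CartPole-v1")

memory = ReplayMemory(args, capacity=10000, env=env)

obs = env.reset()
action = env.action_space.sample()
next_obs, rew, done, _ = env.step(action)
memory.append(obs, next_obs, action, rew, done)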
Example #27
    def __init__(
        self, 
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10, 
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",

    ):
        """Initialization.
        
        Args:
            env (gym.Env): OpenAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        
        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        # NoisyNet: All attributes related to epsilon are removed

        # produces a unique timestamp for each run (day_month_hour_minute_second)
        run_timestamp = str(
            # day and month
            str(time.localtime(time.time())[2]) + "_" +
            str(time.localtime(time.time())[1]) + "_" +
            # hour, minute and second
            str(time.localtime(time.time())[3]) + "_" +
            str(time.localtime(time.time())[4]) + "_" +
            str(time.localtime(time.time())[5])
        )

        #Will write scalars that can be visualized using tensorboard in the directory "runLogs/timestamp"
        self.writer = SummaryWriter("runLogs/" + run_timestamp)


        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        print(self.device)
        
        # PER
        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha    
        )
        
        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )
            
        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(
            self.v_min, self.v_max, self.atom_size
        ).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()
        
        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(),0.0001)

        # transition to store in memory
        self.transition = list()
        
        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        # self.tensorboard = RainbowTensorBoard(
        #     log_dir="single_joint_logs/{}-{}".format(
        #         model_name,
        #         datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
        #     )
        # )
        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score 
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p
Example #28
class Agent:
    def __init__(self,
                 lr,
                 state_shape,
                 num_actions,
                 batch_size,
                 max_mem_size=100000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)
        self.importance_exp = Lerper(start=0.4, end=1.0, num_steps=100000)

        self.priority_exp = 0.6
        self.memory = PrioritizedReplayBuffer(max_mem_size, {
            "obs": {
                "shape": state_shape
            },
            "act": {
                "shape": 1
            },
            "rew": {},
            "next_obs": {
                "shape": state_shape
            },
            "done": {
                "shape": 1
            }
        },
                                              alpha=self.priority_exp)

        self.net = Network(lr, state_shape, num_actions)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.net.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state,
                        act=action,
                        rew=reward,
                        next_obs=next_state,
                        done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size,
                                   self.importance_exp.value())

        states = torch.tensor(batch["obs"]).to(self.net.device)
        actions = torch.tensor(batch["act"],
                               dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.net.device).T[0]
        states_ = torch.tensor(batch["next_obs"]).to(self.net.device)
        dones = torch.tensor(batch["done"],
                             dtype=torch.bool).to(self.net.device).T[0]
        weights = torch.tensor(batch["weights"]).to(self.net.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = ((td**2.0) * weights).mean()
        loss.backward()
        self.net.optimizer.step()

        new_priorities = (td.abs()).detach().cpu()
        self.memory.update_priorities(batch["indexes"], new_priorities)

        self.epsilon.step()
        self.importance_exp.step()
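
A minimal training-loop sketch for the Agent above, assuming a classic Gym environment and the Network/Lerper helpers referenced in the example.

import gym

env = gym.make("CartPole-v1")
agent = Agent(lr=1e-3,
              state_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              batch_size=64)

for episode in range(10):
    obs = env.reset()
    done = False
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.store_memory(obs, action, reward, next_obs, done)
        agent.learn()
        obs = next_obs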
Example #29
    "done": {}
}

# Initialize Replay Buffer
brb = bRB(buffer_size)
rrb = rRB(buffer_size)
rrb._num_sampled = 0  # Fix: https://github.com/ray-project/ray/issues/14818

crb = cRB(buffer_size)
rb = RB(buffer_size, env_dict)

# Initialize Prioritized Replay Buffer
bprb = bPRB(buffer_size, alpha=alpha)
rprb = rPRB(buffer_size, alpha=alpha)
cprb = cPRB(buffer_size, alpha=alpha, beta0=beta, betasteps=None)
prb = PRB(buffer_size, env_dict, alpha=alpha)


# Helper Function
def env(n):
    e = {
        "obs": np.ones((n, obs_shape)),
        "act": np.zeros((n, act_shape)),
        "next_obs": np.ones((n, obs_shape)),
        "rew": np.zeros(n),
        "done": np.zeros(n)
    }
    return e


def add_b(_rb):
Example #30
    def test_raise_imcompatible_priority_shape(self):
        rb = PrioritizedReplayBuffer(32, env_dict={'a': {'shape': 1}})

        with self.assertRaises(ValueError):
            rb.add(a=np.ones(5), priorities=np.ones(3))
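
For contrast, a priorities array whose length matches the number of added transitions is accepted; a small standalone sketch:

import numpy as np
from cpprb import PrioritizedReplayBuffer

rb = PrioritizedReplayBuffer(32, env_dict={'a': {'shape': 1}})
rb.add(a=np.ones(5), priorities=np.ones(5))  # 5 transitions, 5 priorities: accepted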