def test_unsafe_next_of_already_filled(self):
    """
    Load unsafe next_of transitions with already filled buffer
    """
    buffer_size = 10
    env_dict = {"a": {}}

    rb1 = ReplayBuffer(buffer_size, env_dict, next_of="a")
    rb2 = ReplayBuffer(buffer_size, env_dict, next_of="a")
    rb3 = ReplayBuffer(buffer_size, env_dict, next_of="a")

    a = [1, 2, 3, 4, 5, 6]
    b = [7, 8, 9]

    rb1.add(a=a[:-1], next_a=a[1:])
    rb2.add(a=b[:-1], next_a=b[1:])
    rb3.add(a=b[:-1], next_a=b[1:])

    fname = "unsafe_next_of_already.npz"
    rb1.save_transitions(fname, safe=False)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    self.assertEqual(rb1.get_stored_size() + len(b) - 1, rb2.get_stored_size())
    self.assertEqual(rb1.get_stored_size() + len(b) - 1, rb3.get_stored_size())

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"][len(b) - 1:])
    np.testing.assert_allclose(t1["next_a"], t2["next_a"][len(b) - 1:])
    np.testing.assert_allclose(t1["a"], t3["a"][len(b) - 1:])
    np.testing.assert_allclose(t1["next_a"], t3["next_a"][len(b) - 1:])
def test_basic(self):
    """
    Basic Test Case

    Loaded buffer has the same transitions as the saved one.
    """
    buffer_size = 4
    env_dict = {"a": {}}

    rb1 = ReplayBuffer(buffer_size, env_dict)
    rb2 = ReplayBuffer(buffer_size, env_dict)
    rb3 = ReplayBuffer(buffer_size, env_dict)

    a = [1, 2, 3, 4]
    rb1.add(a=a)

    fname = "basic.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
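# A minimal, self-contained sketch of the save/load round-trip exercised by
# test_basic above. The cpprb calls (ReplayBuffer, add, save_transitions,
# load_transitions, get_all_transitions) are the ones used throughout these
# tests; the file name "roundtrip.npz" is just an illustrative choice.
import numpy as np
from cpprb import ReplayBuffer

rb_src = ReplayBuffer(4, {"a": {}})
rb_src.add(a=[1, 2, 3, 4])
rb_src.save_transitions("roundtrip.npz")   # persist transitions to disk

rb_dst = ReplayBuffer(4, {"a": {}})
rb_dst.load_transitions("roundtrip.npz")   # restore into a fresh buffer

# Both buffers now return identical transitions
assert np.allclose(rb_src.get_all_transitions()["a"],
                   rb_dst.get_all_transitions()["a"])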
def test_next_of(self):
    """
    Load next_of transitions with safe mode

    For safe mode, next_of is not necessary at the loaded buffer.
    """
    buffer_size = 10
    env_dict1 = {"a": {}}
    env_dict2 = {"a": {}, "next_a": {}}

    rb1 = ReplayBuffer(buffer_size, env_dict1, next_of="a")
    rb2 = ReplayBuffer(buffer_size, env_dict2)
    rb3 = ReplayBuffer(buffer_size, env_dict2)

    a = [1, 2, 3, 4, 5, 6]
    rb1.add(a=a[:-1], next_a=a[1:])

    fname = "next_of.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["next_a"], t2["next_a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
    np.testing.assert_allclose(t1["next_a"], t3["next_a"])
def test_stack_compress(self):
    """
    Load stack_compress transitions
    """
    buffer_size = 10
    env_dict = {"a": {"shape": 3}}

    rb1 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")
    rb2 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")
    rb3 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")

    a = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
    rb1.add(a=a)

    fname = "stack_compress.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
def test_load_Nstep(self):
    """
    Load Nstep transitions
    """
    buffer_size = 10
    env_dict = {"done": {}}
    Nstep = {"size": 3, "gamma": 0.99}

    rb1 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
    rb2 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
    rb3 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)

    d = [0, 0, 0, 0, 1]
    rb1.add(done=d)
    rb1.on_episode_end()

    fname = "Nstep.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["done"], t2["done"])
    np.testing.assert_allclose(t1["done"], t3["done"])
def test_stack_compress(self):
    bsize = 10
    odim = 2
    ssize = 2

    rb = ReplayBuffer(bsize, {"a": {"shape": (odim, ssize)}},
                      stack_compress="a")

    a = np.random.rand(odim, bsize + ssize - 1)

    for i in range(bsize):
        rb.add(a=a[:, i:i + ssize])

    _a = rb.get_all_transitions()["a"]
    for i in range(bsize):
        with self.subTest(i=i, label="without cache"):
            np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

    for i in range(bsize):
        rb._encode_sample([i])

    rb.clear()

    for i in range(bsize):
        rb.add(a=a[:, i:i + ssize])
    rb.on_episode_end()

    _a = rb.get_all_transitions()["a"]
    for i in range(bsize):
        with self.subTest(i=i, label="with cache"):
            np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

    for i in range(bsize):
        rb._encode_sample([i])
def test_incompatible_unsafe_stack_compress(self):
    """
    Load incompatible stack_compress transitions with unsafe mode
    """
    buffer_size = 10
    env_dict = {"a": {"shape": 3}}

    rb1 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")
    rb2 = ReplayBuffer(buffer_size, env_dict)
    rb3 = ReplayBuffer(buffer_size, env_dict)

    a = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
    rb1.add(a=a)

    fname = "incompatible_unsafe_stack_compress.npz"
    rb1.save_transitions(fname, safe=False)
    rb2.load_transitions(fname)
    rb3.load_transitions(fname)

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
def test_smaller_buffer(self):
    """
    Load to smaller buffer

    Loaded buffer only stores the last buffer_size transitions
    """
    buffer_size1 = 10
    buffer_size2 = 4
    env_dict = {"a": {}}

    rb1 = ReplayBuffer(buffer_size1, env_dict)
    rb2 = ReplayBuffer(buffer_size2, env_dict)
    rb3 = ReplayBuffer(buffer_size2, env_dict)

    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    rb1.add(a=a)

    fname = "smaller.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"][-buffer_size2:], t2["a"])
    np.testing.assert_allclose(t1["a"][-buffer_size2:], t3["a"])
def test_has_next_of(self):
    bsize = 10
    rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")

    a = np.random.rand(bsize + 1)

    for i in range(bsize):
        rb.add(a=a[i], next_a=a[i + 1])

    _next_a = np.ravel(rb.get_all_transitions()["next_a"])
    np.testing.assert_allclose(_next_a, a[1:bsize + 1])

    for i in range(bsize):
        rb._encode_sample([i])

    rb.clear()

    for i in range(bsize):
        rb.add(a=a[i], next_a=a[i + 1])
    rb.on_episode_end()

    _next_a = np.ravel(rb.get_all_transitions()["next_a"])
    np.testing.assert_allclose(_next_a, a[1:bsize + 1])

    for i in range(bsize):
        rb._encode_sample([i])
def test_unsafe_next_of_stack_compress(self):
    """
    Load next_of and stack_compress transitions with unsafe mode
    """
    buffer_size = 10
    env_dict = {"a": {"shape": 3}}

    rb1 = ReplayBuffer(buffer_size, env_dict,
                       next_of="a", stack_compress="a")
    rb2 = ReplayBuffer(buffer_size, env_dict,
                       next_of="a", stack_compress="a")
    rb3 = ReplayBuffer(buffer_size, env_dict,
                       next_of="a", stack_compress="a")

    a = [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8]]
    rb1.add(a=a[:-1], next_a=a[1:])

    fname = "unsafe_next_of_stack_compress.npz"
    rb1.save_transitions(fname, safe=False)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["next_a"], t2["next_a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
    np.testing.assert_allclose(t1["next_a"], t3["next_a"])
def test_load_to_filled_buffer(self):
    """
    Load to already filled buffer

    Loaded transitions are appended to the existing ones.
    """
    buffer_size1 = 10
    buffer_size2 = 10
    env_dict = {"a": {}}

    rb1 = ReplayBuffer(buffer_size1, env_dict)
    rb2 = ReplayBuffer(buffer_size2, env_dict)
    rb3 = ReplayBuffer(buffer_size2, env_dict)

    a = [1, 2, 3, 4]
    b = [5, 6]
    rb1.add(a=a)
    rb2.add(a=b)
    rb3.add(a=b)

    fname = "filled.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"][len(b):])
    np.testing.assert_allclose(t1["a"], t3["a"][len(b):])
def test_fulled_unsafe_next_of(self):
    """
    Load into an already full buffer
    """
    buffer_size = 10
    env_dict = {"a": {}}

    rb1 = ReplayBuffer(buffer_size, env_dict, next_of="a")
    rb2 = ReplayBuffer(buffer_size, env_dict, next_of="a")
    rb3 = ReplayBuffer(buffer_size, env_dict, next_of="a")

    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    rb1.add(a=a[:-1], next_a=a[1:])

    fname = "fulled_unsafe_next_of.npz"
    rb1.save_transitions(fname, safe=False)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["next_a"], t2["next_a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
    np.testing.assert_allclose(t1["next_a"], t3["next_a"])
def test_incompatible_unsafe_next_of(self):
    """
    Load incompatible next_of transitions with unsafe mode
    """
    buffer_size = 10
    env_dict1 = {"a": {}}
    env_dict2 = {"a": {}, "next_a": {}}

    rb1 = ReplayBuffer(buffer_size, env_dict1, next_of="a")
    rb2 = ReplayBuffer(buffer_size, env_dict2)
    rb3 = ReplayBuffer(buffer_size, env_dict2)

    a = [1, 2, 3, 4, 5, 6]
    rb1.add(a=a[:-1], next_a=a[1:])

    fname = "unsafe_incompatible_next_of.npz"
    rb1.save_transitions(fname, safe=False)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["a"], t2["a"])
    np.testing.assert_allclose(t1["next_a"], t2["next_a"])
    np.testing.assert_allclose(t1["a"], t3["a"])
    np.testing.assert_allclose(t1["next_a"], t3["next_a"])
def test_cache_next_of(self):
    stack_size = 3
    episode_len = 5

    # np.int64 instead of the removed np.int alias
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (stack_size,), "dtype": np.int64}},
                      next_of="obs",
                      stack_compress="obs")

    obs = np.arange(episode_len + stack_size + 2, dtype=np.int64)
    # [0, 1, ..., episode_len + stack_size + 1]
    obs2 = obs + 3 * episode_len
    # [3 * episode_len, ..., 4 * episode_len + stack_size + 1]

    # Add 1st episode
    for i in range(episode_len):
        rb.add(obs=obs[i:i + stack_size],
               next_obs=obs[i + 1:i + 1 + stack_size])

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len)
    for i in range(episode_len):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Reset environment
    rb.on_episode_end()
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len)
    for i in range(episode_len):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Add 2nd episode
    for i in range(episode_len):
        rb.add(obs=obs2[i:i + stack_size],
               next_obs=obs2[i + 1:i + 1 + stack_size])

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), 2 * episode_len)
    for i in range(episode_len):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])
    for i in range(episode_len):
        with self.subTest(i=i + episode_len):
            np.testing.assert_equal(s["obs"][i + episode_len],
                                    obs2[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i + episode_len],
                                    obs2[i + 1:i + 1 + stack_size])
def test_shuffle_transitions(self):
    rb = ReplayBuffer(64, {"a": {}})
    a = np.arange(64)
    rb.add(a=a)

    s1 = rb.get_all_transitions()["a"]
    s2 = rb.get_all_transitions(shuffle=True)["a"]

    self.assertFalse((s1 == s2).all())

    s = np.intersect1d(s1, s2, assume_unique=True)
    np.testing.assert_allclose(np.ravel(s), np.ravel(s1))
def explorer(global_rb, env_dict, is_training_done, queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size, env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")
    obs = env.reset()

    while not is_training_done.is_set():
        # Fetch the latest network weights if the learner published them
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs, act=action, rew=reward,
                     next_obs=next_obs, done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        # Flush the local buffer into the global one with initial priorities
        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample, priorities=absTD)
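# A hedged sketch of how the explorer above might be launched. The
# multiprocessing wiring (Event, Process, Queue) is standard library; the
# global buffer shown here is an ordinary PrioritizedReplayBuffer, which is
# NOT process-safe by itself - a real distributed setup would need cpprb's
# multiprocessing-aware buffers or explicit locking. MyModel and the learner
# loop are assumed to be defined elsewhere, as in the explorer above.
from multiprocessing import Event, Process, Queue

from cpprb import PrioritizedReplayBuffer

env_dict = {"obs": {"shape": 4}, "act": {}, "rew": {},
            "next_obs": {"shape": 4}, "done": {}}

global_rb = PrioritizedReplayBuffer(int(1e+6), env_dict)
is_training_done = Event()
queue = Queue()

p = Process(target=explorer,
            args=(global_rb, env_dict, is_training_done, queue))
p.start()
# ... learner samples from global_rb and periodically queue.put(weights) ...
is_training_done.set()
p.join()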
def test_with_one(self):
    buffer_size = 32
    obs_shape = 3
    act_shape = 4

    rb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape},
                                    "act": {"shape": act_shape},
                                    "done": {}})

    v = {"obs": np.ones(shape=obs_shape),
         "act": np.zeros(shape=act_shape),
         "done": 0}
    rb.add(**v)

    tx = rb.get_all_transitions()
    for key in ["obs", "act", "done"]:
        with self.subTest(key=key):
            np.testing.assert_allclose(tx[key],
                                       np.asarray(v[key]).reshape((1, -1)))
def test_python_type(self):
    types = [bool, int, float]

    for d in types:
        with self.subTest(type=d):
            b = ReplayBuffer(10, {"a": {"dtype": d}})
            b.add(a=d(1))
            self.assertEqual(b.get_all_transitions()["a"].dtype, d)
def test_Nstep_discounts(self):
    buffer_size = 32
    step = 4
    gamma = 0.5

    rb = ReplayBuffer(buffer_size, {"done": {}},
                      Nstep={"size": step, "gamma": gamma})

    rb.add(done=0)
    rb.add(done=0)
    rb.add(done=0)
    self.assertEqual(rb.get_stored_size(), 0)

    rb.add(done=0)
    np.testing.assert_allclose(rb.get_all_transitions()["done"],
                               np.asarray([[0]]))

    rb.add(done=0)
    np.testing.assert_allclose(rb.get_all_transitions()["done"],
                               np.asarray([[0], [0]]))
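# Why the first three adds above leave the buffer empty: with Nstep size=4,
# a transition is only emitted once its 4-step window can be formed, so adds
# 1-3 are merely buffered and the 4th add stores the first transition.
# For a reward field registered under Nstep's "rew" key (an assumption here,
# since this test only stores "done"), cpprb would store the discounted sum
#
#   rew_n = r_t + gamma * r_{t+1} + ... + gamma**(n-1) * r_{t+n-1}
#
# e.g. with gamma = 0.5 and rewards [1, 1, 1, 1]:
gamma = 0.5
rewards = [1, 1, 1, 1]
rew_n = sum(r * gamma**i for i, r in enumerate(rewards))  # 1 + 0.5 + 0.25 + 0.125
assert rew_n == 1.875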
def test_next_obs(self):
    buffer_size = 32
    nstep = 4
    gamma = 0.99

    rb = ReplayBuffer(buffer_size,
                      {"next_obs": {}, "done": {}},
                      Nstep={"size": nstep,
                             "gamma": gamma,
                             "next": "next_obs"})

    rb.add(next_obs=1, done=0)
    rb.add(next_obs=2, done=0)
    rb.add(next_obs=3, done=0)
    rb.add(next_obs=4, done=0)
    rb.add(next_obs=5, done=0)
    np.testing.assert_allclose(rb.get_all_transitions()["next_obs"],
                               np.asarray([[4], [5]]))

    rb.add(next_obs=6, done=1)
    rb.on_episode_end()
    sample = rb.get_all_transitions()
    np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                               np.asarray([4, 5, 6]))

    rb.add(next_obs=7, done=0)
    rb.add(next_obs=8, done=0)
    rb.add(next_obs=9, done=0)
    rb.add(next_obs=10, done=1)
    rb.on_episode_end()
    sample = rb.get_all_transitions()
    np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                               np.asarray([4, 5, 6, 10]))
def test_stack(self):
    rb = ReplayBuffer(3, {"a": {"shape": 2}}, stack_compress="a")

    # 1st iteration: Nothing special
    rb.add(a=[0, 1])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[0, 1]]))
    rb.add(a=[1, 2])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[0, 1], [1, 2]]))
    rb.add(a=[2, 3])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[0, 1], [1, 2], [2, 3]]))

    # 2nd iteration: Cache
    rb.add(a=[3, 4])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[3, 4], [1, 2], [2, 3]]))
    rb.add(a=[4, 5])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[3, 4], [4, 5], [2, 3]]))
    rb.add(a=[5, 6])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[3, 4], [4, 5], [5, 6]]))

    # 3rd iteration: Clean up cache beforehand and set new cache
    rb.add(a=[6, 7])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[6, 7], [4, 5], [5, 6]]))
    rb.add(a=[7, 8])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[6, 7], [7, 8], [5, 6]]))
    rb.add(a=[8, 9])
    np.testing.assert_allclose(rb.get_all_transitions()["a"],
                               np.asarray([[6, 7], [7, 8], [8, 9]]))
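# What stack_compress buys, in numbers: consecutive stacked observations
# overlap by stack_size - 1 entries, so a buffer of N stacked observations
# needs only about N + stack_size - 1 underlying elements instead of
# N * stack_size. For the buffer above (N = 3, stacks of 2 scalars) that is
# roughly 4 stored scalars rather than 6; for Atari-style stacks the saving
# approaches a factor of stack_size. A quick check of that arithmetic:
N, stack = 3, 2
naive = N * stack             # 6 scalars without compression
compressed = N + stack - 1    # 4 scalars when overlaps are shared
assert (naive, compressed) == (6, 4)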
def test_dtype_check(self):
    types = [np.bool_, np.bool8,
             np.byte, np.short, np.intc, np.int_, np.longlong,
             np.intp, np.int8, np.int16, np.int32, np.int64,
             np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong,
             np.uintp, np.uint8, np.uint16, np.uint32, np.uint64,
             np.half, np.single, np.double, np.float_, np.longfloat,
             np.float16, np.float32, np.float64,
             np.csingle, np.complex_, np.clongfloat,
             np.complex64, np.complex128]

    for d in types:
        with self.subTest(type=d):
            b = ReplayBuffer(10, {"a": {"dtype": d}})
            b.add(a=np.ones(1, dtype=d))
            self.assertEqual(b.get_all_transitions()["a"].dtype, d)
def test_with_empty(self):
    buffer_size = 32
    obs_shape = 3
    act_shape = 4

    rb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape},
                                    "act": {"shape": act_shape},
                                    "done": {}})

    tx = rb.get_all_transitions()
    for key in ["obs", "act", "done"]:
        with self.subTest(key=key):
            self.assertEqual(tx[key].shape[0], 0)
class RainbowAgent:
    """Agent interacting with environment.

    Attribute:
        env (gym.Env): openAI Gym environment
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including
                           state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """

    def __init__(
        self,
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10,
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524,  # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174,  # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",
    ):
        """Initialization.

        Args:
            env (gym.Env): openAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        # NoisyNet: All attributes related to epsilon are removed

        # produces a unique timestamp for each run
        run_timestamp = str(
            # returns number of day and number of month
            str(time.localtime(time.time())[2]) + "_" +
            str(time.localtime(time.time())[1]) + "_" +
            # returns hour, minute and second
            str(time.localtime(time.time())[3]) + "_" +
            str(time.localtime(time.time())[4]) + "_" +
            str(time.localtime(time.time())[5])
        )

        # Will write scalars that can be visualized using tensorboard
        # in the directory "runLogs/timestamp"
        self.writer = SummaryWriter("runLogs/" + run_timestamp)

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        print(self.device)

        # PER
        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha
        )

        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(
            self.v_min, self.v_max, self.atom_size
        ).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), 0.0001)

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        # self.tensorboard = RainbowTensorBoard(
        #     log_dir="single_joint_logs/{}-{}".format(
        #         model_name,
        #         datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
        #     )
        # )

        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(
            torch.FloatTensor(state).to(self.device)
        ).argmax()
        selected_action = selected_action.detach().cpu().numpy()

        if not self.is_test:
            self.transition = [state, selected_action]

        return selected_action

    def step(self, action: np.ndarray,
             score: int) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action, score)

        if not self.is_test:
            self.transition += [reward, next_state, done]

            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"],
                            self.transition)
                    )
                )
                # `add` returns None until n_step transitions are
                # accumulated; index 0 is a valid return value
                one_step_transition = [
                    v[idx]
                    for _, v in self.memory_n.get_all_transitions().items()
                ] if idx is not None else None
            # 1-step transition
            else:
                one_step_transition = self.transition

            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"],
                            one_step_transition)
                    )
                )

        return next_state, reward, done

    def update_model(self, frame_idx: int) -> torch.Tensor:
        """Update the model by gradient descent.

        shape of elementwise_loss = [batch_size]
        shape of loss = ([])
        shape of weights = [batch_size, 1]
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = torch.FloatTensor(
            samples["weights"].reshape(-1, 1)
        ).to(self.device)
        indices = samples["indexes"]

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)
        self.writer.add_scalar('update_model/Lossv0',
                               loss.detach().item(), frame_idx)

        # N-step Learning loss
        # We combine the 1-step loss and the n-step loss to prevent
        # high variance. The original Rainbow employs n-step loss only.
        if self.use_n_step:
            gamma = self.gamma ** self.n_step
            samples = {k: [v[i] for i in indices]
                       for k, v in self.memory_n.get_all_transitions().items()}
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        rospy.loginfo(f"{elementwise_loss}")

        self.optimizer.zero_grad()
        self.writer.add_scalar('update_model/Lossv1',
                               loss.detach().item(), frame_idx)
        # From pytorch doc: backward() computes the gradient of the
        # current tensor w.r.t. graph leaves.
        loss.backward()
        self.writer.add_scalar('update_model/Lossv2',
                               loss.detach().item(), frame_idx)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        self.writer.add_scalar('update_model/Loss',
                               loss.detach().item(), frame_idx)

        return loss.detach().item()

    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        losses = []
        scores = []
        score = 0

        for frame_idx in tqdm(range(1, num_frames + 1)):
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward

            # NoisyNet: removed decrease of epsilon
            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if episode ends
            if done:
                self.writer.add_scalar('train/score', score, frame_idx)
                self.writer.add_scalar('train/final_epsilon',
                                       state[6], frame_idx)
                self.writer.add_scalar('train/epsilon_p',
                                       state[7], frame_idx)
                state = self.env.reset()
                scores.append(score)
                score = 0

            # if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                # frame_idx given as argument for logging by self.writer
                loss = self.update_model(frame_idx)
                losses.append(loss)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update(loss)

        self.env.close()

    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward

        print("score: ", score)
        self.env.close()

        return frames

    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray],
                          gamma: float) -> torch.Tensor:
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        # cpprb returns actions with shape [batch, 1]; flatten for indexing
        action = torch.LongTensor(
            np.array(samples["act"]).reshape(-1)
        ).to(device)
        reward = torch.FloatTensor(
            np.array(samples["rew"]).reshape(-1, 1)
        ).to(device)
        done = torch.FloatTensor(
            np.array(samples["done"]).reshape(-1, 1)
        ).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (
                torch.linspace(
                    0, (self.batch_size - 1) * self.atom_size, self.batch_size
                ).long()
                .unsqueeze(1)
                .expand(self.batch_size, self.atom_size)
                .to(self.device)
            )

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(
                0, (l + offset).view(-1),
                (next_dist * (u.float() - b)).view(-1)
            )
            proj_dist.view(-1).index_add_(
                0, (u + offset).view(-1),
                (next_dist * (b - l.float())).view(-1)
            )
            print(f"Next Action : {next_action}\n Next Dist : {next_dist}\n")

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)
        print(f"Proj Dist : {proj_dist}\n Dist : {dist}\n Log_p : {log_p}\n")

        if torch.isnan(elementwise_loss).any():
            exit()

        return elementwise_loss

    def _target_hard_update(self, loss):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        torch.save({
            'model_state_dict': self.dqn.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
        }, str("checkpoints/checkpoint_" + str(time.time())))
def run_policy(env, get_action, max_ep_len=None, num_episodes=100,
               render=True, record=False, record_project='benchmarking',
               record_name='trained', data_path='', config_name='test',
               max_len_rb=100, benchmark=False, log_prefix=''):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    logger = EpochLogger()
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    ep_cost = 0
    local_steps_per_epoch = int(4000 / num_procs())

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    rew_mov_avg_10 = []
    cost_mov_avg_10 = []

    if benchmark:
        ep_costs = []
        ep_rewards = []

    if record:
        wandb.login()
        # 4 million env interactions
        wandb.init(project=record_project, name=record_name)

        rb = ReplayBuffer(size=10000,
                          env_dict={
                              "obs": {"shape": obs_dim},
                              "act": {"shape": act_dim},
                              "rew": {},
                              "next_obs": {"shape": obs_dim},
                              "done": {}})

        # columns = ['observation', 'action', 'reward', 'cost', 'done']
        # sim_data = pd.DataFrame(index=[0], columns=columns)

    while n < num_episodes:
        if render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        next_o, r, d, info = env.step(a)

        if record:
            # buf.store(next_o, a, r, None, info['cost'], None, None, None)
            done_int = int(d == True)
            rb.add(obs=o, act=a, rew=r, next_obs=next_o, done=done_int)

        ep_ret += r
        ep_len += 1
        ep_cost += info['cost']

        # Important!
        o = next_o

        if d or (ep_len == max_ep_len):
            # finish recording and save csv
            if record:
                rb.on_episode_end()

                # make directory if it does not exist
                if not os.path.exists(data_path + config_name + '_episodes'):
                    os.makedirs(data_path + config_name + '_episodes')

            # buf = CostPOBuffer(obs_dim, act_dim, local_steps_per_epoch, 0.99, 0.99)

            if len(rew_mov_avg_10) >= 25:
                rew_mov_avg_10.pop(0)
                cost_mov_avg_10.pop(0)

            rew_mov_avg_10.append(ep_ret)
            cost_mov_avg_10.append(ep_cost)

            mov_avg_ret = np.mean(rew_mov_avg_10)
            mov_avg_cost = np.mean(cost_mov_avg_10)

            expert_metrics = {log_prefix + 'episode return': ep_ret,
                              log_prefix + 'episode cost': ep_cost,
                              # 'cumulative return': cum_ret,
                              # 'cumulative cost': cum_cost,
                              log_prefix + '25ep mov avg return': mov_avg_ret,
                              log_prefix + '25ep mov avg cost': mov_avg_cost}

            if benchmark:
                ep_rewards.append(ep_ret)
                ep_costs.append(ep_cost)

            wandb.log(expert_metrics)
            logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost)
            print('Episode %d \t EpRet %.3f \t EpLen %d \t EpCost %d'
                  % (n, ep_ret, ep_len, ep_cost))
            o, r, d, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()

    if record:
        print("saving final buffer")
        bufname_pk = (data_path + config_name + '_episodes/sim_data_'
                      + str(int(num_episodes)) + '_buffer.pkl')
        file_pi = open(bufname_pk, 'wb')
        pickle.dump(rb.get_all_transitions(), file_pi)
        wandb.finish()
        return rb

    if benchmark:
        return ep_rewards, ep_costs
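# A hedged sketch of invoking run_policy above. `load_policy_and_env` is the
# Spinning Up helper for restoring a saved policy, but any loader works as
# long as it yields an env plus a get_action callable; the output path and
# episode count below are illustrative assumptions, not values from the
# original code.
env, get_action = load_policy_and_env('/path/to/output_dir')  # hypothetical path
rb = run_policy(env, get_action,
                max_ep_len=1000,
                num_episodes=50,
                render=False,
                record=True,
                data_path='expert_data/',
                config_name='cartpole')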
class LearnerBase(tf.Module):
    # PUBLIC
    def __init__(self,
                 model,
                 filename=None,
                 bufferSize=264,
                 numEpochs=100,
                 batchSize=30,
                 log=False,
                 logPath=None):
        self.model = model
        self.sDim = model.get_state_dim()
        self.aDim = model.get_action_dim()
        self.optimizer = tf.optimizers.Adam(learning_rate=0.5)
        self.rb = ReplayBuffer(bufferSize,
                               env_dict={
                                   "obs": {"shape": (self.sDim, 1)},
                                   "act": {"shape": (self.aDim, 1)},
                                   "next_obs": {"shape": (self.sDim, 1)}
                               })
        self.numEpochs = numEpochs
        self.batchSize = batchSize

        if filename is not None:
            self.load_rb(filename)

        self.log = log
        self.step = 0
        if self.log:
            stamp = datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
            self.logdir = os.path.join(logPath, "learner", stamp)
            self.writer = tf.summary.create_file_writer(self.logdir)
            self._save_graph()

    def load_rb(self, filename):
        self.rb.load_transitions(filename)

    def add_rb(self, x, u, xNext):
        self.rb.add(obs=x, act=u, next_obs=xNext)

    def train(self, X, y, batchSize=-1, epoch=1, learningRate=0.1, kfold=None):
        for e in range(epoch):
            # batchSize == -1 means training on the whole dataset at once
            if batchSize == -1:
                batchLoss = self._train_step(X, y)
                if self.log:
                    with self.writer.as_default():
                        if kfold is not None:
                            scope = "epoch{}/batch{}/lr{}/loss".format(
                                epoch, batchSize, learningRate)
                        else:
                            scope = "Loss"
                        tf.summary.scalar(scope, batchLoss, self.step)
                self.step += 1
                continue

            for i in range(0, X.shape[0], batchSize):
                batchLoss = self._train_step(X[i:i + batchSize],
                                             y[i:i + batchSize])
                if self.log:
                    with self.writer.as_default():
                        if kfold is not None:
                            scope = "epoch{}/batch{}/lr{}/loss_fold{}".format(
                                epoch, batchSize, learningRate, kfold)
                        else:
                            scope = "Loss"
                        tf.summary.scalar(scope, batchLoss, self.step)
                self.step += 1

    def rb_trans(self):
        return self.rb.get_all_transitions().copy()

    def save_rb(self, filename):
        self.rb.save_transitions(filename)

    def save_params(self, step):
        self.model.save_params(self.logdir, step)

    def grid_search(self, trajs, actionSeqs):
        init_weights = self.model.get_weights()
        learningRate = np.linspace(0.0001, 0.1, 10)
        batchSize = np.array([-1])
        epoch = np.array([100, 500, 1000])
        mean = []
        for lr in learningRate:
            for bs in batchSize:
                for e in epoch:
                    fold = self.k_fold_validation(learningRate=lr,
                                                  batchSize=bs,
                                                  epoch=e,
                                                  k=10)
                    mean.append(np.mean(fold))
                    print("*" * 5, " Grid ", 5 * "*")
                    print("lr: ", lr)
                    print("bs: ", bs)
                    print("e: ", e)
                    print("fold: ", fold)
                    print("mean: ", np.mean(fold))
                    self.train_all(learningRate=lr, batchSize=bs, epoch=e)
                    err = self.validate(actionSeqs, trajs)
                    print("validation error: ", err.numpy())
                    self.model.update_weights(init_weights, msg=False)
        print("Best mean:", np.max(mean))

    def train_all(self, learningRate=0.1, batchSize=32, epoch=100):
        self.optimizer = tf.optimizers.Adam(learning_rate=learningRate)
        data = self.rb_trans()
        (X, y) = self.model.prepare_training_data(data['obs'],
                                                  data['next_obs'],
                                                  data['act'])
        self.step = 0
        self.train(X, y, batchSize=batchSize, epoch=epoch,
                   learningRate=learningRate)

    def k_fold_validation(self, k=10, learningRate=0.1, batchSize=32,
                          epoch=100):
        # First get all the data
        self.optimizer = tf.optimizers.Adam(learning_rate=learningRate)
        data = self.rb_trans()
        (X, y) = self.model.prepare_training_data(data['obs'],
                                                  data['next_obs'],
                                                  data['act'])
        kfold = KFold(n_splits=k, shuffle=True)
        init_weights = self.model.get_weights()
        fold = []
        X = X.numpy()
        y = y.numpy()
        i = 0
        for train, test in kfold.split(X, y):
            self.step = 0
            self.train(X[train], y[train], batchSize=batchSize,
                       epoch=epoch, learningRate=learningRate, kfold=i)
            # Evaluate on the held-out fold before resetting the weights
            lossFold = self.evaluate(X[test], y[test])
            fold.append(lossFold.numpy())
            self.model.update_weights(init_weights, msg=False)
            i += 1
        self.model.update_weights(init_weights, msg=False)
        return fold

    def evaluate(self, X, y):
        pred = self.model._predict_nn("Eval", np.squeeze(X, axis=-1))
        loss = tf.reduce_mean(tf.math.squared_difference(pred, y),
                              name="loss")
        return loss

    def plot_seq(self, traj, gtTraj):
        fig, axs = plt.subplots(figsize=(20, 10), nrows=2, ncols=8)
        # Position
        axs[0, 0].plot(traj[:, 0])
        axs[0, 0].plot(gtTraj[:, 0])
        axs[0, 1].plot(traj[:, 1])
        axs[0, 1].plot(gtTraj[:, 1])
        axs[0, 2].plot(traj[:, 2])
        axs[0, 2].plot(gtTraj[:, 2])
        # Quaternion
        axs[0, 3].plot(traj[:, 3])
        axs[0, 3].plot(gtTraj[:, 3])
        axs[0, 4].plot(traj[:, 4])
        axs[0, 4].plot(gtTraj[:, 4])
        axs[0, 5].plot(traj[:, 5])
        axs[0, 5].plot(gtTraj[:, 5])
        axs[0, 6].plot(traj[:, 6])
        axs[0, 6].plot(gtTraj[:, 6])
        # Lin Vel
        axs[1, 0].plot(traj[:, 0])
        axs[1, 0].plot(gtTraj[:, 0])
        axs[1, 1].plot(traj[:, 1])
        axs[1, 1].plot(gtTraj[:, 1])
        axs[1, 2].plot(traj[:, 2])
        axs[1, 2].plot(gtTraj[:, 2])
        # Ang vel
        axs[1, 3].plot(traj[:, 3])
        axs[1, 3].plot(gtTraj[:, 3])
        axs[1, 4].plot(traj[:, 4])
        axs[1, 4].plot(gtTraj[:, 4])
        axs[1, 5].plot(traj[:, 5])
        axs[1, 5].plot(gtTraj[:, 5])
        plt.show()

    def validate(self, actionSeqs, gtTrajs):
        '''
        Computes the error of the model for a number of trajectories
        with the matching action sequences.

        - input:
        --------
            - actionSeqs: Tensor of the action sequences.
              Shape [k, tau, 6, 1]
            - gtTrajs: Tensor of the ground truth trajectories.
              Shape [k, tau, 13, 1]

        - output:
        ---------
            - L(nn(actionSeqs), trajs), the loss between the predicted
              trajectory and the ground truth trajectory.
        '''
        tau = actionSeqs.shape[1]
        k = actionSeqs.shape[0]
        state = np.expand_dims(gtTrajs[:, 0], axis=-1)
        trajs = [np.expand_dims(state, axis=1)]
        # PAY ATTENTION TO THE FOR LOOPS WITH @tf.function.
        for i in range(tau - 1):
            with tf.name_scope("Rollout_" + str(i)):
                with tf.name_scope("Prepare_data_" + str(i)) as pd:
                    # make the action a [1, 6, 1] tensor
                    action = np.expand_dims(actionSeqs[:, i], axis=-1)
                with tf.name_scope("Step_" + str(i)) as s:
                    nextState = self.model.build_step_graph(s, state, action)
            state = nextState
            trajs.append(np.expand_dims(state, axis=1))
        trajs = np.squeeze(np.concatenate(trajs, axis=1), axis=-1)
        err = tf.linalg.norm(tf.subtract(trajs, gtTrajs)) / k
        self.plot_seq(trajs[0], gtTrajs[0])
        return err

    # PRIVATE
    def _train_step(self, X, y):
        # If batchSize = -1, the entire batch is fed in at once
        with tf.GradientTape() as tape:
            pred = self.model._predict_nn("train", np.squeeze(X, axis=-1))
            loss = tf.reduce_mean(tf.math.squared_difference(pred, y),
                                  name="loss")
        grads = tape.gradient(loss, self.model.weights())
        self.optimizer.apply_gradients(zip(grads, self.model.weights()))
        return loss

    def _save_graph(self):
        state = tf.zeros((1, self.model.get_state_dim(), 1),
                         dtype=tf.float64)
        action = tf.zeros((1, self.model.get_action_dim(), 1),
                          dtype=tf.float64)
        with self.writer.as_default():
            graph = tf.function(
                self.model.build_step_graph
            ).get_concrete_function("graph", state, action).graph
            # visualize
            summary_ops_v2.graph(graph.as_graph_def())
class HindsightReplayBuffer:
    """
    Replay Buffer class for Hindsight Experience Replay

    Ref: https://arxiv.org/abs/1707.01495
    """

    def __init__(self,
                 size: int,
                 env_dict: Dict,
                 max_episode_len: int,
                 reward_func: Callable,
                 *,
                 goal_func: Optional[Callable] = None,
                 goal_shape: Optional[Iterable[int]] = None,
                 state: str = "obs",
                 action: str = "act",
                 next_state: str = "next_obs",
                 strategy: str = "future",
                 additional_goals: int = 4,
                 prioritized=True,
                 **kwargs):
        """
        Initialize HindsightReplayBuffer

        Parameters
        ----------
        size : int
            Buffer Size
        env_dict : dict of dict
            Dictionary specifying environments. The keys of env_dict become
            environment names. The values of env_dict, which are also dict,
            define "shape" (default 1) and "dtype" (fallback to
            `default_dtype`)
        max_episode_len : int
            Maximum episode length.
        reward_func : Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray]
            Batch calculation of reward function SxAxG -> R.
        goal_func : Callable[[np.ndarray], np.ndarray], optional
            Batch extraction function for goal from state: S -> G.
            If ``None`` (default), identity function is used (goal = state).
        goal_shape : Iterable[int], optional
            Shape of goal. If ``None`` (default), state shape is used.
        state : str, optional
            State name in ``env_dict``. The default is "obs".
        action : str, optional
            Action name in ``env_dict``. The default is "act".
        next_state : str, optional
            Next state name in ``env_dict``. The default is "next_obs".
        strategy : ["future", "episode", "random", "final"], optional
            Goal sampling strategy.
            "future" selects one of the future states in the same episode.
            "episode" selects states in the same episode.
            "random" selects from all states in the replay buffer.
            "final" selects the final state in the episode. For "final",
            ``additional_goals`` is ignored.
            The default is "future".
        additional_goals : int, optional
            Number of additional goals. The default is ``4``.
        prioritized : bool, optional
            Whether to use Prioritized Experience Replay. The default is
            ``True``.
        """
        self.max_episode_len = max_episode_len
        self.reward_func = reward_func
        self.goal_func = goal_func or (lambda s: s)

        self.state = state
        self.action = action
        self.next_state = next_state

        self.strategy = strategy
        known_strategy = ["future", "episode", "random", "final"]
        if self.strategy not in known_strategy:
            raise ValueError(f"Unknown Strategy: {strategy}. " +
                             f"Known Strategies: {known_strategy}")

        self.additional_goals = additional_goals
        if self.strategy == "final":
            self.additional_goals = 1

        self.prioritized = prioritized

        if goal_shape:
            goal_dict = {**env_dict[state], "shape": goal_shape}
            self.goal_shape = np.array(goal_shape, ndmin=1)
        else:
            goal_dict = env_dict[state]
            self.goal_shape = np.array(env_dict[state].get("shape", 1),
                                       ndmin=1)
        RB = PrioritizedReplayBuffer if self.prioritized else ReplayBuffer
        self.rb = RB(size,
                     {**env_dict, "rew": {}, "goal": goal_dict},
                     **kwargs)
        self.episode_rb = ReplayBuffer(self.max_episode_len, env_dict)

        self.rng = np.random.default_rng()

    def add(self, **kwargs):
        r"""Add transition(s) into replay buffer.

        Multiple sets of transitions can be added simultaneously.

        Parameters
        ----------
        **kwargs : array like or float or int
            Transitions to be stored.
        """
        if self.episode_rb.get_stored_size() >= self.max_episode_len:
            raise ValueError("Exceed Max Episode Length")
        self.episode_rb.add(**kwargs)

    def sample(self, batch_size: int, **kwargs):
        r"""Sample the stored transitions randomly with specified size

        Parameters
        ----------
        batch_size : int
            sampled batch size

        Returns
        -------
        sample : dict of ndarray
            Batch size of sampled transitions, which might contain
            the same transition multiple times.
        """
        return self.rb.sample(batch_size, **kwargs)

    def on_episode_end(self, goal):
        """
        Terminate the current episode and set hindsight goal

        Parameters
        ----------
        goal : array-like
            Original goal state of this episode.
        """
        episode_len = self.episode_rb.get_stored_size()
        if episode_len == 0:
            return None

        trajectory = self.episode_rb.get_all_transitions()

        add_shape = (trajectory[self.state].shape[0], *self.goal_shape)
        goal = np.broadcast_to(np.asarray(goal), add_shape)
        rew = self.reward_func(trajectory[self.next_state],
                               trajectory[self.action],
                               goal)
        self.rb.add(**trajectory, goal=goal, rew=rew)

        if self.strategy == "future":
            idx = np.zeros((self.additional_goals, episode_len),
                           dtype=np.int64)
            for i in range(episode_len):
                idx[:, i] = self.rng.integers(low=i, high=episode_len,
                                              size=self.additional_goals)
            for i in range(self.additional_goals):
                goal = self.goal_func(trajectory[self.next_state][idx[i]])
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action],
                                       goal)
                self.rb.add(**trajectory, rew=rew, goal=goal)
        elif self.strategy == "episode":
            idx = self.rng.integers(low=0, high=episode_len,
                                    size=(self.additional_goals,
                                          episode_len))
            for _i in idx:
                goal = self.goal_func(trajectory[self.next_state][_i])
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action],
                                       goal)
                self.rb.add(**trajectory, rew=rew, goal=goal)
        elif self.strategy == "final":
            goal = self.goal_func(
                np.broadcast_to(trajectory[self.next_state][-1],
                                trajectory[self.next_state].shape))
            rew = self.reward_func(trajectory[self.next_state],
                                   trajectory[self.action],
                                   goal)
            self.rb.add(**trajectory, rew=rew, goal=goal)
        else:  # random
            # Note 1:
            #   We should not prioritize goal selection,
            #   so we manually create indices.
            # Note 2:
            #   Since we cannot access internal data directly,
            #   we have to extract sets of transitions.
            #   Although this has overhead, it is fine
            #   because the "random" strategy is used only for
            #   strategy comparison.
            idx = self.rng.integers(low=0,
                                    high=self.rb.get_stored_size(),
                                    size=self.additional_goals * episode_len)
            goal = self.goal_func(
                self.rb._encode_sample(idx)[self.next_state])
            goal = goal.reshape((self.additional_goals,
                                 episode_len,
                                 *(goal.shape[1:])))
            for g in goal:
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action],
                                       g)
                self.rb.add(**trajectory, rew=rew, goal=g)

        self.episode_rb.clear()
        self.rb.on_episode_end()

    def clear(self):
        """
        Clear replay buffer
        """
        self.rb.clear()
        self.episode_rb.clear()

    def get_stored_size(self):
        """
        Get stored size

        Returns
        -------
        int
            stored size
        """
        return self.rb.get_stored_size()

    def get_buffer_size(self):
        """
        Get buffer size

        Returns
        -------
        int
            buffer size
        """
        return self.rb.get_buffer_size()

    def get_all_transitions(self, shuffle: bool = False):
        r"""
        Get all transitions stored in replay buffer.

        Parameters
        ----------
        shuffle : bool, optional
            When True, transitions are shuffled. The default value is False.

        Returns
        -------
        transitions : dict of numpy.ndarray
            All transitions stored in this replay buffer.
        """
        return self.rb.get_all_transitions(shuffle)

    def update_priorities(self, indexes, priorities):
        """
        Update priorities

        Parameters
        ----------
        indexes : array_like
            indexes to update priorities
        priorities : array_like
            priorities to update

        Raises
        ------
        TypeError
            When ``indexes`` or ``priorities`` are ``None``
        ValueError
            When this buffer is constructed with ``prioritized=False``
        """
        if not self.prioritized:
            raise ValueError("Buffer is constructed without PER")
        self.rb.update_priorities(indexes, priorities)

    def get_max_priority(self):
        """
        Get max priority

        Returns
        -------
        float
            Max priority of stored priorities

        Raises
        ------
        ValueError
            When this buffer is constructed with ``prioritized=False``
        """
        if not self.prioritized:
            raise ValueError("Buffer is constructed without PER")
        return self.rb.get_max_priority()
def test_smaller_episode_than_stack_frame(self):
    """
    `on_episode_end()` caches stack size.

    When episode length is smaller than stack size, `on_episode_end()`
    must avoid caching from the previous episode. Since cache does not
    wrap around, this bug does not happen at the first episode.

    Ref: https://gitlab.com/ymd_h/cpprb/-/issues/108
    Ref: https://gitlab.com/ymd_h/cpprb/-/issues/110
    """
    stack_size = 4
    episode_len1 = 5
    episode_len2 = 2

    # np.int64 instead of the removed np.int alias
    rb = ReplayBuffer(32,
                      {"obs": {"shape": (stack_size,), "dtype": np.int64}},
                      next_of="obs",
                      stack_compress="obs")

    obs = np.arange(episode_len1 + stack_size + 2, dtype=np.int64)
    obs2 = np.arange(episode_len2 + stack_size + 2, dtype=np.int64) + 100

    self.assertEqual(rb.get_current_episode_len(), 0)

    # Add 1st episode
    for i in range(episode_len1):
        rb.add(obs=obs[i:i + stack_size],
               next_obs=obs[i + 1:i + 1 + stack_size])

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1)
    self.assertEqual(rb.get_current_episode_len(), episode_len1)
    for i in range(episode_len1):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Reset environment
    rb.on_episode_end()
    self.assertEqual(rb.get_current_episode_len(), 0)
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1)
    for i in range(episode_len1):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Add 2nd episode
    for i in range(episode_len2):
        rb.add(obs=obs2[i:i + stack_size],
               next_obs=obs2[i + 1:i + 1 + stack_size])

    self.assertEqual(rb.get_current_episode_len(), episode_len2)
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
    for i in range(episode_len1):
        with self.subTest(i=i, v="obs"):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
        with self.subTest(i=i, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])
    for i in range(episode_len2):
        with self.subTest(i=i + episode_len1, v="obs"):
            np.testing.assert_equal(s["obs"][i + episode_len1],
                                    obs2[i:i + stack_size])
        with self.subTest(i=i + episode_len1, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                    obs2[i + 1:i + 1 + stack_size])

    rb.on_episode_end()
    self.assertEqual(rb.get_current_episode_len(), 0)
    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
    for i in range(episode_len1):
        with self.subTest(i=i, v="obs"):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
        with self.subTest(i=i, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])
    for i in range(episode_len2):
        with self.subTest(i=i + episode_len1, v="obs"):
            np.testing.assert_equal(s["obs"][i + episode_len1],
                                    obs2[i:i + stack_size])
        with self.subTest(i=i + episode_len1, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                    obs2[i + 1:i + 1 + stack_size])
class RainbowAgent:
    """
    Rainbow Agent interacting with environment.

    Attribute:
        env (gym.Env): openAI Gym environment (connected to Gazebo node)
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (tf.keras.optimizers.Optimizer): optimizer for training dqn
        transition (list): transition information including
                           state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (tf.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """

    def __init__(
        self,
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10,
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524,  # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174,  # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",
    ):
        """
        Initialization.

        Args:
            env_client (GymEnvClient): ROS client to an openAI Gym
                environment server
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma

        # Selecting computing device
        physical_devices = tf.config.list_physical_devices('GPU')
        n_gpu = len(physical_devices)
        rospy.loginfo("Number of GPU detected : " + str(n_gpu))
        if n_gpu > 0:
            rospy.loginfo("Switching to single GPU mode : /device:GPU:0")
            self.used_device = "/device:GPU:0"
            tf.config.experimental.set_memory_growth(physical_devices[0],
                                                     True)
        else:
            rospy.loginfo("No GPU detected. "
                          "Switching to single CPU mode : /device:CPU:0")
            self.used_device = "/device:CPU:0"

        # PER
        # memory for 1-step learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha
        )

        # memory for N-step learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = tf.linspace(self.v_min, self.v_max, self.atom_size,
                                   name="support")

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support, name="dqn"
        )
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support,
            name="dqn_target"
        )

        # optimizer
        self.optimizer = Adam(
            learning_rate=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
            amsgrad=False,
            name='AdamOptimizer'
        )

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        self.tensorboard = RainbowTensorBoard(
            log_dir="single_joint_logs/{}-{}".format(
                model_name,
                datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
            )
        )

        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p

        # TODO: model checkpoint object
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                              model=self.dqn_target)
        self.checkpoint_manager = tf.train.CheckpointManager(
            self.checkpoint,
            directory="single_joint_ckpts",
            max_to_keep=5
        )

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = tf.math.argmax(self.dqn(
            tf.constant(state.reshape(1, state.shape[0]), dtype=tf.float32)
        ), axis=-1, name="argmax_selected_action")

        # Convert to numpy ndarray datatype
        selected_action = selected_action.numpy()

        if not self.is_test:
            self.transition = [state, selected_action]

        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """
        Take an action and return the response of the env.
        """
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, done]

            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"],
                            self.transition)
                    )
                )
                # `add` returns None until n_step transitions are
                # accumulated; index 0 is a valid return value
                one_step_transition = [
                    v[idx]
                    for _, v in self.memory_n.get_all_transitions().items()
                ] if idx is not None else None
            # 1-step transition
            else:
                one_step_transition = self.transition

            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"],
                            one_step_transition)
                    )
                )

        return next_state, reward, done

    def update_model(self) -> tf.Tensor:
        """
        Update the model by gradient descent
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = tf.constant(
            samples["weights"].reshape(-1, 1),
            dtype=tf.float32,
            name="update_model_weights"
        )
        indices = samples["indexes"]

        with tf.GradientTape() as tape:
            # 1-step Learning loss, computed inside the tape so that
            # its gradients are recorded
            elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

            # PER: importance sampling before average
            loss = tf.math.reduce_mean(elementwise_loss * weights)

            # N-step Learning loss
            # We combine 1-step loss and n-step loss to prevent
            # high variance.
            if self.use_n_step:
                gamma = self.gamma ** self.n_step
                samples = {k: [v[i] for i in indices]
                           for k, v in
                           self.memory_n.get_all_transitions().items()}
                elementwise_loss_n_loss = self._compute_dqn_loss(samples,
                                                                 gamma)
                elementwise_loss += elementwise_loss_n_loss

                # PER: importance sampling before average
                loss = tf.math.reduce_mean(elementwise_loss * weights)

        dqn_variables = self.dqn.trainable_variables
        gradients = tape.gradient(loss, dqn_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
        self.optimizer.apply_gradients(zip(gradients, dqn_variables))

        # PER: update priorities
        loss_for_prior = elementwise_loss.numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.numpy().ravel()

    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        scores = deque(maxlen=self.convergence_window)
        joint_epsilon = deque(maxlen=self.convergence_window)
        joint_epsilon_p = deque(maxlen=self.convergence_window_epsilon_p)
        score = 0  # cumulated reward
        episode_length = 0
        episode_cnt = 0

        for frame_idx in tqdm(range(1, num_frames + 1), file=tqdm_out):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward
            episode_length += 1

            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            print("epsilon_p is {}".format(state[7]))
            print("epsilon is {}".format(state[6]))

            if done:
                print("done")
                # to be used for convergence criterion
                scores.append(score)
                joint_epsilon.append(state[6])
                joint_epsilon_p.append(state[7])
                # state = self.env.reset()
                self.tensorboard.update_stats(
                    score={
                        "data": score,
                        "desc": "Score (or cumulated rewards) for an episode - episode index on x-axis."
                    },
                    episode_length={
                        "data": episode_length,
                        "desc": "Episode length (in frames)"
                    },
                    final_epsilon={
                        "data": state[6],
                        "desc": "Value of epsilon = abs(theta_ld - theta_l) at the last frame of an episode"
                    },
                    final_epsilon_p={
                        "data": state[7],
                        "desc": "Value of d(epsilon)/dt at the last frame of an episode"
                    }
                )
                score = 0
                episode_length = 0
                episode_cnt += 1

                # check convergence criterion
                converged = bool(
                    # be sure the score buffer is full
                    len(scores) == self.convergence_window and
                    # same for epsilon buffer
                    len(joint_epsilon) == self.convergence_window and
                    # same for epsilon_p buffer (its own, shorter window)
                    len(joint_epsilon_p) == self.convergence_window_epsilon_p and
                    mean(scores) > self.convergence_avg_score and
                    mean(joint_epsilon) < self.convergence_avg_epsilon and
                    mean(joint_epsilon_p) < self.convergence_avg_epsilon_p
                )
                if converged:
                    rospy.loginfo("Ran {} episodes. Solved after {} trials"
                                  .format(episode_cnt, frame_idx))
                    return

            # if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                loss = self.update_model()
                # plotting loss every frame
                self.tensorboard.update_stats(
                    loss={
                        "data": loss[0],
                        "desc": "Loss value."
                    }
                )
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()
                    # checkpointing of target model (only if the loss decreases)
                    self.checkpoint_manager.save()

        self.env.close()

    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

        rospy.loginfo("score: {}".format(score))
        self.env.close()

        return frames

    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray],
                          gamma: float) -> tf.Tensor:
        with tf.device(self.used_device):
            state = tf.constant(samples["obs"], dtype=tf.float32)
            next_state = tf.constant(samples["next_obs"], dtype=tf.float32)
            action = tf.constant(samples["act"], dtype=tf.float32)
            reward = tf.reshape(tf.constant(samples["rew"],
                                            dtype=tf.float32), [-1, 1])
            done = tf.reshape(tf.constant(samples["done"],
                                          dtype=tf.float32), [-1, 1])

            # Categorical DQN algorithm
            delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

            # Double DQN: per-sample greedy action from the online network
            next_action = tf.math.argmax(self.dqn(next_state), axis=1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = tf.gather_nd(
                next_dist,
                [[i, next_action.numpy()[i]]
                 for i in range(self.batch_size)]
            )

            t_z = reward + (1 - done) * gamma * self.support
            t_z = tf.clip_by_value(t_z, clip_value_min=self.v_min,
                                   clip_value_max=self.v_max)
            b = tf.dtypes.cast((t_z - self.v_min) / delta_z, tf.float64)
            l = tf.dtypes.cast(tf.math.floor(b), tf.float64)
            u = tf.dtypes.cast(tf.math.ceil(b), tf.float64)

            offset = (
                tf.broadcast_to(
                    tf.expand_dims(
                        tf.dtypes.cast(
                            tf.linspace(0,
                                        (self.batch_size - 1) * self.atom_size,
                                        self.batch_size),
                            tf.float64
                        ),
                        axis=1
                    ),
                    [self.batch_size, self.atom_size]
                )
            )

            proj_dist = tf.zeros(tf.shape(next_dist), tf.float64)
            # casting
            next_dist = tf.dtypes.cast(next_dist, tf.float64)
            proj_dist = tf.tensor_scatter_nd_add(
                tf.reshape(proj_dist, [-1]),                                # input tensor
                tf.reshape(tf.dtypes.cast(l + offset, tf.int64), [-1, 1]),  # indices
                tf.reshape((next_dist * (u - b)), [-1])                     # updates
            )
            proj_dist = tf.tensor_scatter_nd_add(
                proj_dist,
                tf.reshape(tf.dtypes.cast(u + offset, tf.int64), [-1, 1]),  # indices
                tf.reshape((next_dist * (b - l)), [-1])                     # updates
            )
            proj_dist = tf.reshape(proj_dist,
                                   [self.batch_size, self.atom_size])

            dist = self.dqn.dist(state)
            # log_p = tf.math.log(dist[range(self.batch_size), action])
            log_p = tf.dtypes.cast(
                tf.math.log(
                    tf.gather_nd(
                        dist,
                        [[i, tf.dtypes.cast(tf.reshape(action, [-1]),
                                            tf.int32).numpy()[i]]
                         for i in range(self.batch_size)]
                    )
                ),
                tf.float64
            )
            elementwise_loss = tf.math.reduce_sum(-(proj_dist * log_p),
                                                  axis=1)

        return tf.dtypes.cast(elementwise_loss, tf.float32)

    def _target_hard_update(self):
        """Hard update: target <- local."""
        tf.saved_model.save(self.dqn, "single_joint_dqn")
        self.dqn_target = tf.saved_model.load("single_joint_dqn")