def get_replay_buffer(policy, env, use_prioritized_rb=False, use_nstep_rb=False,
                      n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return PrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return ReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
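# The factory above relies on a get_default_rb_dict helper that is defined elsewhere
# in the project. A minimal sketch of what such a helper could look like, assuming it
# only builds the size/env_dict kwargs from the environment spaces (the exact keys are
# taken from the calls above; everything else here is an illustrative assumption):
import numpy as np
from gym.spaces import Discrete


def get_default_rb_dict(size, env):
    if isinstance(env.action_space, Discrete):
        act_shape = 1
    else:
        act_shape = env.action_space.shape
    obs_shape = env.observation_space.shape
    return {
        "size": size,
        "env_dict": {
            "obs": {"shape": obs_shape},
            "act": {"shape": act_shape},
            "rew": {},
            "next_obs": {"shape": obs_shape},
            "done": {},
        },
    }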
def test_per_nstep(self):
    """
    PrioritizedReplayBuffer.on_episode_end() ignores Exception

    Ref: https://gitlab.com/ymd_h/cpprb/-/issues/111
    """
    rb = PrioritizedReplayBuffer(32,
                                 {"rew": {}, "done": {}},
                                 Nstep={"size": 4, "rew": "rew", "gamma": 0.5})

    for _ in range(10):
        rb.add(rew=0.5, done=0.0)

    rb.add(rew=0.5, done=1.0)
    rb.on_episode_end()

    s = rb.sample(16)
    self.assertIn("discounts", s)
def test_save_cache_with_stack_compress(self):
    rb = PrioritizedReplayBuffer(32,
                                 env_dict={'done': {'dtype': 'bool'},
                                           'a': {'shape': (3)}},
                                 stack_compress='a')

    a = np.array([0, 1, 2])
    for i in range(3):
        done = i == 2
        rb.add(a=a, done=done)
        if done:
            rb.on_episode_end()
        a += 1

    rb.add(a=np.ones(3), done=False)

    a_ = rb.get_all_transitions()["a"]
    np.testing.assert_allclose(a_,
                               np.asarray([[0., 1., 2.],
                                           [1., 2., 3.],
                                           [2., 3., 4.],
                                           [1., 1., 1.]]))
def test_mp_update_priority(self):
    buffer_size = 256
    add_size = 200

    rb = PrioritizedReplayBuffer(buffer_size, {"obs": {"dtype": int}})

    self.assertEqual(rb.get_next_index(), 0)
    self.assertEqual(rb.get_stored_size(), 0)

    p = Process(target=add_args,
                args=[rb, [{"obs": i, "priorities": 0} for i in range(add_size)]])
    p.start()
    p.join()

    self.assertEqual(rb.get_next_index(), add_size % buffer_size)
    self.assertEqual(rb.get_stored_size(), min(add_size, buffer_size))

    s = rb.sample(1, beta=1.0)
    one_hot = s["indexes"][0]
    rb.update_priorities([one_hot], [1e+8])

    self.assertEqual(rb.get_next_index(), add_size % buffer_size)
    self.assertEqual(rb.get_stored_size(), min(add_size, buffer_size))

    s = rb.sample(100, beta=1.0)
    self.assertTrue((s["obs"] >= 0).all())
    self.assertTrue((s["obs"] < add_size).all())

    u, counts = np.unique(s["obs"], return_counts=True)
    self.assertEqual(u[counts.argmax()], one_hot)
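# The multiprocessing tests in this section hand the buffer to a child process through
# an add_args helper that is defined elsewhere in the test module. A minimal sketch of
# what it presumably does, assuming it simply unpacks each transition dict into add():
def add_args(rb, args):
    for kwargs in args:
        rb.add(**kwargs)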
def get_replay_buffer(policy, env, use_prioritized_rb=False, use_nstep_rb=False,
                      n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs['size'] = size

    # TODO(sff1019): Add on-policy behaviour
    # TODO(sff1019): Add N-step prioritized

    if len(obs_shape) == 3:
        kwargs['env_dict']['obs']['dtype'] = np.ubyte
        kwargs['env_dict']['next_obs']['dtype'] = np.ubyte

    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
def __init__(self, args):
    # self.memory = deque(maxlen=args.buffer_size)
    self.memory = PrioritizedReplayBuffer(
        args.buffer_size, {
            "obs": {"shape": (64, 64, 6)},
            "act": {},
            "rew": {},
            "next_obs": {"shape": (64, 64, 6)},
            "terminal": {}
        })
    # self.priority = deque(maxlen=args.buffer_size)
    self.length = 0
    self.args = args
def test_PrioritizedReplayBuffer_with_single_step_with_priorities(self):
    buffer_size = 256
    obs_shape = (3, 4)
    batch_size = 10

    rb = PrioritizedReplayBuffer(buffer_size, {"obs": {"shape": obs_shape}})

    v = {"obs": np.ones(shape=obs_shape), "priorities": 0.5}

    rb.add(**v)
    rb.sample(batch_size)

    for _ in range(100):
        rb.add(**v)
    rb.sample(batch_size)
def test_PrioritizedReplayBuffer_with_multiple_steps(self):
    buffer_size = 256
    obs_shape = (3, 4)
    step_size = 32
    batch_size = 10

    rb = PrioritizedReplayBuffer(buffer_size, {"obs": {"shape": obs_shape}})

    v = {"obs": np.ones(shape=(step_size, *obs_shape))}

    rb.add(**v)
    rb.sample(batch_size)

    for _ in range(100):
        rb.add(**v)
    rb.sample(batch_size)
def test_cpdef_super(self):
    buffer_size = 256
    obs_dim = 15
    act_dim = 3

    prb = PrioritizedReplayBuffer(buffer_size,
                                  {"obs": {"shape": obs_dim},
                                   "act": {"shape": act_dim},
                                   "rew": {},
                                   "next_obs": {"shape": obs_dim},
                                   "done": {}})
    prb.clear()
def test_prioritized_nstep(self):
    rb = PrioritizedReplayBuffer(32,
                                 {"obs": {"shape": (16, 16)},
                                  'rew': {},
                                  'done': {}},
                                 next_of="obs",
                                 stack_compress="obs",
                                 Nstep={"size": 4, "rew": "rew"})

    self.assertIs(rb.add(obs=(np.ones((16, 16))),
                         next_obs=(np.ones((16, 16))),
                         rew=1,
                         done=0),
                  None)
    self.assertIs(rb.add(obs=(np.ones((16, 16))),
                         next_obs=(np.ones((16, 16))),
                         rew=1,
                         done=0),
                  None)
    self.assertIs(rb.add(obs=(np.ones((16, 16))),
                         next_obs=(np.ones((16, 16))),
                         rew=1,
                         done=0),
                  None)
    self.assertEqual(rb.add(obs=(np.ones((16, 16))),
                            next_obs=(np.ones((16, 16))),
                            rew=1,
                            done=0),
                     0)
def test_buffer_size(self):
    buffer_size = 1000
    obs_dim = 3
    act_dim = 1

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_dim},
                       "act": {"shape": act_dim},
                       "rew": {},
                       "next_obs": {"shape": obs_dim},
                       "done": {}})
    prb = PrioritizedReplayBuffer(buffer_size,
                                  {"obs": {"shape": obs_dim},
                                   "act": {"shape": act_dim},
                                   "rew": {},
                                   "next_obs": {"shape": obs_dim},
                                   "done": {}})

    self.assertEqual(1000, rb.get_buffer_size())
    self.assertEqual(1000, prb.get_buffer_size())

    rb._encode_sample([i for i in range(1000)])
def test_multi_processing(self):
    buffer_size = 256

    rb = PrioritizedReplayBuffer(buffer_size, {"obs": {"dtype": int}})

    self.assertEqual(rb.get_next_index(), 0)
    self.assertEqual(rb.get_stored_size(), 0)

    p = Process(target=add_args,
                args=[rb, [{"obs": i, "priority": 0.5} for i in range(10)]])
    p.start()
    p.join()

    self.assertEqual(rb.get_next_index(), 10)
    self.assertEqual(rb.get_stored_size(), 10)

    s = rb.get_all_transitions()
    np.testing.assert_allclose(s["obs"].ravel(), np.arange(10, dtype=int))
def test_read_only_priority(self):
    buffer_size = 100
    batch_size = 32

    env_dict = {"done": {}}
    done = np.zeros(2)

    ps = np.ones_like(done)
    ps.setflags(write=False)

    rb = PrioritizedReplayBuffer(buffer_size, env_dict)
    rb.add(done=done, priority=ps)

    sample = rb.sample(batch_size)
    ps2 = sample["weights"]
    ps2.setflags(write=False)

    rb.update_priorities(sample["indexes"], ps2)
def get_replay_buffer(obs_shape, action_dim, buffer_size=int(1e6),
                      use_prioritized=False):
    env_dict = {
        "obs": {"shape": obs_shape},
        "act": {"shape": action_dim},
        "rew": {},
        "next_obs": {"shape": obs_shape},
        "done": {}
    }
    rb = PrioritizedReplayBuffer(buffer_size, env_dict) if use_prioritized \
        else ReplayBuffer(buffer_size, env_dict)
    return rb
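# A minimal usage sketch for the factory above, assuming the standard cpprb sampling
# API: sample(batch_size, beta) returns "weights" and "indexes" for a prioritized
# buffer, and update_priorities() feeds the new priorities back. The TD errors below
# are placeholders, not part of the original snippet.
import numpy as np

rb = get_replay_buffer(obs_shape=(4,), action_dim=1, use_prioritized=True)

for _ in range(100):
    rb.add(obs=np.zeros(4), act=0, rew=1.0, next_obs=np.zeros(4), done=0)

sample = rb.sample(32, beta=0.4)        # importance-sampling weights in sample["weights"]
td_error = np.random.rand(32)           # placeholder for |Q_target - Q|
rb.update_priorities(sample["indexes"], td_error + 1e-6)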
def test_per_train(self):
    """
    Run train function with PER
    """
    rb = PrioritizedReplayBuffer(32,
                                 {"obs": {"shape": (3, )},
                                  "act": {},
                                  "rew": {},
                                  "next_obs": {"shape": (3, )},
                                  "done": {}})

    train(rb, self.env,
          lambda obs, step, episode, is_warmup: 1.0,
          lambda kwargs, step, episode: 0.5,
          max_steps=10)
def test_sample(self):
    buffer_size = 500
    obs_shape = (84, 84, 3)
    act_dim = 4

    rb = PrioritizedReplayBuffer(buffer_size,
                                 {"obs": {"shape": obs_shape},
                                  "act": {"shape": act_dim},
                                  "rew": {},
                                  "done": {}})

    obs = np.zeros(obs_shape)
    act = np.ones(act_dim)
    rew = 1
    done = 0

    rb.add(obs=obs, act=act, rew=rew, done=done)

    ps = 1.5
    rb.add(obs=obs, act=act, rew=rew, done=done, priorities=ps)
    self.assertAlmostEqual(rb.get_max_priority(), 1.5)

    obs = np.stack((obs, obs))
    act = np.stack((act, act))
    rew = (1, 0)
    done = (0.0, 1.0)

    rb.add(obs=obs, act=act, rew=rew, done=done)

    ps = (0.2, 0.4)
    rb.add(obs=obs, act=act, rew=rew, done=done, priorities=ps)

    sample = rb.sample(64)
    w = sample["weights"]
    i = sample["indexes"]

    rb.update_priorities(i, w * w)
def test_add(self):
    buffer_size = 500
    obs_shape = (84, 84, 3)
    act_dim = 10

    rb = PrioritizedReplayBuffer(buffer_size,
                                 {"obs": {"shape": obs_shape},
                                  "act": {"shape": act_dim},
                                  "rew": {},
                                  "done": {}})

    obs = np.zeros(obs_shape)
    act = np.ones(act_dim)
    rew = 1
    done = 0

    rb.add(obs=obs, act=act, rew=rew, done=done)

    ps = 1.5
    rb.add(obs=obs, act=act, rew=rew, done=done, priorities=ps)
    self.assertAlmostEqual(rb.get_max_priority(), 1.5)

    obs = np.stack((obs, obs))
    act = np.stack((act, act))
    rew = (1, 0)
    done = (0.0, 1.0)

    rb.add(obs=obs, act=act, rew=rew, done=done)

    ps = (0.2, 0.4)
    rb.add(obs=obs, act=act, rew=rew, done=done, priorities=ps)

    rb.clear()
    self.assertEqual(rb.get_next_index(), 0)
    self.assertEqual(rb.get_stored_size(), 0)
def get_replay_buffer(policy, env, use_prioritized_rb, use_nstep_rb, n_step):
    if policy is None or env is None:
        return None

    kwargs = {
        "obs_shape": get_space_size(env.observation_space),
        "act_dim": get_space_size(env.action_space),
        "size": policy.update_interval
    }

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        return ReplayBuffer(**kwargs)

    # off-policy policy
    kwargs["size"] = policy.memory_capacity

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepPrioritizedReplayBuffer(**kwargs)

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepReplayBuffer(**kwargs)

    if isinstance(kwargs["act_dim"], tuple):
        kwargs["act_dim"] = kwargs["act_dim"][0]

    return ReplayBuffer(**kwargs)
def test_per_without_TD(self):
    """
    Run train function with PER without TD

    Raise TypeError
    """
    rb = PrioritizedReplayBuffer(32,
                                 {"obs": {"shape": (3, )},
                                  "act": {},
                                  "rew": {},
                                  "next_obs": {"shape": (3, )},
                                  "done": {}})

    with self.assertRaises(TypeError):
        train(rb, self.env,
              lambda obs, step, episode, is_warmup: 1.0,
              lambda kwargs, step, episode: None,
              max_steps=10)
class RainbowAgent:
    """Agent interacting with environment.

    Attribute:
        env (gym.Env): openAI Gym environment
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """

    def __init__(
        self,
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10,
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524,  # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174,  # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",
    ):
        """Initialization.

        Args:
            env (gym.Env): openAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        # NoisyNet: All attributes related to epsilon are removed

        # produces a unique timestamp for each run
        run_timestamp = str(
            # returns number of day and number of month
            str(time.localtime(time.time())[2]) + "_" +
            str(time.localtime(time.time())[1]) + "_" +
            # returns hour, minute and second
            str(time.localtime(time.time())[3]) + "_" +
            str(time.localtime(time.time())[4]) + "_" +
            str(time.localtime(time.time())[5])
        )

        # Will write scalars that can be visualized using tensorboard
        # in the directory "runLogs/timestamp"
        self.writer = SummaryWriter("runLogs/" + run_timestamp)

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        print(self.device)

        # PER
        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha
        )

        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(
            self.v_min, self.v_max, self.atom_size
        ).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), 0.0001)

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        # self.tensorboard = RainbowTensorBoard(
        #     log_dir="single_joint_logs/{}-{}".format(
        #         model_name,
        #         datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
        #     )
        # )

        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(
            torch.FloatTensor(state).to(self.device)
        ).argmax()
        selected_action = selected_action.detach().cpu().numpy()

        if not self.is_test:
            self.transition = [state, selected_action]

        return selected_action

    def step(self, action: np.ndarray, score: int) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action, score)

        if not self.is_test:
            self.transition += [reward, next_state, done]

            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"],
                            self.transition)
                    )
                )
                one_step_transition = [
                    v[idx] for _, v in self.memory_n.get_all_transitions().items()
                ] if idx else None
            # 1-step transition
            else:
                one_step_transition = self.transition

            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"],
                            one_step_transition)
                    )
                )

        return next_state, reward, done

    def update_model(self, frame_idx: int) -> torch.Tensor:
        """Update the model by gradient descent.

        shape of elementwise_loss = [128,51]
        shape of loss = ([])
        shape of weights ([128,1)]
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = torch.FloatTensor(
            samples["weights"].reshape(-1, 1)
        ).to(self.device)
        indices = samples["indexes"]
        # rospy.loginfo(samples.keys())
        # rospy.loginfo(weights.shape)
        # rospy.loginfo(indices.shape())
        # torch.save(self.dqn.state_dict(), str("checkpoint_" + str(time.time())))

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)
        self.writer.add_scalar('update_model/Lossv0', loss.detach().item(), frame_idx)

        # N-step Learning loss
        # we are gonna combine 1-step loss and n-step loss so as to
        # prevent high-variance. The original rainbow employs n-step loss only.
        if self.use_n_step:
            gamma = self.gamma ** self.n_step
            samples = {k: [v[i] for i in indices]
                       for k, v in self.memory_n.get_all_transitions().items()}
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss
            # rospy.loginfo(elementwise_loss_n_loss.shape)
            # rospy.loginfo(elementwise_loss.shape)

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        rospy.loginfo(f"{elementwise_loss}")

        self.optimizer.zero_grad()
        self.writer.add_scalar('update_model/Lossv1', loss.detach().item(), frame_idx)
        # From pytorch doc: backward() Computes the gradient of current tensor w.r.t. graph leaves.
        # self.writer.add_image("loss gradient before", loss, frame_idx)
        loss.backward()
        # self.writer.add_image("loss gradient after", loss, frame_idx)
        self.writer.add_scalar('update_model/Lossv2', loss.detach().item(), frame_idx)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        # rospy.loginfo("second")
        # rospy.loginfo(loss.shape)
        # rospy.loginfo("loss dimension = " + loss.ndim())
        # rospy.loginfo("loss = " + str(loss.detach().item()) + "type = " + str(type(loss.detach().item())))
        self.writer.add_scalar('update_model/Loss', loss.detach().item(), frame_idx)

        return loss.detach().item()

    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        losses = []
        scores = []
        score = 0

        for frame_idx in tqdm(range(1, num_frames + 1)):
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward

            # NoisyNet: removed decrease of epsilon
            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if episode ends
            if done:
                # rospy.loginfo("logging for done")
                self.writer.add_scalar('train/score', score, frame_idx)
                self.writer.add_scalar('train/final_epsilon', state[6], frame_idx)
                self.writer.add_scalar('train/epsilon_p', state[7], frame_idx)
                state = self.env.reset()
                scores.append(score)
                score = 0

            # if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                # frame_id given as argument for logging by self.writer.
                # rospy.loginfo("frame_idx= " + str(frame_idx) + "type = " + str(type(frame_idx)))
                loss = self.update_model(frame_idx)
                losses.append(loss)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update(loss)

        self.env.close()

    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward

        print("score: ", score)
        self.env.close()

        return frames

    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray],
                          gamma: float) -> torch.Tensor:
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["act"]).to(device)
        reward = torch.FloatTensor(np.array(samples["rew"]).reshape(-1, 1)).to(device)
        done = torch.FloatTensor(np.array(samples["done"]).reshape(-1, 1)).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (
                torch.linspace(
                    0, (self.batch_size - 1) * self.atom_size, self.batch_size
                ).long()
                .unsqueeze(1)
                .expand(self.batch_size, self.atom_size)
                .to(self.device)
            )

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(
                0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)
            )
            proj_dist.view(-1).index_add_(
                0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)
            )
            print(f"Next Action : {next_action}\n Next Dist : {next_dist}\n")

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)
        print(f"Proj Dist : {proj_dist}\n Dist : {dist}\n Log_p : {log_p}\n")

        if torch.isnan(elementwise_loss[0][0]):
            exit()

        return elementwise_loss

    def _target_hard_update(self, loss):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        # torch.save(self.dqn.state_dict(), str("checkpoint_" + str(time.time())))
        torch.save({
            'model_state_dict': self.dqn.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
        }, str("checkpoints/checkpoint_" + str(time.time())))
nstep = 3
# nstep = False

if nstep:
    Nstep = {"size": nstep, "rew": "rew", "next": "next_obs"}
    discount = tf.constant(gamma**nstep)
else:
    Nstep = None
    discount = tf.constant(gamma)

# Prioritized Experience Replay: https://arxiv.org/abs/1511.05952
# See https://ymd_h.gitlab.io/cpprb/features/per/
prioritized = True

if prioritized:
    rb = PrioritizedReplayBuffer(buffer_size, env_dict, Nstep=Nstep)

    # Beta linear annealing
    beta = 0.4
    beta_step = (1 - beta) / N_iteration
else:
    rb = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)


@tf.function
def Q_func(model, obs, act, act_shape):
    return tf.reduce_sum(model(obs) * tf.one_hot(act, depth=act_shape), axis=1)


@tf.function
def DQN_target_func(model, target, next_obs, rew, done, gamma, act_shape):
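# The snippet above only defines the beta annealing constants. A minimal sketch of how
# they are typically consumed inside the training loop, assuming the cpprb
# sample(batch_size, beta) / update_priorities() API; absolute_td_error and batch_size
# are hypothetical placeholders, not part of the snippet above.
beta = 0.4
for step in range(N_iteration):
    # ... interact with the environment and rb.add(...) transitions here ...
    if prioritized:
        sample = rb.sample(batch_size, beta)
        td = absolute_td_error(sample)               # |r + gamma * max Q' - Q|
        rb.update_priorities(sample["indexes"], td)  # feed new priorities back
        beta = min(1.0, beta + beta_step)            # linear annealing toward 1
    else:
        sample = rb.sample(batch_size)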
}, "act": { "shape": act_shape }, "next_obs": { "shape": obs_shape }, "rew": {}, "done": {} } # Initialize Replay Buffer rb = RB(buffer_size, env_dict) # Initialize Prioritized Replay Buffer prb = PRB(buffer_size, env_dict, alpha=alpha) # Initalize Reverb Server server = reverb.Server(tables=[ reverb.Table(name='ReplayBuffer', sampler=reverb.selectors.Uniform(), remover=reverb.selectors.Fifo(), max_size=buffer_size, rate_limiter=reverb.rate_limiters.MinSize(1)), reverb.Table(name='PrioritizedReplayBuffer', sampler=reverb.selectors.Prioritized(alpha), remover=reverb.selectors.Fifo(), max_size=buffer_size, rate_limiter=reverb.rate_limiters.MinSize(1)) ])
eps_tracker = ptan.actions.EpsilonTracker(
    selector, params.eps_start, params.eps_final, params.eps_frames*args.envs)
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, params.gamma, steps_count=args.steps)

env_dict = {'state': {'shape': shape, 'dtype': np.uint8},
            'action': {'dtype': np.int8},
            'reward': {},
            'last_state': {'shape': shape, 'dtype': np.uint8},
            'done': {'dtype': np.bool}
            }
step = (TGT_BETA - BETA)/END_BETA_FRAME
buffer = PrioritizedReplayBuffer(params.buffer_size, env_dict) if args.priority else \
    ReplayBuffer(params.buffer_size, env_dict=env_dict)

folder, sub_folder, log_dir = utils.writerDir(envs[0], args.steps)
comment = "".join(
    [envs[0].game, '_', str(args.steps), '_', str(args.envs)])
writer = SummaryWriter(comment=comment)

optimizer = torch.optim.Adam(net.parameters(), lr=params.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                       factor=0.75,
                                                       patience=20000,
                                                       cooldown=20000,
                                                       verbose=True,
                                                       min_lr=params.min_lr)

mean = None
best_reward = -float('inf')
st = datetime.now()
print(net)
class ReplayMemory():
    def __init__(self, args, capacity, env):
        # Initial importance sampling weight β, annealed to 1 over course of training
        self.priority_weight = args.priority_weight
        self.n = args.multi_step
        self.device = args.device

        if args.mmap:
            os.makedirs('memories/', exist_ok=True)
            mmap_prefix = 'memories/mm'
        else:
            mmap_prefix = None

        self.buffer = PrioritizedReplayBuffer(
            capacity,
            {
                "obs": {
                    "shape": env.observation_space.shape,
                    "dtype": env.observation_space.dtype
                },
                "next_obs": {
                    "shape": env.observation_space.shape,
                    "dtype": env.observation_space.dtype
                },
                "act": {
                    "shape": 1,
                    "dtype": env.action_space.dtype
                },
                "rew": {
                    "dtype": np.float32
                },
                "done": {
                    "dtype": np.uint8
                },
            },
            Nstep={
                "size": self.n,
                "gamma": args.discount,
                "rew": "rew",
                "next": "next_obs",
            },
            mmap_prefix=mmap_prefix,
            alpha=args.priority_exponent,
            # next_of="obs",
            # stack_compress="obs",
        )

    def append(self, state, next_state, action, reward, done):
        self.buffer.add(
            **{
                "obs": state,
                "next_obs": next_state,
                "act": action,
                "rew": reward,
                "done": done,
            })

    def sample(self, size):
        s = self.buffer.sample(size, self.priority_weight)
        s['indexes'] = s['indexes'].astype(np.int32)
        return torchify((s['indexes'], torch.int32),
                        (s['obs'], torch.float32),
                        (np.squeeze(s['act'], 1), torch.long),
                        (np.squeeze(s['rew'], 1), torch.float32),
                        (s['next_obs'], torch.float32),
                        (s['done'], torch.bool),
                        (s['weights'], torch.float32),
                        device=self.device)

    def update_priorities(self, indexes, new_priorities):
        indexes = indexes.cpu().numpy()
        self.buffer.update_priorities(indexes, new_priorities)
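# A brief usage sketch for the wrapper above. The args fields (priority_weight,
# multi_step, device, mmap, discount, priority_exponent) and the torchify helper come
# from the surrounding project; the loop below is illustrative only.
memory = ReplayMemory(args, capacity=int(1e5), env=env)

state = env.reset()
for _ in range(1000):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    memory.append(state, next_state, action, reward, done)
    state = env.reset() if done else next_state

batch = memory.sample(32)   # tensors on args.device, in the order passed to torchify
# after computing TD errors on the batch:
# memory.update_priorities(batch_indexes, td_errors.abs() + 1e-6)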
class Agent:
    def __init__(self, lr, state_shape, num_actions, batch_size,
                 max_mem_size=100000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)
        self.importance_exp = Lerper(start=0.4, end=1.0, num_steps=100000)
        self.priority_exp = 0.6

        self.memory = PrioritizedReplayBuffer(max_mem_size,
                                              {
                                                  "obs": {"shape": state_shape},
                                                  "act": {"shape": 1},
                                                  "rew": {},
                                                  "next_obs": {"shape": state_shape},
                                                  "done": {"shape": 1}
                                              },
                                              alpha=self.priority_exp)

        self.net = Network(lr, state_shape, num_actions)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.net.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state,
                        act=action,
                        rew=reward,
                        next_obs=next_state,
                        done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size, self.importance_exp.value())

        states = torch.tensor(batch["obs"]).to(self.net.device)
        actions = torch.tensor(batch["act"], dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.net.device).T[0]
        states_ = torch.tensor(batch["next_obs"]).to(self.net.device)
        dones = torch.tensor(batch["done"], dtype=torch.bool).to(self.net.device).T[0]
        weights = torch.tensor(batch["weights"]).to(self.net.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = ((td**2.0) * weights).mean()
        loss.backward()
        self.net.optimizer.step()

        new_priorities = (td.abs()).detach().cpu()
        self.memory.update_priorities(batch["indexes"], new_priorities)

        self.epsilon.step()
        self.importance_exp.step()
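# The agent above leans on a Lerper helper for epsilon and importance-sampling
# annealing that is not shown in this snippet. The class name and the value()/step()
# interface are taken from the calls above; the linear-annealing implementation below
# is an assumption.
class Lerper:
    def __init__(self, start: float, end: float, num_steps: int):
        self.start = start
        self.end = end
        self.num_steps = num_steps
        self.count = 0

    def value(self) -> float:
        # linear interpolation from start to end, clamped after num_steps
        t = min(self.count / self.num_steps, 1.0)
        return self.start + t * (self.end - self.start)

    def step(self) -> None:
        self.count += 1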
"done": {} } # Initialize Replay Buffer brb = bRB(buffer_size) rrb = rRB(buffer_size) rrb._num_sampled = 0 # Fix: https://github.com/ray-project/ray/issues/14818 crb = cRB(buffer_size) rb = RB(buffer_size, env_dict) # Initialize Prioritized Replay Buffer bprb = bPRB(buffer_size, alpha=alpha) rprb = rPRB(buffer_size, alpha=alpha) cprb = cPRB(buffer_size, alpha=alpha, beta0=beta, betasteps=None) prb = PRB(buffer_size, env_dict, alpha=alpha) # Helper Function def env(n): e = { "obs": np.ones((n, obs_shape)), "act": np.zeros((n, act_shape)), "next_obs": np.ones((n, obs_shape)), "rew": np.zeros(n), "done": np.zeros(n) } return e def add_b(_rb):
def test_raise_imcompatible_priority_shape(self):
    rb = PrioritizedReplayBuffer(32, env_dict={'a': {'shape': 1}})

    with self.assertRaises(ValueError):
        rb.add(a=np.ones(5), priorities=np.ones(3))
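# For contrast, a minimal sketch of the matching-shape case that does not raise,
# under the same buffer layout: adding five transitions in one call requires five
# priorities.
rb = PrioritizedReplayBuffer(32, env_dict={'a': {'shape': 1}})
rb.add(a=np.ones(5), priorities=np.ones(5))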