from typing import Dict, Sequence

import numpy as np
from cpprb import ReplayBuffer as CPPRB  # cpprb's ReplayBuffer, assumed imported under the CPPRB alias


class ReplayBuffer:
    def __init__(self, size, env_dict, n_step_dict=None,
                 min_storage=10000, done_string="done"):
        super().__init__()
        self.done_string = done_string
        self.min_storage = min_storage
        cpprb_args = {
            "size": size,
            "env_dict": env_dict,
            "Nstep": n_step_dict
        }
        self.buffer = CPPRB(**cpprb_args)

    def add(self, data: Sequence[Dict[str, np.ndarray]]) -> None:
        for d in data:
            self.buffer.add(**d)
            if d[self.done_string]:
                self.buffer.on_episode_end()

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"stored sample {self.buffer.get_stored_size()} is smaller than minimum storage "
                + f"size {self.min_storage}. Returning None."
            )
            return None
        else:
            return self.buffer.sample(size)
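# --- A minimal usage sketch of the wrapper above (not from the source). ---
# The env_dict layout below is hypothetical (CartPole-like), and min_storage
# is lowered so that sample() returns data immediately.
env_dict = {"obs": {"shape": 4}, "act": {}, "rew": {},
            "next_obs": {"shape": 4}, "done": {}}
rb = ReplayBuffer(size=int(1e5), env_dict=env_dict, min_storage=1)

# add() expects a sequence of per-step dictionaries and calls
# on_episode_end() automatically whenever "done" is truthy.
rb.add([{"obs": np.zeros(4), "act": 0, "rew": 1.0,
         "next_obs": np.ones(4), "done": 0},
        {"obs": np.ones(4), "act": 1, "rew": 1.0,
         "next_obs": np.zeros(4), "done": 1}])

batch = rb.sample(2)  # dict of np.ndarray, or None while under min_storage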
def test_load_Nstep(self):
    """
    Load Nstep transitions
    """
    buffer_size = 10
    env_dict = {"done": {}}
    Nstep = {"size": 3, "gamma": 0.99}

    rb1 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
    rb2 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
    rb3 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)

    d = [0, 0, 0, 0, 1]
    rb1.add(done=d)
    rb1.on_episode_end()

    fname = "Nstep.npz"
    rb1.save_transitions(fname)
    rb2.load_transitions(fname)
    rb3.load_transitions(v(1, fname))

    t1 = rb1.get_all_transitions()
    t2 = rb2.get_all_transitions()
    t3 = rb3.get_all_transitions()

    np.testing.assert_allclose(t1["done"], t2["done"])
    np.testing.assert_allclose(t1["done"], t3["done"])
def test_Nstep_incompatibility(self):
    """
    Raise ValueError when Nstep settings are incompatible
    """
    buffer_size = 10
    env_dict = {"done": {}}
    Nstep = {"size": 3, "gamma": 0.99}

    rb1 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
    rb2 = ReplayBuffer(buffer_size, env_dict)
    rb3 = ReplayBuffer(buffer_size, env_dict)

    d = [0, 0, 0, 0, 1]
    rb1.add(done=d)
    rb1.on_episode_end()

    fname = "Nstep_raise.npz"
    rb1.save_transitions(fname)

    with self.assertRaises(ValueError):
        rb2.load_transitions(fname)

    with self.assertRaises(ValueError):
        rb3.load_transitions(v(1, fname))
def test_stack_compress(self):
    bsize = 10
    odim = 2
    ssize = 2

    rb = ReplayBuffer(bsize, {"a": {"shape": (odim, ssize)}},
                      stack_compress="a")

    a = np.random.rand(odim, bsize + ssize - 1)

    for i in range(bsize):
        rb.add(a=a[:, i:i + ssize])

    _a = rb.get_all_transitions()["a"]
    for i in range(bsize):
        with self.subTest(i=i, label="without cache"):
            np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

    for i in range(bsize):
        rb._encode_sample([i])

    rb.clear()

    for i in range(bsize):
        rb.add(a=a[:, i:i + ssize])
    rb.on_episode_end()

    _a = rb.get_all_transitions()["a"]
    for i in range(bsize):
        with self.subTest(i=i, label="with cache"):
            np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

    for i in range(bsize):
        rb._encode_sample([i])
def test_has_next_of(self):
    bsize = 10

    rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")

    a = np.random.rand(bsize + 1)

    for i in range(bsize):
        rb.add(a=a[i], next_a=a[i + 1])

    _next_a = np.ravel(rb.get_all_transitions()["next_a"])
    np.testing.assert_allclose(_next_a, a[1:bsize + 1])

    for i in range(bsize):
        rb._encode_sample([i])

    rb.clear()

    for i in range(bsize):
        rb.add(a=a[i], next_a=a[i + 1])
    rb.on_episode_end()

    _next_a = np.ravel(rb.get_all_transitions()["next_a"])
    np.testing.assert_allclose(_next_a, a[1:bsize + 1])

    for i in range(bsize):
        rb._encode_sample([i])
def explorer(global_rb, env_dict, is_training_done, queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size, env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")
    obs = env.reset()

    while not is_training_done.is_set():
        # Pull the latest weights from the learner, if any
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs, act=action, rew=reward,
                     next_obs=next_obs, done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        # When the local buffer is full, ship its transitions (with
        # priorities) to the global buffer and start over
        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample, priorities=absTD)
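# --- A sketch (not from the source) of launching explorer workers. ---
# It assumes cpprb's process-safe MPPrioritizedReplayBuffer as the shared
# global_rb; the learner side is only indicated by comments. In practice each
# explorer usually gets its own weight queue.
from multiprocessing import Event, Process, Queue

from cpprb import MPPrioritizedReplayBuffer

env_dict = {"obs": {"shape": 4}, "act": {}, "rew": {},
            "next_obs": {"shape": 4}, "done": {}}
global_rb = MPPrioritizedReplayBuffer(int(1e6), env_dict)

is_training_done = Event()
queue = Queue()

explorers = [Process(target=explorer,
                     args=(global_rb, env_dict, is_training_done, queue))
             for _ in range(4)]
for p in explorers:
    p.start()

# ... learner loop: sample from global_rb, update priorities and the model,
# push fresh weights into `queue`, then call is_training_done.set() ...

for p in explorers:
    p.join()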
def test_cache_next_of(self):
    stack_size = 3
    episode_len = 5

    rb = ReplayBuffer(32, {"obs": {"shape": (stack_size,), "dtype": int}},
                      next_of="obs", stack_compress="obs")

    obs = np.arange(episode_len + stack_size + 2, dtype=int)
    # [0, 1, ..., episode_len + stack_size + 1]
    obs2 = obs + 3 * episode_len
    # [3 * episode_len, ..., 4 * episode_len + stack_size + 1]

    # Add 1st episode
    for i in range(episode_len):
        rb.add(obs=obs[i:i + stack_size],
               next_obs=obs[i + 1:i + 1 + stack_size])

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len)
    for i in range(episode_len):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Reset environment
    rb.on_episode_end()

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len)
    for i in range(episode_len):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Add 2nd episode
    for i in range(episode_len):
        rb.add(obs=obs2[i:i + stack_size],
               next_obs=obs2[i + 1:i + 1 + stack_size])

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), 2 * episode_len)
    for i in range(episode_len):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])
    for i in range(episode_len):
        with self.subTest(i=i + episode_len):
            np.testing.assert_equal(s["obs"][i + episode_len],
                                    obs2[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i + episode_len],
                                    obs2[i + 1:i + 1 + stack_size])
def test_next_obs(self):
    buffer_size = 32
    nstep = 4
    gamma = 0.99

    rb = ReplayBuffer(buffer_size,
                      {"next_obs": {}, "done": {}},
                      Nstep={"size": nstep,
                             "gamma": gamma,
                             "next": "next_obs"})

    rb.add(next_obs=1, done=0)
    rb.add(next_obs=2, done=0)
    rb.add(next_obs=3, done=0)
    rb.add(next_obs=4, done=0)
    rb.add(next_obs=5, done=0)
    np.testing.assert_allclose(rb.get_all_transitions()["next_obs"],
                               np.asarray([[4], [5]]))

    rb.add(next_obs=6, done=1)
    rb.on_episode_end()
    sample = rb.get_all_transitions()
    np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                               np.asarray([4, 5, 6]))

    rb.add(next_obs=7, done=0)
    rb.add(next_obs=8, done=0)
    rb.add(next_obs=9, done=0)
    rb.add(next_obs=10, done=1)
    rb.on_episode_end()
    sample = rb.get_all_transitions()
    np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                               np.asarray([4, 5, 6, 10]))
def run_policy(env, get_action, max_ep_len=None, num_episodes=100,
               render=True, record=False, record_project='benchmarking',
               record_name='trained', data_path='', config_name='test',
               max_len_rb=100, benchmark=False, log_prefix=''):

    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    logger = EpochLogger()
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    ep_cost = 0
    local_steps_per_epoch = int(4000 / num_procs())

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    rew_mov_avg_10 = []
    cost_mov_avg_10 = []

    if benchmark:
        ep_costs = []
        ep_rewards = []

    if record:
        wandb.login()
        # 4 million env interactions
        wandb.init(project=record_project, name=record_name)

        rb = ReplayBuffer(size=10000,
                          env_dict={"obs": {"shape": obs_dim},
                                    "act": {"shape": act_dim},
                                    "rew": {},
                                    "next_obs": {"shape": obs_dim},
                                    "done": {}})

        # columns = ['observation', 'action', 'reward', 'cost', 'done']
        # sim_data = pd.DataFrame(index=[0], columns=columns)

    while n < num_episodes:
        if render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        next_o, r, d, info = env.step(a)

        if record:
            # buf.store(next_o, a, r, None, info['cost'], None, None, None)
            done_int = int(d)
            rb.add(obs=o, act=a, rew=r, next_obs=next_o, done=done_int)

        ep_ret += r
        ep_len += 1
        ep_cost += info['cost']

        # Important!
        o = next_o

        if d or (ep_len == max_ep_len):
            # finish recording and save csv
            if record:
                rb.on_episode_end()

                # make directory if it does not exist
                if not os.path.exists(data_path + config_name + '_episodes'):
                    os.makedirs(data_path + config_name + '_episodes')

                # buf = CostPOBuffer(obs_dim, act_dim, local_steps_per_epoch, 0.99, 0.99)

                if len(rew_mov_avg_10) >= 25:
                    rew_mov_avg_10.pop(0)
                    cost_mov_avg_10.pop(0)
                rew_mov_avg_10.append(ep_ret)
                cost_mov_avg_10.append(ep_cost)

                mov_avg_ret = np.mean(rew_mov_avg_10)
                mov_avg_cost = np.mean(cost_mov_avg_10)

                expert_metrics = {log_prefix + 'episode return': ep_ret,
                                  log_prefix + 'episode cost': ep_cost,
                                  # 'cumulative return': cum_ret,
                                  # 'cumulative cost': cum_cost,
                                  log_prefix + '25ep mov avg return': mov_avg_ret,
                                  log_prefix + '25ep mov avg cost': mov_avg_cost}

                if benchmark:
                    ep_rewards.append(ep_ret)
                    ep_costs.append(ep_cost)

                wandb.log(expert_metrics)

            logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost)
            print('Episode %d \t EpRet %.3f \t EpLen %d \t EpCost %d' %
                  (n, ep_ret, ep_len, ep_cost))
            o, r, d, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()

    if record:
        print("saving final buffer")
        bufname_pk = (data_path + config_name + '_episodes/sim_data_'
                      + str(int(num_episodes)) + '_buffer.pkl')
        with open(bufname_pk, 'wb') as file_pi:
            pickle.dump(rb.get_all_transitions(), file_pi)
        wandb.finish()
        return rb

    if benchmark:
        return ep_rewards, ep_costs
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Accumulated training cost, used below to compute the cost rate
        self.total_cost = 0

    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_cost = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, info = self._env.step(env_act)
                # print('[DEBUG] COST:', info['cost'])
                try:
                    cost = info['cost']
                except (TypeError, KeyError):
                    cost = 0

                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward
                episode_cost += cost

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps")
                        and episode_steps == self._env._max_episode_steps):
                    done_flag = False

                self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done_flag,
                                      logp=logp, val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                        "Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}".format(
                            n_episode, int(total_steps), episode_steps,
                            episode_return, episode_cost, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)

                    self.total_cost += episode_cost
                    cost_rate = self.total_cost / total_steps

                    wandb.log({'Training_Return': episode_return,
                               'Training_Cost': episode_cost,
                               'Cost_Rate': cost_rate,
                               'FPS': fps},
                              step=n_episode)

                    episode_steps = 0
                    episode_return = 0
                    episode_cost = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_cost = self.evaluate_policy(total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} "
                        "Average Cost {2: 5.4f} over {3: 2} episodes".format(
                            total_steps, avg_test_return, avg_test_cost,
                            self._test_episodes))
                    wandb.log({'Evaluation_Return': avg_test_return,
                               'Evaluation_Cost': avg_test_cost},
                              step=n_episode)
                    # wandb.log({'Evaluation_Step': total_steps})
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])

            with tf.summary.record_if(total_steps % self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        self.local_buffer.on_episode_end()
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        avg_test_cost = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            episode_cost = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else
                       np.clip(act, self._env.action_space.low,
                               self._env.action_space.high))
                next_obs, reward, done, info = self._test_env.step(act)
                try:
                    cost = info['cost']
                except (TypeError, KeyError):
                    cost = 0
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                episode_cost += cost
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}_cost_{3:010.4f}".format(
                total_steps, i, episode_return, episode_cost)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
            avg_test_cost += episode_cost
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes, avg_test_cost / self._test_episodes
def test_smaller_episode_than_stack_frame(self):
    """
    `on_episode_end()` caches stack size. When the episode length is
    smaller than the stack size, `on_episode_end()` must avoid caching
    data from the previous episode.

    Since the cache does not wrap around, this bug does not happen in
    the first episode.

    Ref: https://gitlab.com/ymd_h/cpprb/-/issues/108
    Ref: https://gitlab.com/ymd_h/cpprb/-/issues/110
    """
    stack_size = 4
    episode_len1 = 5
    episode_len2 = 2

    rb = ReplayBuffer(32, {"obs": {"shape": (stack_size,), "dtype": int}},
                      next_of="obs", stack_compress="obs")

    obs = np.arange(episode_len1 + stack_size + 2, dtype=int)
    obs2 = np.arange(episode_len2 + stack_size + 2, dtype=int) + 100

    self.assertEqual(rb.get_current_episode_len(), 0)

    # Add 1st episode
    for i in range(episode_len1):
        rb.add(obs=obs[i:i + stack_size],
               next_obs=obs[i + 1:i + 1 + stack_size])

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1)
    self.assertEqual(rb.get_current_episode_len(), episode_len1)
    for i in range(episode_len1):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Reset environment
    rb.on_episode_end()
    self.assertEqual(rb.get_current_episode_len(), 0)

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1)
    for i in range(episode_len1):
        with self.subTest(i=i):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])

    # Add 2nd episode
    for i in range(episode_len2):
        rb.add(obs=obs2[i:i + stack_size],
               next_obs=obs2[i + 1:i + 1 + stack_size])
    self.assertEqual(rb.get_current_episode_len(), episode_len2)

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
    for i in range(episode_len1):
        with self.subTest(i=i, v="obs"):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
        with self.subTest(i=i, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])
    for i in range(episode_len2):
        with self.subTest(i=i + episode_len1, v="obs"):
            np.testing.assert_equal(s["obs"][i + episode_len1],
                                    obs2[i:i + stack_size])
        with self.subTest(i=i + episode_len1, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                    obs2[i + 1:i + 1 + stack_size])

    rb.on_episode_end()
    self.assertEqual(rb.get_current_episode_len(), 0)

    s = rb.get_all_transitions()
    self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
    for i in range(episode_len1):
        with self.subTest(i=i, v="obs"):
            np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
        with self.subTest(i=i, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i],
                                    obs[i + 1:i + 1 + stack_size])
    for i in range(episode_len2):
        with self.subTest(i=i + episode_len1, v="obs"):
            np.testing.assert_equal(s["obs"][i + episode_len1],
                                    obs2[i:i + stack_size])
        with self.subTest(i=i + episode_len1, v="next_obs"):
            np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                    obs2[i + 1:i + 1 + stack_size])
def test_Nstep_discounts_with_done(self):
    buffer_size = 32
    step = 4
    gamma = 0.5

    rb = ReplayBuffer(buffer_size, {"done": {}},
                      Nstep={"size": step, "gamma": gamma})

    rb.add(done=0)
    rb.add(done=0)
    rb.add(done=0)
    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(rb.get_all_transitions()["done"],
                               np.asarray([[0], [1], [1], [1]]))

    rb.add(done=0)
    rb.add(done=0)
    rb.add(done=0)
    rb.add(done=0)
    np.testing.assert_allclose(rb.get_all_transitions()["done"],
                               np.asarray([[0], [1], [1], [1], [0]]))

    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(
        rb.get_all_transitions()["done"],
        np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1]]))

    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(
        rb.get_all_transitions()["done"],
        np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1]]))

    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(
        rb.get_all_transitions()["done"],
        np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1]]))

    rb.add(done=0)
    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(
        rb.get_all_transitions()["done"],
        np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1],
                    [1], [1]]))

    rb.add(done=0)
    rb.add(done=0)
    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(
        rb.get_all_transitions()["done"],
        np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1],
                    [1], [1], [1], [1], [1]]))

    rb.clear()
    self.assertEqual(rb.get_stored_size(), 0)

    rb.add(done=1)
    rb.on_episode_end()
    np.testing.assert_allclose(rb.get_all_transitions()["done"],
                               np.asarray([[1]]))
observation = env.reset()

# Warming up
for n_step in range(100):
    action = env.action_space.sample()  # Random Action
    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
           act=action,
           rew=reward,
           next_obs=next_observation,
           done=done)
    observation = next_observation
    if done:
        observation = env.reset()
        rb.on_episode_end()

n_episode = 0
observation = env.reset()
for n_step in range(N_iteration):
    if np.random.rand() < egreedy:
        action = env.action_space.sample()
    else:
        Q = tf.squeeze(model(observation.reshape(1, -1)))
        action = np.argmax(Q)

    egreedy = decay_egreedy(egreedy)

    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
           act=action,
           rew=reward,
           next_obs=next_observation,
           done=done)
class OnPolicyTrainer(Trainer): """ Trainer class for on-policy reinforcement learning Command Line Args: * ``--max-steps`` (int): The maximum steps for training. The default is ``int(1e6)`` * ``--episode-max-steps`` (int): The maximum steps for an episode. The default is ``int(1e3)`` * ``--n-experiments`` (int): Number of experiments. The default is ``1`` * ``--show-progress``: Call ``render`` function during training * ``--save-model-interval`` (int): Interval to save model. The default is ``int(1e4)`` * ``--save-summary-interval`` (int): Interval to save summary. The default is ``int(1e3)`` * ``--model-dir`` (str): Directory to restore model. * ``--dir-suffix`` (str): Suffix for directory that stores results. * ``--normalize-obs``: Whether normalize observation * ``--logdir`` (str): Output directory name. The default is ``"results"`` * ``--evaluate``: Whether evaluate trained model * ``--test-interval`` (int): Interval to evaluate trained model. The default is ``int(1e4)`` * ``--show-test-progress``: Call ``render`` function during evaluation. * ``--test-episodes`` (int): Number of episodes at test. The default is ``5`` * ``--save-test-path``: Save trajectories of evaluation. * ``--show-test-images``: Show input images to neural networks when an episode finishes * ``--save-test-movie``: Save rendering results. * ``--use-prioritized-rb``: Use prioritized experience replay * ``--use-nstep-rb``: Use Nstep experience replay * ``--n-step`` (int): Number of steps for nstep experience reward. The default is ``4`` * ``--logging-level`` (DEBUG, INFO, WARNING): Choose logging level. The default is ``INFO`` """ def __init__(self, *args, **kwargs): """ Initialize On-Policy Trainer Args: policy: Policy to be trained env (gym.Env): Environment for train args (Namespace or dict): config parameters specified with command line test_env (gym.Env): Environment for test. 
""" super().__init__(*args, **kwargs) def __call__(self): """ Execute training """ # Prepare buffer self.replay_buffer = get_replay_buffer(self._policy, self._env) kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon, env=self._env) kwargs_local_buf["env_dict"]["logp"] = {} kwargs_local_buf["env_dict"]["val"] = {} if is_discrete(self._env.action_space): kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32 self.local_buffer = ReplayBuffer(**kwargs_local_buf) episode_steps = 0 episode_return = 0 episode_start_time = time.time() total_steps = np.array(0, dtype=np.int32) n_epoisode = 0 obs = self._env.reset() tf.summary.experimental.set_step(total_steps) while total_steps < self._max_steps: # Collect samples for _ in range(self._policy.horizon): if self._normalize_obs: obs = self._obs_normalizer(obs, update=False) act, logp, val = self._policy.get_action_and_val(obs) if not is_discrete(self._env.action_space): env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high) else: env_act = act next_obs, reward, done, _ = self._env.step(env_act) if self._show_progress: self._env.render() episode_steps += 1 total_steps += 1 episode_return += reward done_flag = done if (hasattr(self._env, "_max_episode_steps") and episode_steps == self._env._max_episode_steps): done_flag = False self.local_buffer.add(obs=obs, act=act, next_obs=next_obs, rew=reward, done=done_flag, logp=logp, val=val) obs = next_obs if done or episode_steps == self._episode_max_steps: tf.summary.experimental.set_step(total_steps) self.finish_horizon() obs = self._env.reset() n_epoisode += 1 fps = episode_steps / (time.time() - episode_start_time) self.logger.info( "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}" .format(n_epoisode, int(total_steps), episode_steps, episode_return, fps)) tf.summary.scalar(name="Common/training_return", data=episode_return) tf.summary.scalar(name="Common/training_episode_length", data=episode_steps) tf.summary.scalar(name="Common/fps", data=fps) episode_steps = 0 episode_return = 0 episode_start_time = time.time() if total_steps % self._test_interval == 0: avg_test_return, avg_test_steps = self.evaluate_policy( total_steps) self.logger.info( "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes" .format(total_steps, avg_test_return, self._test_episodes)) tf.summary.scalar(name="Common/average_test_return", data=avg_test_return) tf.summary.scalar( name="Common/average_test_episode_length", data=avg_test_steps) self.writer.flush() if total_steps % self._save_model_interval == 0: self.checkpoint_manager.save() self.finish_horizon(last_val=val) tf.summary.experimental.set_step(total_steps) # Train actor critic if self._policy.normalize_adv: samples = self.replay_buffer.get_all_transitions() mean_adv = np.mean(samples["adv"]) std_adv = np.std(samples["adv"]) # Update normalizer if self._normalize_obs: self._obs_normalizer.experience(samples["obs"]) with tf.summary.record_if(total_steps % self._save_summary_interval == 0): for _ in range(self._policy.n_epoch): samples = self.replay_buffer._encode_sample( np.random.permutation(self._policy.horizon)) if self._normalize_obs: samples["obs"] = self._obs_normalizer(samples["obs"], update=False) if self._policy.normalize_adv: adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8) else: adv = samples["adv"] for idx in range( int(self._policy.horizon / self._policy.batch_size)): target = slice(idx * self._policy.batch_size, (idx + 1) * self._policy.batch_size) 
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        """
        Finish horizon
        """
        self.local_buffer.on_episode_end()
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        """
        Evaluate policy

        Args:
            total_steps (int): Current total steps of training
        """
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else
                       np.clip(act, self._env.action_space.low,
                               self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes, avg_test_steps / self._test_episodes