def test_has_next_of(self):
    bsize = 10
    rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")

    a = np.random.rand(bsize + 1)
    for i in range(bsize):
        rb.add(a=a[i], next_a=a[i + 1])

    _next_a = np.ravel(rb.get_all_transitions()["next_a"])
    np.testing.assert_allclose(_next_a, a[1:bsize + 1])

    for i in range(bsize):
        rb._encode_sample([i])

    rb.clear()
    for i in range(bsize):
        rb.add(a=a[i], next_a=a[i + 1])
        rb.on_episode_end()

    _next_a = np.ravel(rb.get_all_transitions()["next_a"])
    np.testing.assert_allclose(_next_a, a[1:bsize + 1])

    for i in range(bsize):
        rb._encode_sample([i])
def test_stack_compress(self):
    bsize = 10
    odim = 2
    ssize = 2
    rb = ReplayBuffer(bsize, {"a": {"shape": (odim, ssize)}},
                      stack_compress="a")

    a = np.random.rand(odim, bsize + ssize - 1)
    for i in range(bsize):
        rb.add(a=a[:, i:i + ssize])

    _a = rb.get_all_transitions()["a"]
    for i in range(bsize):
        with self.subTest(i=i, label="without cache"):
            np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

    for i in range(bsize):
        rb._encode_sample([i])

    rb.clear()
    for i in range(bsize):
        rb.add(a=a[:, i:i + ssize])
        rb.on_episode_end()

    _a = rb.get_all_transitions()["a"]
    for i in range(bsize):
        with self.subTest(i=i, label="with cache"):
            np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

    for i in range(bsize):
        rb._encode_sample([i])
def test_buffer(self):
    buffer_size = 256
    obs_shape = (15, 15)
    act_dim = 5
    N = 512

    erb = ReplayBuffer(buffer_size,
                       {"obs": {"shape": obs_shape},
                        "act": {"shape": act_dim},
                        "rew": {},
                        "next_obs": {"shape": obs_shape},
                        "done": {}})

    for i in range(N):
        obs = np.full(obs_shape, i, dtype=np.double)
        act = np.full(act_dim, i, dtype=np.double)
        rew = i
        next_obs = obs + 1
        done = 0
        erb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

    es = erb._encode_sample(range(buffer_size))
    erb.sample(32)

    erb.clear()
    self.assertEqual(erb.get_next_index(), 0)
    self.assertEqual(erb.get_stored_size(), 0)
def test_buffer_size(self):
    buffer_size = 1000
    obs_dim = 3
    act_dim = 1

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_dim},
                       "act": {"shape": act_dim},
                       "rew": {},
                       "next_obs": {"shape": obs_dim},
                       "done": {}})
    prb = PrioritizedReplayBuffer(buffer_size,
                                  {"obs": {"shape": obs_dim},
                                   "act": {"shape": act_dim},
                                   "rew": {},
                                   "next_obs": {"shape": obs_dim},
                                   "done": {}})

    self.assertEqual(1000, rb.get_buffer_size())
    self.assertEqual(1000, prb.get_buffer_size())

    rb._encode_sample([i for i in range(1000)])
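# Beyond the size checks above, the distinguishing behavior of the
# prioritized buffer is the weight/index round-trip on sampling. A minimal
# sketch of that cycle, assuming cpprb's PrioritizedReplayBuffer API
# (sample(batch_size, beta) returning "weights"/"indexes", plus
# update_priorities); this is an illustration, not one of the tests:

import numpy as np
from cpprb import PrioritizedReplayBuffer

prb = PrioritizedReplayBuffer(32, {"obs": {"shape": (3,)}, "rew": {}})
for t in range(32):
    prb.add(obs=np.full(3, t), rew=float(t))  # new items get max priority

batch = prb.sample(8, beta=0.4)   # also contains "weights" and "indexes"
td_errors = np.random.rand(8)     # stand-in for |TD error| from a learner
prb.update_priorities(batch["indexes"], td_errors + 1e-6)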
def test_next_obs(self):
    buffer_size = 256
    obs_shape = (15, 15)
    act_dim = 5

    rb = ReplayBuffer(buffer_size,
                      {"obs": {"shape": obs_shape, "dtype": np.ubyte},
                       "act": {"shape": act_dim},
                       "rew": {},
                       "done": {}},
                      next_of="obs")

    self.assertEqual(rb.get_next_index(), 0)
    self.assertEqual(rb.get_stored_size(), 0)

    obs = np.zeros(obs_shape, dtype=np.ubyte)
    act = np.ones(act_dim)
    rew = 1
    done = 0

    rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)
    self.assertEqual(rb.get_next_index(), 1)
    self.assertEqual(rb.get_stored_size(), 1)

    with self.assertRaises(KeyError):
        rb.add(obs=obs)
    self.assertEqual(rb.get_next_index(), 1)
    self.assertEqual(rb.get_stored_size(), 1)

    next_obs = rb.sample(32)["next_obs"]

    for i in range(512):
        obs = np.ones(obs_shape, dtype=np.ubyte) * i
        rb.add(obs=obs, act=act, rew=rew, next_obs=obs + 1, done=done)

    sample = rb._encode_sample(range(buffer_size))
    ith = rb.get_next_index()
    np.testing.assert_allclose(
        np.roll(sample["obs"], -ith - 1, axis=0)[1:],
        np.roll(sample["next_obs"], -ith - 1, axis=0)[:-1])
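# The np.roll assertion above is the point of next_of: the buffer keeps a
# single ring of observations and reconstructs "next_obs" as the entry one
# slot ahead, instead of storing a second full array. A minimal usage sketch
# of that behavior, assuming the same cpprb API exercised by these tests:

import numpy as np
from cpprb import ReplayBuffer

# next_of="obs" makes add() require a "next_obs" argument, stored as the
# obs ring shifted by one slot rather than as an independent array.
rb = ReplayBuffer(4, {"obs": {"shape": (2,)}, "rew": {}}, next_of="obs")
for t in range(6):  # 6 adds into 4 slots: the ring wraps around
    o = np.full(2, t, dtype=np.float64)
    rb.add(obs=o, next_obs=o + 1, rew=float(t))

batch = rb.sample(3)
assert np.allclose(batch["next_obs"], batch["obs"] + 1)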
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self._test_interval % self._policy.horizon == 0, \
            "Test interval should be divisible by policy horizon"

    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                act, logp, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done_flag,
                                      logp=logp, val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)

            # Train actor-critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer._encode_sample(
                    np.arange(self._policy.horizon))
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(int(self._policy.horizon /
                                         self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(
                            states=samples["obs"][target],
                            actions=samples["act"][target],
                            advantages=adv[target],
                            logp_olds=samples["logp"][target],
                            returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return,
                            self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()
        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        if self._normalize_obs:
            self._test_env.normalizer.set_params(
                *self._env.normalizer.get_params())
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = act if not hasattr(self._env.action_space, "high") else \
                    np.clip(act, self._env.action_space.low,
                            self._env.action_space.high)
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)
                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes
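# finish_horizon above leans on a discount_cumsum helper for both the GAE
# pass and the rewards-to-go. A minimal sketch of that helper, assuming the
# standard SciPy lfilter formulation popularized by OpenAI Spinning Up (the
# repo's own implementation may differ):

import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    """Discounted cumulative sum over axis 0.

    [x0, x1, x2] -> [x0 + d*x1 + d^2*x2,  x1 + d*x2,  x2]
    """
    return scipy.signal.lfilter([1], [1, float(-discount)],
                                x[::-1], axis=0)[::-1]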
def explorer(global_rb, queue, trained_steps, is_training_done,
             lock, env_fn, policy_fn, set_weights_fn, noise_level,
             n_env=64, n_thread=4, buffer_size=1024,
             episode_max_steps=1000, gpu=0):
    """
    Collect transitions and store them to the prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared by multiple explorers and a single
        learner. Since this object is shared over processes, guard every
        operation on it with the `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to receive the
        latest network weights. The queue is process safe, so no locking is
        needed when using it.
    :param trained_steps (multiprocessing.Value):
        Number of steps to apply gradients.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock to lock other processes.
    :param env_fn (function):
        Method object to generate an environment.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param set_weights_fn (function):
        Method object to set network weights received from the queue.
    :param noise_level (float):
        Noise level for exploration. For an epsilon-greedy policy (DQN
        variants) this is epsilon; for DDPG variants it is the variance of
        a Normal distribution.
    :param n_env (int):
        Number of environments to distribute. If larger than 1,
        `MultiThreadEnv` is used.
    :param n_thread (int):
        Number of threads used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of the local buffer. Once it is filled with transitions, they
        are pushed to `global_rb`.
    :param episode_max_steps (int):
        Maximum number of steps of an episode.
    :param gpu (int):
        GPU id. If set to -1, the process uses only the CPU.
    """
    import_tf()
    logger = logging.getLogger("tf2rl")

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn, batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env, name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level, gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)
    local_idx = np.arange(buffer_size).astype(np.int64)

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses,
                                                actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards,
                                                dones=dones)
            local_rb.add(obs=obses, act=actions, next_obs=next_obses,
                         rew=rewards, done=dones,
                         priorities=np.abs(td_errors + 1e-6))

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"], actions=samples["act"],
                    next_states=samples["next_obs"],
                    rewards=samples["rew"], dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"], act=samples["act"],
                          rew=samples["rew"], next_obs=samples["next_obs"],
                          done=samples["done"], priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = (0 if len(total_rewards) == 0 else
                           sum(total_rewards) / len(total_rewards))
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logger.info(msg)

            start = time.time()
            n_sample_old = n_sample
def explorer(global_rb, queue, trained_steps, is_training_done, lock,
             buffer_size=1024, episode_max_steps=1000, epsilon=0.5,
             transitions=None):
    tf = import_tf()
    env = _env()
    stacked_frames = deque(maxlen=4)
    policy = Agent()
    policy.epsilon = epsilon

    env_dict = {
        "obs": {"shape": state_size},
        "act": {},
        "rew": {},
        "next_obs": {"shape": state_size},
        "done": {}
    }
    local_rb = ReplayBuffer(buffer_size, env_dict=env_dict,
                            default_dtype=np.float16)
    local_idx = np.arange(buffer_size).astype(np.int64)

    s = env.reset()
    s = stack_frames(stacked_frames, s, True)
    episode_steps = 0
    total_reward = 0.
    total_rewards = []
    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        transitions.value += 1
        n_sample += 1
        episode_steps += 1
        a = policy.acting(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == episode_max_steps:
            done_flag = False
        total_reward += r
        s_ = stack_frames(stacked_frames, s_, False)

        # Keep the n-step outputs separate from the env's `done` flag so
        # the episode-reset check below still sees the env signal
        policy.n_step_buffer.append((s, a, r, s_, done_flag))
        if len(policy.n_step_buffer) == policy.n_step:
            rew_n, next_obs_n, done_n = policy.get_n_step_info(
                policy.n_step_buffer, policy.gamma)
            state, action = policy.n_step_buffer[0][:2]
            local_rb.add(obs=state, act=action, rew=rew_n,
                         next_obs=next_obs_n, done=done_n)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            s = stack_frames(stacked_frames, s, True)
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        if not queue.empty():
            set_weights_fn(policy, queue.get())

        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            samples1 = {key: value[:50] for key, value in samples.items()}
            samples2 = {key: value[50:100] for key, value in samples.items()}
            samples3 = {key: value[100:150] for key, value in samples.items()}
            samples4 = {key: value[150:200] for key, value in samples.items()}
            # Compute TD errors chunk by chunk to bound memory usage
            for chunk in [samples1, samples2, samples3, samples4]:
                td_errors = policy.compute_td_error(chunk["obs"],
                                                    chunk["act"],
                                                    chunk["rew"],
                                                    chunk["next_obs"],
                                                    chunk["done"])
                # PER priorities must be positive
                chunk['priority'] = np.abs(td_errors.numpy()) + 1e-6
            samples = {
                key: np.concatenate((value, samples2[key], samples3[key],
                                     samples4[key]))
                for key, value in samples1.items()
            }
            global_rb.add(obs=samples["obs"], act=samples["act"],
                          rew=samples["rew"], next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=samples['priority'])
            local_rb.clear()

            ave_rew = (0 if len(total_rewards) == 0 else
                       sum(total_rewards) / len(total_rewards))
            total_rewards = []
            start = time.time()
            n_sample_old = n_sample
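# get_n_step_info above is assumed to fold the deque of (s, a, r, s', done)
# tuples into a single n-step transition. A minimal sketch under that
# assumption (the helper name and tuple layout follow the calls above; this
# is not the repo's own implementation):

def get_n_step_info(n_step_buffer, gamma):
    """Fold a deque of (s, a, r, s', done) into one n-step transition.

    Walks backwards so that a terminal flag inside the window truncates
    the return and pins next_state/done to the terminating step.
    """
    reward, next_state, done = n_step_buffer[-1][-3:]
    for _, _, r, n_s, d in reversed(list(n_step_buffer)[:-1]):
        reward = r + gamma * reward * (1 - d)
        if d:
            next_state, done = n_s, d
    return reward, next_state, done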
class MeTrpoTrainer(MPCTrainer):
    """
    Trainer class for Model-Ensemble Trust-Region Policy Optimization
    (ME-TRPO): https://arxiv.org/abs/1802.10592

    Command Line Args:

        * ``--max-steps`` (int): The maximum steps for training. The default is ``int(1e6)``
        * ``--episode-max-steps`` (int): The maximum steps for an episode. The default is ``int(1e3)``
        * ``--n-experiments`` (int): Number of experiments. The default is ``1``
        * ``--show-progress``: Call ``render`` function during training
        * ``--save-model-interval`` (int): Interval to save model. The default is ``int(1e4)``
        * ``--save-summary-interval`` (int): Interval to save summary. The default is ``int(1e3)``
        * ``--model-dir`` (str): Directory to restore model.
        * ``--dir-suffix`` (str): Suffix for directory that stores results.
        * ``--normalize-obs``: Whether to normalize observations
        * ``--logdir`` (str): Output directory name. The default is ``"results"``
        * ``--evaluate``: Whether to evaluate the trained model
        * ``--test-interval`` (int): Interval to evaluate the trained model. The default is ``int(1e4)``
        * ``--show-test-progress``: Call ``render`` function during evaluation.
        * ``--test-episodes`` (int): Number of episodes at test. The default is ``5``
        * ``--save-test-path``: Save trajectories of evaluation.
        * ``--show-test-images``: Show input images to neural networks when an episode finishes
        * ``--save-test-movie``: Save rendering results.
        * ``--use-prioritized-rb``: Use prioritized experience replay
        * ``--use-nstep-rb``: Use n-step experience replay
        * ``--n-step`` (int): Number of steps for n-step experience reward. The default is ``4``
        * ``--logging-level`` (DEBUG, INFO, WARNING): Choose logging level. The default is ``INFO``
        * ``--gpu`` (int): The default is ``0``
        * ``--max-iter`` (int): Maximum iteration. The default is ``100``
        * ``--horizon`` (int): Number of steps of the online rollout horizon
        * ``--n-sample`` (int): Number of samples. The default is ``1000``
        * ``--batch-size`` (int): Batch size. The default is ``512``.
        * ``--n-collect-steps`` (int): Number of steps to collect. The default is ``100``
        * ``--debug``: Enable debug
    """

    def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
        """
        Initialize ME-TRPO

        Args:
            policy: Policy to be trained
            env (gym.Env): Environment for training
            args (Namespace or dict): config parameters specified with command line
            test_env (gym.Env): Environment for test.
            reward_fn (callable): Reward function
            buffer_size (int): The default is ``int(1e6)``
            lr (float): Learning rate for dynamics model. The default is ``0.001``.
            n_eval_episodes_per_model (int): Number of evaluation episodes per model. The default is ``5``
        """
        kwargs["n_dynamics_model"] = 5
        super().__init__(*args, **kwargs)
        self._n_eval_episodes_per_model = n_eval_episodes_per_model

        # Replay buffer to train policy
        self.replay_buffer = get_replay_buffer(self._policy, self._env)

        # Replay buffer to compute GAE
        rb_dict = {
            "size": self._episode_max_steps,
            "default_dtype": np.float32,
            "env_dict": {
                "obs": {"shape": self._env.observation_space.shape},
                "act": {"shape": self._env.action_space.shape},
                "next_obs": {"shape": self._env.observation_space.shape},
                "rew": {},
                "done": {},
                "logp": {},
                "val": {}
            }
        }
        self.local_buffer = ReplayBuffer(**rb_dict)

    def predict_next_state(self, obses, acts, idx=None):
        """
        Predict the next state

        Args:
            obses
            acts
            idx (int): Index of the dynamics model to use. If ``None``
                (default), choose one randomly.

        Returns:
            np.ndarray: next state
        """
        is_single_input = obses.ndim == acts.ndim and acts.ndim == 1
        if is_single_input:
            obses = np.expand_dims(obses, axis=0)
            acts = np.expand_dims(acts, axis=0)
        inputs = np.concatenate([obses, acts], axis=1)
        idx = np.random.randint(self._n_dynamics_model) if idx is None else idx
        obs_diffs = self._dynamics_models[idx].predict(inputs)
        if is_single_input:
            return obses[0] + obs_diffs
        return obses + obs_diffs

    def _make_inputs_output_pairs(self, n_epoch):
        samples = self.dynamics_buffer.sample(
            self.dynamics_buffer.get_stored_size())
        inputs = np.concatenate([samples["obs"], samples["act"]], axis=1)
        labels = samples["next_obs"] - samples["obs"]
        return inputs, labels

    def __call__(self):
        """
        Execute training
        """
        total_steps = 0
        tf.summary.experimental.set_step(total_steps)
        while True:
            # Collect (s, a, s') pairs in a real environment
            self.collect_transitions_real_env()
            total_steps += self._n_collect_steps
            tf.summary.experimental.set_step(total_steps)

            # Train dynamics models
            self.fit_dynamics(n_epoch=1)
            if self._debug:
                ret_real_env, ret_sim_env = self._evaluate_model()
                self.logger.info(
                    "Returns (real, sim) = ({: .3f}, {: .3f})".format(
                        ret_real_env, ret_sim_env))

            # Prepare initial states for evaluation
            init_states_for_eval = np.array([
                self._env.reset()
                for _ in range(self._n_dynamics_model *
                               self._n_eval_episodes_per_model)])

            # Returns to evaluate policy improvement
            returns_before_update = self._evaluate_current_return(
                init_states_for_eval)

            n_updates = 0
            improve_ratios = []
            while True:
                n_updates += 1
                # Generate samples using dynamics models (simulated env)
                average_return = self.collect_transitions_sim_env()
                # Update policy
                self.update_policy()
                # Evaluate policy improvement
                returns_after_update = self._evaluate_current_return(
                    init_states_for_eval)
                n_improved = np.sum(
                    returns_after_update > returns_before_update)
                improved_ratio = n_improved / (self._n_dynamics_model *
                                               self._n_eval_episodes_per_model)
                improve_ratios.append(improved_ratio)
                if improved_ratio < 0.7:
                    break
                returns_before_update = returns_after_update

            self.logger.info(
                "Training total steps: {0: 7} sim return: {1: .4f} n_update: {2:}, ratios: {3:}"
                .format(total_steps, average_return, n_updates,
                        improve_ratios))
            tf.summary.scalar(name="mpc/n_updates", data=n_updates)

            # Evaluate policy in a real environment
            if total_steps // self._n_collect_steps % 10 == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return,
                            self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)

    def _evaluate_model(self):
        ret_real_env, ret_sim_env = 0., 0.
        n_episodes = 10
        for _ in range(n_episodes):
            real_obs = self._env.reset()
            sim_obs = real_obs.copy()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(real_obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act

                next_real_obs, rew, _, _ = self._env.step(env_act)
                ret_real_env += rew
                real_obs = next_real_obs

                next_sim_obs = self.predict_next_state(sim_obs, env_act)
                ret_sim_env += self._reward_fn(real_obs, act)[0]
                sim_obs = next_sim_obs
        ret_real_env /= n_episodes
        ret_sim_env /= n_episodes
        return ret_real_env, ret_sim_env

    def update_policy(self):
        """
        Update policy
        """
        # Compute mean and std for normalizing advantage
        if self._policy.normalize_adv:
            samples = self.replay_buffer.get_all_transitions()
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])

        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer._encode_sample(
                np.random.permutation(self._policy.horizon))
            adv = ((samples["adv"] - mean_adv) / (std_adv + 1e-8)
                   if self._policy.normalize_adv else samples["adv"])
            for idx in range(int(self._policy.horizon /
                                 self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(states=samples["obs"][target],
                                   actions=samples["act"][target],
                                   advantages=adv[target],
                                   logp_olds=samples["logp"][target],
                                   returns=samples["ret"][target])

    def _evaluate_current_return(self, init_states):
        n_episodes = self._n_dynamics_model * self._n_eval_episodes_per_model
        assert init_states.shape[0] == n_episodes
        obses = init_states.copy()
        next_obses = np.zeros_like(obses)
        returns = np.zeros(shape=(n_episodes,), dtype=np.float32)
        for _ in range(self._episode_max_steps):
            acts, _ = self._policy.get_action(obses)
            for i in range(n_episodes):
                model_idx = i // self._n_eval_episodes_per_model
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(acts[i], self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = acts[i]
                next_obses[i] = self.predict_next_state(obses[i], env_act,
                                                        idx=model_idx)
            returns += self._reward_fn(obses, acts)
            obses = next_obses
        return returns

    def _visualize_current_performance(self):
        obs = self._env.reset()
        for _ in range(self._episode_max_steps):
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_obs = self.predict_next_state(obs, env_act)

            self._env.state = np.array(
                [np.arctan2(next_obs[1], next_obs[0]), next_obs[2]],
                dtype=np.float32)
            # print(obs, act, next_obs, self._env.state)
            self._env.render()
            obs = next_obs

    def collect_transitions_real_env(self):
        """
        Collect transitions from the real environment
        """
        total_steps = 0
        episode_steps = 0
        obs = self._env.reset()
        while total_steps < self._n_collect_steps:
            episode_steps += 1
            total_steps += 1
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_obs, _, done, _ = self._env.step(env_act)
            self.dynamics_buffer.add(obs=obs, act=env_act, next_obs=next_obs)
            obs = next_obs
            if done or episode_steps == self._episode_max_steps:
                episode_steps = 0
                obs = self._env.reset()

    def collect_transitions_sim_env(self):
        """
        Generate transitions using the dynamics model
        """
        self.replay_buffer.clear()
        n_episodes = 0
        ave_episode_return = 0
        while self.replay_buffer.get_stored_size() < self._policy.horizon:
            obs = self._env.reset()
            episode_return = 0.
            for _ in range(self._episode_max_steps):
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                if self._debug:
                    next_obs, rew, _, _ = self._env.step(env_act)
                else:
                    next_obs = self.predict_next_state(obs, env_act)
                    rew = self._reward_fn(obs, act)[0]
                self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=rew, done=False,
                                      logp=logp, val=val)
                obs = next_obs
                episode_return += rew
            self.finish_horizon(last_val=val)
            ave_episode_return += episode_return
            n_episodes += 1
        return ave_episode_return / n_episodes

    def finish_horizon(self, last_val=0):
        """
        TODO: This code is completely identical to the one defined in
        on_policy_trainer.py; factor it out and reuse it.
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if not hasattr(self._env.action_space, "high")
                       else np.clip(act, self._env.action_space.low,
                                    self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)
                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes

    def _set_from_args(self, args):
        super()._set_from_args(args)
        self._n_collect_steps = args.n_collect_steps
        self._debug = args.debug

    @staticmethod
    def get_argument(parser=None):
        parser = MPCTrainer.get_argument(parser)
        parser.add_argument("--n-collect-steps", type=int, default=100)
        parser.add_argument("--debug", action='store_true')
        return parser
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super(OnPolicyTrainer, self).__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0   # Steps of the current trajectory
        episode_return = 0  # Cumulative reward of the episode
        episode_start_time = time.time()
        total_steps = 0
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = np.expand_dims(obs, axis=0)
                    obs = self._obs_normalizer(obs, update=False)
                    obs = np.squeeze(obs, axis=0)
                act, logp, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") \
                        and episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done_flag,
                                      logp=logp, val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

                # Evaluate at the test interval
                if total_steps % self._test_interval == 0:
                    avg_test_return = self.evaluate_policy(total_steps)
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                # Save model parameters every `_save_model_interval` steps
                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)

            # Train actor-critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer._encode_sample(
                    np.arange(self._policy.horizon))
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._normalize_obs:
                    samples["obs"] = self._obs_normalizer(samples["obs"],
                                                          update=False)
                if self._policy.normalize_adv:
                    samples["adv"] = (samples["adv"] - mean_adv) / std_adv
                for idx in range(int(self._policy.horizon /
                                     self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=samples["adv"][target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])
        tf.summary.flush()

    # Compute GAE-Lambda; called at the end of each trajectory,
    # or when an epoch is cut off
    def finish_horizon(self, last_val=0):
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        # Temporal-difference errors [δ0, δ1, δ2, ..., δt]
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = np.expand_dims(obs, axis=0)
                    obs = self._obs_normalizer(obs, update=False)
                    obs = np.squeeze(obs, axis=0)
                act, _ = self._policy.get_action(obs, test=True)
                act = act if not hasattr(self._env.action_space, "high") else \
                    np.clip(act, self._env.action_space.low,
                            self._env.action_space.high)
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)
                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        return avg_test_return / self._test_episodes
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self._test_interval % self._policy.horizon == 0, \
            "Test interval should be divisible by policy horizon"

    def __call__(self):
        total_steps = 0
        n_episode = 0

        # TODO: clean up this code
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            n_episode, total_rewards = self._collect_sample(n_episode,
                                                            total_steps)
            total_steps += self._policy.horizon
            tf.summary.experimental.set_step(total_steps)
            if len(total_rewards) > 0:
                avg_training_return = sum(total_rewards) / len(total_rewards)
                tf.summary.scalar(name="Common/training_return",
                                  data=avg_training_return)

            # Train actor-critic
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer.sample(self._policy.horizon)
                if self._policy.normalize_adv:
                    adv = ((samples["adv"] - np.mean(samples["adv"])) /
                           np.std(samples["adv"]))
                else:
                    adv = samples["adv"]
                for idx in range(int(self._policy.horizon /
                                     self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return,
                            self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()
        tf.summary.flush()

    def _collect_sample(self, n_episode, total_steps):
        episode_steps = 0
        episode_return = 0
        episode_returns = []
        episode_start_time = time.time()
        obs = self._env.reset()
        for _ in range(self._policy.horizon):
            act, logp, val = self._policy.get_action_and_val(obs)
            # TODO: clean up this clipping
            clipped_act = act if not hasattr(self._env.action_space, "high") else \
                np.clip(act, self._env.action_space.low,
                        self._env.action_space.high)
            next_obs, reward, done, _ = self._env.step(clipped_act)
            if self._show_progress:
                self._env.render()

            episode_steps += 1
            episode_return += reward

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=logp, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                total_steps += episode_steps
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, int(total_steps), episode_steps,
                            episode_return, fps))
                tf.summary.scalar(name="Common/fps", data=fps)
                episode_returns.append(episode_return)
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()
        self.finish_horizon(last_val=val)
        return n_episode, episode_returns

    def finish_horizon(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off by
        an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from the
        whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as the
        targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended because
        the agent reached a terminal state (died), and otherwise should be
        V(s_T), the value function estimated for the last state. This
        allows us to bootstrap the reward-to-go calculation to account for
        timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            done = False
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = act if not hasattr(self._env.action_space, "high") else \
                    np.clip(act, self._env.action_space.low,
                            self._env.action_space.high)
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)
                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes
def explorer(global_rb, queue, trained_steps, is_training_done, lock,
             buffer_size=1024, episode_max_steps=1000):
    env = gym.make('CartPole-v1')
    policy = Agent()
    env_dict = {
        "obs": {"shape": (state_size,)},
        "act": {},
        "rew": {},
        "next_obs": {"shape": (state_size,)},
        "done": {}
    }
    local_rb = ReplayBuffer(buffer_size, env_dict=env_dict)
    local_idx = np.arange(buffer_size).astype(np.int64)

    s = env.reset()
    episode_steps = 0
    total_reward = 0.
    total_rewards = []
    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        n_sample += 1
        episode_steps += 1
        a = policy.acting(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == env._max_episode_steps:
            done_flag = False
        total_reward += r

        # Keep the n-step outputs separate from the env's `done` flag so
        # the episode-reset check below still sees the env signal
        policy.n_step_buffer.append((s, a, r, s_, done_flag))
        if len(policy.n_step_buffer) == policy.n_step:
            rew_n, next_obs_n, done_n = policy.get_n_step_info(
                policy.n_step_buffer, policy.gamma)
            state, action = policy.n_step_buffer[0][:2]
            local_rb.add(obs=state, act=action, rew=rew_n,
                         next_obs=next_obs_n, done=done_n)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        if not queue.empty():
            set_weights_fn(policy, queue.get())

        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            td_errors = policy.compute_td_error(samples["obs"],
                                                samples["act"],
                                                samples["rew"],
                                                samples["next_obs"],
                                                samples["done"])
            priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"], act=samples["act"],
                          rew=samples["rew"], next_obs=samples["next_obs"],
                          done=samples["done"], priorities=priorities)
            lock.release()
            local_rb.clear()

            ave_rew = (0 if len(total_rewards) == 0 else
                       sum(total_rewards) / len(total_rewards))
            total_rewards = []
            start = time.time()
            n_sample_old = n_sample
class OnPolicyTrainer(Trainer):
    """
    Trainer class for on-policy reinforcement learning

    Command Line Args:

        * ``--max-steps`` (int): The maximum steps for training. The default is ``int(1e6)``
        * ``--episode-max-steps`` (int): The maximum steps for an episode. The default is ``int(1e3)``
        * ``--n-experiments`` (int): Number of experiments. The default is ``1``
        * ``--show-progress``: Call ``render`` function during training
        * ``--save-model-interval`` (int): Interval to save model. The default is ``int(1e4)``
        * ``--save-summary-interval`` (int): Interval to save summary. The default is ``int(1e3)``
        * ``--model-dir`` (str): Directory to restore model.
        * ``--dir-suffix`` (str): Suffix for directory that stores results.
        * ``--normalize-obs``: Whether to normalize observations
        * ``--logdir`` (str): Output directory name. The default is ``"results"``
        * ``--evaluate``: Whether to evaluate the trained model
        * ``--test-interval`` (int): Interval to evaluate the trained model. The default is ``int(1e4)``
        * ``--show-test-progress``: Call ``render`` function during evaluation.
        * ``--test-episodes`` (int): Number of episodes at test. The default is ``5``
        * ``--save-test-path``: Save trajectories of evaluation.
        * ``--show-test-images``: Show input images to neural networks when an episode finishes
        * ``--save-test-movie``: Save rendering results.
        * ``--use-prioritized-rb``: Use prioritized experience replay
        * ``--use-nstep-rb``: Use n-step experience replay
        * ``--n-step`` (int): Number of steps for n-step experience reward. The default is ``4``
        * ``--logging-level`` (DEBUG, INFO, WARNING): Choose logging level. The default is ``INFO``
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize On-Policy Trainer

        Args:
            policy: Policy to be trained
            env (gym.Env): Environment for training
            args (Namespace or dict): config parameters specified with command line
            test_env (gym.Env): Environment for test.
        """
        super().__init__(*args, **kwargs)

    def __call__(self):
        """
        Execute training
        """
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, _ = self._env.step(env_act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps") and
                        episode_steps == self._env._max_episode_steps):
                    done_flag = False
                self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done_flag,
                                      logp=logp, val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/training_episode_length",
                                      data=episode_steps)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_steps = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                        .format(total_steps, avg_test_return,
                                self._test_episodes))
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    tf.summary.scalar(
                        name="Common/average_test_episode_length",
                        data=avg_test_steps)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)

            # Train actor-critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])

            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(int(self._policy.horizon /
                                         self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(
                            states=samples["obs"][target],
                            actions=samples["act"][target],
                            advantages=adv[target],
                            logp_olds=samples["logp"][target],
                            returns=samples["ret"][target])
        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        """
        Finish horizon
        """
        self.local_buffer.on_episode_end()
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"], act=samples["act"],
                               done=samples["done"], ret=rets, adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        """
        Evaluate policy

        Args:
            total_steps (int): Current total steps of training
        """
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else
                       np.clip(act, self._env.action_space.low,
                               self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                      rew=reward, done=done)
                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes, \
            avg_test_steps / self._test_episodes