def __call__(self):
    total_steps = 0
    n_episode = 0

    # TODO: clean codes
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(
        self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(
        size=self._episode_max_steps, env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        n_episode, total_rewards = self._collect_sample(n_episode, total_steps)
        total_steps += self._policy.horizon
        tf.summary.experimental.set_step(total_steps)
        if len(total_rewards) > 0:
            avg_training_return = sum(total_rewards) / len(total_rewards)
            tf.summary.scalar(
                name="Common/training_return", data=avg_training_return)

        # Train actor critic
        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer.sample(self._policy.horizon)
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(samples["adv"])
            else:
                adv = samples["adv"]
            for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(
                    states=samples["obs"][target],
                    actions=samples["act"][target],
                    advantages=adv[target],
                    logp_olds=samples["logp"][target],
                    returns=samples["ret"][target])

        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes".format(
                    total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(
                name="Common/average_test_return", data=avg_test_return)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
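# Illustrative sketch, not part of the trainer: how the horizon above is split
# into contiguous minibatches with `slice`, assuming `horizon` is an exact
# multiple of `batch_size`. The values below are made up for demonstration.
def _example_minibatch_slices(horizon=8, batch_size=4):
    data = np.arange(horizon)
    batches = []
    for idx in range(horizon // batch_size):
        target = slice(idx * batch_size, (idx + 1) * batch_size)
        batches.append(data[target])
    # With horizon=8, batch_size=4 this yields [0..3] and [4..7],
    # covering every stored transition exactly once per epoch.
    return batches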
def get_replay_buffer(policy, env, use_prioritized_rb=False,
                      use_nstep_rb=False, n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        raise NotImplementedError
        # return NstepPrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        raise NotImplementedError
        # return NstepReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
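# Illustrative usage sketch, not part of the library. It builds a buffer for a
# minimal stand-in policy object; a real agent class would be used in practice,
# and `gym.make("CartPole-v0")` is only an example environment.
def _example_get_replay_buffer():
    import gym

    class _StubPolicy:
        # Stand-in exposing only the attributes read by get_replay_buffer.
        memory_capacity = 10000
        horizon = 512
        discount = 0.99

    env = gym.make("CartPole-v0")
    # _StubPolicy is not an OffPolicyAgent subclass, so the on-policy branch is
    # taken: the buffer is sized to `horizon` and gains `logp`, `ret`, `adv` fields.
    return get_replay_buffer(_StubPolicy(), env)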
def explorer(global_rb, queue, trained_steps, is_training_done,
             lock, env_fn, policy_fn, set_weights_fn, noise_level,
             n_env=64, n_thread=4, buffer_size=1024,
             episode_max_steps=1000, gpu=0):
    """
    Collect transitions and store them to the prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared between multiple explorers and a single
        learner. This object is shared over processes, so it must be locked with
        the `lock` object before any operation on it.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to get the latest network
        weights. This is process safe, so it does not need to be locked.
    :param trained_steps (multiprocessing.Value):
        Number of steps for which gradients have been applied.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock to lock other processes.
    :param env_fn (function):
        Method object to generate an environment.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param set_weights_fn (function):
        Method object to set network weights obtained from the queue.
    :param noise_level (float):
        Noise level for exploration. For epsilon-greedy policies like DQN variants
        this is epsilon; for DDPG variants it is the variance of a Normal distribution.
    :param n_env (int):
        Number of environments to distribute. If this is more than 1,
        `MultiThreadEnv` is used.
    :param n_thread (int):
        Number of threads used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of the local buffer. When it is filled with transitions,
        they are added to `global_rb`.
    :param episode_max_steps (int):
        Maximum number of steps of an episode.
    :param gpu (int):
        GPU id. If this is set to -1, this process uses only the CPU.
    """
    import_tf()

    if n_env > 1:
        envs = MultiThreadEnv(
            env_fn=env_fn, batch_size=n_env, thread_pool=n_thread,
            max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(
        env=env, name="Explorer",
        memory_capacity=global_rb.get_buffer_size(),
        noise_level=noise_level, gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []
    else:
        obses = envs.py_reset()

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(
                states=obses, actions=actions, next_states=next_obses,
                rewards=rewards, dones=dones)
            local_rb.add(
                obs=obses, act=actions, next_obs=next_obses,
                rew=rewards, done=dones,
                # priority = |TD error| + small epsilon, matching the
                # single-env branch below
                priorities=np.abs(td_errors) + 1e-6)

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb.sample(local_rb.get_stored_size())
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"], actions=samples["act"],
                    next_states=samples["next_obs"], rewards=samples["rew"],
                    dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(
                obs=samples["obs"], act=samples["act"], rew=samples["rew"],
                next_obs=samples["next_obs"], done=samples["done"],
                priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = (0 if len(total_rewards) == 0 else
                           sum(total_rewards) / len(total_rewards))
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logging.info(msg)

            start = time.time()
            n_sample_old = n_sample
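# Illustrative sketch of how `explorer` might be launched. Assumptions:
# `global_rb`, `env_fn`, `policy_fn` and `set_weights_fn` are created elsewhere
# by the caller (e.g. by the learner-side setup code); the noise level of 0.1
# and the process count are placeholder values. Each explorer runs in its own
# process and shares the prioritized buffer, a weight queue, the lock and the
# termination event.
def _example_launch_explorers(global_rb, env_fn, policy_fn, set_weights_fn,
                              n_explorer=2):
    import multiprocessing as mp

    queues = [mp.Queue() for _ in range(n_explorer)]
    trained_steps = mp.Value("i", 0)
    is_training_done = mp.Event()
    lock = mp.Lock()

    tasks = []
    for i in range(n_explorer):
        p = mp.Process(
            target=explorer,
            args=(global_rb, queues[i], trained_steps, is_training_done,
                  lock, env_fn, policy_fn, set_weights_fn, 0.1),
            kwargs={"n_env": 1, "gpu": -1})
        p.start()
        tasks.append(p)
    return tasks, queues, is_training_done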
class OnPolicyTrainer(Trainer):
    def __call__(self):
        total_steps = 0
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        n_episode = 0
        test_step_threshold = self._test_interval

        # TODO: clean codes
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        obs = self._env.reset()
        while total_steps < self._max_steps:
            for _ in range(self._policy.horizon):
                action, log_pi, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(action)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                episode_return += reward
                total_steps += 1

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                      rew=reward, done=done_flag,
                                      logp=log_pi, val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                        "Return: {3: 5.4f} FPS: {4:5.2f}".format(
                            n_episode, int(total_steps), episode_steps,
                            episode_return, fps))
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)
            samples = self.replay_buffer.sample(self._policy.horizon)

            # Normalize advantages
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                    samples["adv"])
            else:
                adv = samples["adv"]

            for _ in range(1):
                self._policy.train_actor(
                    samples["obs"], samples["act"], adv, samples["logp"])
            # Train Critic
            for _ in range(5):
                self._policy.train_critic(samples["obs"], samples["ret"])

            if total_steps > test_step_threshold:
                test_step_threshold += self._test_interval
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                tf.summary.scalar(name="Common/fps", data=fps)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off by an
        epoch ending. This looks back in the buffer to where the trajectory
        started, and uses rewards and value estimates from the whole trajectory
        to compute advantage estimates with GAE-Lambda, as well as compute the
        rewards-to-go for each state, to use as the targets for the value
        function.

        The "last_val" argument should be 0 if the trajectory ended because the
        agent reached a terminal state (died), and otherwise should be V(s_T),
        the value function estimated for the last state. This allows us to
        bootstrap the reward-to-go calculation to account for timesteps beyond
        the arbitrary episode horizon (or epoch cutoff).
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(
                deltas, self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(
            obs=samples["obs"], act=samples["act"], done=samples["done"],
            ret=rets, adv=advs, logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            done = False
            for _ in range(self._episode_max_steps):
                action, _ = self._policy.get_action(obs, test=True)
                next_obs, reward, done, _ = self._test_env.step(action)
                if self._save_test_path:
                    replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                      rew=reward, done=done)
                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
            if self._show_test_images:
                images = tf.cast(
                    tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                    tf.uint8)
                tf.summary.image('train/input_img', images)
        return avg_test_return / self._test_episodes