def test_get_replay_buffer(self):
    # Replay Buffer
    rb = get_replay_buffer(self.on_policy_agent, self.discrete_env)
    self.assertTrue(isinstance(rb, ReplayBuffer))

    rb = get_replay_buffer(self.off_policy_agent, self.discrete_env)
    self.assertTrue(isinstance(rb, ReplayBuffer))

    # Prioritized Replay Buffer
    rb = get_replay_buffer(self.off_policy_agent, self.discrete_env,
                           use_prioritized_rb=True)
    self.assertTrue(isinstance(rb, PrioritizedReplayBuffer))
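# ---------------------------------------------------------------------------
# A minimal sketch of the `get_replay_buffer` helper exercised by the test
# above, assuming the cpprb backend used throughout this file. The attribute
# name `policy.memory_capacity` is an assumption; the real helper also
# dispatches on agent type (on-policy vs. off-policy), handles discrete
# action dtypes, and supports N-step buffers.
# ---------------------------------------------------------------------------
import numpy as np
from cpprb import ReplayBuffer, PrioritizedReplayBuffer


def get_default_rb_dict(size, env):
    # Common cpprb constructor kwargs derived from the env's spaces
    return {
        "size": size,
        "default_dtype": np.float32,
        "env_dict": {
            "obs": {"shape": env.observation_space.shape},
            "next_obs": {"shape": env.observation_space.shape},
            "act": {"shape": env.action_space.shape},
            "rew": {},
            "done": {}}}


def get_replay_buffer(policy, env, use_prioritized_rb=False, size=None):
    kwargs = get_default_rb_dict(size or policy.memory_capacity, env)
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)
    return ReplayBuffer(**kwargs)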
def evaluate_policy(self, total_steps):
    """
    Evaluate policy

    Args:
        total_steps (int): Current total steps of training
    """
    avg_test_return = 0.
    avg_test_steps = 0
    if self._save_test_path:
        replay_buffer = get_replay_buffer(self._policy,
                                          self._test_env,
                                          size=self._episode_max_steps)
    for i in range(self._test_episodes):
        episode_return = 0.
        frames = []
        obs = self._test_env.reset()
        avg_test_steps += 1
        for _ in range(self._episode_max_steps):
            if self._normalize_obs:
                obs = self._obs_normalizer(obs, update=False)
            act, _ = self._policy.get_action(obs, test=True)
            act = (act if is_discrete(self._env.action_space) else np.clip(
                act, self._env.action_space.low, self._env.action_space.high))
            next_obs, reward, done, _ = self._test_env.step(act)
            avg_test_steps += 1
            if self._save_test_path:
                replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done)
            if self._save_test_movie:
                frames.append(self._test_env.render(mode='rgb_array'))
            elif self._show_test_progress:
                self._test_env.render()
            episode_return += reward
            obs = next_obs
            if done:
                break
        prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
            total_steps, i, episode_return)
        if self._save_test_path:
            save_path(replay_buffer.sample(self._episode_max_steps),
                      os.path.join(self._output_dir, prefix + ".pkl"))
            replay_buffer.clear()
        if self._save_test_movie:
            frames_to_gif(frames, prefix, self._output_dir)
        avg_test_return += episode_return
    if self._show_test_images:
        images = tf.cast(
            tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
            tf.uint8)
        tf.summary.image('train/input_img', images)
    return avg_test_return / self._test_episodes, \
        avg_test_steps / self._test_episodes
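# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the `save_path` helper used above: it
# persists the sampled rollout (a dict of numpy arrays) to disk. This is an
# illustrative stand-in under the assumption of simple pickling; the actual
# library helper may use joblib or another serializer.
# ---------------------------------------------------------------------------
import pickle


def save_path(samples, filename):
    # Persist one evaluation rollout for later inspection
    with open(filename, "wb") as f:
        pickle.dump(samples, f)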
def test(self, total_steps):
    tf.summary.experimental.set_step(total_steps)
    if self._normalize_obs:
        self._test_env.normalizer.set_params(
            *self._env.normalizer.get_params())
    avg_test_return = 0.
    if self._save_test_path:
        replay_buffer = get_replay_buffer(self._policy,
                                          self._test_env,
                                          size=self._episode_max_steps)
    done = False
    for i in range(self._test_episodes):
        episode_return = 0.
        frames = []
        obs = self._test_env.reset()
        for _ in range(self._episode_max_steps):
            # Roll out with random actions
            action = self._test_env.action_space.sample()
            next_obs, reward, done, _ = self._test_env.step(action)
            if self._save_test_path:
                replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                  rew=reward, done=done)
            if self._save_test_movie:
                frames.append(self._test_env.render(mode='rgb_array'))
            elif self._show_test_progress:
                self._test_env.render()
            episode_return += reward
            obs = next_obs
            if done:
                break
def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
    kwargs["n_dynamics_model"] = 5
    super().__init__(*args, **kwargs)
    self._n_eval_episodes_per_model = n_eval_episodes_per_model
    # Replay buffer to train policy
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    # Replay buffer to compute GAE
    rb_dict = {
        "size": self._episode_max_steps,
        "default_dtype": np.float32,
        "env_dict": {
            "obs": {"shape": self._env.observation_space.shape},
            "act": {"shape": self._env.action_space.shape},
            "next_obs": {"shape": self._env.observation_space.shape},
            "rew": {},
            "done": {},
            "logp": {},
            "val": {}}}
    self.local_buffer = ReplayBuffer(**rb_dict)
def __call__(self):
    total_steps = 0
    n_episode = 0

    # TODO: clean codes
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        n_episode, total_rewards = self._collect_sample(n_episode,
                                                        total_steps)
        total_steps += self._policy.horizon
        tf.summary.experimental.set_step(total_steps)
        if len(total_rewards) > 0:
            avg_training_return = sum(total_rewards) / len(total_rewards)
            tf.summary.scalar(name="Common/training_return",
                              data=avg_training_return)

        # Train actor critic
        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer.sample(self._policy.horizon)
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                    samples["adv"])
            else:
                adv = samples["adv"]
            for idx in range(
                    int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(states=samples["obs"][target],
                                   actions=samples["act"][target],
                                   advantages=adv[target],
                                   logp_olds=samples["logp"][target],
                                   returns=samples["ret"][target])

        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                .format(total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def setUpClass(cls):
    cls.env = gym.make("CartPole-v0")
    policy = DQN(state_shape=cls.env.observation_space.shape,
                 action_dim=cls.env.action_space.n,
                 memory_capacity=2**4)
    cls.replay_buffer = get_replay_buffer(policy, cls.env)
    cls.output_dir = os.path.join(os.path.dirname(__file__), "tests")
    if not os.path.isdir(cls.output_dir):
        os.makedirs(cls.output_dir)
def evaluate_policy(self, total_steps):
    tf.summary.experimental.set_step(total_steps)
    if self._normalize_obs:
        self._test_env.normalizer.set_params(
            *self._env.normalizer.get_params())
    avg_test_return = 0.
    avg_test_steps = 0
    if self._save_test_path:
        replay_buffer = get_replay_buffer(self._policy,
                                          self._test_env,
                                          size=self._episode_max_steps)
    for i in range(self._test_episodes):
        episode_return = 0.
        frames = []
        obs = self._test_env.reset()
        avg_test_steps += 1
        for _ in range(self._episode_max_steps):
            action = self._policy.get_action(obs, test=True)
            next_obs, reward, done, _ = self._test_env.step(action)
            avg_test_steps += 1
            if self._save_test_path:
                replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                  rew=reward, done=done)
            if self._save_test_movie:
                frames.append(self._test_env.render(mode='rgb_array'))
            elif self._show_test_progress:
                self._test_env.render()
            episode_return += reward
            obs = next_obs
            if done:
                break
        prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
            total_steps, i, episode_return)
        if self._save_test_path:
            save_path(
                replay_buffer._encode_sample(
                    np.arange(self._episode_max_steps)),
                os.path.join(self._output_dir, prefix + ".pkl"))
            replay_buffer.clear()
        if self._save_test_movie:
            frames_to_gif(frames, prefix, self._output_dir)
        avg_test_return += episode_return
    if self._show_test_images:
        images = tf.cast(
            tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
            tf.uint8)
        tf.summary.image('train/input_img', images)
    return avg_test_return / self._test_episodes, \
        avg_test_steps / self._test_episodes
def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
    """
    Initialize ME-TRPO

    Args:
        policy: Policy to be trained
        env (gym.Env): Environment for training
        args (Namespace or dict): Config parameters specified on the
            command line
        test_env (gym.Env): Environment for test
        reward_fn (callable): Reward function
        buffer_size (int): The default is ``int(1e6)``
        lr (float): Learning rate for the dynamics model. The default is
            ``0.001``.
        n_eval_episodes_per_model (int): Number of evaluation episodes per
            model. The default is ``5``.
    """
    kwargs["n_dynamics_model"] = 5
    super().__init__(*args, **kwargs)
    self._n_eval_episodes_per_model = n_eval_episodes_per_model
    # Replay buffer to train policy
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    # Replay buffer to compute GAE
    rb_dict = {
        "size": self._episode_max_steps,
        "default_dtype": np.float32,
        "env_dict": {
            "obs": {"shape": self._env.observation_space.shape},
            "act": {"shape": self._env.action_space.shape},
            "next_obs": {"shape": self._env.observation_space.shape},
            "rew": {},
            "done": {},
            "logp": {},
            "val": {}}}
    self.local_buffer = ReplayBuffer(**rb_dict)
def __call__(self):
    """
    Execute training
    """
    if self._evaluate:
        self.evaluate_policy_continuously()

    total_steps = 0
    tf.summary.experimental.set_step(total_steps)
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.perf_counter()
    n_episode = 0

    replay_buffer = get_replay_buffer(self._policy, self._env,
                                      self._use_prioritized_rb,
                                      self._use_nstep_rb, self._n_step)

    obs = self._env.reset()

    while total_steps < self._max_steps:
        if total_steps < self._policy.n_warmup:
            action = self._env.action_space.sample()
        else:
            action = self._policy.get_action(obs)

        next_obs, reward, done, _ = self._env.step(action)
        if self._show_progress:
            self._env.render()
        episode_steps += 1
        episode_return += reward
        total_steps += 1
        tf.summary.experimental.set_step(total_steps)

        done_flag = done
        if (hasattr(self._env, "_max_episode_steps") and
                episode_steps == self._env._max_episode_steps):
            done_flag = False
        replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                          rew=reward, done=done_flag)
        obs = next_obs

        if done or episode_steps == self._episode_max_steps:
            replay_buffer.on_episode_end()
            obs = self._env.reset()

            n_episode += 1
            fps = episode_steps / (time.perf_counter() - episode_start_time)
            self.logger.info(
                "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                .format(n_episode, total_steps, episode_steps,
                        episode_return, fps))
            tf.summary.scalar(name="Common/training_return",
                              data=episode_return)
            tf.summary.scalar(name="Common/training_episode_length",
                              data=episode_steps)

            episode_steps = 0
            episode_return = 0
            episode_start_time = time.perf_counter()

        if total_steps < self._policy.n_warmup:
            continue

        if total_steps % self._policy.update_interval == 0:
            samples = replay_buffer.sample(self._policy.batch_size)
            with tf.summary.record_if(
                    total_steps % self._save_summary_interval == 0):
                self._policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float32),
                    None if not self._use_prioritized_rb
                    else samples["weights"])
            if self._use_prioritized_rb:
                td_error = self._policy.compute_td_error(
                    samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float32))
                replay_buffer.update_priorities(samples["indexes"],
                                                np.abs(td_error) + 1e-6)

        if total_steps % self._test_interval == 0:
            avg_test_return, avg_test_steps = self.evaluate_policy(
                total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                .format(total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/average_test_episode_length",
                              data=avg_test_steps)
            tf.summary.scalar(name="Common/fps", data=fps)

        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
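# ---------------------------------------------------------------------------
# Hedged sketch: how the `use_nstep_rb` / `n_step` flags consumed by the loop
# above could be wired into cpprb's built-in N-step return support. The
# `policy.discount` attribute is an assumption; cpprb then folds n-step
# discounted rewards into "rew" and shifts "next_obs" n steps ahead.
# ---------------------------------------------------------------------------
def get_nstep_rb_kwargs(policy, env, n_step):
    kwargs = get_default_rb_dict(policy.memory_capacity, env)
    kwargs["Nstep"] = {
        "size": n_step,            # horizon of the n-step return
        "gamma": policy.discount,  # discount used to fold rewards
        "rew": "rew",              # key holding the reward to accumulate
        "next": "next_obs"}        # key shifted n steps into the future
    return kwargs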
def __call__(self):
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_episode = 0
    obs = self._env.reset()

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            act, logp, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(act)
            if self._show_progress:
                self._env.render()

            episode_steps += 1
            total_steps += 1
            episode_return += reward

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=logp, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, int(total_steps), episode_steps,
                            episode_return, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/fps", data=fps)
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        # Train actor critic
        if self._policy.normalize_adv:
            samples = self.replay_buffer._encode_sample(
                np.arange(self._policy.horizon))
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
        with tf.summary.record_if(
                total_steps % self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                .format(total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
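# ---------------------------------------------------------------------------
# A sketch of the `finish_horizon` method assumed by the on-policy loops
# above: it computes GAE advantages and discounted returns for the finished
# episode slice and flushes the local buffer into the policy-training buffer.
# The `discount` and `lam` attributes on the policy are assumptions.
# ---------------------------------------------------------------------------
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Reverse-time IIR filter: y[t] = x[t] + discount * y[t + 1]
    return scipy.signal.lfilter(
        [1], [1, float(-discount)], x[::-1], axis=0)[::-1]


def finish_horizon(self, last_val=0):
    samples = self.local_buffer.get_all_transitions()
    rews = np.append(samples["rew"], last_val)
    vals = np.append(samples["val"], last_val)

    # GAE-Lambda advantage estimation over TD residuals
    deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
    advs = discount_cumsum(deltas, self._policy.discount * self._policy.lam)
    # Rewards-to-go as targets for the value function
    rets = discount_cumsum(rews, self._policy.discount)[:-1]

    self.replay_buffer.add(
        obs=samples["obs"], act=samples["act"], done=samples["done"],
        ret=rets, adv=advs, logp=np.squeeze(samples["logp"]))
    self.local_buffer.clear()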
def __call__(self):
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    episode_steps = 0
    episode_return = 0
    episode_cost = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_episode = 0
    obs = self._env.reset()

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            if self._normalize_obs:
                obs = self._obs_normalizer(obs, update=False)
            act, logp, val = self._policy.get_action_and_val(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act

            next_obs, reward, done, info = self._env.step(env_act)
            try:
                cost = info['cost']
            except (TypeError, KeyError):
                cost = 0
            if self._show_progress:
                self._env.render()

            episode_steps += 1
            total_steps += 1
            episode_return += reward
            episode_cost += cost

            done_flag = done
            if (hasattr(self._env, "_max_episode_steps") and
                    episode_steps == self._env._max_episode_steps):
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=logp, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}"
                    .format(n_episode, int(total_steps), episode_steps,
                            episode_return, episode_cost, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/fps", data=fps)

                self.total_cost += episode_cost
                cost_rate = self.total_cost / total_steps
                wandb.log({'Training_Return': episode_return,
                           'Training_Cost': episode_cost,
                           'Cost_Rate': cost_rate,
                           'FPS': fps},
                          step=n_episode)

                episode_steps = 0
                episode_return = 0
                episode_cost = 0
                episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_cost = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} Average Cost {2: 5.4f} over {3: 2} episodes"
                        .format(total_steps, avg_test_return, avg_test_cost,
                                self._test_episodes))
                    wandb.log({'Evaluation_Return': avg_test_return,
                               'Evaluation_Cost': avg_test_cost},
                              step=n_episode)
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        # Train actor critic
        if self._policy.normalize_adv:
            samples = self.replay_buffer.get_all_transitions()
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
            # Update normalizer
            if self._normalize_obs:
                self._obs_normalizer.experience(samples["obs"])
        with tf.summary.record_if(
                total_steps % self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._normalize_obs:
                    samples["obs"] = self._obs_normalizer(samples["obs"],
                                                          update=False)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

    tf.summary.flush()
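# ---------------------------------------------------------------------------
# A minimal sketch of the observation-normalizer interface used above
# (`self._obs_normalizer(obs, update=False)` and `.experience(batch)`),
# assuming a running mean/std in the style of an empirical normalizer. The
# class name and internals are illustrative, not the library's actual code.
# ---------------------------------------------------------------------------
import numpy as np


class RunningObsNormalizer:
    def __init__(self, shape, eps=1e-8):
        self._mean = np.zeros(shape, dtype=np.float32)
        self._var = np.ones(shape, dtype=np.float32)
        self._count = 0
        self._eps = eps

    def experience(self, batch):
        # Fold a batch of observations into the running statistics
        # (parallel Welford merge of mean and variance)
        batch = np.asarray(batch, dtype=np.float32)
        n = batch.shape[0]
        delta = batch.mean(axis=0) - self._mean
        total = self._count + n
        self._mean = self._mean + delta * n / total
        self._var = (self._count * self._var + n * batch.var(axis=0)
                     + delta ** 2 * self._count * n / total) / total
        self._count = total

    def __call__(self, obs, update=False):
        if update:
            self.experience(np.expand_dims(obs, axis=0))
        return (obs - self._mean) / np.sqrt(self._var + self._eps)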
def train(self):
    """Method for training an agent with Hindsight Workspace Relabeling"""
    # training mode:
    self._policy.eval_mode = False
    total_steps = 0
    tf.summary.experimental.set_step(total_steps)
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.perf_counter()
    n_episode = 0
    success_traj_train = 0.
    relabeling_times, training_times = [], []

    # Initialize replay buffer
    self._replay_buffer = get_replay_buffer(
        self._policy, self._env, self._use_prioritized_rb,
        self._use_nstep_rb, self._n_step)

    # resetting:
    self.trajectory = []
    workspace, goal, obs = self._env.reset()
    # Concatenate position observation with goal and reduced workspace
    reduced_workspace = self._CAE.evaluate(workspace)
    obs_full = np.concatenate((obs, goal, reduced_workspace))

    while total_steps < self._max_steps:
        # Visualize environment if show_progress is set
        if self._show_progress and \
                ((n_episode % self._show_progress_interval) == 0) and \
                total_steps > self._policy.n_warmup:
            self._env.render()

        if total_steps in self._params["agent"]["lr_decay_steps"]:
            ind = self._params["agent"]["lr_decay_steps"].index(total_steps)
            self._params["agent"]["lr_actor"] = \
                self._params["agent"]["actor_lr_decay_vals"][ind]
            self._params["agent"]["lr_critic"] = \
                self._params["agent"]["critic_lr_decay_vals"][ind]
            self._policy.actor_optimizer.learning_rate = \
                self._params["agent"]["lr_actor"]
            self._policy.critic_optimizer.learning_rate = \
                self._params["agent"]["lr_critic"]
            print("---- Learning rate: {}".format(
                self._policy.actor_optimizer.learning_rate))

        # Get action randomly for warmup / from Actor-NN otherwise
        if total_steps < self._policy.n_warmup:
            action = self._env.action_space.sample()
        else:
            action = self._policy.get_action(obs_full)

        # Take action and get next_obs, reward and done flag from environment
        next_obs, reward, done, _ = self._env.step(action)
        next_obs_full = np.concatenate((next_obs, goal, reduced_workspace))

        # Add the new transition to the replay buffer
        self._replay_buffer.add(obs=obs_full, act=action,
                                next_obs=next_obs_full, rew=reward,
                                done=done)

        # Add observation to the trajectory storage
        self.trajectory.append({'workspace': workspace,
                                'position': obs,
                                'next_position': next_obs,
                                'goal': goal,
                                'action': action,
                                'reward': reward,
                                'done': done})

        obs = next_obs
        obs_full = next_obs_full
        episode_steps += 1
        episode_return += reward
        total_steps += 1
        tf.summary.experimental.set_step(total_steps)

        if done or episode_steps == self._episode_max_steps:
            if reward != self._env.goal_reward:
                # Workspace relabeling
                # plotting the trajectory:
                if self._params["trainer"]["show_relabeling"]:
                    self._relabel_fig = visualize_trajectory(
                        trajectory=self.trajectory,
                        fig=self._relabel_fig,
                        env=self._env)
                    plt.pause(1)
                relabeling_begin = time.time()
                # Create new workspace for the trajectory:
                relabeled_trajectory = self._relabeler.relabel(
                    trajectory=self.trajectory, env=self._env)
                if relabeled_trajectory:
                    relabeled_ws = relabeled_trajectory[0]['workspace']
                    relabeled_reduced_ws = self._CAE.evaluate(relabeled_ws)
                    # adding the points of the relabeled trajectory to the
                    # replay buffer:
                    for point in relabeled_trajectory:
                        relabeled_obs_full = np.concatenate(
                            (point['position'], point['goal'],
                             relabeled_reduced_ws))
                        relabeled_next_obs_full = np.concatenate(
                            (point['next_position'], point['goal'],
                             relabeled_reduced_ws))
                        self._replay_buffer.add(
                            obs=relabeled_obs_full,
                            act=point['action'],
                            next_obs=relabeled_next_obs_full,
                            rew=point['reward'],
                            done=point['done'])
                    # plotting the relabeled trajectory:
                    if self._params["trainer"]["show_relabeling"]:
                        self._relabel_fig = visualize_trajectory(
                            trajectory=relabeled_trajectory,
                            fig=self._relabel_fig,
                            env=self._env)
                        plt.pause(1)
                relabeling_times.append(time.time() - relabeling_begin)
            else:
                success_traj_train += 1

            # resetting:
            workspace, goal, obs = self._env.reset()
            reduced_workspace = self._CAE.evaluate(workspace)
            obs_full = np.concatenate((obs, goal, reduced_workspace))
            self.trajectory = []

            # Print out train accuracy
            n_episode += 1
            if n_episode % self._test_episodes == 0:
                train_success_rate = success_traj_train / self._test_episodes
                fps = episode_steps / (time.perf_counter() -
                                       episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Train success rate: {1: 5.4f} "
                    "Total Steps: {2: 7} Episode Steps: {3: 5} "
                    "Return: {4: 5.4f} Last reward: {5: 5.4f} "
                    "FPS: {6: 5.2f}".format(
                        n_episode, train_success_rate, total_steps,
                        episode_steps, episode_return, reward, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/training_success_rate",
                                  data=train_success_rate)
                success_traj_train = 0
                if len(relabeling_times) != 0:
                    print('average relabeling time: {}'.format(
                        sum(relabeling_times) / len(relabeling_times)))
                relabeling_times = []
                if len(training_times) != 0:
                    print('average training time: {}'.format(
                        sum(training_times) / len(training_times)))
                training_times = []

            episode_steps = 0
            episode_return = 0
            episode_start_time = time.perf_counter()

        # While warmup, we only produce experiences without training
        if total_steps <= self._policy.n_warmup:
            continue

        # After every update_interval we train/update the Actor-NN,
        # Critic-NN, and the Target-Actor-NN & Target-Critic-NN
        if total_steps % self._policy.update_interval == 0:
            training_begin = time.time()
            # Sample a new batch of experiences from the replay buffer
            samples = self._replay_buffer.sample(self._policy.batch_size)
            with tf.summary.record_if(
                    total_steps % self._save_summary_interval == 0):
                # Update the networks after computing the critic loss and
                # the actor loss
                self._policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float32),
                    None if not self._use_prioritized_rb
                    else samples["weights"])
                if self._use_prioritized_rb:
                    # Compute the TD error and update the priorities
                    td_error = self._policy.compute_td_error(
                        samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"],
                        np.array(samples["done"], dtype=np.float32))
                    self._replay_buffer.update_priorities(
                        samples["indexes"], np.abs(td_error) + 1e-6)
            training_times.append(time.time() - training_begin)

        # Every test_interval we evaluate the agent
        if total_steps % self._test_interval == 0:
            # setting evaluation mode for deterministic actions:
            self._policy.eval_mode = True
            avg_test_return, success_rate, ratio_straight_lines, \
                success_rate_straight_line, success_rate_no_straight_line = \
                self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation: Total Steps: {0: 7} Average Reward {1: 5.4f} "
                "and Success rate: {2: 5.4f} for {3: 2} episodes".format(
                    total_steps, avg_test_return, success_rate,
                    self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/test_success_rate",
                              data=success_rate)
            tf.summary.scalar(name="Ratio_feasible straight_line episodes",
                              data=ratio_straight_lines)
            tf.summary.scalar(name="test_success_rate straight_line episodes",
                              data=success_rate_straight_line)
            tf.summary.scalar(
                name="test_success_rate no_straight_line episodes",
                data=success_rate_no_straight_line)
            tf.summary.scalar(name="Common/fps", data=fps)
            self.writer.flush()
            # setting evaluation mode back to false:
            self._policy.eval_mode = False

        # Every save_model_interval we save the model
        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def evaluate_policy(self, total_steps):
    tf.summary.experimental.set_step(total_steps)
    if self._normalize_obs:
        self._test_env.normalizer.set_params(
            *self._env.normalizer.get_params())
    avg_test_return = 0.
    avg_test_cost = 0.
    if self._save_test_path:
        replay_buffer = get_replay_buffer(self._policy,
                                          self._test_env,
                                          size=self._episode_max_steps)
    for i in range(self._test_episodes):
        episode_return = 0.
        episode_cost = 0.
        frames = []
        obs = self._test_env.reset()
        for _ in range(self._episode_max_steps):
            action = self._policy.get_action(obs, test=True)
            next_obs, reward, done, info = self._test_env.step(action)
            try:
                cost = info['cost']
            except (TypeError, KeyError, IndexError):
                cost = 0
            if self._save_test_path:
                replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                  rew=reward, done=done)
            if self._save_test_movie:
                frames.append(self._test_env.render(mode='rgb_array'))
            elif self._show_test_progress:
                self._test_env.render()
            episode_return += reward
            episode_cost += cost
            obs = next_obs
            if done:
                break
        prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}_cost_{3:010.4f}".format(
            total_steps, i, episode_return, episode_cost)
        if self._save_test_path:
            save_path(
                replay_buffer._encode_sample(
                    np.arange(self._episode_max_steps)),
                os.path.join(self._output_dir, prefix + ".pkl"))
            replay_buffer.clear()
        if self._save_test_movie:
            frames_to_mp4(frames, prefix, self._output_dir)
        avg_test_return += episode_return
        avg_test_cost += episode_cost
    if self._show_test_images:
        images = tf.cast(
            tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
            tf.uint8)
        tf.summary.image('train/input_img', images)
    return avg_test_return / self._test_episodes, \
        avg_test_cost / self._test_episodes
def __call__(self):
    total_steps = tf.train.create_global_step()
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    n_episode = 0

    replay_buffer = get_replay_buffer(self._policy, self._env,
                                      self._use_prioritized_rb,
                                      self._use_nstep_rb, self._n_step)

    obs = self._env.reset()
    with tf.summary.record_summaries_every_n_global_steps(1000):
        while total_steps < self._max_steps:
            if total_steps < self._policy.n_warmup:
                action = self._env.action_space.sample()
            else:
                action = self._policy.get_action(obs)

            next_obs, reward, done, _ = self._env.step(action)
            if self._show_progress:
                self._env.render()
            episode_steps += 1
            episode_return += reward
            total_steps.assign_add(1)

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                              rew=reward, done=done_flag)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, int(total_steps), episode_steps,
                            episode_return, fps))
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

            if total_steps >= self._policy.n_warmup:
                samples = replay_buffer.sample(self._policy.batch_size)
                indices = np.random.randint(self._expert_obs.shape[0],
                                            size=self._policy.batch_size)
                expert_obs, expert_act = self._expert_obs[indices], \
                    self._expert_act[indices]
                # Train IRL
                self._irl.train(samples["obs"], samples["act"],
                                expert_obs, expert_act)
                # Train policy
                rew = self._irl.inference(samples["obs"], samples["act"])
                td_error = self._policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    rew, np.array(samples["done"], dtype=np.float32),
                    None if not self._use_prioritized_rb
                    else samples["weights"])
                if self._use_prioritized_rb:
                    replay_buffer.update_priorities(
                        samples["indexes"], np.abs(td_error) + 1e-6)

                if int(total_steps) % self._test_interval == 0:
                    with tf.summary.always_record_summaries():
                        avg_test_return = self.evaluate_policy(
                            int(total_steps))
                        self.logger.info(
                            "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                            .format(int(total_steps), avg_test_return,
                                    self._test_episodes))
                        tf.summary.scalar(name="AverageTestReturn",
                                          data=avg_test_return)
                        tf.summary.scalar(name="FPS", data=fps)
                    self.writer.flush()

                if int(total_steps) % self._model_save_interval == 0:
                    self.checkpoint_manager.save()
    tf.summary.flush()
def __call__(self):
    total_steps = 0
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    n_episode = 0
    test_step_threshold = self._test_interval

    # TODO: clean codes
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    obs = self._env.reset()
    while total_steps < self._max_steps:
        for _ in range(self._policy.horizon):
            action, log_pi, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(action)
            if self._show_progress:
                self._env.render()
            episode_steps += 1
            episode_return += reward
            total_steps += 1

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=log_pi, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, int(total_steps), episode_steps,
                            episode_return, fps))
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        samples = self.replay_buffer.sample(self._policy.horizon)
        # Normalize advantages
        if self._policy.normalize_adv:
            adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                samples["adv"])
        else:
            adv = samples["adv"]
        # Train actor
        for _ in range(1):
            self._policy.train_actor(samples["obs"], samples["act"],
                                     adv, samples["logp"])
        # Train critic
        for _ in range(5):
            self._policy.train_critic(samples["obs"], samples["ret"])

        if total_steps > test_step_threshold:
            test_step_threshold += self._test_interval
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                .format(total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/fps", data=fps)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def evaluate_policy(self, total_steps):
    """Evaluating the policy."""
    tf.summary.experimental.set_step(total_steps)
    total_test_return = 0.
    success_traj = 0
    if self._save_test_path:
        replay_buffer = get_replay_buffer(
            self._policy, self._test_env, size=self._episode_max_steps)
    straight_line_episode = 0
    no_straight_line_episode = 0
    success_traj_straight_line = 0
    success_traj_no_straight_line = 0

    for i in range(self._test_episodes):
        episode_return = 0.
        frames = []
        workspace, goal, obs = self._test_env.reset()
        start = obs
        reduced_workspace = self._CAE.evaluate(workspace)
        # Concatenate position observation with goal and reduced workspace
        obs_full = np.concatenate((obs, goal, reduced_workspace))
        for _ in range(self._episode_max_steps):
            action = self._policy.get_action(obs_full)
            next_obs, reward, done, _ = self._test_env.step(action)
            # Concatenate next position observation with goal and reduced
            # workspace
            next_obs_full = np.concatenate((next_obs, goal,
                                            reduced_workspace))
            # Add observation to the trajectory storage
            self.trajectory.append({'workspace': workspace,
                                    'position': obs,
                                    'next_position': next_obs,
                                    'goal': goal,
                                    'action': action,
                                    'reward': reward,
                                    'done': done})
            if self._save_test_path:
                replay_buffer.add(obs=obs_full, act=action,
                                  next_obs=next_obs_full, rew=reward,
                                  done=done)
            if self._save_test_movie:
                frames.append(self._test_env.render(mode='plot'))
            elif self._show_test_progress:
                self._test_env.render()
            episode_return += reward
            obs = next_obs
            obs_full = next_obs_full
            if done:
                break
        prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
            total_steps, i, episode_return)
        if self._save_test_path:
            save_path(replay_buffer._encode_sample(
                np.arange(self._episode_max_steps)),
                os.path.join(self._output_dir, prefix + ".pkl"))
            replay_buffer.clear()
        if self._save_test_movie:
            frames_to_gif(frames, prefix, self._output_dir)
        if self._save_test_path_sep:
            self._save_traj_separately(prefix)
        total_test_return += episode_return

        if straight_line_feasible(workspace, start, goal, self._test_env):
            straight_line_episode += 1
            if reward == self._test_env.goal_reward:
                success_traj_straight_line += 1
        else:
            no_straight_line_episode += 1
            if reward == self._test_env.goal_reward:
                success_traj_no_straight_line += 1
        if reward == self._test_env.goal_reward:
            success_traj += 1
        # empty trajectory:
        self.trajectory = []

    if self._show_test_images:
        images = tf.cast(
            tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=2),
            tf.uint8)
        tf.summary.image('train/input_img', images)

    avg_test_return = total_test_return / self._test_episodes
    success_rate = success_traj / self._test_episodes
    if straight_line_episode > 0:
        success_rate_straight_line = \
            success_traj_straight_line / straight_line_episode
    else:
        success_rate_straight_line = 0
    if no_straight_line_episode > 0:
        success_rate_no_straight_line = \
            success_traj_no_straight_line / no_straight_line_episode
    else:
        success_rate_no_straight_line = 0
    ratio_straight_lines = straight_line_episode / self._test_episodes
    return avg_test_return, success_rate, ratio_straight_lines, \
        success_rate_straight_line, success_rate_no_straight_line
def __call__(self):
    total_steps = 0
    tf.summary.experimental.set_step(total_steps)
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.perf_counter()
    n_episode = 0

    replay_buffer = get_replay_buffer(self._policy, self._env,
                                      self._use_prioritized_rb,
                                      self._use_nstep_rb, self._n_step)

    obs = self._env.reset()

    while total_steps < self._max_steps:
        if total_steps < self._policy.n_warmup:
            action = self._env.action_space.sample()
        else:
            action = self._policy.get_action(obs)

        next_obs, reward, done, _ = self._env.step(action)
        if self._show_progress:
            self._env.render()
        episode_steps += 1
        episode_return += reward
        total_steps += 1
        tf.summary.experimental.set_step(total_steps)

        done_flag = done
        if hasattr(self._env, "_max_episode_steps") and \
                episode_steps == self._env._max_episode_steps:
            done_flag = False
        replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                          rew=reward, done=done_flag)
        obs = next_obs

        if done or episode_steps == self._episode_max_steps:
            obs = self._env.reset()
            n_episode += 1
            fps = episode_steps / (time.perf_counter() - episode_start_time)
            self.logger.info(
                "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                .format(n_episode, int(total_steps), episode_steps,
                        episode_return, fps))
            tf.summary.scalar(name="Common/training_return",
                              data=episode_return)
            episode_steps = 0
            episode_return = 0
            episode_start_time = time.perf_counter()

        if total_steps < self._policy.n_warmup:
            continue

        if total_steps % self._policy.update_interval == 0:
            samples = replay_buffer.sample(self._policy.batch_size)
            # Train policy
            rew = self._irl.inference(samples["obs"], samples["act"],
                                      samples["next_obs"])
            with tf.summary.record_if(
                    total_steps % self._save_summary_interval == 0):
                self._policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    rew, samples["done"],
                    None if not self._use_prioritized_rb
                    else samples["weights"])
                if self._use_prioritized_rb:
                    td_error = self._policy.compute_td_error(
                        samples["obs"], samples["act"],
                        samples["next_obs"], rew, samples["done"])
                    replay_buffer.update_priorities(
                        samples["indexes"], np.abs(td_error) + 1e-6)

            # Train IRL
            for _ in range(self._irl.n_training):
                samples = replay_buffer.sample(self._irl.batch_size)
                # Do not allow duplication!!!
                indices = np.random.choice(self._random_range,
                                           self._irl.batch_size,
                                           replace=False)
                self._irl.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    self._expert_obs[indices], self._expert_act[indices],
                    self._expert_next_obs[indices])

        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                .format(total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/fps", data=fps)
            self.writer.flush()

        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def __call__(self):
    total_steps = 0
    tf.summary.experimental.set_step(total_steps)
    episode_steps = 0
    episode_return = 0
    episode_cost = 0
    episode_start_time = time.perf_counter()
    n_episode = 0

    replay_buffer = get_replay_buffer(self._policy, self._env,
                                      self._use_prioritized_rb,
                                      self._use_nstep_rb, self._n_step)

    obs = self._env.reset()

    while total_steps < self._max_steps:
        if total_steps < self._policy.n_warmup:
            action = self._env.action_space.sample()
        else:
            action = self._policy.get_action(obs)

        next_obs, reward, done, info = self._env.step(action)
        try:
            cost = info['cost']
        except (TypeError, KeyError, IndexError):
            cost = 0
        if self._show_progress:
            self._env.render()
        episode_steps += 1
        episode_return += reward
        episode_cost += cost
        total_steps += 1
        tf.summary.experimental.set_step(total_steps)

        done_flag = done
        if (hasattr(self._env, "_max_episode_steps") and
                episode_steps == self._env._max_episode_steps):
            done_flag = False
        replay_buffer.add(obs=obs, act=action, next_obs=next_obs,
                          rew=reward, done=done_flag)
        obs = next_obs

        if done or episode_steps == self._episode_max_steps:
            replay_buffer.on_episode_end()
            obs = self._env.reset()

            n_episode += 1
            fps = episode_steps / (time.perf_counter() - episode_start_time)
            self.logger.info(
                "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}"
                .format(n_episode, total_steps, episode_steps,
                        episode_return, episode_cost, fps))
            tf.summary.scalar(name="Common/training_return",
                              data=episode_return)

            self.total_cost += episode_cost
            cost_rate = self.total_cost / total_steps
            wandb.log({'Training_Return': episode_return,
                       'Training_Cost': episode_cost,
                       'Cost_Rate': cost_rate,
                       'FPS': fps},
                      step=n_episode)

            episode_steps = 0
            episode_return = 0
            episode_cost = 0
            episode_start_time = time.perf_counter()

        if total_steps < self._policy.n_warmup:
            continue

        if total_steps % self._policy.update_interval == 0:
            samples = replay_buffer.sample(self._policy.batch_size)
            with tf.summary.record_if(
                    total_steps % self._save_summary_interval == 0):
                self._policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float32),
                    None if not self._use_prioritized_rb
                    else samples["weights"])
                if self._use_prioritized_rb:
                    td_error = self._policy.compute_td_error(
                        samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"],
                        np.array(samples["done"], dtype=np.float32))
                    replay_buffer.update_priorities(
                        samples["indexes"], np.abs(td_error) + 1e-6)

        if total_steps % self._test_interval == 0:
            avg_test_return, avg_test_cost = self.evaluate_policy(
                total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} Average Cost {2: 5.4f} over {3: 2} episodes"
                .format(total_steps, avg_test_return, avg_test_cost,
                        self._test_episodes))
            wandb.log({'Evaluation_Return': avg_test_return,
                       'Evaluation_Cost': avg_test_cost},
                      step=n_episode)
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/fps", data=fps)
            self.writer.flush()

        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
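# ---------------------------------------------------------------------------
# For reference: the `info['cost']` read by the safety-aware loops above
# follows the Safety Gym convention of reporting a per-step constraint
# signal through the step info dict. A hedged sketch of a gym wrapper that
# exposes a cost this way; `cost_fn` is a hypothetical, env-specific callable.
# ---------------------------------------------------------------------------
import gym


class CostInfoWrapper(gym.Wrapper):
    def __init__(self, env, cost_fn):
        super().__init__(env)
        self._cost_fn = cost_fn

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        info = dict(info or {})
        # Per-step constraint-violation signal, consumed via info['cost']
        info["cost"] = float(self._cost_fn(obs, action))
        return obs, reward, done, info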