def test_weight_loading():
    """Tests loading the pretrained CAE weights and freezing the model."""
    model = CAE(pooling='max', latent_dim=16, input_shape=(32, 32), conv_filters=[4, 8, 16])
    model.build(input_shape=(1, 32, 32, 1))
    model.load_weights(filepath='../models/cae/model_num_5_size_8.h5')
    # freeze every layer of the loaded CAE:
    for k, _ in model._get_trainable_state().items():
        k.trainable = False
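# A small companion check (a sketch, not part of the original suite): it assumes
# `tensorflow` is imported as `tf` in this test module and that the CAE reconstructs
# its input at the original (1, 32, 32, 1) resolution, as the visualization script
# further below suggests.
def test_reconstruction_shape():
    model = CAE(pooling='max', latent_dim=16, input_shape=(32, 32), conv_filters=[4, 8, 16])
    model.build(input_shape=(1, 32, 32, 1))
    model.load_weights(filepath='../models/cae/model_num_5_size_8.h5')
    x = tf.random.uniform((1, 32, 32, 1))  # dummy workspace batch
    x_hat = model(x)                       # forward pass through the autoencoder
    assert x_hat.shape == x.shape          # reconstruction keeps the input resolution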
class PointrobotTrainerTests(unittest.TestCase):
    """For testing the Pointrobot trainer."""

    def setUp(self):
        """Sets up the environments, the policy and the CAE for the tests."""
        self.params = load_params('params/test_params.json')
        self.env = gym.make(self.params["env"]["name"], params=self.params)
        self.test_env = gym.make(self.params["env"]["name"], params=self.params)
        self.policy = DDPG(env=self.env, params=self.params)
        self.cae = CAE(pooling='max', latent_dim=16, input_shape=(32, 32), conv_filters=[4, 8, 16])
        self.cae.build(input_shape=(1, 32, 32, 1))
        self.cae.load_weights(filepath='../models/cae/model_num_5_size_8.h5')

    def test_pointrobot_trainer_init(self):
        """Tests the __init__() method of the Pointrobot trainer."""
        trainer = PointrobotTrainer(self.policy, self.env, self.params, test_env=self.test_env)

    def test_evaluation(self):
        """Tests the evaluate() method of the Pointrobot trainer."""
        trainer = PointrobotTrainer(self.policy, self.env, self.params, test_env=self.test_env)
        trainer.evaluate()

    def test_training(self):
        """Sanity check of the train() method."""
        self.params["trainer"]["max_steps"] = 1e4
        trainer = PointrobotTrainer(self.policy, self.env, self.params, test_env=self.test_env)
        trainer.train()
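# The tests above can be run directly with a conventional unittest entry point
# (assuming this listing is its own test module):
if __name__ == '__main__':
    unittest.main()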
"""Visualize the output of the trained Convolutional Autoencoder."""
import os

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from hwr.cae.cae import CAE
from hwr.random_workspace import visualize_workspace

pooling = 'max'
latent_dim = 16
input_shape = (32, 32)
conv_filters = [4, 8, 16]

model = CAE(
    pooling=pooling,
    latent_dim=latent_dim,
    input_shape=input_shape,
    conv_filters=conv_filters,
)
model.build(input_shape=(1, 32, 32, 1))
model.load_weights(filepath='../models/cae/model_num_5_size_8.h5')

# Plot results on an unseen workspace:
path = os.path.join('../workspaces/', ('ws_' + str(9500) + '.csv'))
x = np.expand_dims(np.loadtxt(path), axis=2).astype('float32')
x = np.expand_dims(x, axis=0)
x = tf.convert_to_tensor(x)
# Threshold the reconstruction to obtain a binary workspace:
x_hat = tf.cast(model(x) >= 0.5, tf.float32)

fig2 = visualize_workspace(x.numpy()[0, :, :, 0], fignum=2)
fig3 = visualize_workspace(x_hat.numpy()[0, :, :, 0], fignum=3)
plt.show()
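# Optional numeric check, building on the thresholded reconstruction above (a sketch:
# it assumes the loaded workspace is a binary occupancy grid, so exact equality is meaningful):
accuracy = tf.reduce_mean(tf.cast(tf.equal(x, x_hat), tf.float32))
print("pixel-wise reconstruction accuracy: {:.4f}".format(accuracy.numpy()))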
class PointrobotTrainer:

    def __init__(self, policy, env, params, test_env=None):
        """Initializing the training instance."""
        self._params = params
        self._set_from_params()
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        args = self._get_args_from_params()

        # Convolutional Autoencoder:
        self._CAE = CAE(pooling=self._params["cae"]["pooling"],
                        latent_dim=self._params["cae"]["latent_dim"],
                        input_shape=self._env.workspace.shape,
                        conv_filters=self._params["cae"]["conv_filters"])
        self._CAE.build(input_shape=(1, self._env.workspace.shape[0], self._env.workspace.shape[1], 1))
        self._CAE.load_weights(filepath=self._params["cae"]["weights_path"])
        for layer, _ in self._CAE._get_trainable_state().items():
            layer.trainable = False

        # Initialize storage for the current trajectory
        self.trajectory = []

        # Initialize workspace relabeler:
        self._relabeler = PointrobotRelabeler(
            ws_shape=(self._env.grid_size, self._env.grid_size),
            mode=params["trainer"]["relabeling_mode"],
            remove_zigzaging=params["trainer"]["remove_zigzaging"]
        )

        # prepare log directory
        self._output_dir = prepare_output_dir(
            args=args, user_specified_dir=self._logdir,
            suffix="{}_{}".format(self._policy.policy_name, params["trainer"]["dir_suffix"]))
        self.logger = initialize_logger(
            logging_level=logging.getLevelName(params["trainer"]["logging_level"]),
            output_dir=self._output_dir)

        if self._save_test_path_sep:
            sep_logdirs = ['successful_trajs', 'unsuccessful_trajs', 'unfinished_trajs']
            for logdir in sep_logdirs:
                if not os.path.exists(os.path.join(self._logdir, logdir)):
                    os.makedirs(os.path.join(self._logdir, logdir))

        if params["trainer"]["mode"] == "evaluate":
            assert glob.glob(os.path.join(params["trainer"]["model_dir"], '*'))
        self._set_check_point(params["trainer"]["model_dir"])

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()

        # relabeling visualization:
        self._relabel_fig = plt.figure(2)

    def _set_check_point(self, model_dir):
        # Save and restore model
        self._checkpoint = tf.train.Checkpoint(policy=self._policy)
        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=model_dir, max_to_keep=5)

        if model_dir is not None:
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            self._latest_path_ckpt = tf.train.latest_checkpoint(model_dir)
            self._checkpoint.restore(self._latest_path_ckpt)
            self.logger.info("Restored {}".format(self._latest_path_ckpt))

    def train(self):
        """method for training an agent with Hindsight Workspace Relabeling"""
        # training mode:
        self._policy.eval_mode = False

        total_steps = 0
        tf.summary.experimental.set_step(total_steps)
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.perf_counter()
        n_episode = 0
        success_traj_train = 0.
        relabeling_times, training_times = [], []

        # Initialize replay buffer
        self._replay_buffer = get_replay_buffer(
            self._policy, self._env, self._use_prioritized_rb, self._use_nstep_rb, self._n_step)

        # resetting:
        self.trajectory = []
        workspace, goal, obs = self._env.reset()
        # Concatenate the position observation with the goal and the reduced workspace
        reduced_workspace = self._CAE.evaluate(workspace)
        obs_full = np.concatenate((obs, goal, reduced_workspace))

        while total_steps < self._max_steps:
            # Visualize the environment if "show_progress" is set
            if self._show_progress and \
                    ((n_episode % self._show_progress_interval) == 0) and \
                    total_steps > self._policy.n_warmup:
                self._env.render()

            # Learning rate decay for the actor and critic optimizers
            if total_steps in self._params["agent"]["lr_decay_steps"]:
                ind = self._params["agent"]["lr_decay_steps"].index(total_steps)
                self._params["agent"]["lr_actor"] = self._params["agent"]["actor_lr_decay_vals"][ind]
                self._params["agent"]["lr_critic"] = self._params["agent"]["critic_lr_decay_vals"][ind]
                self._policy.actor_optimizer.learning_rate = self._params["agent"]["lr_actor"]
                self._policy.critic_optimizer.learning_rate = self._params["agent"]["lr_critic"]
                print("---- Learning rate: {}".format(self._policy.actor_optimizer.learning_rate))

            # Get the action randomly during warmup / from the actor network otherwise
            if total_steps < self._policy.n_warmup:
                action = self._env.action_space.sample()
            else:
                action = self._policy.get_action(obs_full)

            # Take the action and get next_obs, reward and done flag from the environment
            next_obs, reward, done, _ = self._env.step(action)
            next_obs_full = np.concatenate((next_obs, goal, reduced_workspace))

            # Add the new transition to the replay buffer
            self._replay_buffer.add(obs=obs_full, act=action,
                                    next_obs=next_obs_full, rew=reward, done=done)

            # Add the observation to the trajectory storage
            self.trajectory.append({'workspace': workspace, 'position': obs,
                                    'next_position': next_obs, 'goal': goal,
                                    'action': action, 'reward': reward, 'done': done})

            obs = next_obs
            obs_full = next_obs_full
            episode_steps += 1
            episode_return += reward
            total_steps += 1
            tf.summary.experimental.set_step(total_steps)

            if done or episode_steps == self._episode_max_steps:
                if reward != self._env.goal_reward:
                    # Workspace relabeling
                    # plotting the trajectory:
                    if self._params["trainer"]["show_relabeling"]:
                        self._relabel_fig = visualize_trajectory(
                            trajectory=self.trajectory,
                            fig=self._relabel_fig,
                            env=self._env)
                        plt.pause(1)

                    relabeling_begin = time.time()
                    # Create a new workspace for the trajectory:
                    relabeled_trajectory = self._relabeler.relabel(trajectory=self.trajectory, env=self._env)

                    if relabeled_trajectory:
                        relabeled_ws = relabeled_trajectory[0]['workspace']
                        relabeled_reduced_ws = self._CAE.evaluate(relabeled_ws)
                        # adding the points of the relabeled trajectory to the replay buffer:
                        for point in relabeled_trajectory:
                            relabeled_obs_full = np.concatenate(
                                (point['position'], point['goal'], relabeled_reduced_ws))
                            relabeled_next_obs_full = np.concatenate(
                                (point['next_position'], point['goal'], relabeled_reduced_ws))
                            self._replay_buffer.add(obs=relabeled_obs_full, act=point['action'],
                                                    next_obs=relabeled_next_obs_full,
                                                    rew=point['reward'], done=point['done'])

                        # plotting the relabeled trajectory:
                        if self._params["trainer"]["show_relabeling"]:
                            self._relabel_fig = visualize_trajectory(
                                trajectory=relabeled_trajectory,
                                fig=self._relabel_fig,
                                env=self._env)
                            plt.pause(1)

                    relabeling_times.append(time.time() - relabeling_begin)
                else:
                    success_traj_train += 1

                # resetting:
                workspace, goal, obs = self._env.reset()
                reduced_workspace = self._CAE.evaluate(workspace)
                obs_full = np.concatenate((obs, goal, reduced_workspace))
                self.trajectory = []

                # Print out the train success rate
                n_episode += 1
                if n_episode % self._test_episodes == 0:
                    train_success_rate = success_traj_train / self._test_episodes
                    fps = episode_steps / (time.perf_counter() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Train success rate: {1: 5.4f} Total Steps: {2: 7} "
                        "Episode Steps: {3: 5} Return: {4: 5.4f} Last reward: {5: 5.4f} FPS: {6: 5.2f}".format(
                            n_episode, train_success_rate, total_steps, episode_steps,
                            episode_return, reward, fps))
                    tf.summary.scalar(name="Common/training_return", data=episode_return)
                    tf.summary.scalar(name="Common/training_success_rate", data=train_success_rate)
                    success_traj_train = 0

                    if len(relabeling_times) != 0:
                        print('average relabeling time: {}'.format(sum(relabeling_times) / len(relabeling_times)))
                        relabeling_times = []
                    if len(training_times) != 0:
                        print('average training time: {}'.format(sum(training_times) / len(training_times)))
                        training_times = []

                episode_steps = 0
                episode_return = 0
                episode_start_time = time.perf_counter()

            # During warmup we only produce experiences without training
            if total_steps <= self._policy.n_warmup:
                continue

            # After every update_interval we train/update the actor, the critic,
            # and the target actor & target critic networks
            if total_steps % self._policy.update_interval == 0:
                training_begin = time.time()
                # Sample a new batch of experiences from the replay buffer for training
                samples = self._replay_buffer.sample(self._policy.batch_size)
                with tf.summary.record_if(total_steps % self._save_summary_interval == 0):
                    # Update the actor, the critic and the target networks
                    # after computing the critic loss and the actor loss
                    self._policy.train(
                        samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"], np.array(samples["done"], dtype=np.float32),
                        None if not self._use_prioritized_rb else samples["weights"])

                if self._use_prioritized_rb:
                    # Compute the TD error of the critic and update the priorities
                    td_error = self._policy.compute_td_error(
                        samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"], np.array(samples["done"], dtype=np.float32))
                    self._replay_buffer.update_priorities(
                        samples["indexes"], np.abs(td_error) + 1e-6)
                training_times.append(time.time() - training_begin)

            # Every test_interval we evaluate the agent
            if total_steps % self._test_interval == 0:
                # setting evaluation mode for deterministic actions:
                self._policy.eval_mode = True
                avg_test_return, success_rate, ratio_straight_lines, \
                    success_rate_straight_line, success_rate_no_straight_line = \
                    self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation: Total Steps: {0: 7} Average Reward {1: 5.4f} and Success rate: {2: 5.4f} "
                    "for {3: 2} episodes".format(total_steps, avg_test_return, success_rate, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return", data=avg_test_return)
                tf.summary.scalar(name="Common/test_success_rate", data=success_rate)
                tf.summary.scalar(name="Ratio_feasible straight_line episodes", data=ratio_straight_lines)
                tf.summary.scalar(name="test_success_rate straight_line episodes", data=success_rate_straight_line)
                tf.summary.scalar(name="test_success_rate no_straight_line episodes", data=success_rate_no_straight_line)
                tf.summary.scalar(name="Common/fps", data=fps)
                self.writer.flush()
                # setting evaluation mode back to false:
                self._policy.eval_mode = False

            # Every save_model_interval we save the model
            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()

    def evaluate(self):
        """method for evaluating a pretrained agent for some episodes."""
        self._policy.eval_mode = True
        avg_test_return, success_rate, ratio_straight_lines, \
            success_rate_straight_line, success_rate_no_straight_line = \
            self.evaluate_policy(total_steps=0)

        print("----- Evaluation -----")
        print("avg test return: {}".format(avg_test_return))
        print("avg test success rate: {}".format(success_rate))
        print("Ratio of feasible straight_line episodes: {}".format(ratio_straight_lines))
        print("avg test success_rate for straight_line episodes: {}".format(success_rate_straight_line))
        print("avg test success_rate for no_straight_line episodes: {}".format(success_rate_no_straight_line))

        return avg_test_return, success_rate, ratio_straight_lines, \
            success_rate_straight_line, success_rate_no_straight_line

    def evaluate_policy_continuously(self):
        """Periodically searches for the latest checkpoint and keeps evaluating
        with the latest model until the user kills the process.
        """
        if self._model_dir is None:
            self.logger.error("Please specify model directory by passing command line argument `--model-dir`")
            exit(-1)

        self.evaluate_policy(total_steps=0)
        while True:
            latest_path_ckpt = tf.train.latest_checkpoint(self._model_dir)
            if self._latest_path_ckpt != latest_path_ckpt:
                self._latest_path_ckpt = latest_path_ckpt
                self._checkpoint.restore(self._latest_path_ckpt)
                self.logger.info("Restored {}".format(self._latest_path_ckpt))
            self.evaluate_policy(total_steps=0)

    def evaluate_policy(self, total_steps):
        """evaluating the policy."""
        tf.summary.experimental.set_step(total_steps)
        total_test_return = 0.
        success_traj = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(
                self._policy, self._test_env, size=self._episode_max_steps)

        straight_line_episode = 0
        no_straight_line_episode = 0
        success_traj_straight_line = 0
        success_traj_no_straight_line = 0

        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            workspace, goal, obs = self._test_env.reset()
            start = obs
            reduced_workspace = self._CAE.evaluate(workspace)
            # Concatenate the position observation with the goal and the reduced workspace
            obs_full = np.concatenate((obs, goal, reduced_workspace))

            for _ in range(self._episode_max_steps):
                action = self._policy.get_action(obs_full)
                next_obs, reward, done, _ = self._test_env.step(action)
                # Concatenate the position observation with the goal and the reduced workspace
                next_obs_full = np.concatenate((next_obs, goal, reduced_workspace))

                # Add the observation to the trajectory storage
                self.trajectory.append({'workspace': workspace, 'position': obs,
                                        'next_position': next_obs, 'goal': goal,
                                        'action': action, 'reward': reward, 'done': done})

                if self._save_test_path:
                    replay_buffer.add(obs=obs_full, act=action,
                                      next_obs=next_obs_full, rew=reward, done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='plot'))
                elif self._show_test_progress:
                    self._test_env.render()

                episode_return += reward
                obs = next_obs
                obs_full = next_obs_full
                if done:
                    break

            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)

            if self._save_test_path:
                save_path(replay_buffer._encode_sample(np.arange(self._episode_max_steps)),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            if self._save_test_path_sep:
                self._save_traj_separately(prefix)

            total_test_return += episode_return

            if straight_line_feasible(workspace, start, goal, self._test_env):
                straight_line_episode += 1
                if reward == self._test_env.goal_reward:
                    success_traj_straight_line += 1
            else:
                no_straight_line_episode += 1
                if reward == self._test_env.goal_reward:
                    success_traj_no_straight_line += 1

            if reward == self._test_env.goal_reward:
                success_traj += 1

            # empty the trajectory:
            self.trajectory = []

        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=2), tf.uint8)
            tf.summary.image('train/input_img', images)

        avg_test_return = total_test_return / self._test_episodes
        success_rate = success_traj / self._test_episodes
        if straight_line_episode > 0:
            success_rate_straight_line = success_traj_straight_line / straight_line_episode
        else:
            success_rate_straight_line = 0
        if no_straight_line_episode > 0:
            success_rate_no_straight_line = success_traj_no_straight_line / no_straight_line_episode
        else:
            success_rate_no_straight_line = 0
        ratio_straight_lines = straight_line_episode / self._test_episodes

        return avg_test_return, success_rate, ratio_straight_lines, \
            success_rate_straight_line, success_rate_no_straight_line

    def _save_traj_separately(self, prefix):
        """Saves the test trajectories into separate folders under the logdir
        based on the ending of the trajectory.
        """
""" last_reward = self.trajectory[-1]['reward'] if last_reward == self._env.goal_reward: log_dir = os.path.join(self._logdir, 'successful_trajs') elif last_reward == self._env.collision_reward: log_dir = os.path.join(self._logdir, 'unsuccessful_trajs') else: log_dir = os.path.join(self._logdir, 'unfinished_trajs') file_name = os.path.join(log_dir, prefix + '.pkl') joblib.dump(self.trajectory, file_name) def _set_from_params(self): # experiment settings self._max_steps = self._params["trainer"]["max_steps"] self._episode_max_steps = self._params["trainer"]["episode_max_steps"] \ if self._params["trainer"]["episode_max_steps"] is not None \ else self._params["trainer"]["max_steps"] self._n_experiments = self._params["trainer"]["n_experiments"] self._show_progress = self._params["trainer"]["show_progress"] self._show_progress_interval = self._params["trainer"]["show_progress_interval"] self._save_model_interval = self._params["trainer"]["save_model_interval"] self._save_summary_interval = self._params["trainer"]["save_summary_interval"] self._normalize_obs = self._params["trainer"]["normalize_obs"] self._logdir = self._params["trainer"]["logdir"] self._model_dir = self._params["trainer"]["model_dir"] # replay buffer self._use_prioritized_rb = self._params["trainer"]["use_prioritized_rb"] self._use_nstep_rb = self._params["trainer"]["use_nstep_rb"] self._n_step = self._params["trainer"]["n_step"] # test settings self._test_interval = self._params["trainer"]["test_interval"] self._show_test_progress = self._params["trainer"]["show_test_progress"] self._test_episodes = self._params["trainer"]["test_episodes"] self._save_test_path = self._params["trainer"]["save_test_path"] self._save_test_path_sep = self._params["trainer"]["save_test_path_sep"] self._save_test_movie = self._params["trainer"]["save_test_movie"] self._show_test_images = self._params["trainer"]["show_test_images"] def _get_args_from_params(self): """creates an argparse Namespace object from params for the tf2rl based classes.""" args = {} for key in self._params["trainer"]: args[key] = self._params["trainer"][key] return argparse.Namespace(**args)