def __init__(self, policy, env, args, test_env=None):
    self._policy = policy
    self._env = env
    self._test_env = self._env if test_env is None else test_env
    self._set_from_args(args)

    # prepare log directory
    self._output_dir = prepare_output_dir(
        args=args, user_specified_dir="./results",
        suffix="{}_{}".format(self._policy.policy_name, args.dir_suffix))
    self.logger = initialize_logger(
        logging_level=logging.getLevelName(args.logging_level),
        output_dir=self._output_dir)

    # Save and restore model
    checkpoint = tf.train.Checkpoint(policy=self._policy)
    self.checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, directory=self._output_dir, max_to_keep=5)
    if args.model_dir is not None:
        assert os.path.isdir(args.model_dir)
        path_ckpt = tf.train.latest_checkpoint(args.model_dir)
        checkpoint.restore(path_ckpt)
        self.logger.info("Restored {}".format(path_ckpt))

    # prepare TensorBoard output
    self.writer = tf.summary.create_file_writer(self._output_dir)
    self.writer.set_as_default()
def __init__(self, policy, env, args, test_env=None):
    self._set_from_args(args)
    self._policy = policy
    self._env = env
    self._test_env = self._env if test_env is None else test_env
    if self._normalize_obs:
        assert isinstance(env.observation_space, Box)
        self._obs_normalizer = EmpiricalNormalizer(
            shape=env.observation_space.shape)

    # prepare log directory
    self._output_dir = prepare_output_dir(
        args=args, user_specified_dir=self._logdir,
        suffix="{}_{}".format(self._policy.policy_name, args.dir_suffix))
    self.logger = initialize_logger(
        logging_level=logging.getLevelName(args.logging_level),
        output_dir=self._output_dir)

    if args.evaluate:
        assert args.model_dir is not None
        self._set_check_point(args.model_dir)

    # prepare TensorBoard output
    self.writer = tf.summary.create_file_writer(self._output_dir)
    self.writer.set_as_default()
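# Hedged sketch: `EmpiricalNormalizer` is not defined in this file. A minimal
# running mean/std normalizer with the same constructor signature might look
# like the class below; the name and the Welford-style update rule are
# assumptions for illustration, not tf2rl's exact implementation.
import numpy as np

class RunningObsNormalizer:
    """Tracks an online mean/variance and whitens observations."""

    def __init__(self, shape, eps=1e-8):
        self._mean = np.zeros(shape, dtype=np.float64)
        self._var = np.ones(shape, dtype=np.float64)
        self._count = eps  # avoids division by zero before the first update

    def observe(self, obs):
        # Welford-style incremental update for a single observation.
        self._count += 1
        delta = obs - self._mean
        self._mean += delta / self._count
        self._var += (delta * (obs - self._mean) - self._var) / self._count

    def __call__(self, obs):
        return (obs - self._mean) / np.sqrt(self._var + 1e-8)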
def __init__(self, policy, env, params, test_env=None):
    """Initialize the training instance."""
    self._params = params
    self._set_from_params()
    self._policy = policy
    self._env = env
    self._test_env = self._env if test_env is None else test_env
    args = self._get_args_from_params()

    # Convolutional Autoencoder:
    self._CAE = CAE(pooling=self._params["cae"]["pooling"],
                    latent_dim=self._params["cae"]["latent_dim"],
                    input_shape=self._env.workspace.shape,
                    conv_filters=self._params["cae"]["conv_filters"])
    self._CAE.build(input_shape=(1, self._env.workspace.shape[0],
                                 self._env.workspace.shape[1], 1))
    self._CAE.load_weights(filepath=self._params["cae"]["weights_path"])
    # Freeze the autoencoder so only the policy is trained:
    for layer, _ in self._CAE._get_trainable_state().items():
        layer.trainable = False

    # Initialize list for trajectory storage
    self.trajectory = []

    # Initialize workspace relabeler:
    self._relabeler = PointrobotRelabeler(
        ws_shape=(self._env.grid_size, self._env.grid_size),
        mode=params["trainer"]["relabeling_mode"],
        remove_zigzaging=params["trainer"]["remove_zigzaging"])

    # prepare log directory
    self._output_dir = prepare_output_dir(
        args=args, user_specified_dir=self._logdir,
        suffix="{}_{}".format(self._policy.policy_name,
                              params["trainer"]["dir_suffix"]))
    self.logger = initialize_logger(
        logging_level=logging.getLevelName(params["trainer"]["logging_level"]),
        output_dir=self._output_dir)

    if self._save_test_path_sep:
        sep_logdirs = ['successful_trajs', 'unsuccessful_trajs', 'unfinished_trajs']
        for logdir in sep_logdirs:
            if not os.path.exists(os.path.join(self._logdir, logdir)):
                os.makedirs(os.path.join(self._logdir, logdir))

    if params["trainer"]["mode"] == "evaluate":
        assert glob.glob(os.path.join(params["trainer"]["model_dir"], '*'))
        self._set_check_point(params["trainer"]["model_dir"])

    # prepare TensorBoard output
    self.writer = tf.summary.create_file_writer(self._output_dir)
    self.writer.set_as_default()

    # relabeling visualization:
    self._relabel_fig = plt.figure(2)
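# Design note on the freezing loop above: `Model._get_trainable_state()` is a
# private Keras API. A public equivalent for freezing every CAE weight would be
# `self._CAE.trainable = False` (or iterating over `self._CAE.layers`); the
# private call is kept in the constructor to preserve the original behavior.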
def __init__(self, policy, env, args, test_env=None): """ Initialize Trainer class Args: policy: Policy to be trained env (gym.Env): Environment for train args (Namespace or dict): config parameters specified with command line test_env (gym.Env): Environment for test. """ if isinstance(args, dict): _args = args args = policy.__class__.get_argument(Trainer.get_argument()) args = args.parse_args([]) for k, v in _args.items(): if hasattr(args, k): setattr(args, k, v) else: raise ValueError(f"{k} is invalid parameter.") self._set_from_args(args) self._policy = policy self._env = env self._test_env = self._env if test_env is None else test_env if self._normalize_obs: assert isinstance(env.observation_space, Box) self._obs_normalizer = EmpiricalNormalizer( shape=env.observation_space.shape) # prepare log directory self._output_dir = prepare_output_dir(args=args, user_specified_dir=self._logdir, suffix="{}_{}".format( self._policy.policy_name, args.dir_suffix)) self.logger = initialize_logger(logging_level=logging.getLevelName( args.logging_level), output_dir=self._output_dir) if args.evaluate: assert args.model_dir is not None self._set_check_point(args.model_dir) # prepare TensorBoard output self.writer = tf.summary.create_file_writer(self._output_dir) self.writer.set_as_default()
def __init__(self, policy, env, args, test_env=None):
    self._policy = policy
    self._env = env
    self._test_env = self._env if test_env is None else test_env
    self._set_from_args(args)

    # prepare log directory
    self._output_dir = prepare_output_dir(
        args=args, user_specified_dir="./results")
    logging.basicConfig(level=logging.getLevelName(args.logging_level))
    self.logger = logging.getLogger(__name__)

    # Save and restore model
    # NOTE: tf.contrib was removed in TF2; this variant targets TF 1.x eager
    # mode (the TF2 equivalents are tf.train.CheckpointManager / tf.summary).
    self.checkpoint_manager = tf.contrib.checkpoint.CheckpointManager(
        tf.train.Checkpoint(policy=self._policy),
        directory=self._output_dir, max_to_keep=5)

    # prepare TensorBoard output
    self.writer = tf.contrib.summary.create_file_writer(self._output_dir)
    self.writer.set_as_default()
    tf.contrib.summary.initialize()
def __init__(self, policy, env, args, test_env=None):
    self._set_from_args(args)
    self._policy = policy
    self._env = env
    self._test_env = self._env if test_env is None else test_env
    if self._normalize_obs:
        assert isinstance(env.observation_space, Box)
        self._obs_normalizer = EmpiricalNormalizer(
            shape=env.observation_space.shape)

    # prepare log directory
    self._output_dir = prepare_output_dir(
        args=args, user_specified_dir=self._logdir,
        suffix="{}_{}".format(self._policy.policy_name, args.dir_suffix))
    self.logger = initialize_logger(
        logging_level=logging.getLevelName(args.logging_level),
        output_dir=self._output_dir)

    # Save and restore model
    self._checkpoint = tf.train.Checkpoint(policy=self._policy)
    self.checkpoint_manager = tf.train.CheckpointManager(
        self._checkpoint, directory=self._output_dir, max_to_keep=5)
    if args.evaluate:
        assert args.model_dir is not None
    if args.model_dir is not None:
        assert os.path.isdir(args.model_dir)
        self._latest_path_ckpt = tf.train.latest_checkpoint(args.model_dir)
        self._checkpoint.restore(self._latest_path_ckpt)
        self.logger.info("Restored {}".format(self._latest_path_ckpt))

    # prepare TensorBoard output
    self.writer = tf.summary.create_file_writer(self._output_dir)
    self.writer.set_as_default()
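# Minimal, self-contained sketch of the save/restore round trip used above.
# This is the standard tf.train.Checkpoint / CheckpointManager API with no
# tf2rl specifics; the Dense layer and the /tmp directory are placeholders.
import tensorflow as tf

net = tf.keras.layers.Dense(4)
net(tf.zeros((1, 8)))  # build the variables before checkpointing
ckpt = tf.train.Checkpoint(policy=net)
manager = tf.train.CheckpointManager(
    ckpt, directory="/tmp/ckpt_demo", max_to_keep=5)
save_path = manager.save()  # e.g. "/tmp/ckpt_demo/ckpt-1"

# Later (possibly in another process): restore the latest checkpoint.
latest = tf.train.latest_checkpoint("/tmp/ckpt_demo")
tf.train.Checkpoint(policy=net).restore(latest)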
    opt.apply_gradients(zip(grads, actor.trainable_weights))
    return loss

print('Train...')
losses = []  # Keep track of the losses over time.
for epoch in range(5):
    # Iterate over the batches of a dataset.
    for step, x in enumerate(train_dataset):
        loss = training_step(x)
        # Logging.
        losses.append(float(loss))
        if step % 100 == 0:
            print("Epoch", epoch, "Step:", step,
                  "Loss:", sum(losses) / len(losses))
        # Stop after 1000 steps.
        # Training the model to convergence is left
        # as an exercise to the reader.
        # if step >= 1000:
        #     break

# NOTE: `args` must be in scope here for `args.dir_suffix`; otherwise drop
# the suffix argument (prepare_output_dir itself accepts args=None).
output_dir = prepare_output_dir(
    args=None, user_specified_dir=None,
    suffix="{}_{}".format(policy.policy_name, args.dir_suffix))
checkpoint = tf.train.Checkpoint(policy=policy)
checkpoint_manager = tf.train.CheckpointManager(
    checkpoint, directory=output_dir, max_to_keep=5)
def evaluator(is_training_done, env, policy_fn, set_weights_fn, queue, gpu,
              save_model_interval=int(1e6), n_evaluation=10,
              episode_max_steps=1000, show_test_progress=False):
    """
    Evaluate trained network weights periodically.

    :param is_training_done (multiprocessing.Event): multiprocessing.Event
        object to share the status of training.
    :param env (Gym environment): Environment object.
    :param policy_fn (function): Method object to generate an explorer.
    :param set_weights_fn (function): Method object to set network weights
        taken from the queue.
    :param queue (multiprocessing.Queue): A FIFO shared with the learner to
        get the latest network weights. It is process safe, so no lock is
        needed when using it.
    :param gpu (int): GPU id. If this is set to -1, this process uses only the CPU.
    :param save_model_interval (int): Interval at which to save the model.
    :param n_evaluation (int): Number of episodes to evaluate.
    :param episode_max_steps (int): Maximum number of steps per episode.
    :param show_test_progress (bool): If true, `render` is called to
        visualize the evaluation process.
    """
    tf = import_tf()
    logger = logging.getLogger("tf2rl")

    output_dir = prepare_output_dir(
        args=None, user_specified_dir="./results", suffix="evaluator")
    writer = tf.summary.create_file_writer(
        output_dir, filename_suffix="_evaluation")
    writer.set_as_default()

    policy = policy_fn(env, "Learner", gpu=gpu)
    model_save_threshold = save_model_interval
    checkpoint = tf.train.Checkpoint(policy=policy)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, directory=output_dir, max_to_keep=10)

    while not is_training_done.is_set():
        n_evaluated_episode = 0
        # Wait until new weights arrive
        if queue.empty():
            continue
        else:
            set_weights_fn(policy, queue.get())
            trained_steps = queue.get()
            tf.summary.experimental.set_step(trained_steps)
            avg_test_return = 0.
            for _ in range(n_evaluation):
                n_evaluated_episode += 1
                episode_return = 0.
                obs = env.reset()
                done = False
                for _ in range(episode_max_steps):
                    action = policy.get_action(obs, test=True)
                    next_obs, reward, done, _ = env.step(action)
                    if show_test_progress:
                        env.render()
                    episode_return += reward
                    obs = next_obs
                    if done:
                        break
                avg_test_return += episode_return
                # Break if new weights have arrived
                if not queue.empty():
                    break
            avg_test_return /= n_evaluated_episode
            logger.info("Evaluation: {} over {} runs".format(
                avg_test_return, n_evaluated_episode))
            tf.summary.scalar(
                name="apex/average_test_return", data=avg_test_return)
            writer.flush()
            if trained_steps > model_save_threshold:
                model_save_threshold += save_model_interval
                checkpoint_manager.save()
    # Save the final weights once training is done
    checkpoint_manager.save()
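# Hedged wiring sketch for `evaluator`: the process setup below is an
# assumption based on the signature above; `make_policy` and `set_weights`
# are hypothetical helpers standing in for the real policy_fn/set_weights_fn.
import multiprocessing as mp
import gym

if __name__ == "__main__":
    is_training_done = mp.Event()
    eval_queue = mp.Queue()
    p = mp.Process(
        target=evaluator,
        kwargs=dict(is_training_done=is_training_done,
                    env=gym.make("Pendulum-v0"),
                    policy_fn=make_policy,        # hypothetical helper
                    set_weights_fn=set_weights,   # hypothetical helper
                    queue=eval_queue, gpu=-1))
    p.start()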
def learner(global_rb, trained_steps, is_training_done, lock, env, policy_fn,
            get_weights_fn, n_training, update_freq, evaluation_freq, gpu,
            queues):
    """
    Update network weights using samples collected by explorers.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared with multiple explorers and only one
        learner. This object is shared over processes, so it must be locked
        via the `lock` object when operating on it.
    :param trained_steps (multiprocessing.Value): Number of times gradients
        have been applied.
    :param is_training_done (multiprocessing.Event): multiprocessing.Event
        object to share the status of training.
    :param lock (multiprocessing.Lock): multiprocessing.Lock to lock other processes.
    :param env (Gym environment): Environment object.
    :param policy_fn (function): Method object to generate an explorer.
    :param get_weights_fn (function): Method object to get network weights and
        put them to the queue.
    :param n_training (int): Maximum number of times to apply gradients. Once
        this many gradient steps have been applied, training finishes by
        setting `is_training_done` to `True`.
    :param update_freq (int): Frequency to update parameters, i.e., to put
        network parameters to `queues`.
    :param evaluation_freq (int): Frequency to call `evaluator`.
    :param gpu (int): GPU id. If this is set to -1, this process uses only the CPU.
    :param queues (List): List of Queues shared with explorers to send the
        latest network parameters.
    """
    tf = import_tf()
    logger = logging.getLogger("tf2rl")

    policy = policy_fn(env, "Learner", global_rb.get_buffer_size(), gpu=gpu)
    output_dir = prepare_output_dir(
        args=None, user_specified_dir="./results", suffix="learner")
    writer = tf.summary.create_file_writer(output_dir)
    writer.set_as_default()

    # Wait until explorers collect transitions
    while not is_training_done.is_set() and \
            global_rb.get_stored_size() < policy.n_warmup:
        continue

    start_time = time.time()
    while not is_training_done.is_set():
        trained_steps.value += 1
        tf.summary.experimental.set_step(trained_steps.value)
        lock.acquire()
        samples = global_rb.sample(policy.batch_size)
        lock.release()
        td_errors = policy.train(
            samples["obs"], samples["act"], samples["next_obs"],
            samples["rew"], samples["done"], samples["weights"])
        writer.flush()

        lock.acquire()
        global_rb.update_priorities(
            samples["indexes"], np.abs(td_errors) + 1e-6)
        lock.release()

        # Put updated weights to queue
        if trained_steps.value % update_freq == 0:
            weights = get_weights_fn(policy)
            for i in range(len(queues) - 1):
                queues[i].put(weights)
            fps = update_freq / (time.time() - start_time)
            tf.summary.scalar(name="apex/fps", data=fps)
            logger.info(
                "Update weights. {0:.2f} FPS for GRAD. Learned {1:.2f} steps".format(
                    fps, trained_steps.value))
            start_time = time.time()

        # Periodically do evaluation
        if trained_steps.value % evaluation_freq == 0:
            queues[-1].put(get_weights_fn(policy))
            queues[-1].put(trained_steps.value)

        if trained_steps.value >= n_training:
            is_training_done.set()
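# Hedged sketch of the queue layout `learner` expects: one Queue per explorer,
# with the evaluator's Queue appended last, so that `queues[:-1]` feeds the
# explorers and `queues[-1]` feeds the evaluator. The explorer count is
# illustrative.
import multiprocessing as mp

n_explorers = 4
explorer_queues = [mp.Queue() for _ in range(n_explorers)]
evaluator_queue = mp.Queue()
queues = explorer_queues + [evaluator_queue]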
def learner(global_rb, trained_steps, is_training_done, lock, env_fn,
            policy_fn, n_training, update_freq, *queues):
    """
    Update network weights using samples collected by explorers.

    Args:
        global_rb: Prioritized replay buffer shared with multiple explorers
            and only one learner. This object is shared over processes, so it
            must be locked via the `lock` object when operating on it.
        trained_steps: Number of times gradients have been applied.
        is_training_done: multiprocessing.Event object to share whether
            training is done or not.
        lock: multiprocessing.Lock to lock other processes. It must be
            released after the critical section is done.
        env_fn: Method object to generate an environment.
        policy_fn: Method object to generate an explorer.
        n_training: Maximum number of times to apply gradients. Once this
            many gradient steps have been applied, training finishes by
            setting `is_training_done` to `True`.
        update_freq: Frequency to update parameters, i.e., to put network
            parameters to `queues`.
        queues: FIFOs shared with explorers to send the latest network
            parameters.
    """
    # NOTE: this variant targets TensorFlow 1.x eager mode; `tf.contrib` was
    # removed in TF2 (use tf.summary / tf.train.CheckpointManager there).
    env = env_fn()
    policy = policy_fn(env, "Learner", global_rb.get_buffer_size())
    update_step = update_freq
    output_dir = prepare_output_dir(args=None, user_specified_dir="./results")
    writer = tf.contrib.summary.create_file_writer(output_dir)
    writer.set_as_default()
    tf.contrib.summary.initialize()
    total_steps = tf.train.create_global_step()

    # Wait until explorers collect transitions
    while not is_training_done.is_set() and global_rb.get_stored_size() == 0:
        continue

    start_time = time.time()
    while not is_training_done.is_set():
        with tf.contrib.summary.record_summaries_every_n_global_steps(1000):
            trained_steps.value += 1
            total_steps.assign(trained_steps.value)
            lock.acquire()
            samples = global_rb.sample(policy.batch_size)
            with tf.contrib.summary.always_record_summaries():
                td_error = policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"],
                    np.array(samples["done"], dtype=np.float64),
                    samples["weights"])
                writer.flush()
            global_rb.update_priorities(
                samples["indexes"], np.abs(td_error) + 1e-6)
            lock.release()

            # Put updated weights to queue
            if trained_steps.value > update_step:
                weights = []
                weights.append(policy.actor.weights)
                weights.append(policy.critic.weights)
                weights.append(policy.critic_target.weights)
                for queue in queues:
                    queue.put(weights)
                update_step += update_freq
                with tf.contrib.summary.always_record_summaries():
                    fps = update_freq / (time.time() - start_time)
                    tf.contrib.summary.scalar(
                        name="FPS", tensor=fps, family="loss")
                print("Update weights for explorer. {0:.2f} FPS for GRAD. "
                      "Learned {1:.2f} steps".format(fps, trained_steps.value))
                start_time = time.time()

            if trained_steps.value >= n_training:
                is_training_done.set()