def _init_training(self, env, policy, pool): """Method to be called at the start of training. :param env: Environment instance. :param policy: Policy instance. :return: None """ self._env = env if self._eval_n_episodes > 0: self._eval_env = deep_clone(env) self._policy = policy self._pool = pool
def _init_training(self, env, policy, pool):
    """Method to be called at the start of training.

    Stores the environment, policy and replay pool on the instance and,
    when evaluation is enabled, builds a cloned evaluation environment.

    :param env: Environment instance.
    :param policy: Policy instance.
    :param pool: Replay pool instance.
    :return: None
    """
    self._env = env
    if self._eval_n_episodes > 0:
        # TODO: This is horrible. Don't do this. Get rid of this.
        # NOTE(review): local import plus a TF1.x variable scope with
        # reuse=True — presumably so that cloning the environment reuses
        # the already-created "low_level_policy" variables instead of
        # creating duplicates in the graph; confirm before removing.
        import tensorflow as tf
        with tf.variable_scope("low_level_policy", reuse=True):
            self._eval_env = deep_clone(env)
    self._policy = policy
    self._pool = pool
def train(self):
    """
    CG: the function that conducts ensemble training.

    Runs the main training/evaluation loop: samples transitions with a
    (possibly UCB-selected) actor-critic instance from the ensemble,
    trains every instance on shared batches, and periodically evaluates
    either the whole ensemble or the best-performing instance.

    :return: None
    """
    # Set up parameters for the training process.
    self._n_epochs = self._base_ac_params['n_epochs']
    self._epoch_length = self._base_ac_params['epoch_length']
    self._n_train_repeat = self._base_ac_params['n_train_repeat']
    self._n_initial_exploration_steps = self._base_ac_params[
        'n_initial_exploration_steps']
    self._eval_render = self._base_ac_params['eval_render']
    self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
    self._eval_deterministic = self._base_ac_params['eval_deterministic']

    # Set up the evaluation environment.
    # NOTE(review): reuse=True mirrors _init_training's TF1.x hack —
    # presumably to reuse existing "low_level_policy" variables during
    # the clone; confirm before changing.
    if self._eval_n_episodes > 0:
        with tf.variable_scope("low_level_policy", reuse=True):
            self._eval_env = deep_clone(self._env)

    # Set up the tensor flow session.
    self._sess = tf_utils.get_default_session()

    # Import required libraries for training.
    # NOTE(review): function-level imports; consider hoisting to module
    # scope when refactoring.
    import random
    import math
    import operator
    import numpy as np

    # Initialize the sampler with a randomly chosen AC instance.
    # NOTE(review): each entry of self._alg_instances appears to be a
    # mutable record [algorithm, smoothed_return, episode_count,
    # ucb_score] — verify against where the list is built.
    alg_ins = random.choice(self._alg_instances)
    self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

    # Perform the training/evaluation process.
    num_episode = 0.
    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                isEpisodeEnd = self._sampler.sample()

                # If an episode is ended, we need to update performance
                # statistics for each AC instance and pick randomly
                # another AC instance for next episode of exploration.
                if isEpisodeEnd:
                    num_episode = num_episode + 1.
                    # Exponential moving average of the episode return
                    # for the instance that just finished exploring.
                    alg_ins[1] = 0.9 * alg_ins[
                        1] + 0.1 * self._sampler._last_path_return
                    alg_ins[2] = alg_ins[2] + 1.

                    if self._use_ucb:
                        # Select an algorithm instance based on UCB:
                        # first try any instance that has not completed
                        # an episode yet; otherwise pick the one with
                        # the highest upper confidence bound.
                        selected = False
                        for ains in self._alg_instances:
                            if ains[2] < 1.:
                                alg_ins = ains
                                selected = True
                                break
                            else:
                                # Standard UCB1 exploration bonus.
                                ains[3] = ains[1] + math.sqrt(
                                    2.0 * math.log(num_episode) / ains[2])
                        if not selected:
                            alg_ins = max(self._alg_instances,
                                          key=operator.itemgetter(3))
                    else:
                        # Select an algorithm instance uniformly at
                        # random.
                        alg_ins = random.choice(self._alg_instances)

                    # Point the sampler at the newly selected policy.
                    self._sampler.set_policy(alg_ins[0].policy)

                if not self._sampler.batch_ready():
                    continue
                gt.stamp('sample')

                # Perform training over all AC instances on the same
                # shared replay batch.
                for i in range(self._n_train_repeat):
                    batch = self._sampler.random_batch()
                    for ains in self._alg_instances:
                        ains[0]._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=batch)
                gt.stamp('train')

            # Perform evaluation after one full epoch of training is
            # completed.
            # NOTE(review): this `continue` also skips the timing/log
            # dump below when evaluation is disabled — confirm this is
            # intended.
            if self._eval_n_episodes < 1:
                continue

            if self._evaluation_strategy == 'ensemble':
                # Use a whole ensemble of AC instances for evaluation.
                # NOTE(review): `self` is passed as the policy, so this
                # class presumably implements the policy interface —
                # verify.
                paths = rollouts(self._eval_env, self,
                                 self._sampler._max_path_length,
                                 self._eval_n_episodes)
            elif self._evaluation_strategy == 'best-policy':
                # Choose the AC instance with the highest observed
                # performance so far for evaluation.
                eval_alg_ins = max(self._alg_instances,
                                   key=operator.itemgetter(1))
                with eval_alg_ins[0].policy.deterministic(
                        self._eval_deterministic):
                    paths = rollouts(self._eval_env,
                                     eval_alg_ins[0].policy,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)
            else:
                paths = None

            if paths is not None:
                # Aggregate per-episode return and length statistics.
                total_returns = [path['rewards'].sum() for path in paths]
                episode_lengths = [len(p['rewards']) for p in paths]
                logger.record_tabular('return-average',
                                      np.mean(total_returns))
                logger.record_tabular('return-min', np.min(total_returns))
                logger.record_tabular('return-max', np.max(total_returns))
                logger.record_tabular('return-std', np.std(total_returns))
                logger.record_tabular('episode-length-avg',
                                      np.mean(episode_lengths))
                logger.record_tabular('episode-length-min',
                                      np.min(episode_lengths))
                logger.record_tabular('episode-length-max',
                                      np.max(episode_lengths))
                logger.record_tabular('episode-length-std',
                                      np.std(episode_lengths))

                self._eval_env.log_diagnostics(paths)
                if self._eval_render:
                    self._eval_env.render(paths)

            # Produce log info after each episode of training and
            # evaluation.
            # NOTE(review): the 'eval' stamp is recorded at the end of
            # the loop body, so no 'eval' interval exists on the first
            # pass — hence the epoch > 1 guard.
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self._sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        # Terminate the sampler after the training process is completed.
        self._sampler.terminate()