def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

            self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
            self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
            self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                  self.n_envs, 1, n_batch_step, reuse=False)
            self.model2 = train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                    self.n_envs, self.n_steps, n_batch_train, reuse=True)

            self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])
            logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy,
                                                                    labels=action_ph)
            self.logits = train_model.policy

            # training loss
            pg_loss = tf.reduce_mean(advs_ph * logpac)
            self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
            self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
            self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
            train_loss = pg_loss + self.vf_coef * vf_loss

            # Fisher loss construction
            self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
            sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
            self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
            self.joint_fisher = pg_fisher_loss + vf_fisher_loss

            self.params = params = find_trainable_variables("model")
            self.grads_check = tf.gradients(train_loss, params)

            with tf.device('/gpu:0'):
                # note: `async=1` in the original is a syntax error on Python >= 3.7
                # (`async` is a reserved keyword); renamed to `async_eigen_decomp`
                # to match the keyword used elsewhere in this module
                self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=self.kfac_clip,
                                                        momentum=0.9, kfac_update=1, epsilon=0.01,
                                                        stats_decay=0.99, async_eigen_decomp=1,
                                                        cold_iter=10, max_grad_norm=self.max_grad_norm,
                                                        verbose=self.verbose)
                optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)
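
# --- Illustrative sketch (added here, not part of the original source) ---
# The vf_fisher term above models the value head as a unit-variance Gaussian
# and regresses it onto a *sampled* target (prediction + N(0, 1) noise), so
# the gradient covariance matches the model's Fisher information rather than
# the Gauss-Newton curvature on the empirical returns. A minimal NumPy
# analogue of that loss (all names below are illustrative only):
import numpy as np

values = np.array([0.5, -1.2, 2.0])                # value-head predictions
sampled = values + np.random.randn(*values.shape)  # plays the role of tf.stop_gradient(sample_net)
vf_fisher_sketch = -np.mean((values - sampled) ** 2)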
def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount factor
    :param lam: (float) the GAE factor, trading off bias against variance in the advantage estimate
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) whether to render the environment
    :param callback: (function) called every step, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence used to adapt the step size
    """
    obfilter = ZFilter(env.observation_space.shape)
    max_pathlength = env.spec.timestep_limit

    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99,
                               async_eigen_decomp=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(queue_runner.create_threads(tf.get_default_session(),
                                                           coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback is not None:
            # Only stop training if return value is False, not when it is None. This is for backwards
            # compatibility with callbacks that have no return statement.
            if callback(locals(), globals()) == False:
                break
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
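
# --- Illustrative sketch (added here, not part of the original source) ---
# `common.discount(x, gamma)` above is assumed to compute the discounted
# cumulative sum y[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...,
# which is how both the value targets and the GAE advantages are built.
# A minimal reference implementation using scipy (the function name is
# illustrative only):
import numpy as np
import scipy.signal

def discount_sketch(vector, gamma):
    """Right-to-left discounted cumulative sum over a 1-D array."""
    return scipy.signal.lfilter([1], [1, -gamma], vector[::-1])[::-1]

# e.g. discount_sketch(np.array([1.0, 1.0, 1.0]), 0.9) -> [2.71, 1.9, 1.0]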
def setup_model(self):
    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), \
            "Error: the input policy for the ACKTR model must be " \
            "an instance of common.policies.ActorCriticPolicy."

        if isinstance(self.action_space, Box):
            raise NotImplementedError("WIP: ACKTR does not support Continuous actions yet.")

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, RecurrentActorCriticPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                  self.n_envs, 1, n_batch_step, reuse=False,
                                                  **self.policy_kwargs)

            self.params = params = tf_util.get_trainable_vars("model")

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                self.model2 = train_model = self.policy(self.sess, self.observation_space,
                                                        self.action_space, self.n_envs,
                                                        self.n_steps, n_batch_train,
                                                        reuse=True, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False,
                                   custom_getter=tf_util.outer_scope_getter("loss")):
                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
                self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])
                self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])

                logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy,
                                                                        labels=action_ph)
                self.logits = train_model.policy

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * logpac)
                self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', pg_loss)
                tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss)
                tf.summary.scalar('loss', train_loss)

                self.grads_check = tf.gradients(train_loss, params)

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.pg_lr_ph))
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.histogram('learning_rate', self.pg_lr_ph)
                    tf.summary.histogram('advantage', self.advs_ph)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

            with tf.variable_scope("kfac", reuse=False,
                                   custom_getter=tf_util.outer_scope_getter("kfac")):
                with tf.device('/gpu:0'):
                    self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph,
                                                            clip_kl=self.kfac_clip,
                                                            momentum=0.9, kfac_update=1,
                                                            epsilon=0.01, stats_decay=0.99,
                                                            async_eigen_decomp=self.async_eigen_decomp,
                                                            cold_iter=10,
                                                            max_grad_norm=self.max_grad_norm,
                                                            verbose=self.verbose)
                    optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()
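
# --- Illustrative usage sketch (added here, not part of the original source) ---
# Assuming the surrounding class is stable_baselines' discrete-action ACKTR,
# setup_model() is invoked for you during construction; a typical run looks like:
from stable_baselines import ACKTR

model = ACKTR('MlpPolicy', 'CartPole-v1', verbose=1)
model.learn(total_timesteps=25000)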
def __init__(self, ob_dim, ac_dim, verbose=1):
    """
    Create an MLP value function

    :param ob_dim: (int) Observation dimension
    :param ac_dim: (int) Action dimension
    :param verbose: (int) verbosity level
    """
    obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')

    wd_dict = {}
    layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", weight_init=tf_util.normc_initializer(1.0),
                              bias_init=0, weight_loss_dict=wd_dict))
    layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", weight_init=tf_util.normc_initializer(1.0),
                              bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(layer_2, 1, "hfinal", weight_init=tf_util.normc_initializer(1.0),
                    bias_init=0, weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))

    wd_loss = tf.get_collection("vf_losses", None)
    loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))

    self._predict = tf_util.function([obs_ph], vpred_n)

    # note: `async=1` in the original is a syntax error on Python >= 3.7
    # (`async` is a reserved keyword); renamed to `async_eigen_decomp`
    # to match the keyword used elsewhere in this module
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async_eigen_decomp=1, kfac_update=2, cold_iter=50,
                               weight_decay_dict=wd_dict, max_grad_norm=None, verbose=verbose)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)

    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
    tf_util.initialize()  # Initialize uninitialized TF variables
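
# --- Illustrative usage sketch (added here, not part of the original source) ---
# Assuming this __init__ belongs to the neural-network value function used by
# the continuous-action learn() above (the class name below is hypothetical),
# it is fitted on whole rollout paths and queried per path:
#
#   vf = NeuralNetValueFunction(ob_dim=env.observation_space.shape[0],
#                               ac_dim=env.action_space.shape[0])
#   vf.fit(paths, vtargs)        # regress toward the discounted returns
#   baseline = vf.predict(path)  # per-timestep value estimates for GAE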