def _prepare_networks(self, hparams, sess):
  """Build simulate, reset and policy ops for a single-agent environment."""
  self.action = tf.placeholder(shape=(1,), dtype=tf.int32)
  batch_env = batch_env_factory(hparams)
  self.reward, self.done = batch_env.simulate(self.action)
  self.observation = batch_env.observ
  self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

  environment_wrappers = hparams.environment_spec.wrappers
  wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

  # Apply each wrapper in order; every layer needs its own initialization.
  to_initialize = [batch_env]
  for w in wrappers:
    batch_env = w[0](batch_env, **w[1])
    to_initialize.append(batch_env)

  def initialization_lambda():
    for env in to_initialize:
      env.initialize(sess)

  self.initialize = initialization_lambda

  # Adding zero forces a copy of the observation tensor.
  obs_copy = batch_env.observ + 0

  actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
  self.policy_probs = actor_critic.policy.probs[0, 0, :]
  self.value = actor_critic.value[0, :]
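# Illustrative sketch (not from the original source): the wrapper spec
# consumed above is assumed to be a list of (wrapper_class, kwargs) pairs,
# applied in order around the base batch env. The classes below are
# hypothetical stand-ins that show how such a chain composes.
class _BaseEnv(object):
  name = "base"

class _TagWrapper(object):
  def __init__(self, env, tag):
    self.name = env.name + "+" + tag

env = _BaseEnv()
for wrapper_cls, kwargs in [(_TagWrapper, {"tag": "stack"}),
                            (_TagWrapper, {"tag": "norm"})]:
  env = wrapper_cls(env, **kwargs)
assert env.name == "base+stack+norm"  # the outermost wrapper is applied last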
def env_step(arg1, arg2):  # pylint: disable=unused-argument
  """Step of the environment."""
  actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
  policy = actor_critic.policy
  if policy_to_actions_lambda:
    action = policy_to_actions_lambda(policy)
  else:
    # Greedy (mode) action during evaluation, sampling during training.
    action = tf.cond(eval_phase, policy.mode, policy.sample)

  postprocessed_action = actor_critic.action_postprocessing(action)
  simulate_output = batch_env.simulate(postprocessed_action[0, ...])

  pdf = policy.prob(action)[0]
  value_function = actor_critic.value[0]
  pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
  value_function = tf.reshape(value_function, shape=(hparams.num_agents,))

  # Ensure the environment actually steps before pdf/value are returned.
  with tf.control_dependencies(simulate_output):
    return tf.identity(pdf), tf.identity(value_function)
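# Illustrative sketch (assumes the policy is a tfp.distributions.Categorical,
# one plausible discrete-action case; `logits` is made up). It shows the
# eval/train switch used above: tf.cond calls policy.mode (greedy argmax)
# during evaluation and policy.sample (stochastic, exploratory) in training.
import tensorflow as tf
import tensorflow_probability as tfp

logits = tf.constant([[2.0, 0.5, 0.1]])
policy = tfp.distributions.Categorical(logits=logits)
eval_phase = tf.placeholder(tf.bool, shape=())
action = tf.cond(eval_phase, policy.mode, policy.sample)

with tf.Session() as sess:
  greedy = sess.run(action, {eval_phase: True})    # always the argmax, i.e. 0
  sampled = sess.run(action, {eval_phase: False})  # a random draw from logits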
def define_ppo_step(data_points, optimizer, hparams):
  """Define a single PPO optimization step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points

  new_policy_dist, new_value, _ = get_policy(observation, hparams)
  new_pdf = new_policy_dist.prob(action)

  # Clipped surrogate objective from the PPO paper:
  # min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A).
  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)
  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  value_error = new_value - discounted_reward
  value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]

  # Compute gradients per loss so their norms can be reported separately.
  gradients = [list(zip(*optimizer.compute_gradients(loss)))
               for loss in losses]
  gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

  gradients_flat = sum([gradient[0] for gradient in gradients], ())
  gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

  if hparams.max_gradients_norm:
    gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                               hparams.max_gradients_norm)

  optimize_op = optimizer.apply_gradients(
      zip(gradients_flat, gradients_variables_flat))

  with tf.control_dependencies([optimize_op]):
    return [tf.identity(x) for x in losses + gradients_norms]
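# Illustrative numeric sketch (NumPy, not from the original file) of the
# clipped surrogate defined above. With clipping_coef = 0.2 the objective
# stops rewarding probability-ratio moves outside [0.8, 1.2], while the
# tf.minimum keeps the pessimistic (unclipped) bound for harmful updates.
import numpy as np

def clipped_surrogate(ratio, advantage, clipping_coef=0.2):
  clipped = np.clip(ratio, 1 - clipping_coef, 1 + clipping_coef)
  return np.minimum(clipped * advantage, ratio * advantage)

print(clipped_surrogate(1.5, advantage=1.0))   # 1.2: the gain is capped
print(clipped_surrogate(1.5, advantage=-1.0))  # -1.5: the penalty is not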