def __init__(self, batch_env, step, is_training, should_log, config): """Create an instance of the PPO algorithm. Args: batch_env: In-graph batch environment. step: Integer tensor holding the current training step. is_training: Boolean tensor for whether the algorithm should train. should_log: Boolean tensor for whether summaries should be returned. config: Object containing the agent configuration as attributes. """ self._batch_env = batch_env self._step = step self._is_training = is_training self._should_log = should_log self._config = config self._observ_filter = normalize.StreamingNormalize( self._batch_env.observ[0], center=True, scale=True, clip=5, name='normalize_observ') self._reward_filter = normalize.StreamingNormalize( self._batch_env.reward[0], center=False, scale=True, clip=10, name='normalize_reward') # Memory stores tuple of observ, action, mean, logstd, reward. template = ( self._batch_env.observ[0], self._batch_env.action[0], self._batch_env.action[0], self._batch_env.action[0], self._batch_env.reward[0]) self._memory = memory.EpisodeMemory( template, config.update_every, config.max_length, 'memory') self._memory_index = tf.Variable(0, False) use_gpu = self._config.use_gpu and utility.available_gpus() with tf.device('/gpu:0' if use_gpu else '/cpu:0'): # Create network variables for later calls to reuse. self._network( tf.zeros_like(self._batch_env.observ)[:, None], tf.ones(len(self._batch_env)), reuse=None) cell = self._config.network(self._batch_env.action.shape[1].value) with tf.variable_scope('ppo_temporary'): self._episodes = memory.EpisodeMemory( template, len(batch_env), config.max_length, 'episodes') self._last_state = utility.create_nested_vars( cell.zero_state(len(batch_env), tf.float32)) self._last_action = tf.Variable( tf.zeros_like(self._batch_env.action), False, name='last_action') self._last_mean = tf.Variable( tf.zeros_like(self._batch_env.action), False, name='last_mean') self._last_logstd = tf.Variable( tf.zeros_like(self._batch_env.action), False, name='last_logstd') self._penalty = tf.Variable( self._config.kl_init_penalty, False, dtype=tf.float32) self._policy_optimizer = self._config.policy_optimizer( self._config.policy_lr, name='policy_optimizer') self._value_optimizer = self._config.value_optimizer( self._config.value_lr, name='value_optimizer')
def __init__(self, batch_env, step, is_training, should_log, config): """Create an instance of the PPO algorithm. Args: batch_env: In-graph batch environment. step: Integer tensor holding the current training step. is_training: Boolean tensor for whether the algorithm should train. should_log: Boolean tensor for whether summaries should be returned. config: Object containing the agent configuration as attributes. """ self._batch_env = batch_env self._step = step self._is_training = is_training self._should_log = should_log self._config = config self._observ_filter = normalize.StreamingNormalize( self._batch_env.observ[0], center=True, scale=True, clip=5, name='normalize_observ') self._reward_filter = normalize.StreamingNormalize( self._batch_env.reward[0], center=False, scale=True, clip=10, name='normalize_reward') # Memory stores tuple of observ, action, mean, logstd, reward. template = (self._batch_env.observ[0], self._batch_env.action[0], self._batch_env.action[0], self._batch_env.action[0], self._batch_env.reward[0]) self._memory = memory.EpisodeMemory(template, config.update_every, config.max_length, 'memory') self._memory_index = tf.Variable(0, False) use_gpu = self._config.use_gpu and utility.available_gpus() with tf.device('/gpu:0' if use_gpu else '/cpu:0'): # Create network variables for later calls to reuse. action_size = self._batch_env.action.shape[1].value self._network = tf.make_template( 'network', functools.partial(config.network, config, action_size)) output = self._network( tf.zeros_like(self._batch_env.observ)[:, None], tf.ones(len(self._batch_env))) with tf.variable_scope('ppo_temporary'): self._episodes = memory.EpisodeMemory(template, len(batch_env), config.max_length, 'episodes') if output.state is None: self._last_state = None else: # Ensure the batch dimension is set. tf.contrib.framework.nest.map_structure( lambda x: x.set_shape([len(batch_env)] + x.shape. as_list()[1:]), output.state) # pylint: disable=undefined-variable self._last_state = tf.contrib.framework.nest.map_structure( lambda x: tf.Variable(lambda: tf.zeros_like(x), False), output.state) self._last_action = tf.Variable(tf.zeros_like( self._batch_env.action), False, name='last_action') self._last_mean = tf.Variable(tf.zeros_like( self._batch_env.action), False, name='last_mean') self._last_logstd = tf.Variable(tf.zeros_like( self._batch_env.action), False, name='last_logstd') self._penalty = tf.Variable(self._config.kl_init_penalty, False, dtype=tf.float32) self._optimizer = self._config.optimizer(self._config.learning_rate)