class TRPOModel(PGModel):
    default_config = TRPOModelConfig

    def __init__(self, config, scope, network_builder=None):
        super(TRPOModel, self).__init__(config, scope, network_builder=network_builder)

        # TRPO specific parameters
        self.cg_damping = self.config.cg_damping
        self.max_kl_divergence = self.config.max_kl_divergence
        self.line_search_steps = self.config.line_search_steps
        self.cg_optimizer = ConjugateGradientOptimizer(self.logger, self.config.cg_iterations)
        self.override_line_search = self.config.override_line_search
        self.flat_tangent = tf.placeholder(tf.float32, shape=[None])
        self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())

        self.create_training_operations()
        self.session.run(tf.global_variables_initializer())

    def create_training_operations(self):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.

        :return:
        """
        with tf.variable_scope("update"):
            current_log_prob = self.dist.log_prob(self.policy.get_policy_variables(), self.actions)
            current_log_prob = tf.reshape(current_log_prob, [-1])

            prev_log_prob = self.dist.log_prob(self.prev_dist, self.actions)
            prev_log_prob = tf.reshape(prev_log_prob, [-1])

            prob_ratio = tf.exp(current_log_prob - prev_log_prob)
            surrogate_loss = -tf.reduce_mean(prob_ratio * tf.reshape(self.advantage, [-1]), axis=0)

            variables = tf.trainable_variables()
            batch_float = tf.cast(self.batch_size, tf.float32)

            # reshape, extract dict
            mean_kl_divergence = self.dist.kl_divergence(self.prev_dist, self.policy.get_policy_variables()) \
                / batch_float
            mean_entropy = self.dist.entropy(self.policy.get_policy_variables()) / batch_float

            self.losses = [surrogate_loss, mean_kl_divergence, mean_entropy]

            # Get symbolic gradient expressions
            self.policy_gradient = get_flattened_gradient(self.losses, variables)

            fixed_kl_divergence = self.dist.fixed_kl(self.policy.get_policy_variables()) / batch_float

            variable_shapes = map(get_shape, variables)
            offset = 0
            tangents = []
            for shape in variable_shapes:
                size = np.prod(shape)
                param = tf.reshape(self.flat_tangent[offset:(offset + size)], shape)
                tangents.append(param)
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(self.session, variables)
            self.fisher_vector_product = get_flattened_gradient(gradient_vector_product, variables)

    def update(self, batch):
        """
        Compute update for one batch of experiences using general advantage estimation
        and the constrained optimisation based on the fixed KL-divergence constraint.
        :param batch:
        :return:
        """
        self.input_feed = None

        # Set per episode return and advantage
        for episode in batch:
            episode['returns'] = discount(episode['rewards'], self.gamma)
            episode['advantages'] = self.advantage_estimation(episode)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        self.input_feed = {
            self.episode_length: [episode['episode_length'] for episode in batch],
            self.state: [episode['states'] for episode in batch],
            self.actions: [episode['actions'] for episode in batch],
            self.advantage: [episode['advantages'] for episode in batch],
            self.prev_action_means: [episode['action_means'] for episode in batch]
        }

        if self.continuous:
            self.input_feed[self.prev_action_log_stds] = [episode['action_log_stds'] for episode in batch]

        previous_theta = self.flat_variable_helper.get()
        gradient = self.session.run(self.policy_gradient, self.input_feed)

        zero = np.zeros_like(gradient)
        if np.allclose(gradient, zero):
            self.logger.debug('Gradient zero, skipping update')
        else:
            # The details of the approximations used here to solve the constrained optimisation
            # can be found in Appendix C of the TRPO paper. Note that no subsampling is used,
            # which would improve computational performance.
            search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)

            # The search direction has now been approximated as the CG solution s = A^-1 g,
            # where A is the Fisher matrix, a local approximation of the KL divergence constraint
            shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))

            lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
            update_step = search_direction / lagrange_multiplier
            negative_gradient_direction = -gradient.dot(search_direction)

            # Improve update step through simple backtracking line search.
            # N.B. some implementations skip the line search.
            improved, theta = line_search(self.compute_surrogate_loss, previous_theta, update_step,
                                          negative_gradient_direction / lagrange_multiplier,
                                          self.line_search_steps)

            # Only update if the line search found an improvement. If override_line_search is set
            # to True, we always take the full step; this can make the algorithm very unstable and
            # cause divergence, but potentially leads to faster solutions for some problems.
            # It should generally be set to False.
            if improved:
                self.logger.debug('Updating with line search result..')
                self.flat_variable_helper.set(theta)
            elif self.override_line_search:
                self.logger.debug('Updating with full step..')
                self.flat_variable_helper.set(previous_theta + update_step)

        # Get loss values for progress monitoring.
        # Optionally feed internal LSTM state or other relevant network state.
        for n, internal_state in enumerate(self.network.internal_state_inputs):
            self.input_feed[internal_state] = self.internal_states[n]

        # Fetch the losses together with the updated internal states; a local fetch list is used
        # so that self.losses is not extended again on every call to update
        fetches = list(self.losses) + list(self.network.internal_state_outputs)
        fetched = self.session.run(fetches, self.input_feed)

        # Sanity checks: Is entropy decreasing? Is KL divergence within reason? Is the loss non-zero?
        self.logger.debug('Surrogate loss = ' + str(fetched[0]))
        self.logger.debug('KL-divergence after update = ' + str(fetched[1]))
        self.logger.debug('Entropy = ' + str(fetched[2]))

        # Update internal state optionally
        self.internal_states = fetched[3:]

    def compute_fvp(self, p):
        self.input_feed[self.flat_tangent] = p
        return self.session.run(self.fisher_vector_product, self.input_feed) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)
        # losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.input_feed)
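
# What follows is not part of the model above: it is a minimal sketch of the backtracking line
# search that update() relies on via line_search(...). The actual helper is defined elsewhere in
# the package; this illustrative version only assumes the interface visible in update(): a callable
# computing the surrogate loss for flat parameters, the previous flat parameters, the proposed full
# step, the expected improvement rate, and the number of backtracking steps, returning a tuple
# (improved, theta). The name line_search_sketch and the accept_ratio default are assumptions,
# not taken from the original code.
def line_search_sketch(loss_fn, theta_prev, full_step, expected_improve_rate, max_steps, accept_ratio=0.1):
    # Backtrack by halving the step until the actual loss improvement is a sufficient
    # fraction of the linearly extrapolated (expected) improvement.
    loss_prev = loss_fn(theta_prev)
    for exponent in range(max_steps):
        step_fraction = 0.5 ** exponent
        theta_new = theta_prev + step_fraction * full_step
        actual_improvement = loss_prev - loss_fn(theta_new)
        expected_improvement = expected_improve_rate * step_fraction
        if expected_improvement > 0 and actual_improvement / expected_improvement > accept_ratio:
            return True, theta_new
    # No step fraction improved the loss sufficiently; report failure with the old parameters
    return False, theta_prev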
class TRPOModel(PGModel):
    default_config = TRPOModelConfig

    def __init__(self, config, scope):
        super(TRPOModel, self).__init__(config, scope)

        # TRPO specific parameters
        self.cg_damping = self.config.cg_damping
        self.max_kl_divergence = self.config.max_kl_divergence
        self.line_search_steps = self.config.line_search_steps
        self.cg_optimizer = ConjugateGradientOptimizer(self.config.cg_iterations)
        self.flat_tangent = tf.placeholder(tf.float32, shape=[None])

        self.create_training_operations()
        self.session.run(tf.global_variables_initializer())

    def create_training_operations(self):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.

        :return:
        """
        with tf.variable_scope("update"):
            current_log_prob = self.dist.log_prob(self.policy.get_policy_variables(), self.actions)
            prev_log_prob = self.dist.log_prob(self.prev_dist, self.actions)
            prob_ratio = tf.exp(current_log_prob - prev_log_prob)
            surrogate_loss = -tf.reduce_mean(prob_ratio * self.advantage)

            variables = tf.trainable_variables()
            batch_float = tf.cast(self.batch_size, tf.float32)

            mean_kl_divergence = self.dist.kl_divergence(self.prev_dist, self.policy.get_policy_variables()) \
                / batch_float
            mean_entropy = self.dist.entropy(self.policy.get_policy_variables()) / batch_float

            self.losses = [surrogate_loss, mean_kl_divergence, mean_entropy]

            # Get symbolic gradient expressions
            self.policy_gradient = get_flattened_gradient(self.losses, variables)

            fixed_kl_divergence = self.dist.fixed_kl(self.policy.get_policy_variables()) / batch_float

            variable_shapes = map(get_shape, variables)
            offset = 0
            tangents = []
            for shape in variable_shapes:
                size = np.prod(shape)
                param = tf.reshape(self.flat_tangent[offset:(offset + size)], shape)
                tangents.append(param)
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(self.session, variables)
            self.fisher_vector_product = get_flattened_gradient(gradient_vector_product, variables)

    def update(self, batch):
        """
        Compute update for one batch of experiences using general advantage estimation
        and the constrained optimisation based on the fixed KL-divergence constraint.
        :param batch:
        :return:
        """
        # Set per episode advantage using GAE
        self.input_feed = None
        self.compute_gae_advantage(batch, self.gamma, self.gae_lambda, self.use_gae)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        # Merge episode inputs into single arrays
        action_log_stds, action_means, actions, batch_advantage, states = self.merge_episodes(batch)

        self.input_feed = {
            self.state: states,
            self.actions: actions,
            self.advantage: batch_advantage,
            self.prev_action_means: action_means
        }

        if self.continuous:
            self.input_feed[self.prev_action_log_stds] = action_log_stds

        previous_theta = self.flat_variable_helper.get()
        gradient = self.session.run(self.policy_gradient, self.input_feed)

        zero = np.zeros_like(gradient)
        if np.allclose(gradient, zero):
            print('Gradient zero, skipping update')
        else:
            # The details of the approximations used here to solve the constrained optimisation
            # can be found in Appendix C of the TRPO paper. Note that no subsampling is used,
            # which would improve computational performance.
            search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)

            # The search direction has now been approximated as the CG solution s = A^-1 g,
            # where A is the Fisher matrix, a local approximation of the KL divergence constraint
            shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))

            lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
            update_step = search_direction / lagrange_multiplier
            negative_gradient_direction = -gradient.dot(search_direction)

            # Improve update step through simple backtracking line search.
            # N.B. some implementations skip the line search.
            improved, theta = line_search(self.compute_surrogate_loss, previous_theta, update_step,
                                          negative_gradient_direction / lagrange_multiplier,
                                          self.line_search_steps)

            # Use the line search result if it improved the loss, otherwise take the full step.
            # N.B. some implementations don't use the line search.
            if improved:
                print('Updating with line search result..')
                self.flat_variable_helper.set(theta)
            else:
                print('Updating with full step..')
                self.flat_variable_helper.set(previous_theta + update_step)

        # Get loss values for progress monitoring
        surrogate_loss, kl_divergence, entropy = self.session.run(self.losses, self.input_feed)

        # Sanity checks: Is entropy decreasing? Is KL divergence within reason? Is the loss non-zero?
        print('Surrogate loss = ' + str(surrogate_loss))
        print('KL-divergence after update = ' + str(kl_divergence))
        print('Entropy = ' + str(entropy))

    def compute_fvp(self, p):
        self.input_feed[self.flat_tangent] = p
        return self.session.run(self.fisher_vector_product, self.input_feed) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)
        # losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.input_feed)
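
# What follows is not part of the model above: it is a minimal sketch of the conjugate gradient
# step hidden behind ConjugateGradientOptimizer.solve(self.compute_fvp, -gradient). The real
# optimizer lives elsewhere in the package; this illustrative version only assumes the interface
# used in update(): a callable returning Fisher-vector products A.dot(x) (such as compute_fvp)
# and the right-hand side b (the negative policy gradient). It returns an approximate solution of
# A x = b, i.e. the natural-gradient search direction. The name conjugate_gradient_sketch and the
# defaults for iterations and residual_tol are assumptions, not values from the original code.
import numpy as np

def conjugate_gradient_sketch(fvp_fn, b, iterations=10, residual_tol=1e-10):
    x = np.zeros_like(b)   # current solution estimate, starting from zero
    r = b.copy()           # residual r = b - A x (equals b while x = 0)
    p = b.copy()           # initial search direction
    r_dot_r = r.dot(r)
    for _ in range(iterations):
        fvp = fvp_fn(p)                  # Fisher-vector product A p
        alpha = r_dot_r / p.dot(fvp)     # optimal step length along p
        x += alpha * p
        r -= alpha * fvp
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < residual_tol:
            break
        # Update the search direction so it stays conjugate with respect to A
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x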