import numpy as np
import tensorflow as tf

# Project-internal dependencies referenced below (their import paths are not
# shown in this file): util, PolicyGradientModel, FlatVarHelper,
# ConjugateGradientOptimizer and a free-standing line_search function.


class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        optimizer=None,
        max_kl_divergence=0.1,
        cg_iterations=20,
        cg_damping=0.001,
        ls_max_backtracks=10,
        ls_accept_ratio=0.9,
        ls_override=False
    )

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)
        self.max_kl_divergence = config.max_kl_divergence
        self.cg_damping = config.cg_damping
        self.ls_max_backtracks = config.ls_max_backtracks
        self.ls_accept_ratio = config.ls_accept_ratio
        self.ls_override = config.ls_override

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.

        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            log_probs = list()
            prob_ratios = list()
            kl_divs = list()
            # for diagnostics
            kl_divergences = list()
            entropies = list()
            self.distribution_tensors = dict()
            self.prev_distribution_tensors = dict()

            for name, action in self.action.items():
                shape_size = util.prod(config.actions[name].shape)
                distribution = self.distribution[name]
                fixed_distribution = distribution.__class__.from_tensors(
                    tensors=[tf.stop_gradient(x) for x in distribution.get_tensors()],
                    deterministic=self.deterministic
                )

                log_prob = distribution.log_probability(action=action)
                log_prob = tf.reshape(tensor=log_prob, shape=(-1, shape_size))
                log_probs.append(log_prob)

                fixed_log_prob = fixed_distribution.log_probability(action=action)
                fixed_log_prob = tf.reshape(tensor=fixed_log_prob, shape=(-1, shape_size))

                log_prob_diff = log_prob - fixed_log_prob
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratios.append(prob_ratio)

                kl_div = fixed_distribution.kl_divergence(other=distribution)
                kl_div = tf.reshape(tensor=kl_div, shape=(-1, shape_size))
                kl_divs.append(kl_div)

                self.distribution_tensors[name] = list(distribution.get_tensors())
                prev_distribution = list(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(tensor, unknown=None))
                    for tensor in distribution.get_tensors()
                )
                self.prev_distribution_tensors[name] = prev_distribution
                prev_distribution = distribution.from_tensors(
                    tensors=prev_distribution,
                    deterministic=self.deterministic
                )

                kl_divergence = prev_distribution.kl_divergence(other=distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = distribution.entropy()
                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

            self.log_prob = tf.reduce_mean(input_tensor=tf.concat(values=log_probs, axis=1), axis=1)

            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
            self.loss_per_instance = -prob_ratio * self.reward
            self.surrogate_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)

            kl_div = tf.reduce_mean(input_tensor=tf.concat(values=kl_divs, axis=1), axis=1)

            # Get symbolic gradient expressions
            # TODO: ideally not value function (see also for "gradients" below)
            variables = list(tf.trainable_variables())
            gradients = tf.gradients(self.surrogate_loss, variables)
            variables = [var for var, grad in zip(variables, gradients) if grad is not None]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(kl_div, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=0)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Computes the update for one batch of experiences using generalised
        advantage estimation and constrained optimisation based on the fixed
        KL divergence constraint.

        :param batch:
        :return:
        """
        super(TRPOModel, self).update(batch)

        assert 'policy_gradient' not in self.distribution_tensors
        fetches = dict(policy_gradient=self.policy_gradient)
        fetches.update(self.distribution_tensors)

        self.feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
        self.feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({internal: batch['internals'][n] for n, internal in enumerate(self.internal_inputs)})

        prev_distribution_tensors = self.session.run(fetches=fetches, feed_dict=self.feed_dict)
        gradient = prev_distribution_tensors.pop('policy_gradient')  # dL

        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper.
        # Note that no subsampling is used, which would improve computational performance.
        search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)  # x = ddKL(=F)^(-1) * -dL

        # The search direction has now been approximated as the cg-solution s = A^(-1)g,
        # where A is the Fisher matrix, a local approximation of the KL divergence constraint.
        shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))  # (c lambda^2) = 0.5 * xT * F * x
        if shs < 0:
            self.logger.debug('Computing search direction failed, skipping update.')
            return

        lagrange_multiplier = max(np.sqrt(shs / self.max_kl_divergence), util.epsilon)
        natural_gradient_step = search_direction / lagrange_multiplier  # c
        negative_gradient_direction = -gradient.dot(search_direction)  # -dL * x
        estimated_improvement = negative_gradient_direction / lagrange_multiplier

        # Improve the update step through a simple backtracking line search.
        # N.B. some implementations skip the line search.
        parameters = self.flat_variable_helper.get()
        new_parameters = self.line_search(
            rewards=batch['rewards'],
            parameters=parameters,
            natural_gradient_step=natural_gradient_step,
            estimated_improvement=estimated_improvement
        )

        # Use the line search result if it found one, otherwise take the full step.
        if new_parameters is not None:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(new_parameters)
        elif self.ls_override:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(parameters + natural_gradient_step)
        else:
            self.logger.debug('Failed to find line search solution, skipping update.')
            self.flat_variable_helper.set(parameters)

        # Get loss values for progress monitoring.
        fetches = (self.surrogate_loss, self.kl_divergence, self.entropy, self.loss_per_instance)
        prev_distribution_tensors = {
            placeholder: tensor
            for name, placeholders in self.prev_distribution_tensors.items()
            for placeholder, tensor in zip(placeholders, prev_distribution_tensors[name])
        }
        self.feed_dict.update(prev_distribution_tensors)
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(
            fetches=fetches, feed_dict=self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = {}'.format(surrogate_loss))
        self.logger.debug('KL-divergence after update = {}'.format(kl_divergence))
        self.logger.debug('Entropy = {}'.format(entropy))

        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p
        return self.session.run(self.fisher_vector_product, self.feed_dict) + p * self.cg_damping

    def compute_log_prob(self, theta):
        self.flat_variable_helper.set(theta)
        return self.session.run(self.log_prob, self.feed_dict)

    def line_search(self, rewards, parameters, natural_gradient_step, estimated_improvement):
        """
        Line search for TRPO: a full step is taken first and then backtracked
        to find the optimal step size.

        :param rewards:
        :param parameters:
        :param natural_gradient_step:
        :param estimated_improvement:
        :return:
        """
        log_prob = self.compute_log_prob(parameters)
        old_value = sum(rewards) / len(rewards)
        estimated_improvement = max(estimated_improvement, util.epsilon)
        step_fraction = 1.0

        for backtrack in range(self.ls_max_backtracks):
            new_parameters = parameters + step_fraction * natural_gradient_step
            new_log_prob = self.compute_log_prob(new_parameters)
            prob_ratio = np.exp(new_log_prob - log_prob)
            new_value = prob_ratio.dot(rewards) / prob_ratio.shape[0]
            improvement_ratio = (new_value - old_value) / estimated_improvement
            if improvement_ratio > self.ls_accept_ratio:
                self.logger.debug('Line search successful after {} backtracking steps.'.format(backtrack))
                return new_parameters
            step_fraction /= 2.0
            estimated_improvement /= 2.0

        return None
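# `FlatVarHelper` is not defined in this file, but its interface is pinned down
# by the call sites above: it is constructed from a list of variables, carries a
# `session` attribute (set in `set_session`), and `get()`/`set(theta)` read and
# write all trainable parameters as a single flat vector. The following is a
# minimal sketch of such a helper; the internals are an assumption, only the
# interface comes from this file.

class FlatVarHelper(object):

    def __init__(self, variables):
        self.session = None
        shapes = [util.shape(variable) for variable in variables]
        total_size = sum(util.prod(shape) for shape in shapes)
        # Placeholder for the flat parameter vector fed into set().
        self.theta = tf.placeholder(tf.float32, shape=(total_size,))
        assigns = []
        offset = 0
        for shape, variable in zip(shapes, variables):
            size = util.prod(shape)
            assigns.append(tf.assign(variable, tf.reshape(self.theta[offset:offset + size], shape)))
            offset += size
        self.set_op = tf.group(*assigns)
        self.get_op = tf.concat(values=[tf.reshape(variable, (-1,)) for variable in variables], axis=0)

    def get(self):
        # Returns the current parameters as one flat numpy vector.
        return self.session.run(self.get_op)

    def set(self, theta):
        # Writes a flat parameter vector back into the individual variables.
        self.session.run(self.set_op, feed_dict={self.theta: theta})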
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        optimizer=None,
        override_line_search=False,
        cg_damping=0.001,
        line_search_steps=20,
        max_kl_divergence=0.001,
        cg_iterations=20
    )

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)
        self.override_line_search = config.override_line_search
        self.cg_damping = config.cg_damping
        self.max_kl_divergence = config.max_kl_divergence
        self.line_search_steps = config.line_search_steps

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.

        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                    for x in distribution
                )
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)

                previous_distribution = self.distribution[name].__class__(distribution=previous_distribution)

                log_prob = distribution.log_probability(action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob), 1000)
                self.loss_per_instance = tf.multiply(x=prob_ratio, y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance, axis=0)
                kl_divergence = distribution.kl_divergence(previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            # TODO: ideally not value function (see also for "gradients" below)
            variables = list(tf.trainable_variables())
            gradients = tf.gradients(self.losses, variables)
            variables = [var for var, grad in zip(variables, gradients) if grad is not None]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            fixed_distribution = distribution.__class__([tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Computes the update for one batch of experiences using generalised
        advantage estimation and constrained optimisation based on the fixed
        KL divergence constraint.

        :param batch:
        :return:
        """
        super(TRPOModel, self).update(batch)

        self.feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
        self.feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({internal: batch['internals'][n] for n, internal in enumerate(self.internal_inputs)})

        gradient = self.session.run(self.policy_gradient, self.feed_dict)
        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper.
        # Note that no subsampling is used, which would improve computational performance.
        search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)

        # The search direction has now been approximated as the cg-solution s = A^(-1)g,
        # where A is the Fisher matrix, a local approximation of the KL divergence constraint.
        shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))
        if shs < 0:
            self.logger.debug('Computing search direction failed, skipping update.')
            return

        lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
        update_step = search_direction / (lagrange_multiplier + util.epsilon)
        negative_gradient_direction = -gradient.dot(search_direction)

        # Improve the update step through a simple backtracking line search.
        # N.B. some implementations skip the line search.
        previous_theta = self.flat_variable_helper.get()
        improved, theta = line_search(
            self.compute_surrogate_loss,
            previous_theta,
            update_step,
            negative_gradient_direction / (lagrange_multiplier + util.epsilon),
            self.line_search_steps
        )

        # Use the line search result, otherwise take the full step.
        if improved:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(theta)
        elif self.override_line_search:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(previous_theta + update_step)
        else:
            self.logger.debug('Failed to find line search solution, skipping update.')

        # Get loss values for progress monitoring.
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(
            self.losses + [self.loss_per_instance], self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = ' + str(surrogate_loss))
        self.logger.debug('KL-divergence after update = ' + str(kl_divergence))
        self.logger.debug('Entropy = ' + str(entropy))

        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p
        return self.session.run(self.fisher_vector_product, self.feed_dict) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)
        # losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.feed_dict)
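# `ConjugateGradientOptimizer` is also external to this file. Its
# `solve(self.compute_fvp, -gradient)` call is consistent with standard
# conjugate gradient on F x = b where F is only available through
# Fisher-vector products. A minimal sketch under that assumption (the name
# and signature below are illustrative, not the project's actual class):

def conjugate_gradient(fvp, b, iterations=20, residual_tolerance=1e-10):
    # Solves F x = b given only the matrix-vector product fvp(v) = F v.
    x = np.zeros_like(b)
    residual = b.copy()  # r = b - F x, with x = 0 initially
    direction = residual.copy()
    residual_norm = residual.dot(residual)
    for _ in range(iterations):
        fvp_direction = fvp(direction)
        alpha = residual_norm / direction.dot(fvp_direction)
        x += alpha * direction
        residual -= alpha * fvp_direction
        new_residual_norm = residual.dot(residual)
        if new_residual_norm < residual_tolerance:
            break
        direction = residual + (new_residual_norm / residual_norm) * direction
        residual_norm = new_residual_norm
    return x

# Usage matching the call sites above:
#     search_direction = conjugate_gradient(self.compute_fvp, -gradient)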
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        optimizer=None,
        learning_rate=None,
        cg_damping=0.001,
        line_search_steps=20,
        max_kl_divergence=0.001,
        cg_iterations=20
    )

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)
        self.cg_damping = config.cg_damping
        self.max_kl_divergence = config.max_kl_divergence
        self.line_search_steps = config.line_search_steps

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.

        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                    for x in distribution
                )
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)

                previous_distribution = self.distribution[name].__class__(distribution=previous_distribution)

                log_prob = distribution.log_probability(action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob), 1000)
                self.loss_per_instance = tf.multiply(x=prob_ratio, y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance, axis=0)
                kl_divergence = distribution.kl_divergence(previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            variables = list(tf.trainable_variables())
            gradients = tf.gradients(self.losses, variables)
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            fixed_distribution = distribution.__class__([tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Computes the update for one batch of experiences using generalised
        advantage estimation and constrained optimisation based on the fixed
        KL divergence constraint.

        :param batch:
        :return:
        """
        self.feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
        self.feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({internal: batch['internals'][n] for n, internal in enumerate(self.internal_inputs)})

        gradient = self.session.run(self.policy_gradient, self.feed_dict)
        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper.
        # Note that no subsampling is used, which would improve computational performance.
        search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)

        # The search direction has now been approximated as the cg-solution s = A^(-1)g,
        # where A is the Fisher matrix, a local approximation of the KL divergence constraint.
        shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))

        lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
        update_step = search_direction / (lagrange_multiplier + util.epsilon)
        negative_gradient_direction = -gradient.dot(search_direction)

        # Improve the update step through a simple backtracking line search.
        # N.B. some implementations skip the line search.
        previous_theta = self.flat_variable_helper.get()
        improved, theta = line_search(
            self.compute_surrogate_loss,
            previous_theta,
            update_step,
            negative_gradient_direction / (lagrange_multiplier + util.epsilon),
            self.line_search_steps
        )

        # Use the line search result, otherwise take the full step.
        if improved:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(theta)
        else:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(previous_theta + update_step)

        # Get loss values for progress monitoring.
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(
            self.losses + [self.loss_per_instance], self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = ' + str(surrogate_loss))
        self.logger.debug('KL-divergence after update = ' + str(kl_divergence))
        self.logger.debug('Entropy = ' + str(entropy))

        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p
        return self.session.run(self.fisher_vector_product, self.feed_dict) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)
        # losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.feed_dict)
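# The free-standing `line_search` called by the update() methods above is not
# defined in this file either. Its call sites fix the signature: a surrogate
# loss function over flat parameters, the previous theta, the full step, the
# expected improvement rate, the number of backtracking steps and (in one
# variant) an accept ratio; it returns an (improved, theta) pair. A sketch
# consistent with those call sites; the accept_ratio default and the
# return-on-failure value are assumptions:

def line_search(f, theta_prev, full_step, expected_improve_rate, max_backtracks, accept_ratio=0.1):
    # Backtracking line search on the loss f: try the full step first, then
    # halve it until the actual improvement is a sufficient fraction of the
    # expected (linearised) improvement.
    value = f(theta_prev)
    for exponent in range(max_backtracks):
        step_fraction = 0.5 ** exponent
        theta = theta_prev + step_fraction * full_step
        new_value = f(theta)
        actual_improvement = value - new_value  # f is a loss, lower is better
        expected_improvement = expected_improve_rate * step_fraction
        if expected_improvement > 0 and actual_improvement / expected_improvement > accept_ratio:
            return True, theta
    return False, theta_prev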
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        optimizer=None,
        max_kl_divergence=0.001,
        cg_iterations=20,
        cg_damping=0.001,
        ls_max_backtracks=20,
        ls_accept_ratio=0.01,
        ls_override=False
    )

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)
        self.max_kl_divergence = config.max_kl_divergence
        self.cg_damping = config.cg_damping
        self.ls_max_backtracks = config.ls_max_backtracks
        self.ls_accept_ratio = config.ls_accept_ratio
        self.ls_override = config.ls_override

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.

        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            kl_divergences = list()
            entropies = list()
            fixed_kl_divergences = list()

            for name, action in self.action.items():
                distribution = self.distribution[name]
                prev_distribution = tuple(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                    for x in distribution
                )
                self.internal_inputs.extend(prev_distribution)
                self.internal_outputs.extend(distribution)
                self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                prev_distribution = distribution.from_tensors(
                    parameters=prev_distribution,
                    deterministic=self.deterministic
                )

                shape_size = util.prod(config.actions[name].shape)

                log_prob = distribution.log_probability(action=action)
                prev_log_prob = prev_distribution.log_probability(action=action)
                log_prob_diff = tf.minimum(x=(log_prob - prev_log_prob), y=10.0)
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                kl_divergence = distribution.kl_divergence(other=prev_distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = distribution.entropy()
                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

                fixed_distribution = distribution.__class__.from_tensors(
                    parameters=[tf.stop_gradient(x) for x in distribution],
                    deterministic=self.deterministic
                )
                fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)
                fixed_kl_divergence = tf.reshape(tensor=fixed_kl_divergence, shape=(-1, shape_size))
                fixed_kl_divergences.append(fixed_kl_divergence)

            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
            self.loss_per_instance = -prob_ratio * self.reward
            surrogate_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1)
            kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=0)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1)
            entropy = tf.reduce_mean(input_tensor=entropy, axis=0)

            self.losses = (surrogate_loss, kl_divergence, entropy, self.loss_per_instance)

            fixed_kl_divergence = tf.reduce_mean(input_tensor=tf.concat(values=fixed_kl_divergences, axis=1), axis=1)

            # Get symbolic gradient expressions
            # TODO: ideally not value function (see also for "gradients" below)
            variables = list(tf.trainable_variables())
            gradients = tf.gradients(self.losses[0], variables)
            variables = [var for var, grad in zip(variables, gradients) if grad is not None]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Computes the update for one batch of experiences using generalised
        advantage estimation and constrained optimisation based on the fixed
        KL divergence constraint.

        :param batch:
        :return:
        """
        super(TRPOModel, self).update(batch)

        self.feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
        self.feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({internal: batch['internals'][n] for n, internal in enumerate(self.internal_inputs)})

        gradient = self.session.run(self.policy_gradient, self.feed_dict)  # dL
        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper.
        # Note that no subsampling is used, which would improve computational performance.
        search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)  # x = ddKL(=F)^(-1) * -dL

        # The search direction has now been approximated as the cg-solution s = A^(-1)g,
        # where A is the Fisher matrix, a local approximation of the KL divergence constraint.
        shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))  # (c lambda^2) = 0.5 * xT * F * x
        if shs < 0:
            self.logger.debug('Computing search direction failed, skipping update.')
            return

        lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
        update_step = search_direction / (lagrange_multiplier + util.epsilon)  # c
        negative_gradient_direction = -gradient.dot(search_direction)  # -dL * x

        # Improve the update step through a simple backtracking line search.
        # N.B. some implementations skip the line search.
        previous_theta = self.flat_variable_helper.get()
        improved, theta = line_search(
            self.compute_surrogate_loss,
            previous_theta,
            update_step,
            negative_gradient_direction / (lagrange_multiplier + util.epsilon),
            self.ls_max_backtracks,
            self.ls_accept_ratio
        )

        # Use the line search result, otherwise take the full step.
        if improved:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(theta)
        elif self.ls_override:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(previous_theta + update_step)
        else:
            self.logger.debug('Failed to find line search solution, skipping update.')

        # Get loss values for progress monitoring.
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(self.losses, self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = {}'.format(surrogate_loss))
        self.logger.debug('KL-divergence after update = {}'.format(kl_divergence))
        self.logger.debug('Entropy = {}'.format(entropy))

        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p
        return self.session.run(self.fisher_vector_product, self.feed_dict) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)
        # losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.feed_dict)
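# Step-size note for the update() methods above: with x = F^(-1) g obtained
# from conjugate gradient and the trust region (1/2) s^T F s <= delta, where
# delta = max_kl_divergence, the scaled step s = x / lambda satisfies the
# constraint with equality for
#     shs    = (1/2) x^T F x
#     lambda = sqrt(shs / delta)
# i.e. s = sqrt(2 delta / (x^T F x)) * x, which is exactly
# search_direction / lagrange_multiplier as computed above.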
def create_tf_operations(self, config):
    """
    Creates TRPO training operations, i.e. the natural gradient update step
    based on the KL divergence constraint between new and old policy.

    :return:
    """
    super(TRPOModel, self).create_tf_operations(config)

    with tf.variable_scope('update'):
        prob_ratios = list()
        kl_divergences = list()
        entropies = list()
        fixed_kl_divergences = list()

        for name, action in self.action.items():
            distribution = self.distribution[name]
            prev_distribution = tuple(
                tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                for x in distribution
            )
            self.internal_inputs.extend(prev_distribution)
            self.internal_outputs.extend(distribution)
            self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)
            prev_distribution = distribution.from_tensors(
                parameters=prev_distribution,
                deterministic=self.deterministic
            )

            log_prob = distribution.log_probability(action=action)
            prev_log_prob = prev_distribution.log_probability(action=action)
            log_prob_diff = tf.minimum(x=(log_prob - prev_log_prob), y=10.0)
            prob_ratio = tf.exp(x=log_prob_diff)

            kl_divergence = distribution.kl_divergence(other=prev_distribution)
            entropy = distribution.entropy()

            fixed_distribution = distribution.__class__.from_tensors(
                parameters=[tf.stop_gradient(x) for x in distribution],
                deterministic=self.deterministic
            )
            fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

            prs_list = [prob_ratio]
            kds_list = [kl_divergence]
            es_list = [entropy]
            fkds_list = [fixed_kl_divergence]
            for _ in range(len(config.actions[name].shape)):
                prs_list = [pr for prs in prs_list for pr in tf.unstack(value=prs, axis=1)]
                kds_list = [kd for kds in kds_list for kd in tf.unstack(value=kds, axis=1)]
                es_list = [e for es in es_list for e in tf.unstack(value=es, axis=1)]
                fkds_list = [fkd for fkds in fkds_list for fkd in tf.unstack(value=fkds, axis=1)]

            prob_ratios.extend(prs_list)
            kl_divergences.extend(kds_list)
            entropies.extend(es_list)
            fixed_kl_divergences.extend(fkds_list)

        prob_ratio = tf.add_n(inputs=prob_ratios) / len(prob_ratios)
        self.loss_per_instance = -prob_ratio * self.reward
        surrogate_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)

        kl_divergence = tf.reduce_mean(input_tensor=(tf.add_n(inputs=kl_divergences) / len(kl_divergences)), axis=0)
        entropy = tf.reduce_mean(input_tensor=(tf.add_n(inputs=entropies) / len(entropies)), axis=0)

        self.losses = (surrogate_loss, kl_divergence, entropy, self.loss_per_instance)

        fixed_kl_divergence = tf.add_n(inputs=fixed_kl_divergences) / len(fixed_kl_divergences)

        # Get symbolic gradient expressions
        # TODO: ideally not value function (see also for "gradients" below)
        variables = list(tf.trainable_variables())
        gradients = tf.gradients(self.losses[0], variables)
        variables = [var for var, grad in zip(variables, gradients) if grad is not None]
        gradients = [grad for grad in gradients if grad is not None]
        self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

        self.tangent = tf.placeholder(tf.float32, shape=(None,))
        offset = 0
        tangents = []
        for variable in variables:
            shape = util.shape(variable)
            size = util.prod(shape)
            tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
            offset += size

        gradients = tf.gradients(fixed_kl_divergence, variables)
        gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

        self.flat_variable_helper = FlatVarHelper(variables)
        gradients = tf.gradients(gradient_vector_product, variables)
        self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
        self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)
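# Fisher-vector product note: the nested tf.gradients calls above implement
# the double-backward (Pearlmutter) trick. The Hessian of the KL divergence
# from the fixed (stop-gradient) distribution, evaluated at the current
# parameters, equals the Fisher information matrix F, so
#     F v = grad_theta((grad_theta KL)^T v)
# and F is never formed explicitly. The compute_fvp methods above then add
# p * cg_damping to the product for numerical stability.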