Example #1
  def init_ops_for_training(self, target_critic):
    # update critic using bellman equation; Q(s1, a) = reward + discount * Q(s2, A(s2))

    # left hand side of bellman is just q_value, but let's be explicit about it...
    bellman_lhs = self.q_value

    # right hand side is ...
    #  = reward + discounted q value from target actor & critic in the non terminal case
    #  = reward  # in the terminal case
    self.reward = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="critic_reward")
    self.terminal_mask = tf.placeholder(shape=[None, 1], dtype=tf.float32,
                                        name="critic_terminal_mask")
    self.input_state_2 = target_critic.input_state
    bellman_rhs = self.reward + (self.terminal_mask * opts.discount * target_critic.q_value)

    # note: since we are NOT training target networks we stop gradients flowing to them
    bellman_rhs = tf.stop_gradient(bellman_rhs)

    # the value we are trying to minimise is the difference between these two: the
    # temporal difference. we use a squared loss for optimisation and, as for the actor,
    # we wrap the optimiser in a namespace so it's not picked up by target network
    # variable handling.
    self.temporal_difference = bellman_lhs - bellman_rhs
    self.temporal_difference_loss = tf.reduce_mean(tf.pow(self.temporal_difference, 2))
#    self.temporal_difference_loss = tf.Print(self.temporal_difference_loss, [self.temporal_difference_loss], 'temporal_difference_loss')
    with tf.variable_scope("optimiser"):
      # calc gradients
      optimiser = tf.train.GradientDescentOptimizer(opts.critic_learning_rate)
      gradients = optimiser.compute_gradients(self.temporal_difference_loss)
      # potentially clip and wrap with debugging tf.Print
      gradients = util.clip_and_debug_gradients(gradients, opts)
      # apply
      self.train_op = optimiser.apply_gradients(gradients)
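
The Bellman target above relies on terminal_mask being 0 for terminal transitions and 1 otherwise, so the bootstrapped term drops out exactly when there is no next state. A minimal NumPy sketch of the same target calculation (values and shapes below are made up purely for illustration, not taken from the original code):

import numpy as np

# illustrative batch of three transitions; the last one is terminal
reward        = np.array([[1.0], [0.5], [2.0]])
terminal_mask = np.array([[1.0], [1.0], [0.0]])   # 0 => terminal transition
target_q      = np.array([[4.0], [3.0], [9.9]])   # target critic's Q(s2, A(s2))
discount      = 0.99

# bellman_rhs = reward + discount * Q(s2, A(s2)), except in the terminal case
# where it collapses to just the reward
bellman_rhs = reward + terminal_mask * discount * target_q
print(bellman_rhs)   # last row is 2.0; no bootstrapping past the terminal state
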
Example #2
 def init_ops_for_training(self, critic):
   # the actor's gradients are the gradients of its output w.r.t. its vars, using the
   # initial gradients provided by the critic. this requires that the critic was init'd
   # with input_action = actor.output_action (which is natural anyway).
   # we wrap the optimiser in a namespace since we don't want it as part of the copy to
   # target networks.
   # note that we negate the gradients from the critic since we are trying to maximise
   # the q values (not minimise them like a loss)
   with tf.variable_scope("optimiser"):
     gradients = tf.gradients(self.output_action,
                              self.trainable_model_vars(),
                              tf.neg(critic.q_gradients_wrt_actions()))
     gradients = zip(gradients, self.trainable_model_vars())
     # potentially clip and wrap with debugging
     gradients = util.clip_and_debug_gradients(gradients, opts)
     # apply
     optimiser = tf.train.GradientDescentOptimizer(opts.actor_learning_rate)
     self.train_op = optimiser.apply_gradients(gradients)
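
Example #2 is the deterministic policy gradient chain rule: the gradient of Q w.r.t. the actor's parameters is dQ/da times da/dtheta, which is why the critic's action gradients are passed as the initial gradients to tf.gradients and negated (a gradient-descent optimiser minimises, but we want to ascend Q). A tiny NumPy sketch of the same chain rule for a one-parameter linear actor (names and values are illustrative only):

import numpy as np

s, w = 2.0, 0.5            # state and the actor's single parameter
a = w * s                  # actor output, a = actor(s)

dq_da = 3.0                # pretend critic gradient dQ/da at (s, a)
da_dw = s                  # gradient of the actor output w.r.t. its parameter

# chain rule: dQ/dw = dQ/da * da/dw. feeding the negated critic gradient into a
# gradient-descent optimiser is the same as taking an ascent step on Q.
dq_dw = dq_da * da_dw
learning_rate = 0.01
w = w - learning_rate * (-dq_dw)   # i.e. w + learning_rate * dq_dw
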
Example #3
  def __init__(self, namespace,
               input_state, input_state_2,
               value_net, target_value_net,
               action_dim, opts):
    super(NafNetwork, self).__init__(namespace)

    # noise to apply to actions during rollouts
    self.exploration_noise = util.OrnsteinUhlenbeckNoise(action_dim,
                                                         opts.action_noise_theta,
                                                         opts.action_noise_sigma)

    # we already have the V network, created independently because it also
    # has a target network.
    self.value_net = value_net
    self.target_value_net = target_value_net

    # keep placeholders provided and build any others required
    self.input_state = input_state
    self.input_state_2 = input_state_2
    self.input_action = tf.placeholder(shape=[None, action_dim],
                                       dtype=tf.float32, name="input_action")
    self.reward =  tf.placeholder(shape=[None, 1],
                                  dtype=tf.float32, name="reward")
    self.terminal_mask = tf.placeholder(shape=[None, 1],
                                        dtype=tf.float32, name="terminal_mask")

    with tf.variable_scope(namespace):
      # mu (output_action) is also a simple NN mapping input state -> action.
      # this is our target op for inference (i.e. the action that maximises Q given input_state)
      with tf.variable_scope("output_action"):
        if opts.share_conv_net:
          conv_net_output = value_net.conv_net_output
        else:
          conv_net_output = self.conv_net_on(input_state, opts)
        hidden_layers = self.hidden_layers_on(conv_net_output, [100, 50])
        weights_initializer = tf.random_uniform_initializer(-0.1, 0.1)
        self.output_action = slim.fully_connected(scope='fc',
                                                  inputs=hidden_layers,
                                                  num_outputs=action_dim,
                                                  weights_initializer=weights_initializer,
                                                  weights_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                                                  activation_fn=tf.nn.tanh)  # (batch, action_dim)

      # A (advantage) is a bit more work and has three components...
      # first the u / mu difference. note: to use in a matmul we need
      # to convert this vector into a matrix by adding an "unused"
      # trailing dimension
      u_mu_diff = self.input_action - self.output_action  # (batch, action_dim)
      u_mu_diff = tf.expand_dims(u_mu_diff, -1)           # (batch, action_dim, 1)

      # next we have P = L(x).L(x)^T where L is a lower triangular matrix whose
      # diagonal entries are exponentiated. yikes!

      # first the L lower triangular values; a network on top of the input state
      num_l_values = (action_dim*(action_dim+1))/2
      with tf.variable_scope("l_values"):
        if opts.share_conv_net:
          conv_net_output = value_net.conv_net_output
        else:
          conv_net_output = self.conv_net_on(input_state, opts)
        hidden_layers = self.hidden_layers_on(conv_net_output, [100, 50])
        l_values = slim.fully_connected(scope='fc',
                                        inputs=hidden_layers,
                                        num_outputs=num_l_values,
                                        weights_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                                        activation_fn=None)

      # we will convert these l_values into a matrix one row at a time.
      rows = []

      self._l_values = l_values

      # each row is made of three components;
      # 1) the lower part of the matrix, i.e. elements to the left of diagonal
      # 2) the single diagonal element (that we exponentiate)
      # 3) the upper part of the matrix; all zeros
      batch_size = tf.shape(l_values)[0]
      for row_idx in xrange(action_dim):
        row_offset_in_l = (row_idx*(row_idx+1))/2
        lower = tf.slice(l_values, begin=(0, row_offset_in_l), size=(-1, row_idx))
        diag  = tf.exp(tf.slice(l_values, begin=(0, row_offset_in_l+row_idx), size=(-1, 1)))
        upper = tf.zeros((batch_size, action_dim - tf.shape(lower)[1] - 1)) # -1 for diag
        rows.append(tf.concat(1, [lower, diag, upper]))
      # full L matrix is these rows packed.
      L = tf.pack(rows, 0)
      # packing the rows puts the row index on the leading axis, so we need to
      # transpose the batch back to axis 0 again
      L = tf.transpose(L, (1, 0, 2))  # (batch_size, action_dim, action_dim)
      self.check_L = tf.check_numerics(L, "L")

      # P is L.L_T
      L_T = tf.transpose(L, (0, 2, 1))  # TODO: update tf & use batch_matrix_transpose
      P = tf.batch_matmul(L, L_T)  # (batch_size, action_dim, action_dim)

      # can now calculate advantage
      u_mu_diff_T = tf.transpose(u_mu_diff, (0, 2, 1))
      advantage = -0.5 * tf.batch_matmul(u_mu_diff_T, tf.batch_matmul(P, u_mu_diff))  # (batch, 1, 1)
      # and finally we need to reshape off the axis we added to be able to matmul
      self.advantage = tf.reshape(advantage, [-1, 1])  # (batch, 1)

      # Q is value + advantage
      self.q_value = value_net.value + self.advantage

      # target y is reward + discounted target value
      self.target_y = self.reward + (self.terminal_mask * opts.discount * \
                                     target_value_net.value)
      self.target_y = tf.stop_gradient(self.target_y)

      # loss is squared difference that we want to minimise.
      self.loss = tf.reduce_mean(tf.pow(self.q_value - self.target_y, 2))
      with tf.variable_scope("optimiser"):
        # dynamically create optimiser based on opts
        optimiser = util.construct_optimiser(opts)
        # calc gradients
        gradients = optimiser.compute_gradients(self.loss)
        # potentially clip and wrap with debugging tf.Print
        gradients = util.clip_and_debug_gradients(gradients, opts)
        # apply
        self.train_op = optimiser.apply_gradients(gradients)

      # sanity checks (in dependency order)
      checks = []
      for op, name in [(l_values, 'l_values'), (L,'L'), (self.loss, 'loss')]:
        checks.append(tf.check_numerics(op, name))
      self.check_numerics = tf.group(*checks)
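
To see what the row-by-row construction of L produces, here is a standalone NumPy sketch of the same idea for a single (non-batched) example: the action_dim*(action_dim+1)/2 values are laid out row by row, the diagonal entries are exponentiated, and P = L.L^T is positive definite, so the quadratic advantage term is always <= 0 and is maximised when the action equals mu. Names and values below are illustrative only:

import numpy as np

action_dim = 3
l_values = np.array([0.1, -0.2, 0.3, 0.4, -0.5, 0.2])    # action_dim*(action_dim+1)//2 values

# rebuild the lower triangular L, exponentiating the diagonal entries
L = np.zeros((action_dim, action_dim))
offset = 0
for row in range(action_dim):
    L[row, :row] = l_values[offset:offset + row]          # strictly-lower part
    L[row, row] = np.exp(l_values[offset + row])          # exp'd diagonal
    offset += row + 1

P = L @ L.T                                               # positive definite by construction

u_mu_diff = np.array([0.2, -0.1, 0.05])                   # input_action - output_action (mu)
advantage = -0.5 * u_mu_diff @ P @ u_mu_diff              # <= 0, equals 0 when u == mu
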
Example #4
    def __init__(self, env):
        self.env = env

        num_actions = self.env.action_space.n

        # we have three place holders we'll use...
        # observations; used either during rollout to sample some actions, or
        # during training when combined with actions_taken and advantages.
        shape_with_batch = [None] + list(self.env.observation_space.shape)
        self.observations = tf.placeholder(shape=shape_with_batch,
                                           dtype=tf.float32)
        # the actions we took during rollout
        self.actions = tf.placeholder(tf.int32, name='actions')
        # the advantages we got from taking 'actions' in 'observations'
        self.advantages = tf.placeholder(tf.float32, name='advantages')

        # our model is a very simple MLP
        with tf.variable_scope("model"):
            # stack of hidden layers on flattened input; (batch,2,2,7) -> (batch,28)
            flat_input_state = slim.flatten(self.observations, scope='flat')
            final_hidden = self.hidden_layers_starting_at(
                flat_input_state, opts.hidden_layers)
            logits = slim.fully_connected(inputs=final_hidden,
                                          num_outputs=num_actions,
                                          activation_fn=None)

        # in the eval case just pick arg max
        self.action_argmax = tf.argmax(logits, 1)

        # for rollouts we need an op that samples actions from this
        # model to give a stochastic action.
        sample_action = tf.multinomial(logits, num_samples=1)
        self.sampled_action_op = tf.reshape(sample_action, shape=[])

        # we are trying to maximise the product of two components...
        # 1) the log_p of "good" actions.
        # 2) the advantage term based on the rewards from actions.

        # first we need the log_p values, for each observation, of the actions we
        # specifically took by sampling... we run a log_softmax over the action logits
        # to get log probabilities.
        log_softmax = tf.nn.log_softmax(logits)
        self.debug_softmax = tf.exp(log_softmax)

        # we then use a mask to select only the elements of the log softmax that correspond
        # to the actions we actually took. we could also do this with indexing and a
        # gather, but i always think this is more natural. the "cost" of dealing with the
        # mostly-zero one hot, as opposed to doing a gather on sparse indices, isn't a big
        # deal when the number of observations is >> the number of actions.
        action_mask = tf.one_hot(indices=self.actions, depth=num_actions)
        action_log_prob = tf.reduce_sum(log_softmax * action_mask,
                                        reduction_indices=1)

        # the (element wise) product of these action log_p's with the total reward of the
        # episode (the advantage) is the quantity we want to maximise. we standardise the
        # advantage values so they are roughly half positive and half negative, as a
        # variance control.
        action_mul_advantages = tf.mul(action_log_prob,
                                       util.standardise(self.advantages))
        self.loss = -tf.reduce_sum(
            action_mul_advantages)  # recall: we are maximising.
        with tf.variable_scope("optimiser"):
            # dynamically create optimiser based on opts
            optimiser = util.construct_optimiser(opts)
            # calc gradients
            gradients = optimiser.compute_gradients(self.loss)
            # potentially clip and wrap with debugging tf.Print
            gradients = util.clip_and_debug_gradients(gradients, opts)
            # apply
            self.train_op = optimiser.apply_gradients(gradients)
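
The one-hot mask at the end is just selecting one log-probability per row of the batch. A minimal NumPy sketch of the masked selection and the resulting loss term, ignoring the standardisation step for brevity (values below are made up for illustration):

import numpy as np

logits = np.array([[1.0, 2.0, 0.5],
                   [0.2, 0.1, 3.0]])
actions = np.array([1, 2])                        # actions actually taken
advantages = np.array([1.5, -0.5])

# log_softmax over the action logits
log_softmax = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))

# the one-hot mask picks out log p(action_taken | observation) for each row
action_mask = np.eye(logits.shape[1])[actions]
action_log_prob = (log_softmax * action_mask).sum(axis=1)

# negated because we maximise sum(log_prob * advantage) with a minimiser
loss = -(action_log_prob * advantages).sum()
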