Example #1
  def getLoss(self, states, actions, rewards, entropy_scale=0.01, policy_scale=1.0, **kwargs):
    n = self.rlConfig.tdN
    
    state_shape = tf.shape(states)
    state_rank = tf.shape(state_shape)[0]
    experience_length = tf.gather(state_shape, state_rank-2)
    
    train_length = experience_length - n

    values, actor_probs = self.getOutput(states)
    trainVs = tf.slice(values, [0, 0], [-1, train_length])
    #trainVs = values[:,:train_length]

    # smooth between TD(m) for m<=n?
    targets = tf.slice(values, [0, n], [-1, train_length])
    #targets = values[:,n:]
    for i in reversed(range(n)):
      targets *= self.rlConfig.discount
      targets += tf.slice(rewards, [0, i], [-1, train_length])
    targets = tf.stop_gradient(targets)

    advantages = targets - trainVs
    vLoss = tf.reduce_mean(tf.square(advantages))

    log_actor_probs = tf.log(actor_probs)
    actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
    real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
    train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
    actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))

    acLoss = vLoss - policy_scale * (actor_gain + entropy_scale * actor_entropy)

    return acLoss, [('vLoss', vLoss), ('actor_gain', actor_gain), ('actor_entropy', actor_entropy)]
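The loop over `reversed(range(n))` above builds n-step TD targets: target_t = r_t + gamma*r_{t+1} + ... + gamma^(n-1)*r_{t+n-1} + gamma^n * V(s_{t+n}). Below is a minimal NumPy sketch of the same backward accumulation on a single made-up trajectory (the rewards, values, n and gamma are all hypothetical), just to show what the slicing computes:

import numpy as np

# Hypothetical single trajectory: made-up rewards and value estimates, n = 2 steps.
n, gamma = 2, 0.99
rewards = np.array([1.0, 0.0, 0.0, 1.0, 0.0, 1.0])
values  = np.array([0.5, 0.4, 0.6, 0.7, 0.3, 0.2])

train_length = len(values) - n
targets = values[n:n + train_length].copy()         # bootstrap from V(s_{t+n})
for i in reversed(range(n)):                        # fold the rewards in, latest step first
    targets = rewards[i:i + train_length] + gamma * targets
# e.g. targets[0] == rewards[0] + gamma * rewards[1] + gamma**2 * values[2]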
Example #2
  def getLoss(self, states, actions, rewards, sarsa=False, **kwargs):
    "Negative Log-Likelihood"
    
    n = self.rlConfig.tdN
    train_length = [config.experience_length - n]
    
    qMeans, qLogVariances = self.getQDists(states)
    
    realQs = tfl.batch_dot(actions, qMeans)
    maxQs = tf.reduce_max(qMeans, 1)

    # smooth between TD(m) for m<=n?
    targets = tf.slice(realQs if sarsa else maxQs, [n], train_length)
    for i in reversed(range(n)):
      targets = tf.slice(rewards, [i], train_length) + self.rlConfig.discount * targets
    targets = tf.stop_gradient(targets)

    trainQs = tf.slice(realQs, [0], train_length)
    
    realLogVariances = tfl.batch_dot(actions, qLogVariances)
    trainLogVariances = tf.slice(realLogVariances, [0], train_length)
    
    nlls = tf.squared_difference(trainQs, targets) * tf.exp(-trainLogVariances) + trainLogVariances
    nll = tf.reduce_mean(nlls)
    return nll, [("nll", nll)]
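Example #2 models each Q-value as a Gaussian: `getQDists` returns a mean and a log-variance per action, and the loss is the negative log-likelihood of the n-step target under that Gaussian, up to an additive constant and a factor of 1/2: (q - target)^2 * exp(-log_var) + log_var. A small NumPy sketch of one such term, with hypothetical numbers:

import numpy as np

# Hypothetical predicted mean / log-variance for the taken action, and its n-step target.
q_mean, q_log_var, target = 0.8, -1.0, 1.2
nll = (q_mean - target) ** 2 * np.exp(-q_log_var) + q_log_var
# A large prediction error costs less when the predicted variance is high,
# but the trailing + log_var term keeps the variance from growing without bound.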
Example #3
    def getLoss(self,
                states,
                actions,
                rewards,
                entropy_scale=0.01,
                policy_scale=1.0,
                **kwargs):
        n = self.rlConfig.tdN

        state_shape = tf.shape(states)
        state_rank = tf.shape(state_shape)[0]
        experience_length = tf.gather(state_shape, state_rank - 2)

        train_length = experience_length - n

        values = self.critic(states)
        log_actor_probs = self.actor(states)
        actor_probs = tf.exp(log_actor_probs)

        trainVs = tf.slice(values, [0, 0], [-1, train_length])
        #trainVs = values[:,:train_length]

        # smooth between TD(m) for m<=n?
        targets = tf.slice(values, [0, n], [-1, train_length])
        #targets = values[:,n:]
        for i in reversed(range(n)):
            targets *= self.rlConfig.discount
            targets += tf.slice(rewards, [0, i], [-1, train_length])
        targets = tf.stop_gradient(targets)

        advantages = targets - trainVs
        vLoss = tf.reduce_mean(tf.square(advantages))

        actor_entropy = -tf.reduce_mean(
            tfl.batch_dot(actor_probs, log_actor_probs))
        real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
        train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0],
                                         [-1, train_length])
        actor_gain = tf.reduce_mean(
            tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))

        self.policy_scale = tf.Variable(policy_scale)

        min_rate = 1e-8
        max_rate = 1e2

        self.decrease_policy_scale = tf.assign(
            self.policy_scale, tf.maximum(min_rate, self.policy_scale / 1.5))
        self.increase_policy_scale = tf.assign(
            self.policy_scale, tf.minimum(max_rate, self.policy_scale * 1.5))

        acLoss = vLoss - self.policy_scale * (actor_gain +
                                              entropy_scale * actor_entropy)

        stats = [('vLoss', vLoss), ('actor_gain', actor_gain),
                 ('actor_entropy', actor_entropy),
                 ('policy_scale', tf.log(self.policy_scale))]

        return acLoss, stats, log_actor_probs
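Example #3 is the same actor-critic loss as Example #1, except `policy_scale` lives in a `tf.Variable` with two assign ops that move it up or down by a factor of 1.5, clipped to [1e-8, 1e2]; the caller is expected to run `increase_policy_scale` or `decrease_policy_scale` between updates. A plain-Python sketch of that clipped geometric schedule (the factor and bounds come from the example, everything else is hypothetical):

# Clipped geometric schedule for policy_scale, mirroring the two assign ops above.
MIN_RATE, MAX_RATE, FACTOR = 1e-8, 1e2, 1.5

def decrease(scale):
    return max(MIN_RATE, scale / FACTOR)

def increase(scale):
    return min(MAX_RATE, scale * FACTOR)

scale = 1.0
scale = decrease(scale)   # ~0.667
scale = increase(scale)   # back to 1.0, never leaving [MIN_RATE, MAX_RATE]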
Example #4
  def train(self, states, actions, rewards):
    n = self.rlConfig.tdN
    
    state_shape = tf.shape(states)
    state_rank = tf.shape(state_shape)[0]
    experience_length = tf.gather(state_shape, state_rank-2)
    
    train_length = experience_length - n

    values = self.critic(states)
    actor_probs = self.actor(states)
    log_actor_probs = tf.log(actor_probs)
    
    trainVs = tf.slice(values, [0, 0], [-1, train_length])
    #trainVs = values[:,:train_length]

    # smooth between TD(m) for m<=n?
    targets = tf.slice(values, [0, n], [-1, train_length])
    #targets = values[:,n:]
    for i in reversed(range(n)):
      targets *= self.rlConfig.discount
      targets += tf.slice(rewards, [0, i], [-1, train_length])
    targets = tf.stop_gradient(targets)

    advantages = targets - trainVs
    vLoss = tf.reduce_mean(tf.square(advantages))
    tf.scalar_summary('v_loss', vLoss)
    
    variance = tf.reduce_mean(tf.squared_difference(targets, tf.reduce_mean(targets)))
    explained_variance = 1. - vLoss / variance
    tf.scalar_summary("v_ev", explained_variance)

    actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
    tf.scalar_summary('actor_entropy', actor_entropy)
    
    real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
    train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
    actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))
    tf.scalar_summary('actor_gain', actor_gain)
    
    acLoss = vLoss - self.policy_scale * (actor_gain + self.entropy_scale * actor_entropy)
    
    params = tf.trainable_variables()
    pg = tf.gradients(acLoss, params, -self.learning_rate)
    
    if self.natural:
      predictions = [values, log_actor_probs]
      
      def metric(vp1, vp2):
        v1, p1 = vp1
        v2, p2 = vp2
        
        vDist = tf.reduce_mean(tf.squared_difference(v1, v2))
        pDist = tf.reduce_mean(tfl.kl(p1, p2))
        return vDist + self.kl_scale * pDist
      
      pg = self.natgrad(params, pg, predictions, metric)
    
    return tfl.apply_grads(params, pg)
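Example #4 adds two things on top of the Example #1 loss: an explained-variance summary for the critic, and an optional natural-gradient correction whose metric combines a squared distance on values with a KL divergence on policies. The explained-variance statistic is simply 1 - MSE / Var(targets); a quick NumPy illustration with made-up numbers:

import numpy as np

# Hypothetical TD targets and critic predictions.
targets = np.array([1.0, 0.5, 0.2, 0.9])
predictions = np.array([0.9, 0.6, 0.1, 1.0])

mse = np.mean((targets - predictions) ** 2)
variance = np.mean((targets - targets.mean()) ** 2)
explained_variance = 1.0 - mse / variance   # 1.0 is perfect; <= 0 means no better than predicting the mean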
Example #5
  def train(self, states, actions, rewards):
    n = self.rlConfig.tdN
    
    state_shape = tf.shape(states)
    state_rank = tf.shape(state_shape)[0]
    experience_length = tf.gather(state_shape, state_rank-2)

    train_length = experience_length - n

    self.predictedQs = self.q_net(states)
    trainQs = tfl.batch_dot(actions, self.predictedQs)
    trainQs = tf.slice(trainQs, [0, 0], [-1, train_length])
    
    self.q_target = self.q_net#.clone()
    
    targetQs = self.q_target(states)
    realQs = tfl.batch_dot(actions, targetQs)
    maxQs = tf.reduce_max(targetQs, -1)
    targetQs = realQs if self.sarsa else maxQs
    
    tf.scalar_summary("q_mean", tf.reduce_mean(self.predictedQs))
    tf.scalar_summary("q_max", tf.reduce_mean(maxQs))
    
    # smooth between TD(m) for m<=n?
    targets = tf.slice(targetQs, [0, n], [-1, train_length])
    for i in reversed(range(n)):
      targets = tf.slice(rewards, [0, i], [-1, train_length]) + self.rlConfig.discount * targets
    targets = tf.stop_gradient(targets)

    qLosses = tf.squared_difference(trainQs, targets)
    qLoss = tf.reduce_mean(qLosses)
    tf.scalar_summary("q_loss", qLoss)
    
    variance = tf.reduce_mean(tf.squared_difference(targets, tf.reduce_mean(targets)))
    explained_variance = 1 - qLoss / variance
    tf.scalar_summary("explained_variance", explained_variance)
    
    flatQs = tf.reshape(self.predictedQs, [-1, self.action_size])
    action_probs = tf.nn.softmax(flatQs / self.temperature)
    action_probs = (1.0 - self.epsilon) * action_probs + self.epsilon / self.action_size
    entropy = -tf.reduce_sum(tf.log(action_probs) * action_probs, -1)
    entropy = tf.reduce_mean(entropy)
    tf.scalar_summary("entropy", entropy)
    
    self.params = tf.trainable_variables()
    self.gradients = tf.gradients(qLoss, self.params, -self.learning_rate)
    
    gradients = self.gradients
    
    if self.natural:
      def q_metric(q1, q2):
        return self.action_size * tf.reduce_mean(tf.squared_difference(q1, q2))
      gradients = self.natgrad(self.params, gradients, self.predictedQs, q_metric)
    
    return tfl.apply_grads(self.params, gradients)
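Example #5 also derives an exploration policy from the predicted Q-values: a Boltzmann (softmax) distribution over Q / temperature, mixed with a uniform distribution so every action keeps at least epsilon / action_size probability mass, and the entropy of that mixture is logged. A NumPy sketch of the same computation (the Q-values, temperature and epsilon are made up):

import numpy as np

q_values = np.array([1.0, 2.0, 0.5])
temperature, epsilon = 0.5, 0.1
action_size = len(q_values)

logits = q_values / temperature
probs = np.exp(logits - logits.max())
probs /= probs.sum()                                      # Boltzmann (softmax) policy
probs = (1.0 - epsilon) * probs + epsilon / action_size   # mix in uniform exploration
entropy = -np.sum(probs * np.log(probs))                  # what the "entropy" summary tracks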
    
    """
Example #6
    def getLoss(self,
                states,
                actions,
                rewards,
                sarsa=False,
                target_delay=1000,
                **kwargs):
        n = self.rlConfig.tdN

        state_shape = tf.shape(states)
        state_rank = tf.shape(state_shape)[0]
        experience_length = tf.gather(state_shape, state_rank - 2)

        train_length = experience_length - n

        trainQs = self.q_net(states)
        trainQs = tfl.batch_dot(actions, trainQs)
        trainQs = tf.slice(trainQs, [0, 0], [-1, train_length])

        self.q_target = self.q_net.clone()

        targetQs = self.q_target(states)
        realQs = tfl.batch_dot(actions, targetQs)
        maxQs = tf.reduce_max(targetQs, -1)
        targetQs = realQs if sarsa else maxQs

        # smooth between TD(m) for m<=n?
        targets = tf.slice(targetQs, [0, n], [-1, train_length])
        for i in reversed(range(n)):
            targets = tf.slice(
                rewards, [0, i],
                [-1, train_length]) + self.rlConfig.discount * targets
        # not necessary if we optimize only on q_net variables
        # but this is easier :)
        targets = tf.stop_gradient(targets)

        qLosses = tf.squared_difference(trainQs, targets)
        qLoss = tf.reduce_mean(qLosses)

        update_target = lambda: tf.group(*self.q_target.assign(self.q_net),
                                         name="update_target")
        should_update = tf.equal(tf.mod(self.global_step, target_delay), 0)
        periodic_update = tf.case([(should_update, update_target)],
                                  default=lambda: tf.no_op())

        #return qLoss, [("qLoss", qLoss)], (1000, update_target)
        return (
            qLoss,
            [("qLoss", qLoss), ("periodic_update", periodic_update)],
            #tf.initialize_variables(self.q_target.getVariables())
        )
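Example #6 keeps a separate target network (`self.q_target = self.q_net.clone()`) and syncs it every `target_delay` steps by wrapping the assign ops in a `tf.case` keyed on the global step; returning the resulting op in the stats list presumably makes the copy happen whenever the stats are fetched. A minimal plain-Python/NumPy sketch of the same "copy every k steps" idea, with hypothetical parameter arrays standing in for the two networks:

import numpy as np

TARGET_DELAY = 1000
q_net_params = [np.random.randn(4, 4), np.random.randn(4)]   # hypothetical online network
q_target_params = [p.copy() for p in q_net_params]           # hypothetical target network

def maybe_update_target(global_step):
    # Mirror of the tf.case above: copy the online parameters into the target
    # network only when the step counter is a multiple of the delay.
    if global_step % TARGET_DELAY == 0:
        for target, online in zip(q_target_params, q_net_params):
            target[...] = online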
Example #7
  def getLoss(self, states, actions, rewards, entropy_scale=0.001, policy_scale=0.01, **kwargs):
    n = self.rlConfig.tdN
    
    state_shape = tf.shape(states)
    batch_size = state_shape[0]
    experience_length = state_shape[1]
    
    train_length = experience_length - n
    
    initial_state = tf.expand_dims(self.initial_state, 0)
    initial_state = tf.tile(initial_state, tf.pack([batch_size, 1]))
    outputs, hidden = tf.nn.dynamic_rnn(self.rnn, states, initial_state=initial_state)

    values = self.critic(outputs)
    actor_probs = self.actor(outputs)
    
    trainVs = tf.slice(values, [0, 0], [-1, train_length])
    #trainVs = values[:,:train_length]

    # smooth between TD(m) for m<=n?
    targets = tf.slice(values, [0, n], [-1, train_length])
    #targets = values[:,n:]
    for i in reversed(range(n)):
      targets *= self.rlConfig.discount
      targets += tf.slice(rewards, [0, i], [-1, train_length])
    targets = tf.stop_gradient(targets)

    advantages = targets - trainVs
    vLoss = tf.reduce_mean(tf.square(advantages))

    log_actor_probs = tf.log(actor_probs)
    actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
    real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
    train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
    actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))

    acLoss = vLoss - policy_scale * (actor_gain + entropy_scale * actor_entropy)

    return acLoss, [('vLoss', vLoss), ('actor_gain', actor_gain), ('actor_entropy', actor_entropy)]