示例#1
0
    def logp(self, actions, eps=1e-6):
        """Build the log-probability of ``actions`` under ``self.logits``.

        Args:
            actions: An int64 tensor with shape [BATCH_SIZE]
            eps: A small float constant added to the selected probability so
                the argument of the log stays away from zero

        Returns:
            actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
        """
        assert len(actions.shape) == 1

        # Softmax along dim 1; the maximum along that dim is subtracted
        # before exponentiating.
        shifted = self.logits - layers.reduce_max(self.logits, dim=1)
        exp_shifted = layers.exp(shifted)
        probs = exp_shifted / layers.reduce_sum(exp_shifted, dim=1)

        # Select the probability of each given action with a one-hot mask.
        mask = layers.one_hot(
            layers.unsqueeze(actions, axes=[1]), probs.shape[1])
        mask = layers.cast(mask, dtype='float32')
        chosen_prob = layers.reduce_sum(probs * mask, dim=1)

        return layers.log(chosen_prob + eps)
示例#2
0
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        """Build the sample-weighted TD update for ``self.model``.

        The next action is chosen greedily from ``self.model`` and its value
        is read from ``self.target_model``.

        Args:
            obs: input passed to ``self.model.value``
            action: action indices, one-hot encoded with depth ``self.act_dim``
            reward: reward term of the target
            next_obs: input passed to both models for the bootstrapped value
            terminal: flag cast to float32; the bootstrapped term is scaled
                by ``1 - terminal``
            sample_weight: multiplied into the squared error before the mean

        Returns:
            (cost, delta): the mean weighted squared error that is minimized
            with Adam, and the absolute difference between the target and
            the predicted action value.
        """
        q_values = self.model.value(obs)
        action_mask = layers.one_hot(action, self.act_dim)
        pred_action_value = layers.reduce_sum(action_mask * q_values, dim=1)

        # Greedy next action comes from the online model ...
        online_next_q = self.model.value(next_obs)
        best_action = layers.unsqueeze(
            layers.argmax(online_next_q, axis=-1), axes=[1])
        best_action_mask = layers.one_hot(best_action, self.act_dim)
        # ... and is evaluated with the target model.
        target_next_q = self.target_model.value(next_obs)
        next_value = layers.reduce_sum(
            best_action_mask * target_next_q, dim=1)
        # The bootstrapped value is treated as a constant.
        next_value.stop_gradient = True

        not_done = 1.0 - layers.cast(terminal, dtype='float32')
        target = reward + not_done * self.gamma * next_value

        delta = layers.abs(target - pred_action_value)
        weighted_error = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(weighted_error)

        fluid.optimizer.Adam(
            learning_rate=self.lr, epsilon=1e-3).minimize(cost)
        return cost, delta
示例#3
0
    def learn(self,
              obs,
              action,
              reward,
              next_obs,
              terminal,
              learning_rate=None):
        """Build the TD update for the value model ``self.model``.

        The next action is chosen greedily from ``self.model`` and its value
        is read from ``self.target_model``.

        Args:
            obs: input passed to ``self.model.value``
            action: action indices, one-hot encoded with depth ``self.act_dim``
            reward: reward term of the target
            next_obs: input passed to both models for the bootstrapped value
            terminal: flag cast to float32; the bootstrapped term is scaled
                by ``1 - terminal``
            learning_rate: learning rate for Adam; when None, ``self.lr`` is
                used and must be a float

        Returns:
            cost: the mean squared error between predicted action value and
            target, which is minimized with Adam.
        """
        # Fall back to the configured learning rate when none is given.
        if learning_rate is None:
            assert isinstance(
                self.lr,
                float), "Please set the learning rate of DQN in initializaion."
            learning_rate = self.lr

        q_values = self.model.value(obs)
        action_mask = layers.cast(
            layers.one_hot(action, self.act_dim), dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_mask, q_values), dim=1)

        # Greedy next action comes from the online model ...
        online_next_q = self.model.value(next_obs)
        best_action = layers.unsqueeze(
            layers.argmax(online_next_q, axis=-1), axes=[1])
        best_action_mask = layers.one_hot(best_action, self.act_dim)
        # ... and is evaluated with the target model.
        target_next_q = self.target_model.value(next_obs)
        next_value = layers.reduce_sum(
            best_action_mask * target_next_q, dim=1)
        # The bootstrapped value is treated as a constant.
        next_value.stop_gradient = True

        not_done = 1.0 - layers.cast(terminal, dtype='float32')
        target = reward + not_done * self.gamma * next_value

        cost = layers.reduce_mean(
            layers.square_error_cost(pred_action_value, target))
        fluid.optimizer.Adam(
            learning_rate=learning_rate, epsilon=1e-3).minimize(cost)
        return cost
示例#4
0
def from_importance_weights(behaviour_actions_log_probs,
                            target_actions_log_probs,
                            discounts,
                            rewards,
                            values,
                            bootstrap_value,
                            clip_rho_threshold=1.0,
                            clip_pg_rho_threshold=1.0,
                            name='vtrace_from_logits'):
    r"""V-trace for softmax policies.

    Calculates V-trace actor critic targets for softmax policies as described
    in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given
    rewards and actions.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size and
    NUM_ACTIONS refers to the number of actions.

    Args:
      behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
        log-probabilities of actions in behaviour policy.
      target_actions_log_probs: A float32 tensor of shape [T, B] of
        log-probabilities of actions in target policy.
      discounts: A float32 tensor of shape [T, B] with the discount encountered
        when following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] with the rewards generated by
        following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function estimates
        wrt. the target policy.
      bootstrap_value: A float32 of shape [B] with the value function estimate at
        time T.
      clip_rho_threshold: A scalar float with the clipping threshold for
        importance weights (rho) when calculating the baseline targets (vs).
        rho^bar in the paper. If None, no clipping is applied.
      clip_pg_rho_threshold: A scalar float with the clipping threshold
        on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
        If None, no clipping is applied.
      name: Not used by this implementation; kept so the signature stays
        compatible with existing callers.

    Returns:
      A VTraceReturns namedtuple (vs, pg_advantages) where:
        vs: A float32 tensor of shape [T, B]. Can be used as target to
          train a baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
          advantage in the calculation of policy gradients.
    """

    # Log importance sampling weights. The weight is the ratio of the TARGET
    # policy probability to the BEHAVIOUR policy probability,
    # rho = pi(a|x) / mu(a|x), so in log-space it is target minus behaviour.
    log_rhos = target_actions_log_probs - behaviour_actions_log_probs
    rhos = layers.exp(log_rhos)

    # rho clipped at rho^bar, used for the value targets.
    if clip_rho_threshold is not None:
        clip_rho_threshold = layers.fill_constant([1], 'float32',
                                                  clip_rho_threshold)
        clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
    else:
        clipped_rhos = rhos

    # Trace-cutting coefficients: rho clipped at 1.
    constant_one = layers.fill_constant([1], 'float32', 1.0)
    cs = layers.elementwise_min(rhos, constant_one)

    # Append bootstrapped value to get [v1, ..., v_t+1]
    values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
    values_t_plus_1 = layers.concat(
        [values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    # \delta_s * V
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    vs_minus_v_xs = recursively_scan(discounts, cs, deltas)

    # Add V(x_s) to get v_s.
    vs = layers.elementwise_add(vs_minus_v_xs, values)

    # Advantage for policy gradient: shift vs by one step and append the
    # bootstrap value to get [vs_1, ..., vs_t+1].
    vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
    vs_t_plus_1 = layers.concat(
        [vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)

    # rho clipped separately for the policy-gradient term.
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
                                                     clip_pg_rho_threshold)
        clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
    else:
        clipped_pg_rhos = rhos
    pg_advantages = (clipped_pg_rhos *
                     (rewards + discounts * vs_t_plus_1 - values))

    # Make sure no gradients backpropagated through the returned values.
    vs.stop_gradient = True
    pg_advantages.stop_gradient = True
    return VTraceReturns(vs=vs, pg_advantages=pg_advantages)