Example #1
def _add_inequity_aversion_welfare_to_batch(sample_batch: SampleBatch,
                                            opp_ag_batch: SampleBatch,
                                            alpha: float, beta: float,
                                            gamma: float,
                                            lambda_: float) -> SampleBatch:
    """
    :param sample_batch: SampleBatch to mutate
    :param opp_ag_batch: SampleBatch of the opponent agent
    :param alpha: coefficient of the disvalue when own discounted reward is lower than the opponent's
    :param beta: coefficient of the disvalue when own discounted reward is higher than the opponent's
    :param gamma: discount factor
    :param lambda_: lambda used together with gamma when discounting the rewards
    :return: sample_batch mutated with WELFARE_INEQUITY_AVERSION added
    """

    own_rewards = np.array(sample_batch[sample_batch.REWARDS])
    opp_rewards = np.array(opp_ag_batch[opp_ag_batch.REWARDS])
    # Flip so that discount() accumulates over the rewards already received
    # (a backward-looking discounted sum), then flip the result back.
    own_rewards = np.flip(own_rewards)
    opp_rewards = np.flip(opp_rewards)
    delta = (discount(own_rewards, gamma * lambda_) -
             discount(opp_rewards, gamma * lambda_))
    delta = np.flip(delta)
    disvalue_lower_than_opp = alpha * (-delta)
    disvalue_higher_than_opp = beta * delta
    disvalue_lower_than_opp[disvalue_lower_than_opp < 0] = 0
    disvalue_higher_than_opp[disvalue_higher_than_opp < 0] = 0

    welfare = sample_batch[
        sample_batch.
        REWARDS] - disvalue_lower_than_opp - disvalue_higher_than_opp

    sample_batch.data[WELFARE_INEQUITY_AVERSION] = welfare

    # print("inequity aversion welfare", welfare)
    return sample_batch
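
All of the examples on this page call a discount helper that is not shown. As a point of reference, here is a minimal sketch of a common implementation (an exponentially discounted cumulative sum computed with a reversed IIR filter, the trick used by RLlib); it is an assumption about that helper, not code taken from these examples:

import numpy as np
import scipy.signal


def discount(x: np.ndarray, gamma: float) -> np.ndarray:
    """Discounted cumulative sum: out[t] = sum_k gamma**k * x[t + k]."""
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1], axis=0)[::-1]


# discount(np.array([1.0, 1.0, 1.0]), 0.5) -> array([1.75, 1.5, 1.0])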
Example #2
def compute_advantages(rewards, last_r, gamma, lambda_, values, use_gae=True):
    if use_gae:
        vpred_t = np.concatenate([values, np.array([last_r])])
        delta_t = (rewards + gamma * vpred_t[1:] - vpred_t[:-1])
        advantage = discount(delta_t, gamma * lambda_)
        value_target = (advantage + values).copy().astype(np.float32)
    else:
        rewards_plus_v = np.concatenate([rewards, np.array([last_r])])
        advantage = discount(rewards_plus_v, gamma)[:-1]
        value_target = np.zeros_like(advantage)
    advantage = advantage.copy().astype(np.float32)
    return advantage, value_target
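
For reference, a toy call to the function above (assuming the discount helper sketched after Example #1 and made-up reward/value arrays) illustrates the expected shapes:

import numpy as np

# Hypothetical 3-step trajectory.
rewards = np.array([1.0, 0.0, 2.0])
values = np.array([0.5, 0.4, 0.3])    # V(s_t) predictions, one per step
last_r = 0.0                          # bootstrap value after the last step

advantage, value_target = compute_advantages(
    rewards, last_r, gamma=0.99, lambda_=0.95, values=values, use_gae=True)

assert advantage.shape == rewards.shape
assert value_target.shape == rewards.shape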
Example #3
def calculate_advantages(policy,
                         sample_batch,
                         other_agent_batches=None,
                         episode=None):
    sample_batch[Postprocessing.ADVANTAGES] = discount(
        sample_batch[SampleBatch.REWARDS], policy.config["gamma"])
    return sample_batch
Example #4
def calculate_advantages(policy,
                         sample_batch,
                         other_agent_batches=None,
                         episode=None):
    # print(sample_batch)
    sample_batch["returns"] = discount(sample_batch["rewards"], 0.99)
    return sample_batch
Example #5
def compute_advantages(rollout, last_r, gamma=0.9, lambda_=1.0, use_gae=True):
    """Given a rollout, compute its value targets and the advantage.

    Args:
        rollout (SampleBatch): SampleBatch of a single trajectory
        last_r (float): Value estimation for last observation
        gamma (float): Discount factor.
        lambda_ (float): Parameter for GAE
        use_gae (bool): Using Generalized Advantage Estimation

    Returns:
        SampleBatch (SampleBatch): Object with experience from rollout and
            processed rewards.
    """

    traj = {}

    trajsize = len(rollout["actions"])
    for key in rollout:
        traj[key] = np.stack(rollout[key])

    if use_gae:
        assert "vf_preds" in rollout, "Values not found!"
        vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])])
        # delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
        delta_t = (1 + traj['rewards'])*(1 + gamma * vpred_t[1:]) - 1 - vpred_t[:-1]
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj["advantages"] = discount(delta_t, gamma * lambda_)
        traj["value_targets"] = (traj["advantages"] + traj["vf_preds"]).copy().astype(np.float32)
    else:
        rewards_plus_v = np.concatenate(
            [rollout["rewards"], np.array([last_r])])
        traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]
        # TODO(ekl): support using a critic without GAE
        traj["value_targets"] = np.zeros_like(traj["advantages"])

    traj["advantages"] = traj["advantages"].copy().astype(np.float32)

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)
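
Note that Example #5 replaces the standard additive TD error (the commented-out line) with a multiplicative variant. The two differ only by a cross term; a small sketch on made-up arrays makes the relation explicit:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0])
vpred_t = np.array([0.5, 0.4, 0.3])   # V(s_0), V(s_1), bootstrap value

# Standard additive TD error used elsewhere on this page:
delta_standard = rewards + gamma * vpred_t[1:] - vpred_t[:-1]

# Multiplicative variant used in Example #5:
delta_mult = (1 + rewards) * (1 + gamma * vpred_t[1:]) - 1 - vpred_t[:-1]

# They agree up to the cross term rewards * gamma * V(s_{t+1}):
np.testing.assert_allclose(
    delta_mult, delta_standard + rewards * gamma * vpred_t[1:])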
Example #6
def compute_returns(rollout, last_r, gamma):
    traj = {}
    trajsize = len(rollout["actions"])
    for key in rollout:
        traj[key] = np.stack(rollout[key])

    rewards_plus_v = np.concatenate([rollout["rewards"], np.array([last_r])])
    traj["returns"] = discount(rewards_plus_v, gamma)[:-1]

    traj["returns"] = traj["returns"].copy().astype(np.float32)

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)
Example #7
def _compute_advantages_for_diversity(
        rewards, last_r, gamma, lambda_, values, use_gae=True
):
    """Compute the diversity advantage."""
    if use_gae:
        vpred_t = np.concatenate([values, np.array([last_r])])
        delta_t = (rewards + gamma * vpred_t[1:] - vpred_t[:-1])
        advantage = discount(delta_t, gamma * lambda_)
        value_target = (advantage + values).copy().astype(np.float32)
    else:
        rewards_plus_v = np.concatenate([
            rewards.reshape(-1), np.array([last_r]).reshape(-1)
        ])
        if rewards_plus_v.size <= 2:
            logger.warning(
                "********** Current reward is empty: {}. last r {}. values {}"
                ".".format(
                    rewards, last_r, values
                ))
            # rewards_plus_v = np.array([last_r])
        advantage = discount(rewards_plus_v, gamma)[:-1]
        value_target = np.zeros_like(advantage, dtype=np.float32)
    advantage = advantage.copy().astype(np.float32)
    return advantage, value_target
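
The warning in the non-GAE branch guards very short trajectories (zero or one reward step). In the empty case, rewards_plus_v contains only last_r and the [:-1] slice yields an empty advantage, as the following toy check shows (again assuming the discount helper sketched after Example #1):

import numpy as np

empty_rewards = np.array([])
last_r = 0.7
rewards_plus_v = np.concatenate([empty_rewards.reshape(-1),
                                 np.array([last_r]).reshape(-1)])
advantage = discount(rewards_plus_v, 0.99)[:-1]
assert advantage.size == 0  # empty trajectory -> empty advantage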
Example #8
def compute_advantages_replay(rollout,
                              my_last_r,
                              other_last_r,
                              gamma=0.9,
                              lambda_=1.0,
                              use_gae=True,
                              clip_action_prob_ratio=1,
                              clip_advantage=False):
    """Given a rollout, compute its value targets and the advantage.

    Args:
        rollout (SampleBatch): SampleBatch of a single trajectory
        my_last_r (float): Value estimation for the last observation from the
            own value function
        other_last_r (float): Value estimation for the last observation from
            the other policy's value function
        gamma (float): Discount factor.
        lambda_ (float): Parameter for GAE
        use_gae (bool): Using Generalized Advantage Estimation
        clip_action_prob_ratio (float): Upper bound used to clip the
            importance ratio between the two policies
        clip_advantage (bool): If True, clip the other policy's advantage to
            be non-negative

    Returns:
        SampleBatch (SampleBatch): Object with experience from rollout and
            processed rewards.
    """
    traj = {}
    trajsize = len(rollout[SampleBatch.ACTIONS])
    for key in rollout:
        traj[key] = np.stack(rollout[key])

    if use_gae:
        assert SampleBatch.VF_PREDS in rollout, "Values not found!"
        # vpred_t = np.concatenate(
        #     [rollout[SampleBatch.VF_PREDS],
        #      np.array([last_r])]
        # )
        # delta_t = \
        #     traj[SampleBatch.REWARDS] + gamma * vpred_t[1:] * (
        #             1 - lambda_) - vpred_t[:-1]

        # Use the other policy's values to compute advantages.
        # This advantage will not be used to compute the value target.
        other_vpred_t = np.concatenate(
            [rollout["other_vf_preds"],
             np.array([other_last_r])])
        other_delta_t = (traj[SampleBatch.REWARDS] +
                         gamma * other_vpred_t[1:] - other_vpred_t[:-1])
        other_advantage = discount(other_delta_t, gamma * lambda_)
        other_advantage = (other_advantage - other_advantage.mean()) / max(
            1e-4, other_advantage.std())

        if clip_advantage:
            # If clipping happens here, the normalization should not happen
            # again in the optimizer.
            other_advantage = np.clip(other_advantage, 0, None)

        # We put the other policy's advantage in the 'advantages' field. We
        # need to make sure this field is not used in later postprocessing.
        traj[Postprocessing.ADVANTAGES] = other_advantage

        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        # traj[Postprocessing.ADVANTAGES] = discount(delta_t, gamma * lambda_)
        # advantage = ratio * delta_t
        # Instead, we use the naive form (a single Bellman backup) to compute
        # the value target.
        # advantage = calculate_gae_advantage(
        #     traj[SampleBatch.VF_PREDS], delta_t, ratio, lambda_, gamma
        # )

        # The ratio is almost deprecated; we only use it to compute the value
        # target.
        ratio = np.exp(traj['action_logp'] - traj["other_action_logp"])
        ratio = np.clip(ratio, 0.0, clip_action_prob_ratio)
        traj["debug_ratio"] = ratio
        fake_delta = np.zeros_like(ratio)
        fake_delta[-1] = 1
        traj["debug_fake_adv"] = calculate_gae_advantage(
            np.zeros_like(ratio), fake_delta, ratio, lambda_, gamma)

        # value_target = (
        #         traj[Postprocessing.ADVANTAGES] + traj[SampleBatch.VF_PREDS]
        # ).copy().astype(np.float32)
        my_vpred_t = np.concatenate(
            [rollout[SampleBatch.VF_PREDS],
             np.array([my_last_r])])
        assert ratio.shape == traj[SampleBatch.REWARDS].shape
        # value_target = \
        #     ratio * (traj[SampleBatch.REWARDS] + gamma * my_vpred_t[1:])

        value_target = (ratio *
                        (traj[SampleBatch.REWARDS] + gamma * my_vpred_t[1:]) +
                        (1 - ratio) * (my_vpred_t[:-1]))

        traj[Postprocessing.VALUE_TARGETS] = value_target

    else:
        raise NotImplementedError()
        # rewards_plus_v = np.concatenate(
        #     [rollout[SampleBatch.REWARDS],
        #      np.array([last_r])])
        #
        # ratio = np.exp(traj['action_logp'] - traj["other_action_logp"])
        # assert_nan(ratio)
        # delta = discount(rewards_plus_v, gamma)[:-1]
        #
        # assert delta.shape == ratio.shape
        #
        # traj[Postprocessing.ADVANTAGES] = ratio * delta
        # traj[Postprocessing.VALUE_TARGETS] = np.zeros_like(
        #     traj[Postprocessing.ADVANTAGES])

    traj[Postprocessing.ADVANTAGES] = traj[
        Postprocessing.ADVANTAGES].copy().astype(np.float32)

    assert all(val.shape[0] == trajsize for val in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)
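
The value target in the GAE branch of Example #8 is an importance-weighted one-step backup: with ratio rho it interpolates between the bootstrapped target r + gamma * V(s') and the current prediction V(s). A minimal sketch on hypothetical arrays (the numbers are illustrative only):

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0])
my_vpred_t = np.array([0.5, 0.4, 0.3])   # own V(s_0), V(s_1), bootstrap value
ratio = np.array([1.0, 0.25])            # clipped importance ratios

one_step_target = rewards + gamma * my_vpred_t[1:]
value_target = ratio * one_step_target + (1 - ratio) * my_vpred_t[:-1]

# ratio == 1 recovers the ordinary one-step target; ratio == 0 keeps the
# current value prediction unchanged.
assert np.isclose(value_target[0], one_step_target[0])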