Example #1
def postprocess_dice(policy, sample_batch, others_batches, episode):
    if not policy.loss_initialized():
        batch = postprocess_ppo_gae(policy, sample_batch)
        batch[DIVERSITY_REWARDS] = batch["advantages"].copy()
        batch[DIVERSITY_VALUE_TARGETS] = batch["advantages"].copy()
        batch[DIVERSITY_ADVANTAGES] = batch["advantages"].copy()
        batch['other_action_logp'] = batch[ACTION_LOGP].copy()
        return batch

    if (not policy.config[PURE_OFF_POLICY]) or (not others_batches):
        batch = sample_batch.copy()
        batch = postprocess_ppo_gae(policy, batch)
        batch[MY_LOGIT] = batch[BEHAVIOUR_LOGITS]
        batch = postprocess_diversity(policy, batch, others_batches)
        batches = [batch]
    else:
        batches = []

    for pid, (other_policy, other_batch_raw) in others_batches.items():
        # other_batch_raw is the data collected by other policies.
        if policy.config[ONLY_TNB]:
            break
        if other_batch_raw is None:
            continue
        other_batch_raw = other_batch_raw.copy()

        # Replay this policy on the other agent's observations to get this
        # policy's action distribution.
        replay_result = policy.compute_actions(
            other_batch_raw[SampleBatch.CUR_OBS]
        )[2]
        other_batch_raw[MY_LOGIT] = replay_result[BEHAVIOUR_LOGITS]

        # Compute the diversity reward and diversity advantage of this batch.
        other_batch_raw = postprocess_diversity(
            policy, other_batch_raw, others_batches
        )

        # Compute the task advantage of this batch.
        batches.append(postprocess_ppo_gae(policy, other_batch_raw))

    # Merge all batches.
    batch = SampleBatch.concat_samples(batches) if len(batches) != 1 \
        else batches[0]

    del batch.data['new_obs']  # save memory
    del batch.data['action_prob']
    if policy.config[ONLY_TNB]:
        assert np.unique(batch["agent_index"]).size == 1
    return batch
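
The merge step above uses SampleBatch.concat_samples to stack the per-source batches row-wise. A minimal stand-alone sketch of that call (the column names and values below are illustrative, not taken from the example):

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Two small batches with the same columns; values are made up.
b1 = SampleBatch({"obs": np.zeros((3, 4), dtype=np.float32),
                  "advantages": np.ones(3, dtype=np.float32)})
b2 = SampleBatch({"obs": np.ones((2, 4), dtype=np.float32),
                  "advantages": np.zeros(2, dtype=np.float32)})

merged = SampleBatch.concat_samples([b1, b2])
assert merged.count == 5  # rows from both batches, concatenated along axis 0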
Example #2
def my_postprocess_ppo_gae(policy, sample_batch, *args, **kwargs):
    """Split a batch that may span several episodes at the info["done"]
    boundaries, mark the last step of each slice as done, run GAE
    postprocessing on each slice, and concatenate the results."""
    if sample_batch.get('infos') is not None:
        idx = [i for i, x in enumerate(sample_batch['infos']) if x['done']]
        if idx:
            idx.append(sample_batch.count)
            sbatch = sample_batch.slice(0, idx[0] + 1)
            sbatch['dones'][-1] = True
            batch = postprocess_ppo_gae(policy, sbatch, *args, **kwargs)
            for s, t in zip(idx[:-1], idx[1:]):
                sbatch = sample_batch.slice(s, t + 1)
                sbatch['dones'][-1] = True
                batch = batch.concat(
                    postprocess_ppo_gae(policy, sbatch, *args, **kwargs))
            return batch
    return postprocess_ppo_gae(policy, sample_batch, *args, **kwargs)
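
The boundary detection in the example above reduces to a small index computation; here is an isolated sketch of just that part, using made-up infos (no RLlib objects involved):

# Made-up per-step infos carrying a custom "done" flag.
infos = [{"done": False}, {"done": True},
         {"done": False}, {"done": False}, {"done": True}]

idx = [i for i, x in enumerate(infos) if x["done"]]
print(idx)  # [1, 4] -> the batch is cut at these rows, and each slice gets its
            # own postprocess_ppo_gae pass with dones[-1] forced to True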
Example #3
def postprocess_mu_zero(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:

    sample_batch = postprocess_ppo_gae(policy, sample_batch,
                                       other_agent_batches, episode)

    return sample_batch
Example #4
def postprocess_sample_batch(policy, sample_batch, other_agent_batches=None, episode=None):
    # In theory you might want to apply the intrinsic rewards _after_ normalizing, so that you
    # can use the same reward values across environments. In practice, using the same value
    # across envs works well, and applying them beforehand means you don't have to change the
    # intrinsic reward value based on whether you're normalizing the rewards.
    sample_batch = intrinsic_reward_postprocess_sample_batch(
        policy, sample_batch, other_agent_batches=other_agent_batches, episode=episode)
    sample_batch = reward_normalize_postprocess_sample_batch(
        policy, sample_batch, other_agent_batches=other_agent_batches, episode=episode)
    return postprocess_ppo_gae(policy, sample_batch, other_agent_batches, episode)
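
intrinsic_reward_postprocess_sample_batch and reward_normalize_postprocess_sample_batch are not shown in this example. As a rough, hypothetical sketch of what the intrinsic-reward step could look like (the function name, bonus scheme, and 0.01 coefficient are assumptions for illustration only):

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

def illustrative_intrinsic_reward_postprocess(policy, sample_batch,
                                              other_agent_batches=None,
                                              episode=None):
    # Hypothetical: add a small constant exploration bonus to the extrinsic
    # rewards before normalization and GAE, so that the advantages reflect
    # the combined signal.
    bonus = 0.01 * np.ones_like(sample_batch[SampleBatch.REWARDS])
    sample_batch[SampleBatch.REWARDS] = sample_batch[SampleBatch.REWARDS] + bonus
    return sample_batch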
Example #5
def postprocess_ppo_moa(policy,
                        sample_batch,
                        other_agent_batches=None,
                        episode=None):
    """
    Add the influence reward to the trajectory.
    Then, add the policy logits, VF preds, and advantages to the trajectory.
    :return: Updated trajectory (batch)
    """
    batch = moa_postprocess_trajectory(policy, sample_batch)
    batch = postprocess_ppo_gae(policy, batch)
    return batch
Example #6
def postprocess_ppo_gae_modified(policy,
                                 sample_batch,
                                 other_agent_batches=None,
                                 episode=None):
    """This function add extra placeholder, by creating new entries in batch
    which the following RLLib procedure would detect and create placeholder
    based on the shape of them."""
    batch = postprocess_ppo_gae(policy, sample_batch, other_agent_batches,
                                episode)
    if not policy.loss_initialized():
        batch[JOINT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS],
                                         dtype=np.float32)
        batch[PEER_ACTION] = np.zeros_like(
            sample_batch[SampleBatch.ACTIONS], dtype=np.float32
        )  # peer_action is needed no matter use joint_dataset or not.
    return batch
Example #7
def postprocess_trajectory(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Postprocesses a trajectory and returns the processed trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
    contain a truncated (at-the-end) episode, in case the
    `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
    exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`)
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from the
            same episode). NOTE: The other agents use the same policy.
        episode (Optional[MultiAgentEpisode]): Optional multi-agent episode
            object in which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    if not policy.config["vtrace"]:
        sample_batch = postprocess_ppo_gae(policy, sample_batch,
                                           other_agent_batches, episode)

    # TODO: (sven) remove this del once we have trajectory view API fully in
    #  place.
    del sample_batch.data["new_obs"]  # not used, so save some bandwidth

    return sample_batch
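
A postprocess function like this is normally plugged into the policy when the policy class is built. A rough sketch, assuming an RLlib version that still provides the build_tf_policy template (the dummy loss and class name below are placeholders, not part of the example):

import tensorflow as tf
from ray.rllib.policy.tf_policy_template import build_tf_policy

def dummy_loss(policy, model, dist_class, train_batch):
    # Placeholder loss so the sketch is self-contained.
    return tf.constant(0.0)

SketchPolicy = build_tf_policy(
    name="SketchPolicy",
    loss_fn=dummy_loss,
    postprocess_fn=postprocess_trajectory,
)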
Example #8
def postprocess_no_replay_values(
        policy, sample_batch, others_batches, episode
):
    """Replay to collect logits. Pretend the samples from other agent is
    exactly my samples (no change values, advantages, value target)."""
    config = policy.config
    assert not config[REPLAY_VALUES]
    if not policy.loss_initialized():
        batch = postprocess_ppo_gae(policy, sample_batch)
        batch["abs_advantage"] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch['debug_ratio'] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch[NOVELTY_REWARDS] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch[NOVELTY_VALUE_TARGETS] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch[NOVELTY_ADVANTAGES] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch['other_action_logp'] = np.zeros_like(
            batch[ACTION_LOGP], dtype=np.float32
        )
        return batch

    if (not config[PURE_OFF_POLICY]) or (not others_batches):
        batch = sample_batch.copy()
        batch = postprocess_ppo_gae(policy, batch)
        batch["abs_advantage"] = np.abs(batch[Postprocessing.ADVANTAGES])
        batch[MY_LOGIT] = batch[BEHAVIOUR_LOGITS]
        batch = postprocess_diversity(policy, batch, others_batches)
        batches = [batch]
    else:
        batches = []

    for pid, (other_policy, other_batch_raw) in others_batches.items():

        if policy.config[ONLY_TNB]:
            break

        # The logic is that EVEN though we may use DISABLE or NO_REPLAY_VALUES,
        # we still want to take a look at those statistics.
        # Maybe in the future we can add a knob to remove all such slow stats.

        if other_batch_raw is None:
            continue

        other_batch_raw = other_batch_raw.copy()

        replay_result = policy.compute_actions(
            other_batch_raw[SampleBatch.CUR_OBS]
        )[2]

        other_batch_raw[MY_LOGIT] = replay_result[BEHAVIOUR_LOGITS]

        # The behaviour logits used for computing diversity come from the
        # other's policy, and the comparison subjects are the other policies
        # too, so there is a mess. We need to either (1) compute novelty using
        # my logits or (2) compute novelty with the other's logits and compare
        # against my policy.
        # Maybe the first solution sounds more natural.
        other_batch_raw = postprocess_diversity(
            policy, other_batch_raw, others_batches
        )

        to_add_batch = postprocess_ppo_gae(policy, other_batch_raw)
        to_add_batch["abs_advantage"] = np.abs(
            to_add_batch[Postprocessing.ADVANTAGES]
        )
        batches.append(to_add_batch)

    for batch in batches:
        if ("debug_ratio" not in batch) and (not config[REPLAY_VALUES]):
            batch['debug_ratio'] = np.zeros_like(
                batch['advantages'], dtype=np.float32
            )
    batch = SampleBatch.concat_samples(batches) if len(batches) != 1 \
        else batches[0]
    return batch