def postprocess_dice(policy, sample_batch, others_batches, episode):
    if not policy.loss_initialized():
        batch = postprocess_ppo_gae(policy, sample_batch)
        batch[DIVERSITY_REWARDS] = batch["advantages"].copy()
        batch[DIVERSITY_VALUE_TARGETS] = batch["advantages"].copy()
        batch[DIVERSITY_ADVANTAGES] = batch["advantages"].copy()
        batch['other_action_logp'] = batch[ACTION_LOGP].copy()
        return batch

    if (not policy.config[PURE_OFF_POLICY]) or (not others_batches):
        batch = sample_batch.copy()
        batch = postprocess_ppo_gae(policy, batch)
        batch[MY_LOGIT] = batch[BEHAVIOUR_LOGITS]
        batch = postprocess_diversity(policy, batch, others_batches)
        batches = [batch]
    else:
        batches = []

    for pid, (other_policy, other_batch_raw) in others_batches.items():
        # other_batch_raw is the data collected by other policies.
        if policy.config[ONLY_TNB]:
            break
        if other_batch_raw is None:
            continue
        other_batch_raw = other_batch_raw.copy()

        # Replay this policy on the other agent's observations to get this
        # policy's action distribution for those states.
        replay_result = policy.compute_actions(
            other_batch_raw[SampleBatch.CUR_OBS]
        )[2]
        other_batch_raw[MY_LOGIT] = replay_result[BEHAVIOUR_LOGITS]

        # Compute the diversity reward and diversity advantage of this batch.
        other_batch_raw = postprocess_diversity(
            policy, other_batch_raw, others_batches
        )

        # Compute the task advantage of this batch.
        batches.append(postprocess_ppo_gae(policy, other_batch_raw))

    # Merge all batches.
    batch = SampleBatch.concat_samples(batches) if len(batches) != 1 \
        else batches[0]

    del batch.data['new_obs']  # save memory
    del batch.data['action_prob']
    if policy.config[ONLY_TNB]:
        assert np.unique(batch["agent_index"]).size == 1
    return batch
def my_postprocess_ppo_gae(policy, sample_batch, *args, **kwargs):
    if sample_batch.get('infos') is not None:
        # Steps at which an episode actually terminated inside this batch.
        idx = [i for i, x in enumerate(sample_batch['infos']) if x['done']]
        if idx:
            # Treat the truncated tail (if any) as one more chunk.
            if idx[-1] != sample_batch.count - 1:
                idx.append(sample_batch.count - 1)
            # First chunk: rows 0 .. idx[0] inclusive.
            sbatch = sample_batch.slice(0, idx[0] + 1)
            sbatch['dones'][-1] = True
            batch = postprocess_ppo_gae(policy, sbatch, *args, **kwargs)
            for s, t in zip(idx[:-1], idx[1:]):
                # Each subsequent chunk starts right after the previous done
                # step and ends at row t inclusive.
                sbatch = sample_batch.slice(s + 1, t + 1)
                sbatch['dones'][-1] = True
                # SampleBatch.concat returns a new batch, so re-assign it.
                batch = batch.concat(
                    postprocess_ppo_gae(policy, sbatch, *args, **kwargs))
            return batch
    return postprocess_ppo_gae(policy, sample_batch, *args, **kwargs)
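# Tiny, self-contained illustration (toy data, not from the source) of the
# chunk boundaries produced by the splitting logic above: with `done` at steps
# 2 and 5 in an 8-step batch, the chunks cover rows 0-2, 3-5, and 6-7 (the
# truncated tail).
infos = [{'done': d} for d in
         [False, False, True, False, False, True, False, False]]
count = len(infos)
idx = [i for i, x in enumerate(infos) if x['done']]      # [2, 5]
if idx[-1] != count - 1:
    idx.append(count - 1)                                # [2, 5, 7]
bounds = [(0, idx[0] + 1)] + [(s + 1, t + 1)
                              for s, t in zip(idx[:-1], idx[1:])]
print(bounds)  # [(0, 3), (3, 6), (6, 8)] -> slice() arguments per chunk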
def postprocess_mu_zero(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    sample_batch = postprocess_ppo_gae(policy, sample_batch,
                                       other_agent_batches, episode)
    return sample_batch
def postprocess_sample_batch(policy, sample_batch, other_agent_batches=None,
                             episode=None):
    # In theory you might want to add the intrinsic reward _after_ normalizing,
    # so that the same intrinsic-reward value could be reused across
    # environments. In practice the same value works well across envs anyway,
    # and adding it beforehand means the intrinsic-reward scale does not have
    # to change depending on whether reward normalization is enabled.
    sample_batch = intrinsic_reward_postprocess_sample_batch(
        policy, sample_batch, other_agent_batches=other_agent_batches,
        episode=episode)
    sample_batch = reward_normalize_postprocess_sample_batch(
        policy, sample_batch, other_agent_batches=other_agent_batches,
        episode=episode)
    return postprocess_ppo_gae(policy, sample_batch, other_agent_batches,
                               episode)
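# The two helpers called above are defined elsewhere in this codebase. As a
# rough sketch of the ordering discussed in the comment (intrinsic reward
# first, normalization second), a per-batch reward normalizer could look like
# the function below. The name `normalize_batch_rewards` and the per-batch-std
# approach are assumptions for illustration; the project's real
# reward_normalize_postprocess_sample_batch likely maintains running
# statistics across batches instead.
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch


def normalize_batch_rewards(policy, sample_batch, eps=1e-8):
    # At this point the rewards already contain the intrinsic bonus, so both
    # components are scaled together and the bonus needs no separate tuning.
    rewards = sample_batch[SampleBatch.REWARDS].astype(np.float32)
    sample_batch[SampleBatch.REWARDS] = rewards / (rewards.std() + eps)
    return sample_batch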
def postprocess_ppo_moa(policy, sample_batch, other_agent_batches=None,
                        episode=None):
    """Add the influence reward to the trajectory.

    Then, add the policy logits, VF preds, and advantages to the trajectory.

    :return: Updated trajectory (batch)
    """
    batch = moa_postprocess_trajectory(policy, sample_batch)
    batch = postprocess_ppo_gae(policy, batch)
    return batch
def postprocess_ppo_gae_modified(policy, sample_batch,
                                 other_agent_batches=None, episode=None):
    """Add extra placeholders by creating new entries in the batch.

    The subsequent RLlib procedure detects these entries and creates the
    corresponding placeholders based on their shapes."""
    batch = postprocess_ppo_gae(policy, sample_batch, other_agent_batches,
                                episode)
    if not policy.loss_initialized():
        batch[JOINT_OBS] = np.zeros_like(
            sample_batch[SampleBatch.CUR_OBS], dtype=np.float32)
        # peer_action is needed no matter whether joint_dataset is used.
        batch[PEER_ACTION] = np.zeros_like(
            sample_batch[SampleBatch.ACTIONS], dtype=np.float32)
    return batch
def postprocess_trajectory(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Postprocesses a trajectory and returns the processed trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
      contain a truncated (at-the-end) episode, in case the
      `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
      exactly one episode (no matter how long).

    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`).
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from
            the same episode). NOTE: The other agents use the same policy.
        episode (Optional[MultiAgentEpisode]): Optional multi-agent episode
            object in which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    if not policy.config["vtrace"]:
        sample_batch = postprocess_ppo_gae(policy, sample_batch,
                                           other_agent_batches, episode)

    # TODO: (sven) remove this del once we have trajectory view API fully in
    #  place.
    del sample_batch.data["new_obs"]  # not used, so save some bandwidth

    return sample_batch
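# Usage sketch (an assumption, not shown in this snippet): in the
# policy-template RLlib API of this era, a postprocessor like the one above is
# attached to a policy class through the `postprocess_fn` hook, for example via
# `with_updates` on an already-built policy class. The resulting class name is
# illustrative.
from ray.rllib.agents.ppo.appo_tf_policy import AsyncPPOTFPolicy

CustomAsyncPPOTFPolicy = AsyncPPOTFPolicy.with_updates(
    name="CustomAsyncPPOTFPolicy",
    postprocess_fn=postprocess_trajectory,
)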
def postprocess_no_replay_values(
        policy, sample_batch, others_batches, episode
):
    """Replay to collect logits. Pretend the samples from other agents are
    exactly my samples (do not change values, advantages, value targets)."""
    config = policy.config
    assert not config[REPLAY_VALUES]

    if not policy.loss_initialized():
        batch = postprocess_ppo_gae(policy, sample_batch)
        batch["abs_advantage"] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch['debug_ratio'] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch[NOVELTY_REWARDS] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch[NOVELTY_VALUE_TARGETS] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch[NOVELTY_ADVANTAGES] = np.zeros_like(
            batch["advantages"], dtype=np.float32
        )
        batch['other_action_logp'] = np.zeros_like(
            batch[ACTION_LOGP], dtype=np.float32
        )
        return batch

    if (not config[PURE_OFF_POLICY]) or (not others_batches):
        batch = sample_batch.copy()
        batch = postprocess_ppo_gae(policy, batch)
        batch["abs_advantage"] = np.abs(batch[Postprocessing.ADVANTAGES])
        batch[MY_LOGIT] = batch[BEHAVIOUR_LOGITS]
        batch = postprocess_diversity(policy, batch, others_batches)
        batches = [batch]
    else:
        batches = []

    for pid, (other_policy, other_batch_raw) in others_batches.items():
        if policy.config[ONLY_TNB]:
            break
        # Even when DISABLE or NO_REPLAY_VALUES is used, we still want to look
        # at these statistics. In the future a knob could be added to skip
        # collecting such slow stats.
        if other_batch_raw is None:
            continue
        other_batch_raw = other_batch_raw.copy()

        # Replay this policy on the other agent's observations to get this
        # policy's logits for those states.
        replay_result = policy.compute_actions(
            other_batch_raw[SampleBatch.CUR_OBS]
        )[2]
        other_batch_raw[MY_LOGIT] = replay_result[BEHAVIOUR_LOGITS]

        # The behaviour logits used for computing diversity come from the
        # other policy, while the comparison targets are the other policies
        # too, which is inconsistent. We need to either (1) compute novelty
        # using my logits, or (2) compute novelty with the other's logits and
        # compare against my policy. The first option seems more natural.
        other_batch_raw = postprocess_diversity(
            policy, other_batch_raw, others_batches
        )

        to_add_batch = postprocess_ppo_gae(policy, other_batch_raw)
        to_add_batch["abs_advantage"] = np.abs(
            to_add_batch[Postprocessing.ADVANTAGES]
        )
        batches.append(to_add_batch)

    for batch in batches:
        if ("debug_ratio" not in batch) and (not config[REPLAY_VALUES]):
            batch['debug_ratio'] = np.zeros_like(
                batch['advantages'], dtype=np.float32
            )

    batch = SampleBatch.concat_samples(batches) if len(batches) != 1 \
        else batches[0]
    return batch