Example #1
    def __call__(self, samples: SampleBatchType) -> SampleBatchType:
        _check_sample_batch_type(samples)
        wrapped = False

        if isinstance(samples, SampleBatch):
            samples = samples.as_multi_agent()
            wrapped = True

        for policy_id in samples.policy_batches:
            batch = samples.policy_batches[policy_id]
            for field in self.fields:
                if field not in batch:
                    raise KeyError(
                        f"`{field}` not found in SampleBatch for policy "
                        f"`{policy_id}`! Maybe this policy fails to add "
                        f"{field} in its `postprocess_trajectory` method? Or "
                        "this policy is not meant to learn at all and you "
                        "forgot to add it to the list under `config."
                        "multiagent.policies_to_train`.")
                batch[field] = standardized(batch[field])

        if wrapped:
            samples = samples.policy_batches[DEFAULT_POLICY_ID]

        return samples
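
All of these examples normalize a batch column with the standardized() helper. As a point of reference, the following self-contained sketch shows what such a helper typically computes (zero mean, roughly unit standard deviation, with a small floor on the divisor to avoid dividing by zero); the exact RLlib implementation may differ.

import numpy as np

def standardized(array: np.ndarray) -> np.ndarray:
    # Shift to zero mean and rescale to (roughly) unit standard deviation.
    # The max(...) guard protects against near-constant inputs.
    return (array - array.mean()) / max(1e-4, array.std())
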
Example #2
File: maml.py Project: stjordanis/ray
        def inner_adaptation_steps(itr):
            buf = []
            split = []
            metrics = {}
            for samples in itr:

                # Processing Samples (Standardize Advantages)
                split_lst = []
                for sample in samples:
                    sample["advantages"] = standardized(sample["advantages"])
                    split_lst.append(sample.count)

                buf.extend(samples)
                split.append(split_lst)

                adapt_iter = len(split) - 1
                metrics = post_process_metrics(adapt_iter, workers, metrics)
                if len(split) > inner_steps:
                    out = SampleBatch.concat_samples(buf)
                    out["split"] = np.array(split)
                    buf = []
                    split = []

                    # Report the adaptation reward difference (post - pre).
                    ep_rew_pre = metrics["episode_reward_mean"]
                    ep_rew_post = metrics["episode_reward_mean_adapt_" +
                                          str(inner_steps)]
                    metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre
                    yield out, metrics
                    metrics = {}
                else:
                    inner_adaptation(workers, samples)
Example #3
    def __call__(self, samples: SampleBatchType) -> SampleBatchType:
        _check_sample_batch_type(samples)
        wrapped = False

        if isinstance(samples, SampleBatch):
            samples = MultiAgentBatch({DEFAULT_POLICY_ID: samples},
                                      samples.count)
            wrapped = True

        for policy_id in samples.policy_batches:
            batch = samples.policy_batches[policy_id]
            for field in self.fields:
                batch[field] = standardized(batch[field])

        if wrapped:
            samples = samples.policy_batches[DEFAULT_POLICY_ID]

        return samples
Example #4
def standardize_fields(samples: SampleBatchType,
                       fields: List[str]) -> SampleBatchType:
    """Standardize fields of the given SampleBatch."""
    _check_sample_batch_type(samples)
    wrapped = False

    if isinstance(samples, SampleBatch):
        samples = samples.as_multi_agent()
        wrapped = True

    for policy_id in samples.policy_batches:
        batch = samples.policy_batches[policy_id]
        for field in fields:
            if field in batch:
                batch[field] = standardized(batch[field])

    if wrapped:
        samples = samples.policy_batches[DEFAULT_POLICY_ID]

    return samples
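
A hypothetical call site for standardize_fields() as defined above, assuming the usual dict-style SampleBatch constructor; the field name "advantages" and the values are purely illustrative.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Build a tiny single-agent batch with an "advantages" column.
batch = SampleBatch({"advantages": np.array([1.0, 2.0, 3.0, 4.0])})
batch = standardize_fields(batch, ["advantages"])

# The column should now have ~zero mean and ~unit standard deviation.
print(batch["advantages"].mean(), batch["advantages"].std())
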
Example #5
def post_process_samples(samples, config: AlgorithmConfigDict):
    # Instead of using NN for value function, we use regression
    split_lst = []
    for sample in samples:
        indexes = np.asarray(sample["dones"]).nonzero()[0]
        indexes = indexes + 1

        reward_list = np.split(sample["rewards"], indexes)[:-1]
        observation_list = np.split(sample["obs"], indexes)[:-1]

        paths = []
        for i in range(0, len(reward_list)):
            paths.append({
                "rewards": reward_list[i],
                "observations": observation_list[i]
            })

        paths = calculate_gae_advantages(paths, config["gamma"],
                                         config["lambda"])

        advantages = np.concatenate([path["advantages"] for path in paths])
        sample["advantages"] = standardized(advantages)
        split_lst.append(sample.count)
    return samples, split_lst
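
Example #5 cuts one rollout into per-episode segments by splitting on the indices of the "dones" flags. The standalone snippet below illustrates that indexing trick on toy data (the arrays are made up for demonstration).

import numpy as np

# Two episodes of lengths 3 and 2 packed into a single rollout.
dones = np.array([False, False, True, False, True])
rewards = np.array([1.0, 0.0, 2.0, 0.5, 1.5])

# One past each terminal step marks an episode boundary: [3, 5].
indexes = np.asarray(dones).nonzero()[0] + 1

# np.split leaves a trailing empty segment, which [:-1] discards.
reward_list = np.split(rewards, indexes)[:-1]
print([r.tolist() for r in reward_list])  # [[1.0, 0.0, 2.0], [0.5, 1.5]]
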
Example #6
    def training_iteration(self) -> ResultDict:
        # Generate common experiences, collect batch for PPO, store every (DQN) batch
        # into replay buffer.
        ppo_batches = []
        num_env_steps = 0
        # PPO batch size fixed at 200.
        while num_env_steps < 200:
            ma_batches = synchronous_parallel_sample(worker_set=self.workers,
                                                     concat=False)
            # Loop through the ma-batches (collected in parallel).
            for ma_batch in ma_batches:
                # Update sampled counters.
                self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count
                self._counters[
                    NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps()
                ppo_batch = ma_batch.policy_batches.pop("ppo_policy")
                # Add collected batches (only for DQN policy) to replay buffer.
                self.local_replay_buffer.add(ma_batch)

                ppo_batches.append(ppo_batch)
                num_env_steps += ppo_batch.count

        # DQN sub-flow.
        dqn_train_results = {}
        dqn_train_batch = self.local_replay_buffer.sample(num_items=64)
        if dqn_train_batch is not None:
            dqn_train_results = train_one_step(self, dqn_train_batch,
                                               ["dqn_policy"])
            self._counters[
                "agent_steps_trained_DQN"] += dqn_train_batch.agent_steps()
            print(
                "DQN policy learning on samples from",
                "agent steps trained",
                dqn_train_batch.agent_steps(),
            )
        # Update DQN's target net every 500 train steps.
        if (self._counters["agent_steps_trained_DQN"] -
                self._counters[LAST_TARGET_UPDATE_TS] >= 500):
            self.workers.local_worker().get_policy(
                "dqn_policy").update_target()
            self._counters[NUM_TARGET_UPDATES] += 1
            self._counters[LAST_TARGET_UPDATE_TS] = self._counters[
                "agent_steps_trained_DQN"]

        # PPO sub-flow.
        ppo_train_batch = SampleBatch.concat_samples(ppo_batches)
        self._counters[
            "agent_steps_trained_PPO"] += ppo_train_batch.agent_steps()
        # Standardize advantages.
        ppo_train_batch[Postprocessing.ADVANTAGES] = standardized(
            ppo_train_batch[Postprocessing.ADVANTAGES])
        print(
            "PPO policy learning on samples from",
            "agent steps trained",
            ppo_train_batch.agent_steps(),
        )
        ppo_train_batch = MultiAgentBatch({"ppo_policy": ppo_train_batch},
                                          ppo_train_batch.count)
        ppo_train_results = train_one_step(self, ppo_train_batch,
                                           ["ppo_policy"])

        # Combine results for PPO and DQN into one results dict.
        results = dict(ppo_train_results, **dqn_train_results)
        return results
Example #7
    def post_process_samples(self, samples):
        split_lst = []
        for sample in samples:
            sample["advantages"] = standardized(sample["advantages"])
            split_lst.append(sample.count)
        return samples, split_lst