Example #1
 def test_sample_from_early_done_env(self):
     ev = RolloutWorker(
         env_creator=lambda _: EarlyDoneMultiAgent(),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
         batch_mode="complete_episodes",
         rollout_fragment_length=1,
     )
     # This used to raise an error because the EarlyDoneMultiAgent
     # terminates at e.g. agent0 w/o publishing an observation for
     # agent1 anymore. This limitation has been fixed: an env may now
     # terminate at any time (and may also return rewards for any agent
     # at any time, even when that agent doesn't have an obs returned
     # in the same call to `step()`).
     ma_batch = ev.sample()
     # Make sure that agents took the correct (alternating timesteps)
     # path. Except for the last timestep, where both agents got
     # terminated.
     ag0_ts = ma_batch.policy_batches["p0"]["t"]
     ag1_ts = ma_batch.policy_batches["p1"]["t"]
     self.assertTrue(np.all(np.abs(ag0_ts[:-1] - ag1_ts[:-1]) == 1.0))
     self.assertTrue(ag0_ts[-1] == ag1_ts[-1])
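For context, a minimal sketch (not the actual EarlyDoneMultiAgent implementation) of the step-return pattern the comment above describes: an env terminating early while still returning a reward for an agent that gets no observation in the same `step()` call.

def example_early_done_step_return():
    # Hypothetical return values of a MultiAgentEnv.step() call
    # (old, dict-based RLlib multi-agent API).
    obs = {"agent_0": 0}  # no obs published for agent_1 anymore
    rewards = {"agent_0": 1.0, "agent_1": 0.5}  # reward w/o obs is allowed
    dones = {"agent_0": True, "agent_1": True, "__all__": True}
    infos = {}
    return obs, rewards, dones, infos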
Example #2
    def test_maddpg_compilation(self):
        """Test whether MADDPG can be built with all frameworks."""
        config = (maddpg.MADDPGConfig().environment(
            env=TwoStepGame,
            env_config={
                "actions_are_logits": True,
            },
        ).multi_agent(
            policies={
                "pol1": PolicySpec(config={"agent_id": 0}, ),
                "pol2": PolicySpec(config={"agent_id": 1}, ),
            },
            policy_mapping_fn=lambda aid, **kwargs: "pol2" if aid else "pol1",
        ))

        num_iterations = 1

        # Only working for tf right now.
        for _ in framework_iterator(config, frameworks="tf"):
            algo = config.build()
            for i in range(num_iterations):
                results = algo.train()
                check_train_results(results)
                print(results)
            algo.stop()
Example #3
    def test_returning_model_based_rollouts_data(self):
        class ModelBasedPolicy(DQNTFPolicy):
            def compute_actions_from_input_dict(self,
                                                input_dict,
                                                explore=None,
                                                timestep=None,
                                                episodes=None,
                                                **kwargs):
                obs_batch = input_dict["obs"]
                # In policy loss initialization phase, no episodes are passed
                # in.
                if episodes is not None:
                    # Pretend we did a model-based rollout and want to return
                    # the extra trajectory.
                    env_id = episodes[0].env_id
                    fake_eps = Episode(episodes[0].policy_map,
                                       episodes[0].policy_mapping_fn,
                                       lambda: None, lambda x: None, env_id)
                    builder = get_global_worker().sampler.sample_collector
                    agent_id = "extra_0"
                    policy_id = "p1"  # use p1 so we can easily check it
                    builder.add_init_obs(fake_eps, agent_id, env_id, policy_id,
                                         -1, obs_batch[0])
                    for t in range(4):
                        builder.add_action_reward_next_obs(
                            episode_id=fake_eps.episode_id,
                            agent_id=agent_id,
                            env_id=env_id,
                            policy_id=policy_id,
                            agent_done=t == 3,
                            values=dict(
                                t=t,
                                actions=0,
                                rewards=0,
                                dones=t == 3,
                                infos={},
                                new_obs=obs_batch[0]))
                    batch = builder.postprocess_episode(
                        episode=fake_eps, build=True)
                    episodes[0].add_extra_batch(batch)

                # Just return zeros for actions
                return [0] * len(obs_batch), [], {}

        ev = RolloutWorker(
            env_creator=lambda _: MultiAgentCartPole({"num_agents": 2}),
            policy_spec={
                "p0": PolicySpec(policy_class=ModelBasedPolicy),
                "p1": PolicySpec(policy_class=ModelBasedPolicy),
            },
            policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
            rollout_fragment_length=5)
        batch = ev.sample()
        # 5 environment steps (rollout_fragment_length).
        self.assertEqual(batch.count, 5)
        # 10 agent steps for p0: 2 agents, both using p0 as their policy.
        self.assertEqual(batch.policy_batches["p0"].count, 10)
        # 20 agent steps for p1: Each time both(!) agents take 1 step,
        # p1 takes 4 (from the fake model-based rollout):
        # 5 (rollout_fragment_length) * 4 = 20.
        self.assertEqual(batch.policy_batches["p1"].count, 20)
Example #4
def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
    """Run heuristic policies vs a learned agent.

    The learned agent should eventually reach a reward of ~5 with
    use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy
    can perform better is that it can distinguish between the always_same
    and beat_last heuristics.
    """

    def select_policy(agent_id, episode, **kwargs):
        if agent_id == "player1":
            return "learned"
        else:
            return random.choice(["always_same", "beat_last"])

    config = {
        "env": RockPaperScissors,
        "gamma": 0.9,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "num_envs_per_worker": 4,
        "rollout_fragment_length": 10,
        "train_batch_size": 200,
        "multiagent": {
            "policies_to_train": ["learned"],
            "policies": {
                "always_same": PolicySpec(policy_class=AlwaysSameHeuristic),
                "beat_last": PolicySpec(policy_class=BeatLastHeuristic),
                "learned": PolicySpec(config={
                    "model": {
                        "use_lstm": use_lstm
                    },
                    "framework": args.framework,
                }),
            },
            "policy_mapping_fn": select_policy,
        },
        "framework": args.framework,
    }
    cls = get_trainer_class(trainer) if isinstance(trainer, str) else trainer
    trainer_obj = cls(config=config)
    env = trainer_obj.workers.local_worker().env
    for _ in range(args.stop_iters):
        results = trainer_obj.train()
        print(results)
        # Timesteps reached.
        if results["timesteps_total"] > args.stop_timesteps:
            break
        # Reward (difference) reached -> all good, return.
        elif env.player1_score - env.player2_score > args.stop_reward:
            return

    # Reward (difference) not reached: Error if `as_test`.
    if args.as_test:
        raise ValueError(
            "Desired reward difference ({}) not reached! Only got to {}.".
            format(args.stop_reward, env.player1_score - env.player2_score))
Example #5
 def test_multi_agent_sample_with_horizon(self):
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
         episode_horizon=10,  # test with episode horizon set
         rollout_fragment_length=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
Example #6
 def test_multi_agent_sample_async_remote(self):
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #7
File: multi_agent.py Project: rlan/ray
def check_multi_agent(config: PartialTrainerConfigDict) -> \
        Tuple[MultiAgentPolicyConfigDict, bool]:
    """Checks, whether a (partial) config defines a multi-agent setup.

    Args:
        config (PartialTrainerConfigDict): The user/Trainer/Policy config
            to check for multi-agent.

    Returns:
        The resulting (all fixed) multi-agent policy dict and whether we
            have a multi-agent setup or not.
    """
    multiagent_config = config["multiagent"]
    policies = multiagent_config.get("policies")

    # Nothing specified in config dict -> Assume simple single agent setup
    # with DEFAULT_POLICY_ID as only policy.
    if not policies:
        policies = {DEFAULT_POLICY_ID}
    # Policies given as set (of PolicyIDs) -> Setup each policy automatically
    # via empty PolicySpec (will make RLlib infer obs- and action spaces
    # as well as the Policy's class).
    if isinstance(policies, set):
        policies = multiagent_config["policies"] = {
            pid: PolicySpec()
            for pid in policies
        }
    # Is this a multi-agent setup? True, unless DEFAULT_POLICY_ID is the
    # only PolicyID found in the policies dict.
    is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies
    return policies, is_multiagent
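A minimal usage sketch for the check_multi_agent version above (the policy IDs "p0"/"p1" are made up for illustration): a config that only lists policy IDs as a set is normalized into a dict of empty PolicySpec objects.

config = {"multiagent": {"policies": {"p0", "p1"}}}
policies, is_multiagent = check_multi_agent(config)
# policies is now {"p0": PolicySpec(), "p1": PolicySpec()} and
# is_multiagent is True (more than one policy defined).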
Example #8
 def test_multi_agent_sample_round_robin(self):
     ev = RolloutWorker(
         env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
         },
         policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
         rollout_fragment_length=50,
     )
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
     # Since we round-robin introduce agents into the env, some of the env
     # steps don't count as proper transitions.
     self.assertEqual(batch.policy_batches["p0"].count, 42)
     check(
         batch.policy_batches["p0"]["obs"][:10],
         one_hot(np.array([0, 1, 2, 3, 4] * 2), 10),
     )
     check(
         batch.policy_batches["p0"]["new_obs"][:10],
         one_hot(np.array([1, 2, 3, 4, 5] * 2), 10),
     )
     self.assertEqual(
         batch.policy_batches["p0"]["rewards"].tolist()[:10],
         [100, 100, 100, 100, 0] * 2,
     )
     self.assertEqual(
         batch.policy_batches["p0"]["dones"].tolist()[:10],
         [False, False, False, False, True] * 2,
     )
     self.assertEqual(
         batch.policy_batches["p0"]["t"].tolist()[:10],
         [4, 9, 14, 19, 24, 5, 10, 15, 20, 25],
     )
Example #9
def parse_policy_specs_from_checkpoint(
    path: str,
) -> Tuple[PartialAlgorithmConfigDict, Dict[str, PolicySpec], Dict[
        str, PolicyState]]:
    """Read and parse policy specifications from a checkpoint file.

    Args:
        path: Path to a policy checkpoint.

    Returns:
        A tuple of: base policy config, dictionary of policy specs, and
        dictionary of policy states.
    """
    with open(path, "rb") as f:
        checkpoint_dict = pickle.load(f)
    # The worker state (which includes the per-policy data, keyed by
    # policy ID) is stored as a serialized binary blob under "worker".
    w = pickle.loads(checkpoint_dict["worker"])

    policy_config = w["policy_config"]
    assert policy_config.get("enable_connectors", False), (
        "load_policies_from_checkpoint only works for checkpoints generated by stacks "
        "with connectors enabled.")
    policy_states = w["state"]
    serialized_policy_specs = w["policy_specs"]
    policy_specs = {
        id: PolicySpec.deserialize(spec)
        for id, spec in serialized_policy_specs.items()
    }

    return policy_config, policy_specs, policy_states
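A hedged usage sketch (the checkpoint path below is hypothetical, not from the source):

checkpoint_path = "/tmp/rllib_checkpoint/policies_state.pkl"  # hypothetical path
policy_config, policy_specs, policy_states = parse_policy_specs_from_checkpoint(
    checkpoint_path)
for pid, spec in policy_specs.items():
    # PolicySpec fields: policy_class, observation_space, action_space, config.
    print(pid, spec.policy_class, spec.observation_space, spec.action_space)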
Example #10
 def test_multi_agent_sample_sync_remote(self):
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         # This old signature (agent_id only) will raise a
         # soft-deprecation warning because it doesn't match the new
         # signature (agent_id, episode, **kwargs), but it should not
         # break this test.
         policy_mapping_fn=(lambda agent_id: "p{}".format(agent_id % 2)),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True,
         remote_env_batch_wait_ms=99999999)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #11
 def gen_policy(i):
     config = {
         "model": {
             "custom_model": ["model1", "model2"][i % 2],
         },
         "gamma": random.choice([0.95, 0.99]),
     }
     return PolicySpec(config=config)
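A hedged sketch of how such a generator is typically plugged into a multiagent config (NUM_POLICIES and the policy-ID format are illustrative, not from the source):

NUM_POLICIES = 4  # illustrative
policies = {
    "policy_{}".format(i): gen_policy(i) for i in range(NUM_POLICIES)
}
policy_ids = list(policies.keys())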
Example #12
    def test_multi_agent_sample(self):
        def policy_mapping_fn(agent_id, episode, worker, **kwargs):
            return "p{}".format(agent_id % 2)

        ev = RolloutWorker(env_creator=lambda _: BasicMultiAgent(5),
                           policy_spec={
                               "p0": PolicySpec(policy_class=MockPolicy),
                               "p1": PolicySpec(policy_class=MockPolicy),
                           },
                           policy_mapping_fn=policy_mapping_fn,
                           rollout_fragment_length=50)
        batch = ev.sample()
        self.assertEqual(batch.count, 50)
        self.assertEqual(batch.policy_batches["p0"].count, 150)
        self.assertEqual(batch.policy_batches["p1"].count, 100)
        self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                         list(range(25)) * 6)
Example #13
 def test_leaky_policy(self):
     """Tests, whether our diagnostics tools can detect leaks in a policy."""
     config = dqn.DEFAULT_CONFIG.copy()
     # Make sure we have an env to test on the local worker.
     # Otherwise, `check_memory_leaks` will complain.
     config["create_env_on_driver"] = True
     config["env"] = "CartPole-v0"
     config["multiagent"]["policies"] = {
         "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy),
     }
     trainer = dqn.DQN(config=config)
     results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300)
     assert results["policy"]
     trainer.stop()
Example #14
def check_support_multiagent(alg, config):
    register_env("multi_agent_mountaincar",
                 lambda _: MultiAgentMountainCar({"num_agents": 2}))
    register_env("multi_agent_cartpole",
                 lambda _: MultiAgentCartPole({"num_agents": 2}))

    # Simulate a simple multi-agent setup.
    policies = {
        "policy_0": PolicySpec(config={"gamma": 0.99}),
        "policy_1": PolicySpec(config={"gamma": 0.95}),
    }
    policy_ids = list(policies.keys())

    def policy_mapping_fn(agent_id, episode, worker, **kwargs):
        pol_id = policy_ids[agent_id]
        return pol_id

    config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    }

    for fw in framework_iterator(config):
        if fw in ["tf2", "tfe"] and \
                alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]:
            continue
        if alg in ["DDPG", "APEX_DDPG", "SAC"]:
            a = get_trainer_class(alg)(config=config,
                                       env="multi_agent_mountaincar")
        else:
            a = get_trainer_class(alg)(config=config,
                                       env="multi_agent_cartpole")

        results = a.train()
        check_train_results(results)
        print(results)
        a.stop()
Example #15
    def create_policy(
        self,
        policy_id: PolicyID,
        policy_cls: Type["Policy"],
        observation_space: gym.Space,
        action_space: gym.Space,
        config_override: PartialAlgorithmConfigDict,
        merged_config: AlgorithmConfigDict,
    ) -> None:
        """Creates a new policy and stores it to the cache.

        Args:
            policy_id: The policy ID. This is the key under which
                the created policy will be stored in this map.
            policy_cls: The (original) policy class to use.
                This may still be altered in case tf-eager (and tracing)
                is used.
            observation_space: The observation space of the
                policy.
            action_space: The action space of the policy.
            config_override: The config override
                dict for this policy. This is the partial dict provided by
                the user.
            merged_config: The entire config (merged
                default config + `config_override`).
        """
        _class = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

        self[policy_id] = create_policy_for_framework(
            policy_id,
            _class,
            merged_config,
            observation_space,
            action_space,
            self.worker_index,
            self.session_creator,
            self.seed,
        )

        # Store spec (class, obs-space, act-space, and config overrides) such
        # that the map will be able to reproduce on-the-fly added policies
        # from disk.
        self.policy_specs[policy_id] = PolicySpec(
            policy_class=policy_cls,
            observation_space=observation_space,
            action_space=action_space,
            config=config_override,
        )
Example #16
File: alpha_star.py Project: smorad/ray
    def add_policy(
        self,
        policy_id: PolicyID,
        policy_cls: Type[Policy],
        *,
        observation_space: Optional[gym.spaces.Space] = None,
        action_space: Optional[gym.spaces.Space] = None,
        config: Optional[PartialTrainerConfigDict] = None,
        policy_state: Optional[PolicyState] = None,
        **kwargs,
    ) -> Policy:
        # Add the new policy to all our train- and eval RolloutWorkers
        # (including the local worker).
        new_policy = super().add_policy(
            policy_id,
            policy_cls,
            observation_space=observation_space,
            action_space=action_space,
            config=config,
            policy_state=policy_state,
            **kwargs,
        )

        # Do we have to create a policy-learner actor from it as well?
        if policy_id in kwargs.get("policies_to_train", []):
            new_policy_actor = self.distributed_learners.add_policy(
                policy_id,
                PolicySpec(
                    policy_cls,
                    new_policy.observation_space,
                    new_policy.action_space,
                    self.config,
                ),
            )
            # Set state of new policy actor, if provided.
            if policy_state is not None:
                ray.get(new_policy_actor.set_state.remote(policy_state))

        return new_policy
Example #17
def check_multi_agent(config: PartialTrainerConfigDict):
    """Checks, whether a (partial) config defines a multi-agent setup.

    Args:
        config (PartialTrainerConfigDict): The user/Trainer/Policy config
            to check for multi-agent.

    Returns:
        Tuple[MultiAgentPolicyConfigDict, bool]: The resulting (all
            fixed) multi-agent policy dict and whether we have a
            multi-agent setup or not.
    """
    multiagent_config = config["multiagent"]
    policies = multiagent_config.get("policies")
    if not policies:
        policies = {DEFAULT_POLICY_ID}
    if isinstance(policies, set):
        policies = multiagent_config["policies"] = {
            pid: PolicySpec()
            for pid in policies
        }
    is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies
    return policies, is_multiagent
Example #18
    register_env("multi_agent_cartpole",
                 lambda _: MultiAgentCartPole({"num_agents": 4}))

    stop = {
        "training_iteration": args.stop_iters,
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
    }

    config = {
        "env": "multi_agent_cartpole",
        "multiagent": {
            # The multiagent Policy map.
            "policies": {
                # The Policy we are actually learning.
                "pg_policy": PolicySpec(config={"framework": args.framework}),
                # Random policy we are playing against.
                "random": PolicySpec(policy_class=RandomPolicy),
            },
            # Map to either random behavior or PG learning behavior based on
            # the agent's ID.
            "policy_mapping_fn":
            (lambda aid, **kwargs: ["pg_policy", "random"][aid % 2]),
            # We wouldn't have to specify this here as the RandomPolicy does
            # not learn anyway (it has an empty `learn_on_batch` method), but
            # it's good practice to define this list here either way.
            "policies_to_train": ["pg_policy"],
        },
        "framework": args.framework,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
Example #19
 def gen_policy():
     config = {
         "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
         "n_step": random.choice([1, 2, 3, 4, 5]),
     }
     return PolicySpec(config=config)
Example #20
    config = {
        "env": "open_spiel_env",
        "callbacks": AddPolicyCallback,
        "model": {
            "fcnet_hiddens": [512, 512],
        },
        "num_envs_per_worker": 5,
        "multiagent": {
            # Initial policy map: Random and PPO. This will be expanded
            # to more policy snapshots taken from "main" against which "main"
            # will then play (instead of "random"). This is done in the
            # custom callback defined above (`SelfPlayCallback`).
            "policies": {
                # Our main policy, we'd like to optimize.
                "main": PolicySpec(),
                # Note: We will add the "opponent" policy with callback.
            },
            # Assign agent 0 and 1 randomly to the "main" policy or
            # to the opponent ("random" at first). Make sure (via episode_id)
            # that "main" always plays against "random" (and not against
            # another "main").
            "policy_mapping_fn": policy_mapping_fn,
            # Always just train the "main" policy.
            "policies_to_train": ["main"],
        },
        "num_workers": 1,
        "framework": "torch",
        # We will be restoring a TF2 policy.
        # So tell the RolloutWorkers to enable TF eager exec as well, even if
        # framework is set to torch.
Example #21
import ray
from ray.rllib.agents.a3c import A3CTrainer
from ray.rllib.agents.a3c import A2CTrainer
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
ray.init()
policies = {
    "policy_0": PolicySpec(config={"gamma": 0.99}),
    "policy_1": PolicySpec(config={"gamma": 0.95}),
}
policy_ids = list(policies.keys())


def policy_mapping_fn(agent_id, episode, **kwargs):
    pol_id = policy_ids[agent_id]
    return pol_id


trainer = A3CTrainer(env=MultiAgentCartPole,
                     config={
                         "framework": "tfe",
                         "multiagent": {
                             "policies": policies,
                             "policy_mapping_fn": policy_mapping_fn
                         }
                     })
trainer.train()
Example #22
 config = {
     "env": "open_spiel_env",
     "callbacks": SelfPlayCallback,
     "model": {
         "fcnet_hiddens": [512, 512],
     },
     "num_sgd_iter": 20,
     "num_envs_per_worker": 5,
     "multiagent": {
         # Initial policy map: Random and PPO. This will be expanded
         # to more policy snapshots taken from "main" against which "main"
         # will then play (instead of "random"). This is done in the
         # custom callback defined above (`SelfPlayCallback`).
         "policies": {
             # Our main policy, we'd like to optimize.
             "main": PolicySpec(),
             # An initial random opponent to play against.
             "random": PolicySpec(policy_class=RandomPolicy),
         },
         # Assign agent 0 and 1 randomly to the "main" policy or
         # to the opponent ("random" at first). Make sure (via episode_id)
         # that "main" always plays against "random" (and not against
         # another "main").
         "policy_mapping_fn": policy_mapping_fn,
         # Always just train the "main" policy.
         "policies_to_train": ["main"],
     },
     "num_workers": args.num_workers,
     # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
     "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
     "framework": args.framework,
Example #23
File: multi_agent.py Project: tchordia/ray
def check_multi_agent(
    config: PartialTrainerConfigDict,
) -> Tuple[MultiAgentPolicyConfigDict, bool]:
    """Checks, whether a (partial) config defines a multi-agent setup.

    Args:
        config: The user/Trainer/Policy config to check for multi-agent.

    Returns:
        Tuple consisting of the resulting (all fixed) multi-agent policy
        dict and bool indicating whether we have a multi-agent setup or not.

    Raises:
        KeyError: If `config` does not contain a "multiagent" key or if there
            is an invalid key inside the "multiagent" config or if any policy
            in the "policies" dict has a non-str ID (key).
        ValueError: If any subkey of the "multiagent" dict has an invalid
            value.
    """
    if "multiagent" not in config:
        raise KeyError(
            "Your `config` to be checked for a multi-agent setup must have "
            "the 'multiagent' key defined!"
        )
    multiagent_config = config["multiagent"]

    policies = multiagent_config.get("policies")

    # Check for invalid sub-keys of multiagent config.
    from ray.rllib.agents.trainer import COMMON_CONFIG

    allowed = list(COMMON_CONFIG["multiagent"].keys())
    if any(k not in allowed for k in multiagent_config.keys()):
        raise KeyError(
            f"You have invalid keys in your 'multiagent' config dict! "
            f"The only allowed keys are: {allowed}."
        )

    # Nothing specified in config dict -> Assume simple single agent setup
    # with DEFAULT_POLICY_ID as only policy.
    if not policies:
        policies = {DEFAULT_POLICY_ID}
    # Policies given as set/list/tuple (of PolicyIDs) -> Setup each policy
    # automatically via empty PolicySpec (will make RLlib infer obs- and action spaces
    # as well as the Policy's class).
    if isinstance(policies, (set, list, tuple)):
        policies = multiagent_config["policies"] = {
            pid: PolicySpec() for pid in policies
        }

    # Check each defined policy ID and spec.
    for pid, policy_spec in policies.copy().items():
        # Policy IDs must be strings.
        if not isinstance(pid, str):
            raise KeyError(f"Policy IDs must always be of type `str`, got {type(pid)}")
        # Convert to PolicySpec if plain list/tuple.
        if not isinstance(policy_spec, PolicySpec):
            # Values must be lists/tuples of len 4.
            if not isinstance(policy_spec, (list, tuple)) or len(policy_spec) != 4:
                raise ValueError(
                    "Policy specs must be tuples/lists of "
                    "(cls or None, obs_space, action_space, config), "
                    f"got {policy_spec}"
                )
            policies[pid] = PolicySpec(*policy_spec)

        # Config is None -> Set to {}.
        if policies[pid].config is None:
            policies[pid] = policies[pid]._replace(config={})
        # Config not a dict.
        elif not isinstance(policies[pid].config, dict):
            raise ValueError(
                f"Multiagent policy config for {pid} must be a dict, "
                f"but got {type(policies[pid].config)}!"
            )

    # Check other "multiagent" sub-keys' values.
    if multiagent_config.get("count_steps_by", "env_steps") not in [
        "env_steps",
        "agent_steps",
    ]:
        raise ValueError(
            "config.multiagent.count_steps_by must be one of "
            "[env_steps|agent_steps], not "
            f"{multiagent_config['count_steps_by']}!"
        )
    if multiagent_config.get("replay_mode", "independent") not in [
        "independent",
        "lockstep",
    ]:
        raise ValueError(
            "`config.multiagent.replay_mode` must be "
            "[independent|lockstep], not "
            f"{multiagent_config['replay_mode']}!"
        )
    # Attempt to create a `policy_mapping_fn` from the config dict. Helpful
    # if users would like to specify custom callable classes in yaml files.
    if isinstance(multiagent_config.get("policy_mapping_fn"), dict):
        multiagent_config["policy_mapping_fn"] = from_config(
            multiagent_config["policy_mapping_fn"]
        )
    # Check `policies_to_train` for invalid entries.
    if isinstance(multiagent_config["policies_to_train"], (list, set, tuple)):
        if len(multiagent_config["policies_to_train"]) == 0:
            logger.warning(
                "`config.multiagent.policies_to_train` is empty! "
                "Make sure - if you would like to learn at least one policy - "
                "to add its ID to that list."
            )
        for pid in multiagent_config["policies_to_train"]:
            if pid not in policies:
                raise ValueError(
                    "`config.multiagent.policies_to_train` contains policy "
                    f"ID ({pid}) that was not defined in `config.multiagent.policies!"
                )

    # Is this a multi-agent setup? True, unless DEFAULT_POLICY_ID is the
    # only PolicyID found in the policies dict.
    is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies
    return policies, is_multiagent
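A minimal sketch of the tuple-to-PolicySpec normalization handled above (the policy ID, spaces, and gamma value are made up; "policies_to_train" is included because this version indexes that key directly):

config = {
    "multiagent": {
        "policies": {
            # (policy_class or None, obs_space, action_space, config)
            "p0": (None, None, None, {"gamma": 0.99}),
        },
        "policies_to_train": ["p0"],
    },
}
policies, is_multiagent = check_multi_agent(config)
# policies["p0"] is now a PolicySpec with config={"gamma": 0.99};
# is_multiagent is True (DEFAULT_POLICY_ID is not in the dict).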
Example #24
    def __init__(
        self,
        trainer: Trainer,
        trainer_config: TrainerConfigDict,
        num_random_policies: int = 2,
        num_learning_league_exploiters: int = 4,
        num_learning_main_exploiters: int = 4,
        win_rate_threshold_for_new_snapshot: float = 0.8,
        keep_new_snapshot_training_prob: float = 0.0,
        prob_league_exploiter_match: float = 0.33,
        prob_main_exploiter_match: float = 0.33,
        prob_main_exploiter_playing_against_learning_main: float = 0.5,
    ):
        """Initializes a AlphaStarLeagueBuilder instance.

        Args:
            trainer: The Trainer object by which this league builder is used.
                Trainer calls `build_league()` after each training step.
            trainer_config: The (not yet validated) config dict to be
                used on the Trainer. Child classes of `LeagueBuilder`
                should preprocess this to add e.g. multiagent settings
                to this config.
            num_random_policies: The number of random policies to add to the
                league. This must be an even number (including 0) as these
                will be evenly distributed amongst league- and main- exploiters.
            num_learning_league_exploiters: The number of initially learning
                league-exploiters to create.
            num_learning_main_exploiters: The number of initially learning
                main-exploiters to create.
            win_rate_threshold_for_new_snapshot: The win-rate to be achieved
                for a learning policy to get snapshot'd (forked into `self` +
                a new learning or non-learning copy of `self`).
            keep_new_snapshot_training_prob: The probability with which a new
                snapshot should keep training. Note that the policy from which
                this snapshot is taken will continue to train regardless.
            prob_league_exploiter_match: Probability of an episode to become a
                league-exploiter vs snapshot match.
            prob_main_exploiter_match: Probability of an episode to become a
                main-exploiter vs main match.
            prob_main_exploiter_playing_against_learning_main: Probability of
                a main-exploiter vs (training!) main match.
        """
        super().__init__(trainer, trainer_config)

        self.win_rate_threshold_for_new_snapshot = win_rate_threshold_for_new_snapshot
        self.keep_new_snapshot_training_prob = keep_new_snapshot_training_prob
        self.prob_league_exploiter_match = prob_league_exploiter_match
        self.prob_main_exploiter_match = prob_main_exploiter_match
        self.prob_main_exploiter_playing_against_learning_main = (
            prob_main_exploiter_playing_against_learning_main)
        # Store the win rates for league overview printouts.
        self.win_rates: DefaultDict[PolicyID, float] = defaultdict(float)

        assert num_random_policies % 2 == 0, (
            "ERROR: `num_random_policies` must be even number (we'll distribute "
            "these evenly amongst league- and main-exploiters)!")

        # Build trainer's multiagent config.
        ma_config = self.config["multiagent"]
        # Make sure the multiagent config dict has no policies defined:
        assert not ma_config.get("policies"), (
            "ERROR: `config.multiagent.policies` should not be pre-defined! "
            "AlphaStarLeagueBuilder will construct this itself.")
        ma_config["policies"] = policies = {}

        self.main_policies = 1
        self.league_exploiters = (num_learning_league_exploiters +
                                  num_random_policies / 2)
        self.main_exploiters = num_learning_main_exploiters + num_random_policies / 2

        # Add 1 initial (learning) main policy.
        policies["main_0"] = PolicySpec()

        # Train all non-random policies that exist at beginning.
        ma_config["policies_to_train"] = ["main_0"]

        # Add random policies.
        i = -1
        for i in range(num_random_policies // 2):
            policies[f"league_exploiter_{i}"] = PolicySpec(
                policy_class=RandomPolicy)
            policies[f"main_exploiter_{i}"] = PolicySpec(
                policy_class=RandomPolicy)
        # Add initial (learning) league-exploiters.
        for j in range(num_learning_league_exploiters):
            pid = f"league_exploiter_{j + i + 1}"
            policies[pid] = PolicySpec()
            ma_config["policies_to_train"].append(pid)
        # Add initial (learning) main-exploiters.
        for j in range(num_learning_main_exploiters):
            pid = f"main_exploiter_{j + i + 1}"
            policies[pid] = PolicySpec()
            ma_config["policies_to_train"].append(pid)

        # Build initial policy mapping function: main_0 vs main_exploiter_0.
        ma_config["policy_mapping_fn"] = (
            lambda aid, ep, worker, **kw: "main_0"
            if ep.episode_id % 2 == aid else "main_exploiter_0")
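For orientation, a hedged back-of-the-envelope count of the initial league created with the default constructor arguments above (not stated in the source):

# 1 learning "main_0"
# + 2 random policies ("league_exploiter_0", "main_exploiter_0")
# + 4 learning league-exploiters ("league_exploiter_1" .. "league_exploiter_4")
# + 4 learning main-exploiters ("main_exploiter_1" .. "main_exploiter_4")
# = 11 policies total, 9 of which end up in `policies_to_train`.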
Example #25
    def policy_mapping_fn(agent_id, episode, worker, **kwargs):
        # At first, only have main play against the random main exploiter.
        return "main" if episode.episode_id % 2 == agent_id else "main_exploiter_0"

    config = {
        "env": "open_spiel_env",
        "callbacks": LeagueBasedSelfPlayCallback,
        "num_sgd_iter": 20,
        "num_envs_per_worker": 5,
        "multiagent": {
            # Initial policy map: All PPO. This will be expanded
            # to more policy snapshots. This is done in the
            # custom callback defined above (`LeagueBasedSelfPlayCallback`).
            "policies": {
                # Our main policy, we'd like to optimize.
                "main": PolicySpec(),
                # First frozen version of main (after we reach n% win-rate).
                "main_0": PolicySpec(),
                # Initial main exploiters (one random, one trainable).
                "main_exploiter_0": PolicySpec(policy_class=RandomPolicy),
                "main_exploiter_1": PolicySpec(),
                # Initial league exploiters (one random, one trainable).
                "league_exploiter_0": PolicySpec(policy_class=RandomPolicy),
                "league_exploiter_1": PolicySpec(),
            },
            "policy_mapping_fn": policy_mapping_fn,
            # At first, only train "main" (until good enough to win against
            # random).
            "policies_to_train": ["main"],
        },
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
Example #26
    )

    if args.run == "contrib/MADDPG":
        obs_space = Discrete(6)
        act_space = TwoStepGame.action_space
        config = {
            "learning_starts": 100,
            "env_config": {
                "actions_are_logits": True,
            },
            "multiagent": {
                "policies": {
                    "pol1":
                    PolicySpec(
                        observation_space=obs_space,
                        action_space=act_space,
                        config={"agent_id": 0},
                    ),
                    "pol2":
                    PolicySpec(
                        observation_space=obs_space,
                        action_space=act_space,
                        config={"agent_id": 1},
                    ),
                },
                "policy_mapping_fn": (lambda aid, **kwargs: "pol2"
                                      if aid else "pol1"),
            },
            "framework": args.framework,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
Example #27
    def get_policy_configs_for_game(
        game_name: str, ) -> Tuple[dict, Callable[[AgentID], PolicyID]]:

        # The RLlib server must know about the Spaces that the Client will be
        # using inside Unity3D, up-front.
        obs_spaces = {
            # 3DBall.
            "3DBall":
            Box(float("-inf"), float("inf"), (8, )),
            # 3DBallHard.
            "3DBallHard":
            Box(float("-inf"), float("inf"), (45, )),
            # GridFoodCollector
            "GridFoodCollector":
            Box(float("-inf"), float("inf"), (40, 40, 6)),
            # Pyramids.
            "Pyramids":
            TupleSpace([
                Box(float("-inf"), float("inf"), (56, )),
                Box(float("-inf"), float("inf"), (56, )),
                Box(float("-inf"), float("inf"), (56, )),
                Box(float("-inf"), float("inf"), (4, )),
            ]),
            # SoccerStrikersVsGoalie.
            "Goalie":
            Box(float("-inf"), float("inf"), (738, )),
            "Striker":
            TupleSpace([
                Box(float("-inf"), float("inf"), (231, )),
                Box(float("-inf"), float("inf"), (63, )),
            ]),
            # Sorter.
            "Sorter":
            TupleSpace([
                Box(
                    float("-inf"),
                    float("inf"),
                    (
                        20,
                        23,
                    ),
                ),
                Box(float("-inf"), float("inf"), (10, )),
                Box(float("-inf"), float("inf"), (8, )),
            ]),
            # Tennis.
            "Tennis":
            Box(float("-inf"), float("inf"), (27, )),
            # VisualHallway.
            "VisualHallway":
            Box(float("-inf"), float("inf"), (84, 84, 3)),
            # Walker.
            "Walker":
            Box(float("-inf"), float("inf"), (212, )),
            # FoodCollector.
            "FoodCollector":
            TupleSpace([
                Box(float("-inf"), float("inf"), (49, )),
                Box(float("-inf"), float("inf"), (4, )),
            ]),
        }
        action_spaces = {
            # 3DBall.
            "3DBall": Box(-1.0, 1.0, (2, ), dtype=np.float32),
            # 3DBallHard.
            "3DBallHard": Box(-1.0, 1.0, (2, ), dtype=np.float32),
            # GridFoodCollector.
            "GridFoodCollector": MultiDiscrete([3, 3, 3, 2]),
            # Pyramids.
            "Pyramids": MultiDiscrete([5]),
            # SoccerStrikersVsGoalie.
            "Goalie": MultiDiscrete([3, 3, 3]),
            "Striker": MultiDiscrete([3, 3, 3]),
            # Sorter.
            "Sorter": MultiDiscrete([3, 3, 3]),
            # Tennis.
            "Tennis": Box(-1.0, 1.0, (3, )),
            # VisualHallway.
            "VisualHallway": MultiDiscrete([5]),
            # Walker.
            "Walker": Box(-1.0, 1.0, (39, )),
            # FoodCollector.
            "FoodCollector": MultiDiscrete([3, 3, 3, 2]),
        }

        # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
        if game_name == "SoccerStrikersVsGoalie":
            policies = {
                "Goalie":
                PolicySpec(
                    observation_space=obs_spaces["Goalie"],
                    action_space=action_spaces["Goalie"],
                ),
                "Striker":
                PolicySpec(
                    observation_space=obs_spaces["Striker"],
                    action_space=action_spaces["Striker"],
                ),
            }

            def policy_mapping_fn(agent_id, episode, worker, **kwargs):
                return "Striker" if "Striker" in agent_id else "Goalie"

        else:
            policies = {
                game_name:
                PolicySpec(
                    observation_space=obs_spaces[game_name],
                    action_space=action_spaces[game_name],
                ),
            }

            def policy_mapping_fn(agent_id, episode, worker, **kwargs):
                return game_name

        return policies, policy_mapping_fn
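A hedged usage sketch, assuming the helper above is callable as a standalone/static function (as in RLlib's Unity3D example env wrapper); the chosen game name is one of the keys defined above:

policies, policy_mapping_fn = get_policy_configs_for_game("3DBall")
# policies == {"3DBall": PolicySpec(observation_space=Box(-inf, inf, (8,)),
#                                   action_space=Box(-1.0, 1.0, (2,)))}
# and policy_mapping_fn maps every agent ID to "3DBall".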
Example #28
    def create_policy(self, policy_id: PolicyID, policy_cls: Type["Policy"],
                      observation_space: gym.Space, action_space: gym.Space,
                      config_override: PartialTrainerConfigDict,
                      merged_config: TrainerConfigDict) -> None:
        """Creates a new policy and stores it to the cache.

        Args:
            policy_id (PolicyID): The policy ID. This is the key under which
                the created policy will be stored in this map.
            policy_cls (Type[Policy]): The (original) policy class to use.
                This may still be altered in case tf-eager (and tracing)
                is used.
            observation_space (gym.Space): The observation space of the
                policy.
            action_space (gym.Space): The action space of the policy.
            config_override (PartialTrainerConfigDict): The config override
                dict for this policy. This is the partial dict provided by
                the user.
            merged_config (TrainerConfigDict): The entire config (merged
                default config + `config_override`).
        """
        framework = merged_config.get("framework", "tf")
        class_ = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

        # Tf.
        if framework in ["tf2", "tf", "tfe"]:
            var_scope = policy_id + (
                ("_wk" + str(self.worker_index)) if self.worker_index else "")

            # For tf static graph, build every policy in its own graph
            # and create a new session for it.
            if framework == "tf":
                with tf1.Graph().as_default():
                    if self.session_creator:
                        sess = self.session_creator()
                    else:
                        sess = tf1.Session(config=tf1.ConfigProto(
                            gpu_options=tf1.GPUOptions(allow_growth=True)))
                    with sess.as_default():
                        # Set graph-level seed.
                        if self.seed is not None:
                            tf1.set_random_seed(self.seed)
                        with tf1.variable_scope(var_scope):
                            self[policy_id] = class_(observation_space,
                                                     action_space,
                                                     merged_config)
            # For tf-eager: no graph, no session.
            else:
                with tf1.variable_scope(var_scope):
                    self[policy_id] = \
                        class_(observation_space, action_space, merged_config)
        # Non-tf: No graph, no session.
        else:
            class_ = policy_cls
            self[policy_id] = class_(observation_space, action_space,
                                     merged_config)

        # Store spec (class, obs-space, act-space, and config overrides) such
        # that the map will be able to reproduce on-the-fly added policies
        # from disk.
        self.policy_specs[policy_id] = PolicySpec(
            policy_class=policy_cls,
            observation_space=observation_space,
            action_space=action_space,
            config=config_override)
Example #29
        "grouped_twostep",
        lambda config: TwoStepGame(config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    if args.run == "contrib/MADDPG":
        obs_space = Discrete(6)
        act_space = TwoStepGame.action_space
        config = {
            "learning_starts": 100,
            "env_config": {
                "actions_are_logits": True,
            },
            "multiagent": {
                "policies": {
                    "pol1": PolicySpec(
                        observation_space=obs_space,
                        action_space=act_space,
                        config={"agent_id": 0}),
                    "pol2": PolicySpec(
                        observation_space=obs_space,
                        action_space=act_space,
                        config={"agent_id": 1}),
                },
                "policy_mapping_fn": (
                    lambda aid, **kwargs: "pol2" if aid else "pol1"),
            },
            "framework": args.framework,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        }
        group = False
    elif args.run == "QMIX":