def test_sample_from_early_done_env(self):
    ev = RolloutWorker(
        env_creator=lambda _: EarlyDoneMultiAgent(),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
        batch_mode="complete_episodes",
        rollout_fragment_length=1,
    )
    # This used to raise an Error due to the EarlyDoneMultiAgent
    # terminating at e.g. agent0 w/o publishing the observation for
    # agent1 anymore. This limitation is fixed and an env may
    # terminate at any time (as well as return rewards for any agent
    # at any time, even when that agent doesn't have an obs returned
    # in the same call to `step()`).
    ma_batch = ev.sample()
    # Make sure that agents took the correct (alternating timesteps)
    # path. Except for the last timestep, where both agents got
    # terminated.
    ag0_ts = ma_batch.policy_batches["p0"]["t"]
    ag1_ts = ma_batch.policy_batches["p1"]["t"]
    self.assertTrue(np.all(np.abs(ag0_ts[:-1] - ag1_ts[:-1]) == 1.0))
    self.assertTrue(ag0_ts[-1] == ag1_ts[-1])
def test_maddpg_compilation(self):
    """Test whether MADDPG can be built with all frameworks."""
    config = (
        maddpg.MADDPGConfig()
        .environment(
            env=TwoStepGame,
            env_config={
                "actions_are_logits": True,
            },
        )
        .multi_agent(
            policies={
                "pol1": PolicySpec(config={"agent_id": 0}),
                "pol2": PolicySpec(config={"agent_id": 1}),
            },
            policy_mapping_fn=lambda aid, **kwargs: "pol2" if aid else "pol1",
        )
    )
    num_iterations = 1

    # Only working for tf right now.
    for _ in framework_iterator(config, frameworks="tf"):
        algo = config.build()
        for i in range(num_iterations):
            results = algo.train()
            check_train_results(results)
            print(results)
        algo.stop()
def test_returning_model_based_rollouts_data(self):
    class ModelBasedPolicy(DQNTFPolicy):
        def compute_actions_from_input_dict(
            self, input_dict, explore=None, timestep=None, episodes=None, **kwargs
        ):
            obs_batch = input_dict["obs"]
            # In the policy-loss initialization phase, no episodes are
            # passed in.
            if episodes is not None:
                # Pretend we did a model-based rollout and want to return
                # the extra trajectory.
                env_id = episodes[0].env_id
                fake_eps = Episode(
                    episodes[0].policy_map,
                    episodes[0].policy_mapping_fn,
                    lambda: None,
                    lambda x: None,
                    env_id,
                )
                builder = get_global_worker().sampler.sample_collector
                agent_id = "extra_0"
                policy_id = "p1"  # use p1 so we can easily check it
                builder.add_init_obs(
                    fake_eps, agent_id, env_id, policy_id, -1, obs_batch[0]
                )
                for t in range(4):
                    builder.add_action_reward_next_obs(
                        episode_id=fake_eps.episode_id,
                        agent_id=agent_id,
                        env_id=env_id,
                        policy_id=policy_id,
                        agent_done=t == 3,
                        values=dict(
                            t=t,
                            actions=0,
                            rewards=0,
                            dones=t == 3,
                            infos={},
                            new_obs=obs_batch[0],
                        ),
                    )
                batch = builder.postprocess_episode(episode=fake_eps, build=True)
                episodes[0].add_extra_batch(batch)

            # Just return zeros for actions.
            return [0] * len(obs_batch), [], {}

    ev = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": 2}),
        policy_spec={
            "p0": PolicySpec(policy_class=ModelBasedPolicy),
            "p1": PolicySpec(policy_class=ModelBasedPolicy),
        },
        policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
        rollout_fragment_length=5,
    )
    batch = ev.sample()
    # 5 environment steps (rollout_fragment_length).
    self.assertEqual(batch.count, 5)
    # 10 agent steps for p0: 2 agents, both using p0 as their policy.
    self.assertEqual(batch.policy_batches["p0"].count, 10)
    # 20 agent steps for p1: Each time both(!) agents take 1 step,
    # p1 takes 4 steps: 5 (rollout-fragment length) * 4 = 20.
    self.assertEqual(batch.policy_batches["p1"].count, 20)
def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
    """Run heuristic policies vs a learned agent.

    The learned agent should eventually reach a reward of ~5 with
    use_lstm=False and ~7 with use_lstm=True. The LSTM policy can perform
    better because it is able to distinguish between the always_same and
    beat_last heuristics.
    """

    def select_policy(agent_id, episode, **kwargs):
        if agent_id == "player1":
            return "learned"
        else:
            return random.choice(["always_same", "beat_last"])

    config = {
        "env": RockPaperScissors,
        "gamma": 0.9,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "num_envs_per_worker": 4,
        "rollout_fragment_length": 10,
        "train_batch_size": 200,
        "multiagent": {
            "policies_to_train": ["learned"],
            "policies": {
                "always_same": PolicySpec(policy_class=AlwaysSameHeuristic),
                "beat_last": PolicySpec(policy_class=BeatLastHeuristic),
                "learned": PolicySpec(
                    config={
                        "model": {"use_lstm": use_lstm},
                        "framework": args.framework,
                    }
                ),
            },
            "policy_mapping_fn": select_policy,
        },
        "framework": args.framework,
    }
    cls = get_trainer_class(trainer) if isinstance(trainer, str) else trainer
    trainer_obj = cls(config=config)
    env = trainer_obj.workers.local_worker().env
    for _ in range(args.stop_iters):
        results = trainer_obj.train()
        print(results)
        # Timesteps reached.
        if results["timesteps_total"] > args.stop_timesteps:
            break
        # Reward (difference) reached -> all good, return.
        elif env.player1_score - env.player2_score > args.stop_reward:
            return

    # Reward (difference) not reached: Error if `as_test`.
    if args.as_test:
        raise ValueError(
            "Desired reward difference ({}) not reached! Only got to {}.".format(
                args.stop_reward, env.player1_score - env.player2_score
            )
        )
def test_multi_agent_sample_with_horizon(self):
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
        episode_horizon=10,  # Test with an episode horizon set.
        rollout_fragment_length=50,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
def test_multi_agent_sample_async_remote(self):
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
        rollout_fragment_length=50,
        num_envs=4,
        remote_worker_envs=True,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 200)
def check_multi_agent(config: PartialTrainerConfigDict) -> \
        Tuple[MultiAgentPolicyConfigDict, bool]:
    """Checks whether a (partial) config defines a multi-agent setup.

    Args:
        config (PartialTrainerConfigDict): The user/Trainer/Policy config
            to check for multi-agent.

    Returns:
        The resulting (all fixed) multi-agent policy dict and whether we
        have a multi-agent setup or not.
    """
    multiagent_config = config["multiagent"]
    policies = multiagent_config.get("policies")

    # Nothing specified in config dict -> Assume simple single agent setup
    # with DEFAULT_POLICY_ID as only policy.
    if not policies:
        policies = {DEFAULT_POLICY_ID}

    # Policies given as set (of PolicyIDs) -> Setup each policy automatically
    # via empty PolicySpec (will make RLlib infer obs- and action spaces
    # as well as the Policy's class).
    if isinstance(policies, set):
        policies = multiagent_config["policies"] = {
            pid: PolicySpec() for pid in policies
        }

    # Is this a multi-agent setup? True, iff there is more than one policy
    # or DEFAULT_POLICY_ID is NOT the only PolicyID found in the policies
    # dict.
    is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies

    return policies, is_multiagent
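# A minimal usage sketch for `check_multi_agent` above (the config values are
# made up for illustration; DEFAULT_POLICY_ID and PolicySpec are assumed to be
# the RLlib definitions): a set of policy IDs is expanded into empty
# PolicySpec objects, and the second return value flags the multi-agent case.
partial_config = {"multiagent": {"policies": {"p0", "p1"}}}
policies, is_multiagent = check_multi_agent(partial_config)
assert is_multiagent and isinstance(policies["p0"], PolicySpec)

# Nothing specified -> single-agent setup under DEFAULT_POLICY_ID.
policies, is_multiagent = check_multi_agent({"multiagent": {}})
assert not is_multiagent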
def test_multi_agent_sample_round_robin(self):
    ev = RolloutWorker(
        env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
        rollout_fragment_length=50,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
    # Since we introduce agents into the env in a round-robin fashion, some
    # of the env steps don't count as proper transitions.
    self.assertEqual(batch.policy_batches["p0"].count, 42)
    check(
        batch.policy_batches["p0"]["obs"][:10],
        one_hot(np.array([0, 1, 2, 3, 4] * 2), 10),
    )
    check(
        batch.policy_batches["p0"]["new_obs"][:10],
        one_hot(np.array([1, 2, 3, 4, 5] * 2), 10),
    )
    self.assertEqual(
        batch.policy_batches["p0"]["rewards"].tolist()[:10],
        [100, 100, 100, 100, 0] * 2,
    )
    self.assertEqual(
        batch.policy_batches["p0"]["dones"].tolist()[:10],
        [False, False, False, False, True] * 2,
    )
    self.assertEqual(
        batch.policy_batches["p0"]["t"].tolist()[:10],
        [4, 9, 14, 19, 24, 5, 10, 15, 20, 25],
    )
def parse_policy_specs_from_checkpoint(
    path: str,
) -> Tuple[
    PartialAlgorithmConfigDict, Dict[str, PolicySpec], Dict[str, PolicyState]
]:
    """Read and parse policy specifications from a checkpoint file.

    Args:
        path: Path to a policy checkpoint.

    Returns:
        A tuple of: base policy config, dictionary of policy specs, and
        dictionary of policy states.
    """
    with open(path, "rb") as f:
        checkpoint_dict = pickle.load(f)
    # The worker state is stored as a serialized binary blob; policy specs
    # and states live inside it under their policy ID keys.
    w = pickle.loads(checkpoint_dict["worker"])

    policy_config = w["policy_config"]
    assert policy_config.get("enable_connectors", False), (
        "load_policies_from_checkpoint only works for checkpoints generated "
        "by stacks with connectors enabled."
    )
    policy_states = w["state"]
    serialized_policy_specs = w["policy_specs"]
    policy_specs = {
        id: PolicySpec.deserialize(spec)
        for id, spec in serialized_policy_specs.items()
    }

    return policy_config, policy_specs, policy_states
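# A hedged usage sketch for the function above (the checkpoint path is
# hypothetical; it must point to a checkpoint written by a connector-enabled
# stack): load the specs and inspect the class and spaces of each policy.
policy_config, policy_specs, policy_states = parse_policy_specs_from_checkpoint(
    "/tmp/my_checkpoint/policies_checkpoint.pkl"
)
for pid, spec in policy_specs.items():
    print(pid, spec.policy_class, spec.observation_space, spec.action_space)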
def test_multi_agent_sample_sync_remote(self):
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        # This signature will raise a soft-deprecation warning due
        # to the new signature we are using (agent_id, episode, **kwargs),
        # but should not break this test.
        policy_mapping_fn=(lambda agent_id: "p{}".format(agent_id % 2)),
        rollout_fragment_length=50,
        num_envs=4,
        remote_worker_envs=True,
        remote_env_batch_wait_ms=99999999,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 200)
def gen_policy(i):
    config = {
        "model": {
            "custom_model": ["model1", "model2"][i % 2],
        },
        "gamma": random.choice([0.95, 0.99]),
    }
    return PolicySpec(config=config)
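# A small usage sketch, assuming (as in the surrounding RLlib example) that
# this helper is used to build the multiagent "policies" dict; the policy ID
# pattern below is illustrative:
policies = {"policy_{}".format(i): gen_policy(i) for i in range(2)}
policy_ids = list(policies.keys())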
def test_multi_agent_sample(self):
    def policy_mapping_fn(agent_id, episode, worker, **kwargs):
        return "p{}".format(agent_id % 2)

    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=policy_mapping_fn,
        rollout_fragment_length=50,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
    self.assertEqual(batch.policy_batches["p0"].count, 150)
    self.assertEqual(batch.policy_batches["p1"].count, 100)
    self.assertEqual(
        batch.policy_batches["p0"]["t"].tolist(), list(range(25)) * 6
    )
def test_leaky_policy(self):
    """Tests whether our diagnostics tools can detect leaks in a policy."""
    config = dqn.DEFAULT_CONFIG.copy()
    # Make sure we have an env to test on the local worker.
    # Otherwise, `check_memory_leaks` will complain.
    config["create_env_on_driver"] = True
    config["env"] = "CartPole-v0"
    config["multiagent"]["policies"] = {
        "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy),
    }
    trainer = dqn.DQN(config=config)
    results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300)
    assert results["policy"]
    trainer.stop()
def check_support_multiagent(alg, config):
    register_env(
        "multi_agent_mountaincar",
        lambda _: MultiAgentMountainCar({"num_agents": 2}),
    )
    register_env(
        "multi_agent_cartpole",
        lambda _: MultiAgentCartPole({"num_agents": 2}),
    )

    # Simulate a simple multi-agent setup.
    policies = {
        "policy_0": PolicySpec(config={"gamma": 0.99}),
        "policy_1": PolicySpec(config={"gamma": 0.95}),
    }
    policy_ids = list(policies.keys())

    def policy_mapping_fn(agent_id, episode, worker, **kwargs):
        pol_id = policy_ids[agent_id]
        return pol_id

    config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    }

    for fw in framework_iterator(config):
        if fw in ["tf2", "tfe"] and alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]:
            continue
        if alg in ["DDPG", "APEX_DDPG", "SAC"]:
            a = get_trainer_class(alg)(config=config, env="multi_agent_mountaincar")
        else:
            a = get_trainer_class(alg)(config=config, env="multi_agent_cartpole")
        results = a.train()
        check_train_results(results)
        print(results)
        a.stop()
def create_policy(
    self,
    policy_id: PolicyID,
    policy_cls: Type["Policy"],
    observation_space: gym.Space,
    action_space: gym.Space,
    config_override: PartialAlgorithmConfigDict,
    merged_config: AlgorithmConfigDict,
) -> None:
    """Creates a new policy and stores it to the cache.

    Args:
        policy_id: The policy ID. This is the key under which
            the created policy will be stored in this map.
        policy_cls: The (original) policy class to use.
            This may still be altered in case tf-eager (and tracing)
            is used.
        observation_space: The observation space of the policy.
        action_space: The action space of the policy.
        config_override: The config override dict for this policy. This
            is the partial dict provided by the user.
        merged_config: The entire config (merged default config +
            `config_override`).
    """
    _class = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

    self[policy_id] = create_policy_for_framework(
        policy_id,
        _class,
        merged_config,
        observation_space,
        action_space,
        self.worker_index,
        self.session_creator,
        self.seed,
    )

    # Store spec (class, obs-space, act-space, and config overrides) such
    # that the map will be able to reproduce on-the-fly added policies
    # from disk.
    self.policy_specs[policy_id] = PolicySpec(
        policy_class=policy_cls,
        observation_space=observation_space,
        action_space=action_space,
        config=config_override,
    )
def add_policy(
    self,
    policy_id: PolicyID,
    policy_cls: Type[Policy],
    *,
    observation_space: Optional[gym.spaces.Space] = None,
    action_space: Optional[gym.spaces.Space] = None,
    config: Optional[PartialTrainerConfigDict] = None,
    policy_state: Optional[PolicyState] = None,
    **kwargs,
) -> Policy:
    # Add the new policy to all our train- and eval RolloutWorkers
    # (including the local worker).
    new_policy = super().add_policy(
        policy_id,
        policy_cls,
        observation_space=observation_space,
        action_space=action_space,
        config=config,
        policy_state=policy_state,
        **kwargs,
    )

    # Do we have to create a policy-learner actor from it as well?
    if policy_id in kwargs.get("policies_to_train", []):
        new_policy_actor = self.distributed_learners.add_policy(
            policy_id,
            PolicySpec(
                policy_cls,
                new_policy.observation_space,
                new_policy.action_space,
                self.config,
            ),
        )
        # Set state of new policy actor, if provided.
        if policy_state is not None:
            ray.get(new_policy_actor.set_state.remote(policy_state))

    return new_policy
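# A hedged usage sketch for the override above (the `trainer` instance and
# `MyPolicy` class are hypothetical; the kwargs shown are the ones the base
# `add_policy` accepts): add a new, trainable policy at runtime. Passing
# `policies_to_train` makes the override also create a learner actor for it.
new_pol = trainer.add_policy(
    policy_id="new_opponent",
    policy_cls=MyPolicy,
    policies_to_train=["new_opponent"],
)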
def check_multi_agent(config: PartialTrainerConfigDict):
    """Checks whether a (partial) config defines a multi-agent setup.

    Args:
        config (PartialTrainerConfigDict): The user/Trainer/Policy config
            to check for multi-agent.

    Returns:
        Tuple[MultiAgentPolicyConfigDict, bool]: The resulting (all fixed)
            multi-agent policy dict and whether we have a multi-agent
            setup or not.
    """
    multiagent_config = config["multiagent"]
    policies = multiagent_config.get("policies")
    if not policies:
        policies = {DEFAULT_POLICY_ID}
    if isinstance(policies, set):
        policies = multiagent_config["policies"] = {
            pid: PolicySpec() for pid in policies
        }
    is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies
    return policies, is_multiagent
register_env("multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4})) stop = { "training_iteration": args.stop_iters, "episode_reward_mean": args.stop_reward, "timesteps_total": args.stop_timesteps, } config = { "env": "multi_agent_cartpole", "multiagent": { # The multiagent Policy map. "policies": { # The Policy we are actually learning. "pg_policy": PolicySpec(config={"framework": args.framework}), # Random policy we are playing against. "random": PolicySpec(policy_class=RandomPolicy), }, # Map to either random behavior or PR learning behavior based on # the agent's ID. "policy_mapping_fn": (lambda aid, **kwargs: ["pg_policy", "random"][aid % 2]), # We wouldn't have to specify this here as the RandomPolicy does # not learn anyways (it has an empty `learn_on_batch` method), but # it's good practice to define this list here either way. "policies_to_learn": ["pg_policy"], }, "framework": args.framework, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
def gen_policy():
    config = {
        "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
        "n_step": random.choice([1, 2, 3, 4, 5]),
    }
    return PolicySpec(config=config)
config = {
    "env": "open_spiel_env",
    "callbacks": AddPolicyCallback,
    "model": {
        "fcnet_hiddens": [512, 512],
    },
    "num_envs_per_worker": 5,
    "multiagent": {
        # Initial policy map: Random and PPO. This will be expanded
        # to more policy snapshots taken from "main" against which "main"
        # will then play (instead of "random"). This is done in the
        # custom callback defined above (`SelfPlayCallback`).
        "policies": {
            # Our main policy, we'd like to optimize.
            "main": PolicySpec(),
            # Note: We will add the "opponent" policy with callback.
        },
        # Assign agent 0 and 1 randomly to the "main" policy or
        # to the opponent ("random" at first). Make sure (via episode_id)
        # that "main" always plays against "random" (and not against
        # another "main").
        "policy_mapping_fn": policy_mapping_fn,
        # Always just train the "main" policy.
        "policies_to_train": ["main"],
    },
    "num_workers": 1,
    "framework": "torch",
    # We will be restoring a TF2 policy.
    # So tell the RolloutWorkers to enable TF eager exec as well, even if
    # framework is set to torch.
import ray
from ray.rllib.agents.a3c import A3CTrainer
from ray.rllib.agents.a3c import A2CTrainer
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole

ray.init()

policies = {
    "policy_0": PolicySpec(config={"gamma": 0.99}),
    "policy_1": PolicySpec(config={"gamma": 0.95}),
}
policy_ids = list(policies.keys())


def policy_mapping_fn(agent_id, episode, **kwargs):
    pol_id = policy_ids[agent_id]
    return pol_id


trainer = A3CTrainer(
    env=MultiAgentCartPole,
    config={
        "framework": "tfe",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
    },
)
trainer.train()
config = {
    "env": "open_spiel_env",
    "callbacks": SelfPlayCallback,
    "model": {
        "fcnet_hiddens": [512, 512],
    },
    "num_sgd_iter": 20,
    "num_envs_per_worker": 5,
    "multiagent": {
        # Initial policy map: Random and PPO. This will be expanded
        # to more policy snapshots taken from "main" against which "main"
        # will then play (instead of "random"). This is done in the
        # custom callback defined above (`SelfPlayCallback`).
        "policies": {
            # Our main policy, we'd like to optimize.
            "main": PolicySpec(),
            # An initial random opponent to play against.
            "random": PolicySpec(policy_class=RandomPolicy),
        },
        # Assign agent 0 and 1 randomly to the "main" policy or
        # to the opponent ("random" at first). Make sure (via episode_id)
        # that "main" always plays against "random" (and not against
        # another "main").
        "policy_mapping_fn": policy_mapping_fn,
        # Always just train the "main" policy.
        "policies_to_train": ["main"],
    },
    "num_workers": args.num_workers,
    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
    "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
    "framework": args.framework,
def check_multi_agent(
    config: PartialTrainerConfigDict,
) -> Tuple[MultiAgentPolicyConfigDict, bool]:
    """Checks whether a (partial) config defines a multi-agent setup.

    Args:
        config: The user/Trainer/Policy config to check for multi-agent.

    Returns:
        Tuple consisting of the resulting (all fixed) multi-agent policy
        dict and a bool indicating whether we have a multi-agent setup
        or not.

    Raises:
        KeyError: If `config` does not contain a "multiagent" key, if there
            is an invalid key inside the "multiagent" config, or if any
            policy in the "policies" dict has a non-str ID (key).
        ValueError: If any subkey of the "multiagent" dict has an invalid
            value.
    """
    if "multiagent" not in config:
        raise KeyError(
            "Your `config` to be checked for a multi-agent setup must have "
            "the 'multiagent' key defined!"
        )
    multiagent_config = config["multiagent"]

    policies = multiagent_config.get("policies")

    # Check for invalid sub-keys of multiagent config.
    from ray.rllib.agents.trainer import COMMON_CONFIG

    allowed = list(COMMON_CONFIG["multiagent"].keys())
    if any(k not in allowed for k in multiagent_config.keys()):
        raise KeyError(
            f"You have invalid keys in your 'multiagent' config dict! "
            f"The only allowed keys are: {allowed}."
        )

    # Nothing specified in config dict -> Assume simple single agent setup
    # with DEFAULT_POLICY_ID as only policy.
    if not policies:
        policies = {DEFAULT_POLICY_ID}

    # Policies given as set/list/tuple (of PolicyIDs) -> Setup each policy
    # automatically via empty PolicySpec (will make RLlib infer obs- and
    # action spaces as well as the Policy's class).
    if isinstance(policies, (set, list, tuple)):
        policies = multiagent_config["policies"] = {
            pid: PolicySpec() for pid in policies
        }

    # Check each defined policy ID and spec.
    for pid, policy_spec in policies.copy().items():
        # Policy IDs must be strings.
        if not isinstance(pid, str):
            raise KeyError(
                f"Policy IDs must always be of type `str`, got {type(pid)}"
            )
        # Convert to PolicySpec if plain list/tuple.
        if not isinstance(policy_spec, PolicySpec):
            # Values must be lists/tuples of len 4.
            if not isinstance(policy_spec, (list, tuple)) or len(policy_spec) != 4:
                raise ValueError(
                    "Policy specs must be tuples/lists of "
                    "(cls or None, obs_space, action_space, config), "
                    f"got {policy_spec}"
                )
            policies[pid] = PolicySpec(*policy_spec)

        # Config is None -> Set to {}.
        if policies[pid].config is None:
            policies[pid] = policies[pid]._replace(config={})
        # Config not a dict.
        elif not isinstance(policies[pid].config, dict):
            raise ValueError(
                f"Multiagent policy config for {pid} must be a dict, "
                f"but got {type(policies[pid].config)}!"
            )

    # Check other "multiagent" sub-keys' values.
    if multiagent_config.get("count_steps_by", "env_steps") not in [
        "env_steps",
        "agent_steps",
    ]:
        raise ValueError(
            "config.multiagent.count_steps_by must be one of "
            "[env_steps|agent_steps], not "
            f"{multiagent_config['count_steps_by']}!"
        )
    if multiagent_config.get("replay_mode", "independent") not in [
        "independent",
        "lockstep",
    ]:
        raise ValueError(
            "`config.multiagent.replay_mode` must be "
            "[independent|lockstep], not "
            f"{multiagent_config['replay_mode']}!"
        )
    # Attempt to create a `policy_mapping_fn` from config dict. Helpful
    # if users would like to specify custom callable classes in yaml files.
    if isinstance(multiagent_config.get("policy_mapping_fn"), dict):
        multiagent_config["policy_mapping_fn"] = from_config(
            multiagent_config["policy_mapping_fn"]
        )
    # Check `policies_to_train` for invalid entries.
    if isinstance(multiagent_config["policies_to_train"], (list, set, tuple)):
        if len(multiagent_config["policies_to_train"]) == 0:
            logger.warning(
                "`config.multiagent.policies_to_train` is empty! "
                "Make sure - if you would like to learn at least one policy - "
                "to add its ID to that list."
            )
        for pid in multiagent_config["policies_to_train"]:
            if pid not in policies:
                raise ValueError(
                    "`config.multiagent.policies_to_train` contains policy "
                    f"ID ({pid}) that was not defined in "
                    "`config.multiagent.policies`!"
                )

    # Is this a multi-agent setup? True, iff there is more than one policy
    # or DEFAULT_POLICY_ID is NOT the only PolicyID found in the policies
    # dict.
    is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies

    return policies, is_multiagent
def __init__(
    self,
    trainer: Trainer,
    trainer_config: TrainerConfigDict,
    num_random_policies: int = 2,
    num_learning_league_exploiters: int = 4,
    num_learning_main_exploiters: int = 4,
    win_rate_threshold_for_new_snapshot: float = 0.8,
    keep_new_snapshot_training_prob: float = 0.0,
    prob_league_exploiter_match: float = 0.33,
    prob_main_exploiter_match: float = 0.33,
    prob_main_exploiter_playing_against_learning_main: float = 0.5,
):
    """Initializes an AlphaStarLeagueBuilder instance.

    Args:
        trainer: The Trainer object by which this league builder is used.
            Trainer calls `build_league()` after each training step.
        trainer_config: The (not yet validated) config dict to be used on
            the Trainer. Child classes of `LeagueBuilder` should
            preprocess this to add e.g. multiagent settings to this
            config.
        num_random_policies: The number of random policies to add to the
            league. This must be an even number (including 0) as these
            will be evenly distributed amongst league- and main-
            exploiters.
        num_learning_league_exploiters: The number of initially learning
            league-exploiters to create.
        num_learning_main_exploiters: The number of initially learning
            main-exploiters to create.
        win_rate_threshold_for_new_snapshot: The win-rate to be achieved
            for a learning policy to get snapshot'd (forked into `self` +
            a new learning or non-learning copy of `self`).
        keep_new_snapshot_training_prob: The probability with which a new
            snapshot should keep training. Note that the policy from which
            this snapshot is taken will continue to train regardless.
        prob_league_exploiter_match: Probability of an episode to become a
            league-exploiter vs snapshot match.
        prob_main_exploiter_match: Probability of an episode to become a
            main-exploiter vs main match.
        prob_main_exploiter_playing_against_learning_main: Probability of
            a main-exploiter vs (training!) main match.
    """
    super().__init__(trainer, trainer_config)

    self.win_rate_threshold_for_new_snapshot = win_rate_threshold_for_new_snapshot
    self.keep_new_snapshot_training_prob = keep_new_snapshot_training_prob
    self.prob_league_exploiter_match = prob_league_exploiter_match
    self.prob_main_exploiter_match = prob_main_exploiter_match
    self.prob_main_exploiter_playing_against_learning_main = (
        prob_main_exploiter_playing_against_learning_main
    )
    # Store the win rates for league overview printouts.
    self.win_rates: DefaultDict[PolicyID, float] = defaultdict(float)

    assert num_random_policies % 2 == 0, (
        "ERROR: `num_random_policies` must be an even number (we'll "
        "distribute these evenly amongst league- and main-exploiters)!"
    )

    # Build trainer's multiagent config.
    ma_config = self.config["multiagent"]
    # Make sure the multiagent config dict has no policies defined:
    assert not ma_config.get("policies"), (
        "ERROR: `config.multiagent.policies` should not be pre-defined! "
        "AlphaStarLeagueBuilder will construct this itself."
    )
    ma_config["policies"] = policies = {}

    self.main_policies = 1
    self.league_exploiters = (
        num_learning_league_exploiters + num_random_policies // 2
    )
    self.main_exploiters = num_learning_main_exploiters + num_random_policies // 2

    # Add 1 initial (learning) main policy.
    policies["main_0"] = PolicySpec()

    # Train all non-random policies that exist at beginning.
    ma_config["policies_to_train"] = ["main_0"]

    # Add random policies.
    i = -1
    for i in range(num_random_policies // 2):
        policies[f"league_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy)
        policies[f"main_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy)

    # Add initial (learning) league-exploiters.
    for j in range(num_learning_league_exploiters):
        pid = f"league_exploiter_{j + i + 1}"
        policies[pid] = PolicySpec()
        ma_config["policies_to_train"].append(pid)

    # Add initial (learning) main-exploiters.
    for j in range(num_learning_main_exploiters):
        pid = f"main_exploiter_{j + i + 1}"
        policies[pid] = PolicySpec()
        ma_config["policies_to_train"].append(pid)

    # Build initial policy mapping function: main_0 vs main_exploiter_0.
    ma_config["policy_mapping_fn"] = (
        lambda aid, ep, worker, **kw: "main_0"
        if ep.episode_id % 2 == aid
        else "main_exploiter_0"
    )
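# A worked illustration (hypothetical argument values) of what the loops in
# `__init__` above produce. With num_random_policies=2,
# num_learning_league_exploiters=1 and num_learning_main_exploiters=1:
#
#   policies = {
#       "main_0": PolicySpec(),                                    # learning
#       "league_exploiter_0": PolicySpec(policy_class=RandomPolicy),
#       "main_exploiter_0": PolicySpec(policy_class=RandomPolicy),
#       "league_exploiter_1": PolicySpec(),                        # learning
#       "main_exploiter_1": PolicySpec(),                          # learning
#   }
#   ma_config["policies_to_train"] == [
#       "main_0", "league_exploiter_1", "main_exploiter_1"
#   ]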
def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    # At first, only have main play against the random main exploiter.
    return "main" if episode.episode_id % 2 == agent_id else "main_exploiter_0"


config = {
    "env": "open_spiel_env",
    "callbacks": LeagueBasedSelfPlayCallback,
    "num_sgd_iter": 20,
    "num_envs_per_worker": 5,
    "multiagent": {
        # Initial policy map: All PPO. This will be expanded
        # to more policy snapshots. This is done in the
        # custom callback defined above (`LeagueBasedSelfPlayCallback`).
        "policies": {
            # Our main policy, we'd like to optimize.
            "main": PolicySpec(),
            # First frozen version of main (after we reach n% win-rate).
            "main_0": PolicySpec(),
            # Initial main exploiters (one random, one trainable).
            "main_exploiter_0": PolicySpec(policy_class=RandomPolicy),
            "main_exploiter_1": PolicySpec(),
            # Initial league exploiters (one random, one trainable).
            "league_exploiter_0": PolicySpec(policy_class=RandomPolicy),
            "league_exploiter_1": PolicySpec(),
        },
        "policy_mapping_fn": policy_mapping_fn,
        # At first, only train "main" (until it is good enough to win
        # against random).
        "policies_to_train": ["main"],
    },
    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
)

if args.run == "contrib/MADDPG":
    obs_space = Discrete(6)
    act_space = TwoStepGame.action_space
    config = {
        "learning_starts": 100,
        "env_config": {
            "actions_are_logits": True,
        },
        "multiagent": {
            "policies": {
                "pol1": PolicySpec(
                    observation_space=obs_space,
                    action_space=act_space,
                    config={"agent_id": 0},
                ),
                "pol2": PolicySpec(
                    observation_space=obs_space,
                    action_space=act_space,
                    config={"agent_id": 1},
                ),
            },
            "policy_mapping_fn": (
                lambda aid, **kwargs: "pol2" if aid else "pol1"
            ),
        },
        "framework": args.framework,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
def get_policy_configs_for_game(
    game_name: str,
) -> Tuple[dict, Callable[[AgentID], PolicyID]]:

    # The RLlib server must know about the Spaces that the Client will be
    # using inside Unity3D, up-front.
    obs_spaces = {
        # 3DBall.
        "3DBall": Box(float("-inf"), float("inf"), (8,)),
        # 3DBallHard.
        "3DBallHard": Box(float("-inf"), float("inf"), (45,)),
        # GridFoodCollector.
        "GridFoodCollector": Box(float("-inf"), float("inf"), (40, 40, 6)),
        # Pyramids.
        "Pyramids": TupleSpace(
            [
                Box(float("-inf"), float("inf"), (56,)),
                Box(float("-inf"), float("inf"), (56,)),
                Box(float("-inf"), float("inf"), (56,)),
                Box(float("-inf"), float("inf"), (4,)),
            ]
        ),
        # SoccerStrikersVsGoalie.
        "Goalie": Box(float("-inf"), float("inf"), (738,)),
        "Striker": TupleSpace(
            [
                Box(float("-inf"), float("inf"), (231,)),
                Box(float("-inf"), float("inf"), (63,)),
            ]
        ),
        # Sorter.
        "Sorter": TupleSpace(
            [
                Box(float("-inf"), float("inf"), (20, 23)),
                Box(float("-inf"), float("inf"), (10,)),
                Box(float("-inf"), float("inf"), (8,)),
            ]
        ),
        # Tennis.
        "Tennis": Box(float("-inf"), float("inf"), (27,)),
        # VisualHallway.
        "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
        # Walker.
        "Walker": Box(float("-inf"), float("inf"), (212,)),
        # FoodCollector.
        "FoodCollector": TupleSpace(
            [
                Box(float("-inf"), float("inf"), (49,)),
                Box(float("-inf"), float("inf"), (4,)),
            ]
        ),
    }
    action_spaces = {
        # 3DBall.
        "3DBall": Box(-1.0, 1.0, (2,), dtype=np.float32),
        # 3DBallHard.
        "3DBallHard": Box(-1.0, 1.0, (2,), dtype=np.float32),
        # GridFoodCollector.
        "GridFoodCollector": MultiDiscrete([3, 3, 3, 2]),
        # Pyramids.
        "Pyramids": MultiDiscrete([5]),
        # SoccerStrikersVsGoalie.
        "Goalie": MultiDiscrete([3, 3, 3]),
        "Striker": MultiDiscrete([3, 3, 3]),
        # Sorter.
        "Sorter": MultiDiscrete([3, 3, 3]),
        # Tennis.
        "Tennis": Box(-1.0, 1.0, (3,)),
        # VisualHallway.
        "VisualHallway": MultiDiscrete([5]),
        # Walker.
        "Walker": Box(-1.0, 1.0, (39,)),
        # FoodCollector.
        "FoodCollector": MultiDiscrete([3, 3, 3, 2]),
    }

    # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
    if game_name == "SoccerStrikersVsGoalie":
        policies = {
            "Goalie": PolicySpec(
                observation_space=obs_spaces["Goalie"],
                action_space=action_spaces["Goalie"],
            ),
            "Striker": PolicySpec(
                observation_space=obs_spaces["Striker"],
                action_space=action_spaces["Striker"],
            ),
        }

        def policy_mapping_fn(agent_id, episode, worker, **kwargs):
            return "Striker" if "Striker" in agent_id else "Goalie"

    else:
        policies = {
            game_name: PolicySpec(
                observation_space=obs_spaces[game_name],
                action_space=action_spaces[game_name],
            ),
        }

        def policy_mapping_fn(agent_id, episode, worker, **kwargs):
            return game_name

    return policies, policy_mapping_fn
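# A hedged usage sketch for the helper above (assumes the Unity3D client/server
# setup from the surrounding RLlib example; "3DBall" is one of the behavior
# names handled above, and the config keys shown are only the multiagent part):
policies, policy_mapping_fn = get_policy_configs_for_game("3DBall")
config = {
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
}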
def create_policy(
    self,
    policy_id: PolicyID,
    policy_cls: Type["Policy"],
    observation_space: gym.Space,
    action_space: gym.Space,
    config_override: PartialTrainerConfigDict,
    merged_config: TrainerConfigDict,
) -> None:
    """Creates a new policy and stores it to the cache.

    Args:
        policy_id (PolicyID): The policy ID. This is the key under which
            the created policy will be stored in this map.
        policy_cls (Type[Policy]): The (original) policy class to use.
            This may still be altered in case tf-eager (and tracing)
            is used.
        observation_space (gym.Space): The observation space of the
            policy.
        action_space (gym.Space): The action space of the policy.
        config_override (PartialTrainerConfigDict): The config override
            dict for this policy. This is the partial dict provided by
            the user.
        merged_config (TrainerConfigDict): The entire config (merged
            default config + `config_override`).
    """
    framework = merged_config.get("framework", "tf")
    class_ = get_tf_eager_cls_if_necessary(policy_cls, merged_config)

    # Tf.
    if framework in ["tf2", "tf", "tfe"]:
        var_scope = policy_id + (
            ("_wk" + str(self.worker_index)) if self.worker_index else ""
        )

        # For tf static graph, build every policy in its own graph
        # and create a new session for it.
        if framework == "tf":
            with tf1.Graph().as_default():
                if self.session_creator:
                    sess = self.session_creator()
                else:
                    sess = tf1.Session(
                        config=tf1.ConfigProto(
                            gpu_options=tf1.GPUOptions(allow_growth=True)
                        )
                    )
                with sess.as_default():
                    # Set graph-level seed.
                    if self.seed is not None:
                        tf1.set_random_seed(self.seed)
                    with tf1.variable_scope(var_scope):
                        self[policy_id] = class_(
                            observation_space, action_space, merged_config
                        )
        # For tf-eager: no graph, no session.
        else:
            with tf1.variable_scope(var_scope):
                self[policy_id] = class_(
                    observation_space, action_space, merged_config
                )
    # Non-tf: No graph, no session.
    else:
        class_ = policy_cls
        self[policy_id] = class_(observation_space, action_space, merged_config)

    # Store spec (class, obs-space, act-space, and config overrides) such
    # that the map will be able to reproduce on-the-fly added policies
    # from disk.
    self.policy_specs[policy_id] = PolicySpec(
        policy_class=policy_cls,
        observation_space=observation_space,
        action_space=action_space,
        config=config_override,
    )
"grouped_twostep", lambda config: TwoStepGame(config).with_agent_groups( grouping, obs_space=obs_space, act_space=act_space)) if args.run == "contrib/MADDPG": obs_space = Discrete(6) act_space = TwoStepGame.action_space config = { "learning_starts": 100, "env_config": { "actions_are_logits": True, }, "multiagent": { "policies": { "pol1": PolicySpec( observation_space=obs_space, action_space=act_space, config={"agent_id": 0}), "pol2": PolicySpec( observation_space=obs_space, action_space=act_space, config={"agent_id": 1}), }, "policy_mapping_fn": ( lambda aid, **kwargs: "pol2" if aid else "pol1"), }, "framework": args.framework, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), } group = False elif args.run == "QMIX":