def test_pettingzoo_env(self):
    register_env("simple_spread",
                 lambda _: PettingZooEnv(simple_spread_v2.env()))
    env = PettingZooEnv(simple_spread_v2.env())
    observation_space = env.observation_space
    action_space = env.action_space
    del env

    agent_class = get_trainer_class("PPO")
    config = deepcopy(agent_class.get_default_config())

    config["multiagent"] = {
        # Set of policy IDs (by default, will use Trainer's
        # default policy class, the env's obs/act spaces and config={}).
        "policies": {"av": (None, observation_space, action_space, {})},
        # Mapping function that always returns "av" as policy ID to use
        # (for any agent).
        "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
    }

    config["log_level"] = "DEBUG"
    config["num_workers"] = 0
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    config["horizon"] = 200  # After n steps, force reset simulation.
    config["no_done_at_end"] = False

    agent = agent_class(env="simple_spread", config=config)
    agent.train()
def test_pettingzoo_env(self):
    register_env("prison", lambda _: PettingZooEnv(simple_spread_v0.env()))
    agent_class = get_agent_class("PPO")
    config = deepcopy(agent_class._default_config)

    test_env = PettingZooEnv(simple_spread_v0.env())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    test_env.close()

    config["multiagent"] = {
        "policies": {
            # The first tuple value is None -> uses the default policy.
            "av": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: "av",
    }

    config["log_level"] = "DEBUG"
    config["num_workers"] = 0
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    config["horizon"] = 200  # After n steps, force reset simulation.
    config["no_done_at_end"] = False

    agent = agent_class(env="prison", config=config)
    agent.train()
def train():
    alg_name = "PPO"
    ModelCatalog.register_custom_model("pa_model", TorchMaskedActions)

    # Function that outputs the environment you wish to register.
    def env_creator():
        env = leduc_holdem_v3.env()
        return env

    num_cpus = 1

    config = deepcopy(get_agent_class(alg_name)._default_config)

    register_env("leduc_holdem", lambda config: PettingZooEnv(env_creator()))

    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    print(obs_space)
    act_space = test_env.action_space

    config["multiagent"] = {
        "policies": {
            "player_0": (None, obs_space, act_space, {}),
            "player_1": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: agent_id,
    }

    config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
    # config["log_level"] = "DEBUG"
    config["num_workers"] = 1
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    config["horizon"] = 200
    config["no_done_at_end"] = False
    config["framework"] = "torch"
    config["model"] = {
        "custom_model": "pa_model",
    }
    # config["hiddens"] = []
    config["env"] = "leduc_holdem"

    ray.init(num_cpus=num_cpus + 1)

    tune.run(
        alg_name,
        name="PPO-leduc_holdem",
        stop={"timesteps_total": 10000000},
        checkpoint_freq=10,
        config=config,
    )
def get_env(env_name, number_of_agents=1):
    """
    TODO (Guy): extend this function to work with the particle environments.

    :param env_name: name of the environment to build.
    :param number_of_agents: number of agents in the environment.
    :return: the environment (class or instance) and, for multi-agent
        environments, the name it was registered under.
    """
    if env_name == TAXI:
        from Transforms.taxi_transforms import TaxiSimpleEnv
        return TaxiSimpleEnv
    if env_name == SPEAKER_LISTENER:
        from supersuit import pad_observations_v0, pad_action_space_v0
        from pettingzoo.mpe import simple_speaker_listener_v3
        from ray.tune.registry import register_env
        from ray.rllib.env import PettingZooEnv

        def create_env(args):
            env = simple_speaker_listener_v3.env()
            env = pad_action_space_v0(env)
            env = pad_observations_v0(env)
            return env

        get_env_lambda = lambda config: PettingZooEnv(create_env(config))
        register_env(SPEAKER_LISTENER, lambda config: get_env_lambda(config))
        return get_env_lambda({}), SPEAKER_LISTENER
def test_pettingzoo_pistonball_v6_policies_are_dict_env(self):
    def env_creator(config):
        env = pistonball_v6.env()
        env = dtype_v0(env, dtype=float32)
        env = color_reduction_v0(env, mode="R")
        env = normalize_obs_v0(env)
        return env

    config = deepcopy(get_algorithm_class("PPO").get_default_config())
    config["env_config"] = {"local_ratio": 0.5}

    # Register env.
    register_env("pistonball",
                 lambda config: PettingZooEnv(env_creator(config)))

    env = PettingZooEnv(env_creator(config))
    observation_space = env.observation_space
    action_space = env.action_space
    del env

    config["multiagent"] = {
        # Setup a single, shared policy for all agents.
        "policies": {"av": (None, observation_space, action_space, {})},
        # Map all agents to that policy.
        "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
    }

    config["log_level"] = "DEBUG"
    config["num_workers"] = 1
    # Fragment length, collected at once from each worker
    # and for each agent!
    config["rollout_fragment_length"] = 30
    # Training batch size -> Fragments are concatenated up to this point.
    config["train_batch_size"] = 200
    # After n steps, force reset simulation.
    config["horizon"] = 200
    # Default: False.
    config["no_done_at_end"] = False

    algo = get_algorithm_class("PPO")(env="pistonball", config=config)
    algo.train()
    algo.stop()
def get_env(env_name, number_of_agents=1):
    """
    :param env_name: name of the environment to build.
    :param number_of_agents: number of agents in the environment.
    :return: the environment instance and, for multi-agent environments,
        the name it was registered under.
    """
    if env_name == TAXI:
        from Environments.taxi_environment_wrapper import TaxiSimpleEnv
        return TaxiSimpleEnv()
    elif env_name == TAXI_EXAMPLE:
        from Environments.taxi_environment_wrapper import TaxiSimpleExampleEnv
        return TaxiSimpleExampleEnv()
    elif env_name == SINGLE_TAXI_EXAMPLE:
        from Environments.SingleTaxiEnv.single_taxi_wrapper import SingleTaxiSimpleEnv
        return SingleTaxiSimpleEnv()
    elif env_name == SINGLE_FROZEN_EXAMPLE:
        from Environments.frozenlake_environment import FrozenLakeEnv
        return FrozenLakeEnv(map_name=FROZEN_MAP_NAME, is_slippery=IS_SLIPPERY,
                             wind=WIND)
    elif env_name == LUNAR_LANDER:
        from Environments.lunar_lander_wrapper import LunarLenderWrapper
        return LunarLenderWrapper()
    elif env_name == SEARCH_TRANSFORM_TAXI_ENV:
        from Environments.SingleTaxiEnv.single_taxi_wrapper import SingleTaxiSimpleEnv
        new_env = load_pkl_file(TRANSFORM_SEARCH_TAXI_ENV_PATH)
        return new_env
    elif env_name == APPLE_PICKING:
        from Environments.ApplePicking.apple_picking_env import ApplePickingEnv
        return ApplePickingEnv()
    elif env_name == SPEAKER_LISTENER:
        from supersuit import pad_observations_v0, pad_action_space_v0
        from pettingzoo.mpe import simple_speaker_listener_v3
        from ray.tune.registry import register_env
        from ray.rllib.env import PettingZooEnv

        def create_env(args):
            env = simple_speaker_listener_v3.env()
            env = pad_action_space_v0(env)
            env = pad_observations_v0(env)
            return env

        get_env_lambda = lambda config: PettingZooEnv(create_env(config))
        register_env(SPEAKER_LISTENER, lambda config: get_env_lambda(config))
        return get_env_lambda({}), SPEAKER_LISTENER
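# Hypothetical usage sketch (not part of the original code): build the padded
# speaker-listener environment via get_env and inspect its spaces.
# SPEAKER_LISTENER is the same constant used inside get_env above.
env, registered_name = get_env(SPEAKER_LISTENER)
print(registered_name, env.observation_space, env.action_space)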
    # Tail of env_creator (this snippet starts mid-function); color_reduction
    # and normalize_obs are presumably SuperSuit's color_reduction_v0 /
    # normalize_obs_v0 imported under these names.
    env = color_reduction(env, mode="R")
    env = normalize_obs(env)
    return env


num_cpus = 1
num_rollouts = 2

# 1. Gets default training configuration and specifies the POMgame to load.
config = deepcopy(get_agent_class(alg_name)._default_config)

# 2. Set environment config. This will be passed to
# the env_creator function via the register env lambda below.
config["env_config"] = {"local_ratio": 0.5}

# 3. Register env.
register_env("prison", lambda config: PettingZooEnv(env_creator(config)))

# 4. Extract space dimensions.
test_env = PettingZooEnv(env_creator({}))
obs_space = test_env.observation_space
act_space = test_env.action_space

# 5. Configuration for multiagent setup with policy sharing:
config["multiagent"] = {
    "policies": {
        # The first tuple value is None -> uses the default policy.
        "av": (None, obs_space, act_space, {}),
    },
    "policy_mapping_fn": lambda agent_id: "av",
}
help="Number of timesteps to train.") parser.add_argument( "--stop-reward", type=float, default=1000.0, help="Reward at which we stop training.", ) def env_creator(args): env = rps_v2.env() return env register_env("RockPaperScissors", lambda config: PettingZooEnv(env_creator(config))) def run_same_policy(args, stop): """Use the same policy for both agents (trivial case).""" config = { "env": "RockPaperScissors", "framework": args.framework, } results = tune.run("PG", config=config, stop=stop, verbose=1) if args.as_test: # Check vs 0.0 as we are playing a zero-sum game. check_learning_achieved(results, 0.0)
    # Tail of env_creator (this snippet starts mid-function).
    env = normalize_obs_v0(env)
    return env


num_cpus = 1
num_rollouts = 2

# 1. Gets default training configuration and specifies the POMgame to load.
config = deepcopy(get_agent_class(alg_name)._default_config)

# 2. Set environment config. This will be passed to
# the env_creator function via the register env lambda below.
config["env_config"] = {"local_ratio": 0.5}

# 3. Register env.
register_env("pistonball", lambda config: PettingZooEnv(env_creator(config)))

# 4. Extract space dimensions.
test_env = PettingZooEnv(env_creator({}))
obs_space = test_env.observation_space
act_space = test_env.action_space

# 5. Configuration for multiagent setup with policy sharing:
config["multiagent"] = {
    "policies": {
        # The first tuple value is None -> uses the default policy.
        "av": (None, obs_space, act_space, {}),
    },
    "policy_mapping_fn": lambda agent_id: "av",
}
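# Hedged continuation (not in the original snippet): with this old ray 1.x
# API, the config above would typically be turned into a trainer and run for
# a few iterations; alg_name and num_rollouts are defined earlier in the
# (truncated) script.
trainer = get_agent_class(alg_name)(env="pistonball", config=config)
for _ in range(num_rollouts):
    print(trainer.train())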
alg_name = "PPO" env_name = "name" s = "{:3d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:6.2f}" def create_env(args): if args[env_name] == "simple_speaker_listener": env = simple_speaker_listener_v3.env() env = supersuit.pad_action_space_v0(env) env = supersuit.pad_observations_v0(env) return env register_env("simple_speaker_listener", lambda config: PettingZooEnv(create_env(config))) config = deepcopy(a2c.A2C_DEFAULT_CONFIG) config.update({ "num_gpus": 0, "lr_schedule": [[0, 0.007], [20000000, 0.0000000001]], #"num_workers": 5, "framework": "torch", "env_config": { "name": "simple_speaker_listener" }, "clip_rewards": True, "num_envs_per_worker": 1, "rollout_fragment_length": 20, }) ray.init(num_gpus=0, local_mode=True) agent = a2c.A2CTrainer(env="simple_speaker_listener", config=config)
    return game_env


if __name__ == "__main__":
    # PPO - PPO
    # ADQN - Apex DQN
    assert len(sys.argv) == 3, \
        "Input the learning method as the second argument"
    env_name = sys.argv[1]
    method = sys.argv[2]

    game_env = get_env(env_name)
    env_creator = make_env_creator(game_env)

    register_env(env_name, lambda config: PettingZooEnv(env_creator(config)))

    test_env = PettingZooEnv(env_creator({}))
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

    def gen_policy(i):
        config = {
            "model": {
                "custom_model": "MLPModelV2",
            },
            "gamma": 0.99,
        }
        return (None, obs_space, act_space, config)
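    # Hypothetical usage sketch (not in the original snippet): gen_policy
    # would typically populate a shared-policy multiagent config; the policy
    # ID "policy_0", the "PPO" default config, and the mapping lambda are
    # assumptions.
    config = deepcopy(get_agent_class("PPO")._default_config)
    config["multiagent"] = {
        "policies": {"policy_0": gen_policy(0)},
        "policy_mapping_fn": lambda agent_id: "policy_0",
    }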
def get_config(args: Args):
    # num_rollouts = 2
    ModelCatalog.register_custom_model("SoftModularActorCriticNet",
                                       SoftModularActorCriticNet)
    ModelCatalog.register_custom_model("SimpleEnsembleActorCriticNet",
                                       SimpleEnsembleActorCriticNet)

    # 1. Gets default training configuration and specifies the POMgame to load.
    config = deepcopy(get_agent_class(args.alg_name)._default_config)

    # 2. Set environment config. This will be passed to the env_creator
    # function via the register env lambda below. local_ratio specifies the
    # ratio between the global reward and the local reward.
    # config["env_config"] = {"local_ratio": 0.5}

    def env_creator():
        if args.game.__package__.endswith("atari"):
            if (args.game_name.startswith("foozpong")
                    or args.game_name.startswith("basketball_pong")
                    or args.game_name.startswith("volleyball_pong")):
                env = args.game.env(obs_type=args.atari_obs_type,
                                    max_cycles=args.max_steps["atari"],
                                    full_action_space=False,
                                    num_players=2)
            else:
                env = args.game.env(obs_type=args.atari_obs_type,
                                    full_action_space=False,
                                    max_cycles=args.max_steps["atari"])
            env = frame_skip_v0(env, args.atari_frame_skip_num)
            env = frame_stack_v1(env, args.atari_frame_stack_num)
        else:
            env = args.game.env()
            if args.game_name.startswith("rps"):
                env = one_hot_obs_wrapper(env)
            env = dtype_v0(env, dtype=float32)
            env = pad_observations_v0(env)
            env = pad_action_space_v0(env)
            if (args.game_name.startswith("connect_four")
                    or args.game_name.startswith("tictactoe")):
                env = FlattenEnvWrapper(env)
        GAUSSIAN_STD = 1.0
        assert abs(GAUSSIAN_STD - 1.0) < 1e-5, \
            "must be 1.0, otherwise the simple-ensemble implementation is wrong"
        env = LatentGaussianAugmentedEnvWrapper(
            env,
            latent_parameter_dim=args.latent_para_dim,
            gaussian_std=GAUSSIAN_STD,
            use_dict_obs_space=args.use_dict_obs_space)
        return env

    # 3. Register env and get the trainer class.
    register_env(args.game_name, lambda config: PettingZooEnv(env_creator()))
    trainer_class = get_agent_class(args.alg_name)

    # 4. Extract space dimensions.
    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    agents_id = test_env.agents
    print(f"obs_space: {obs_space}; act_space: {act_space}")

    # 5. Configuration for multiagent setup:
    config["framework"] = "torch"
    config["num_gpus"] = 0
    config["log_level"] = "INFO"
    config["num_workers"] = args.num_cpus // 2
    config["num_cpus_per_worker"] = 1
    config["num_envs_per_worker"] = 5
    # Fragment length, collected at once from each worker and for each agent!
    config["rollout_fragment_length"] = 100
    # Training batch size -> Fragments are concatenated up to this point.
    config["train_batch_size"] = 2000
    config["sgd_minibatch_size"] = 256
    config["entropy_coeff"] = 0.01
    config["lambda"] = 0.9
    config["vf_clip_param"] = 50
    config["num_sgd_iter"] = 10
    # After n steps, force reset simulation.
    config["horizon"] = args.max_steps[args.game_type]
    # Default: False. If False, each agent's trajectory is expected to have at
    # most one done=True, in the last step of the trajectory. If
    # no_done_at_end=True, the environment is not reset when
    # dones["__all__"] == True.
    config["no_done_at_end"] = False
    config["ignore_worker_failures"] = True

    def get_main_and_test_config(
            config: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        main_policies = {}
        for i, agent_id in enumerate(agents_id):
            for j in range(1):
                main_policies[f"{agent_id}_{j}"] = (
                    PPOTorchPolicy, obs_space, act_space,
                    {"framework": "torch"})
        test_policies = {
            "test_" + agent_id: (PPOTorchPolicy, obs_space, act_space,
                                 {"framework": "torch"})
            for agent_id in agents_id if is_adversary(agent_id)
        }
        policies = {**main_policies, **test_policies}

        main_config, test_config = deepcopy(config), deepcopy(config)
        main_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: f"{agent_id}_{0}",
            "policies_to_train": list(main_policies.keys()),
        }

        def test_config_policy_mapping(agent_id: str) -> str:
            if is_adversary(agent_id):
                return "test_" + agent_id
            return f"{agent_id}_{0}"

        test_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": test_config_policy_mapping,
            "policies_to_train": list(test_policies.keys()),
        }
        return main_config, test_config

    def get_simple_ensemble_training_config(
            config: Dict[str, Any],
            ensemble_size: int = 3) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        if ensemble_size > 1:
            config["model"] = {
                "custom_model": "SimpleEnsembleActorCriticNet",
                "custom_model_config": {
                    "use_dict_obs_space": args.use_dict_obs_space,
                    "ensemble_size": ensemble_size,
                },
            }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    def get_implicit_ensemble_training_config(
            config: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        config["model"] = {
            "custom_model": "SoftModularActorCriticNet",
            "custom_model_config": {
                "use_latent_embedding": args.use_latent_embedding,
                "use_dict_obs_space": args.use_dict_obs_space,
                "base_type": MLPBase,
                "em_input_shape": args.latent_para_dim,
                "emb_shaping_net_hidden_shapes": args.emb_shaping_net_hidden_shapes,
                "emb_shaping_net_last_softmax": args.emb_shaping_net_last_softmax,
                "em_hidden_shapes": [args.soft_modular_net_hidden_dim,
                                     args.soft_modular_net_hidden_dim],  # [400]
                "hidden_shapes": [args.soft_modular_net_hidden_dim,
                                  args.soft_modular_net_hidden_dim],  # [400, 400]
                "num_layers": args.soft_modular_net_num_layers,  # 4
                "num_modules": args.soft_modular_net_num_modules,  # 4
                "module_hidden": args.soft_modular_net_hidden_dim,  # 128
                "gating_hidden": args.soft_modular_net_hidden_dim,  # 256
                # With 1 gating layer, 500 steps work for simple_spread.
                "num_gating_layers": 2,
                "add_bn": False,
            },
        }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    if args.train_setting == "single_policy":
        main_config, test_config = get_simple_ensemble_training_config(
            config, ensemble_size=1)
    elif args.train_setting == "simple_ensemble":
        main_config, test_config = get_simple_ensemble_training_config(
            config, ensemble_size=3)
    else:
        assert args.train_setting == "implicit_ensemble"
        main_config, test_config = get_implicit_ensemble_training_config(config)

    return trainer_class, test_env, main_config, test_config
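# Hedged usage sketch (not in the original snippet): consume get_config's
# return values to build and train the main trainer; `args` is the same Args
# instance passed to get_config, and the iteration count is an assumption.
trainer_class, test_env, main_config, test_config = get_config(args)
trainer = trainer_class(env=args.game_name, config=main_config)
for _ in range(10):
    print(trainer.train())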
if __name__ == "__main__": alg_name = "DQN" ModelCatalog.register_custom_model("pa_model", TorchMaskedActions) # function that outputs the environment you wish to register. def env_creator(): env = leduc_holdem_v4.env() return env num_cpus = 1 config = deepcopy(get_agent_class(alg_name)._default_config) register_env("leduc_holdem", lambda config: PettingZooEnv(env_creator())) test_env = PettingZooEnv(env_creator()) obs_space = test_env.observation_space print(obs_space) act_space = test_env.action_space config["multiagent"] = { "policies": { "player_0": (None, obs_space, act_space, {}), "player_1": (None, obs_space, act_space, {}), }, "policy_mapping_fn": lambda agent_id: agent_id } config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
"--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." ) parser.add_argument( "--stop-reward", type=float, default=1000.0, help="Reward at which we stop training.", ) def env_creator(args): env = rps_v2.env() return env register_env("RockPaperScissors", lambda config: PettingZooEnv(env_creator(config))) def run_same_policy(args, stop): """Use the same policy for both agents (trivial case).""" config = { "env": "RockPaperScissors", "framework": args.framework, } results = tune.run("PG", config=config, stop=stop, verbose=1) if args.as_test: # Check vs 0.0 as we are playing a zero-sum game. check_learning_achieved(results, 0.0)
from ray.rllib.agents.a3c.a3c import A3CTrainer
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.env import PettingZooEnv
from ray.tune.registry import register_env
from gym import spaces
import numpy as np
import ray
import sumo_rl
import traci

if __name__ == "__main__":
    ray.init()

    register_env(
        "4x4grid",
        lambda _: PettingZooEnv(sumo_rl.env(
            net_file="nets/4x4-Lucas/4x4.net.xml",
            route_file="nets/4x4-Lucas/4x4c1c2c1c2.rou.xml",
            out_csv_name="outputs/4x4grid/a3c",
            use_gui=False,
            num_seconds=80000)))

    trainer = A3CTrainer(env="4x4grid", config={
        "multiagent": {
            "policies": {
                "0": (A3CTFPolicy,
                      spaces.Box(low=np.zeros(11), high=np.ones(11)),
                      spaces.Discrete(2),
                      {}),
            },
            # Traffic lights are always controlled by this policy.
            "policy_mapping_fn": lambda id: "0",
        },
        "lr": 0.001,
        "no_done_at_end": True,
    })

    while True:
        print(trainer.train())  # Distributed training step.