Example #1
    def test_pettingzoo_env(self):
        register_env("simple_spread",
                     lambda _: PettingZooEnv(simple_spread_v2.env()))
        env = PettingZooEnv(simple_spread_v2.env())
        observation_space = env.observation_space
        action_space = env.action_space
        del env

        agent_class = get_trainer_class("PPO")

        config = deepcopy(agent_class.get_default_config())

        config["multiagent"] = {
            # Set of policy IDs (by default, will use Trainer's
            # default policy class, the env's obs/act spaces and config={}).
            "policies": {
                "av": (None, observation_space, action_space, {})
            },
            # Mapping function that always returns "av" as policy ID to use
            # (for any agent).
            "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
        }

        config["log_level"] = "DEBUG"
        config["num_workers"] = 0
        config["rollout_fragment_length"] = 30
        config["train_batch_size"] = 200
        config["horizon"] = 200  # After n steps, force reset simulation
        config["no_done_at_end"] = False

        agent = agent_class(env="simple_spread", config=config)
        agent.train()
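The snippet assumes several imports that were not captured. A likely set for RLlib 1.x-era APIs (module paths are assumptions and shift between Ray versions):

from copy import deepcopy

from pettingzoo.mpe import simple_spread_v2
from ray.rllib.agents.registry import get_trainer_class  # assumed RLlib 1.x location
from ray.rllib.env import PettingZooEnv
from ray.tune.registry import register_env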
Example #2
    def test_pettingzoo_env(self):
        register_env("prison", lambda _: PettingZooEnv(simple_spread_v0.env()))

        agent_class = get_agent_class("PPO")

        config = deepcopy(agent_class._default_config)

        test_env = PettingZooEnv(simple_spread_v0.env())
        obs_space = test_env.observation_space
        act_space = test_env.action_space
        test_env.close()

        config["multiagent"] = {
            "policies": {
                # the first tuple value is None -> uses default policy
                "av": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": lambda agent_id: "av"
        }

        config["log_level"] = "DEBUG"
        config["num_workers"] = 0
        config["rollout_fragment_length"] = 30
        config["train_batch_size"] = 200
        config["horizon"] = 200  # After n steps, force reset simulation
        config["no_done_at_end"] = False

        agent = agent_class(env="prison", config=config)
        agent.train()
Example #3
def train():
    alg_name = "PPO"
    ModelCatalog.register_custom_model("pa_model", TorchMaskedActions)

    # function that outputs the environment you wish to register.
    def env_creator():
        env = leduc_holdem_v3.env()
        return env

    num_cpus = 1

    config = deepcopy(get_agent_class(alg_name)._default_config)

    register_env("leduc_holdem", lambda config: PettingZooEnv(env_creator()))

    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    print(obs_space)
    act_space = test_env.action_space

    config["multiagent"] = {
        "policies": {
            "player_0": (None, obs_space, act_space, {}),
            "player_1": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: agent_id
    }

    config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
    # config["log_level"] = "DEBUG"
    config["num_workers"] = 1
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    config["horizon"] = 200
    config["no_done_at_end"] = False
    config["framework"] = "torch"
    config["model"] = {
        "custom_model": "pa_model",
    }

    # config['hiddens'] = []
    config['env'] = "leduc_holdem"

    ray.init(num_cpus=num_cpus + 1)

    tune.run(alg_name,
             name="PPO-leduc_holdem",
             stop={"timesteps_total": 10000000},
             checkpoint_freq=10,
             config=config)
Example #4
def get_env(env_name, number_of_agents=1):
    """
    TODO Guy: to expand the function to work with particle environment
    :param env_name:
    :param number_of_agents:
    :return:
    """
    if env_name == TAXI:
        from Transforms.taxi_transforms import TaxiSimpleEnv
        return TaxiSimpleEnv
    if env_name == SPEAKER_LISTENER:
        from supersuit import pad_observations_v0, pad_action_space_v0
        from pettingzoo.mpe import simple_speaker_listener_v3
        from ray.tune.registry import register_env
        from ray.rllib.env import PettingZooEnv

        def create_env(args):
            env = simple_speaker_listener_v3.env()
            env = pad_action_space_v0(env)
            env = pad_observations_v0(env)
            return env

        get_env_lambda = lambda config: PettingZooEnv(create_env(config))
        register_env(SPEAKER_LISTENER, lambda config: get_env_lambda(config))
        return get_env_lambda({}), SPEAKER_LISTENER
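A minimal usage sketch for the SPEAKER_LISTENER branch; the tune.run wiring and the shared-policy config are assumptions, not part of the original source:

from ray import tune

# Hypothetical caller: get_env registers the env and returns (env, registered_name).
env, registered_name = get_env(SPEAKER_LISTENER)
obs_space = env.observation_space
act_space = env.action_space

tune.run(
    "PPO",
    config={
        "env": registered_name,
        "multiagent": {
            # One shared policy; the supersuit padding makes the agents' spaces match.
            "policies": {"shared": (None, obs_space, act_space, {})},
            "policy_mapping_fn": lambda agent_id: "shared",
        },
    },
    stop={"training_iteration": 10},
)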
Example #5
    def test_pettingzoo_pistonball_v6_policies_are_dict_env(self):
        def env_creator(config):
            env = pistonball_v6.env()
            env = dtype_v0(env, dtype=float32)
            env = color_reduction_v0(env, mode="R")
            env = normalize_obs_v0(env)
            return env

        config = deepcopy(get_algorithm_class("PPO").get_default_config())
        config["env_config"] = {"local_ratio": 0.5}
        # Register env
        register_env("pistonball",
                     lambda config: PettingZooEnv(env_creator(config)))
        env = PettingZooEnv(env_creator(config))
        observation_space = env.observation_space
        action_space = env.action_space
        del env

        config["multiagent"] = {
            # Setup a single, shared policy for all agents.
            "policies": {
                "av": (None, observation_space, action_space, {})
            },
            # Map all agents to that policy.
            "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
        }

        config["log_level"] = "DEBUG"
        config["num_workers"] = 1
        # Fragment length, collected at once from each worker
        # and for each agent!
        config["rollout_fragment_length"] = 30
        # Training batch size -> Fragments are concatenated up to this point.
        config["train_batch_size"] = 200
        # After n steps, force reset simulation
        config["horizon"] = 200
        # Default: False
        config["no_done_at_end"] = False
        algo = get_algorithm_class("PPO")(env="pistonball", config=config)
        algo.train()
        algo.stop()
Example #6
def get_env(env_name, number_of_agents=1):
    """
    :param env_name:
    :param number_of_agents:
    :return:
    """
    if env_name == TAXI:
        from Environments.taxi_environment_wrapper import TaxiSimpleEnv
        return TaxiSimpleEnv()
    elif env_name == TAXI_EXAMPLE:
        from Environments.taxi_environment_wrapper import TaxiSimpleExampleEnv
        return TaxiSimpleExampleEnv()
    elif env_name == SINGLE_TAXI_EXAMPLE:
        from Environments.SingleTaxiEnv.single_taxi_wrapper import SingleTaxiSimpleEnv
        return SingleTaxiSimpleEnv()
    elif env_name == SINGLE_FROZEN_EXAMPLE:
        from Environments.frozenlake_environment import FrozenLakeEnv
        return FrozenLakeEnv(map_name=FROZEN_MAP_NAME,
                             is_slippery=IS_SLIPPERY,
                             wind=WIND)
    elif env_name == LUNAR_LANDER:
        from Environments.lunar_lander_wrapper import LunarLenderWrapper
        return LunarLenderWrapper()
    elif env_name == SEARCH_TRANSFORM_TAXI_ENV:
        from Environments.SingleTaxiEnv.single_taxi_wrapper import SingleTaxiSimpleEnv
        new_env = load_pkl_file(TRANSFORM_SEARCH_TAXI_ENV_PATH)
        return new_env
    elif env_name == APPLE_PICKING:
        from Environments.ApplePicking.apple_picking_env import ApplePickingEnv
        return ApplePickingEnv()
    elif env_name == SPEAKER_LISTENER:
        from supersuit import pad_observations_v0, pad_action_space_v0
        from pettingzoo.mpe import simple_speaker_listener_v3
        from ray.tune.registry import register_env
        from ray.rllib.env import PettingZooEnv

        def create_env(args):
            env = simple_speaker_listener_v3.env()
            env = pad_action_space_v0(env)
            env = pad_observations_v0(env)
            return env

        get_env_lambda = lambda config: PettingZooEnv(create_env(config))
        register_env(SPEAKER_LISTENER, lambda config: get_env_lambda(config))
        return get_env_lambda({}), SPEAKER_LISTENER
Example #7
        env = color_reduction(env, mode="R")
        env = normalize_obs(env)
        return env

    num_cpus = 1
    num_rollouts = 2

    # 1. Gets default training configuration and specifies the POMgame to load.
    config = deepcopy(get_agent_class(alg_name)._default_config)

    # 2. Set environment config. This will be passed to
    # the env_creator function via the register env lambda below
    config["env_config"] = {"local_ratio": 0.5}

    # 3. Register env
    register_env("prison", lambda config: PettingZooEnv(env_creator(config)))

    # 4. Extract space dimensions
    test_env = PettingZooEnv(env_creator({}))
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    # 5. Configuration for multiagent setup with policy sharing:
    config["multiagent"] = {
        "policies": {
            # the first tuple value is None -> uses default policy
            "av": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: "av"
    }
                    help="Number of timesteps to train.")
parser.add_argument(
    "--stop-reward",
    type=float,
    default=1000.0,
    help="Reward at which we stop training.",
)


def env_creator(args):
    env = rps_v2.env()
    return env


register_env("RockPaperScissors",
             lambda config: PettingZooEnv(env_creator(config)))


def run_same_policy(args, stop):
    """Use the same policy for both agents (trivial case)."""
    config = {
        "env": "RockPaperScissors",
        "framework": args.framework,
    }

    results = tune.run("PG", config=config, stop=stop, verbose=1)

    if args.as_test:
        # Check vs 0.0 as we are playing a zero-sum game.
        check_learning_achieved(results, 0.0)
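The top of this script was cut off. A plausible main block, inferred from the parser fragments shown (--stop-timesteps, --stop-reward) and the args.as_test / args.framework references; it assumes `import ray` earlier in the script:

# Hypothetical main block for the truncated script above.
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()
    stop = {
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
    run_same_policy(args, stop)
    ray.shutdown()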
Example #9
        env = normalize_obs_v0(env)
        return env

    num_cpus = 1
    num_rollouts = 2

    # 1. Gets default training configuration and specifies the POMgame to load.
    config = deepcopy(get_agent_class(alg_name)._default_config)

    # 2. Set environment config. This will be passed to
    # the env_creator function via the register env lambda below
    config["env_config"] = {"local_ratio": 0.5}

    # 3. Register env
    register_env("pistonball",
                 lambda config: PettingZooEnv(env_creator(config)))

    # 4. Extract space dimensions
    test_env = PettingZooEnv(env_creator({}))
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    # 5. Configuration for multiagent setup with policy sharing:
    config["multiagent"] = {
        "policies": {
            # the first tuple value is None -> uses default policy
            "av": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: "av"
    }
Example #10
alg_name = "PPO"
env_name = "name"
s = "{:3d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:6.2f}"


def create_env(args):
    if args[env_name] == "simple_speaker_listener":
        env = simple_speaker_listener_v3.env()
        env = supersuit.pad_action_space_v0(env)
        env = supersuit.pad_observations_v0(env)
        return env


register_env("simple_speaker_listener",
             lambda config: PettingZooEnv(create_env(config)))
config = deepcopy(a2c.A2C_DEFAULT_CONFIG)
config.update({
    "num_gpus": 0,
    "lr_schedule": [[0, 0.007], [20000000, 0.0000000001]],
    #"num_workers": 5,
    "framework": "torch",
    "env_config": {
        "name": "simple_speaker_listener"
    },
    "clip_rewards": True,
    "num_envs_per_worker": 1,
    "rollout_fragment_length": 20,
})
ray.init(num_gpus=0, local_mode=True)
agent = a2c.A2CTrainer(env="simple_speaker_listener", config=config)
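The format string s defined above implies a reporting loop that was not captured; a sketch using RLlib's standard result keys (the iteration count is arbitrary):

# Hypothetical training loop that fills the `s` template defined above.
for i in range(100):
    result = agent.train()
    print(s.format(
        i + 1,
        result["episode_reward_min"],
        result["episode_reward_mean"],
        result["episode_reward_max"],
        result["episode_len_mean"],
    ))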
Example #11
    return game_env


if __name__ == "__main__":
    # PPO  - PPO
    # ADQN - Apex DQN

    assert len(
        sys.argv) == 3, "Input the learning method as the second argument"
    env_name = sys.argv[1]
    method = sys.argv[2]

    game_env = get_env(env_name)
    env_creator = make_env_creator(game_env)

    register_env(env_name, lambda config: PettingZooEnv(env_creator(config)))

    test_env = PettingZooEnv(env_creator({}))
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

    def gen_policy(i):
        config = {
            "model": {
                "custom_model": "MLPModelV2",
            },
            "gamma": 0.99,
        }
        return (None, obs_space, act_space, config)
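The snippet ends at gen_policy; a sketch of how it is typically wired into a shared-policy multiagent config (the policy name, algorithm string, and tune.run call are assumptions):

from ray import tune

# Hypothetical continuation: one shared policy produced by gen_policy.
policies = {"policy_0": gen_policy(0)}

tune.run(
    "PPO",  # or whichever algorithm `method` selects
    config={
        "env": env_name,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: "policy_0",
        },
    },
    stop={"training_iteration": 10},
)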
Example #12
def get_config(args: Args):
    # num_rollouts = 2
    ModelCatalog.register_custom_model("SoftModularActorCriticNet", SoftModularActorCriticNet)
    ModelCatalog.register_custom_model("SimpleEnsembleActorCriticNet", SimpleEnsembleActorCriticNet)
    # 1. Gets default training configuration and specifies the POMgame to load.
    config = deepcopy(get_agent_class(args.alg_name)._default_config)

    # 2. Set environment config. This will be passed to
    # the env_creator function via the register env lambda below.
    # local_ratio specifies the ratio between the global reward and the local reward.
    # config["env_config"] = {"local_ratio": 0.5}
    def env_creator():
        if args.game.__package__.endswith('atari'):
            if (args.game_name.startswith('foozpong') or
                args.game_name.startswith('basketball_pong') or
                args.game_name.startswith('volleyball_pong')
                ):
                env = args.game.env(obs_type=args.atari_obs_type,
                                    max_cycles=args.max_steps['atari'],
                                    full_action_space=False,
                                    num_players=2)
            else:
                env = args.game.env(obs_type=args.atari_obs_type,
                                    full_action_space=False,
                                    max_cycles=args.max_steps['atari'])
            env = frame_skip_v0(env, args.atari_frame_skip_num)
            env = frame_stack_v1(env, args.atari_frame_stack_num)

        else:
            env = args.game.env()
        if args.game_name.startswith('rps'):
            env = one_hot_obs_wrapper(env)
        env = dtype_v0(env, dtype=float32)
        env = pad_observations_v0(env)
        env = pad_action_space_v0(env)
        if args.game_name.startswith('connect_four') or args.game_name.startswith('tictactoe'):
            env = FlattenEnvWrapper(env)
        GAUSSIAN_STD = 1.0
        assert abs(GAUSSIAN_STD - 1.0) < 1e-5, "must be 1.0, otherwise simple ensemble implementation is wrong"
        env = LatentGaussianAugmentedEnvWrapper(env,
                                                latent_parameter_dim=args.latent_para_dim,
                                                gaussian_std=1.0,
                                                use_dict_obs_space=args.use_dict_obs_space)
        return env

    # 3. Register env, and get trainer_class
    register_env(args.game_name,
                 lambda config: PettingZooEnv(env_creator()))
    trainer_class = get_agent_class(args.alg_name)

    # 4. Extract space dimensions
    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    agents_id = test_env.agents
    print(f"obs_space: {obs_space}; act_space: {act_space}")

    # 5. Configuration for multiagent setup:
    config["framework"] = "torch"
    config["num_gpus"] = 0
    config["log_level"] = "INFO"
    config["num_workers"] = args.num_cpus // 2
    config["num_cpus_per_worker"] = 1
    config['num_envs_per_worker'] = 5
    # Fragment length, collected at once from each worker and for each agent!
    config["rollout_fragment_length"] = 100
    # Training batch size -> Fragments are concatenated up to this point.
    config["train_batch_size"] = 2000
    config["sgd_minibatch_size"] = 256
    config["entropy_coeff"] = 0.01
    config["lambda"] = 0.9
    config["vf_clip_param"] = 50
    config["num_sgd_iter"] = 10
    # After n steps, force reset simulation
    config["horizon"] = args.max_steps[args.game_type]
    # Default: False
    config["no_done_at_end"] = False
    # Info: If False, each agent's trajectory is expected to have at most
    # one done=True, in the last step of the trajectory.
    # If no_done_at_end=True, the environment is not reset
    # when dones["__all__"] is True.
    config['ignore_worker_failures'] = True

    def get_main_and_test_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any],
                                                                Dict[str, Any]]:

        main_policies = {}
        for i, agent_id in enumerate(agents_id):
            for j in range(1):
                main_policies[f'{agent_id}_{j}'] = (PPOTorchPolicy,
                                                    obs_space,
                                                    act_space,
                                                    {"framework": "torch"})
        test_policies = {
                'test_' + agent_id: (PPOTorchPolicy, obs_space, act_space, {"framework": "torch"})
                for agent_id in agents_id if is_adversary(agent_id)
                        }
        policies = {**main_policies, **test_policies}

        main_config, test_config = deepcopy(config), deepcopy(config)

        main_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: f'{agent_id}_{0}',
            "policies_to_train": list(main_policies.keys())
        }

        def test_config_policy_mapping(agent_id: str) -> str:
            if is_adversary(agent_id):
                return 'test_' + agent_id
            return f'{agent_id}_{0}'

        test_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": test_config_policy_mapping,
            "policies_to_train": list(test_policies.keys())
        }
        return main_config, test_config

    def get_simple_ensemble_training_config(config: Dict[str, Any], ensemble_size: int=3) -> Tuple[Dict[str, Any],
                                                                             Dict[str, Any]]:
        if ensemble_size > 1:
            config["model"] = {
                    "custom_model": "SimpleEnsembleActorCriticNet",
                    "custom_model_config": {
                                            "use_dict_obs_space": args.use_dict_obs_space,
                                            'ensemble_size': ensemble_size
                                            }
                            }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    def get_implicit_ensemble_training_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any],
                                                                               Dict[str, Any]]:
        config["model"] = {
                "custom_model": "SoftModularActorCriticNet",
                "custom_model_config": {
                                        "use_latent_embedding": args.use_latent_embedding,
                                        "use_dict_obs_space": args.use_dict_obs_space,
                                        "base_type": MLPBase,
                                        "em_input_shape": args.latent_para_dim,
                                        "emb_shaping_net_hidden_shapes": args.emb_shaping_net_hidden_shapes,
                                        'emb_shaping_net_last_softmax': args.emb_shaping_net_last_softmax,
                                        'em_hidden_shapes': [args.soft_modular_net_hidden_dim,
                                                             args.soft_modular_net_hidden_dim], #[400],
                                        'hidden_shapes': [args.soft_modular_net_hidden_dim,
                                                          args.soft_modular_net_hidden_dim], #[400, 400],
                                        'num_layers': args.soft_modular_net_num_layers, #4,
                                        'num_modules': args.soft_modular_net_num_modules, #4,
                                        'module_hidden': args.soft_modular_net_hidden_dim, #128,
                                        'gating_hidden': args.soft_modular_net_hidden_dim, #256,
                                        'num_gating_layers': 2,  # with 1 gating layer, 500 steps works for simple_spread
                                        'add_bn': False,
                                        }
                        }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    if args.train_setting == 'single_policy':
        main_config, test_config = get_simple_ensemble_training_config(config, ensemble_size=1)
    elif args.train_setting == 'simple_ensemble':
        main_config, test_config = get_simple_ensemble_training_config(config, ensemble_size=3)
    else:
        assert args.train_setting == 'implicit_ensemble'
        main_config, test_config = get_implicit_ensemble_training_config(config)

    return trainer_class, test_env, main_config, test_config
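A sketch of how the returned tuple might be consumed; the train/checkpoint wiring below is an assumption, not code from the original:

# Hypothetical caller of get_config.
trainer_class, test_env, main_config, test_config = get_config(args)

trainer = trainer_class(env=args.game_name, config=main_config)
for _ in range(10):
    result = trainer.train()
    print(result["episode_reward_mean"])
checkpoint_path = trainer.save()  # restore later with trainer.restore(checkpoint_path)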
Example #13
if __name__ == "__main__":
    alg_name = "DQN"
    ModelCatalog.register_custom_model("pa_model", TorchMaskedActions)

    # function that outputs the environment you wish to register.


    def env_creator():
        env = leduc_holdem_v4.env()
        return env

    num_cpus = 1

    config = deepcopy(get_agent_class(alg_name)._default_config)

    register_env("leduc_holdem", lambda config: PettingZooEnv(env_creator()))

    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    print(obs_space)
    act_space = test_env.action_space

    config["multiagent"] = {
        "policies": {
            "player_0": (None, obs_space, act_space, {}),
            "player_1": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: agent_id
    }

    config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
    "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward",
    type=float,
    default=1000.0,
    help="Reward at which we stop training.",
)


def env_creator(args):
    env = rps_v2.env()
    return env


register_env("RockPaperScissors", lambda config: PettingZooEnv(env_creator(config)))


def run_same_policy(args, stop):
    """Use the same policy for both agents (trivial case)."""
    config = {
        "env": "RockPaperScissors",
        "framework": args.framework,
    }

    results = tune.run("PG", config=config, stop=stop, verbose=1)

    if args.as_test:
        # Check vs 0.0 as we are playing a zero-sum game.
        check_learning_achieved(results, 0.0)
Example #15
from ray.rllib.agents.a3c.a3c import A3CTrainer
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.env import PettingZooEnv
from ray.tune.registry import register_env
from gym import spaces
import numpy as np
import sumo_rl
import traci


if __name__ == '__main__':
    ray.init()

    register_env("4x4grid", lambda _: PettingZooEnv(sumo_rl.env(net_file='nets/4x4-Lucas/4x4.net.xml',
                                                    route_file='nets/4x4-Lucas/4x4c1c2c1c2.rou.xml',
                                                    out_csv_name='outputs/4x4grid/a3c',
                                                    use_gui=False,
                                                    num_seconds=80000)))

    trainer = A3CTrainer(env="4x4grid", config={
        "multiagent": {
            "policies": {
                '0': (A3CTFPolicy, spaces.Box(low=np.zeros(11), high=np.ones(11)), spaces.Discrete(2), {})
            },
            "policy_mapping_fn": (lambda id: '0')  # Traffic lights are always controlled by this policy
        },
        "lr": 0.001,
        "no_done_at_end": True
    })
    while True:
        print(trainer.train())  # distributed training step
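The while True loop never terminates or checkpoints; a bounded alternative (iteration count and checkpoint interval are arbitrary choices):

    # Hypothetical bounded replacement for the loop above.
    for i in range(1, 101):
        result = trainer.train()
        print(result["episode_reward_mean"])
        if i % 10 == 0:
            print("Checkpoint saved at", trainer.save())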