Example #1
    def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
        """Test whether an APEX-DQNTrainer can be built on all frameworks."""
        config = apex.APEX_DEFAULT_CONFIG.copy()
        config["num_workers"] = 3
        config["prioritized_replay"] = True
        config["timesteps_per_iteration"] = 100
        config["min_iter_time_s"] = 1
        config["optimizer"]["num_replay_buffer_shards"] = 1

        for _ in framework_iterator(config, ("torch", "tf", "eager")):
            plain_config = config.copy()
            trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")

            # Test per-worker epsilon distribution.
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_info())
            expected = [0.4, 0.016190862, 0.00065536]
            check([i["cur_epsilon"] for i in infos], [0.0] + expected)

            check_compute_action(trainer)

            # TODO(ekl) fix iterator metrics bugs w/multiple trainers.
            # for i in range(1):
            #     results = trainer.train()
            #     print(results)

            # Test again per-worker epsilon distribution
            # (should not have changed).
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_info())
            check([i["cur_epsilon"] for i in infos], [0.0] + expected)

            trainer.stop()
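
The `expected` values above appear to follow Ape-X's per-worker exploration setup: each of the N remote workers holds a constant epsilon, while the local worker reports 0.0 in the check. A minimal standalone sketch (not RLlib code), assuming remote worker i (1-based) of N gets 0.4 ** (1 + 7 * (i - 1) / (N - 1)):

# Sketch only: reproduce the per-worker epsilon values checked above, under
# the assumed formula eps_i = 0.4 ** (1 + 7 * (i - 1) / (N - 1)).
num_workers = 3
per_worker_eps = [
    0.4 ** (1 + 7 * (i - 1) / (num_workers - 1))
    for i in range(1, num_workers + 1)
]
print([0.0] + per_worker_eps)
# Approximately [0.0, 0.4, 0.016190862, 0.00065536], i.e. [0.0] + expected.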
Example #2
    def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
        """Test whether an APEX-DQNTrainer can be built on all frameworks."""
        config = apex.APEX_DEFAULT_CONFIG.copy()
        config["num_workers"] = 3
        config["num_gpus"] = 0
        config["learning_starts"] = 1000
        config["prioritized_replay"] = True
        config["timesteps_per_iteration"] = 100
        config["min_time_s_per_reporting"] = 1
        config["optimizer"]["num_replay_buffer_shards"] = 1

        for _ in framework_iterator(config, with_eager_tracing=True):
            plain_config = config.copy()
            trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")

            # Test per-worker epsilon distribution.
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_state())
            expected = [0.4, 0.016190862, 0.00065536]
            check([i["cur_epsilon"] for i in infos], [0.0] + expected)

            check_compute_single_action(trainer)

            for i in range(2):
                results = trainer.train()
                check_train_results(results)
                print(results)

            # Test again per-worker epsilon distribution
            # (should not have changed).
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_state())
            check([i["cur_epsilon"] for i in infos], [0.0] + expected)

            trainer.stop()
Example #3
File: test_apex_dqn.py Project: rlan/ray
    def test_apex_lr_schedule(self):
        config = apex.APEX_DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["num_gpus"] = 0
        config["buffer_size"] = 100
        config["learning_starts"] = 10
        config["train_batch_size"] = 10
        config["rollout_fragment_length"] = 5
        config["prioritized_replay"] = True
        config["timesteps_per_iteration"] = 10
        # Zero metrics reporting delay. This makes sure the timestep,
        # which the lr schedule depends on, is updated after each worker
        # rollout.
        config["min_iter_time_s"] = 0
        config["optimizer"]["num_replay_buffer_shards"] = 1
        # This makes sure the learning schedule is checked every 10 timesteps.
        config["optimizer"]["max_weight_sync_delay"] = 10
        # Initial lr; it doesn't really matter because of the schedule below.
        config["lr"] = 0.2
        lr_schedule = [
            [0, 0.2],
            [50, 0.1],
            [100, 0.01],
            [150, 0.001],
        ]
        config["lr_schedule"] = lr_schedule

        def _step_n_times(trainer, n: int):
            """Step trainer n times.

            Returns:
                learning rate at the end of the execution.
            """
            for _ in range(n):
                results = trainer.train()
            return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
                LEARNER_STATS_KEY]["cur_lr"]

        # Check eager execution frameworks here, since it's easier to control
        # exact timesteps with these frameworks.
        for _ in framework_iterator(config):
            trainer = apex.ApexTrainer(config=config, env="CartPole-v0")

            lr = _step_n_times(trainer, 5)  # 50 timesteps
            # PiecewiseSchedule does interpolation. So roughly 0.1 here.
            self.assertLessEqual(lr, 0.15)
            self.assertGreaterEqual(lr, 0.04)

            lr = _step_n_times(trainer, 5)  # 100 timesteps
            # PiecewiseSchedule does interpolation. So roughly 0.01 here.
            self.assertLessEqual(lr, 0.02)
            self.assertGreaterEqual(lr, 0.004)

            lr = _step_n_times(trainer, 5)  # 150 timesteps
            # PiecewiseSchedule does interpolation. So roughly 0.001 here.
            self.assertLessEqual(lr, 0.002)
            self.assertGreaterEqual(lr, 0.0004)

            trainer.stop()
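
The assertion bands in this test come from `lr_schedule` being evaluated as a piecewise-linear schedule over timesteps: between two schedule points the learning rate is linearly interpolated, and past the last point it holds the final value. A small standalone sketch of that interpolation (a hypothetical `piecewise_lr` helper, not RLlib's `PiecewiseSchedule`), evaluated at the timesteps the test roughly targets:

# Sketch only: piecewise-linear interpolation over (timestep, value) points,
# mirroring how an lr_schedule like the one above is evaluated.
def piecewise_lr(schedule, t):
    for (t0, v0), (t1, v1) in zip(schedule, schedule[1:]):
        if t0 <= t < t1:
            return v0 + (t - t0) / (t1 - t0) * (v1 - v0)
    return schedule[-1][1]  # Past the last point: hold the final value.

schedule = [[0, 0.2], [50, 0.1], [100, 0.01], [150, 0.001]]
print([piecewise_lr(schedule, t) for t in (50, 100, 150)])
# [0.1, 0.01, 0.001] at exactly those timesteps; the test asserts loose
# bounds because the trainer's actual timestep count is only approximate.

The same logic explains the two-point schedules in Examples #4 and #9 below: at t=10, [[0, 0.2], [100, 0.001]] interpolates to roughly 0.18 (hence lr >= 0.1), and from t=100 on it stays at 0.001.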
Example #4
    def test_apex_lr_schedule(self):
        config = apex.APEX_DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["num_gpus"] = 0
        config["learning_starts"] = 10
        config["train_batch_size"] = 10
        config["rollout_fragment_length"] = 5
        config["replay_buffer_config"] = {
            # For now we don't use the new ReplayBuffer API here
            "_enable_replay_buffer_api": False,
            "no_local_replay_buffer": True,
            "type": "MultiAgentReplayBuffer",
            "capacity": 100,
            "replay_batch_size": 10,
            "prioritized_replay_alpha": 0.6,
            # Beta parameter for sampling from prioritized replay buffer.
            "prioritized_replay_beta": 0.4,
            # Epsilon to add to the TD errors when updating priorities.
            "prioritized_replay_eps": 1e-6,
        }
        config["timesteps_per_iteration"] = 10
        # Zero metrics reporting delay. This makes sure the timestep,
        # which the lr schedule depends on, is updated after each worker
        # rollout.
        config["min_time_s_per_reporting"] = 0
        config["optimizer"]["num_replay_buffer_shards"] = 1
        # This makes sure the learning schedule is checked every 10 timesteps.
        config["optimizer"]["max_weight_sync_delay"] = 10
        # Initial lr; it doesn't really matter because of the schedule below.
        config["lr"] = 0.2
        lr_schedule = [
            [0, 0.2],
            [100, 0.001],
        ]
        config["lr_schedule"] = lr_schedule

        def _step_n_times(trainer, n: int):
            """Step trainer n times.

            Returns:
                learning rate at the end of the execution.
            """
            for _ in range(n):
                results = trainer.train()
            return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
                LEARNER_STATS_KEY]["cur_lr"]

        for _ in framework_iterator(config):
            trainer = apex.ApexTrainer(config=config, env="CartPole-v0")

            lr = _step_n_times(trainer, 5)  # 10 timesteps
            # Close to 0.2
            self.assertGreaterEqual(lr, 0.1)

            lr = _step_n_times(trainer, 20)  # 200 timesteps
            # LR Annealed to 0.001
            self.assertLessEqual(lr, 0.0011)

            trainer.stop()
Example #5
 def test_apex_epsilon_distribution(self):
     config = apex.APEX_DEFAULT_CONFIG.copy()
     config["num_workers"] = 3
     config["optimizer"]["num_replay_buffer_shards"] = 1
     trainer = apex.ApexTrainer(config, env="CartPole-v0")
     infos = trainer.workers.foreach_policy(
         lambda p, _: p.get_exploration_info())
     eps = [i["cur_epsilon"] for i in infos]
     assert np.allclose(eps, [1.0, 0.016190862, 0.00065536, 2.6527108e-05])
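
The asserted list here differs from Examples #1 and #2: the local worker reports 1.0, and the remote values match 0.4 raised to 1 + 7 * i / (N - 1) with the worker index i starting at 1 (an older indexing of the same per-worker scheme; this is inferred from the numbers, not taken from RLlib source). A minimal sketch reproducing it:

# Sketch only: reproduce the epsilon list asserted above, under the assumed
# formula eps_i = 0.4 ** (1 + 7 * i / (N - 1)) for remote workers i = 1..N,
# with the local worker kept at 1.0.
num_workers = 3
eps = [1.0] + [
    0.4 ** (1 + 7 * i / (num_workers - 1)) for i in range(1, num_workers + 1)
]
print(eps)  # Approximately [1.0, 0.016190862, 0.00065536, 2.6527108e-05].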
Example #6
 def test_apex_zero_workers(self):
     config = apex.APEX_DEFAULT_CONFIG.copy()
     config["num_workers"] = 0
     config["prioritized_replay"] = True
     config["timesteps_per_iteration"] = 100
     config["min_iter_time_s"] = 1
     config["optimizer"]["num_replay_buffer_shards"] = 1
     trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
     trainer.train()
     trainer.stop()
Example #7
 def test_apex_zero_workers(self):
     config = apex.APEX_DEFAULT_CONFIG.copy()
     config["num_workers"] = 0
     config["learning_starts"] = 1000
     config["prioritized_replay"] = True
     config["timesteps_per_iteration"] = 100
     config["min_iter_time_s"] = 1
     config["optimizer"]["num_replay_buffer_shards"] = 1
     for _ in framework_iterator(config, frameworks=("torch", "tf")):
         trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
         trainer.train()
         trainer.stop()
Example #8
 def test_apex_zero_workers(self):
     config = apex.APEX_DEFAULT_CONFIG.copy()
     config["num_workers"] = 0
     config["num_gpus"] = 0
     config["learning_starts"] = 1000
     config["prioritized_replay"] = True
     config["timesteps_per_iteration"] = 100
     config["min_time_s_per_reporting"] = 1
     config["optimizer"]["num_replay_buffer_shards"] = 1
     for _ in framework_iterator(config):
         trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
         results = trainer.train()
         check_train_results(results)
         print(results)
         trainer.stop()
Example #9
    def test_apex_lr_schedule(self):
        config = apex.APEX_DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["num_gpus"] = 0
        config["buffer_size"] = 100
        config["learning_starts"] = 10
        config["train_batch_size"] = 10
        config["rollout_fragment_length"] = 5
        config["prioritized_replay"] = True
        config["timesteps_per_iteration"] = 10
        # Zero metrics reporting delay. This makes sure the timestep,
        # which the lr schedule depends on, is updated after each worker
        # rollout.
        config["min_time_s_per_reporting"] = 0
        config["optimizer"]["num_replay_buffer_shards"] = 1
        # This makes sure the learning schedule is checked every 10 timesteps.
        config["optimizer"]["max_weight_sync_delay"] = 10
        # Initial lr; it doesn't really matter because of the schedule below.
        config["lr"] = 0.2
        lr_schedule = [
            [0, 0.2],
            [100, 0.001],
        ]
        config["lr_schedule"] = lr_schedule

        def _step_n_times(trainer, n: int):
            """Step trainer n times.

            Returns:
                learning rate at the end of the execution.
            """
            for _ in range(n):
                results = trainer.train()
            return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
                LEARNER_STATS_KEY]["cur_lr"]

        for _ in framework_iterator(config):
            trainer = apex.ApexTrainer(config=config, env="CartPole-v0")

            lr = _step_n_times(trainer, 1)  # 10 timesteps
            # Close to 0.2
            self.assertGreaterEqual(lr, 0.1)

            lr = _step_n_times(trainer, 20)  # 200 timesteps
            # LR Annealed to 0.001
            self.assertLessEqual(lr, 0.0011)

            trainer.stop()
Example #10
 def __init__(self):
     ModelCatalog.register_custom_model("my_model", RayTFModel)
     ray.init()
     self.max_step = 10
     single_env = MultiStepsMultiAgentGymMinerEnv({'max_step': self.max_step})
     obs_space = single_env.observation_space
     act_space = single_env.action_space
     self.config = apex.APEX_DEFAULT_CONFIG.copy()
     self.config = {
         **self.config,
         'num_workers': 0,
         'in_evaluation': True,
         'multiagent': {
             'policies': {
                 'dqn1': (None, obs_space, act_space, {
                     'model': {
                         'custom_model': 'my_model',
                         'custom_model_config': {
                             'max_step': self.max_step,
                             'embedding_size': 8,
                             'conv1_filters': [
                                 (64, 4, 1),
                                 (128, 2, 2),
                                 (256, 2, 2),
                             ],
                         }
                     },
                     'gamma': 0.99,
                 }),
                 'random': (RandomPolicy, obs_space, act_space, {}),
             },
             'policy_mapping_fn': lambda agent_id: 'dqn1',
         },
         'env_config': {
             'max_step': self.max_step,
         },
     }
     self.agent = apex.ApexTrainer(
         env=MultiStepsMultiAgentGymMinerEnv, config=self.config)
     self.agent.restore('checkpoints/checkpoint_740/checkpoint-740')
     self.step_states = []
Example #11
File: test_apex.py Project: wjykl22/ray
    def test_apex_compilation_and_per_worker_epsilon_values(self):
        """Test whether an APEX-DQNTrainer can be built on all frameworks."""
        config = apex.APEX_DEFAULT_CONFIG.copy()
        config["num_workers"] = 3
        config["prioritized_replay"] = True
        config["optimizer"]["num_replay_buffer_shards"] = 1
        num_iterations = 1

        for _ in framework_iterator(config, ("torch", "tf", "eager")):
            plain_config = config.copy()
            trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")

            # Test per-worker epsilon distribution.
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_info())
            eps = [i["cur_epsilon"] for i in infos]
            assert np.allclose(eps,
                               [1.0, 0.016190862, 0.00065536, 2.6527108e-05])

            for i in range(num_iterations):
                results = trainer.train()
                print(results)
Example #12
 def __init__(self):
     ModelCatalog.register_custom_model("my_model", RayTFModel)
     ray.init()
     single_env = MultiAgentGymMinerEnv({})
     obs_space = single_env.observation_space
     act_space = single_env.action_space
     self.config = apex.APEX_DEFAULT_CONFIG.copy()
     self.config['num_workers'] = 0
     self.config['in_evaluation'] = True
     self.config['multiagent'] = {
         'policies': {
             'dqn1': (None, obs_space, act_space, {
                 'model': {
                     'custom_model': 'my_model',
                 },
                 'gamma': 0.99,
             }),
             'random': (RandomPolicy, obs_space, act_space, {}),
         },
         'policy_mapping_fn': lambda agent_id: 'dqn1',
     }
     self.agent = apex.ApexTrainer(env=MultiAgentGymMinerEnv,
                                   config=self.config)
     self.agent.restore('checkpoints/checkpoint_1240/checkpoint-1240')
Example #13
class MLPModelV2(TFModelV2):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)
        # Simplified to one layer.
        input = tf.keras.layers.Input(obs_space.shape, dtype=obs_space.dtype)
        output = tf.keras.layers.Dense(num_outputs, activation=None)
        self.base_model = tf.keras.models.Sequential([input, output])
        self.register_variables(self.base_model.variables)
    def forward(self, input_dict, state, seq_lens):
        return self.base_model(input_dict["obs"]), []

ModelCatalog.register_custom_model("MLPModel", MLPModel)
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'ADQN':
    RLAgent = adqn.ApexTrainer(env=env_name, config=config)
elif algorithm == 'DQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'RDQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs,))

for j in range(num_runs):
    observations = env.reset()
Example #14
import random
import ray
from ray.rllib.agents.dqn import apex
from ray.rllib.models import ModelCatalog
from mlflow import log_metric, log_param, log_artifacts

from src.envs.multi_steps_multi_agents_env import MultiStepsMultiAgentsEnv as env
from src.models.ray_tf_model_v2 import RayTFModel
from configs.v1 import config

if __name__ == "__main__":
    ModelCatalog.register_custom_model("my_model", RayTFModel)

    ray.init()

    agent = apex.ApexTrainer(env=env, config=config)

    for n in range(1000):
        result = agent.train()
        print(
            f'Step {n} - episode_reward_mean: {result["episode_reward_mean"]}')
        print(result['policy_reward_mean'])
        if (n + 1) % 20 == 0:
            agent.save('models/v1')

    ray.shutdown()
Example #15
    config = build_training_config(args)
    log_config = build_log_config(args, config)
    setup_wandb(args, log_config)

    # Now we set up our custom wrapper.
    registry.register_env(
        "wrapped_pacman_env",
        lambda config: build_env()
    )

    # And the custom model.
    ModelCatalog.register_custom_model("pacnet", PacNet)

    # Start ray and load a training instance.
    ray.init()
    trainer = apex.ApexTrainer(config=config, env="wrapped_pacman_env")

    # Find the new folder and make sure we can upload videos
    base_dir = '/home/ubuntu/ray_results/'
    expdir = max([base_dir + d for d in os.listdir(base_dir)], key=os.path.getmtime)
    print("Exp dir detected: {}".format(expdir))
    
    # Begin training
    timesteps = 0 
    for i in range(args.iterations):
        start_time = time.time()

        result = trainer.train()
        print("Finished iter {}".format(i), result)

        elapsed_time = time.time() - start_time