def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
    """Test whether an APEX-DQNTrainer can be built on all frameworks."""
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 3
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 100
    config["min_iter_time_s"] = 1
    config["optimizer"]["num_replay_buffer_shards"] = 1

    for _ in framework_iterator(config, ("torch", "tf", "eager")):
        plain_config = config.copy()
        trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")

        # Test per-worker epsilon distribution.
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_info())
        expected = [0.4, 0.016190862, 0.00065536]
        check([i["cur_epsilon"] for i in infos], [0.0] + expected)

        check_compute_action(trainer)

        # TODO(ekl) fix iterator metrics bugs w/multiple trainers.
        # for i in range(1):
        #     results = trainer.train()
        #     print(results)

        # Test again per-worker epsilon distribution
        # (should not have changed).
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_info())
        check([i["cur_epsilon"] for i in infos], [0.0] + expected)

        trainer.stop()

def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
    """Test whether an APEX-DQNTrainer can be built on all frameworks."""
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 3
    config["num_gpus"] = 0
    config["learning_starts"] = 1000
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 100
    config["min_time_s_per_reporting"] = 1
    config["optimizer"]["num_replay_buffer_shards"] = 1

    for _ in framework_iterator(config, with_eager_tracing=True):
        plain_config = config.copy()
        trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")

        # Test per-worker epsilon distribution.
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state())
        expected = [0.4, 0.016190862, 0.00065536]
        check([i["cur_epsilon"] for i in infos], [0.0] + expected)

        check_compute_single_action(trainer)

        for i in range(2):
            results = trainer.train()
            check_train_results(results)
            print(results)

        # Test again per-worker epsilon distribution
        # (should not have changed).
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state())
        check([i["cur_epsilon"] for i in infos], [0.0] + expected)

        trainer.stop()

def test_apex_lr_schedule(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["num_gpus"] = 0
    config["buffer_size"] = 100
    config["learning_starts"] = 10
    config["train_batch_size"] = 10
    config["rollout_fragment_length"] = 5
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 10
    # Use a 0 metrics reporting delay. This makes sure the timestep,
    # which the lr depends on, is updated after each worker rollout.
    config["min_iter_time_s"] = 0
    config["optimizer"]["num_replay_buffer_shards"] = 1
    # This makes sure the learning schedule is checked every 10 timesteps.
    config["optimizer"]["max_weight_sync_delay"] = 10
    # Initial lr. Doesn't really matter because of the schedule below.
    config["lr"] = 0.2
    lr_schedule = [
        [0, 0.2],
        [50, 0.1],
        [100, 0.01],
        [150, 0.001],
    ]
    config["lr_schedule"] = lr_schedule

    def _step_n_times(trainer, n: int):
        """Step trainer n times.

        Returns:
            learning rate at the end of the execution.
        """
        for _ in range(n):
            results = trainer.train()
        return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]["cur_lr"]

    # Check eager execution frameworks here, since it's easier to control
    # exact timesteps with these frameworks.
    for _ in framework_iterator(config):
        trainer = apex.ApexTrainer(config=config, env="CartPole-v0")

        lr = _step_n_times(trainer, 5)  # 50 timesteps
        # PiecewiseSchedule does interpolation, so roughly 0.1 here.
        self.assertLessEqual(lr, 0.15)
        self.assertGreaterEqual(lr, 0.04)

        lr = _step_n_times(trainer, 5)  # 100 timesteps
        # PiecewiseSchedule does interpolation, so roughly 0.01 here.
        self.assertLessEqual(lr, 0.02)
        self.assertGreaterEqual(lr, 0.004)

        lr = _step_n_times(trainer, 5)  # 150 timesteps
        # PiecewiseSchedule does interpolation, so roughly 0.001 here.
        self.assertLessEqual(lr, 0.002)
        self.assertGreaterEqual(lr, 0.0004)

        trainer.stop()

def test_apex_lr_schedule(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["num_gpus"] = 0
    config["learning_starts"] = 10
    config["train_batch_size"] = 10
    config["rollout_fragment_length"] = 5
    config["replay_buffer_config"] = {
        # For now we don't use the new ReplayBuffer API here.
        "_enable_replay_buffer_api": False,
        "no_local_replay_buffer": True,
        "type": "MultiAgentReplayBuffer",
        "capacity": 100,
        "replay_batch_size": 10,
        # Alpha parameter for prioritized replay buffer.
        "prioritized_replay_alpha": 0.6,
        # Beta parameter for sampling from prioritized replay buffer.
        "prioritized_replay_beta": 0.4,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 1e-6,
    }
    config["timesteps_per_iteration"] = 10
    # Use a 0 metrics reporting delay. This makes sure the timestep,
    # which the lr depends on, is updated after each worker rollout.
    config["min_time_s_per_reporting"] = 0
    config["optimizer"]["num_replay_buffer_shards"] = 1
    # This makes sure the learning schedule is checked every 10 timesteps.
    config["optimizer"]["max_weight_sync_delay"] = 10
    # Initial lr. Doesn't really matter because of the schedule below.
    config["lr"] = 0.2
    lr_schedule = [
        [0, 0.2],
        [100, 0.001],
    ]
    config["lr_schedule"] = lr_schedule

    def _step_n_times(trainer, n: int):
        """Step trainer n times.

        Returns:
            learning rate at the end of the execution.
        """
        for _ in range(n):
            results = trainer.train()
        return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]["cur_lr"]

    for _ in framework_iterator(config):
        trainer = apex.ApexTrainer(config=config, env="CartPole-v0")

        lr = _step_n_times(trainer, 5)  # ~50 timesteps
        # Roughly halfway through the schedule, so lr is still >= 0.1.
        self.assertGreaterEqual(lr, 0.1)

        lr = _step_n_times(trainer, 20)  # ~250 timesteps total
        # LR annealed all the way down to 0.001.
        self.assertLessEqual(lr, 0.0011)

        trainer.stop()

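# Where the asserted lr bounds in the schedule tests come from: RLlib
# compiles `lr_schedule` into a PiecewiseSchedule that interpolates
# linearly between the given [timestep, lr] points and holds the last
# value afterwards. A minimal sketch of that arithmetic, assuming plain
# linear interpolation; `piecewise_lr` is a hypothetical helper, not
# RLlib's implementation:
def piecewise_lr(t, points):
    # points: list of [timestep, lr] pairs, sorted by timestep.
    for (t0, v0), (t1, v1) in zip(points, points[1:]):
        if t0 <= t < t1:
            return v0 + (t - t0) / (t1 - t0) * (v1 - v0)
    # Past the last point, the schedule stays at its final value.
    return points[-1][1]

schedule = [[0, 0.2], [100, 0.001]]
assert abs(piecewise_lr(50, schedule) - 0.1005) < 1e-9  # still >= 0.1
assert piecewise_lr(250, schedule) == 0.001             # fully annealed
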
def test_apex_epsilon_distribution(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 3
    config["optimizer"]["num_replay_buffer_shards"] = 1

    trainer = apex.ApexTrainer(config, env="CartPole-v0")
    infos = trainer.workers.foreach_policy(
        lambda p, _: p.get_exploration_info())
    eps = [i["cur_epsilon"] for i in infos]
    assert np.allclose(eps, [1.0, 0.016190862, 0.00065536, 2.6527108e-05])

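# Where the hard-coded epsilon constants in the tests above come from:
# Ape-X (Horgan et al., 2018) gives each rollout worker its own
# exploration epsilon via epsilon_i = 0.4 ** (1 + i * 7 / (N - 1)).
# A minimal sketch verifying the expected values; `per_worker_epsilon`
# is a hypothetical helper, not an RLlib API. (The first list entry is
# the local worker, whose epsilon differs by RLlib version: 0.0 or 1.0.)
def per_worker_epsilon(i, num_workers, base=0.4, alpha=7.0):
    # Exponent grows linearly with the worker's index.
    return base ** (1 + i * alpha / (num_workers - 1))

# This test's expected worker values use indices 1..3:
assert all(
    abs(per_worker_epsilon(i, 3) - e) < 1e-8
    for i, e in [(1, 0.016190862), (2, 0.00065536), (3, 2.6527108e-05)])
# The compilation tests further up use indices 0..2 instead:
assert all(
    abs(per_worker_epsilon(i, 3) - e) < 1e-8
    for i, e in [(0, 0.4), (1, 0.016190862), (2, 0.00065536)])
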
def test_apex_zero_workers(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 100
    config["min_iter_time_s"] = 1
    config["optimizer"]["num_replay_buffer_shards"] = 1

    trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
    trainer.train()
    trainer.stop()

def test_apex_zero_workers(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["learning_starts"] = 1000
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 100
    config["min_iter_time_s"] = 1
    config["optimizer"]["num_replay_buffer_shards"] = 1

    for _ in framework_iterator(config, frameworks=("torch", "tf")):
        trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
        trainer.train()
        trainer.stop()

def test_apex_zero_workers(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["num_gpus"] = 0
    config["learning_starts"] = 1000
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 100
    config["min_time_s_per_reporting"] = 1
    config["optimizer"]["num_replay_buffer_shards"] = 1

    for _ in framework_iterator(config):
        trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
        results = trainer.train()
        check_train_results(results)
        print(results)
        trainer.stop()

def test_apex_lr_schedule(self):
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["num_gpus"] = 0
    config["buffer_size"] = 100
    config["learning_starts"] = 10
    config["train_batch_size"] = 10
    config["rollout_fragment_length"] = 5
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 10
    # Use a 0 metrics reporting delay. This makes sure the timestep,
    # which the lr depends on, is updated after each worker rollout.
    config["min_time_s_per_reporting"] = 0
    config["optimizer"]["num_replay_buffer_shards"] = 1
    # This makes sure the learning schedule is checked every 10 timesteps.
    config["optimizer"]["max_weight_sync_delay"] = 10
    # Initial lr. Doesn't really matter because of the schedule below.
    config["lr"] = 0.2
    lr_schedule = [
        [0, 0.2],
        [100, 0.001],
    ]
    config["lr_schedule"] = lr_schedule

    def _step_n_times(trainer, n: int):
        """Step trainer n times.

        Returns:
            learning rate at the end of the execution.
        """
        for _ in range(n):
            results = trainer.train()
        return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]["cur_lr"]

    for _ in framework_iterator(config):
        trainer = apex.ApexTrainer(config=config, env="CartPole-v0")

        lr = _step_n_times(trainer, 1)  # ~10 timesteps
        # Still close to 0.2.
        self.assertGreaterEqual(lr, 0.1)

        lr = _step_n_times(trainer, 20)  # ~210 timesteps total
        # LR annealed all the way down to 0.001.
        self.assertLessEqual(lr, 0.0011)

        trainer.stop()

def __init__(self):
    ModelCatalog.register_custom_model("my_model", RayTFModel)

    ray.init()

    self.max_step = 10
    single_env = MultiStepsMultiAgentGymMinerEnv(
        {'max_step': self.max_step})
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    self.config = apex.APEX_DEFAULT_CONFIG.copy()
    self.config = {
        **self.config,
        'num_workers': 0,
        'in_evaluation': True,
        'multiagent': {
            'policies': {
                'dqn1': (None, obs_space, act_space, {
                    'model': {
                        'custom_model': 'my_model',
                        'custom_model_config': {
                            'max_step': self.max_step,
                            'embedding_size': 8,
                            'conv1_filters': [
                                (64, 4, 1),
                                (128, 2, 2),
                                (256, 2, 2),
                            ],
                        }
                    },
                    'gamma': 0.99,
                }),
                'random': (RandomPolicy, obs_space, act_space, {}),
            },
            'policy_mapping_fn': lambda agent_id: 'dqn1',
        },
        'env_config': {
            'max_step': self.max_step,
        },
    }
    self.agent = apex.ApexTrainer(
        env=MultiStepsMultiAgentGymMinerEnv, config=self.config)
    self.agent.restore('checkpoints/checkpoint_740/checkpoint-740')
    self.step_states = []

def test_apex_compilation_and_per_worker_epsilon_values(self):
    """Test whether an APEX-DQNTrainer can be built on all frameworks."""
    config = apex.APEX_DEFAULT_CONFIG.copy()
    config["num_workers"] = 3
    config["prioritized_replay"] = True
    config["optimizer"]["num_replay_buffer_shards"] = 1
    num_iterations = 1

    for _ in framework_iterator(config, ("torch", "tf", "eager")):
        plain_config = config.copy()
        trainer = apex.ApexTrainer(config=plain_config, env="CartPole-v0")

        # Test per-worker epsilon distribution.
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_info())
        eps = [i["cur_epsilon"] for i in infos]
        assert np.allclose(
            eps, [1.0, 0.016190862, 0.00065536, 2.6527108e-05])

        for i in range(num_iterations):
            results = trainer.train()
            print(results)

def __init__(self):
    ModelCatalog.register_custom_model("my_model", RayTFModel)

    ray.init()

    single_env = MultiAgentGymMinerEnv({})
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    self.config = apex.APEX_DEFAULT_CONFIG.copy()
    self.config['num_workers'] = 0
    self.config['in_evaluation'] = True
    self.config['multiagent'] = {
        'policies': {
            'dqn1': (None, obs_space, act_space, {
                'model': {
                    'custom_model': 'my_model',
                },
                'gamma': 0.99,
            }),
            'random': (RandomPolicy, obs_space, act_space, {}),
        },
        'policy_mapping_fn': lambda agent_id: 'dqn1',
    }
    self.agent = apex.ApexTrainer(
        env=MultiAgentGymMinerEnv, config=self.config)
    self.agent.restore('checkpoints/checkpoint_1240/checkpoint-1240')

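# A minimal sketch of how such a restored evaluation agent is typically
# queried, assuming the env yields one observation per agent id;
# `get_action` is a hypothetical method, not part of the snippet above:
def get_action(self, obs):
    # Greedy action from the restored 'dqn1' policy; explore=False
    # disables the epsilon-greedy exploration used during training.
    return self.agent.compute_action(obs, policy_id='dqn1', explore=False)
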
class MLPModel(TFModelV2):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        super(MLPModel, self).__init__(obs_space, action_space, num_outputs,
                                       model_config, name)
        # Simplified to one layer.
        input_layer = tf.keras.layers.Input(
            obs_space.shape, dtype=obs_space.dtype)
        output_layer = tf.keras.layers.Dense(num_outputs, activation=None)
        self.base_model = tf.keras.models.Sequential(
            [input_layer, output_layer])
        self.register_variables(self.base_model.variables)

    def forward(self, input_dict, state, seq_lens):
        return self.base_model(input_dict["obs"]), []


ModelCatalog.register_custom_model("MLPModel", MLPModel)
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'ADQN':
    RLAgent = adqn.ApexTrainer(env=env_name, config=config)
elif algorithm == 'DQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'RDQN':
    # Also maps to DQNTrainer (e.g., a Rainbow-style DQN config).
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs,))

for j in range(num_runs):
    observations = env.reset()

import random

import ray
from ray.rllib.agents.dqn import apex
from ray.rllib.models import ModelCatalog
from mlflow import log_metric, log_param, log_artifacts

from src.envs.multi_steps_multi_agents_env import \
    MultiStepsMultiAgentsEnv as env
from src.models.ray_tf_model_v2 import RayTFModel
from configs.v1 import config

if __name__ == "__main__":
    ModelCatalog.register_custom_model("my_model", RayTFModel)

    ray.init()

    agent = apex.ApexTrainer(env=env, config=config)
    for n in range(1000):
        result = agent.train()
        print(
            f'Step {n} - episode_reward_mean: '
            f'{result["episode_reward_mean"]}')
        print(result['policy_reward_mean'])
        if (n + 1) % 20 == 0:
            agent.save('models/v1')

    ray.shutdown()

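# The script above imports mlflow's log_metric/log_param/log_artifacts but
# never calls them. A sketch of how the training loop could report to
# mlflow instead of printing (metric names here are illustrative, not
# taken from the original script):
for n in range(1000):
    result = agent.train()
    log_metric('episode_reward_mean', result['episode_reward_mean'], step=n)
    for policy_id, reward_mean in result['policy_reward_mean'].items():
        log_metric(f'reward_mean_{policy_id}', reward_mean, step=n)
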
config = build_training_config(args)
log_config = build_log_config(args, config)
setup_wandb(args, log_config)

# Now we set up our custom wrapper.
registry.register_env(
    "wrapped_pacman_env",
    lambda config: build_env()
)

# And the custom model.
ModelCatalog.register_custom_model("pacnet", PacNet)

# Start ray and load a training instance.
ray.init()
trainer = apex.ApexTrainer(config=config, env="wrapped_pacman_env")

# Find the new results folder and make sure we can upload videos.
base_dir = '/home/ubuntu/ray_results/'
expdir = max([base_dir + d for d in os.listdir(base_dir)],
             key=os.path.getmtime)
print("Exp dir detected: {}".format(expdir))

# Begin training.
timesteps = 0
for i in range(args.iterations):
    start_time = time.time()
    result = trainer.train()
    print("Finished iter {}".format(i), result)
    elapsed_time = time.time() - start_time