def test_merge_config():
    conf = get_config()
    conf = merge_config(conf, {"conf1": 2, "conf3": 3})
    assert conf.conf1 == 2
    assert conf.conf2 == 2
    assert conf.conf3 == 3

    conf = get_config()
    conf2 = Config(conf1=2, conf3=3)
    conf = merge_config(conf, conf2)
    assert conf.conf1 == 2
    assert conf.conf2 == 2
    assert conf.conf3 == 3
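# The assertions above pin down the merge semantics exercised by the test:
# values from the override (a plain dict or another Config) win, and keys
# absent from the override keep their base values. The sketch below is
# illustrative only -- merge_config_sketch is not the library function, and
# it assumes Config stores entries as plain instance attributes.
def merge_config_sketch(base, override):
    entries = override if isinstance(override, dict) else vars(override)
    for key, value in entries.items():
        setattr(base, key, value)  # override wins, untouched keys survive
    return base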
def train_config(self):
    disable_view_window()
    c = Config()
    c.env_name = "Pendulum-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 3
    c.action_dim = 1
    c.action_range = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.noise_param = (0, 0.2)
    c.noise_mode = "normal"
    c.noise_interval = 2
    c.replay_size = 100000
    c.solved_reward = -400
    c.solved_repeat = 5
    return c
class TestIMPALA(object): # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A3C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 2000 c.max_steps = 200 c.replay_size = 10000 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def impala(device, dtype, use_lr_sch=False): c = TestIMPALA.c actor = smw( Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device) critic = smw( Critic(c.observe_dim).type(dtype).to(device), device, device) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer impala_group = world.create_rpc_group("impala", ["0", "1", "2"]) if use_lr_sch: lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=default_logger) impala = IMPALA(actor, critic, t.optim.Adam, nn.MSELoss(reduction='sum'), impala_group, servers, lr_scheduler=LambdaLR, lr_scheduler_args=((lr_func, ), (lr_func, ))) else: impala = IMPALA(actor, critic, t.optim.Adam, nn.MSELoss(reduction='sum'), impala_group, servers) return impala ######################################################################## # Test for IMPALA acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_act(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala.act({"state": state}) return True ######################################################################## # Test for IMPALA action evaluation ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_eval_action(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) impala._eval_act({"state": state}, {"action": action}) return True ######################################################################## # Test for IMPALA criticizing ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test__criticize(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala._criticize({"state": state}) return True ######################################################################## # Test for IMPALA storage ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_store_step(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) with pytest.raises(NotImplementedError): impala.store_transition({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False }) 
return True @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_store_episode(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) episode = [{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False } for _ in range(3)] impala.store_episode(episode) return True ######################################################################## # Test for IMPALA update ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_update(rank, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) if rank == 0: # episode length = 3 impala.store_episode([{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False } for _ in range(3)]) elif rank == 1: # episode length = 2 impala.store_episode([{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False } for _ in range(2)]) if rank == 2: sleep(2) impala.update(update_value=True, update_target=True, concatenate_samples=True) return True ######################################################################## # Test for IMPALA save & load ######################################################################## # Skipped, it is the same as base framework ######################################################################## # Test for IMPALA lr_scheduler ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_lr_scheduler(_, device, dtype): impala = TestIMPALA.impala(device, dtype) impala.update_lr_scheduler() return True ######################################################################## # Test for IMPALA full training. 
######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank): c = TestIMPALA.c impala = TestIMPALA.impala("cpu", t.float32) # perform manual syncing to decrease the number of rpc calls impala.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair("{}_running".format(rank), True) default_logger.info("{}, pid {}".format(rank, os.getpid())) if rank == 0: all_group.pair("episode", episode) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) impala.manual_sync() tmp_observations = [] while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action, action_log_prob, *_ = impala.act( {"state": old_state.unsqueeze(0)}) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) tmp_observations.append({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": action }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "action_log_prob": action_log_prob.item(), "terminal": terminal or step == c.max_steps }) impala.store_episode(tmp_observations) smoother.update(total_reward) step.reset() terminal = False default_logger.info("Process {} Episode {} " "total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair("{}_running".format(rank)) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples # Note: the number of entries in buffer means "episodes" # rather than steps here! while impala.replay_buffer.all_size() < 5: sleep(0.1) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): impala.update() return True raise RuntimeError("IMPALA Training failed.")
def train_config(self, gpu):
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    # maximum and minimum of reward value
    # since reward is 1 for every step, maximum q value should be
    # below 20 (reward_future_steps) * (1 + discount ** n_steps) < 40
    c.value_max = 40
    c.value_min = 0
    c.reward_future_steps = 20
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    # RAINBOW is not very stable (without dueling and noisy linear)
    # compared to other DQNs
    c.solved_reward = 180
    c.solved_repeat = 5
    c.device = gpu
    return c
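# The value_max bound in the comment above can be sanity-checked with the
# comment's own formula. The discount factor of 0.99 used here is an
# illustrative assumption, not a value taken from the frame configuration.
reward_future_steps = 20
discount = 0.99
bound = reward_future_steps * (1 + discount ** reward_future_steps)
assert bound < 40  # ~36.4, comfortably below value_max = 40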
def train_config(self, pytestconfig):
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 190
    c.solved_repeat = 5
    c.device = "cpu"
    return c
class TestDQNApex(object): # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A2C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 2000 c.max_steps = 200 c.replay_size = 100000 c.solved_reward = 190 c.solved_repeat = 5 @staticmethod def dqn_apex(): c = TestDQNApex.c q_net = smw( QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device) q_net_t = smw( QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer apex_group = world.create_rpc_group("apex", ["0", "1", "2"]) dqn_apex = DQNApex(q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction='sum'), apex_group, servers, replay_device="cpu", replay_size=c.replay_size) return dqn_apex ######################################################################## # Test for DQNApex acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_act(_, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() state = t.zeros([1, c.observe_dim]) dqn_apex.act_discrete({"state": state}) dqn_apex.act_discrete({"state": state}, True) dqn_apex.act_discrete_with_noise({"state": state}) dqn_apex.act_discrete_with_noise({"state": state}, True) return True ######################################################################## # Test for DQNApex criticizing ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_criticize(_, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() state = t.zeros([1, c.observe_dim]) dqn_apex._criticize({"state": state}) dqn_apex._criticize({"state": state}, True) return True ######################################################################## # Test for DQNApex storage ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex update ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_update(rank, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() old_state = state = t.zeros([1, c.observe_dim]) action = t.zeros([1, 1], dtype=t.int) if rank in (0, 1): dqn_apex.store_episode([{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "terminal": False } for _ in range(3)]) dqn_apex.manual_sync() if rank == 2: sleep(2) dqn_apex.update(update_value=True, update_target=True, concatenate_samples=True) return True ######################################################################## # Test for DQNApex save & load ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex lr_scheduler 
######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() # perform manual syncing to decrease the number of rpc calls dqn_apex.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair("{}_running".format(rank), True) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32, device=c.device) dqn_apex.manual_sync() while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state # agent model inference action = dqn_apex.act_discrete_with_noise( {"state": old_state.unsqueeze(0)}) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32, device=c.device).flatten() total_reward += float(reward) dqn_apex.store_transition({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": action }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "terminal": terminal or step == c.max_steps }) smoother.update(total_reward) step.reset() terminal = False default_logger.info( "Process {} Episode {} total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair("{}_running".format(rank)) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples while dqn_apex.replay_buffer.all_size() < 500: sleep(0.1) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): dqn_apex.update() return True raise RuntimeError("DQN-Apex Training failed.")
class TestDDPGApex(object): # configs and definitions disable_view_window() c = Config() c.env_name = "Pendulum-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 3 c.action_dim = 1 c.action_range = 2 c.max_episodes = 2000 c.max_steps = 200 c.noise_param = (0, 0.2) c.noise_mode = "normal" c.replay_size = 100000 # takes too much computing resource # decrease standard for faster validation c.solved_reward = -300 c.solved_repeat = 5 @staticmethod def ddpg_apex(discrete=False): c = TestDDPGApex.c if not discrete: actor = smw( Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device), c.device, c.device) actor_t = smw( Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device), c.device, c.device) else: actor = smw( ActorDiscrete(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) actor_t = smw( ActorDiscrete(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) critic = smw( Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) critic_t = smw( Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) servers = model_server_helper(model_num=2) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer apex_group = world.create_rpc_group("worker", ["0", "1", "2"]) ddpg_apex = DDPGApex(actor, actor_t, critic, critic_t, t.optim.Adam, nn.MSELoss(reduction='sum'), apex_group, servers, replay_device="cpu", replay_size=c.replay_size) return ddpg_apex ######################################################################## # Test for DDPGApex contiguous domain acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_contiguous_act(_, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() state = t.zeros([1, c.observe_dim]) ddpg_apex.act({"state": state}) ddpg_apex.act({"state": state}, use_target=True) ddpg_apex.act_with_noise({"state": state}, noise_param=(0, 1.0), mode="uniform") ddpg_apex.act_with_noise({"state": state}, noise_param=(0, 1.0), mode="uniform", use_target=True) return True ######################################################################## # Test for DDPGApex discrete domain acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_discrete_act(_, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex(discrete=True) state = t.zeros([1, c.observe_dim]) ddpg_apex.act_discrete({"state": state}) ddpg_apex.act_discrete({"state": state}, use_target=True) ddpg_apex.act_discrete_with_noise({"state": state}) ddpg_apex.act_discrete_with_noise({"state": state}, use_target=True) return True ######################################################################## # Test for DDPGApex criticizing ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test__criticize(_, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() state = t.zeros([1, c.observe_dim]) action = t.zeros([1, c.action_dim]) ddpg_apex._criticize({"state": state}, {"action": action}) ddpg_apex._criticize({"state": state}, {"action": action}, use_target=True) return True 
######################################################################## # Test for DDPGApex storage ######################################################################## # Skipped, it is the same as DDPG ######################################################################## # Test for DDPGApex update ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_update(rank, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() old_state = state = t.zeros([1, c.observe_dim]) action = t.zeros([1, c.action_dim]) if rank in (0, 1): ddpg_apex.store_transition({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "terminal": False }) sleep(5) ddpg_apex.manual_sync() if rank == 2: sleep(2) ddpg_apex.update(update_value=True, update_policy=True, update_target=True, concatenate_samples=True) return True ######################################################################## # Test for DDPGApex save & load ######################################################################## # Skipped, it is the same as DDPG ######################################################################## # Test for DDPGApex lr_scheduler ######################################################################## # Skipped, it is the same as DDPG ######################################################################## # Test for DDPGApex full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() # perform manual syncing to decrease the number of rpc calls ddpg_apex.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair("{}_running".format(rank), True) default_logger.info("{}, pid {}".format(rank, os.getpid())) if rank == 0: all_group.pair("episode", episode) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32, device=c.device) ddpg_apex.manual_sync() while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action = ddpg_apex.act_with_noise( {"state": old_state.unsqueeze(0)}, noise_param=c.noise_param, mode=c.noise_mode) state, reward, terminal, _ = env.step( action.cpu().numpy()) state = t.tensor(state, dtype=t.float32, device=c.device).flatten() total_reward += float(reward) ddpg_apex.store_transition({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": action }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "terminal": terminal or step == c.max_steps }) smoother.update(total_reward) step.reset() terminal = False default_logger.info("Process {} Episode {} " "total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair("{}_running".format(rank)) while (all_group.is_paired("0_running") or 
all_group.is_paired("1_running")): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples while ddpg_apex.replay_buffer.all_size() < 500: sleep(0.1) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): ddpg_apex.update() return True raise RuntimeError("DDPG-Apex Training failed.")
def get_config():
    c = Config()
    c.conf1 = 1
    c.conf2 = 2
    return c
class TestA3C:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A3C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 3000
    c.max_steps = 200
    c.replay_size = 10000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def a3c(device, dtype):
        c = TestA3C.c
        actor = smw(
            Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
        )
        critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
        # in all test scenarios, all processes will be used as reducers
        servers = grad_server_helper(
            [
                lambda: Actor(c.observe_dim, c.action_num),
                lambda: Critic(c.observe_dim),
            ],
            learning_rate=5e-3,
        )
        a3c = A3C(
            actor,
            critic,
            nn.MSELoss(reduction="sum"),
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return a3c

    ########################################################################
    # Test for A3C acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_act(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        a3c.act({"state": state})
        return True

    ########################################################################
    # Test for A3C action evaluation
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_eval_action(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        a3c._eval_act({"state": state}, {"action": action})
        return True

    ########################################################################
    # Test for A3C criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        a3c._criticize({"state": state})
        return True

    ########################################################################
    # Test for A3C storage
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        begin = time()
        while time() - begin < 5:
            a3c.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
            a3c.update(
                update_value=True,
                update_policy=True,
                update_target=True,
                concatenate_samples=True,
            )
            sleep(0.01)
        if rank == 1:
            # pull the newest model
            a3c.manual_sync()
        return True

    ########################################################################
    # Test for A3C save & load
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C lr_scheduler
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(rank):
        c = TestA3C.c
        config = A3C.generate_config({})
        config["frame_config"]["models"] = ["Actor", "Critic"]
        config["frame_config"]["model_kwargs"] = [
            {"state_dim": c.observe_dim, "action_num": c.action_num},
            {"state_dim": c.observe_dim},
        ]
        a3c = A3C.init_from_config(config)
        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, 1], dtype=t.int)
        begin = time()
        while time() - begin < 5:
            a3c.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
            a3c.update()
            sleep(0.01)
        if rank == 1:
            # pull the newest model
            a3c.manual_sync()
        return True

    ########################################################################
    # Test for A3C full training.
    ########################################################################
    @staticmethod
    @pytest.mark.parametrize("gae_lambda", [0.0, 0.5, 1.0])
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["gae_lambda"],
        timeout=1800,
    )
    @WorldTestBase.setup_world
    def test_full_train(rank, gae_lambda):
        c = TestA3C.c
        a3c = TestA3C.a3c("cpu", t.float32)
        a3c.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            a3c.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a3c.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            a3c.store_episode(tmp_observations)
            a3c.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    return True
            else:
                reward_fulfilled.reset()

        raise RuntimeError("A3C Training failed.")
class TestIMPALA: # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A3C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 20000 c.max_steps = 200 c.replay_size = 10000 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def impala(device, dtype, use_lr_sch=False): c = TestIMPALA.c actor = smw( Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device ) critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer impala_group = world.create_rpc_group("impala", ["0", "1", "2"]) if use_lr_sch: lr_func = gen_learning_rate_func( [(0, 1e-3), (200000, 3e-4)], logger=default_logger ) impala = IMPALA( actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"), impala_group, servers, lr_scheduler=LambdaLR, lr_scheduler_args=((lr_func,), (lr_func,)), ) else: impala = IMPALA( actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"), impala_group, servers, ) return impala ######################################################################## # Test for IMPALA acting ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_act(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala.act({"state": state}) return True ######################################################################## # Test for IMPALA action evaluation ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_eval_action(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) impala._eval_act({"state": state}, {"action": action}) return True ######################################################################## # Test for IMPALA criticizing ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test__criticize(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala._criticize({"state": state}) return True ######################################################################## # Test for IMPALA storage ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_store_episode(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) episode = [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(3) ] impala.store_episode(episode) return True 
######################################################################## # Test for IMPALA update ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_update(rank, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) if rank == 0: # episode length = 3 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(3) ] ) elif rank == 1: # episode length = 2 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(2) ] ) if rank == 2: sleep(2) impala.update( update_value=True, update_target=True, concatenate_samples=True ) return True ######################################################################## # Test for IMPALA save & load ######################################################################## # Skipped, it is the same as base framework ######################################################################## # Test for IMPALA lr_scheduler ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_lr_scheduler(_, device, dtype): impala = TestIMPALA.impala(device, dtype) impala.update_lr_scheduler() return True ######################################################################## # Test for IMPALA config & init ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=180) @setup_world def test_config_init(rank): c = TestIMPALA.c config = IMPALA.generate_config({}) config["frame_config"]["models"] = ["Actor", "Critic"] config["frame_config"]["model_kwargs"] = [ {"state_dim": c.observe_dim, "action_num": c.action_num}, {"state_dim": c.observe_dim}, ] impala = IMPALA.init_from_config(config) old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32) action = t.zeros([1, 1], dtype=t.int) if rank == 0: # episode length = 3 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(3) ] ) elif rank == 1: # episode length = 2 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(2) ] ) if rank == 2: sleep(2) impala.update( update_value=True, update_target=True, concatenate_samples=True ) return True ######################################################################## # Test for IMPALA full training. 
######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @setup_world def test_full_train(rank): training_group = get_world().create_rpc_group("training", ["0", "1", "2"]) c = TestIMPALA.c impala = TestIMPALA.impala("cpu", t.float32) # perform manual syncing to decrease the number of rpc calls impala.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env env.seed(rank) # make sure all things are initialized. training_group.barrier() # for cpu usage viewing default_logger.info(f"{rank}, pid {os.getpid()}") while episode < c.max_episodes: episode.count() if rank in (0, 1): # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) impala.manual_sync() tmp_observations = [] while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action, action_log_prob, *_ = impala.act( {"state": old_state.unsqueeze(0)} ) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) tmp_observations.append( { "state": {"state": old_state.unsqueeze(0)}, "action": {"action": action}, "next_state": {"state": state.unsqueeze(0)}, "reward": float(reward), "action_log_prob": action_log_prob.item(), "terminal": terminal or step == c.max_steps, } ) impala.store_episode(tmp_observations) smoother.update(total_reward) step.reset() terminal = False default_logger.info( "Process {} Episode {} " "total reward={:.2f}".format(rank, episode, smoother.value) ) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") try: training_group.pair(f"solved", True) except KeyError: # already solved in another process pass else: reward_fulfilled.reset() else: # wait for some samples if episode.get() > 200: for _ in range(100): impala.update() default_logger.info("Updated 100 times.") training_group.barrier() if training_group.is_paired("solved"): return True raise RuntimeError("IMPALA Training failed.")
def train_config(self):
    disable_view_window()
    c = Config()
    # the cooperative environment provided in
    # https://github.com/openai/multiagent-particle-envs
    c.env_name = "simple_spread"
    c.env = create_env(c.env_name)
    c.env.discrete_action_input = True
    c.agent_num = 3
    c.action_num = c.env.action_space[0].n
    c.observe_dim = c.env.observation_space[0].shape[0]
    # for contiguous tests
    c.test_action_dim = 5
    c.test_action_range = 1
    c.test_observe_dim = 5
    c.test_agent_num = 3
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    # from https://github.com/wsjeon/maddpg-rllib/tree/master/plots
    # PROBLEM: I have no idea how they calculate the rewards
    # I cannot replicate their reward curve
    c.solved_reward = -15
    c.solved_repeat = 5
    return c
class TestDDPGApex:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Use cartpole-v0 instead since pendulum training is too slow on test machine.
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    # use dim instead of num here
    c.action_dim = 2
    c.action_range = 1
    c.max_episodes = 20000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 150
    c.solved_repeat = 5
    # only for continuous mode testings
    c.action_range = 1

    @staticmethod
    def ddpg_apex(device, dtype, discrete=False):
        c = TestDDPGApex.c
        if not discrete:
            actor = smw(
                Actor(c.observe_dim, c.action_dim, c.action_range)
                .type(dtype)
                .to(device),
                device,
                device,
            )
            actor_t = smw(
                Actor(c.observe_dim, c.action_dim, c.action_range)
                .type(dtype)
                .to(device),
                device,
                device,
            )
        else:
            actor = smw(
                ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
                device,
                device,
            )
            actor_t = smw(
                ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
                device,
                device,
            )
        critic = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
        )
        critic_t = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
        )
        servers = model_server_helper(model_num=2)
        world = get_world()
        # process 0 and 1 will be workers, and 2 will be trainer
        apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
        ddpg_apex = DDPGApex(
            actor,
            actor_t,
            critic,
            critic_t,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            apex_group,
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return ddpg_apex

    ########################################################################
    # Test for DDPGApex contiguous domain acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_contiguous_act(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ddpg_apex.act({"state": state})
        ddpg_apex.act({"state": state}, use_target=True)
        ddpg_apex.act_with_noise(
            {"state": state}, noise_param=(0, 1.0), mode="uniform"
        )
        ddpg_apex.act_with_noise(
            {"state": state}, noise_param=(0, 1.0), mode="uniform", use_target=True
        )
        return True

    ########################################################################
    # Test for DDPGApex discrete domain acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_discrete_act(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype, discrete=True)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ddpg_apex.act_discrete({"state": state})
        ddpg_apex.act_discrete({"state": state}, use_target=True)
        ddpg_apex.act_discrete_with_noise({"state": state})
        ddpg_apex.act_discrete_with_noise({"state": state}, use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, c.action_dim], dtype=dtype)
        ddpg_apex._criticize({"state": state}, {"action": action})
        ddpg_apex._criticize({"state": state}, {"action": action}, use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex storage
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, c.action_dim], dtype=dtype)
        if rank in (0, 1):
            ddpg_apex.store_transition(
                {
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": 0,
                    "terminal": False,
                }
            )
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 2:
            sleep(2)
            ddpg_apex.update(
                update_value=True,
                update_policy=True,
                update_target=True,
                concatenate_samples=True,
            )
        return True

    ########################################################################
    # Test for DDPGApex save & load
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex lr_scheduler
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(rank):
        c = TestDDPGApex.c
        config = DDPGApex.generate_config({})
        config["frame_config"]["models"] = ["Actor", "Actor", "Critic", "Critic"]
        config["frame_config"]["model_kwargs"] = [
            {
                "state_dim": c.observe_dim,
                "action_dim": c.action_dim,
                "action_range": c.action_range,
            }
        ] * 2 + [
            {"state_dim": c.observe_dim, "action_dim": c.action_dim}
        ] * 2
        ddpg_apex = DDPGApex.init_from_config(config)
        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, c.action_dim], dtype=t.float32)
        if rank in (1, 2):
            ddpg_apex.store_transition(
                {
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": 0,
                    "terminal": False,
                }
            )
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 0:
            sleep(2)
            ddpg_apex.update()
        return True

    ########################################################################
    # Test for DDPGApex full training.
######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank): c = TestDDPGApex.c ddpg_apex = TestDDPGApex.ddpg_apex("cpu", t.float32, discrete=True) # perform manual syncing to decrease the number of rpc calls ddpg_apex.set_sync(False) # begin training episode, step = Counter(), Counter() avg_step = Smooth() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair(f"{rank}_running", True) default_logger.info(f"{rank}, pid {os.getpid()}") if rank == 0: all_group.pair("episode", episode) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) ddpg_apex.manual_sync() while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action, probs = ddpg_apex.act_discrete_with_noise( {"state": old_state.unsqueeze(0)}) state, reward, terminal, _ = env.step( action.cpu().item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) ddpg_apex.store_transition({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": probs }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "terminal": terminal or step == c.max_steps, }) smoother.update(total_reward) avg_step.update(step.get()) step.reset() terminal = False default_logger.info("Process {} Episode {} " "total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair(f"{rank}_running") while all_group.is_paired( "0_running") or all_group.is_paired( "1_running"): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples while ddpg_apex.replay_buffer.all_size() < 500: sleep(0.1) while all_group.is_paired("0_running") or all_group.is_paired( "1_running"): ddpg_apex.update() default_logger.info(f"Updated") return True raise RuntimeError("DDPG-Apex Training failed.")
class TestARS: # configs and definitions # Cartpole-v0 can be solved: # within 200 episodes, using single layer Actor # within 400 episodes, using double layer Actor # However, ARS fails to deal with pendulum v0: # Actor((st, 16)->(16, a)), noise_std=0.01, lr=0.05, rollout=9, optim=Adam) # reaches mean score = -700 at 10000 episodes # Actor((st, a)), noise_std=0.01, lr=0.05, rollout=9, optim=Adam) # reaches mean score = -1100 at 15000 episodes # and Adam optimizer is better than SGD disable_view_window() c = Config() c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 1000 c.max_steps = 200 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def ars(device, dtype): c = TestARS.c actor = smw( ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device), device, device, ) servers = model_server_helper(model_num=1) world = get_world() ars_group = world.create_rpc_group("ars", ["0", "1", "2"]) ars = ARS( actor, t.optim.SGD, ars_group, servers, noise_std_dev=0.1, learning_rate=0.1, noise_size=1000000, rollout_num=6, used_rollout_num=6, normalize_state=True, ) return ars @staticmethod def ars_lr(device, dtype): c = TestARS.c actor = smw( ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device), device, device, ) lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=default_logger) servers = model_server_helper(model_num=1) world = get_world() ars_group = world.create_rpc_group("ars", ["0", "1", "2"]) ars = ARS( actor, t.optim.SGD, ars_group, servers, noise_size=1000000, lr_scheduler=LambdaLR, lr_scheduler_args=((lr_func, ), ), ) return ars ######################################################################## # Test for ARS acting ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_act(_, device, dtype): c = TestARS.c ars = TestARS.ars(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) ars.act({"state": state}, "original") ars.act({"state": state}, ars.get_actor_types()[0]) with pytest.raises(ValueError): ars.act({"state": state}, "some_invalid_actor_type") return True ######################################################################## # Test for ARS storage ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_store_reward(_, device, dtype): ars = TestARS.ars(device, dtype) ars.store_reward(0.0, ars.get_actor_types()[0]) with pytest.raises(ValueError): ars.store_reward(1.0, "some_invalid_actor_type") return True ######################################################################## # Test for ARS update ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_update(_, device, dtype): c = TestARS.c ars = TestARS.ars(device, dtype) for at in ars.get_actor_types(): # get action will cause filters to initialize _action = ars.act( {"state": t.zeros([1, c.observe_dim], dtype=dtype)}, at) if at.startswith("neg"): ars.store_reward(1.0, at) else: ars.store_reward(0.0, at) ars.update() return True ######################################################################## # Test for ARS 
save & load ######################################################################## # Skipped, it is the same as base ######################################################################## # Test for ARS lr_scheduler ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_lr_scheduler(_, device, dtype): ars = TestARS.ars_lr(device, dtype) ars.update_lr_scheduler() return True ######################################################################## # Test for ARS config & init ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=180) @WorldTestBase.setup_world def test_config_init(_): c = TestARS.c config = ARS.generate_config({}) config["frame_config"]["models"] = ["ActorDiscrete"] config["frame_config"]["model_kwargs"] = [{ "state_dim": c.observe_dim, "action_dim": c.action_num }] ars = ARS.init_from_config(config) for at in ars.get_actor_types(): # get action will cause filters to initialize _action = ars.act( {"state": t.zeros([1, c.observe_dim], dtype=t.float32)}, at) if at.startswith("neg"): ars.store_reward(1.0, at) else: ars.store_reward(0.0, at) ars.update() return True ######################################################################## # Test for ARS full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank): c = TestARS.c ars = TestARS.ars("cpu", t.float32) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env # for cpu usage viewing default_logger.info(f"{rank}, pid {os.getpid()}") while episode < c.max_episodes: episode.count() all_reward = 0 for at in ars.get_actor_types(): total_reward = 0 # batch size = 1 state = t.tensor(env.reset(), dtype=t.float32) while not terminal and step <= c.max_steps: step.count() with t.no_grad(): # agent model inference action = ars.act({"state": state.unsqueeze(0)}, at) state, reward, terminal, __ = env.step(action) state = t.tensor(state, dtype=t.float32) total_reward += float(reward) step.reset() terminal = False ars.store_reward(total_reward, at) all_reward += total_reward # update ars.update() smoother.update(all_reward / len(ars.get_actor_types())) default_logger.info( f"Process {rank} Episode {episode} total reward={smoother.value:.2f}" ) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") raise SafeExit else: reward_fulfilled.reset() raise RuntimeError("ARS Training failed.")
def train_config(self, pytestconfig):
    disable_view_window()
    c = Config()
    c.env_name = "Pendulum-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 3
    c.action_dim = 1
    c.action_range = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = -150
    c.solved_repeat = 5
    c.device = "cpu"
    return c
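# Illustrative sketch of how a train_config fixture like the one above is
# typically consumed by a test; the test name and assertion below are
# examples only and do not appear in the suite.
def test_env_shape(train_config):
    c = train_config
    state = c.env.reset()
    # Pendulum-v0 observations are 3-dimensional, matching c.observe_dim
    assert len(state) == c.observe_dim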
class TestDQNApex: # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A2C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 20000 c.max_steps = 200 c.replay_size = 100000 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def dqn_apex(device, dtype): c = TestDQNApex.c q_net = smw( QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device ) q_net_t = smw( QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device ) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer apex_group = world.create_rpc_group("apex", ["0", "1", "2"]) dqn_apex = DQNApex( q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction="sum"), apex_group, servers, replay_device="cpu", replay_size=c.replay_size, ) return dqn_apex ######################################################################## # Test for DQNApex acting ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_act(_, device, dtype): c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) dqn_apex.act_discrete({"state": state}) dqn_apex.act_discrete({"state": state}, True) dqn_apex.act_discrete_with_noise({"state": state}) dqn_apex.act_discrete_with_noise({"state": state}, True) return True ######################################################################## # Test for DQNApex criticizing ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_criticize(_, device, dtype): c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) dqn_apex._criticize({"state": state}) dqn_apex._criticize({"state": state}, True) return True ######################################################################## # Test for DQNApex storage ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex update ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_update(rank, device, dtype): c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) if rank in (0, 1): dqn_apex.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "terminal": False, } for _ in range(3) ] ) dqn_apex.manual_sync() if rank == 2: sleep(2) dqn_apex.update() return True ######################################################################## # Test for DQNApex save & load ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex lr_scheduler 
######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex config & init ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=180) @setup_world def test_config_init(rank): c = TestDQNApex.c config = DQNApex.generate_config({}) config["frame_config"]["models"] = ["QNet", "QNet"] config["frame_config"]["model_kwargs"] = [ {"state_dim": c.observe_dim, "action_num": c.action_num} ] * 2 dqn_apex = DQNApex.init_from_config(config) old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32) action = t.zeros([1, 1], dtype=t.int) if rank in (1, 2): dqn_apex.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "terminal": False, } for _ in range(3) ] ) dqn_apex.manual_sync() if rank == 0: sleep(2) dqn_apex.update( update_value=True, update_target=True, concatenate_samples=True ) return True ######################################################################## # Test for DQNApex full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @setup_world def test_full_train(rank): training_group = get_world().create_rpc_group("training", ["0", "1", "2"]) c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex("cpu", t.float32) # perform manual syncing to decrease the number of rpc calls dqn_apex.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env env.seed(rank) # make sure all things are initialized. training_group.barrier() # for cpu usage viewing default_logger.info(f"{rank}, pid {os.getpid()}") while episode < c.max_episodes: episode.count() if rank in (0, 1): # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) dqn_apex.manual_sync() tmp_observations = [] while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state # agent model inference action = dqn_apex.act_discrete_with_noise( {"state": old_state.unsqueeze(0)} ) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) tmp_observations.append( { "state": {"state": old_state.unsqueeze(0)}, "action": {"action": action}, "next_state": {"state": state.unsqueeze(0)}, "reward": float(reward), "terminal": terminal or step == c.max_steps, } ) dqn_apex.store_episode(tmp_observations) smoother.update(total_reward) step.reset() terminal = False default_logger.info( "Process {} Episode {} total reward={:.2f}".format( rank, episode, smoother.value ) ) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") try: training_group.pair(f"solved", True) except KeyError: # already solved in another process pass else: reward_fulfilled.reset() else: # wait for some samples if episode.get() > 200: for _ in range(100): dqn_apex.update() default_logger.info("Updated 100 times.") training_group.barrier() if training_group.is_paired("solved"): return True raise RuntimeError("DQN-Apex Training failed.")