def test_merge_config():
    conf = get_config()
    conf = merge_config(conf, {"conf1": 2, "conf3": 3})
    assert conf.conf1 == 2
    assert conf.conf2 == 2
    assert conf.conf3 == 3

    conf = get_config()
    conf2 = Config(conf1=2, conf3=3)
    conf = merge_config(conf, conf2)
    assert conf.conf1 == 2
    assert conf.conf2 == 2
    assert conf.conf3 == 3
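# The assertions above pin down the merge semantics exercised by the test:
# values from the override (a plain dict or another Config) win, and keys
# absent from the override keep their base values. The sketch below is
# illustrative only -- merge_config_sketch is not the library function, and
# it assumes Config stores entries as plain instance attributes.
def merge_config_sketch(base, override):
    entries = override if isinstance(override, dict) else vars(override)
    for key, value in entries.items():
        setattr(base, key, value)  # override wins, untouched keys survive
    return base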
def train_config(self):
    disable_view_window()
    c = Config()
    c.env_name = "Pendulum-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 3
    c.action_dim = 1
    c.action_range = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.noise_param = (0, 0.2)
    c.noise_mode = "normal"
    c.noise_interval = 2
    c.replay_size = 100000
    c.solved_reward = -400
    c.solved_repeat = 5
    return c
class TestIMPALA(object): # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A3C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 2000 c.max_steps = 200 c.replay_size = 10000 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def impala(device, dtype, use_lr_sch=False): c = TestIMPALA.c actor = smw( Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device) critic = smw( Critic(c.observe_dim).type(dtype).to(device), device, device) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer impala_group = world.create_rpc_group("impala", ["0", "1", "2"]) if use_lr_sch: lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=default_logger) impala = IMPALA(actor, critic, t.optim.Adam, nn.MSELoss(reduction='sum'), impala_group, servers, lr_scheduler=LambdaLR, lr_scheduler_args=((lr_func, ), (lr_func, ))) else: impala = IMPALA(actor, critic, t.optim.Adam, nn.MSELoss(reduction='sum'), impala_group, servers) return impala ######################################################################## # Test for IMPALA acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_act(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala.act({"state": state}) return True ######################################################################## # Test for IMPALA action evaluation ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_eval_action(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) impala._eval_act({"state": state}, {"action": action}) return True ######################################################################## # Test for IMPALA criticizing ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test__criticize(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala._criticize({"state": state}) return True ######################################################################## # Test for IMPALA storage ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_store_step(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) with pytest.raises(NotImplementedError): impala.store_transition({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False }) 
return True @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_store_episode(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) episode = [{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False } for _ in range(3)] impala.store_episode(episode) return True ######################################################################## # Test for IMPALA update ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_update(rank, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) if rank == 0: # episode length = 3 impala.store_episode([{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False } for _ in range(3)]) elif rank == 1: # episode length = 2 impala.store_episode([{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "action_log_prob": 0.1, "terminal": False } for _ in range(2)]) if rank == 2: sleep(2) impala.update(update_value=True, update_target=True, concatenate_samples=True) return True ######################################################################## # Test for IMPALA save & load ######################################################################## # Skipped, it is the same as base framework ######################################################################## # Test for IMPALA lr_scheduler ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180) @WorldTestBase.setup_world def test_lr_scheduler(_, device, dtype): impala = TestIMPALA.impala(device, dtype) impala.update_lr_scheduler() return True ######################################################################## # Test for IMPALA full training. 
######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank): c = TestIMPALA.c impala = TestIMPALA.impala("cpu", t.float32) # perform manual syncing to decrease the number of rpc calls impala.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair("{}_running".format(rank), True) default_logger.info("{}, pid {}".format(rank, os.getpid())) if rank == 0: all_group.pair("episode", episode) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) impala.manual_sync() tmp_observations = [] while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action, action_log_prob, *_ = impala.act( {"state": old_state.unsqueeze(0)}) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) tmp_observations.append({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": action }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "action_log_prob": action_log_prob.item(), "terminal": terminal or step == c.max_steps }) impala.store_episode(tmp_observations) smoother.update(total_reward) step.reset() terminal = False default_logger.info("Process {} Episode {} " "total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair("{}_running".format(rank)) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples # Note: the number of entries in buffer means "episodes" # rather than steps here! while impala.replay_buffer.all_size() < 5: sleep(0.1) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): impala.update() return True raise RuntimeError("IMPALA Training failed.")
def train_config(self, gpu):
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    # maximum and minimum of reward value
    # since reward is 1 for every step, maximum q value should be
    # below 20 (reward_future_steps) * (1 + discount ** n_steps) < 40
    c.value_max = 40
    c.value_min = 0
    c.reward_future_steps = 20
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    # RAINBOW is not very stable (without dueling and noisy linear)
    # compared to other DQNs
    c.solved_reward = 180
    c.solved_repeat = 5
    c.device = gpu
    return c
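# The value_max bound in the comment above can be sanity-checked with the
# comment's own formula. The discount factor of 0.99 used here is an
# illustrative assumption, not a value taken from the frame configuration.
reward_future_steps = 20
discount = 0.99
bound = reward_future_steps * (1 + discount ** reward_future_steps)
assert bound < 40  # ~36.4, comfortably below value_max = 40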
def train_config(self, pytestconfig):
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 190
    c.solved_repeat = 5
    c.device = "cpu"
    return c
class TestDQNApex(object): # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A2C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 2000 c.max_steps = 200 c.replay_size = 100000 c.solved_reward = 190 c.solved_repeat = 5 @staticmethod def dqn_apex(): c = TestDQNApex.c q_net = smw( QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device) q_net_t = smw( QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer apex_group = world.create_rpc_group("apex", ["0", "1", "2"]) dqn_apex = DQNApex(q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction='sum'), apex_group, servers, replay_device="cpu", replay_size=c.replay_size) return dqn_apex ######################################################################## # Test for DQNApex acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_act(_, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() state = t.zeros([1, c.observe_dim]) dqn_apex.act_discrete({"state": state}) dqn_apex.act_discrete({"state": state}, True) dqn_apex.act_discrete_with_noise({"state": state}) dqn_apex.act_discrete_with_noise({"state": state}, True) return True ######################################################################## # Test for DQNApex criticizing ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_criticize(_, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() state = t.zeros([1, c.observe_dim]) dqn_apex._criticize({"state": state}) dqn_apex._criticize({"state": state}, True) return True ######################################################################## # Test for DQNApex storage ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex update ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_update(rank, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() old_state = state = t.zeros([1, c.observe_dim]) action = t.zeros([1, 1], dtype=t.int) if rank in (0, 1): dqn_apex.store_episode([{ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "terminal": False } for _ in range(3)]) dqn_apex.manual_sync() if rank == 2: sleep(2) dqn_apex.update(update_value=True, update_target=True, concatenate_samples=True) return True ######################################################################## # Test for DQNApex save & load ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex lr_scheduler 
######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank, gpu): c = TestDQNApex.c c.device = gpu dqn_apex = TestDQNApex.dqn_apex() # perform manual syncing to decrease the number of rpc calls dqn_apex.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair("{}_running".format(rank), True) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32, device=c.device) dqn_apex.manual_sync() while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state # agent model inference action = dqn_apex.act_discrete_with_noise( {"state": old_state.unsqueeze(0)}) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32, device=c.device).flatten() total_reward += float(reward) dqn_apex.store_transition({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": action }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "terminal": terminal or step == c.max_steps }) smoother.update(total_reward) step.reset() terminal = False default_logger.info( "Process {} Episode {} total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair("{}_running".format(rank)) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples while dqn_apex.replay_buffer.all_size() < 500: sleep(0.1) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): dqn_apex.update() return True raise RuntimeError("DQN-Apex Training failed.")
class TestDDPGApex(object): # configs and definitions disable_view_window() c = Config() c.env_name = "Pendulum-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 3 c.action_dim = 1 c.action_range = 2 c.max_episodes = 2000 c.max_steps = 200 c.noise_param = (0, 0.2) c.noise_mode = "normal" c.replay_size = 100000 # takes too much computing resource # decrease standard for faster validation c.solved_reward = -300 c.solved_repeat = 5 @staticmethod def ddpg_apex(discrete=False): c = TestDDPGApex.c if not discrete: actor = smw( Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device), c.device, c.device) actor_t = smw( Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device), c.device, c.device) else: actor = smw( ActorDiscrete(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) actor_t = smw( ActorDiscrete(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) critic = smw( Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) critic_t = smw( Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device) servers = model_server_helper(model_num=2) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer apex_group = world.create_rpc_group("worker", ["0", "1", "2"]) ddpg_apex = DDPGApex(actor, actor_t, critic, critic_t, t.optim.Adam, nn.MSELoss(reduction='sum'), apex_group, servers, replay_device="cpu", replay_size=c.replay_size) return ddpg_apex ######################################################################## # Test for DDPGApex contiguous domain acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_contiguous_act(_, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() state = t.zeros([1, c.observe_dim]) ddpg_apex.act({"state": state}) ddpg_apex.act({"state": state}, use_target=True) ddpg_apex.act_with_noise({"state": state}, noise_param=(0, 1.0), mode="uniform") ddpg_apex.act_with_noise({"state": state}, noise_param=(0, 1.0), mode="uniform", use_target=True) return True ######################################################################## # Test for DDPGApex discrete domain acting ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_discrete_act(_, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex(discrete=True) state = t.zeros([1, c.observe_dim]) ddpg_apex.act_discrete({"state": state}) ddpg_apex.act_discrete({"state": state}, use_target=True) ddpg_apex.act_discrete_with_noise({"state": state}) ddpg_apex.act_discrete_with_noise({"state": state}, use_target=True) return True ######################################################################## # Test for DDPGApex criticizing ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test__criticize(_, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() state = t.zeros([1, c.observe_dim]) action = t.zeros([1, c.action_dim]) ddpg_apex._criticize({"state": state}, {"action": action}) ddpg_apex._criticize({"state": state}, {"action": action}, use_target=True) return True 
######################################################################## # Test for DDPGApex storage ######################################################################## # Skipped, it is the same as DDPG ######################################################################## # Test for DDPGApex update ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=180) @WorldTestBase.setup_world def test_update(rank, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() old_state = state = t.zeros([1, c.observe_dim]) action = t.zeros([1, c.action_dim]) if rank in (0, 1): ddpg_apex.store_transition({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": 0, "terminal": False }) sleep(5) ddpg_apex.manual_sync() if rank == 2: sleep(2) ddpg_apex.update(update_value=True, update_policy=True, update_target=True, concatenate_samples=True) return True ######################################################################## # Test for DDPGApex save & load ######################################################################## # Skipped, it is the same as DDPG ######################################################################## # Test for DDPGApex lr_scheduler ######################################################################## # Skipped, it is the same as DDPG ######################################################################## # Test for DDPGApex full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], pass_through=["gpu"], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank, gpu): c = TestDDPGApex.c c.device = gpu ddpg_apex = TestDDPGApex.ddpg_apex() # perform manual syncing to decrease the number of rpc calls ddpg_apex.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair("{}_running".format(rank), True) default_logger.info("{}, pid {}".format(rank, os.getpid())) if rank == 0: all_group.pair("episode", episode) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32, device=c.device) ddpg_apex.manual_sync() while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action = ddpg_apex.act_with_noise( {"state": old_state.unsqueeze(0)}, noise_param=c.noise_param, mode=c.noise_mode) state, reward, terminal, _ = env.step( action.cpu().numpy()) state = t.tensor(state, dtype=t.float32, device=c.device).flatten() total_reward += float(reward) ddpg_apex.store_transition({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": action }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "terminal": terminal or step == c.max_steps }) smoother.update(total_reward) step.reset() terminal = False default_logger.info("Process {} Episode {} " "total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair("{}_running".format(rank)) while (all_group.is_paired("0_running") or 
all_group.is_paired("1_running")): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples while ddpg_apex.replay_buffer.all_size() < 500: sleep(0.1) while (all_group.is_paired("0_running") or all_group.is_paired("1_running")): ddpg_apex.update() return True raise RuntimeError("DDPG-Apex Training failed.")
def get_config():
    c = Config()
    c.conf1 = 1
    c.conf2 = 2
    return c
class TestA3C:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A3C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 3000
    c.max_steps = 200
    c.replay_size = 10000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def a3c(device, dtype):
        c = TestA3C.c
        actor = smw(
            Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
        )
        critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
        # in all test scenarios, all processes will be used as reducers
        servers = grad_server_helper(
            [
                lambda: Actor(c.observe_dim, c.action_num),
                lambda: Critic(c.observe_dim),
            ],
            learning_rate=5e-3,
        )
        a3c = A3C(
            actor,
            critic,
            nn.MSELoss(reduction="sum"),
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return a3c

    ########################################################################
    # Test for A3C acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_act(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        a3c.act({"state": state})
        return True

    ########################################################################
    # Test for A3C action evaluation
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_eval_action(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        a3c._eval_act({"state": state}, {"action": action})
        return True

    ########################################################################
    # Test for A3C criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        a3c._criticize({"state": state})
        return True

    ########################################################################
    # Test for A3C storage
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        begin = time()
        while time() - begin < 5:
            a3c.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
            a3c.update(
                update_value=True,
                update_policy=True,
                update_target=True,
                concatenate_samples=True,
            )
            sleep(0.01)
        if rank == 1:
            # pull the newest model
            a3c.manual_sync()
        return True

    ########################################################################
    # Test for A3C save & load
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C lr_scheduler
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(rank):
        c = TestA3C.c
        config = A3C.generate_config({})
        config["frame_config"]["models"] = ["Actor", "Critic"]
        config["frame_config"]["model_kwargs"] = [
            {"state_dim": c.observe_dim, "action_num": c.action_num},
            {"state_dim": c.observe_dim},
        ]
        a3c = A3C.init_from_config(config)
        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, 1], dtype=t.int)
        begin = time()
        while time() - begin < 5:
            a3c.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
            a3c.update()
            sleep(0.01)
        if rank == 1:
            # pull the newest model
            a3c.manual_sync()
        return True

    ########################################################################
    # Test for A3C full training.
    ########################################################################
    @staticmethod
    @pytest.mark.parametrize("gae_lambda", [0.0, 0.5, 1.0])
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["gae_lambda"],
        timeout=1800,
    )
    @WorldTestBase.setup_world
    def test_full_train(rank, gae_lambda):
        c = TestA3C.c
        a3c = TestA3C.a3c("cpu", t.float32)
        a3c.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            a3c.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a3c.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            a3c.store_episode(tmp_observations)
            a3c.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    return True
            else:
                reward_fulfilled.reset()

        raise RuntimeError("A3C Training failed.")
class TestIMPALA: # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A3C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 20000 c.max_steps = 200 c.replay_size = 10000 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def impala(device, dtype, use_lr_sch=False): c = TestIMPALA.c actor = smw( Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device ) critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer impala_group = world.create_rpc_group("impala", ["0", "1", "2"]) if use_lr_sch: lr_func = gen_learning_rate_func( [(0, 1e-3), (200000, 3e-4)], logger=default_logger ) impala = IMPALA( actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"), impala_group, servers, lr_scheduler=LambdaLR, lr_scheduler_args=((lr_func,), (lr_func,)), ) else: impala = IMPALA( actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"), impala_group, servers, ) return impala ######################################################################## # Test for IMPALA acting ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_act(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala.act({"state": state}) return True ######################################################################## # Test for IMPALA action evaluation ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_eval_action(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) impala._eval_act({"state": state}, {"action": action}) return True ######################################################################## # Test for IMPALA criticizing ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test__criticize(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) impala._criticize({"state": state}) return True ######################################################################## # Test for IMPALA storage ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_store_episode(_, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) episode = [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(3) ] impala.store_episode(episode) return True 
######################################################################## # Test for IMPALA update ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_update(rank, device, dtype): c = TestIMPALA.c impala = TestIMPALA.impala(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) if rank == 0: # episode length = 3 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(3) ] ) elif rank == 1: # episode length = 2 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(2) ] ) if rank == 2: sleep(2) impala.update( update_value=True, update_target=True, concatenate_samples=True ) return True ######################################################################## # Test for IMPALA save & load ######################################################################## # Skipped, it is the same as base framework ######################################################################## # Test for IMPALA lr_scheduler ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_lr_scheduler(_, device, dtype): impala = TestIMPALA.impala(device, dtype) impala.update_lr_scheduler() return True ######################################################################## # Test for IMPALA config & init ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=180) @setup_world def test_config_init(rank): c = TestIMPALA.c config = IMPALA.generate_config({}) config["frame_config"]["models"] = ["Actor", "Critic"] config["frame_config"]["model_kwargs"] = [ {"state_dim": c.observe_dim, "action_num": c.action_num}, {"state_dim": c.observe_dim}, ] impala = IMPALA.init_from_config(config) old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32) action = t.zeros([1, 1], dtype=t.int) if rank == 0: # episode length = 3 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(3) ] ) elif rank == 1: # episode length = 2 impala.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "action_log_prob": 0.1, "terminal": False, } for _ in range(2) ] ) if rank == 2: sleep(2) impala.update( update_value=True, update_target=True, concatenate_samples=True ) return True ######################################################################## # Test for IMPALA full training. 
######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @setup_world def test_full_train(rank): training_group = get_world().create_rpc_group("training", ["0", "1", "2"]) c = TestIMPALA.c impala = TestIMPALA.impala("cpu", t.float32) # perform manual syncing to decrease the number of rpc calls impala.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env env.seed(rank) # make sure all things are initialized. training_group.barrier() # for cpu usage viewing default_logger.info(f"{rank}, pid {os.getpid()}") while episode < c.max_episodes: episode.count() if rank in (0, 1): # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) impala.manual_sync() tmp_observations = [] while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action, action_log_prob, *_ = impala.act( {"state": old_state.unsqueeze(0)} ) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) tmp_observations.append( { "state": {"state": old_state.unsqueeze(0)}, "action": {"action": action}, "next_state": {"state": state.unsqueeze(0)}, "reward": float(reward), "action_log_prob": action_log_prob.item(), "terminal": terminal or step == c.max_steps, } ) impala.store_episode(tmp_observations) smoother.update(total_reward) step.reset() terminal = False default_logger.info( "Process {} Episode {} " "total reward={:.2f}".format(rank, episode, smoother.value) ) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") try: training_group.pair(f"solved", True) except KeyError: # already solved in another process pass else: reward_fulfilled.reset() else: # wait for some samples if episode.get() > 200: for _ in range(100): impala.update() default_logger.info("Updated 100 times.") training_group.barrier() if training_group.is_paired("solved"): return True raise RuntimeError("IMPALA Training failed.")
def train_config(self):
    disable_view_window()
    c = Config()
    # the cooperative environment provided in
    # https://github.com/openai/multiagent-particle-envs
    c.env_name = "simple_spread"
    c.env = create_env(c.env_name)
    c.env.discrete_action_input = True
    c.agent_num = 3
    c.action_num = c.env.action_space[0].n
    c.observe_dim = c.env.observation_space[0].shape[0]
    # for contiguous tests
    c.test_action_dim = 5
    c.test_action_range = 1
    c.test_observe_dim = 5
    c.test_agent_num = 3
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    # from https://github.com/wsjeon/maddpg-rllib/tree/master/plots
    # PROBLEM: I have no idea how they calculate the rewards
    # I cannot replicate their reward curve
    c.solved_reward = -15
    c.solved_repeat = 5
    return c
class TestDDPGApex:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Use cartpole-v0 instead since pendulum training is too slow on test machine.
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    # use dim instead of num here
    c.action_dim = 2
    c.action_range = 1
    c.max_episodes = 20000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 150
    c.solved_repeat = 5
    # only for continuous mode testings
    c.action_range = 1

    @staticmethod
    def ddpg_apex(device, dtype, discrete=False):
        c = TestDDPGApex.c
        if not discrete:
            actor = smw(
                Actor(c.observe_dim, c.action_dim, c.action_range)
                .type(dtype)
                .to(device),
                device,
                device,
            )
            actor_t = smw(
                Actor(c.observe_dim, c.action_dim, c.action_range)
                .type(dtype)
                .to(device),
                device,
                device,
            )
        else:
            actor = smw(
                ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
                device,
                device,
            )
            actor_t = smw(
                ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
                device,
                device,
            )
        critic = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
        )
        critic_t = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
        )
        servers = model_server_helper(model_num=2)
        world = get_world()
        # process 0 and 1 will be workers, and 2 will be trainer
        apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
        ddpg_apex = DDPGApex(
            actor,
            actor_t,
            critic,
            critic_t,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            apex_group,
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return ddpg_apex

    ########################################################################
    # Test for DDPGApex contiguous domain acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_contiguous_act(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ddpg_apex.act({"state": state})
        ddpg_apex.act({"state": state}, use_target=True)
        ddpg_apex.act_with_noise(
            {"state": state}, noise_param=(0, 1.0), mode="uniform"
        )
        ddpg_apex.act_with_noise(
            {"state": state}, noise_param=(0, 1.0), mode="uniform", use_target=True
        )
        return True

    ########################################################################
    # Test for DDPGApex discrete domain acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_discrete_act(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype, discrete=True)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ddpg_apex.act_discrete({"state": state})
        ddpg_apex.act_discrete({"state": state}, use_target=True)
        ddpg_apex.act_discrete_with_noise({"state": state})
        ddpg_apex.act_discrete_with_noise({"state": state}, use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, c.action_dim], dtype=dtype)
        ddpg_apex._criticize({"state": state}, {"action": action})
        ddpg_apex._criticize({"state": state}, {"action": action}, use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex storage
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, c.action_dim], dtype=dtype)
        if rank in (0, 1):
            ddpg_apex.store_transition(
                {
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": 0,
                    "terminal": False,
                }
            )
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 2:
            sleep(2)
            ddpg_apex.update(
                update_value=True,
                update_policy=True,
                update_target=True,
                concatenate_samples=True,
            )
        return True

    ########################################################################
    # Test for DDPGApex save & load
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex lr_scheduler
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(rank):
        c = TestDDPGApex.c
        config = DDPGApex.generate_config({})
        config["frame_config"]["models"] = ["Actor", "Actor", "Critic", "Critic"]
        config["frame_config"]["model_kwargs"] = [
            {
                "state_dim": c.observe_dim,
                "action_dim": c.action_dim,
                "action_range": c.action_range,
            }
        ] * 2 + [
            {"state_dim": c.observe_dim, "action_dim": c.action_dim}
        ] * 2
        ddpg_apex = DDPGApex.init_from_config(config)
        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, c.action_dim], dtype=t.float32)
        if rank in (1, 2):
            ddpg_apex.store_transition(
                {
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": 0,
                    "terminal": False,
                }
            )
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 0:
            sleep(2)
            ddpg_apex.update()
        return True

    ########################################################################
    # Test for DDPGApex full training.
######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank): c = TestDDPGApex.c ddpg_apex = TestDDPGApex.ddpg_apex("cpu", t.float32, discrete=True) # perform manual syncing to decrease the number of rpc calls ddpg_apex.set_sync(False) # begin training episode, step = Counter(), Counter() avg_step = Smooth() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env world = get_world() all_group = world.create_rpc_group("all", ["0", "1", "2"]) all_group.pair(f"{rank}_running", True) default_logger.info(f"{rank}, pid {os.getpid()}") if rank == 0: all_group.pair("episode", episode) if rank in (0, 1): while episode < c.max_episodes: # wait for trainer to keep up sleep(0.2) episode.count() # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) ddpg_apex.manual_sync() while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state action, probs = ddpg_apex.act_discrete_with_noise( {"state": old_state.unsqueeze(0)}) state, reward, terminal, _ = env.step( action.cpu().item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) ddpg_apex.store_transition({ "state": { "state": old_state.unsqueeze(0) }, "action": { "action": probs }, "next_state": { "state": state.unsqueeze(0) }, "reward": float(reward), "terminal": terminal or step == c.max_steps, }) smoother.update(total_reward) avg_step.update(step.get()) step.reset() terminal = False default_logger.info("Process {} Episode {} " "total reward={:.2f}".format( rank, episode, smoother.value)) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") all_group.unpair(f"{rank}_running") while all_group.is_paired( "0_running") or all_group.is_paired( "1_running"): # wait for all workers to join sleep(1) # wait for trainer sleep(5) return True else: reward_fulfilled.reset() else: # wait for some samples while ddpg_apex.replay_buffer.all_size() < 500: sleep(0.1) while all_group.is_paired("0_running") or all_group.is_paired( "1_running"): ddpg_apex.update() default_logger.info(f"Updated") return True raise RuntimeError("DDPG-Apex Training failed.")
class TestARS: # configs and definitions # Cartpole-v0 can be solved: # within 200 episodes, using single layer Actor # within 400 episodes, using double layer Actor # However, ARS fails to deal with pendulum v0: # Actor((st, 16)->(16, a)), noise_std=0.01, lr=0.05, rollout=9, optim=Adam) # reaches mean score = -700 at 10000 episodes # Actor((st, a)), noise_std=0.01, lr=0.05, rollout=9, optim=Adam) # reaches mean score = -1100 at 15000 episodes # and Adam optimizer is better than SGD disable_view_window() c = Config() c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 1000 c.max_steps = 200 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def ars(device, dtype): c = TestARS.c actor = smw( ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device), device, device, ) servers = model_server_helper(model_num=1) world = get_world() ars_group = world.create_rpc_group("ars", ["0", "1", "2"]) ars = ARS( actor, t.optim.SGD, ars_group, servers, noise_std_dev=0.1, learning_rate=0.1, noise_size=1000000, rollout_num=6, used_rollout_num=6, normalize_state=True, ) return ars @staticmethod def ars_lr(device, dtype): c = TestARS.c actor = smw( ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device), device, device, ) lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=default_logger) servers = model_server_helper(model_num=1) world = get_world() ars_group = world.create_rpc_group("ars", ["0", "1", "2"]) ars = ARS( actor, t.optim.SGD, ars_group, servers, noise_size=1000000, lr_scheduler=LambdaLR, lr_scheduler_args=((lr_func, ), ), ) return ars ######################################################################## # Test for ARS acting ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_act(_, device, dtype): c = TestARS.c ars = TestARS.ars(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) ars.act({"state": state}, "original") ars.act({"state": state}, ars.get_actor_types()[0]) with pytest.raises(ValueError): ars.act({"state": state}, "some_invalid_actor_type") return True ######################################################################## # Test for ARS storage ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_store_reward(_, device, dtype): ars = TestARS.ars(device, dtype) ars.store_reward(0.0, ars.get_actor_types()[0]) with pytest.raises(ValueError): ars.store_reward(1.0, "some_invalid_actor_type") return True ######################################################################## # Test for ARS update ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_update(_, device, dtype): c = TestARS.c ars = TestARS.ars(device, dtype) for at in ars.get_actor_types(): # get action will cause filters to initialize _action = ars.act( {"state": t.zeros([1, c.observe_dim], dtype=dtype)}, at) if at.startswith("neg"): ars.store_reward(1.0, at) else: ars.store_reward(0.0, at) ars.update() return True ######################################################################## # Test for ARS 
save & load ######################################################################## # Skipped, it is the same as base ######################################################################## # Test for ARS lr_scheduler ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @WorldTestBase.setup_world def test_lr_scheduler(_, device, dtype): ars = TestARS.ars_lr(device, dtype) ars.update_lr_scheduler() return True ######################################################################## # Test for ARS config & init ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=180) @WorldTestBase.setup_world def test_config_init(_): c = TestARS.c config = ARS.generate_config({}) config["frame_config"]["models"] = ["ActorDiscrete"] config["frame_config"]["model_kwargs"] = [{ "state_dim": c.observe_dim, "action_dim": c.action_num }] ars = ARS.init_from_config(config) for at in ars.get_actor_types(): # get action will cause filters to initialize _action = ars.act( {"state": t.zeros([1, c.observe_dim], dtype=t.float32)}, at) if at.startswith("neg"): ars.store_reward(1.0, at) else: ars.store_reward(0.0, at) ars.update() return True ######################################################################## # Test for ARS full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @WorldTestBase.setup_world def test_full_train(rank): c = TestARS.c ars = TestARS.ars("cpu", t.float32) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env # for cpu usage viewing default_logger.info(f"{rank}, pid {os.getpid()}") while episode < c.max_episodes: episode.count() all_reward = 0 for at in ars.get_actor_types(): total_reward = 0 # batch size = 1 state = t.tensor(env.reset(), dtype=t.float32) while not terminal and step <= c.max_steps: step.count() with t.no_grad(): # agent model inference action = ars.act({"state": state.unsqueeze(0)}, at) state, reward, terminal, __ = env.step(action) state = t.tensor(state, dtype=t.float32) total_reward += float(reward) step.reset() terminal = False ars.store_reward(total_reward, at) all_reward += total_reward # update ars.update() smoother.update(all_reward / len(ars.get_actor_types())) default_logger.info( f"Process {rank} Episode {episode} total reward={smoother.value:.2f}" ) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") raise SafeExit else: reward_fulfilled.reset() raise RuntimeError("ARS Training failed.")
def train_config(self, pytestconfig):
    disable_view_window()
    c = Config()
    c.env_name = "Pendulum-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 3
    c.action_dim = 1
    c.action_range = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = -150
    c.solved_repeat = 5
    c.device = "cpu"
    return c
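# Illustrative sketch of how a train_config fixture like the one above is
# typically consumed by a test; the test name and assertion below are
# examples only and do not appear in the suite.
def test_env_shape(train_config):
    c = train_config
    state = c.env.reset()
    # Pendulum-v0 observations are 3-dimensional, matching c.observe_dim
    assert len(state) == c.observe_dim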
class TestDQNApex: # configs and definitions disable_view_window() c = Config() # Note: online policy algorithms such as PPO and A2C does not # work well in Pendulum (reason unknown) # and MountainCarContinuous (sparse returns) c.env_name = "CartPole-v0" c.env = unwrap_time_limit(gym.make(c.env_name)) c.observe_dim = 4 c.action_num = 2 c.max_episodes = 20000 c.max_steps = 200 c.replay_size = 100000 c.solved_reward = 150 c.solved_repeat = 5 @staticmethod def dqn_apex(device, dtype): c = TestDQNApex.c q_net = smw( QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device ) q_net_t = smw( QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device ) servers = model_server_helper(model_num=1) world = get_world() # process 0 and 1 will be workers, and 2 will be trainer apex_group = world.create_rpc_group("apex", ["0", "1", "2"]) dqn_apex = DQNApex( q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction="sum"), apex_group, servers, replay_device="cpu", replay_size=c.replay_size, ) return dqn_apex ######################################################################## # Test for DQNApex acting ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_act(_, device, dtype): c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) dqn_apex.act_discrete({"state": state}) dqn_apex.act_discrete({"state": state}, True) dqn_apex.act_discrete_with_noise({"state": state}) dqn_apex.act_discrete_with_noise({"state": state}, True) return True ######################################################################## # Test for DQNApex criticizing ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_criticize(_, device, dtype): c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex(device, dtype) state = t.zeros([1, c.observe_dim], dtype=dtype) dqn_apex._criticize({"state": state}) dqn_apex._criticize({"state": state}, True) return True ######################################################################## # Test for DQNApex storage ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex update ######################################################################## @staticmethod @run_multi( expected_results=[True, True, True], pass_through=["device", "dtype"], timeout=180, ) @setup_world def test_update(rank, device, dtype): c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex(device, dtype) old_state = state = t.zeros([1, c.observe_dim], dtype=dtype) action = t.zeros([1, 1], dtype=t.int) if rank in (0, 1): dqn_apex.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "terminal": False, } for _ in range(3) ] ) dqn_apex.manual_sync() if rank == 2: sleep(2) dqn_apex.update() return True ######################################################################## # Test for DQNApex save & load ######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex lr_scheduler 
######################################################################## # Skipped, it is the same as DQN ######################################################################## # Test for DQNApex config & init ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=180) @setup_world def test_config_init(rank): c = TestDQNApex.c config = DQNApex.generate_config({}) config["frame_config"]["models"] = ["QNet", "QNet"] config["frame_config"]["model_kwargs"] = [ {"state_dim": c.observe_dim, "action_num": c.action_num} ] * 2 dqn_apex = DQNApex.init_from_config(config) old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32) action = t.zeros([1, 1], dtype=t.int) if rank in (1, 2): dqn_apex.store_episode( [ { "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": 0, "terminal": False, } for _ in range(3) ] ) dqn_apex.manual_sync() if rank == 0: sleep(2) dqn_apex.update( update_value=True, update_target=True, concatenate_samples=True ) return True ######################################################################## # Test for DQNApex full training. ######################################################################## @staticmethod @run_multi(expected_results=[True, True, True], timeout=1800) @setup_world def test_full_train(rank): training_group = get_world().create_rpc_group("training", ["0", "1", "2"]) c = TestDQNApex.c dqn_apex = TestDQNApex.dqn_apex("cpu", t.float32) # perform manual syncing to decrease the number of rpc calls dqn_apex.set_sync(False) # begin training episode, step = Counter(), Counter() reward_fulfilled = Counter() smoother = Smooth() terminal = False env = c.env env.seed(rank) # make sure all things are initialized. training_group.barrier() # for cpu usage viewing default_logger.info(f"{rank}, pid {os.getpid()}") while episode < c.max_episodes: episode.count() if rank in (0, 1): # batch size = 1 total_reward = 0 state = t.tensor(env.reset(), dtype=t.float32) dqn_apex.manual_sync() tmp_observations = [] while not terminal and step <= c.max_steps: step.count() with t.no_grad(): old_state = state # agent model inference action = dqn_apex.act_discrete_with_noise( {"state": old_state.unsqueeze(0)} ) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).flatten() total_reward += float(reward) tmp_observations.append( { "state": {"state": old_state.unsqueeze(0)}, "action": {"action": action}, "next_state": {"state": state.unsqueeze(0)}, "reward": float(reward), "terminal": terminal or step == c.max_steps, } ) dqn_apex.store_episode(tmp_observations) smoother.update(total_reward) step.reset() terminal = False default_logger.info( "Process {} Episode {} total reward={:.2f}".format( rank, episode, smoother.value ) ) if smoother.value > c.solved_reward: reward_fulfilled.count() if reward_fulfilled >= c.solved_repeat: default_logger.info("Environment solved!") try: training_group.pair(f"solved", True) except KeyError: # already solved in another process pass else: reward_fulfilled.reset() else: # wait for some samples if episode.get() > 200: for _ in range(100): dqn_apex.update() default_logger.info("Updated 100 times.") training_group.barrier() if training_group.is_paired("solved"): return True raise RuntimeError("DQN-Apex Training failed.")