Example #1
    def impala(device, dtype, use_lr_sch=False):
        c = TestIMPALA.c
        actor = smw(
            Actor(c.observe_dim, c.action_num).type(dtype).to(device), device,
            device)
        critic = smw(
            Critic(c.observe_dim).type(dtype).to(device), device, device)
        servers = model_server_helper(model_num=1)
        world = get_world()
        # processes 0 and 1 will be workers, and 2 will be the trainer
        impala_group = world.create_rpc_group("impala", ["0", "1", "2"])

        if use_lr_sch:
            lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)],
                                             logger=default_logger)
            impala = IMPALA(actor,
                            critic,
                            t.optim.Adam,
                            nn.MSELoss(reduction='sum'),
                            impala_group,
                            servers,
                            lr_scheduler=LambdaLR,
                            lr_scheduler_args=((lr_func, ), (lr_func, )))
        else:
            impala = IMPALA(actor, critic, t.optim.Adam,
                            nn.MSELoss(reduction='sum'), impala_group, servers)
        return impala
Example #2
File: test_apex.py Project: lethaiq/machin
    def ddpg_apex(device, dtype, discrete=False):
        c = TestDDPGApex.c
        if not discrete:
            actor = smw(Actor(c.observe_dim, c.action_dim, c.action_range)
                        .type(dtype).to(device), device, device)
            actor_t = smw(Actor(c.observe_dim, c.action_dim, c.action_range)
                          .type(dtype).to(device), device, device)
        else:
            actor = smw(ActorDiscrete(c.observe_dim, c.action_dim)
                        .type(dtype).to(device), device, device)
            actor_t = smw(ActorDiscrete(c.observe_dim, c.action_dim)
                          .type(dtype).to(device), device, device)
        critic = smw(Critic(c.observe_dim, c.action_dim)
                     .type(dtype).to(device), device, device)
        critic_t = smw(Critic(c.observe_dim, c.action_dim)
                       .type(dtype).to(device), device, device)

        servers = model_server_helper(model_num=2)
        world = get_world()
        # processes 0 and 1 will be workers, and 2 will be the trainer
        apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
        ddpg_apex = DDPGApex(actor, actor_t, critic, critic_t,
                             t.optim.Adam,
                             nn.MSELoss(reduction='sum'),
                             apex_group,
                             servers,
                             replay_device="cpu",
                             replay_size=c.replay_size)
        return ddpg_apex
Example #3
File: apex.py Project: ikamensh/machin
    def init_from_config(cls, config: Union[Dict[str, Any], Config]):
        world = get_world()
        f_config = deepcopy(config["frame_config"])
        apex_group = world.create_rpc_group(
            group_name=f_config["apex_group_name"],
            members=(
                world.get_members()
                if f_config["apex_members"] == "all"
                else f_config["apex_members"]
            ),
        )

        models = assert_and_get_valid_models(f_config["models"])
        model_args = f_config["model_args"]
        model_kwargs = f_config["model_kwargs"]
        models = [
            m(*arg, **kwarg) for m, arg, kwarg in zip(models, model_args, model_kwargs)
        ]
        # wrap models in DistributedDataParallel when running in learner mode
        max_learner_id = f_config["learner_process_number"]

        learner_group = world.create_collective_group(ranks=list(range(max_learner_id)))

        if world.rank < max_learner_id:
            models = [
                DistributedDataParallel(module=m, process_group=learner_group.group)
                for m in models
            ]

        optimizer = assert_and_get_valid_optimizer(f_config["optimizer"])
        criterion = assert_and_get_valid_criterion(f_config["criterion"])(
            *f_config["criterion_args"], **f_config["criterion_kwargs"]
        )
        criterion.reduction = "none"
        lr_scheduler = f_config["lr_scheduler"] and assert_and_get_valid_lr_scheduler(
            f_config["lr_scheduler"]
        )
        servers = model_server_helper(
            model_num=1,
            group_name=f_config["model_server_group_name"],
            members=f_config["model_server_members"],
        )
        del f_config["optimizer"]
        del f_config["criterion"]
        del f_config["lr_scheduler"]
        frame = cls(
            *models,
            optimizer,
            criterion,
            apex_group,
            servers,
            lr_scheduler=lr_scheduler,
            **f_config
        )
        if world.rank >= max_learner_id:
            frame.update = lambda *_, **__: (None, None)
        return frame
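
For reference, the init_from_config method above only consumes keys read from config["frame_config"]. The sketch below is a hypothetical configuration assembled from those keys; the concrete values (registry names such as "QNet", "Adam", "MSELoss", and the member lists) are illustrative assumptions, not values taken from the project.

# Hypothetical frame_config sketch for the Ape-X init_from_config in Example #3.
# Key names mirror what the method reads; values are illustrative assumptions.
config = {
    "frame_config": {
        "apex_group_name": "apex",
        "apex_members": "all",                     # or an explicit list of member names
        "models": ["QNet", "QNet"],                # resolved by assert_and_get_valid_models
        "model_args": ((4, 2), (4, 2)),
        "model_kwargs": ({}, {}),
        "learner_process_number": 2,               # ranks below this are wrapped in DDP
        "optimizer": "Adam",                       # resolved by assert_and_get_valid_optimizer
        "criterion": "MSELoss",                    # resolved by assert_and_get_valid_criterion
        "criterion_args": (),
        "criterion_kwargs": {},
        "lr_scheduler": None,
        "model_server_group_name": "model_server",
        "model_server_members": ["0", "1", "2", "3"],  # assumption: explicit member names
        # keys other than optimizer/criterion/lr_scheduler are forwarded
        # to the frame constructor via **f_config
    }
}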
Example #4
File: test_ars.py Project: lethaiq/machin
 def ars_lr(device, dtype):
     c = TestARS.c
     actor = smw(
         ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device),
         device, device)
     lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)],
                                      logger=default_logger)
     servers = model_server_helper(model_num=1)
     world = get_world()
     ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
     ars = ARS(actor,
               t.optim.SGD,
               ars_group,
               servers,
               noise_size=1000000,
               lr_scheduler=LambdaLR,
               lr_scheduler_args=((lr_func, ), ))
     return ars
Example #5
File: test_apex.py Project: lethaiq/machin
 def dqn_apex(device, dtype):
     c = TestDQNApex.c
     q_net = smw(QNet(c.observe_dim, c.action_num)
                 .type(dtype).to(device), device, device)
     q_net_t = smw(QNet(c.observe_dim, c.action_num)
                   .type(dtype).to(device), device, device)
     servers = model_server_helper(model_num=1)
     world = get_world()
     # processes 0 and 1 will be workers, and 2 will be the trainer
     apex_group = world.create_rpc_group("apex", ["0", "1", "2"])
     dqn_apex = DQNApex(q_net, q_net_t,
                        t.optim.Adam,
                        nn.MSELoss(reduction='sum'),
                        apex_group,
                        servers,
                        replay_device="cpu",
                        replay_size=c.replay_size)
     return dqn_apex
Example #6
File: test_ars.py Project: lethaiq/machin
 def ars(device, dtype):
     c = TestARS.c
     actor = smw(
         ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device),
         device, device)
     servers = model_server_helper(model_num=1)
     world = get_world()
     ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
     ars = ARS(actor,
               t.optim.SGD,
               ars_group,
               servers,
               noise_std_dev=0.1,
               learning_rate=0.1,
               noise_size=1000000,
               rollout_num=6,
               used_rollout_num=6,
               normalize_state=True)
     return ars
Example #7
File: ars.py Project: ikamensh/machin
    def init_from_config(cls, config: Union[Dict[str, Any], Config]):
        world = get_world()
        f_config = copy.deepcopy(config["frame_config"])
        ars_group = world.create_rpc_group(
            group_name=f_config["ars_group_name"],
            members=(
                world.get_members()
                if f_config["ars_members"] == "all"
                else f_config["ars_members"]
            ),
        )

        models = assert_and_get_valid_models(f_config["models"])
        model_args = f_config["model_args"]
        model_kwargs = f_config["model_kwargs"]
        models = [
            m(*arg, **kwarg) for m, arg, kwarg in zip(models, model_args, model_kwargs)
        ]

        optimizer = assert_and_get_valid_optimizer(f_config["optimizer"])
        lr_scheduler = f_config["lr_scheduler"] and assert_and_get_valid_lr_scheduler(
            f_config["lr_scheduler"]
        )
        servers = model_server_helper(
            model_num=1,
            group_name=f_config["model_server_group_name"],
            members=f_config["model_server_members"],
        )
        del f_config["optimizer"]
        del f_config["lr_scheduler"]
        frame = cls(
            *models,
            optimizer,
            ars_group,
            servers,
            lr_scheduler=lr_scheduler,
            **f_config,
        )
        return frame
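
The ARS variant follows the same pattern, with ARS-specific key prefixes and no criterion. A hypothetical sketch of its frame_config (values are illustrative assumptions):

# Hypothetical frame_config sketch for ARS.init_from_config in Example #7.
# Key names mirror what the method reads; values are illustrative assumptions.
config = {
    "frame_config": {
        "ars_group_name": "ars",
        "ars_members": "all",                      # or an explicit list of member names
        "models": ["ActorDiscrete"],               # resolved by assert_and_get_valid_models
        "model_args": ((4, 2),),
        "model_kwargs": ({},),
        "optimizer": "SGD",                        # resolved by assert_and_get_valid_optimizer
        "lr_scheduler": None,
        "model_server_group_name": "model_server",
        "model_server_members": ["0", "1", "2"],   # assumption: explicit member names
    }
}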
Example #8
def main(rank):
    env = gym.make("Pendulum-v0")
    observe_dim = 3
    action_dim = 1
    action_range = 2
    max_episodes = 2000
    max_steps = 200
    noise_param = (0, 0.2)
    noise_mode = "normal"
    solved_reward = -150
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=2)
    apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"])

    actor = Actor(observe_dim, action_dim, action_range)
    actor_t = Actor(observe_dim, action_dim, action_range)
    critic = Critic(observe_dim, action_dim)
    critic_t = Critic(observe_dim, action_dim)

    ddpg_apex = DDPGApex(actor, actor_t, critic, critic_t, t.optim.Adam,
                         nn.MSELoss(reduction='sum'), apex_group, servers)

    # synchronize all processes in the group to make sure the
    # distributed buffer has been created on all processes in apex_group
    apex_group.barrier()

    # manually control syncing to improve performance
    ddpg_apex.set_sync(False)
    if rank in (0, 1):
        # Processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for the learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            ddpg_apex.manual_sync()
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = ddpg_apex.act_with_noise({"state": old_state},
                                                      noise_param=noise_param,
                                                      mode=noise_mode)
                    state, reward, terminal, _ = env.step(action.numpy())
                    state = t.tensor(state, dtype=t.float32)\
                        .view(1, observe_dim)
                    total_reward += reward[0]

                    ddpg_apex.store_transition({
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": reward[0],
                        "terminal": terminal or step == max_steps
                    })

            smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                     total_reward * 0.1)
            logger.info("Process {} Episode {} total reward={:.2f}".format(
                rank, episode, smoothed_total_reward))

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")

                    # exiting here will cause torch RPC to complain,
                    # since other processes may not have finished yet;
                    # this is just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while ddpg_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            ddpg_apex.update()
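
The main(rank) entry points in Examples #8 through #11 must be started once per rank. The launcher below is a minimal sketch, assuming torch.multiprocessing.spawn and that the underlying RPC/collective rendezvous reads MASTER_ADDR/MASTER_PORT from the environment; the original snippets do not show how their processes are launched.

# Minimal launcher sketch for the DDPG Ape-X example above (not part of the
# original snippet). Assumes rendezvous via MASTER_ADDR/MASTER_PORT.
import os
import torch.multiprocessing as mp

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "localhost")  # assumption: rendezvous address
    os.environ.setdefault("MASTER_PORT", "29500")      # assumption: rendezvous port
    # mp.spawn passes the process index as the first argument, matching main(rank);
    # the example uses world_size=4, so spawn 4 processes
    mp.spawn(main, nprocs=4, join=True)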
Example #9
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=3, rank=rank, name=str(rank), rpc_timeout=20)

    actor = dmw(ActorDiscrete(observe_dim, action_num))
    servers = model_server_helper(model_num=1)
    ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
    ars = ARS(
        actor,
        t.optim.SGD,
        ars_group,
        servers,
        noise_std_dev=0.1,
        learning_rate=0.1,
        noise_size=1000000,
        rollout_num=6,
        used_rollout_num=6,
        normalize_state=True,
    )

    # begin training
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        all_reward = 0
        for at in ars.get_actor_types():
            total_reward = 0
            terminal = False
            step = 0

            # batch size = 1
            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    # agent model inference
                    action = ars.act({"state": state}, at)
                    state, reward, terminal, __ = env.step(action)
                    state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

            ars.store_reward(total_reward, at)
            all_reward += total_reward

        # update
        ars.update()

        # show reward
        smoothed_total_reward = (
            smoothed_total_reward * 0.9 + all_reward / len(ars.get_actor_types()) * 0.1
        )
        logger.info(
            f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
        )

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                # exiting here will cause torch RPC to complain,
                # since other processes may not have finished yet;
                # this is just for demonstration.
                exit(0)
        else:
            reward_fulfilled = 0
Example #10
File: dqn_apex.py Project: iffiX/machin
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=1)
    apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"])

    if rank in (2, 3):
        # learner_group.group is the wrapped torch.distributed.ProcessGroup
        learner_group = world.create_collective_group(ranks=[2, 3])

        # wrap the model with DistributedDataParallel
        # if current process is learner process 2 or 3
        q_net = DistributedDataParallel(module=QNet(observe_dim, action_num),
                                        process_group=learner_group.group)
        q_net_t = DistributedDataParallel(module=QNet(observe_dim, action_num),
                                          process_group=learner_group.group)
    else:
        q_net = QNet(observe_dim, action_num)
        q_net_t = QNet(observe_dim, action_num)

    # we may use a smaller batch size to train if we are using
    # DistributedDataParallel
    dqn_apex = DQNApex(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        batch_size=50,
    )

    # synchronize all processes in the group to make sure the
    # distributed buffer has been created on all processes in apex_group
    apex_group.barrier()

    # manually control syncing to improve performance
    dqn_apex.set_sync(False)
    if rank in (0, 1):
        # Processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for the learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            dqn_apex.manual_sync()
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = dqn_apex.act_discrete_with_noise(
                        {"state": old_state})
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state,
                                     dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

                    dqn_apex.store_transition({
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": reward,
                        "terminal": terminal or step == max_steps,
                    })

            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")

                    # exiting here will cause torch RPC to complain,
                    # since other processes may not have finished yet;
                    # this is just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while dqn_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            dqn_apex.update()
Example #11
File: impala.py Project: iffiX/machin
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=1)
    impala_group = world.create_rpc_group("impala", ["0", "1", "2", "3"])

    if rank in (2, 3):
        # learner_group.group is the wrapped torch.distributed.ProcessGroup
        learner_group = world.create_collective_group(ranks=[2, 3])

        # wrap the model with DistributedDataParallel
        # if current process is learner process 2 or 3
        actor = DistributedDataParallel(module=Actor(observe_dim, action_num),
                                        process_group=learner_group.group)
        critic = DistributedDataParallel(module=Critic(observe_dim),
                                         process_group=learner_group.group)
    else:
        actor = Actor(observe_dim, action_num)
        critic = Critic(observe_dim)

    # we may use a smaller batch size to train if we are using
    # DistributedDataParallel

    # note: since the IMPALA framework stores a whole episode
    # as a single sample, a small batch size is used here
    impala = IMPALA(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        impala_group,
        servers,
        batch_size=2,
    )

    # synchronize all processes in the group to make sure the
    # distributed buffer has been created on all processes in impala_group
    impala_group.barrier()

    # manually control syncing to improve performance
    impala.set_sync(False)
    if rank in (0, 1):
        # Processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for the learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            impala.manual_sync()
            tmp_observations = []
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action, action_log_prob, *_ = impala.act(
                        {"state": old_state})
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state,
                                     dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

                    tmp_observations.append({
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": reward,
                        "action_log_prob": action_log_prob.item(),
                        "terminal": terminal or step == max_steps,
                    })

            impala.store_episode(tmp_observations)
            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")

                    # exiting here will cause torch RPC to complain,
                    # since other processes may not have finished yet;
                    # this is just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        # note: since the IMPALA framework stores a whole episode
        # as a single sample, we only wait for a small number of them
        while impala.replay_buffer.all_size() < 5:
            sleep(0.1)
        while True:
            impala.update()
Example #12
    def init_from_config(
        cls,
        config: Union[Dict[str, Any], Config],
        model_device: Union[str, t.device] = "cpu",
    ):
        world = get_world()
        f_config = deepcopy(config["frame_config"])
        impala_group = world.create_rpc_group(
            group_name=f_config["impala_group_name"],
            members=(
                world.get_members()
                if f_config["impala_members"] == "all"
                else f_config["impala_members"]
            ),
        )

        models = assert_and_get_valid_models(f_config["models"])
        model_args = f_config["model_args"]
        model_kwargs = f_config["model_kwargs"]
        models = [
            m(*arg, **kwarg).to(model_device)
            for m, arg, kwarg in zip(models, model_args, model_kwargs)
        ]
        # wrap models in DistributedDataParallel when running in learner mode
        max_learner_id = f_config["learner_process_number"]

        learner_group = world.create_collective_group(ranks=list(range(max_learner_id)))

        if world.rank < max_learner_id:
            models = [
                DistributedDataParallel(module=m, process_group=learner_group.group)
                for m in models
            ]

        optimizer = assert_and_get_valid_optimizer(f_config["optimizer"])
        criterion = assert_and_get_valid_criterion(f_config["criterion"])(
            *f_config["criterion_args"], **f_config["criterion_kwargs"]
        )
        lr_scheduler = f_config["lr_scheduler"] and assert_and_get_valid_lr_scheduler(
            f_config["lr_scheduler"]
        )
        servers = model_server_helper(
            model_num=1,
            group_name=f_config["model_server_group_name"],
            members=f_config["model_server_members"],
        )
        del f_config["optimizer"]
        del f_config["criterion"]
        del f_config["lr_scheduler"]
        frame = cls(
            *models,
            optimizer,
            criterion,
            impala_group,
            servers,
            lr_scheduler=lr_scheduler,
            **f_config,
        )
        if world.rank >= max_learner_id:
            frame.role = "sampler"
            frame.update = _disable_update
        else:
            frame.role = "learner"
        return frame
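
The _disable_update helper referenced above is not included in this excerpt. Example #3 achieves the same effect by assigning a lambda that returns (None, None), so a plausible stand-in, under that assumption only, would be:

# Plausible stand-in for the _disable_update helper referenced in Example #12
# (assumption: it mirrors the lambda used in Example #3 and is simply a no-op update).
def _disable_update(*_, **__):
    return None, None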