Example #1
 def test_seed(self, envs, idx):
     for name, creators in zip(*envs):
         default_logger.info(f"Testing on env {name}")
         subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
         seeds = subproc_wrapper.seed()
         subproc_wrapper.close()
         assert len(seeds) == ENV_NUM
Example #2
    def test_full_train(self, train_config, rainbow_train):
        c = train_config

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = rainbow_train.act_discrete_with_noise(
                        {"state": old_state.unsqueeze(0)}
                    )
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps
                    })

            rainbow_train.store_episode(tmp_observations)
            # update
            if episode.get() > 100:
                for _ in range(step.get()):
                    rainbow_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info("Episode {} total reward={:.2f}"
                        .format(episode, smoother.value))

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("RAINBOW Training failed.")
Example #3
    def remove_trials_older_than(
        self,
        diff_day: int = 0,
        diff_hour: int = 1,
        diff_minute: int = 0,
        diff_second: int = 0,
    ):
        """
        By default, this function removes all trials started more than one
        hour before the current time.

        Args:
            diff_day: Difference in days.
            diff_hour: Difference in hours.
            diff_minute: Difference in minutes.
            diff_second: Difference in seconds.
        """
        trial_list = os.listdir(self.env_root)
        current_time = datetime.now()
        diff_threshold = timedelta(
            days=diff_day, hours=diff_hour, minutes=diff_minute, seconds=diff_second
        )
        for file in trial_list:
            try:
                time = datetime.strptime(file, self.time_format)
            except ValueError:
                # not a trial
                pass
            else:
                diff_time = current_time - time
                if diff_time > diff_threshold:
                    rm_path = join(self.env_root, file)
                    default_logger.info(f"Removing trial directory: {rm_path}")
                    shutil.rmtree(rm_path)
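
A hedged usage sketch of the method above. `manager` is a hypothetical stand-in for an instance of the class that defines `remove_trials_older_than` (its real name is not shown in the snippet); `env_root` is assumed to contain one sub-directory per trial, named according to `time_format`.

    # Hypothetical usage; `manager` is an instance of the class shown above.
    # Default behaviour: remove every trial directory started more than
    # one hour before the current time.
    manager.remove_trials_older_than()

    # Remove trials older than two days, keeping everything newer.
    manager.remove_trials_older_than(diff_day=2, diff_hour=0)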
Example #4
File: test_a2c.py  Project: iffiX/machin
    def test_full_train(self, train_config, a2c_train, gae_lambda):
        c = train_config
        a2c_train.gae_lambda = gae_lambda

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        env.seed(0)
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a2c_train.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            a2c_train.store_episode(tmp_observations)
            a2c_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("A2C Training failed.")
Example #5
 def test_active(self, envs):
     for name, creators in zip(*envs):
         default_logger.info(f"Testing on env {name}")
         subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
         subproc_wrapper.reset()
         active = subproc_wrapper.active()
         subproc_wrapper.close()
         assert len(active) == ENV_NUM
Example #6
 def test_render(self, envs, idx, render_num):
     for name, creators in zip(*envs):
         default_logger.info(f"Testing on env {name}")
         subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
         subproc_wrapper.reset(idx)
         rendered = subproc_wrapper.render(idx)
         subproc_wrapper.close()
         assert len(rendered) == render_num
         assert isinstance(rendered[0], np.ndarray)
         assert rendered[0].ndim == 3 and rendered[0].shape[-1] == 3
Example #7
 def on_train_batch_end(self, trainer, pl_module, outputs, batch,
                        _batch_idx, _dataloader_idx) -> None:
     for log in batch[0].logs:
         if "total_reward" in log:
             self.max_total_reward = max(log["total_reward"],
                                         self.max_total_reward)
             default_logger.info(
                 f"Current max total reward={self.max_total_reward:.2f}.")
             trainer.should_stop = self.max_total_reward >= 150
             return
     default_logger.error("Missing total reward in logs.")
Example #8
    def test_reset(self, envs, idx, reset_num):
        for name, creators in zip(*envs):
            default_logger.info(f"Testing on env {name}")
            dummy_wrapper = openai_gym.ParallelWrapperDummy(creators)
            obsrvs = dummy_wrapper.reset(idx)
            dummy_wrapper.close()

            assert len(obsrvs) == reset_num
            for obsrv in obsrvs:
                assert dummy_wrapper.observation_space.contains(
                    obsrv
                ), "Required observation form: {}, Actual observation: {}".format(
                    str(dummy_wrapper.observation_space), obsrv)
Example #9
 def perturb_adjust_hook(_model, _input, output):
     if perturb_switch.get():
         tmp_action["with_noise"] = output.clone()
     else:
         tmp_action["without_noise"] = output.clone()
     if "with_noise" in tmp_action and "without_noise" in tmp_action:
         # Compute distance between two actions generated by
         # noisy parameters and original parameters.
         with t.no_grad():
             dist = distance_func(tmp_action["with_noise"],
                                  tmp_action["without_noise"])
             tmp_action.clear()
             param_noise_spec.adapt(dist)
             logger.info("Current output distance: {}".format(dist))
             logger.info("Current param noise stddev: {}".format(
                 param_noise_spec.get_dev()))
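
A sketch of how such a hook could be attached, for context: `actor` stands in for any `torch.nn.Module` whose forward output is the action tensor, and `perturb_switch`, `param_noise_spec`, `distance_func`, and `tmp_action` are assumed to be set up as in the snippet above.

    # Hypothetical registration; torch invokes the hook as
    # hook(module, input, output) after every forward pass, which matches
    # perturb_adjust_hook's signature.
    handle = actor.register_forward_hook(perturb_adjust_hook)

    # Run one forward pass with noisy parameters (perturb_switch on) and one
    # with the original parameters (perturb_switch off); the hook then adapts
    # param_noise_spec from the distance between the two outputs.

    handle.remove()  # detach the hook once noise adaptation is no longer needed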
Example #10
 def on_train_batch_end(self, trainer, pl_module, outputs, batch,
                        _batch_idx, _dataloader_idx) -> None:
     for log in batch[0].logs:
         if "total_reward" in log:
             self.max_total_reward = max(log["total_reward"],
                                         self.max_total_reward)
             default_logger.info(
                 f"Process [{get_cur_rank()}] "
                 f"Current max total reward={self.max_total_reward:.2f}.")
             self.queue.put((get_cur_rank(), self.max_total_reward))
             t_plugin = trainer.training_type_plugin
             trainer.should_stop = self.reduce_early_stopping_decision(
                 trainer, t_plugin)
             if trainer.should_stop:
                 default_logger.info(
                     f"Process [{get_cur_rank()}] decides to exit.")
             return
     default_logger.error("Missing total reward in logs.")
Example #11
    def test_step(self, envs, idx, act_num):
        for name, creators in zip(*envs):
            default_logger.info(f"Testing on env {name}")
            dummy_wrapper = openai_gym.ParallelWrapperDummy(creators)
            action = [
                mock_action(dummy_wrapper.action_space) for _ in range(act_num)
            ]
            dummy_wrapper.reset(idx)
            obsrvs, reward, terminal, info = dummy_wrapper.step(action, idx)
            dummy_wrapper.close()

            assert len(obsrvs) == act_num
            assert len(reward) == act_num
            assert len(terminal) == act_num
            assert len(info) == act_num and isinstance(info[0], dict)
            for obsrv in obsrvs:
                assert dummy_wrapper.observation_space.contains(
                    obsrv
                ), "Required observation form: {}, Actual observation: {}".format(
                    str(dummy_wrapper.observation_space), obsrv)
Example #12
def generate():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)

    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([{
            "state": s["state"],
            "action": s["action"]
        } for s in run_episode(ppo, env)[0]])
    archive = Archive(
        path=os.path.join(
            ROOT, "generated", f"{generated_name}_{get_time_string()}"
        )
    )
    archive.add_item("expert_trajectories", trajectories)
    archive.save()
    logger.info(f"Trajectories saved as {archive.path}")
Example #13
    def test_dqn_apex_cpu_spawn_full_train(self, tmpdir):
        # by default, pytorch lightning will use ddp-spawn mode to replace ddp
        # if there are only cpus
        os.environ["WORLD_SIZE"] = "3"
        config = generate_env_config("CartPole-v0", {})
        config = generate_training_config(root_dir=tmpdir.make_numbered_dir(),
                                          config=config)
        config = generate_algorithm_config("DQNApex", config)
        # use ddp_cpu
        config["gpus"] = None
        config["num_processes"] = 3
        # this testing process corresponds to this node
        config["num_nodes"] = 1
        config["early_stopping_patience"] = 100
        # Use classes instead of string names since the algorithm is distributed.
        config["frame_config"]["models"] = [QNet, QNet]
        config["frame_config"]["model_kwargs"] = [
            {
                "state_dim": 4,
                "action_num": 2
            },
            {
                "state_dim": 4,
                "action_num": 2
            },
        ]

        # for spawn we use a special callback, because we cannot access
        # max_total_reward from sub-processes
        queue = SimpleQueue(ctx=mp.get_context("spawn"))
        # cb = [SpawnInspectCallback(queue), LoggerDebugCallback()]
        cb = [SpawnInspectCallback(queue)]
        t = Thread(target=launch, args=(config, ), kwargs={"pl_callbacks": cb})
        t.start()

        default_logger.info("Start tracking")
        subproc_max_total_reward = [0, 0, 0]
        while True:
            try:
                result = queue.quick_get(timeout=60)
                default_logger.info(
                    f"Result from process [{result[0]}]: {result[1]}")
                subproc_max_total_reward[result[0]] = result[1]
            except TimeoutError:
                # no more results
                default_logger.info("No more results.")
                break
        t.join()
        assert (
            sum(subproc_max_total_reward) / 3 >= 150
        ), f"Max total reward {sum(subproc_max_total_reward) / 3} below threshold 150."
Example #14
 def test_cpu_shared_tensor(self):
     x = [t.ones([10]) * i for i in range(5)]
     for xx in x:
         xx.share_memory_()
     logger.info("CPU tensors created.")
     pool = self.pool_impl(processes=2, is_copy_tensor=False, share_method="cpu")
     logger.info("Pool created.")
     assert all(
         out == expect_out
         for out, expect_out in zip(pool.map(func, x), [0, 20, 40, 60, 80])
     )
     pool.close()
     pool.join()
     logger.info("Pool joined.")
Example #15
File: gail.py  Project: iffiX/machin
def generate_expert_episodes():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)

    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))
    logger.info("Training expert PPO")

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([{
            "state": s["state"],
            "action": s["action"]
        } for s in run_episode(ppo, env)[0]])
    return trajectories
Example #16
                action = ppo.act({"mem": history.get()})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = convert(state)
                total_reward += reward

                old_history = history.get()
                new_history = history.append(state).get()
                tmp_observations.append({
                    "state": {
                        "mem": old_history
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "mem": new_history
                    },
                    "reward": reward,
                    "terminal": terminal,
                })

        # update
        ppo.store_episode(tmp_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1

        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")
Example #17
                        "state": old_state
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "state": state
                    },
                    "reward": reward,
                    "terminal": terminal or step == max_steps
                })

        # update, update more if episode is longer, else less
        if episode > 100:
            for _ in range(step):
                dqn.update()

        # show reward
        smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                 total_reward * 0.1)
        logger.info("Episode {} total reward={:.2f}".format(
            episode, smoothed_total_reward))

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                exit(0)
        else:
            reward_fulfilled = 0
Example #18
                    "state": {
                        "state": old_state
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "state": state
                    },
                    "reward": reward,
                    "terminal": terminal or step == max_steps,
                })

        # update, update more if episode is longer, else less
        if episode > 100:
            for _ in range(step):
                dqn.update()

        # show reward
        smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
        logger.info(
            f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                exit(0)
        else:
            reward_fulfilled = 0
Example #19
    def test_full_train(self, train_config, sac_train):
        c = train_config
        sac_train.target_entropy = -c.action_dim

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    action = sac_train.act({"state": old_state.unsqueeze(0)})[0]

                    state, reward, terminal, _ = env.step(action.cpu().numpy())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    sac_train.store_transition({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    })
            # update
            if episode > 100:
                for i in range(step.get()):
                    sac_train.update()
                logger.info(
                    f"new entropy alpha: {sac_train.entropy_alpha.item()}")

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("SAC Training failed.")
Example #20
def main(rank):
    env = gym.make("Pendulum-v0")
    observe_dim = 3
    action_dim = 1
    action_range = 2
    max_episodes = 2000
    max_steps = 200
    noise_param = (0, 0.2)
    noise_mode = "normal"
    solved_reward = -150
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=2)
    apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"])

    actor = Actor(observe_dim, action_dim, action_range)
    actor_t = Actor(observe_dim, action_dim, action_range)
    critic = Critic(observe_dim, action_dim)
    critic_t = Critic(observe_dim, action_dim)

    ddpg_apex = DDPGApex(actor, actor_t, critic, critic_t, t.optim.Adam,
                         nn.MSELoss(reduction='sum'), apex_group, servers)

    # synchronize all processes in the group, making sure the
    # distributed buffer has been created on all processes in apex_group
    apex_group.barrier()

    # manually control syncing to improve performance
    ddpg_apex.set_sync(False)
    if rank in (0, 1):
        # Processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            ddpg_apex.manual_sync()
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = ddpg_apex.act_with_noise({"state": old_state},
                                                      noise_param=noise_param,
                                                      mode=noise_mode)
                    state, reward, terminal, _ = env.step(action.numpy())
                    state = t.tensor(state, dtype=t.float32)\
                        .view(1, observe_dim)
                    total_reward += reward[0]

                    ddpg_apex.store_transition({
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": reward[0],
                        "terminal": terminal or step == max_steps,
                    })

            smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                     total_reward * 0.1)
            logger.info("Process {} Episode {} total reward={:.2f}".format(
                rank, episode, smoothed_total_reward))

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")

                    # will cause torch RPC to complain
                    # since other processes may have not finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while ddpg_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            ddpg_apex.update()
Example #21
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=3, rank=rank, name=str(rank), rpc_timeout=20)

    actor = dmw(ActorDiscrete(observe_dim, action_num))
    servers = model_server_helper(model_num=1)
    ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
    ars = ARS(
        actor,
        t.optim.SGD,
        ars_group,
        servers,
        noise_std_dev=0.1,
        learning_rate=0.1,
        noise_size=1000000,
        rollout_num=6,
        used_rollout_num=6,
        normalize_state=True,
    )

    # begin training
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        all_reward = 0
        for at in ars.get_actor_types():
            total_reward = 0
            terminal = False
            step = 0

            # batch size = 1
            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    # agent model inference
                    action = ars.act({"state": state}, at)
                    state, reward, terminal, __ = env.step(action)
                    state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

            ars.store_reward(total_reward, at)
            all_reward += total_reward

        # update
        ars.update()

        # show reward
        smoothed_total_reward = (
            smoothed_total_reward * 0.9 + all_reward / len(ars.get_actor_types()) * 0.1
        )
        logger.info(
            f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
        )

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                # will cause torch RPC to complain
                # since other processes may have not finished yet.
                # just for demonstration.
                exit(0)
        else:
            reward_fulfilled = 0
Example #22
File: dqn_apex.py  Project: iffiX/machin
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=1)
    apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"])

    if rank in (2, 3):
        # learner_group.group is the wrapped torch.distributed.ProcessGroup
        learner_group = world.create_collective_group(ranks=[2, 3])

        # wrap the model with DistributedDataParallel
        # if current process is learner process 2 or 3
        q_net = DistributedDataParallel(module=QNet(observe_dim, action_num),
                                        process_group=learner_group.group)
        q_net_t = DistributedDataParallel(module=QNet(observe_dim, action_num),
                                          process_group=learner_group.group)
    else:
        q_net = QNet(observe_dim, action_num)
        q_net_t = QNet(observe_dim, action_num)

    # we may use a smaller batch size to train if we are using
    # DistributedDataParallel
    dqn_apex = DQNApex(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        batch_size=50,
    )

    # synchronize all processes in the group, making sure the
    # distributed buffer has been created on all processes in apex_group
    apex_group.barrier()

    # manually control syncing to improve performance
    dqn_apex.set_sync(False)
    if rank in (0, 1):
        # Processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            dqn_apex.manual_sync()
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = dqn_apex.act_discrete_with_noise(
                        {"state": old_state})
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state,
                                     dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

                    dqn_apex.store_transition({
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": reward,
                        "terminal": terminal or step == max_steps,
                    })

            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")

                    # will cause torch RPC to complain
                    # since other processes may have not finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while dqn_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            dqn_apex.update()
Example #23
    def test_full_train(self, train_config, maddpg_train):
        c = train_config

        # begin training
        episode, step = Counter(), Counter()

        # first for prey, second for pred
        smoother = Smooth()
        reward_fulfilled = Counter()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            states = [t.tensor(st, dtype=t.float32) for st in env.reset()]

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_states = states

                    # agent model inference
                    results = maddpg_train.act_discrete_with_noise(
                        [{"state": st.unsqueeze(0)} for st in states]
                    )
                    actions = [int(r[0]) for r in results]
                    action_probs = [r[1] for r in results]

                    states, rewards, terminals, _ = env.step(actions)
                    states = [t.tensor(st, dtype=t.float32) for st in states]

                    total_reward += float(sum(rewards)) / c.agent_num

                    maddpg_train.store_transitions(
                        [
                            {
                                "state": {"state": ost.unsqueeze(0)},
                                "action": {"action": act},
                                "next_state": {"state": st.unsqueeze(0)},
                                "reward": float(rew),
                                "terminal": term or step == c.max_steps,
                            }
                            for ost, act, st, rew, term in zip(
                                old_states, action_probs, states, rewards, terminals
                            )
                        ]
                    )

            # update
            if episode > 5:
                for i in range(step.get()):
                    maddpg_train.update()

            # total reward is divided by steps here, since:
            # "Agents are rewarded based on minimum agent distance
            #  to each landmark, penalized for collisions"
            smoother.update(total_reward / step.get())
            logger.info(f"Episode {episode} total steps={step}")
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward and episode > 20:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("MADDPG Training failed.")
Example #24
    def __init__(
        self,
        models: List[nn.Module],
        model_connection: Dict[Tuple[int, int], int],
        devices: List[Union[t.device, str]] = None,
        model_size_multiplier=2,
        max_mem_ratio=0.5,
        cpu_weight=0,
        connection_weight=2,
        size_match_weight=1e-2,
        complexity_match_weight=1,
        entropy_weight=1,
        iterations=500,
        update_rate=0.01,
        gpu_gpu_distance=1,
        cpu_gpu_distance=10,
        move_models=True,
    ):
        """
        Assign models to different devices, within the scope of a single
        process.
        Assigner assumes all GPUs have the **same processing power**.

        Assignment is based on four aspects:

        1. Distance and model connections. Connection is usually indicated
           by the amount of data transmitted between two models.
        2. Compute complexity.
        3. Model size.
        4. Entropy.

        Four aspects are controlled by four weights:

        1. ``connection_weight``, the assigner will try to reduce the total
           ``distance * connection`` if this weight is larger.
        2. ``size_match_weight``, this weight controls the total memory
           space used on a single device. It only takes effect if the total
           assigned memory of models exceeds the allowed device memory size
           (internally it uses a relu activation); the larger the weight,
           the tighter and more restricted the fit.
        3. ``complexity_match_weight``, this weight balances the model
           computation cost across devices; the assigner will try to even
           out the ``computation cost / compute power`` ratio for each
           device if this weight is larger.
        4. ``entropy_weight``, this weight minimizes the uncertainty of the
           model placement probability, so ``model i`` will have a
           probability close to 1 of being placed on some ``device j`` if
           this weight is larger.

        Assignment uses gradient descent to compute the probability matrix
        of each ``model i`` locating on each available ``device j``.

        See Also:
            :class:`.ModelSizeEstimator`

        Note:
            When the total size of your models is very close to the capacity
            of your device memory, `ModelAssigner` does not respond very well
            to ``size_match_weight``; therefore, please consider increasing
            ``model_size_multiplier`` or decreasing ``max_mem_ratio``.

        Args:
            models: Models to assign.
            model_connection: Connection weight between modules.
                **Must be positive**
            devices: Available devices.
            model_size_multiplier: Size multiplier of models, used to reserve
                enough space for models.
            max_mem_ratio: Maximum percent of memory allowed.
            cpu_weight: Weight of cpu. Relative to the computing power of one
                GPU. By default it is 0 so no computation will be performed on
                CPU. **Must be positive**
            connection_weight: Weight of connection between models.
            size_match_weight: Weight of size match.
            complexity_match_weight: Weight of complexity match.
            entropy_weight: Weight of entropy.
            iterations: Number of optimization iterations.
            update_rate: Learning rate of the adam optimizer.
            gpu_gpu_distance: Estimated distance cost between gpu-gpu.
                **Must be positive**
            cpu_gpu_distance: Estimated distance cost between cpu-gpu.
                **Must be positive**
            move_models: Whether to automatically move the models after
                assignment.
        """
        if devices is None:
            devices = [
                t.device(type="cuda", index=i)
                for i in GPUtil.getAvailable(order="load")
            ]
        else:
            devices = [t.device(d) for d in devices]
            available_devices = [
                t.device(type="cuda", index=i)
                for i in GPUtil.getAvailable(order="load")
            ]
            used_devices = []
            for dev in devices:
                if dev.type == "cuda" and dev not in available_devices:
                    default_logger.info(
                        f"Warning: device {dev} not available, removed.")
                else:
                    used_devices.append(dev)
            devices = used_devices

        if not devices:
            devices = [t.device("cpu")]

        default_logger.info(f"Using these devices: {devices}")

        sizes = [
            ModelSizeEstimator(model, model_size_multiplier).estimate_size()
            for model in models
        ]
        device_size_capacity = []
        device_complexity_capacity = []

        gpus = GPUtil.getGPUs()
        for dev in devices:
            if dev.type == "cpu":
                device_size_capacity.append(
                    int(psutil.virtual_memory().available / 1024**2) *
                    max_mem_ratio)
                device_complexity_capacity.append(cpu_weight)
            elif dev.type == "cuda":
                device_size_capacity.append(gpus[dev.index].memoryFree *
                                            max_mem_ratio)
                device_complexity_capacity.append(1 - gpus[dev.index].load)

        if np.sum(np.array(sizes)) > np.sum(device_size_capacity):
            raise RuntimeError(
                f"Estimated model will use {np.sum(np.array(sizes)):.2f} MB, "
                f"but only have {np.sum(device_size_capacity):.2f} MB allowed memory "
                "in total.")

        # assign models to devices using a heuristic and gradient descent
        device_num = len(devices)
        model_num = len(models)

        # Important: the placement probability matrix. Entry (i, j) of this
        # matrix is the probability of placing model i on device j.
        placement = t.randn([model_num, device_num], requires_grad=True)

        optimizer = t.optim.Adam([placement], lr=update_rate)
        model_size = t.tensor(sizes, dtype=t.float).view([1, model_num])
        size_capacity = t.tensor(device_size_capacity,
                                 dtype=t.float).view([1, device_num])
        model_complexity = model_size

        # complexity_capacity is basically the estimated computing power
        # of devices.
        complexity_capacity = t.tensor(device_complexity_capacity,
                                       dtype=t.float).view([1, device_num])

        # model connection indicates the amount of data transmitted between
        # each pair of models, a weighted adjacency matrix.
        model_conn = t.zeros([model_num, model_num])

        for direction, conn in model_connection.items():
            model_conn[direction[0], direction[1]] = conn

        # device distance matrix
        device_distance = t.zeros([device_num, device_num])
        for i in range(device_num):
            for j in range(i):
                if (devices[i].type == "cpu" and devices[j].type == "cuda") or (
                    devices[i].type == "cuda" and devices[j].type == "cpu"
                ):
                    device_distance[i, j] = cpu_gpu_distance
                    device_distance[j, i] = cpu_gpu_distance
                elif (
                    devices[i].type == "cuda"
                    and devices[j].type == "cuda"
                    and devices[i].index != devices[j].index
                ):
                    device_distance[i, j] = gpu_gpu_distance
                    device_distance[j, i] = gpu_gpu_distance

        # optimize
        for _ in range(iterations):
            self.optimize_placement(
                optimizer,
                placement,
                model_size,
                size_capacity,
                model_complexity,
                complexity_capacity,
                model_conn,
                device_distance,
                connection_weight,
                size_match_weight,
                complexity_match_weight,
                entropy_weight,
            )
        self._assignment = [
            devices[d] for d in t.argmax(placement, dim=1).tolist()
        ]
        if move_models:
            for model, ass_device in zip(models, self._assignment):
                model.to(ass_device)
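
For reference, a minimal usage sketch of the `ModelAssigner` constructor documented above. The models, connection value, and device list below are made up for illustration; the only API assumed is the signature shown in this example, and the import path is omitted because it is not part of the snippet.

    import torch.nn as nn

    # ModelAssigner is the class defined above (import path omitted here).
    # Two toy models that exchange a lot of data (index 0 feeds index 1).
    encoder = nn.Sequential(nn.Linear(128, 256), nn.ReLU())
    decoder = nn.Sequential(nn.Linear(256, 128), nn.ReLU())

    # model_connection maps (model index, model index) -> connection strength,
    # i.e. roughly the amount of data transmitted between the two models.
    assigner = ModelAssigner(
        models=[encoder, decoder],
        model_connection={(0, 1): 1000},
        devices=["cuda:0", "cpu"],  # unavailable CUDA devices are dropped
        connection_weight=2,
        move_models=True,  # move each model to its assigned device
    )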
Example #25
 def test_close(self, envs):
     for name, creators in zip(*envs):
         default_logger.info(f"Testing on env {name}")
         subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
         subproc_wrapper.close()
Example #26
                state, reward, terminal, _ = env.step(action.item())
                state = convert(state)
                total_reward += reward

                tmp_observations.append({
                    "state": {
                        "mem": old_state,
                        "hidden": old_hidden
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "mem": state,
                        "hidden": hidden
                    },
                    "reward": reward,
                    "terminal": terminal
                })

        # update
        rppo.store_episode(tmp_observations)
        rppo.update()

        # show reward
        smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                 total_reward * 0.1)

        logger.info("Episode {} total reward={:.2f}".format(
            episode, smoothed_total_reward))
Example #27
def fnTrain():
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0
    iNumOfTrainSamples = env.fnNumIterations()
    afRewardArray = []
    fMaxRewardSum = -np.inf
    while episode < iNumOfTrainSamples:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(
            1, env.observation_spec().shape[0]
        )

        while not terminal and step <= max_steps:
            step += 1
            with t.no_grad():
                old_state = state
                # agent model inference
                action = dqn.act_discrete_with_noise({"some_state": old_state})
                state, reward, terminal, oInfo = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(
                    1, env.observation_spec().shape[0]
                )
                total_reward += reward

                dqn.store_transition({
                    "state": {
                        "some_state": old_state
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "some_state": state
                    },
                    "reward": np.float32(reward),
                    "terminal": terminal or step == max_steps
                })

        # update, update more if episode is longer, else less
        if episode > 100:
            for _ in range(step):
                dqn.update()

        # show reward
        smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                 total_reward * 0.1)
        logger.info("Episode {} of {} ({:.2f}%), total reward={:.2f}".format(
            episode, iNumOfTrainSamples, 100.00 * episode / iNumOfTrainSamples,
            smoothed_total_reward))

        if solved_repeat <= len(afRewardArray):
            afRewardArray.pop(0)
        afRewardArray.append(smoothed_total_reward)
        fRewardSum = np.sum(afRewardArray)
        if fMaxRewardSum < fRewardSum:
            fMaxRewardSum = fRewardSum
            dqn.save(g_sModel1)
            print("Reward sum={}".format(fMaxRewardSum))
Example #28
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    _world = World(world_size=3, rank=rank,
                   name=str(rank), rpc_timeout=20)

    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)

    # in all test scenarios, all processes will be used as reducers
    servers = grad_server_helper(
        [lambda: Actor(observe_dim, action_num),
         lambda: Critic(observe_dim)],
        learning_rate=5e-3
    )
    a3c = A3C(actor, critic,
              nn.MSELoss(reduction='sum'),
              servers)

    # manually control syncing to improve performance
    a3c.set_sync(False)

    # begin training
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0

        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

        # manually pull the newest parameters
        a3c.manual_sync()
        tmp_observations = []
        while not terminal and step <= max_steps:
            step += 1
            with t.no_grad():
                old_state = state
                # agent model inference
                action = a3c.act({"state": old_state})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                total_reward += reward

                tmp_observations.append({
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": reward,
                    "terminal": terminal or step == max_steps
                })

        # update
        a3c.store_episode(tmp_observations)
        a3c.update()

        # show reward
        smoothed_total_reward = (smoothed_total_reward * 0.9 +
                                 total_reward * 0.1)
        logger.info("Process {} Episode {} total reward={:.2f}"
                    .format(rank, episode, smoothed_total_reward))

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                # will cause torch RPC to complain
                # since other processes may have not finished yet.
                # just for demonstration.
                exit(0)
        else:
            reward_fulfilled = 0
Example #29
    def test_full_train(self, train_config, ddpg_per_train):
        c = train_config

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    if episode.get() % c.noise_interval == 0:
                        action = ddpg_per_train.act_with_noise(
                            {"state": old_state.unsqueeze(0)},
                            noise_param=c.noise_param,
                            mode=c.noise_mode,
                        )
                    else:
                        action = ddpg_per_train.act(
                            {"state": old_state.unsqueeze(0)}
                        ).clamp(-c.action_range, c.action_range)

                    state, reward, terminal, _ = env.step(action.cpu().numpy())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    ddpg_per_train.store_transition({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    })
            # update
            if episode > 100:
                for i in range(step.get()):
                    ddpg_per_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            if episode.get() % c.noise_interval != 0:
                # only log result without noise
                logger.info(
                    f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("DDPGPer Training failed.")
Example #30
File: impala.py  Project: iffiX/machin
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=1)
    impala_group = world.create_rpc_group("impala", ["0", "1", "2", "3"])

    if rank in (2, 3):
        # learner_group.group is the wrapped torch.distributed.ProcessGroup
        learner_group = world.create_collective_group(ranks=[2, 3])

        # wrap the model with DistributedDataParallel
        # if current process is learner process 2 or 3
        actor = DistributedDataParallel(module=Actor(observe_dim, action_num),
                                        process_group=learner_group.group)
        critic = DistributedDataParallel(module=Critic(observe_dim),
                                         process_group=learner_group.group)
    else:
        actor = Actor(observe_dim, action_num)
        critic = Critic(observe_dim)

    # we may use a smaller batch size to train if we are using
    # DistributedDataParallel

    # note: since the impala framework stores a whole episode as a single
    # sample, a small batch size is enough here
    impala = IMPALA(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        impala_group,
        servers,
        batch_size=2,
    )

    # synchronize all processes in the group, making sure the
    # distributed buffer has been created on all processes in impala_group
    impala_group.barrier()

    # manually control syncing to improve performance
    impala.set_sync(False)
    if rank in (0, 1):
        # Processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            impala.manual_sync()
            tmp_observations = []
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action, action_log_prob, *_ = impala.act(
                        {"state": old_state})
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state,
                                     dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

                    tmp_observations.append({
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": reward,
                        "action_log_prob": action_log_prob.item(),
                        "terminal": terminal or step == max_steps,
                    })

            impala.store_episode(tmp_observations)
            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")

                    # will cause torch RPC to complain
                    # since other processes may have not finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        # note: since the impala framework stores a whole episode as a
        # single sample, we only need to wait for a small number of samples
        while impala.replay_buffer.all_size() < 5:
            sleep(0.1)
        while True:
            impala.update()