Example #1
    def test_full_train(self, train_config, dqn_train):
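        """Train a DQN agent on ``c.env`` episode by episode: sample noisy
        discrete actions, store every transition, and run one update per
        collected step once more than 100 episodes have been gathered.
        Passes when the smoothed reward exceeds ``c.solved_reward`` for
        ``c.solved_repeat`` consecutive episodes; fails after
        ``c.max_episodes`` otherwise."""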
        c = train_config

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = dqn_train.act_discrete_with_noise(
                        {"state": old_state.unsqueeze(0)}
                    )
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    dqn_train.store_transition(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            if episode.get() > 100:
                for _ in range(step.get()):
                    dqn_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("DQN Training failed.")
Example #2
    def test_full_train(self, train_config, a2c_train, gae_lambda):
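        """Train an A2C agent with the given ``gae_lambda``: collect one full
        episode into ``tmp_observations``, then call ``store_episode`` and
        ``update`` once per episode. Passes when the smoothed reward stays
        above ``c.solved_reward`` for ``c.solved_repeat`` episodes."""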
        c = train_config
        a2c_train.gae_lambda = gae_lambda

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        env.seed(0)
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a2c_train.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            a2c_train.store_episode(tmp_observations)
            a2c_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("A2C Training failed.")
Example #3
    def test_full_train(rank):
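        """Train an ARS agent: run one rollout per actor type returned by
        ``get_actor_types``, report each rollout's total reward with
        ``store_reward``, then ``update``. Raises ``SafeExit`` once the
        smoothed reward is high enough, or ``RuntimeError`` on failure."""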
        c = TestARS.c
        ars = TestARS.ars("cpu", t.float32)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            all_reward = 0
            for at in ars.get_actor_types():
                total_reward = 0

                # batch size = 1
                state = t.tensor(env.reset(), dtype=t.float32)
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        # agent model inference
                        action = ars.act({"state": state.unsqueeze(0)}, at)
                        state, reward, terminal, __ = env.step(action)
                        state = t.tensor(state, dtype=t.float32)
                        total_reward += float(reward)
                step.reset()
                terminal = False
                ars.store_reward(total_reward, at)
                all_reward += total_reward

            # update
            ars.update()
            smoother.update(all_reward / len(ars.get_actor_types()))
            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    raise SafeExit
            else:
                reward_fulfilled.reset()

        raise RuntimeError("ARS Training failed.")
Example #4
    def test_full_train(self, train_config, ddpg_per_train):
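        """Train a DDPG agent with prioritized experience replay: act with
        exploration noise on every ``c.noise_interval``-th episode and with
        clamped deterministic actions otherwise, store transitions, and run
        one update per collected step after 100 episodes. Rewards are only
        logged for noise-free episodes."""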
        c = train_config

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    if episode.get() % c.noise_interval == 0:
                        action = ddpg_per_train.act_with_noise(
                            {"state": old_state.unsqueeze(0)},
                            noise_param=c.noise_param,
                            mode=c.noise_mode,
                        )
                    else:
                        action = ddpg_per_train.act(
                            {"state": old_state.unsqueeze(0)}
                        ).clamp(-c.action_range, c.action_range)

                    state, reward, terminal, _ = env.step(action.cpu().numpy())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    ddpg_per_train.store_transition(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )
            # update
            if episode > 100:
                for i in range(step.get()):
                    ddpg_per_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            if episode.get() % c.noise_interval != 0:
                # only log result without noise
                logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("DDPGPer Training failed.")
Example #5
    def test_full_train(rank):
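        """Distributed IMPALA training over a three-process RPC group:
        processes 0 and 1 act as samplers that collect whole episodes
        (including ``action_log_prob``) and store them, while the remaining
        process runs 100 updates per episode once more than 200 episodes
        have passed. A ``"solved"`` pair on the group stops all processes."""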
        training_group = get_world().create_rpc_group("training", ["0", "1", "2"])

        c = TestIMPALA.c
        impala = TestIMPALA.impala("cpu", t.float32)

        # perform manual syncing to decrease the number of rpc calls
        impala.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False
        env = c.env
        env.seed(rank)

        # make sure all things are initialized.
        training_group.barrier()

        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")

        while episode < c.max_episodes:
            episode.count()

            if rank in (0, 1):
                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                impala.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, action_log_prob, *_ = impala.act(
                            {"state": old_state.unsqueeze(0)}
                        )
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append(
                            {
                                "state": {"state": old_state.unsqueeze(0)},
                                "action": {"action": action},
                                "next_state": {"state": state.unsqueeze(0)},
                                "reward": float(reward),
                                "action_log_prob": action_log_prob.item(),
                                "terminal": terminal or step == c.max_steps,
                            }
                        )
                impala.store_episode(tmp_observations)

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info(
                    f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
                )

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")
                        try:
                            training_group.pair(f"solved", True)
                        except KeyError:
                            # already solved in another process
                            pass
                else:
                    reward_fulfilled.reset()
            else:
                # wait for some samples
                if episode.get() > 200:
                    for _ in range(100):
                        impala.update()
                    default_logger.info("Updated 100 times.")

            training_group.barrier()
            if training_group.is_paired("solved"):
                return True

        raise RuntimeError("IMPALA Training failed.")
Example #6
    def test_full_train(rank):
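        """Distributed DDPG-Apex training (discrete variant): processes 0 and
        1 sample episodes with ``act_discrete_with_noise`` and store the
        action probabilities as the transition's action, while the remaining
        process waits for 500 buffered transitions and then updates until
        both samplers unpair their ``*_running`` flags."""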
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex("cpu", t.float32, discrete=True)
        # perform manual syncing to decrease the number of rpc calls
        ddpg_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        avg_step = Smooth()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair(f"{rank}_running", True)
        default_logger.info(f"{rank}, pid {os.getpid()}")
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                ddpg_apex.manual_sync()
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, probs = ddpg_apex.act_discrete_with_noise(
                            {"state": old_state.unsqueeze(0)}
                        )

                        state, reward, terminal, _ = env.step(action.cpu().item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        ddpg_apex.store_transition(
                            {
                                "state": {"state": old_state.unsqueeze(0)},
                                "action": {"action": probs},
                                "next_state": {"state": state.unsqueeze(0)},
                                "reward": float(reward),
                                "terminal": terminal or step == c.max_steps,
                            }
                        )

                smoother.update(total_reward)
                avg_step.update(step.get())
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair(f"{rank}_running")
                        while all_group.is_paired("0_running") or all_group.is_paired(
                            "1_running"
                        ):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            while ddpg_apex.replay_buffer.all_size() < 500:
                sleep(0.1)
            while all_group.is_paired("0_running") or all_group.is_paired(
                    "1_running"):
                ddpg_apex.update()
                default_logger.info(f"Updated")
            return True

        raise RuntimeError("DDPG-Apex Training failed.")
Example #7
    def test_full_train(self, train_config, maddpg_train):
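        """Train a MADDPG agent set: each agent keeps its own observation
        list, joint noisy discrete actions are taken every step, and after
        episode 5 one update is run per collected step. The smoothed reward
        is averaged over agents and normalized by episode length."""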
        c = train_config

        # begin training
        episode, step = Counter(), Counter()

        smoother = Smooth()
        reward_fulfilled = Counter()
        terminal = False

        env = c.env
        env.seed(0)
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            states = [
                t.tensor(st, dtype=t.float32).view(1, c.observe_dim)
                for st in env.reset()
            ]
            tmp_observations_list = [[] for _ in range(c.agent_num)]

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_states = states

                    # agent model inference
                    results = maddpg_train.act_discrete_with_noise(
                        [{"state": st.view(1, c.observe_dim)} for st in states]
                    )
                    actions = [int(r[0]) for r in results]
                    action_probs = [r[1] for r in results]

                    states, rewards, terminals, _ = env.step(actions)
                    states = [
                        t.tensor(st, dtype=t.float32).view(1, c.observe_dim)
                        for st in states
                    ]

                    total_reward += float(sum(rewards)) / c.agent_num

                    for tmp_observations, ost, act, st, rew, term in zip(
                            tmp_observations_list,
                            old_states,
                            action_probs,
                            states,
                            rewards,
                            terminals,
                    ):
                        tmp_observations.append(
                            {
                                "state": {"state": ost},
                                "action": {"action": act},
                                "next_state": {"state": st},
                                "reward": float(rew),
                                "terminal": term or step == c.max_steps,
                            }
                        )

            maddpg_train.store_episodes(tmp_observations_list)
            # update
            if episode > 5:
                for i in range(step.get()):
                    maddpg_train.update()

            # total reward is divided by steps here, since:
            # "Agents are rewarded based on minimum agent distance
            #  to each landmark, penalized for collisions"
            smoother.update(total_reward / step.get())
            logger.info(f"Episode {episode} total steps={step}")
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward and episode > 20:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("MADDPG Training failed.")
Example #8
    def test_full_train(rank, gae_lambda):
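        """A3C worker loop: manually sync with the shared network, collect
        one full episode, then ``store_episode`` and ``update``. Returns
        True once the smoothed reward is high enough, otherwise raises
        after ``c.max_episodes``."""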
        c = TestA3C.c
        a3c = TestA3C.a3c("cpu", t.float32)
        a3c.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            a3c.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a3c.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            a3c.store_episode(tmp_observations)
            a3c.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    return True
            else:
                reward_fulfilled.reset()

        raise RuntimeError("A3C Training failed.")
Example #9
    def test_full_train(rank):
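        """Distributed ARS training: each process seeds ``c.env`` with its
        rank, rolls out every actor type once per episode, stores the
        rollout rewards, and updates. A ``"solved"`` pair on
        ``training_group`` lets all processes exit together after the
        barrier."""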
        training_group = get_world().create_rpc_group("training",
                                                      ["0", "1", "2"])

        c = TestARS.c
        ars = TestARS.ars("cpu", t.float32)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False
        env = c.env
        env.seed(rank)

        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")

        # make sure all things are initialized.
        training_group.barrier()

        while episode < c.max_episodes:
            episode.count()

            all_reward = 0
            for at in ars.get_actor_types():
                total_reward = 0

                # batch size = 1
                state = t.tensor(env.reset(), dtype=t.float32)
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        # agent model inference
                        action = ars.act({"state": state.unsqueeze(0)}, at)
                        state, reward, terminal, __ = env.step(action)
                        state = t.tensor(state, dtype=t.float32)
                        total_reward += float(reward)
                step.reset()
                terminal = False
                ars.store_reward(total_reward, at)
                all_reward += total_reward

            # update
            ars.update()
            smoother.update(all_reward / len(ars.get_actor_types()))
            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    try:
                        training_group.pair(f"solved", True)
                    except KeyError:
                        # already solved in another process
                        pass
            else:
                reward_fulfilled.reset()

            training_group.barrier()
            if training_group.is_paired("solved"):
                return True

        raise RuntimeError("ARS Training failed.")
Example #10
    def test_full_train(rank):
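        """Distributed IMPALA training, buffer-size variant: processes 0 and
        1 sample and store whole episodes, while the remaining process waits
        until the replay buffer holds at least 5 episodes and then keeps
        updating as long as either sampler is still running."""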
        c = TestIMPALA.c
        impala = TestIMPALA.impala("cpu", t.float32)

        # perform manual syncing to decrease the number of rpc calls
        impala.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair(f"{rank}_running", True)
        default_logger.info(f"{rank}, pid {os.getpid()}")
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                impala.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, action_log_prob, *_ = impala.act(
                            {"state": old_state.unsqueeze(0)}
                        )
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append(
                            {
                                "state": {"state": old_state.unsqueeze(0)},
                                "action": {"action": action},
                                "next_state": {"state": state.unsqueeze(0)},
                                "reward": float(reward),
                                "action_log_prob": action_log_prob.item(),
                                "terminal": terminal or step == c.max_steps,
                            }
                        )
                impala.store_episode(tmp_observations)

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair(f"{rank}_running")
                        while all_group.is_paired("0_running") or all_group.is_paired(
                            "1_running"
                        ):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            # Note: the number of entries in buffer means "episodes"
            # rather than steps here!
            while impala.replay_buffer.all_size() < 5:
                sleep(0.1)
            while all_group.is_paired("0_running") or all_group.is_paired(
                    "1_running"):
                impala.update()
                default_logger.info("Updated")
            return True

        raise RuntimeError("IMPALA Training failed.")