Example #1
    def test_full_train(self, train_config, rainbow_train):
        c = train_config

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = rainbow_train.act_discrete_with_noise(
                        {"state": old_state.unsqueeze(0)}
                    )
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps
                    })

            rainbow_train.store_episode(tmp_observations)
            # update
            if episode.get() > 100:
                for _ in range(step.get()):
                    rainbow_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info("Episode {} total reward={:.2f}"
                        .format(episode, smoother.value))

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("RAINBOW Training failed.")
Example #2
File: test_a2c.py  Project: iffiX/machin
    def test_full_train(self, train_config, a2c_train, gae_lambda):
        c = train_config
        a2c_train.gae_lambda = gae_lambda

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        env.seed(0)
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a2c_train.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            # update
            a2c_train.store_episode(tmp_observations)
            a2c_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("A2C Training failed.")
Example #3
    def test_full_train(rank, gpu):
        c = TestARS.c
        c.device = gpu
        ars = TestARS.ars()

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info("{}, pid {}".format(rank, os.getpid()))
        while episode < c.max_episodes:
            episode.count()

            all_reward = 0
            for at in ars.get_actor_types():
                total_reward = 0

                # batch size = 1
                state = t.tensor(env.reset(), dtype=t.float32, device=c.device)
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        # agent model inference
                        action = ars.act({"state": state.unsqueeze(0)}, at)
                        state, reward, terminal, __ = env.step(action)
                        state = t.tensor(state,
                                         dtype=t.float32,
                                         device=c.device)
                        total_reward += float(reward)
                step.reset()
                terminal = False
                ars.store_reward(total_reward, at)
                all_reward += total_reward

            # update
            ars.update()
            smoother.update(all_reward / len(ars.get_actor_types()))
            default_logger.info(
                "Process {} Episode {} total reward={:.2f}".format(
                    rank, episode, smoother.value))

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    raise SafeExit
            else:
                reward_fulfilled.reset()

        raise RuntimeError("ARS Training failed.")
Example #4
    def test_counter(self):
        c = Counter(start=0, step=1)
        c.count()
        assert c.get() == 1
        c.reset()
        assert c.get() == 0
        assert c < 1
        assert c <= 1
        assert c == 0
        assert c > -1
        assert c >= -1
        str(c)
Example #5
    def __init__(self,
                 name: str,
                 member_ranks: List[int],
                 rank: int,
                 timeout: float = 1,
                 **__):
        """
        Args:
            name: Name of this election group.
            member_ranks: Absolute member ranks.
            rank: Rank of the current process.
            timeout: Timeout :math:`\\delta` used in the paper.
        """
        super(ElectionGroupStableBase, self).__init__()
        if name in self.elect_groups:
            raise RuntimeError("Election group already existed!")
        self.member_ranks = sorted(member_ranks)

        # relative rank in list
        self.rank = self.member_ranks.index(rank)
        # Name of group, used to lookup
        self.name = name

        self.timer = Timer()
        self.alive_timer = Timer()
        self.timeout_recv_timer = Timer()
        self.timeout_recv_ok_or_start = False
        self.timeout_recv_pong = []
        self.timeout = timeout

        self.cur_round = 0
        self.leader = None

        self.tka_thread = Thread(target=self._task_keep_alive)
        self.tto_thread = Thread(target=self._task_timeout)
        self.leader_change_cond = Condition(Lock())
        self.ok_counter = Counter()
        self.last_alert = None
        self.recv_alerts = []
        self.elect_groups[name] = self

        self.stop = False
        self.tka_thread.start()
        self.tto_thread.start()
        self._start_round(0)
        self._register_handle(self._handle)
Example #6
    def test_full_train(self, train_config, ddpg_per_train):
        c = train_config

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    if episode.get() % c.noise_interval == 0:
                        action = ddpg_per_train.act_with_noise(
                            {"state": old_state.unsqueeze(0)},
                            noise_param=c.noise_param,
                            mode=c.noise_mode,
                        )
                    else:
                        action = ddpg_per_train.act(
                            {"state": old_state.unsqueeze(0)}
                        ).clamp(-c.action_range, c.action_range)

                    state, reward, terminal, _ = env.step(action.cpu().numpy())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    ddpg_per_train.store_transition({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    })
            # update
            if episode > 100:
                for i in range(step.get()):
                    ddpg_per_train.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            if episode.get() % c.noise_interval != 0:
                # only log result without noise
                logger.info(
                    f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("DDPGPer Training failed.")
Example #7
    def test_full_train(rank):
        c = TestIMPALA.c
        impala = TestIMPALA.impala("cpu", t.float32)

        # perform manual syncing to decrease the number of rpc calls
        impala.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair("{}_running".format(rank), True)
        default_logger.info("{}, pid {}".format(rank, os.getpid()))
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                impala.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, action_log_prob, *_ = impala.act(
                            {"state": old_state.unsqueeze(0)})
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append({
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "action_log_prob": action_log_prob.item(),
                            "terminal": terminal or step == c.max_steps,
                        })
                impala.store_episode(tmp_observations)

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair("{}_running".format(rank))
                        while (all_group.is_paired("0_running")
                               or all_group.is_paired("1_running")):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            # Note: the number of entries in buffer means "episodes"
            # rather than steps here!
            while impala.replay_buffer.all_size() < 5:
                sleep(0.1)
            while (all_group.is_paired("0_running")
                   or all_group.is_paired("1_running")):
                impala.update()
            return True

        raise RuntimeError("IMPALA Training failed.")
Example #8
    def test_full_train(rank, gpu):
        c = TestDDPGApex.c
        c.device = gpu
        ddpg_apex = TestDDPGApex.ddpg_apex()
        # perform manual syncing to decrease the number of rpc calls
        ddpg_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair("{}_running".format(rank), True)
        default_logger.info("{}, pid {}".format(rank, os.getpid()))
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32, device=c.device)

                ddpg_apex.manual_sync()
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action = ddpg_apex.act_with_noise(
                            {"state": old_state.unsqueeze(0)},
                            noise_param=c.noise_param,
                            mode=c.noise_mode)

                        state, reward, terminal, _ = env.step(
                            action.cpu().numpy())
                        state = t.tensor(state,
                                         dtype=t.float32,
                                         device=c.device).flatten()
                        total_reward += float(reward)

                        ddpg_apex.store_transition({
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        })

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair("{}_running".format(rank))
                        while (all_group.is_paired("0_running")
                               or all_group.is_paired("1_running")):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            while ddpg_apex.replay_buffer.all_size() < 500:
                sleep(0.1)
            while (all_group.is_paired("0_running")
                   or all_group.is_paired("1_running")):
                ddpg_apex.update()
            return True

        raise RuntimeError("DDPG-Apex Training failed.")
Example #9
    def test_full_train(self, train_config, sac_train):
        c = train_config
        sac_train.target_entropy = -c.action_dim

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    action = sac_train.act({"state": old_state.unsqueeze(0)})[0]

                    state, reward, terminal, _ = env.step(action.cpu().numpy())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    sac_train.store_transition({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    })
            # update
            if episode > 100:
                for i in range(step.get()):
                    sac_train.update()
                logger.info(
                    f"new entropy alpha: {sac_train.entropy_alpha.item()}")

            smoother.update(total_reward)
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("SAC Training failed.")
Example #10
    def test_full_train(rank):
        training_group = get_world().create_rpc_group("training", ["0", "1", "2"])

        c = TestIMPALA.c
        impala = TestIMPALA.impala("cpu", t.float32)

        # perform manual syncing to decrease the number of rpc calls
        impala.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False
        env = c.env
        env.seed(rank)

        # make sure all things are initialized.
        training_group.barrier()

        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")

        while episode < c.max_episodes:
            episode.count()

            if rank in (0, 1):
                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                impala.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, action_log_prob, *_ = impala.act(
                            {"state": old_state.unsqueeze(0)}
                        )
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append(
                            {
                                "state": {"state": old_state.unsqueeze(0)},
                                "action": {"action": action},
                                "next_state": {"state": state.unsqueeze(0)},
                                "reward": float(reward),
                                "action_log_prob": action_log_prob.item(),
                                "terminal": terminal or step == c.max_steps,
                            }
                        )
                impala.store_episode(tmp_observations)

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info(
                    "Process {} Episode {} "
                    "total reward={:.2f}".format(rank, episode, smoother.value)
                )

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")
                        try:
                            training_group.pair("solved", True)
                        except KeyError:
                            # already solved in another process
                            pass
                else:
                    reward_fulfilled.reset()
            else:
                # wait for some samples
                if episode.get() > 200:
                    for _ in range(100):
                        impala.update()
                    default_logger.info("Updated 100 times.")

            training_group.barrier()
            if training_group.is_paired("solved"):
                return True

        raise RuntimeError("IMPALA Training failed.")
Example #11
    def test_full_train(rank, gae_lambda):
        c = TestA3C.c
        a3c = TestA3C.a3c("cpu", t.float32)
        a3c.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            a3c.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a3c.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append({
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    })

            # update
            a3c.store_episode(tmp_observations)
            a3c.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    return True
            else:
                reward_fulfilled.reset()

        raise RuntimeError("A3C Training failed.")
Example #12
    def test_full_train(self, train_config, maddpg_train):
        c = train_config

        # begin training
        episode, step = Counter(), Counter()

        # first for prey, second for pred
        smoother = Smooth()
        reward_fulfilled = Counter()
        terminal = False

        env = c.env
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            states = [t.tensor(st, dtype=t.float32) for st in env.reset()]

            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_states = states

                    # agent model inference
                    results = maddpg_train.act_discrete_with_noise(
                        [{"state": st.unsqueeze(0)} for st in states]
                    )
                    actions = [int(r[0]) for r in results]
                    action_probs = [r[1] for r in results]

                    states, rewards, terminals, _ = env.step(actions)
                    states = [t.tensor(st, dtype=t.float32) for st in states]

                    total_reward += float(sum(rewards)) / c.agent_num

                    maddpg_train.store_transitions(
                        [
                            {
                                "state": {"state": ost.unsqueeze(0)},
                                "action": {"action": act},
                                "next_state": {"state": st.unsqueeze(0)},
                                "reward": float(rew),
                                "terminal": term or step == c.max_steps,
                            }
                            for ost, act, st, rew, term in zip(
                                old_states, action_probs, states, rewards, terminals
                            )
                        ]
                    )

            # update
            if episode > 5:
                for i in range(step.get()):
                    maddpg_train.update()

            # total reward is divided by steps here, since:
            # "Agents are rewarded based on minimum agent distance
            #  to each landmark, penalized for collisions"
            smoother.update(total_reward / step.get())
            logger.info(f"Episode {episode} total steps={step}")
            step.reset()
            terminal = False

            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

            if smoother.value > c.solved_reward and episode > 20:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    logger.info("Environment solved!")
                    return
            else:
                reward_fulfilled.reset()

        pytest.fail("MADDPG Training failed.")
Example #13
    def test_full_train(rank):
        c = TestDQNApex.c
        dqn_apex = TestDQNApex.dqn_apex("cpu", t.float32)
        # perform manual syncing to decrease the number of rpc calls
        dqn_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair(f"{rank}_running", True)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                dqn_apex.manual_sync()
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        # agent model inference
                        action = dqn_apex.act_discrete_with_noise(
                            {"state": old_state.unsqueeze(0)})
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        dqn_apex.store_transition({
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        })
                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info(
                    "Process {} Episode {} total reward={:.2f}".format(
                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair(f"{rank}_running")
                        while (all_group.is_paired("0_running")
                               or all_group.is_paired("1_running")):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            while dqn_apex.replay_buffer.all_size() < 500:
                sleep(0.1)
            while (all_group.is_paired("0_running")
                   or all_group.is_paired("1_running")):
                dqn_apex.update()
                default_logger.info("Updated")
            return True

        raise RuntimeError("DQN-Apex Training failed.")
Example #14
File: test_ars.py  Project: iffiX/machin
    def test_full_train(rank):
        training_group = get_world().create_rpc_group("training",
                                                      ["0", "1", "2"])

        c = TestARS.c
        ars = TestARS.ars("cpu", t.float32)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False
        env = c.env
        env.seed(rank)

        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")

        # make sure all things are initialized.
        training_group.barrier()

        while episode < c.max_episodes:
            episode.count()

            all_reward = 0
            for at in ars.get_actor_types():
                total_reward = 0

                # batch size = 1
                state = t.tensor(env.reset(), dtype=t.float32)
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        # agent model inference
                        action = ars.act({"state": state.unsqueeze(0)}, at)
                        state, reward, terminal, __ = env.step(action)
                        state = t.tensor(state, dtype=t.float32)
                        total_reward += float(reward)
                step.reset()
                terminal = False
                ars.store_reward(total_reward, at)
                all_reward += total_reward

            # update
            ars.update()
            smoother.update(all_reward / len(ars.get_actor_types()))
            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    try:
                        training_group.pair("solved", True)
                    except KeyError:
                        # already solved in another process
                        pass
            else:
                reward_fulfilled.reset()

            training_group.barrier()
            if training_group.is_paired("solved"):
                return True

        raise RuntimeError("ARS Training failed.")