Example #1
    def test(self) -> None:
        """
        Teste l'agent sur un episode complet sans bruit
        """
        start_test_time = time()

        env = make_env(self._config)
        obs, done = env.reset(), False
        rews = []
        nb_step = 0
        while not done:
            act = self._agent(obs=obs)
            obs, rew, done, _ = env.step(act)
            rews.append(rew)
            nb_step += 1

        rew_mean = np.mean(rews)

        # logging
        self._logger.add_scalar(label="test/reward",
                                value=sum(rews),
                                step=self._global_update_step.val())
        self._logger.add_scalar(label="test/nb_step",
                                value=nb_step,
                                step=self._global_update_step.val())
        self._logger.add_scalar(label="test/reward_mean",
                                value=float(rew_mean),
                                step=self._global_update_step.val())
        self._logger.add_scalar(label="test/reward_var",
                                value=float(np.var(rews)),
                                step=self._global_update_step.val())

        if self._best_test_reward < int(rew_mean):
            # Track the best test reward seen so far and checkpoint the agent
            self._best_test_reward = int(rew_mean)
            self._agent.save(episode=self._global_episode.val(),
                             update_step=self._global_update_step.val(),
                             test_reward=int(sum(rews)))

        self._logger.add_scalar(label="test/test_speed",
                                value=time() - start_test_time,
                                step=self._global_update_step.val())
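
The Logger used throughout these snippets is not shown; below is a minimal sketch, assuming it is a thin thread-safe wrapper around TensorBoard's SummaryWriter matching the add_scalar(label=..., value=..., step=...) calls above. The class body and the lock are assumptions, not the project's actual implementation.

import threading

from torch.utils.tensorboard import SummaryWriter


class Logger:
    def __init__(self, log_dir: str) -> None:
        # A lock because several threads (Players, Trainer) log concurrently
        self._lock = threading.Lock()
        self._writer = SummaryWriter(log_dir=log_dir)

    def add_scalar(self, label: str, value: float, step: int) -> None:
        # Same keyword signature as the calls used in the snippets above
        with self._lock:
            self._writer.add_scalar(tag=label,
                                    scalar_value=value,
                                    global_step=step)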
Example #2
    def __init__(self, config: Dict, agent: Agent, id_worker: int,
                 worker_seed: int, global_episode: Counter,
                 transitions_queue: Queue, global_update_step: Counter,
                 epsilon: Union[ExponentialEpsilon,
                                SinusoidalEpsilone], logger: Logger) -> None:
        """
        Thread responsable de récupérer les transitions en interagissant avec l'environnement avec la dernière version
        de l'agent

        :param config: Dictionnaire de configuration de l'expérience
        :param agent: Agent à utiliser afin de choisir les actions
        :param id_worker: Id du worker
        :param worker_seed: Random seed à utiliser
        :param global_episode: Compteur du nombre d'époside finis (partagé entre les threads)
        :param transitions_queue: Queue à partir de laquelle les threads Player envoient leurs transitions au thread Trainer
        :param global_update_step: Compteur du nombre d'update effectués (partagé entre les threads)
        :param epsilon: Epsilone processus utilisé pour le bruit ajouté aux actions de l'agent
        :param logger: Le logger utilisé au cours de l'expérience
        """
        super().__init__()

        self._config = config
        self._agent = agent
        self._id = id_worker
        self._global_episode = global_episode
        self._transitions_queue = transitions_queue
        self._global_update_step = global_update_step
        self._logger = logger

        self._seed = worker_seed
        plant_seed(self._seed)

        self._env = make_env(self._config)
        self._nb_action = (self._env.action_space.shape[0]
                           if self._env.action_space.shape else 1)

        self._random_noise_process = RandomNoise(config, self._nb_action,
                                                 epsilon)
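
The Counter type passed as global_episode and global_update_step is not included in these snippets; the sketch below assumes a small thread-safe counter exposing the val() and inc() methods used above.

import threading


class Counter:
    def __init__(self, start: int = 0) -> None:
        self._value = start
        self._lock = threading.Lock()

    def val(self) -> int:
        # Read the current value under the lock
        with self._lock:
            return self._value

    def inc(self) -> None:
        # Atomically increment the shared value
        with self._lock:
            self._value += 1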
Example #3
    def run(self) -> None:
        """
        Lance le thread
        """
        episode = 1
        step = 0

        self._start_player_time = time()

        while True:
            start_loop_time = time()

            if self._should_stop():
                break

            # Avoid memory leaks in some environments (rip opensim) by
            # periodically recreating the environment
            if (self._config["invalidate_env_time"]
                    and episode % self._config["invalidate_env_time"] == 0):
                self._env = make_env(self._config)

            cur_episode = self._global_episode.val()
            self._global_episode.inc()

            episode_time = 0

            obs, done = self._env.reset(), False

            rew_sum = 0
            n_steps = 0
            while not done:
                start_episode_time = time()

                noise = self._random_noise_process.sample()
                act = self._agent(obs=obs, noise=noise)
                next_obs, rew, done, _ = self._env.step(act)
                rew_sum += rew

                n_steps += 1

                transition = Transition(observation=obs,
                                        action=act,
                                        new_observation=next_obs,
                                        reward=rew,
                                        done=done)

                obs = next_obs

                episode_time += time() - start_episode_time

                # Put the transition into the queue
                while True:
                    if self._should_stop():
                        break
                    try:
                        self._transitions_queue.put_nowait(transition)
                        break
                    except Full:
                        sleep(0.01)

            # logging
            if step % self._config["players_config"]["log_freq"] == 0:
                self._logger.add_scalar(
                    label=f"players/reward_per_episode_{self._id}",
                    value=rew_sum,
                    step=cur_episode)
                self._logger.add_scalar(
                    label=f"players/noise_abs_mean_{self._id}",
                    value=np.abs(noise).mean(),
                    step=cur_episode)
                self._logger.add_scalar(
                    label=f"players/step_per_episode_{self._id}",
                    value=n_steps,
                    step=cur_episode)

                self._logger.add_scalar(label=f"players/step_{self._id}",
                                        value=step,
                                        step=cur_episode)
                self._logger.add_scalar(
                    label=f"players/epoch_speed_{self._id}",
                    value=episode_time,
                    step=cur_episode)
                self._logger.add_scalar(label=f"players/idle_{self._id}",
                                        value=time() - start_loop_time -
                                        episode_time,
                                        step=cur_episode)

            step += 1
            episode += 1
        print(f"player {self._id} end")
Example #4
    def run(self) -> None:
        """
        Lance le thread
        """
        self._start_training_time = time()

        # Initialize the replay buffer with the random policy until it contains min_replay_size transitions
        p_bar = tqdm(total=self._config["agent_config"]["min_replay_size"])
        env = make_env(self._config)
        while (len(self._replay_buffer) <
               self._config["agent_config"]["min_replay_size"]):
            obs, done = env.reset(), False

            while not done:
                act = env.action_space.sample()
                if len(act.shape) < 1:
                    act = [act]
                next_obs, rew, done, _ = env.step(act)
                transition = Transition(observation=obs,
                                        action=act,
                                        new_observation=next_obs,
                                        reward=rew,
                                        done=done)
                obs = next_obs
                self._replay_buffer.add(transition)
                p_bar.update(1)

        print("buffer initialization done")

        while True:
            start_get_replays_time = time()

            # Get a transition from the queue and put it into the replay buffer
            while True:
                if self._should_stop():
                    break

                try:
                    transition = self._episode_queue.get_nowait()
                    self._replay_buffer.add(transition)
                    break
                except Empty:
                    sleep(0.01)

            if self._should_stop():
                break

            end_get_replays_time = time()

            start_update_time = time()

            indexes_b, transition_b, weights_b = self._replay_buffer.sample(
                self._config["agent_config"]["batch_size"])

            if (self._global_update_step.val() %
                    self._config["trainer_config"]["log_freq"] == 0):
                update_step = self._global_update_step.val()
            else:
                update_step = None

            td_error = self._agent.update(transition_b, weights_b, update_step)

            # Update the replay buffer priorities with the critic error
            # (careful: priorities must always be strictly positive)
            new_priorities = np.abs(td_error) + 1e-16
            self._replay_buffer.update(indexes_b, new_priorities)

            # logging
            if update_step is not None:
                self._logger.add_scalar(label="multithreading/queue_size",
                                        value=self._episode_queue.qsize(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/buffer_size",
                                        value=len(self._replay_buffer),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/priority_mean",
                                        value=weights_b.mean(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/priority_var",
                                        value=weights_b.var(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/td_error_mean",
                                        value=np.abs(td_error).mean(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/update_step",
                                        value=self._global_episode.val(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/update_time",
                                        value=time() - start_update_time,
                                        step=update_step)
                self._logger.add_scalar(label="trainer/idle_time",
                                        value=end_get_replays_time -
                                        start_get_replays_time,
                                        step=update_step)

            self._global_update_step.inc()
            self._epsilone.step()

            if (self._global_update_step.val() %
                    self._config["trainer_config"]["test_freq"] == 0):
                self.test()

        self.test()
        print("trainer end")
Example #5
    def __init__(self,
                 config: Dict,
                 logger: Logger,
                 force_cpu: bool = False) -> None:
        """
        La classe Agent regroupe un reseau de neurones Actor et un Critic et correspond à un agent de RL pouvant prendre
        des décisions et updater les poids de ses réseaux

        :param config: Dictionnaire de configuration de l'expérience
        :param logger: Logger à utiliser au cours de l'entrainement de l'agent
        :param force_cpu: force l'utilisation du CPU même si un GPU est détecté
        """
        self._lock = threading.Lock()

        self._config = config

        source_env = make_env(self._config)

        if force_cpu:
            self._device = torch.device("cpu")
        else:
            self._device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")

        self._n_action = (source_env.action_space.shape[0]
                          if source_env.action_space.shape else 1)
        self._n_observation = (source_env.observation_space.shape[0]
                               if source_env.observation_space.shape else 1)

        self._actor = Actor(self._config, self._n_observation,
                            self._n_action).to(self._device)
        self._critic = Critic(self._config, self._n_observation,
                              self._n_action).to(self._device)
        if self._config["agent_config"]["load_from_ckpt"] is not None:
            ckpt = torch.load(self._config["agent_config"]["load_from_ckpt"],
                              map_location=self._device)
            self._actor.load_state_dict(ckpt["actor"])
            self._critic.load_state_dict(ckpt["critic"])

        self._target_actor = Actor(self._config, self._n_observation,
                                   self._n_action).to(self._device)
        self._target_critic = Critic(self._config, self._n_observation,
                                     self._n_action).to(self._device)

        self._hard_update(self._target_actor, self._actor)
        self._hard_update(self._target_critic, self._critic)

        self._logger = logger

        self._loss_fn = WeightedMSELoss()

        self._actor_optim = torch.optim.Adam(
            self._actor.parameters(),
            lr=self._config["agent_config"]["lr_actor"],
            weight_decay=self._config["agent_config"]["weight_decay_actor"])
        self._critic_optim = torch.optim.Adam(
            self._critic.parameters(),
            lr=self._config["agent_config"]["lr_critic"],
            weight_decay=self._config["agent_config"]["weight_decay_critic"])

        if self._config["agent_config"]["load_critic_from"] is not None:
            self._critic.load_state_dict(
                torch.load(self._config["agent_config"]["load_critic_from"]))
            print(
                f"load critic from {self._config['agent_config']['load_critic_from']}"
            )
        elif self._config["agent_config"]["warmstart_critic"]:
            self._pretrain_critic()
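
The _hard_update and _soft_update helpers called above (and in _pretrain_critic below) are not included in these snippets; the sketch below shows the usual Polyak-style target-network updates they presumably implement, written as free functions, with the tau value as an assumption.

import torch


def _hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # Copy the source weights into the target network
    target.load_state_dict(source.state_dict())


def _soft_update(target: torch.nn.Module, source: torch.nn.Module,
                 tau: float = 0.001) -> None:
    # target <- tau * source + (1 - tau) * target
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)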
Example #6
    def _pretrain_critic(self) -> None:
        """
        Initialisation du critic en l'entraînant sur la policy random
        """
        # Generate the random transitions
        env = make_env(self._config)
        p_bar = tqdm(total=self._config["agent_config"]["warmstart_size"])
        samples = []
        while len(samples) < self._config["agent_config"]["warmstart_size"]:
            obs, done = env.reset(), False

            while not done:
                act = env.action_space.sample()
                next_obs, rew, done, _ = env.step(act)
                samples.append(
                    Transition(observation=obs,
                               action=act,
                               new_observation=next_obs,
                               reward=rew,
                               done=done))
                obs = next_obs
                p_bar.update(1)

        samples = np.array(samples)

        patience = 0
        batch_size = self._config["agent_config"]["batch_size"]
        best_td_error_mean = float('inf')
        epoch = 0

        # TODO: move the warmstart patience setting up into the experiment config
        while patience < 5:
            np.random.shuffle(samples)

            train_td_error = []

            # train one epoch
            for i in range(int(len(samples) / batch_size)):
                batch = samples[i * batch_size:min(len(samples), (i + 1) *
                                                   batch_size)]
                observations_t, actions_t, rewards_t, next_observations_t, dones_t = unpack_batch(
                    batch, self._device)
                weight_t = torch.ones(len(observations_t))

                td_error = self._update_critic(observations_t, actions_t,
                                               rewards_t, next_observations_t,
                                               dones_t, weight_t, None)

                # Target soft update
                self._soft_update(self._target_critic, self._critic)

                train_td_error.append(np.abs(td_error).mean())

            train_td_error_mean = np.mean(train_td_error)
            self._logger.add_scalar(label="pretrain/td_error_mean",
                                    value=float(train_td_error_mean),
                                    step=epoch)

            epoch += 1

            if train_td_error_mean < best_td_error_mean:
                best_td_error_mean = train_td_error_mean
                patience = 0
            else:
                patience += 1

        save_path = os.path.join(self._config["log_dir"],
                                 "critic_warmstart.pkl")
        torch.save(self._critic.state_dict(), save_path)
        print(f"saving critic to {save_path}")

        print("warmstart done")
Example #7
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Script for testing an agent")
    parser.add_argument("-l", "--logdir", type=str)
    parser.add_argument('--last', dest='last_ckpt', action='store_true')
    parser.set_defaults(last_ckpt=False)

    args = parser.parse_args()

    with open(os.path.join(args.logdir, "config.json")) as f:
        config = json.load(f)

    n_skip = config["skip_frame"]
    config["skip_frame"] = 1

    with make_env(config) as env:

        while True:
            best = float('-inf')
            best_ckpt_file = ""
            for ckpt in os.listdir(os.path.join(args.logdir, "checkpoints")):
                if ckpt.startswith('.'):
                    continue
                if args.last_ckpt:
                    n = int(ckpt.split('e')[1].split('_')[0])
                else:
                    n = int(ckpt.split('r')[-1].split('.')[0])
                if n > best:
                    best = n
                    best_ckpt_file = os.path.join(args.logdir, "checkpoints",
                                                  ckpt)