Example #1
    # Select the environment-specific useful_region settings if one matches args.env;
    # otherwise fall back to the Default entry
    custom_region_available = False
    for key, value in env_conf["useful_region"].items():
        if key in args.env:
            env_conf["useful_region"] = value
            custom_region_available = True
            break
    if not custom_region_available:
        env_conf["useful_region"] = env_conf["useful_region"]["Default"]
    print("Configuración a utilizar:", env_conf)

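    # Check whether the requested environment is an Atari game (so Atari-specific wrappers are used)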
    atari_env = False
    for game in Atari.get_games_list():
        if game.replace("_", "") in args.env.lower():
            atari_env = True

    if atari_env:
        environment = Atari.make_env(args.env, env_conf)
    else:
        environment = env_utils.ResizeReshapeFrames(gym.make(args.env))

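    # Build the Deep Q-learning agent from the environment's observation and action space shapes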
    obs_shape = environment.observation_space.shape
    action_shape = environment.action_space.n
    agent_params = manager.get_agent_params()
    agent_params["test"] = args.test
    agent_params["clip_reward"] = env_conf["clip_reward"]
    agent = DeepQLearner(obs_shape, action_shape, agent_params)

    episode_rewards = list()
    previous_checkpoint_mean_ep_rew = agent.best_mean_reward
    num_improved_episodes_before_checkpoint = 0
    if agent_params['load_trained_model']:
        try:
Example #2
    def run(self):

        ## Load the configuration data of the environment to train in
        custom_region_available = False

        for key, value in self.env_conf["useful_region"].items():
            if key in args.env:
                self.env_conf["useful_region"] = value
                custom_region_available = True
                break

        if not custom_region_available:
            self.env_conf["useful_region"] = self.env_conf["useful_region"]["Default"]

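        # Check whether the environment is an Atari game (so Atari-specific wrappers are used)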
        atari_env = False
        for game in Atari.get_games_list():
            if game.replace("_", "") in args.env.lower():
                atari_env = True

        if atari_env:
            self.env = Atari.make_env(self.env_name, self.env_conf)
        else:
            self.env = gym.make(self.env_name)

        ## Configure the policy and the parameters of the actor and the critic
        self.state_shape = self.env.observation_space.shape

        if isinstance(self.env.action_space.sample(), int):  # Discrete action space
            self.action_shape = self.env.action_space.n
            self.policy = self.discrete_policy
            self.continuous_action_space = False
        else:  # Continuous action space
            self.action_shape = self.env.action_space.shape[0]
            self.policy = self.multi_variate_gaussian_policy
            self.continuous_action_space = True  # ensure the flag is set for the continuous case

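        # The critic outputs a single scalar estimate of the state value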
        self.critic_shape = 1

        if len(self.state_shape) == 3:  # Screen images as input to the actor and the critic
            if self.continuous_action_space:  # Continuous action space
                self.actor = DeepActor(self.state_shape, self.action_shape,
                                       device).to(device)
            else:  # Discrete action space
                self.actor = DeepDiscreteActor(self.state_shape,
                                               self.action_shape,
                                               device).to(device)
            self.critic = DeepCritic(self.state_shape, self.critic_shape,
                                     device).to(device)
        else:  # Low-dimensional state vector as input to the actor and the critic
            if self.continuous_action_space:  # Continuous action space
                # self.actor_critic = SwallowActorCritic(self.state_shape, self.action_shape, self.critic_shape, device).to(device)
                self.actor = SwallowActor(self.state_shape, self.action_shape,
                                          device).to(device)
            else:  # Discrete action space
                self.actor = SwallowDiscreteActor(self.state_shape,
                                                  self.action_shape,
                                                  device).to(device)
            self.critic = SwallowCritic(self.state_shape, self.critic_shape,
                                        device).to(device)

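        # Separate Adam optimizers for the actor and the critic, both using the same learning rate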
        self.actor_optimizer = torch.optim.Adam(
            self.actor.parameters(), lr=self.params["learning_rate"])
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(), lr=self.params["learning_rate"])

        ## Training phase of the intelligent agent with A2C
        episode_rewards = list()
        previous_checkpoint_mean_ep_rew = self.best_mean_reward
        num_improved_episodes_before_checkpoint = 0
        if self.params['load_trained_model']:
            try:
                self.load()
                previous_checkpoint_mean_ep_rew = self.best_mean_reward
            except FileNotFoundError:
                print(
                    "ERROR: no trained model exists for this environment. Starting from scratch"
                )
                if args.test:
                    print(
                        "FATAL: no saved model available, so testing mode cannot proceed. Press any key to start training from scratch"
                    )
                    input()
                else:
                    print(
                        "WARNING: no model available for this environment. Training will start from scratch..."
                    )

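        # Main training loop: interact with the environment, learn periodically and checkpoint on improvement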
        for episode in range(self.params["max_num_episodes"]):
            obs = self.env.reset()
            done = False
            ep_reward = 0.0
            step_num = 0
            while not done:
                action = self.get_action(obs)
                next_obs, reward, done, _ = self.env.step(action)
                self.rewards.append(reward)
                ep_reward += reward
                step_num += 1

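                # Run a learning update every learning_step_thresh steps or when the episode ends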
                if not args.test and (
                        step_num > self.params["learning_step_thresh"]
                        or done):
                    self.learn(next_obs, done)
                    step_num = 0

                    if done:
                        episode_rewards.append(ep_reward)
                        if ep_reward > self.best_reward:
                            self.best_reward = ep_reward

                        if np.mean(episode_rewards) > previous_checkpoint_mean_ep_rew:
                            num_improved_episodes_before_checkpoint += 1

                        if num_improved_episodes_before_checkpoint >= self.params['save_freq']:
                            previous_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                            self.best_mean_reward = np.mean(episode_rewards)
                            self.save()
                            num_improved_episodes_before_checkpoint = 0

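                # Move to the next state, optionally render, and log per-step metrics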
                obs = next_obs
                self.global_step_num += 1
                if args.render:
                    self.env.render()

                print(
                    "\n{}: Episode #{}: episode reward = {}, mean reward = {:.2f}, best reward = {}"
                    .format(self.actor_name, episode, ep_reward,
                            np.mean(episode_rewards), self.best_reward))

                writer.add_scalar(self.actor_name + "/reward", reward,
                                  self.global_step_num)
                writer.add_scalar(self.actor_name + "/ep_reward", ep_reward,
                                  self.global_step_num)
                writer.add_scalar(self.actor_name + "/mean_ep_reward",
                                  np.mean(episode_rewards),
                                  self.global_step_num)
                writer.add_scalar(self.actor_name + "/max_ep_reward",
                                  self.best_reward, self.global_step_num)