# Shared imports for the DDPG snippets in this listing; the exact module paths
# are assumptions based on the agent framework these classes extend.
import os

import numpy as np
import torch
from torch import optim
import torch.nn.functional as functional

from agents.Base_Agent import Base_Agent
from utilities.data_structures.Replay_Buffer import Replay_Buffer
from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration


class DDPG(Base_Agent):
    """A DDPG agent that can also save and reload its actor and critic networks"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.critic_local_path = os.path.join(
            model_path, "{}_critic_local.pt".format(self.agent_name))
        self.critic_local_2_path = os.path.join(
            model_path, "{}_critic_local_2.pt".format(self.agent_name))
        self.actor_local_path = os.path.join(
            model_path, "{}_actor_local.pt".format(self.agent_name))
        if self.config.load_model: self.locally_load_policy()
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")

        # The saved weights were already loaded above, so here we only copy the
        # (possibly restored) actor over to its freshly created target network
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)
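
    # locally_load_policy() is called above but not defined in this snippet. Below is
    # a minimal sketch of it and of a matching (hypothetical) locally_save_policy,
    # assuming the usual torch.save / load_state_dict pattern and the path attributes
    # set in __init__:
    def locally_save_policy(self):
        """Saves the current actor and critic weights to the configured paths."""
        torch.save(self.critic_local.state_dict(), self.critic_local_path)
        torch.save(self.actor_local.state_dict(), self.actor_local_path)

    def locally_load_policy(self):
        """Restores previously saved actor and critic weights, if the files exist."""
        if os.path.isfile(self.critic_local_path):
            self.critic_local.load_state_dict(torch.load(self.critic_local_path))
        if os.path.isfile(self.actor_local_path):
            self.actor_local.load_state_dict(torch.load(self.actor_local_path))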
Example #4
class DDPG(Base_Agent):
    """A DDPG Agent"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.exploration_strategy = OU_Noise_Exploration(self.config)

    def step(self):
        """Runs a full episode in the environment, learning as experience accumulates"""
        while not self.done:
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  # set the state for the next iteration
            self.global_step_number += 1
        self.episode_number += 1

    def sample_experiences(self):
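        """Samples a minibatch of (states, actions, rewards, next_states, dones) from the replay buffer"""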
        return self.memory.sample()

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
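        # eval() makes layers such as dropout or batch norm behave deterministically
        # while the action is computed; train() restores training behaviour afterwards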
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action})
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for the critic"""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss for the critic"""
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
        """Computes the critic values for current states to be used in the loss for the critic"""
        # Bellman target: rewards + discount_rate * Q_target(s', mu_target(s')) * (1 - dones);
        # multiplying by (1 - dones) stops bootstrapping from terminal next states
        critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        """Computes the expected critic values to be used in the loss for the critic"""
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def actor_learn(self, states):
        """Runs a learning iteration for the actor"""
        if self.done: #we only update the learning rate at end of each episode
            self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        """Calculates the loss for the actor"""
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
        return actor_loss
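
To make the updates above concrete, here is a small self-contained sketch of the same critic and actor losses on random tensors. The two-layer networks, batch shapes, and hyperparameter values are illustrative assumptions, not what create_NN actually builds:

import torch
import torch.nn as nn
import torch.nn.functional as functional

state_size, action_size, batch_size = 3, 1, 32
actor = nn.Sequential(nn.Linear(state_size, 16), nn.ReLU(),
                      nn.Linear(16, action_size), nn.Tanh())
critic = nn.Sequential(nn.Linear(state_size + action_size, 16), nn.ReLU(),
                       nn.Linear(16, 1))
actor_target = nn.Sequential(nn.Linear(state_size, 16), nn.ReLU(),
                             nn.Linear(16, action_size), nn.Tanh())
critic_target = nn.Sequential(nn.Linear(state_size + action_size, 16), nn.ReLU(),
                              nn.Linear(16, 1))
actor_target.load_state_dict(actor.state_dict())      # target networks start as copies
critic_target.load_state_dict(critic.state_dict())
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-3)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-3)

# A fake replay-buffer minibatch in the shapes sample_experiences() would return
states = torch.randn(batch_size, state_size)
actions = torch.randn(batch_size, action_size)
rewards = torch.randn(batch_size, 1)
next_states = torch.randn(batch_size, state_size)
dones = torch.zeros(batch_size, 1)
discount_rate, tau, clip_norm = 0.99, 0.005, 5.0

# Critic update: regress Q(s, a) towards rewards + discount_rate * Q_target(s', mu_target(s'))
with torch.no_grad():
    next_actions = actor_target(next_states)
    q_next = critic_target(torch.cat((next_states, next_actions), 1))
    critic_targets = rewards + discount_rate * q_next * (1.0 - dones)
critic_loss = functional.mse_loss(critic(torch.cat((states, actions), 1)), critic_targets)
critic_optimizer.zero_grad()
critic_loss.backward()
torch.nn.utils.clip_grad_norm_(critic.parameters(), clip_norm)
critic_optimizer.step()

# Actor update: move the policy towards actions the critic currently scores highly
actor_loss = -critic(torch.cat((states, actor(states)), 1)).mean()
actor_optimizer.zero_grad()
actor_loss.backward()
torch.nn.utils.clip_grad_norm_(actor.parameters(), clip_norm)
actor_optimizer.step()

# Soft update of the target networks (what soft_update_of_target_network does)
for target_net, local_net in ((critic_target, critic), (actor_target, actor)):
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
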
class DDPG(Base_Agent):
    """A DDPG Agent"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)

        if self.video_mode:
            self.file_name = self.environment_title + "_" + self.agent_name + "_videos"
            # Create the directory that rendered episode frames will be saved into
            os.makedirs(self.file_name, exist_ok=True)
        self.save_max_result_list_list = []

    def step(self):
        """Runs a full episode in the game, rendering and saving frames when video_mode is on"""
        # Only record complete episode videos for the last 10 episodes of the run
        record_video = self.video_mode and self.config.num_episodes_to_run - 10 <= self.episode_number
        if record_video:
            render_list = []
        save_max_score_list = []
        while not self.done:
            self.action = self.pick_action()
            if self.get_environment_title() == "CartPole":
                # CartPole takes a discrete action, so use the index of the largest
                # actor output and keep a one-hot copy of it in self.action
                go_action = np.argmax(self.action)
                self.action = np.zeros(2)
                self.action[go_action] = 1
                self.conduct_action(go_action)
            else:
                self.conduct_action(self.action)
                img = self.environment.render('rgb_array')
                if record_video:
                    render_list.append(img)
                save_max_score_list.append(img)

            # Learn once enough experience has been collected and it is time to update
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters[
                        "learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states,
                                      dones)
                    self.actor_learn(states)

            self.save_experience()
            self.state = self.next_state  # set the state for the next iteration
            self.global_step_number += 1

        if record_video:
            render_list = np.array(render_list)
            np.save(self.file_name + '/episode' + str(self.episode_number + 1),
                    render_list)

        # Keep frames from up to 10 recent episodes whose score beat the -0.2 threshold
        if self.total_episode_score_so_far > -0.2:
            if len(self.save_max_result_list_list) == 10:
                self.save_max_result_list_list.pop(0)
            self.save_max_result_list_list.append(save_max_score_list)

        # After the final episode, write the stored high-scoring rollouts to disk
        if self.config.num_episodes_to_run == self.episode_number + 1:
            for i, save_max_score_list in enumerate(self.save_max_result_list_list,
                                                    start=1):
                np.save(self.file_name + '/maxscore' + str(i),
                        np.array(save_max_score_list))

        self.episode_number += 1

    def sample_experiences(self):
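        """Samples a minibatch of (states, actions, rewards, next_states, dones) from the replay buffer"""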
        return self.memory.sample()

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        if state is None:
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(
                self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {"action": action})
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for the critic"""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(
            self.critic_optimizer, self.critic_local, loss,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.critic_local, self.critic_target,
            self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss for the critic"""
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(
                next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(
            next_states)
        critic_targets = self.compute_critic_values_for_current_states(
            rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(
                torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards,
                                                 critic_targets_next, dones):
        """Computes the critic values for current states to be used in the loss for the critic"""
        # Bellman target: rewards + discount_rate * Q_target(s', mu_target(s')) * (1 - dones);
        # multiplying by (1 - dones) stops bootstrapping from terminal next states
        critic_targets_current = rewards + (
            self.hyperparameters["discount_rate"] * critic_targets_next *
            (1.0 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        """Computes the expected critic values to be used in the loss for the critic"""
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return (self.enough_experiences_to_learn_from() and
                self.global_step_number %
                self.hyperparameters["update_every_n_steps"] == 0)

    def actor_learn(self, states):
        """Runs a learning iteration for the actor"""
        if self.done:  #we only update the learning rate at end of each episode
            self.update_learning_rate(
                self.hyperparameters["Actor"]["learning_rate"],
                self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(
            self.actor_optimizer, self.actor_local, actor_loss,
            self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.actor_local, self.actor_target,
            self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        """Calculates the loss for the actor"""
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat(
            (states, actions_pred), 1)).mean()
        return actor_loss
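
The arrays that step() writes with np.save above are stacks of rendered RGB frames shaped (timesteps, height, width, 3). A minimal post-processing sketch for turning one of them into a GIF; imageio and the file names here are assumptions, not part of the code above:

import numpy as np
import imageio  # assumed third-party dependency for writing the GIF

frames = np.load("Pendulum_DDPG_videos/episode100.npy")  # hypothetical output file
imageio.mimsave("DDPG_episode100.gif", [frame.astype(np.uint8) for frame in frames])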