Example No. 1
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = Neural_Network(self.state_size + self.action_size,
                                           1, self.random_seed,
                                           self.hyperparameters["Critic"],
                                           "VANILLA_NN").to(self.device)
        self.critic_target = copy.deepcopy(self.critic_local).to(self.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.random_seed)
        self.actor_local = Neural_Network(self.state_size, self.action_size,
                                          self.random_seed,
                                          self.hyperparameters["Actor"],
                                          "VANILLA_NN").to(self.device)
        self.actor_target = copy.deepcopy(self.actor_local).to(self.device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.random_seed,
                              self.hyperparameters["mu"],
                              self.hyperparameters["theta"],
                              self.hyperparameters["sigma"])
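
Every example on this page constructs an OU_Noise object with an action size, a seed and optionally mu, theta and sigma, but the class itself is never shown. Below is a minimal sketch of an Ornstein-Uhlenbeck noise process compatible with these call sites; the implementation in the source repository may differ in details.

import numpy as np

class OU_Noise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for continuous-action exploration (sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise vector."""
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
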
Example No. 2
    def __init__(self, worker_num, environment, shared_model, counter,
                 optimizer_lock, shared_optimizer, config, episodes_to_run,
                 epsilon_decay_denominator, action_size, action_types,
                 results_queue, local_model, gradient_updates_queue):
        super(Actor_Critic_Worker, self).__init__()
        self.environment = environment
        self.config = config
        self.worker_num = worker_num

        self.gradient_clipping_norm = self.config.hyperparameters[
            "gradient_clipping_norm"]
        self.discount_rate = self.config.hyperparameters["discount_rate"]
        self.normalise_rewards = self.config.hyperparameters[
            "normalise_rewards"]

        self.action_size = action_size
        self.set_seeds(self.worker_num)
        self.shared_model = shared_model
        self.local_model = local_model
        self.local_optimizer = Adam(self.local_model.parameters(),
                                    lr=0.0,  # lr is unused: this optimizer only zeroes gradients; updates go through the shared optimizer
                                    eps=1e-4)
        self.counter = counter
        self.optimizer_lock = optimizer_lock
        self.shared_optimizer = shared_optimizer
        self.episodes_to_run = episodes_to_run
        self.epsilon_decay_denominator = epsilon_decay_denominator
        self.exploration_worker_difference = self.config.hyperparameters[
            "exploration_worker_difference"]
        self.action_types = action_types
        self.results_queue = results_queue
        self.episode_number = 0
        self.noise = OU_Noise(self.action_size, config.seed)

        self.gradient_updates_queue = gradient_updates_queue
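
This constructor only stores gradient_updates_queue; the fuller Actor_Critic_Worker listing later on this page (Example No. 9) shows put_gradients_in_queue pushing each episode's gradients into it instead of stepping an optimizer locally. The consumer side never appears in these examples, so the loop below is purely hypothetical (every name in it is an assumption):

# Hypothetical consumer of gradient_updates_queue, run in the main/optimizer process (sketch).
def apply_worker_gradients(shared_model, shared_optimizer, gradient_updates_queue, optimizer_lock):
    gradients = gradient_updates_queue.get()  # blocks until some worker pushes a list of gradient tensors
    with optimizer_lock:
        shared_optimizer.zero_grad()
        for param, grad in zip(shared_model.parameters(), gradients):
            param.grad = grad  # hand the worker's gradient to the shared parameter
        shared_optimizer.step()
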
Example No. 3
    def __init__(self,
                 environment,
                 policy,
                 seed,
                 hyperparameters,
                 use_GPU=False):
        self.use_GPU = use_GPU
        self.environment = environment
        self.action_size = self.environment.get_action_size()
        self.action_types = self.environment.get_action_types()
        self.policy = policy
        self.hyperparameters = hyperparameters
        self.noise = OU_Noise(self.action_size, seed,
                              self.hyperparameters["mu"],
                              self.hyperparameters["theta"],
                              self.hyperparameters["sigma"])
Example No. 4
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target.load_state_dict(copy.deepcopy(self.critic_local.state_dict()))

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target.load_state_dict(copy.deepcopy(self.actor_local.state_dict()))

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                              self.hyperparameters["theta"], self.hyperparameters["sigma"])
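
Both DDPG constructors above (Examples No. 1 and No. 4) read the same nested hyperparameter keys. The concrete values are not part of the examples; an illustrative config.hyperparameters dictionary that satisfies those lookups might look as follows (all numbers are placeholders, and the Neural_Network / create_NN helpers will typically require extra keys, e.g. hidden-layer sizes, that are not shown here):

hyperparameters = {
    "Actor": {"learning_rate": 0.003, "gradient_clipping_norm": 5, "tau": 0.005},
    "Critic": {"learning_rate": 0.02, "buffer_size": 1000000, "gradient_clipping_norm": 5, "tau": 0.005},
    "batch_size": 256,
    "discount_rate": 0.99,
    "update_every_n_steps": 20,
    "learning_updates_per_learning_session": 10,
    "mu": 0.0,      # OU_Noise long-run mean
    "theta": 0.15,  # OU_Noise mean-reversion rate
    "sigma": 0.25,  # OU_Noise volatility
    "clip_rewards": False,
}
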
Example No. 5
    def __init__(self,
                 environment,
                 policy,
                 seed,
                 hyperparameters,
                 action_size,
                 use_GPU=False,
                 action_choice_output_columns=None):
        self.use_GPU = use_GPU
        self.environment = environment
        self.action_types = "DISCRETE" if self.environment.action_space.dtype == int else "CONTINUOUS"
        self.action_size = action_size
        self.policy = policy
        self.action_choice_output_columns = action_choice_output_columns
        self.hyperparameters = hyperparameters
        if self.action_types == "CONTINUOUS":
            self.noise = OU_Noise(self.action_size, seed,
                                  self.hyperparameters["mu"],
                                  self.hyperparameters["theta"],
                                  self.hyperparameters["sigma"])
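
This variant infers DISCRETE versus CONTINUOUS from action_space.dtype, which is fragile: the dtype of a Gym Discrete space is an integer type whose exact name varies, which is why Example No. 7 below checks dtype in [int, 'int64']. A sketch of a more direct check, assuming a Gym-style environment:

import gym

def infer_action_types(action_space):
    """Classify the action space by its type rather than by its dtype (sketch)."""
    return "DISCRETE" if isinstance(action_space, gym.spaces.Discrete) else "CONTINUOUS"
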
Example No. 6
class DDPG_Agent(Base_Agent):
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = Neural_Network(self.state_size + self.action_size,
                                           1, self.random_seed,
                                           self.hyperparameters["Critic"],
                                           "VANILLA_NN").to(self.device)
        self.critic_target = copy.deepcopy(self.critic_local).to(self.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.random_seed)
        self.actor_local = Neural_Network(self.state_size, self.action_size,
                                          self.random_seed,
                                          self.hyperparameters["Actor"],
                                          "VANILLA_NN").to(self.device)
        self.actor_target = copy.deepcopy(self.actor_local).to(self.device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.random_seed,
                              self.hyperparameters["mu"],
                              self.hyperparameters["theta"],
                              self.hyperparameters["sigma"])

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        self.noise.reset()

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            self.pick_and_conduct_action()
            self.update_next_state_reward_done_and_score()
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters[
                        "learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.memory.sample()  # Sample experiences
                    self.critic_learn(states, actions, rewards, next_states,
                                      dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  #this is to set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_action(self):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        state = torch.from_numpy(self.state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return action

    def critic_learn(self, states, actions, rewards, next_states, dones):
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(
            self.critic_optimizer, self.critic_local, loss,
            self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.critic_local, self.critic_target,
            self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(
                next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        critic_targets_next = self.compute_critic_values_for_next_states(
            next_states)
        critic_targets = self.compute_critic_values_for_current_states(
            rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(
                torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards,
                                                 critic_targets_next, dones):
        critic_targets_current = rewards + (
            self.hyperparameters["discount_rate"] * critic_targets_next *
            (1 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        return (self.enough_experiences_to_learn_from() and
                self.episode_step_number % self.hyperparameters["update_every_n_steps"] == 0)

    def actor_learn(self, states):
        if self.done:  #we only update the learning rate at end of each episode
            self.update_learning_rate(
                self.hyperparameters["Actor"]["learning_rate"],
                self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(
            self.actor_optimizer, self.actor_local, actor_loss,
            self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(
            self.actor_local, self.actor_target,
            self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat(
            (states, actions_pred), 1)).mean()
        return actor_loss
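
DDPG_Agent leans on helpers inherited from Base_Agent (take_optimisation_step, soft_update_of_target_network, update_learning_rate) that are not reproduced on this page. For reference, the soft target update it calls with a per-network tau is presumably a Polyak average along these lines (a sketch of the assumed helper, not the repository's code):

def soft_update_of_target_network(local_model, target_model, tau):
    """target <- tau * local + (1 - tau) * target, applied parameter-wise (sketch)."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
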
Example No. 7
class Parallel_Experience_Generator(object):
    """ Plays n episode in parallel using a fixed agent. Only works for PPO or DDPG type agents at the moment, not Q-learning agents"""
    def __init__(self,
                 environment,
                 policy,
                 seed,
                 hyperparameters,
                 action_size,
                 use_GPU=False,
                 action_choice_output_columns=None):
        self.use_GPU = use_GPU
        self.environment = environment
        self.action_types = "DISCRETE" if self.environment.action_space.dtype in [
            int, 'int64'
        ] else "CONTINUOUS"
        self.action_size = action_size
        self.policy = policy
        self.action_choice_output_columns = action_choice_output_columns
        self.hyperparameters = hyperparameters
        if self.action_types == "CONTINUOUS":
            self.noise = OU_Noise(self.action_size, seed,
                                  self.hyperparameters["mu"],
                                  self.hyperparameters["theta"],
                                  self.hyperparameters["sigma"])

    def play_n_episodes(self, n, exploration_epsilon=None):
        """Plays n episodes in parallel using the fixed policy and returns the data"""
        self.exploration_epsilon = exploration_epsilon
        with closing(Pool(processes=n)) as pool:
            results = pool.map(self, range(n))
            pool.terminate()
        states_for_all_episodes = [episode[0] for episode in results]
        actions_for_all_episodes = [episode[1] for episode in results]
        rewards_for_all_episodes = [episode[2] for episode in results]
        return states_for_all_episodes, actions_for_all_episodes, rewards_for_all_episodes

    def __call__(self, n):
        exploration = max(
            0.0,
            random.uniform(self.exploration_epsilon / 3.0,
                           self.exploration_epsilon * 3.0))
        return self.play_1_episode(exploration)

    def play_1_episode(self, epsilon_exploration):
        """Plays 1 episode using the fixed policy and returns the data"""
        state = self.reset_game()
        done = False
        episode_states = []
        episode_actions = []
        episode_rewards = []
        while not done:
            action = self.pick_action(self.policy, state, epsilon_exploration)
            next_state, reward, done, _ = self.environment.step(action)
            if self.hyperparameters["clip_rewards"]:
                reward = max(min(reward, 1.0), -1.0)
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)
            state = next_state
        return episode_states, episode_actions, episode_rewards

    def reset_game(self):
        """Resets the game environment so it is ready to play a new episode"""
        seed = randint(0, sys.maxsize)
        torch.manual_seed(
            seed
        )  # Need to do this otherwise each worker generates same experience
        state = self.environment.reset()
        if self.action_types == "CONTINUOUS": self.noise.reset()
        return state

    def pick_action(self, policy, state, epsilon_exploration=None):
        """Picks an action using the policy"""
        if self.action_types == "DISCRETE":
            if random.random() <= epsilon_exploration:
                action = random.randint(0, self.action_size - 1)
                return action

        # state = torch.from_numpy(state).float().unsqueeze(0)
        state = torch.from_numpy(state).float()
        actor_output = policy.forward(state)
        if self.action_choice_output_columns is not None:
            actor_output = actor_output[:, self.action_choice_output_columns]
        action_distribution = create_actor_distribution(
            self.action_types, actor_output, self.action_size)
        action = action_distribution.sample().cpu()

        if self.action_types == "CONTINUOUS":
            action += torch.Tensor(self.noise.sample())
        else:
            action = action.item()
        return action
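
Because play_n_episodes maps the generator object itself over a multiprocessing Pool, the generator and its policy must be picklable, and __call__ re-samples an exploration level per worker. A hypothetical call site (env, actor_policy and hyperparameters are placeholders, not names from the examples):

# generator = Parallel_Experience_Generator(env, actor_policy, seed=1, hyperparameters=hyperparameters,
#                                           action_size=env.action_space.shape[0])
# states, actions, rewards = generator.play_n_episodes(n=4, exploration_epsilon=0.2)
# print(len(states), len(states[0]))  # 4 episodes, each a list of visited states
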
Example No. 8
class Parallel_Experience_Generator(object):
    """ Plays n episode in parallel using a fixed agent. Only works for PPO or DDPG type agents at the moment, not Q-learning agents"""
    def __init__(self,
                 environment,
                 policy,
                 seed,
                 hyperparameters,
                 use_GPU=False):
        self.use_GPU = use_GPU
        self.environment = environment
        self.action_size = self.environment.get_action_size()
        self.action_types = self.environment.get_action_types()
        self.policy = policy
        self.hyperparameters = hyperparameters
        self.noise = OU_Noise(self.action_size, seed,
                              self.hyperparameters["mu"],
                              self.hyperparameters["theta"],
                              self.hyperparameters["sigma"])

    def play_n_episodes(self, n):
        """Plays n episodes in parallel using the fixed policy and returns the data"""
        if self.use_GPU:
            with closing(GPU_POOL(processes=n)) as pool:
                results = pool.map(self, range(n))
                pool.terminate()
        else:
            with closing(Pool(processes=n)) as pool:
                results = pool.map(self, range(n))
                pool.terminate()
        states_for_all_episodes = [episode[0] for episode in results]
        actions_for_all_episodes = [episode[1] for episode in results]
        rewards_for_all_episodes = [episode[2] for episode in results]
        return states_for_all_episodes, actions_for_all_episodes, rewards_for_all_episodes

    def __call__(self, n):
        return self.play_1_episode()

    def play_1_episode(self):
        """Plays 1 episode using the fixed policy and returns the data"""
        state = self.reset_game()
        done = False
        episode_states = []
        episode_actions = []
        episode_rewards = []
        while not done:
            action = self.pick_action(self.policy, state)
            self.environment.conduct_action(action)
            next_state = self.environment.get_next_state()
            reward = self.environment.get_reward()
            done = self.environment.get_done()
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)
            state = next_state
        return episode_states, episode_actions, episode_rewards

    def reset_game(self):
        """Resets the game environment so it is ready to play a new episode"""
        seed = randint(0, 10000)
        torch.manual_seed(
            seed
        )  # Need to do this otherwise each worker generates same experience
        self.environment.reset_environment()
        self.noise.reset()
        state = self.environment.get_state()
        return state

    def pick_action(self, policy, state):
        """Picks an action using the policy"""
        state = torch.from_numpy(state).float().unsqueeze(0)
        actor_output = policy.forward(state)
        action_distribution = create_actor_distribution(
            self.action_types, actor_output, self.action_size)
        action = action_distribution.sample().cpu().numpy()
        if self.action_types == "CONTINUOUS":
            action += self.noise.sample()
        return action
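
Both generator variants, and the Actor_Critic_Worker in Example No. 9, defer to create_actor_distribution, which never appears on this page. The sketch below is consistent with how it is called here, assuming the continuous head outputs means followed by standard deviations and the discrete head outputs action probabilities; the real helper may differ:

import torch
from torch.distributions import Categorical, Normal

def create_actor_distribution(action_types, actor_output, action_size):
    """Build a torch distribution to sample actions from (sketch of the assumed helper)."""
    if action_types == "DISCRETE":
        return Categorical(actor_output)          # assumes the network outputs action probabilities
    means = actor_output[:, :action_size].squeeze(0)
    stds = actor_output[:, action_size:].squeeze(0)
    return Normal(means, torch.abs(stds) + 1e-6)  # keep the scale strictly positive
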
Example No. 9
class Actor_Critic_Worker(torch.multiprocessing.Process):
    """Actor critic worker that will play the game for the designated number of episodes """
    def __init__(self, worker_num, environment, shared_model, counter,
                 optimizer_lock, shared_optimizer, config, episodes_to_run,
                 epsilon_decay_denominator, action_size, action_types,
                 results_queue, local_model, gradient_updates_queue):
        super(Actor_Critic_Worker, self).__init__()
        self.environment = environment
        self.config = config
        self.worker_num = worker_num

        self.gradient_clipping_norm = self.config.hyperparameters[
            "gradient_clipping_norm"]
        self.discount_rate = self.config.hyperparameters["discount_rate"]
        self.normalise_rewards = self.config.hyperparameters[
            "normalise_rewards"]

        self.action_size = action_size
        self.set_seeds(self.worker_num)
        self.shared_model = shared_model
        self.local_model = local_model
        self.local_optimizer = Adam(self.local_model.parameters(),
                                    lr=0.0,  # lr is unused: this optimizer only zeroes gradients; updates go through the shared optimizer
                                    eps=1e-4)
        self.counter = counter
        self.optimizer_lock = optimizer_lock
        self.shared_optimizer = shared_optimizer
        self.episodes_to_run = episodes_to_run
        self.epsilon_decay_denominator = epsilon_decay_denominator
        self.exploration_worker_difference = self.config.hyperparameters[
            "exploration_worker_difference"]
        self.action_types = action_types
        self.results_queue = results_queue
        self.episode_number = 0
        self.noise = OU_Noise(self.action_size, config.seed)

        self.gradient_updates_queue = gradient_updates_queue

    def set_seeds(self, worker_num):
        """Sets random seeds for this worker"""
        torch.manual_seed(self.config.seed + worker_num)
        self.environment.seed(self.config.seed + worker_num)

    def run(self):
        """Starts the worker"""
        torch.set_num_threads(1)
        for ep_ix in range(self.episodes_to_run):
            with self.optimizer_lock:
                Base_Agent.copy_model_over(self.shared_model, self.local_model)
            epsilon_exploration = self.calculate_new_exploration()
            state = self.reset_game_for_worker()
            done = False
            self.episode_states = []
            self.episode_actions = []
            self.episode_rewards = []
            self.episode_log_action_probabilities = []
            self.critic_outputs = []

            while not done:
                action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(
                    self.local_model, state, epsilon_exploration)
                next_state, reward, done, _ = self.environment.step(action)
                self.episode_states.append(state)
                self.episode_actions.append(action)
                self.episode_rewards.append(reward)
                self.episode_log_action_probabilities.append(action_log_prob)
                self.critic_outputs.append(critic_outputs)
                state = next_state

            total_loss = self.calculate_total_loss()
            self.put_gradients_in_queue(total_loss)
            self.episode_number += 1
            with self.counter.get_lock():
                self.counter.value += 1
                self.results_queue.put(np.sum(self.episode_rewards))

    def calculate_new_exploration(self):
        """Calculates the new exploration parameter epsilon. It picks a random point within 3X above and below the
        current epsilon"""
        with self.counter.get_lock():
            epsilon = 1.0 / (
                1.0 + (self.counter.value / self.epsilon_decay_denominator))
        epsilon = max(
            0.0,
            random.uniform(epsilon / self.exploration_worker_difference,
                           epsilon * self.exploration_worker_difference))
        return epsilon

    def reset_game_for_worker(self):
        """Resets the game environment so it is ready to play a new episode"""
        state = self.environment.reset()
        if self.action_types == "CONTINUOUS": self.noise.reset()
        return state

    def pick_action_and_get_critic_values(self,
                                          policy,
                                          state,
                                          epsilon_exploration=None):
        """Picks an action using the policy"""
        # state = torch.from_numpy(state).float().unsqueeze(0)
        state = torch.from_numpy(state).float()
        model_output = policy.forward(state)
        actor_output = model_output[:, list(
            range(self.action_size)
        )]  # we only use the first action_size columns to decide the action; the last column is the state value
        critic_output = model_output[:, -1]
        action_distribution = create_actor_distribution(
            self.action_types, actor_output, self.action_size)
        action = action_distribution.sample().cpu().numpy()
        if self.action_types == "CONTINUOUS": action += self.noise.sample()
        if self.action_types == "DISCRETE":
            if random.random() <= epsilon_exploration:
                action = random.randint(0, self.action_size - 1)
            else:
                action = action[0]
        action_log_prob = self.calculate_log_action_probability(
            action, action_distribution)
        return action, action_log_prob, critic_output

    def calculate_log_action_probability(self, actions, action_distribution):
        """Calculates the log probability of the chosen action"""
        policy_distribution_log_prob = action_distribution.log_prob(
            torch.Tensor([actions]))
        return policy_distribution_log_prob

    def calculate_total_loss(self):
        """Calculates the actor loss + critic loss"""
        discounted_returns = self.calculate_discounted_returns()
        if self.normalise_rewards:
            discounted_returns = self.normalise_discounted_returns(
                discounted_returns)
        critic_loss, advantages = self.calculate_critic_loss_and_advantages(
            discounted_returns)
        actor_loss = self.calculate_actor_loss(advantages)
        total_loss = actor_loss + critic_loss
        return total_loss

    def calculate_discounted_returns(self):
        """Calculates the cumulative discounted return for an episode which we will then use in a learning iteration"""
        discounted_returns = [0]
        for ix in range(len(self.episode_states)):
            return_value = self.episode_rewards[-(
                ix + 1)] + self.discount_rate * discounted_returns[-1]
            discounted_returns.append(return_value)
        discounted_returns = discounted_returns[1:]
        discounted_returns = discounted_returns[::-1]
        return discounted_returns

    def normalise_discounted_returns(self, discounted_returns):
        """Normalises the discounted returns by dividing by mean and std of returns that episode"""
        mean = np.mean(discounted_returns)
        std = np.std(discounted_returns)
        discounted_returns -= mean
        discounted_returns /= (std + 1e-5)
        return discounted_returns

    def calculate_critic_loss_and_advantages(self, all_discounted_returns):
        """Calculates the critic's loss and the advantages"""
        critic_values = torch.cat(self.critic_outputs)
        advantages = torch.Tensor(all_discounted_returns) - critic_values
        advantages = advantages.detach()
        critic_loss = (torch.Tensor(all_discounted_returns) - critic_values)**2
        critic_loss = critic_loss.mean()
        return critic_loss, advantages

    def calculate_actor_loss(self, advantages):
        """Calculates the loss for the actor"""
        action_log_probabilities_for_all_episodes = torch.cat(
            self.episode_log_action_probabilities)
        actor_loss = -1.0 * action_log_probabilities_for_all_episodes * advantages
        actor_loss = actor_loss.mean()
        return actor_loss

    def put_gradients_in_queue(self, total_loss):
        """Puts gradients in a queue for the optimisation process to use to update the shared model"""
        self.local_optimizer.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.local_model.parameters(),
                                       self.gradient_clipping_norm)
        gradients = [
            param.grad.clone() for param in self.local_model.parameters()
        ]
        self.gradient_updates_queue.put(gradients)
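
calculate_discounted_returns above accumulates the return backwards through the episode and then reverses the list so it lines up with episode_states. A quick worked check of that logic with discount_rate = 0.5 and rewards [1, 0, 2]:

rewards, discount_rate = [1.0, 0.0, 2.0], 0.5
discounted_returns = [0.0]
for ix in range(len(rewards)):
    # walk the rewards back to front, bootstrapping off the previously computed return
    discounted_returns.append(rewards[-(ix + 1)] + discount_rate * discounted_returns[-1])
print(discounted_returns[1:][::-1])  # [1.5, 1.0, 2.0]
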
Example No. 10
class DDPG(Base_Agent):
    """A DDPG Agent"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target.load_state_dict(copy.deepcopy(self.critic_local.state_dict()))

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"])
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target.load_state_dict(copy.deepcopy(self.actor_local.state_dict()))

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                              self.hyperparameters["theta"], self.hyperparameters["sigma"])

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        self.noise.reset()

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            # print("State ", self.state.shape)
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state #this is to set the state for the next iteration
            self.global_step_number += 1
        self.episode_number += 1

    def sample_experiences(self):
        return self.memory.sample()

    def pick_action(self):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action += self.noise.sample()
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for the critic"""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss for the critic"""
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
        """Computes the critic values for current states to be used in the loss for the critic"""
        critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        """Computes the expected critic values to be used in the loss for the critic"""
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def actor_learn(self, states):
        """Runs a learning iteration for the actor"""
        if self.done: #we only update the learning rate at end of each episode
            self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        """Calculates the loss for the actor"""
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
        return actor_loss
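
Finally, a hypothetical driver for the DDPG class above. The Config object and its attributes (seed, environment, hyperparameters) are assumptions inferred from how the constructors use them, not code taken from the examples:

# config = Config()                                 # hypothetical config container
# config.seed = 1
# config.environment = gym.make("Pendulum-v1")      # any continuous-action environment
# config.hyperparameters = hyperparameters          # e.g. the illustrative dict sketched after Example No. 4
# agent = DDPG(config)
# for episode in range(500):
#     agent.reset_game()
#     agent.step()                                  # plays one full episode, learning every update_every_n_steps steps
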