# Imports assumed by the examples below. CustomNN, CustomNeuralNetwork,
# set_random_seed, ReplayBuffer and Logger are project-specific helpers and
# are assumed to be importable from the surrounding project.
from copy import deepcopy
from typing import Any, Dict, Optional, Type

import numpy as np
import torch
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
import wandb


class REINFORCEAgentWithBaseline:
    def __init__(self, params={}):
        # parameters to be set from params dict
        self.γ = None

        self.policy_estimator = None
        self.function_approximator = None
        self.is_continuous = None

        self.states = []
        self.actions = []
        self.rewards = []

        self.tot_timestep = 0
        # TensorBoard writer used by learn_from_experience; assumed here,
        # since self.writer is referenced below but never set in the original.
        self.writer = SummaryWriter()

        self.set_params_from_dict(params)

    # ====== Initialization functions ==================================

    def set_params_from_dict(self, params={}):
        self.γ = params.get("discount_factor", 0.9)
        self.is_continuous = params.get("is_continuous", False)
        self.initialize_policy_estimator(params.get("policy_estimator_info"))
        self.initialize_baseline_network(
            params.get("function_approximator_info"))

    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNN(params)

    def initialize_baseline_network(self, params):
        self.function_approximator = CustomNN(params)

    # ====== Control related functions =================================

    def control(self):
        self.function_approximator.compute_weights()

    # ====== Action choice related functions ===========================

    def choose_action(self, state):
        if self.is_continuous:
            action_chosen = self.policy_estimator(state).detach().numpy()
        else:
            action_probs = self.policy_estimator(state).detach().numpy()
            action_chosen = np.random.choice(len(action_probs), p=action_probs)
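            # e.g. with action_probs == [0.1, 0.7, 0.2],
            # np.random.choice(3, p=action_probs) returns action 1
            # about 70% of the time.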
        return action_chosen

    def start(self, state):
        # choosing the action to take
        current_action = self.choose_action(state)
        self.states = np.array([state])
        self.actions = np.array([current_action])
        self.rewards = []

        return current_action

    def step(self, state, reward):
        # getting the action
        current_action = self.choose_action(state)
        self.save_transition(reward, state, current_action)
        return current_action

    def end(self, state, reward):
        self.save_transition(reward)

    def save_transition(self, reward, state=None, action=None):
        self.rewards.append(reward)
        if state is not None and action is not None:
            self.states = np.vstack((self.states, state))
            self.actions = np.append(self.actions, action)

    def learn_from_experience(self):
        # TODO: problem: since the last state is not appended to the list of
        # states, the last transition is not taken into account in the DQN
        # part. DQN?
        discounted_reward = 0
        reversed_episode = zip(self.rewards[::-1], self.states[::-1],
                               self.actions[::-1])
        for reward, state, action in reversed_episode:
            state_value = self.function_approximator(state)
            discounted_reward = reward + self.γ * discounted_reward
            δ = self.γ * (discounted_reward - state_value.detach())

            value_loss = -state_value * δ
            #value_loss = discounted_reward - state_value
            self.function_approximator.optimizer.zero_grad()
            value_loss.backward()
            self.function_approximator.optimizer.step()
            self.writer.add_scalar("Agent info/critic loss", value_loss,
                                   self.tot_timestep)

            # plot the policy entropy
            probs = self.policy_estimator(state).detach().numpy()
            entropy = -(np.sum(probs * np.log(probs)))
            self.writer.add_scalar("Agent info/policy entropy", entropy,
                                   self.tot_timestep)

            # take the negative of the expression so that the loss
            # penalizes at the right time.
            loss = -torch.log(self.policy_estimator(state)[action]) * δ
            self.policy_estimator.optimizer.zero_grad()
            loss.backward()
            self.policy_estimator.optimizer.step()
            self.writer.add_scalar("Agent info/actor loss", loss,
                                   self.tot_timestep)

            wandb.log({
                "Agent info/critic loss": value_loss,
                "Agent info/policy entropy": entropy,
                "Agent info/actor loss": loss
            })

    def get_state_value_eval(self, state):
        state_value = self.function_approximator(state).data
        return state_value
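
A minimal usage sketch (not from the original source) of how this agent's start/step/end/learn_from_experience interface could be driven; the environment, episode count and the contents of the two `*_info` dicts are assumptions, since they depend on the project's CustomNN configuration.

# Hypothetical driver loop -- assumes a classic Gym-style environment API and
# that the *_info dicts hold whatever CustomNN expects.
import gym

env = gym.make("CartPole-v1")
params = {
    "discount_factor": 0.99,
    "is_continuous": False,
    "policy_estimator_info": {},       # CustomNN config goes here (project-specific)
    "function_approximator_info": {},  # CustomNN config goes here (project-specific)
}
agent = REINFORCEAgentWithBaseline(params)

for episode in range(200):
    state = env.reset()                 # classic (pre-gymnasium) reset/step API
    action = agent.start(state)
    done = False
    while not done:
        state, reward, done, _ = env.step(action)
        if done:
            agent.end(state, reward)
        else:
            action = agent.step(state, reward)
    agent.learn_from_experience()       # one policy/baseline update per episode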
Example #2
    def initialize_function_approximator(
        self, params: Dict
    ) -> CustomNeuralNetwork:
        return CustomNeuralNetwork(**params)
Example #3
    def initialize_baseline_network(self, params):
        self.function_approximator = CustomNN(params)
Example #4
class REINFORCEAgent:
    def __init__(self, params={}):
        # parameters to be set from params dict
        self.γ = None
        self.policy_estimator = None
        self.is_continuous = None

        self.states = []
        self.actions = []
        self.rewards = []

        self.seed = None

        self.set_params_from_dict(params)

    # ====== Initialization functions ==================================

    def set_params_from_dict(self, params={}):
        self.γ = params.get("discount_factor", 0.9)
        self.is_continuous = params.get("is_continuous", False)
        self.init_seed(params.get("seed", None))
        self.initialize_policy_estimator(params.get("policy_estimator_info"))

    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNeuralNetwork(params)

    def init_seed(self, seed):
        if seed:
            self.seed = seed
            set_random_seed(self.seed)

    def set_seed(self, seed):
        if seed:
            self.seed = seed
            set_random_seed(self.seed)
            self.policy_estimator.set_seed(seed)

    # ====== Action choice related functions ===========================

    def choose_action(self, state):
        if self.is_continuous:
            # TODO: I don't think that's correct
            # (fall back to the raw policy output, as the baseline agent
            # does, so this branch at least returns something)
            action_chosen = self.policy_estimator(state).detach().numpy()
        else:
            action_probs = Categorical(self.policy_estimator(state))
            action_chosen = action_probs.sample().numpy()
        return action_chosen

    def start(self, state):
        # choosing the action to take
        current_action = self.choose_action(state)
        self.states = np.array([state])
        self.actions = np.array([current_action])
        self.rewards = []

        return current_action

    def step(self, state, reward):
        # choosing the action to take
        current_action = self.choose_action(state)
        self.rewards.append(reward)
        self.states = np.vstack((self.states, state))
        self.actions = np.append(self.actions, current_action)

        return current_action

    def end(self, state, reward):
        self.rewards.append(reward)

    def learn_from_experience(self):
        """ replays the episode backward and make gradient ascent over 
        the policy
        """
        #self.policy_estimator.optimizer.zero_grad()
        discounted_reward = 0
        reversed_episode = zip(self.rewards[::-1], self.states[::-1],
                               self.actions[::-1])
        for reward, state, action in reversed_episode:
            self.policy_estimator.optimizer.zero_grad()
            discounted_reward = reward + self.γ * discounted_reward
            # take the negative of the expression so that the loss
            # penalizes at the right time.
            loss = -torch.log(
                self.policy_estimator(state)[action]) * discounted_reward
            loss.backward()
            self.policy_estimator.optimizer.step()
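
A standalone illustration (not part of the agent) of the backward discounted-return recursion the loop above relies on, G_t = r_{t+1} + γ·G_{t+1}:

# Backward accumulation of discounted returns, as in learn_from_experience.
rewards = [1.0, 1.0, 1.0]   # rewards of a 3-step episode
γ = 0.9
G = 0.0
returns = []
for r in reversed(rewards):
    G = r + γ * G
    returns.append(G)
returns.reverse()
# returns == [2.71, 1.9, 1.0], i.e. G_0 = 1 + 0.9 * (1 + 0.9 * 1)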
Example #5
    def initialize_policy_estimator(self, params: Dict) -> CustomNeuralNetwork:
        return CustomNeuralNetwork(**params)
Example #6
    def initialize_neural_networks(self, nn_params):
        self.target_net = CustomNeuralNetwork(nn_params)
        self.eval_net = CustomNeuralNetwork(nn_params)
Example #7
    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNeuralNetwork(params)
Example #8
    def initialize_function_approximator(self, params):
        #self.function_approximator = DQN(params)
        self.function_approximator_eval = CustomNeuralNetwork(params)
        self.function_approximator_target = CustomNeuralNetwork(params)
Example #9
class ActorCriticAgent:
    def __init__(self, params={}):
        # parameters to be set from params dict
        self.γ = None
        self.num_actions = None

        self.policy_estimator = None

        self.function_approximator_eval = None
        self.function_approximator_target = None

        self.previous_state = None
        self.previous_action = None
        #self.rewards = []
        self.is_continuous = None

        # memory parameters
        self.memory_size = None
        self.memory = []
        self.memory_counter = 0
        self.batch_size = None

        self.update_target_counter = 0
        self.update_target_rate = None
        self.state_dim = None

        self.seed = None

        self.tot_timestep = 0

        self.set_params_from_dict(params)
        self.set_other_params()

    # ====== Initialization functions ==================================

    def set_params_from_dict(self, params={}):
        self.γ = params.get("discount_factor", 0.9)
        self.num_actions = params.get("num_actions", 1)
        self.is_continuous = params.get("is_continuous", False)

        self.initialize_policy_estimator(params.get("policy_estimator_info"))
        self.initialize_function_approximator(params.get(
            "function_approximator_info"))

        self.memory_size = params.get("memory_size", 200)
        self.update_target_rate = params.get("update_target_rate", 50)
        self.state_dim = params.get("state_dim", 4)
        self.batch_size = params.get("batch_size", 32)

        self.seed = params.get("seed", None)

    def set_other_params(self):
        # one memory row per transition: state_dim slots for the state,
        # one for the action, one for the reward, state_dim for the next
        # state and one for the terminal flag.
        self.memory = np.zeros((self.memory_size, 2 * self.state_dim + 3))
        

    def initialize_policy_estimator(self, params):
        self.policy_estimator = CustomNeuralNetwork(params)

    def initialize_function_approximator(self, params):
        #self.function_approximator = DQN(params)
        self.function_approximator_eval = CustomNeuralNetwork(params)
        self.function_approximator_target = CustomNeuralNetwork(params)

    # ====== Memory functions ==========================================

    def store_transition(self, state, action, reward, next_state, is_terminal):
        # store a transition (SARS') in the memory
        is_terminal = [is_terminal]
        transition = np.hstack((state, [action, reward], next_state, is_terminal))
        self.memory[self.memory_counter % self.memory_size, :] = transition
        self.incr_mem_cnt()
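        # resulting memory row layout (with the default state_dim == 4,
        # 2 * 4 + 3 == 11 columns):
        #   [ s (4) | action (1) | reward (1) | s' (4) | is_terminal (1) ]
        # matching the slices taken in sample_memory().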
        
    def incr_mem_cnt(self):
        # increment the memory counter; the write index wraps via the modulo
        # in store_transition, so the counter itself is not reset (control()
        # relies on it exceeding memory_size).
        self.memory_counter += 1
        #if self.memory_counter == self.memory_size:
        #    self.memory_counter = 0

    def sample_memory(self):
        # Sampling some indices from memory
        sample_index = np.random.choice(self.memory_size, self.batch_size)
        # Getting the batch of samples corresponding to those indices 
        # and dividing it into state, action, reward and next state
        batch_memory = self.memory[sample_index, :]
        batch_state = torch.tensor(batch_memory[:, :self.state_dim]).float()
        batch_action = torch.tensor(batch_memory[:, 
            self.state_dim:self.state_dim + 1].astype(int)).float()
        batch_reward = torch.tensor(batch_memory[:, 
            self.state_dim + 1:self.state_dim + 2]).float()
        batch_next_state = torch.tensor(batch_memory[:, -self.state_dim-1:-1]).float()
        batch_is_terminal = torch.tensor(batch_memory[:, -1:]).bool()

        return batch_state, batch_action, batch_reward, batch_next_state, batch_is_terminal

    def update_target_net(self):
        # every n learning cycles, the target network is replaced
        # with the eval network
        if self.update_target_counter % self.update_target_rate == 0:
            self.function_approximator_target.load_state_dict(
                self.function_approximator_eval.state_dict())
        self.update_target_counter += 1

    def control(self, state, reward):
        """Update the critic and the actor from a minibatch sampled from
        memory, once the memory has been filled at least once.

        :param state: current state, only used for the entropy diagnostic
        :param reward: unused; rewards come from the sampled minibatch
        """
        # every n learning cycles, the target network is replaced
        # with the eval network
        self.update_target_net()

        if self.memory_counter > self.memory_size:
            # getting batch data
            batch_state, batch_action, batch_reward, batch_next_state, batch_is_terminal = self.sample_memory()
            
            prev_state_value = self.function_approximator_eval(batch_state)
            state_value = self.function_approximator_target(batch_next_state)
            # zero the bootstrapped value of terminal next states
            nu_state_value = torch.masked_fill(state_value, batch_is_terminal, 0.0)

            δ = batch_reward + self.γ * nu_state_value.detach() - prev_state_value.detach()

            value_loss = - prev_state_value * δ 
            value_loss = value_loss.mean()
            self.function_approximator_eval.optimizer.zero_grad()
            value_loss.backward()
            self.function_approximator_eval.optimizer.step()

            # plot the policy entropy
            probs = self.policy_estimator(state).detach().numpy()
            entropy = -(np.sum(probs * np.log(probs)))
            
            logprob = - torch.log(self.policy_estimator(
                batch_state).gather(1, batch_action.long()))
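            # .gather(1, batch_action.long()) picks, per row, the probability
            # of the action that was actually taken, e.g. with
            # probs == [[0.2, 0.8], [0.6, 0.4]] and actions == [[1], [0]]
            # it returns [[0.8], [0.6]].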
            loss = logprob * δ 
            loss = loss.mean()
            self.policy_estimator.optimizer.zero_grad()
            loss.backward()
            self.policy_estimator.optimizer.step()

            wandb.log({
                "Agent info/critic loss": value_loss,
                "Agent info/policy entropy": entropy,
                "Agent info/actor loss": loss
            })

    def vanilla_control(self, state, reward, is_terminal_state):
        prev_state_value = self.function_approximator_eval(self.previous_state)
        if is_terminal_state:
            cur_state_value = torch.tensor([0])
        else:
            cur_state_value = self.function_approximator_eval(state)
        δ = reward + self.γ * cur_state_value.detach() - prev_state_value.detach()

        value_loss = - prev_state_value * δ 
        self.function_approximator_eval.optimizer.zero_grad()
        value_loss.backward()
        self.function_approximator_eval.optimizer.step()
        

        # plot the policy entropy
        probs = self.policy_estimator(state).detach().numpy()
        entropy = -(np.sum(probs * np.log(probs)))
        

        logprob = - torch.log(self.policy_estimator(self.previous_state)[self.previous_action])
        loss = logprob * δ 
        self.policy_estimator.optimizer.zero_grad()
        loss.backward()
        self.policy_estimator.optimizer.step()
        
        wandb.log({
                "Agent info/critic loss": value_loss,
                "Agent info/policy entropy": entropy,
                "Agent info/actor loss": loss
            })


    # ====== Action choice related functions ===========================

    def choose_action(self, state): # TODO fix first if
        if self.is_continuous:  
            action_chosen = self.policy_estimator(state).detach().numpy()
            return action_chosen
        else:
            action_probs = Categorical(self.policy_estimator(state))
            action_chosen = action_probs.sample()
            return action_chosen.item()
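        # e.g. Categorical(torch.tensor([0.2, 0.8])).sample() yields tensor(1)
        # about 80% of the time; .item() converts it to a plain Python int.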

    # ====== Agent core functions ======================================

    def start(self, state):
        # choosing the action to take
        current_action = self.choose_action(state)

        self.previous_action = current_action
        self.previous_state = state

        return current_action

    def step(self, state, reward):

        # storing the transition in the function approximator memory for further use
        self.store_transition(self.previous_state, self.previous_action, reward, state, False)

        # choosing the action to take
        current_action = self.choose_action(state)

        #self.control(state, reward)
        self.vanilla_control(state, reward, False)

        self.previous_action = current_action
        self.previous_state = state

        return current_action

    def end(self, state, reward):
        # storing the transition in the function approximator memory for further use
        self.store_transition(self.previous_state, self.previous_action, reward, state, True)
        #self.control(state, reward)
        self.vanilla_control(state, reward, True)

    def get_state_value_eval(self, state):
        if self.num_actions > 1:
            state_value = self.policy_estimator(state).data
        else: 
            state_value = self.function_approximator_eval(state).data
        return state_value
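
A standalone numeric sketch (not part of the agent) of how `control` zeroes the bootstrapped value of terminal next states with `torch.masked_fill` before forming the TD error δ = r + γ·V(s') − V(s); the numbers below are made up for illustration.

import torch

v_next = torch.tensor([[0.5], [1.2], [0.3]])     # V_target(s') for a batch of 3
dones = torch.tensor([[False], [True], [False]])
rewards = torch.tensor([[1.0], [1.0], [1.0]])
γ = 0.9

v_next = torch.masked_fill(v_next, dones, 0.0)   # -> [[0.5], [0.0], [0.3]]
td_target = rewards + γ * v_next                 # -> [[1.45], [1.0], [1.27]]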
Example #10
    def init_critic(self, params):
        self.critic = CustomNeuralNetwork(**params)
        self.critic_target = deepcopy(self.critic)
Example #11
    def init_actor(self, params):
        self.actor = CustomNeuralNetwork(**params)
        self.actor_target = deepcopy(self.actor)
Example #12
class DDPGAgent:
    def __init__(self,
                 policy_estimator_info: Dict[str, Any],
                 function_approximator_info: Dict[str, Any],
                 memory_info: Dict[str, Any],
                 seed: Optional[int] = 0,
                 num_actions: Optional[int] = 1,
                 state_dim: Optional[int] = 1,
                 update_target_rate: Optional[int] = 50,
                 discount_factor: Optional[float] = 0.995,
                 target_policy_noise: Optional[float] = 0.2,
                 target_noise_clip: Optional[float] = 0.5):
        self.num_actions = num_actions
        self.state_dim = state_dim

        self.seed = self.init_seed(seed)
        self.logger = None
        # neural network parameters
        self.actor = None
        self.actor_target = None
        self.critic = None
        self.critic_target = None
        self.update_target_rate = update_target_rate
        self.update_target_counter = 0
        self.loss_func = torch.nn.MSELoss()

        self.γ = discount_factor
        self.replay_buffer = self.init_memory_buffer(memory_info)
        self.target_policy_noise = target_policy_noise
        self.target_noise_clip = target_noise_clip
        self.tot_timestep = 0
        self.init_actor(policy_estimator_info)
        self.init_critic(function_approximator_info)

        self.previous_action = None
        self.previous_obs = None

    # ====== Initialization functions ==================================

    def init_actor(self, params):
        self.actor = CustomNeuralNetwork(**params)
        self.actor_target = deepcopy(self.actor)

    def init_critic(self, params):
        self.critic = CustomNeuralNetwork(**params)
        self.critic_target = deepcopy(self.critic)

    def init_memory_buffer(self, params) -> ReplayBuffer:
        params["obs_dim"] = self.state_dim
        params["action_dim"] = self.num_actions
        return ReplayBuffer(**params)

    def init_seed(self, seed):
        if seed:
            set_random_seed(seed)
            return seed

    def set_seed(self, seed):
        if seed:
            self.seed = seed
            set_random_seed(self.seed)
            # this agent has no self.function_approximator; seed the actor
            # and critic networks instead.
            self.actor.set_seed(seed)
            self.critic.set_seed(seed)

    def set_logger(self, logger: Type[Logger]):
        self.logger = logger
        self.logger.wandb_watch([self.actor, self.critic])

    def get_discount(self):
        return self.γ

    # ====== Action choice related functions ===========================

    def choose_action(self, obs: torch.Tensor):
        action = self.actor(obs)
        noise = np.random.normal(0, self.target_noise_clip)
        action += noise
        action = torch.clamp(action, -1, 1)
        return action

    # ====== Agent core functions ======================================

    def start(self, obs):
        current_action = self.choose_action(obs)
        self.previous_action = current_action
        self.previous_obs = obs

        return current_action

    def step(self, obs, reward):
        # storing the transition in the function approximator memory for further use
        self.replay_buffer.store_transition(self.previous_obs,
                                            self.previous_action, reward, obs,
                                            False)
        # choosing the action to take
        current_action = self.choose_action(obs)
        self.control()
        self.previous_action = current_action
        self.previous_obs = obs

        return current_action

    def end(self, obs, reward):
        self.replay_buffer.store_transition(self.previous_obs,
                                            self.previous_action, reward, obs,
                                            True)
        self.control()

    # === functional functions =========================================

    def get_action_value(self, state, action=None):
        # Compute the action value from the critic, with a placeholder for
        # exploration noise. The original clamped to self.min_action /
        # self.max_action, which are never defined on this agent; clamp to
        # [-1, 1] instead, as choose_action does.
        noise = 0  # normal distribution, for exploration
        action_value = self.critic(state) + noise
        action_value = torch.clamp(action_value, -1, 1)
        return action_value

    # === parameters update functions ==================================

    def _update_target_net(self):
        # every n learning cycles, the target network is replaced
        # with the eval network
        self.update_target_counter += 1
        if self.update_target_counter == self.update_target_rate:
            self.critic_target.load_state_dict(self.critic.state_dict())
            self.actor_target.load_state_dict(self.actor.state_dict())
            self.update_target_counter = 0

    # ====== Control related functions =================================

    def control(self):
        self._learn()

    def _learn(self):
        if self.replay_buffer.full:
            self._update_target_net()
            # getting batch data
            batch = self.replay_buffer.sample()
            # compute critic target
            target_actions = self.actor_target(
                batch.next_observations).detach()
            batch_oa = self._concat_obs_action(batch.next_observations,
                                               target_actions)
            q_next = self.critic_target(batch_oa).detach()
            q_next = (1.0 - batch.dones.float()) * q_next
            y = batch.rewards + self.γ * q_next
            batch_oa_eval = self._concat_obs_action(batch.observations,
                                                    batch.actions)
            # compute critic eval
            q_eval = self.critic(batch_oa_eval)
            # learn critic
            critic_loss = self.loss_func(q_eval, y)

            self.critic.backpropagate(critic_loss)

            actor_eval = self.actor(batch.observations)
            #with torch.no_grad():
            test_oa = self._concat_obs_action(batch.observations, actor_eval)
            actor_loss = self.critic(test_oa)
            actor_loss = -actor_loss.mean()
            self.logger.wandb_log({
                "Agent info/critic loss": critic_loss,
                "Agent info/actor loss": actor_loss
            })

            self.actor.backpropagate(actor_loss)

    def _concat_obs_action(self, obs: torch.Tensor,
                           action: torch.Tensor) -> torch.Tensor:
        obs_action = torch.cat((obs, action), 1)  #.unsqueeze(1)),1)
        return obs_action

    def get_action_value_eval(self, state: torch.Tensor):
        """for plotting purposes only?
        """
        action = np.random.uniform(-1, 1, 1)
        action = torch.Tensor(action)
        state_action = torch.cat((state, action))
        action_value = self.critic(state_action).detach().data
        return action_value

    def get_action_values_eval(self, state: torch.Tensor,
                               actions: torch.Tensor):
        """ for plotting purposes only?
        """
        #state = torch.cat((state, state)).unsqueeze(1)
        state = (state.unsqueeze(1) * torch.ones(len(actions))).T
        state_action = torch.cat((state, actions.unsqueeze(1)), 1)
        action_values = self.critic(state_action).data
        return action_values

    def _zero_terminal_states(self, q_values: torch.Tensor,
                              dones: torch.Tensor) -> torch.Tensor:
        """ Zeroes the q values at terminal states
        """
        return torch.masked_fill(q_values, dones, 0.0)

    def _create_noise_tensor(self, tensor):
        # create the noise tensor filled with a normal distribution
        noise = tensor.clone().data.normal_(0, self.target_policy_noise)
        # clip the normal distribution
        noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
        return noise

    def adjust_dims(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.num_actions = action_dim
        self.actor.reinit_layers(state_dim, action_dim)
        self.actor_target.reinit_layers(state_dim, action_dim)
        self.critic.reinit_layers(state_dim + action_dim, 1)
        self.critic_target.reinit_layers(state_dim + action_dim, 1)
        self.replay_buffer.correct(state_dim, action_dim)
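
A standalone numeric sketch (not part of the agent) of the critic target built in `_learn`, y = r + γ·(1 − done)·Q_target(s', μ_target(s')), with plain tensors standing in for the target-network outputs:

# Hypothetical numbers; q_next stands in for the detached target-critic output.
import torch

γ = 0.995
rewards = torch.tensor([[1.0], [0.0], [1.0]])
dones = torch.tensor([[False], [True], [False]])
q_next = torch.tensor([[2.0], [3.0], [0.5]])

q_next = (1.0 - dones.float()) * q_next   # zero the bootstrap at terminal states
y = rewards + γ * q_next                  # -> [[2.99], [0.0], [1.4975]]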