Example No. 1
class DDPG_Agent:
    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int):   Dimension of each state
            action_size (int):  Dimension of each action
            seed (int):         Random seed
            index (int):        Index assigned to the agent
            num_agents (int):   Number of agents in the environment
        """

        self.state_size = state_size  # State size
        self.action_size = action_size  # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index  # Index of this agent within the environment
        self.tau = TAU  # Interpolation parameter for soft target updates
        self.num_updates = N_UPDATES  # Number of learning passes per update step
        self.num_agents = num_agents  # Number of agents in the environment
        self.tstep = 0  # Step counter, wraps modulo UPDATE_EVERY
        self.gamma = GAMMA  # Discount factor for rewards
        self.alpha = ALPHA  # PER: how strongly priorities skew sampling (0..1)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)

    # act() is similar to the course exercises and the MADDPG Lab
    def act(self, states, noise=1.0):
        """Returns actions for given state as per current policy.
    
        Params
        ======
            state [n_agents, state_size]: current state
            noise (float):    control whether or not noise is added
        """
        # Convert the NumPy state array to a torch tensor on the chosen device
        states = torch.from_numpy(states).float().to(device)

        # Put model into evaluation mode
        self.actor_local.eval()

        # Get actions for the current states from the local (deterministic) actor
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        # Put actor back into training mode
        self.actor_local.train()

        # Ornstein-Uhlenbeck noise addition
        actions += noise * self.noise.sample()

        # Clip to the valid action range [-1, 1]
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, use random samples from buffer to learn.
        
        Params
        ======
            states:      [n_agents, state_size]  current states
            actions:     [n_agents, action_size] actions taken
            rewards:     [n_agents]              rewards earned
            next_states: [n_agents, state_size]  next states
            dones:       [n_agents]              whether the episode has finished
            beta:        [0..1]                  PER: strength of the importance-sampling correction (0 - no correction, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save the experience in replay memory. Prioritization needs a bit more
        # work: we must compute an initial priority for the experience tuple,
        # which in our case is (Q_expected - Q_targets)**2.
        # ------------------------------------------------------------------
        # Set all networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()

        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)
        reward = torch.from_numpy(np.asarray(rewards)).float().to(device)
        done = torch.from_numpy(np.asarray(dones, dtype=np.uint8)).float().to(device)

        with torch.no_grad():
            next_actions = self.actor_target(state)
            own_action = action[:, self.index *
                                self.action_size:(self.index + 1) *
                                self.action_size]
            if self.index:
                # Agent 1: predicted next action fills this agent's (second) slot
                next_actions_agent = torch.cat((own_action, next_actions),
                                               dim=1)
            else:
                # Agent 0: flipped order, predicted next action fills the first slot
                next_actions_agent = torch.cat((next_actions, own_action),
                                               dim=1)

            # Predicted Q value from the critic target network
            Q_targets_next = self.critic_target(next_state,
                                                next_actions_agent).float()
            Q_targets = reward + self.gamma * Q_targets_next * (1 - done)
            Q_expected = self.critic_local(state, action)

        # Use error between Q_expected and Q_targets as priority in buffer
        error = (Q_expected - Q_targets)**2
        self.memory.add(state, action, rewards, next_state, dones, error)

        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # -----------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # Every UPDATE_EVERY steps, if enough samples are available in memory, sample random subsets and learn
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise parameter of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples. 
        Update according to 
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        
        According to the lessons: 
            actor_target  (state)           gives   action
            critic_target (state, action)   gives   Q-value

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of 
                    states          states visited
                    actions         actions taken by all agents
                    rewards         rewards received
                    next states     all next states
                    dones           whether or not a final state is reached 
                    weights         weights of the experiences
                    indices         indices of the experiences            
        """

        # Load experiences from sample
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #

        # Get next actions via actor network
        next_actions = self.actor_target(next_states)

        # Combine the predicted next actions with this agent's own replayed action
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1: predicted next action fills this agent's (second) slot
            next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
        else:
            # Agent 0: flipped order, predicted next action fills the first slot
            next_actions_agent = torch.cat((next_actions, own_actions), dim=1)

        # Predicted Q value from Critic target network
        Q_targets_next = self.critic_target(next_states, next_actions_agent)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)

        # Update priorities in ReplayBuffer
        loss = (Q_expected - Q_targets).pow(2).reshape(
            weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.data.cpu().numpy())

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)

        # Combine the locally predicted actions with this agent's own replayed action
        own_actions = actions[:,
                              self.index * self.action_size:(self.index + 1) *
                              self.action_size]
        if self.index:
            # Agent 1: predicted action fills this agent's (second) slot
            actions_expected_agent = torch.cat((own_actions, actions_expected),
                                               dim=1)
        else:
            # Agent 0: flipped order, predicted action fills the first slot
            actions_expected_agent = torch.cat((actions_expected, own_actions),
                                               dim=1)

        # Compute actor loss based on expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft update model parameters for actor and critic of all MADDPG agents.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
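        # Worked example (illustrative numbers, not from the repo): with
        # tau = 1e-3, a target weight of 0.50 and a local weight of 0.80
        # become 0.001 * 0.80 + 0.999 * 0.50 = 0.5003 after one soft update,
        # so the target network trails the local network slowly and smoothly.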

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """

        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict': self.critic_local.state_dict()
        }

        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's actor and critic networks.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from. 
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}"
            )
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}"
            )
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
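
The agent above instantiates OUNoise((1, action_size), seed), but the noise class itself is not part of this example. Below is a minimal sketch of a compatible Ornstein-Uhlenbeck noise process; the mu, theta, and sigma defaults are common choices and assumptions here, not values from the original repository.

import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise (sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        # Reset the internal state back to the long-run mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state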
Example No. 2
class DQNAgent():
    def __init__(self, state_size, action_size):
        # set render to True to watch CartPole learning
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.memory_size = 20000
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 5000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step
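        # With these defaults, epsilon decays linearly from 1.0 to 0.01 over
        # explore_step steps: (1.0 - 0.01) / 5000 = 0.000198 per training step.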
        self.batch_size = 64
        self.train_start = 1000

        # create prioritized replay memory using SumTree
        self.memory = PrioritizedReplayBuffer(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/cartpole_dqn')

    # Xavier weight initialization for the linear layers
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # After some time interval, update the target model to match the main model
    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.from_numpy(state)
            state = Variable(state).float().cpu()
            q_value = self.model(state)
            _, action = torch.max(q_value, 1)
            return int(action)

    # save sample (error,<s,a,r,s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        target = self.model(Variable(torch.FloatTensor(state))).data
        old_val = target[0][action]
        target_val = self.target_model(Variable(
            torch.FloatTensor(next_state))).data
        if done:
            target[0][action] = reward
        else:
            target[0][
                action] = reward + self.discount_factor * torch.max(target_val)

        error = abs(old_val - target[0][action])

        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        mini_batch = np.array(mini_batch, dtype=object).transpose()

        states = np.vstack(mini_batch[0])
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.vstack(mini_batch[3])
        dones = mini_batch[4]

        # bool to binary
        dones = dones.astype(int)

        # Q function of current state
        states = torch.Tensor(states)
        states = Variable(states).float()
        pred = self.model(states)

        # one-hot encoding
        a = torch.LongTensor(actions).view(-1, 1)

        one_hot_action = torch.FloatTensor(self.batch_size,
                                           self.action_size).zero_()
        one_hot_action.scatter_(1, a, 1)

        pred = torch.sum(pred.mul(Variable(one_hot_action)), dim=1)

        # Q function of next state
        next_states = torch.Tensor(next_states)
        next_states = Variable(next_states).float()
        next_pred = self.target_model(next_states).data

        rewards = torch.FloatTensor(rewards)
        dones = torch.FloatTensor(dones)

        # Q Learning: get maximum Q value at s' from target model
        target = rewards + (1 -
                            dones) * self.discount_factor * next_pred.max(1)[0]
        target = Variable(target)

        errors = torch.abs(pred - target).data.numpy()

        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()

        # Weighted MSE loss: per-sample errors scaled by the importance-sampling weights
        loss = (torch.FloatTensor(is_weights) *
                F.mse_loss(pred, target, reduction='none')).mean()
        loss.backward()

        # and train
        self.optimizer.step()
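
The PrioritizedReplayBuffer used by DQNAgent (backed by a SumTree) is not shown in this example. The sketch below is a simplified, array-based stand-in with the same interface: add(error, sample), sample(n) returning (mini_batch, idxs, is_weights), and update(idx, error). The alpha, beta, and eps defaults are assumptions rather than values from the original code.

import numpy as np


class SimplePrioritizedReplayBuffer:
    """Proportional prioritized replay without a SumTree (illustrative sketch)."""

    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=0.01):
        self.capacity = capacity
        self.alpha = alpha      # how strongly priorities skew sampling
        self.beta = beta        # strength of the importance-sampling correction
        self.eps = eps          # keeps every priority strictly positive
        self.data = []
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0

    def __len__(self):
        return len(self.data)

    def add(self, error, sample):
        # New samples get a priority proportional to their TD error
        priority = (abs(float(error)) + self.eps) ** self.alpha
        if len(self.data) < self.capacity:
            self.data.append(sample)
        else:
            self.data[self.pos] = sample
        self.priorities[self.pos] = priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, n):
        prios = self.priorities[:len(self.data)]
        probs = prios / prios.sum()
        idxs = np.random.choice(len(self.data), n, p=probs)
        mini_batch = [self.data[i] for i in idxs]
        # Importance-sampling weights, normalized by their maximum
        is_weights = (len(self.data) * probs[idxs]) ** (-self.beta)
        is_weights /= is_weights.max()
        return mini_batch, idxs, is_weights

    def update(self, idx, error):
        # Refresh the priority of a replayed sample after training on it
        self.priorities[idx] = (abs(float(error)) + self.eps) ** self.alpha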
class Prioritized(DQN):
    def __init__(self,
                 env,
                 model,
                 target_model,
                 config,
                 name_agent="prioritized-dqn"):
        self.name_agent = name_agent

        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n

        self.epsilon = config.epsilon_start
        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay

        self.gamma = config.gamma
        self.update_nb_iter = config.update_nb_iter

        # changing the buffer (taking a prioritized buffer
        # instead of a uniform-probability buffer)
        self.replay_buffer = PrioritizedReplayBuffer(10000, config.batch_size,
                                                     config.w,
                                                     config.beta_final,
                                                     config.beta_start,
                                                     config.beta_decay)
        self.environment = env
        self.batch_size = config.batch_size

        self.model = model
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)

        self.loss_data = []
        self.rewards = []

    def loss(self):
        """ 
            the loss is equal to:
                    Rt+1+γt+1qθ(St+1,argmax qθ(St+1,a′))−qθ(St,At))^2
        """
        states, actions, rewards, next_states, finish, indices, weight = self.replay_buffer.sample()
        actions = actions.long()

        # qθ(St,At)
        q0 = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Double-DQN target: select a' with the online network, evaluate it
        # with the target network q_theta_bar, and mask out terminal states
        with torch.no_grad():
            a_next = self.model(next_states).max(1)[1].unsqueeze(1)
            max_next_q0 = self.target_model(next_states).gather(1, a_next).squeeze(1) * (1 - finish)

        Rt_gamma_max = (rewards + self.gamma * max_next_q0)

        loss = (q0 - Rt_gamma_max).pow(2) * weight

        # update the priority of the buffer
        self.replay_buffer.add_p(indices, loss.detach().numpy())

        loss = loss.sum()

        return loss
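
The loss() method only builds the weighted TD loss; a minimal sketch of how it could be used for one optimization step is shown below. The function name optimization_step and the use of update_nb_iter for target-network synchronization are assumptions about how the surrounding DQN base class drives training, not code from the original repository.

def optimization_step(agent, step):
    """One gradient update for a Prioritized agent (illustrative sketch)."""
    loss = agent.loss()          # weighted TD loss; priorities were refreshed inside loss()
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()
    agent.loss_data.append(loss.item())

    # Periodically copy the online weights into the target network
    if step % agent.update_nb_iter == 0:
        agent.target_model.load_state_dict(agent.model.state_dict())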