Exemplo n.º 1
0
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self,state_size, action_size,single_rotor_control=False, buffer_size=int(1e5), batch_size=128, 
                    gamma=0.98, tau=1e-3, lr_actor=1e-4,lr_critic=1e-3, random_seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # rotor control mode
        self.single_rotor_control = single_rotor_control

        if self.single_rotor_control:
            action_size = 1

        # define hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(buffer_size, random_seed)

        # counter for time steps
        self.time_step = 0

        #self.soft_update(self.critic_local, self.critic_target, 1.0)
        #self.soft_update(self.actor_local, self.actor_target, 1.0)
    
    def step(self,state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        
        # Save experience / reward
        if self.single_rotor_control:
            self.memory.add(state, action[0], reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences, self.gamma)

        # increase time step count
        self.time_step+=1

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states.unsqueeze(0)).cpu().data.numpy()[0]
        self.actor_local.train()
        if add_noise:
            #noise = np.repeat(,self.action_size)
            actions += self.noise.sample()

        if self.single_rotor_control:
            actions = np.repeat(actions,self.action_size)

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        
        states, actions, rewards, next_states, dones, idxs, is_weights = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Q value predictions
        Q_expected = self.critic_local(states, actions)

        # compute td error
        td_error = Q_targets - Q_expected
        # update td error in Replay buffer
        self.memory.update_priorities(idxs,td_error.detach().cpu().numpy().squeeze())

        # compute critic loss
        critic_loss = ((is_weights*td_error)**2).mean()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -(is_weights * self.critic_local(states, actions_pred).squeeze()).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Exemplo n.º 2
0
class MADDPG():
    """
    Class definition of MADDPG agent. Interacts with and learns from the environment
    Comprises of a pair of Actor-Critic network ad implements centralized training and decentralized exeution (learn function)
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 seed=23520,
                 GRADIENT_CLIP=1,
                 ACTIVATION=F.relu,
                 BOOTSTRAP_SIZE=5,
                 GAMMA=0.99,
                 TAU=1e-3,
                 LR_CRITIC=5e-4,
                 LR_ACTOR=5e-4,
                 UPDATE_EVERY=1,
                 TRANSFER_EVERY=2,
                 UPDATE_LOOP=10,
                 ADD_NOISE_EVERY=5,
                 WEIGHT_DECAY=0,
                 MEMORY_SIZE=5e4,
                 BATCH_SIZE=64):
        """Initialize an Agent object.
        
        Params
        ======
            state_size  : dimension of each state
            action_size : dimension of each action
            num_agents  : number of running agents
            device: cpu or cuda:0 if available
            -----These are hyperparameters----
            BOOTSTRAP_SIZE      : How far ahead to bootstrap
            GAMMA               : Discount factor
            TAU                 : Parameter for performing soft updates of target parameters
            LR_CRITIC, LR_ACTOR : Learning rate of the networks
            UPDATE_EVERY        : How often to update the networks
            TRANSFER_EVERY      : How often to transfer the weights from local to target
            UPDATE_LOOP         : Number of iterations for network update
            ADD_NOISE_EVERY     : How often to add noise to favor exploration
            WEIGHT_DECAY        : L2 weight decay for critic optimizer
            GRADIENT_CLIP       : Limit of gradient to be clipped, to avoid exploding gradient issue
        """

        # Actor networks
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR,
                                      weight_decay=WEIGHT_DECAY)
        hard_update(self.actor_local, self.actor_target)

        #critic networks
        self.critic_local = Critic(state_size * 2, action_size).to(device)
        self.critic_target = Critic(state_size * 2, action_size).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)
        hard_update(self.critic_local, self.critic_target)

        self.device = device
        self.num_agents = num_agents

        # Noise : using simple noise instead of OUNoise
        self.noise = [
            SimpleNoise(action_size, scale=1) for i in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, device, int(MEMORY_SIZE),
                                   BATCH_SIZE, seed)

        # Initialize time steps (for updating every UPDATE_EVERY steps)
        self.u_step = 0
        self.n_step = 0

        #keeping hyperparameters within the instance
        self.BOOTSTRAP_SIZE = BOOTSTRAP_SIZE
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LR_CRITIC = LR_CRITIC
        self.LR_ACTOR = LR_ACTOR
        self.UPDATE_EVERY = UPDATE_EVERY
        self.TRANSFER_EVERY = TRANSFER_EVERY
        self.UPDATE_LOOP = UPDATE_LOOP
        self.ADD_NOISE_EVERY = ADD_NOISE_EVERY
        self.GRADIENT_CLIP = GRADIENT_CLIP

        # initialize these variables to store the information of the n-previous timestep that are necessary to apply the bootstrap_size
        self.rewards = deque(maxlen=BOOTSTRAP_SIZE)
        self.states = deque(maxlen=BOOTSTRAP_SIZE)
        self.actions = deque(maxlen=BOOTSTRAP_SIZE)
        self.gammas = np.array([[GAMMA**i for j in range(num_agents)]
                                for i in range(BOOTSTRAP_SIZE)])

        self.loss_function = torch.nn.SmoothL1Loss()

    def reset(self):
        if self.noise:
            for n in self.noise:
                n.reset()

    def set_noise(self, noise):
        self.noise = noise

    def save(self, filename):
        torch.save(self.actor_local.state_dict(),
                   "{}_actor_local.pth".format(filename))
        torch.save(self.actor_target.state_dict(),
                   "{}_actor_target.pth".format(filename))
        torch.save(self.critic_local.state_dict(),
                   "{}_critic_local.pth".format(filename))
        torch.save(self.critic_target.state_dict(),
                   "{}_critic_target.pth".format(filename))

    def load(self, path):
        self.actor_local.load_state_dict(torch.load(path + "_actor_local.pth"))
        self.actor_target.load_state_dict(
            torch.load(path + "_actor_target.pth"))
        self.critic_local.load_state_dict(
            torch.load(path + "_critic_local.pth"))
        self.critic_target.load_state_dict(
            torch.load(path + "_critic_target.pth"))

    def act(self, states, noise=0.0):
        """
        Returns actions of each actor for given states.
        
        Params
        ======
            state    : current states
            add_noise: Introduce some noise in agent's action or not. During training, this is necessary to promote the exploration but should not be used during validation
        """
        actions = None

        self.n_step = (self.n_step + 1) % self.ADD_NOISE_EVERY

        with torch.no_grad():
            self.actor_local.eval()
            states = torch.from_numpy(states).float().unsqueeze(0).to(
                self.device)
            actions = self.actor_local(states).squeeze().cpu().data.numpy()
            self.actor_local.train()
            if self.n_step == 0:
                for i in range(len(actions)):
                    actions[i] += noise * self.noise[i].sample()
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        """
        Take a step for the current episode
        1. Save the experience
        2. Bootstrap the rewards
        3. If update conditions are statisfied, perform learning on required number of loops
        """

        # Save experience in replay memory
        self.rewards.append(rewards)
        self.states.append(states)
        self.actions.append(actions)

        if len(self.rewards) == self.BOOTSTRAP_SIZE:
            # get the bootstrapped sum of rewards per agents
            reward = np.sum(self.rewards * self.gammas, axis=-2)
            self.memory.add(self.states[0], self.actions[0], reward,
                            next_states, dones)

        if np.any(dones):
            self.rewards.clear()
            self.states.clear()
            self.actions.clear()

        # Learn every UPDATE_EVERY timesteps
        self.u_step = (self.u_step + 1) % self.UPDATE_EVERY

        t_step = 0
        if len(self.memory) > self.memory.batch_size and self.u_step == 0:
            for _ in range(self.UPDATE_LOOP):
                self.learn()
                # transfer the weights as specified
                t_step = (t_step + 1) % self.TRANSFER_EVERY
                if t_step == 0:
                    soft_update(self.actor_local, self.actor_target, self.TAU)
                    soft_update(self.critic_local, self.critic_target,
                                self.TAU)

    def transform_states(self, states):
        """
        Transforms states to full states so that both agents can see each others state via the critic network
        """
        batch_size = states.shape[0]
        state_size = states.shape[-1]
        num_agents = states.shape[-2]
        transformed_states = torch.zeros(
            (batch_size, num_agents, state_size * num_agents)).to(self.device)
        for i in range(num_agents):
            start = 0
            for j in range(num_agents):
                transformed_states[:, i, start:start + state_size] += states[:,
                                                                             j]
                start += state_size
        return transformed_states

    def learn(self):
        """
        Update the network parameters using the experiences. The algorithm is described in detail in readme

        Params
        ======
            experiences : List of (s, a, r, s', done) tuples
        """
        # sample the memory to disrupt the internal correlation
        states, actions, rewards, next_states, dones = self.memory.sample()
        full_states = self.transform_states(states)

        # The critic should estimate the value of the states to be equal to rewards plus
        # the estimation of the next_states value according to the critic_target and actor_target
        with torch.no_grad():
            self.actor_target.eval()
            self.critic_target.eval()
            # obtain next actions as given by the target network and get transformed states for the critic
            next_actions = self.actor_target(next_states)
            next_full_states = self.transform_states(next_states)
            # calculate Q value using transformed next states and next actions, basically predict what the next value is from target's perspective
            q_next = self.critic_target(next_full_states,
                                        next_actions).squeeze(-1)
            # calculate the target's value
            targeted_value = rewards + (
                self.GAMMA**self.BOOTSTRAP_SIZE) * q_next * (1 - dones)

        current_value = self.critic_local(full_states, actions).squeeze(-1)
        loss = self.loss_function(current_value, targeted_value)
        # During the optimization, the critic tells how much the value is off from the action value and adjusts the network towards. Basically, the critic takes the actions predicted by the actor, and tells how good or bad they are by calculating its Q-value

        # calculate the loss of the critic network and backpropagate
        self.critic_optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       self.GRADIENT_CLIP)
        self.critic_optim.step()

        # optimize the actor by having the critic evaluating the value of the actor's decision
        self.actor_optim.zero_grad()
        actions_pred = self.actor_local(states)
        mean = self.critic_local(full_states, actions_pred).mean()
        (-mean).backward()
        self.actor_optim.step()