    def __init__(self, state_size, action_size, n_agents, buffer, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of agents
            buffer (obj): replay buffer to sample experiences from
            random_seed (int): random seed
        """
        # Set given state and action sizes
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor network with local and target copies for soft updates
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network with local and target copies for soft updates
        self.critic_local = Critic(state_size, action_size, n_agents, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, n_agents, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process to boost exploration and hence learning of the network
        self.noise = OUNoise(action_size, random_seed)
        
        self.memory = buffer
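
This constructor relies on an `OUNoise(action_size, random_seed)` helper that is not shown anywhere on this page (later examples also pass `scale`, `mu`, `theta`, or `sigma`). Below is a minimal sketch of the Ornstein-Uhlenbeck process such a helper usually implements; the default mu/theta/sigma values are assumptions, not taken from the snippets.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; mu/theta/sigma defaults are assumptions)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the internal state by one OU step and return it as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
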
Example #2
    def __init__(self, num, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            num (int): number of this agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num = num
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size*2+action_size*2, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size*2+action_size*2, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        
        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale = 0.2)
Example #3
    def __init__(self, state_size, action_size, seed):
        """ Initialize an Agent object
        INPUT:
        state_size (int): dim of each state
        action_size (int): dim of each action
        seed (int): random seed
        
        """
        super(MADDPG_Agent, self).__init__()
        
        self.state_size = state_size
        self.action_size = action_size
        # seed the Python RNG (torch.manual_seed(seed) was replaced with random.seed(seed))
        random.seed(seed)
        
        # initialise local network and target network for Actor 
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr = lr_actor)
        
        # initialize local network and target network for Critic 
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr = lr_critic, weight_decay = weight_decay)

        
        # initialize the Ornstein-Uhlenbeck noise process
        self.noise = OUNoise((n_agents, action_size), seed)        
        
        
        # initialize Shared Replay Buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        
        # initialize time step to keep track of update
        self.t_step = 0
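
This example samples from a shared `ReplayBuffer(action_size, buffer_size, batch_size, seed)` whose definition is not shown. Below is a minimal sketch compatible with the `add`, `sample`, and `len` calls used here; the tensor conversion details and the omission of device placement are assumptions.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples (sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return them as float tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)
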
Example #4
    def __init__(self, state_size, action_size, memory, device='cpu', params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string, either 'cuda:0' or 'cpu'
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        if not MADDPGAgent.critic_local:
            MADDPGAgent.critic_local = Critic(state_size, action_size, params['seed'],
                                              params['critic_units'][0], params['critic_units'][1]).to(device)
        if not MADDPGAgent.critic_target:
            MADDPGAgent.critic_target = Critic(state_size, action_size, params['seed'],
                                               params['critic_units'][0], params['critic_units'][1]).to(device)
        if not MADDPGAgent.critic_optimizer:
            MADDPGAgent.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                                      lr=params['lr_critic'], weight_decay=params['weight_decay'])

        self.critic_local = MADDPGAgent.critic_local
        self.critic_target = MADDPGAgent.critic_target
        self.critic_optimizer = MADDPGAgent.critic_optimizer

        # Noise process
        self.noise = OUNoise(action_size, params['seed'], theta=params['noise_theta'], sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory
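
Because this constructor caches the critic, its target, and its optimizer on the MADDPGAgent class rather than on the instance, every agent built this way shares one critic. Below is a hedged usage sketch: the hyperparameter values and the state/action sizes are illustrative assumptions, a None stand-in replaces the real replay buffer, and the Actor, Critic, and OUNoise classes referenced by the constructor (plus class attributes initialized to None, as in the fuller listing later on this page) are assumed to be available.

params = {
    'seed': 0, 'gamma': 0.99, 'tau': 1e-3,
    'lr_actor': 1e-4, 'lr_critic': 1e-3, 'weight_decay': 0.0,
    'actor_units': (256, 128), 'critic_units': (256, 128),
    'noise_theta': 0.15, 'noise_sigma': 0.2, 'update_every': 1,
}
shared_memory = None  # stand-in; any buffer exposing add/sample/get_batch_size/__len__ would do
agents = [MADDPGAgent(state_size=24, action_size=2, memory=shared_memory,
                      device='cpu', params=params)
          for _ in range(2)]

# Both instances reference the same class-level critic and optimizer, so a critic
# update made through one agent is immediately visible to the other.
assert agents[0].critic_local is agents[1].critic_local
assert agents[0].critic_optimizer is agents[1].critic_optimizer
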
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    memory = None

    def __init__(self, num, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            num (int): number of this agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num = num
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size*2+action_size*2, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size*2+action_size*2, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        
        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale = 0.2)


    def act(self, state, add_noise=True, noise_amplitude=0.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * noise_amplitude
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
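
The `act` method in Example #5 scales its OU noise by a `noise_amplitude` argument, which is typically annealed as training progresses. Below is a hedged sketch of one way a training loop might drive that argument; the decay schedule, the `agents` list, and the `env` interface are all assumptions.

# Illustrative exploration schedule (values and the env/agents objects are assumptions).
noise_amplitude = 1.0
noise_decay = 0.999

for episode in range(2000):
    states = env.reset()            # hypothetical environment returning one state per agent
    for agent in agents:
        agent.reset()               # reset each agent's OU noise state
    dones = [False] * len(agents)
    while not any(dones):
        actions = [agent.act(state, add_noise=True, noise_amplitude=noise_amplitude)
                   for agent, state in zip(agents, states)]
        states, rewards, dones = env.step(actions)   # hypothetical environment API
    noise_amplitude *= noise_decay  # shrink exploration noise after every episode
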
Example #6
class MADDPG_Agent():
    def __init__(self, state_size, action_size, seed):
        """ Initialize an Agent object
        INPUT:
        state_size (int): dim of each state
        action_size (int): dim of each action
        seed (int): random seed
        
        """
        super(MADDPG_Agent, self).__init__()
        
        self.state_size = state_size
        self.action_size = action_size
        # seed the Python RNG (torch.manual_seed(seed) was replaced with random.seed(seed))
        random.seed(seed)
        
        # initialise local network and target network for Actor 
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr = lr_actor)
        
        # initialize local network and target network for Critic 
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr = lr_critic, weight_decay = weight_decay)

        
        # initialize the Ornstein-Uhlenbeck noise process
        self.noise = OUNoise((n_agents, action_size), seed)        
        
        
        # initialize Shared Replay Buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        
        # initialize time step to keep track of update
        self.t_step = 0
        
    def hard_update(self, local_model, target_model):
        """ copy weights from source to target network (part of initialization)"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)
            
            
    def step(self, states, actions, rewards, next_states, dones):
        """Add each agent's experience tuple to the shared replay buffer and learn every update_every steps."""
        for i in range(n_agents):
            self.memory.add(states[i, :], actions[i, :], rewards[i], next_states[i, :], dones[i])
            
            
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # if enough samples are there then learn
            if len(self.memory) > batch_size:
                # update the networks update_freq times at each update step
                for i in range(update_freq):
                    experiences = self.memory.sample()
                    self.learn(experiences, gamma)
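
The `step` method above indexes `states[i, :]` and `actions[i, :]`, so it assumes the per-timestep data arrives stacked with one row per agent. A small shape sketch under that assumption (the sizes are illustrative):

import numpy as np

n_agents, state_size, action_size = 2, 24, 2      # illustrative sizes
states      = np.zeros((n_agents, state_size))    # one observation row per agent
actions     = np.zeros((n_agents, action_size))   # one action row per agent
rewards     = [0.0] * n_agents                    # one scalar reward per agent
next_states = np.zeros((n_agents, state_size))
dones       = [False] * n_agents
# agent.step(states, actions, rewards, next_states, dones) would then push one
# (s, a, r, s', done) tuple per agent into the shared replay buffer.
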
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, n_agents, buffer, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of agents
            buffer (obj): replay buffer to sample experiences from
            random_seed (int): random seed
        """
        # Set given state and action sizes
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor network with local and target copies for soft updates
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network with local and target copies for soft updates
        self.critic_local = Critic(state_size, action_size, n_agents, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, n_agents, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process to boost exploration and hence learning of the network
        self.noise = OUNoise(action_size, random_seed)
        
        self.memory = buffer
        
            
    def step(self):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        if len(self.memory) > BATCH_SIZE: # Do only if batch is full 
            experiences = self.memory.sample() # draw sample
            self.learn(experiences, GAMMA)        

    def act(self, state, add_noise=1.0):
        """Returns actions for the given state as per the current policy, with scaled OU noise added for exploration."""
        state = torch.from_numpy(state).float().to(device)  # convert numpy state to torch tensor
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()  # forward pass for the deterministic action
        self.actor_local.train()
        action += self.noise.sample() * add_noise  # add_noise scales the noise magnitude
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        l_states, l_actions, rewards, l_next_states, dones = experiences  
        t_states      = torch.cat(l_states, dim=1).to(device)
        t_actions     = torch.cat(l_actions, dim=1).to(device)
        t_next_states = torch.cat(l_next_states, dim=1).to(device)    
        
        
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        t_next_actions = torch.cat([self.actor_target(states) for states in l_states] , dim=1).to(device)        
        Q_targets_next = self.critic_target(t_next_states, t_next_actions)        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))        
        # Compute critic loss
        Q_expected = self.critic_local(t_states, t_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        # take the current states and predict actions        
        t_actions_pred = torch.cat([self.actor_local(states) for states in l_states] , dim=1).to(device)
        actor_loss = -self.critic_local(t_states, t_actions_pred).mean()        
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()        
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
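
The `learn` method above expects `self.memory.sample()` to return states, actions, and next states as lists of per-agent tensors, which it concatenates along dim=1 to form the centralized critic input. A small shape sketch under those assumptions (batch and layer sizes are illustrative):

import torch

batch_size, n_agents, state_size, action_size = 128, 2, 24, 2   # illustrative sizes

# What the shared buffer is assumed to return for one sampled batch:
l_states  = [torch.zeros(batch_size, state_size)  for _ in range(n_agents)]
l_actions = [torch.zeros(batch_size, action_size) for _ in range(n_agents)]

t_states  = torch.cat(l_states,  dim=1)    # (batch, n_agents * state_size)
t_actions = torch.cat(l_actions, dim=1)    # (batch, n_agents * action_size)
assert t_states.shape  == (batch_size, n_agents * state_size)
assert t_actions.shape == (batch_size, n_agents * action_size)
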
Example #8
class MADDPGAgent():
    """Interacts with and learns from the environment."""
    critic_local = None
    critic_target = None
    critic_optimizer = None
    
    def __init__(self, state_size, action_size, memory, device='cpu', params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string, either 'cuda:0' or 'cpu'
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        if not MADDPGAgent.critic_local:
            MADDPGAgent.critic_local = Critic(state_size, action_size, params['seed'],
                                              params['critic_units'][0], params['critic_units'][1]).to(device)
        if not MADDPGAgent.critic_target:
            MADDPGAgent.critic_target = Critic(state_size, action_size, params['seed'],
                                               params['critic_units'][0], params['critic_units'][1]).to(device)
        if not MADDPGAgent.critic_optimizer:
            MADDPGAgent.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                                      lr=params['lr_critic'], weight_decay=params['weight_decay'])

        self.critic_local = MADDPGAgent.critic_local
        self.critic_target = MADDPGAgent.critic_target
        self.critic_optimizer = MADDPGAgent.critic_optimizer

        # Noise process
        self.noise = OUNoise(action_size, params['seed'], theta=params['noise_theta'], sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_actor_weights(self, filename):
        """Store weights of Actor

        Params
        ======
            filename (str): string of filename to store weights of actor
        """
        torch.save(self.actor_local.state_dict(), filename)

    def store_critic_weights(self, filename):
        """Store weights of Critic

        Params
        ======
            filename (str): string of filename to store weights of critic
        """
        torch.save(self.critic_local.state_dict(), filename)

    def load_actor_weights(self, filename):
        """Load weights of Actor

        Params
        ======
            filename (str): string of filename to load weights of actor
        """
        self.actor_local.load_state_dict(torch.load(filename))

    def load_critic_weights(self, filename):
        """Load weights of Critic

        Params
        ======
            filename (str): string of filename to load weights of critic
        """
        self.critic_local.load_state_dict(torch.load(filename))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
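
Example #8 exposes explicit helpers for saving and restoring network weights. Below is a hedged usage sketch; the `agents` list and the checkpoint filenames are placeholders, and the critic is saved only once because it lives on the class and is shared by all instances.

# Hedged usage sketch: `agents` is a hypothetical list of MADDPGAgent instances.
for i, agent in enumerate(agents):
    agent.store_actor_weights('checkpoint_actor_{}.pth'.format(i))
agents[0].store_critic_weights('checkpoint_critic.pth')   # critic is shared, so save it once

# Later, restore into freshly constructed agents with the same architecture.
for i, agent in enumerate(agents):
    agent.load_actor_weights('checkpoint_actor_{}.pth'.format(i))
agents[0].load_critic_weights('checkpoint_critic.pth')
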
Example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, config):
        """Initialize an Agent object.

        Params
        ======
            config: configuration object exposing the following attributes
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
                buffer_size (int): replay buffer size
                batch_size (int): minibatch size
                gamma (float): discount factor
                tau (float): for soft update of target parameters
                lr_actor (float): learning rate of the actor
                lr_critic (float): learning rate of the critic
                weight_decay (float): L2 weight decay
                ou_mu (float): OUNoise mu
                ou_theta (float): OUNoise theta
                ou_sigma (float): OUNoise sigma
                update_every_t_steps (int): timesteps between updates
                num_of_updates (int): number of update passes per update step
        """
        self.config = config

        print(
            "[AGENT INFO] DDPG constructor initialized parameters:\n num_agents={} \n state_size={} \n action_size={} \n random_seed={} \n actor_fc1_units={} \n actor_fc2_units={} \n critic_fcs1_units={} \n critic_fc2_units={} \n buffer_size={} \n batch_size={} \n gamma={} \n tau={} \n lr_actor={} \n lr_critic={} \n weight_decay={} \n ou_mu={}\n ou_theta={}\n ou_sigma={}\n update_every_t_steps={}\n num_of_updates={}\n"
            .format(
                self.config.num_agents, self.config.state_size,
                self.config.action_size, self.config.random_seed,
                self.config.actor_fc1_units, self.config.actor_fc2_units,
                self.config.critic_fcs1_units, self.config.critic_fc2_units,
                self.config.buffer_size, self.config.batch_size,
                self.config.gamma, self.config.tau, self.config.lr_actor,
                self.config.lr_critic, self.config.weight_decay,
                self.config.ou_mu, self.config.ou_theta, self.config.ou_sigma,
                self.config.update_every_t_steps, self.config.num_of_updates))

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.config.state_size,
                                 self.config.action_size,
                                 self.config.random_seed,
                                 self.config.actor_fc1_units,
                                 self.config.actor_fc2_units).to(device)
        self.actor_target = Actor(self.config.state_size,
                                  self.config.action_size,
                                  self.config.random_seed,
                                  self.config.actor_fc1_units,
                                  self.config.actor_fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.config.state_size,
                                   self.config.action_size,
                                   self.config.random_seed,
                                   self.config.critic_fcs1_units,
                                   self.config.critic_fc2_units).to(device)
        self.critic_target = Critic(self.config.state_size,
                                    self.config.action_size,
                                    self.config.random_seed,
                                    self.config.critic_fcs1_units,
                                    self.config.critic_fc2_units).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.config.lr_critic,
            weight_decay=self.config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.config.action_size,
                             self.config.random_seed,
                             mu=self.config.ou_mu,
                             theta=self.config.ou_theta,
                             sigma=self.config.ou_sigma)

        # Replay memory
        self.memory = ReplayBuffer(self.config.action_size,
                                   self.config.buffer_size,
                                   self.config.batch_size,
                                   self.config.random_seed)

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones, agent_number,
             timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.config.batch_size and timestep % self.config.update_every_t_steps == 0:
            for _ in range(self.config.num_of_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma, agent_number)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.config.num_agents, self.config.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
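
The slices `actions[:, :2]` and `actions[:, 2:]` in the `learn` method above hard-code the assumption of exactly two agents whose individual actions (width 2 each) are concatenated into one joint action vector. A small sketch of that splice under the same assumption:

import torch

batch_size, action_size = 128, 2                           # two agents, action_size = 2 each
actions      = torch.zeros(batch_size, 2 * action_size)    # joint actions from the buffer
actions_next = torch.ones(batch_size, action_size)         # this agent's target-policy actions

# Agent 0 owns columns 0:2 of the joint action, agent 1 owns columns 2:4.
joint_for_agent0 = torch.cat((actions_next, actions[:, 2:]), dim=1)
joint_for_agent1 = torch.cat((actions[:, :2], actions_next), dim=1)
assert joint_for_agent0.shape == (batch_size, 2 * action_size)
assert joint_for_agent1.shape == (batch_size, 2 * action_size)
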