Example #1
class DDPG:
    def __init__(self, state_size, action_size, random_seed, hyperparams):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hyperparams = hyperparams

        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_noise = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=hyperparams.alpha_actor)

        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size,
                                          hyperparams.batch_size, random_seed)

        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )

    def step(self, state, action, reward, next_state, done):

        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) > self.hyperparams.batch_size:
            observations = self.replay_buffer.sample()
            self.update_params(observations)

    def select_action(self, state, train=True, nn_noise=False):
        state = torch.from_numpy(state).to(dtype=torch.float32, device=device)
        self.actor.eval()
        if nn_noise:
            action = self.actor_noise(state).cpu().data.numpy()
        else:
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if train:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_state()

    def update_params(self, observations):

        states, actions, rewards, next_states, dones = observations
        next_actions = self.actor_target(next_states)
        next_Q_values = self.critic_target(next_states, next_actions)
        Q_values = rewards + (self.hyperparams.gamma * next_Q_values *
                              (1 - dones))

        expected_Q = self.critic(states, actions)
        Q_values_loss = F.l1_loss(expected_Q, Q_values)
        self.critic_optim.zero_grad()
        Q_values_loss.backward()
        self.critic_optim.step()

        policy_loss = -self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        for qtarget_param, qlocal_param in zip(self.critic_target.parameters(),
                                               self.critic.parameters()):
            qtarget_param.data.copy_(self.hyperparams.tau * qlocal_param.data +
                                     (1.0 - self.hyperparams.tau) *
                                     qtarget_param.data)

        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor.parameters()):
            target_param.data.copy_(self.hyperparams.tau * local_param.data +
                                    (1.0 - self.hyperparams.tau) *
                                    target_param.data)
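
Example #1 (and most of the snippets that follow) samples exploration noise from an OUNoise class that is not shown. Below is a minimal sketch of an Ornstein-Uhlenbeck noise process matching the five-argument constructor and reset_state() method used in Example #1; the other examples use shorter constructors (for instance OUNoise(action_size, seed)) and a reset() method, but the process itself is the same. The default parameter values are common choices and are assumptions.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset_state()

    def reset_state(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    # alias used by the other examples
    reset = reset_state

    def sample(self):
        """Update the internal state via the OU dynamics and return it as noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state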
Example #2
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high


        # Learning rates suggested by the paper: https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)

        # initialize targets model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta,
                   self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000

        self.batch_size = 64

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01 # for soft update of target parameters

        # Indicate whether we want to learn (or only predict without learning)
        self.set_train(train)

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0
        self.step_count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        self.total_reward += reward
        self.step_count += 1
        # Save experience /reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.score = self.total_reward / float(self.step_count) if self.step_count else 0.0
        # Track the best score seen so far
        if self.score >= self.best_score:
            self.best_score = self.score

        # Learn, if enough samples are available in memory
        if self.train and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, done)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s)  as per current policy"""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample()) # add more noise for exploration

    def learn(self, experiences, done):
        """Update policy and value parameters using give batch experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)

        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_state = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_action = self.actor_target.model.predict_on_batch(next_state)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_state, next_action])

        # Compute Q targets for current states and train critic model(local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),
                            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())


        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def set_train(self, train):
        self.train = train
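
For context, a minimal episode loop that drives the agent above might look like the following. This is a sketch: the Task object with reset() -> state and step(action) -> (next_state, reward, done), and the episode budget, are assumptions rather than part of the original code.

num_episodes = 500  # assumed training budget

agent = DDPG(task)
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d} | average reward per step: {:.3f}".format(
        i_episode, agent.score))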
Example #3
class DDPG():
    """Reinforcement Learning agent , learning using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor 0.99
        self.tau = 0.001  # for soft update of target parameters 0.01

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):

        self.total_reward = None
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        if self.total_reward:
            self.total_reward += reward
        else:
            self.total_reward = reward

        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of reward tuples."""

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # track best score
        self.score = self.total_reward / float(
            self.count) if self.count else -np.inf
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
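
Examples #2 and #3 sample experiences from a two-argument ReplayBuffer whose sample() returns objects with .state, .action, .reward, .next_state and .done fields. A minimal sketch consistent with that usage (the namedtuple-based implementation is an assumption):

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)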
Example #4
class DDPG():
    def __init__(self, agent_id, model, action_size, random_seed):
        self.id = agent_id

        # Actor Neural Network (Regular and target)
        self.actor_regular = model.actor_regular
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_regular.parameters(),
                                          lr=LR_ACTOR)

        # Critic Neural Network (Regular and target)
        self.critic_regular = model.critic_regular
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_regular.parameters(),
                                           lr=LR_CRITIC)

        # Exploration noise
        self.noise = OUNoise(action_size, random_seed, OU_MU, OU_THETA,
                             OU_SIGMA)

        # Ensure that both networks have the same weights
        self.deep_copy(self.actor_target, self.actor_regular)
        self.deep_copy(self.critic_target, self.critic_regular)

    def act(self, states, noise_value, add_noise=True):
        states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_regular.eval()

        with torch.no_grad():
            action = self.actor_regular(states).cpu().data.numpy()

        self.actor_regular.train()

        if add_noise:
            # Include exploration noise
            action += noise_value * self.noise.sample()

        # Clip action to the right interval
        return np.clip(action, -1, 1)

    def learn(self, memory, agent_id, experiences, all_next_actions,
              all_actions):
        states, actions, rewards, next_states, dones = experiences

        # Update the critic neural network
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(DEVICE)
        actions_next = torch.cat(all_next_actions, dim=1).to(DEVICE)

        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, actions_next)

        Q_expected = self.critic_regular(states, actions)
        # Compute Q targets for current states filtered by agent id
        Q_targets = rewards.index_select(
            1, agent_id) + (GAMMA * Q_targets_next *
                            (1 - dones.index_select(1, agent_id)))

        # Calculate the critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())

        # Minimize the loss
        critic_loss.backward()
        # Critic gradient clipping to 1
        torch.nn.utils.clip_grad_norm_(self.critic_regular.parameters(), 1)
        self.critic_optimizer.step()

        # Update the actor neural network
        self.actor_optimizer.zero_grad()
        # Detach actions of other agents
        actions_pred = [
            actions if i == self.id else actions.detach()
            for i, actions in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(DEVICE)
        actor_loss = -self.critic_regular(states, actions_pred).mean()

        # Minimize the loss function
        actor_loss.backward()

        self.actor_optimizer.step()

        # Update target network using the soft update approach (slowly updating)
        self.soft_update(self.critic_regular, self.critic_target)
        self.soft_update(self.actor_regular, self.actor_target)

    def soft_update(self, local_model, target_model):
        # Update the target network slowly to improve the stability
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)

    def deep_copy(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
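
Example #4 reads its hyperparameters from module-level constants that are not shown. A sketch of the names it expects; the values here are typical DDPG/MADDPG settings and are assumptions, not taken from the original project.

import torch

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

LR_ACTOR = 1e-4      # actor learning rate (assumed)
LR_CRITIC = 1e-3     # critic learning rate (assumed)
GAMMA = 0.99         # discount factor (assumed)
TAU = 1e-3           # soft-update interpolation factor (assumed)

# Ornstein-Uhlenbeck noise parameters (assumed)
OU_MU = 0.0
OU_THETA = 0.15
OU_SIGMA = 0.2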
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, state_size_full,
                 action_size_full, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            state_size_full (int): dimension of the full (joint) state
            action_size_full (int): dimension of the full (joint) action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.state_size_full = state_size_full
        self.action_size_full = action_size_full
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(hyperparameters.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(hyperparameters.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size_full, action_size_full,
                                   random_seed).to(hyperparameters.device)
        self.critic_target = Critic(state_size_full, action_size_full,
                                    random_seed).to(hyperparameters.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters.LR_CRITIC,
            weight_decay=hyperparameters.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, eps, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(hyperparameters.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += eps * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG:
    def __init__(self,
                state_size,
                action_size,                
                tau,
                lr_actor,
                lr_critic,
                num_agents,
                agent_idx,
                seed,
                device,
                gamma,
                tensorboard_writer=None):
        
        self.state_size = state_size
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.num_agents = num_agents
        self.agent_idx = agent_idx
        self.seed = seed       
        self.device = device
        self.gamma = gamma
        random.seed(seed)
        self.tensorboard_writer = tensorboard_writer        
        
        self.actor_local = Actor(state_size, action_size, seed)
        self.actor_target = Actor(state_size, action_size, seed)
        
        critic_state_size = (state_size + action_size) * num_agents
        
        self.critic_local = Critic(critic_state_size, seed)
        self.critic_target = Critic(critic_state_size, seed)
        
        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target) 
        
        self.actor_optim = torch.optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        self.noise = OUNoise(action_size, seed)
        
        self.iteration = 0
        
    def to(self, device):
        self.actor_local.to(device)
        self.actor_target.to(device)
        self.critic_local.to(device)
        self.critic_target.to(device)
        return self
                             
    def act(self, state, noise_scale, use_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if use_noise:
            action += self.noise.sample() * noise_scale
        return np.clip(action, -1, 1)
    
    def learn(self, experiences, all_curr_pred_actions, all_next_pred_actions):
        
        agent_idx_device = torch.tensor(self.agent_idx).to(self.device)
        
        states, actions, rewards, next_states, dones = experiences

        rewards = rewards.index_select(1, agent_idx_device)
        dones = dones.index_select(1, agent_idx_device)
        
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
                
        batch_size = next_states.shape[0]
        
        actions_next = torch.cat(all_next_pred_actions, dim=1).to(self.device)
        next_states = next_states.reshape(batch_size, -1)      
        
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, actions_next)
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        states = states.reshape(batch_size, -1)
        actions = actions.reshape(batch_size, -1)
        
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()
        
        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_optim.zero_grad()
        predicted_actions = torch.cat([action if idx == self.agent_idx \
                   else action.detach()
                   for idx, action in enumerate(all_curr_pred_actions)],
                   dim=1).to(self.device)

        actor_loss = -self.critic_local(states, predicted_actions).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optim.step()
        
        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        
        if self.tensorboard_writer is not None:            
            self.tensorboard_writer.add_scalar("agent{}/actor_loss".format(self.agent_idx), al, self.iteration)
            self.tensorboard_writer.add_scalar("agent{}/critic_loss".format(self.agent_idx), cl, self.iteration)
            self.tensorboard_writer.file_writer.flush()
            
        self.iteration += 1

        # ----------------------- update target networks ----------------------- #
        soft_update(self.critic_target, self.critic_local, self.tau)
        soft_update(self.actor_target, self.actor_local, self.tau)           

    
    def reset(self):
        self.noise.reset()
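
The DDPG class directly above calls module-level hard_update and soft_update helpers that are not shown. A minimal sketch consistent with its call sites, hard_update(self.actor_local, self.actor_target) and soft_update(self.critic_target, self.critic_local, self.tau); the argument order is inferred from those calls and is an assumption.

def hard_update(source, target):
    """Copy every parameter of the source network into the target network."""
    for src_param, tgt_param in zip(source.parameters(), target.parameters()):
        tgt_param.data.copy_(src_param.data)


def soft_update(target, source, tau):
    """Soft update: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for tgt_param, src_param in zip(target.parameters(), source.parameters()):
        tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)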
Example #7
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load(self, model_dir, agent_id):
        # Load Actor and Critic network weights
        self.actor_local.load_state_dict(
            torch.load(
                os.path.join(model_dir,
                             'agent_{0}_actor.pth'.format(agent_id))))
        self.critic_local.load_state_dict(
            torch.load(
                os.path.join(model_dir,
                             'agent_{0}_critic.pth'.format(agent_id))))

    def save(self, model_dir, agent_id):
        # Save Actor and Critic network weights
        torch.save(
            self.actor_local.state_dict(),
            os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id)))
        torch.save(
            self.critic_local.state_dict(),
            os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id)))
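
Example #7 (and, with an extra device argument, Example #11) uses a four-argument ReplayBuffer whose sample() returns a tuple of torch tensors. A minimal sketch of such a buffer; the module-level device variable mirrors the name the agent code assumes, and the implementation details are assumptions.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer that stores experiences and samples them as torch tensors."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch and stack it into tensors on the device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(
                np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)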
Example #8
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # will add noise
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)

    def act(self, state, noise=True):
        '''
            Returns the action for the given state as per the current policy.
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def step(self, state, action, reward, next_state, done):
        '''
            Save experiences into replay and sample if replay contains enough experiences
        '''
        self.replay.add(state, action, reward, next_state, done)

        if self.replay.len() > self.replay.batch_size:
            experiences = self.replay.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        '''
            Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value
            Params: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, n_s, done) tuples
                    gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences
        # update critic:
        #   get predicted next state actions and Qvalues from targets
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)
        #   compute Q targets for current states
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))
        #   compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        #   minimize loss
        self.critic_opt.zero_grad()
        critic_loss.backward(retain_graph=True)
        self.critic_opt.step()

        # update actor:
        #   compute actor loss
        action_predictions = self.actor(states)
        actor_loss = -self.critic(states, action_predictions).mean()
        #   minimize actor loss
        self.actor_opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_opt.step()

        # update target networks
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local, target, tau):
        '''
            Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target
            Params: local: PyTorch model (weights will be copied from)
                    target: PyTorch model (weights will be copied to)
                    tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 brain_name,
                 seed,
                 params=default_params,
                 device=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        params = self._fill_params(params)

        # implementation and identity
        self.device = device if device is not None else torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.name = params['name']
        self.brain_name = brain_name

        # set environment information
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed,
                                 fc1_units=params['layers_actor'][0],
                                 fc2_units=params['layers_actor'][1]).to(
                                     self.device)

        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed,
                                  fc1_units=params['layers_actor'][0],
                                  fc2_units=params['layers_actor'][1]).to(
                                      self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   seed,
                                   fcs1_units=params['layers_critic'][0],
                                   fc2_units=params['layers_critic'][1]).to(
                                       self.device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    seed,
                                    fcs1_units=params['layers_critic'][0],
                                    fc2_units=params['layers_critic'][1]).to(
                                        self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   params['buffer_size'],
                                   params['batch_size'],
                                   seed,
                                   device=self.device)

        # save params
        self.params = params

    def _fill_params(self, src_params):
        params = {
            'name':
            self._get_param_or_default('name', src_params, default_params),
            'buffer_size':
            self._get_param_or_default('buffer_size', src_params,
                                       default_params),
            'batch_size':
            self._get_param_or_default('batch_size', src_params,
                                       default_params),
            'layers_actor':
            self._get_param_or_default('layers_actor', src_params,
                                       default_params),
            'layers_critic':
            self._get_param_or_default('layers_critic', src_params,
                                       default_params),
            'lr_actor':
            self._get_param_or_default('lr_actor', src_params, default_params),
            'lr_critic':
            self._get_param_or_default('lr_critic', src_params,
                                       default_params),
            'gamma':
            self._get_param_or_default('gamma', src_params, default_params),
            'tau':
            self._get_param_or_default('tau', src_params, default_params),
            'weight_decay':
            self._get_param_or_default('weight_decay', src_params,
                                       default_params)
        }
        return params

    def display_params(self, force_print=False):
        if force_print:
            print(self.params)
        return self.params

    def _get_param_or_default(self, key, src_params, default_params):
        if key in src_params:
            return src_params[key]
        else:
            return default_params[key]

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def start_learn(self):
        # Learn, if enough samples are available in memory
        # decoupled from step method to allow multiple steps per learning pass
        if len(self.memory) > self.params['batch_size']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['gamma'])

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.params['tau'])
        self.soft_update(self.actor_local, self.actor_target,
                         self.params['tau'])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
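
The DDPG_Agent class above fills its settings from a module-level default_params dictionary via _fill_params. The keys below are exactly the ones the class looks up; the values are plausible defaults and are assumptions, not taken from the original project.

default_params = {
    'name': 'ddpg_agent',          # assumed
    'buffer_size': int(1e5),       # assumed
    'batch_size': 128,             # assumed
    'layers_actor': (400, 300),    # assumed hidden-layer sizes
    'layers_critic': (400, 300),   # assumed hidden-layer sizes
    'lr_actor': 1e-4,              # assumed
    'lr_critic': 1e-3,             # assumed
    'gamma': 0.99,                 # assumed
    'tau': 1e-3,                   # assumed
    'weight_decay': 0.0,           # assumed
}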
Example #10
class DDPGAgent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id):
        """Initialize a DDPGAgent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): identifier for this agent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.agent_id = agent_id

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Make sure that the target-local model pairs are initialized to the
        # same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.noise = OUNoise(action_size)

        #self.noise_amplification = NOISE_AMPLIFICATION
        #self.noise_amplification_decay = NOISE_AMPLIFICATION_DECAY

        #self._print_network()

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
            # self._decay_noise_amplification()  # disabled: the amplification attributes above are commented out

        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the OU Noise for this agent."""
        self.noise.reset()

    def learn(self, experiences, next_actions, actions_pred):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(next_state) -> action
            critic_target(next_state, next_action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            next_actions (list): next actions computed from each agent
            actions_pred (list): prediction for actions for current states from each agent
        """
        states, actions, rewards, next_states, dones = experiences
        agent_id_tensor = torch.tensor([self.agent_id - 1]).to(device)

        ### Update critic
        self.critic_optimizer.zero_grad()
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards.index_select(1, agent_id_tensor) + (
            GAMMA * Q_targets_next *
            (1 - dones.index_select(1, agent_id_tensor)))
        Q_expected = self.critic_local(states, actions)
        # Minimize the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Update actor
        self.actor_optimizer.zero_grad()
        # Minimize the loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        θ_target = θ_local
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ * θ_local + (1 - τ) * θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def _print_network(self):
        """Helper to print network architecture for this agent's actors and critics."""
        print("Agent #{}".format(self.agent_id))
        print("Actor (Local):")
        print(self.actor_local)
        print("Actor (Target):")
        print(self.actor_target)
        print("Critic (Local):")
        print(self.critic_local)
        print("Critic (Target):")
        print(self.critic_target)
        if self.agent_id != NUM_AGENTS:
            print(
                "_______________________________________________________________"
            )
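
A sketch of how a multi-agent training routine might assemble the next_actions and actions_pred arguments that DDPGAgent.learn above expects. The wrapper function, the per-agent observation tensors obs[i] / next_obs[i], and the detaching of the other agents' predicted actions are assumptions; only learn()'s signature comes from the example.

import torch


def maddpg_update(agents, experiences, obs, next_obs):
    """Call learn() on every agent with jointly assembled action tensors."""
    for agent in agents:
        # each target actor acts on its own observation; concatenate for the critic
        next_actions = torch.cat(
            [a.actor_target(next_obs[i]) for i, a in enumerate(agents)], dim=1)
        # local actors propose current actions; detach the other agents' outputs
        # so only this agent's actor receives gradients from the policy loss
        actions_pred = torch.cat(
            [a.actor_local(obs[i]) if a is agent
             else a.actor_local(obs[i]).detach()
             for i, a in enumerate(agents)], dim=1)
        agent.learn(experiences, next_actions, actions_pred)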
Example #11
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.gradient_clipping = True
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.config = Config()
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.config.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.config.LR_CRITIC,
            weight_decay=self.config.WEIGHT_DECAY)

        self.noise = OUNoise(action_size, seed)

        self.memory = ReplayBuffer(action_size, self.config.BUFFER_SIZE,
                                   self.config.BATCH_SIZE, seed, device)
        self.step_count = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.step_count += 1

        if (len(self.memory) > self.config.BATCH_SIZE
                and self.step_count % self.config.UPDATE_EVERY == 0):
            experiences = self.memory.sample()
            self.learn(experiences, self.config.GAMMA)

    def act(self, state, eps, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action = action + self.noise.sample() * eps
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()

        critic_loss.backward()

        if self.gradient_clipping:
            # use gradient clipping
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.step_count % 10 == 0:
            self.soft_update(self.critic_local, self.critic_target,
                             self.config.TAU)
            self.soft_update(self.actor_local, self.actor_target,
                             self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
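
Example #11 reads its hyperparameters from a Config object that is not shown. A sketch with the attribute names the agent uses; the values are typical choices and are assumptions.

class Config:
    LR_ACTOR = 1e-4         # assumed
    LR_CRITIC = 1e-3        # assumed
    WEIGHT_DECAY = 0.0      # assumed
    BUFFER_SIZE = int(1e5)  # assumed
    BATCH_SIZE = 128        # assumed
    GAMMA = 0.99            # assumed
    TAU = 1e-3              # assumed
    UPDATE_EVERY = 4        # assumed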
Example #12
File: ddpg.py  Project: moliqingwa/DRLND
class DDPG(object):
    """
    Interacts with and learns from the environment.

    There are two agents; each agent's observation has 24 dimensions and each agent's action has 2 dimensions.
    Here we use two separate actor networks (one per agent, each using only that agent's observations and outputting that agent's action).
    The critic for each agent sees the full observations and full actions of all agents.
    """
    def __init__(self,
                 agent_id,
                 state_size,
                 full_state_size,
                 action_size,
                 full_action_size,
                 actor_hidden_sizes=(256, 128),
                 actor_lr=1e-4,
                 actor_weight_decay=0.,
                 critic_hidden_sizes=(256, 128),
                 critic_lr=1e-3,
                 critic_weight_decay=0.,
                 is_action_continuous=True):
        """
        Initialize an Agent object.

        :param agent_id (int): ID of each agent.
        :param state_size (int): Dimension of each state for each agent.
        :param full_state_size (int): Dimension of full state for all agents.
        :param action_size (int): Dimension of each action for each agent.
        :param full_action_size: Dimension of full action for all agents.
        :param actor_hidden_sizes (tuple): Hidden units of the actor network.
        :param actor_lr (float): Learning rate of the actor network.
        :param actor_weight_decay (float): weight decay (L2 penalty) of the actor network.
        :param critic_hidden_sizes (tuple): Hidden units of the critic network.
        :param critic_lr (float): Learning rate of the critic network.
        :param critic_weight_decay (float): weight decay (L2 penalty) of the critic network.
        :param is_action_continuous (bool): Whether action space is continuous or discrete.
        """
        self.id = agent_id
        self.state_size = state_size
        self.full_state_size = full_state_size
        self.action_size = action_size
        self.full_action_size = full_action_size
        self.is_action_continuous = is_action_continuous

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size,
            actor_hidden_sizes,
            action_size,
            out_gate=nn.Tanh if is_action_continuous else None)
        self.actor_target = Actor(
            state_size,
            actor_hidden_sizes,
            action_size,
            out_gate=nn.Tanh if is_action_continuous else None)
        self.update(self.actor_local, self.actor_target, 1.)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=actor_lr,
                                          weight_decay=actor_weight_decay)

        # Critic Network (w/ Target Network)
        num_agents = int(full_action_size / action_size)
        self.critic_local = Critic(
            full_state_size,
            full_action_size if is_action_continuous else num_agents,
            critic_hidden_sizes)
        self.critic_target = Critic(
            full_state_size,
            full_action_size if is_action_continuous else num_agents,
            critic_hidden_sizes)
        # self.critic_local, self.critic_target = get_critic(full_state_size, full_action_size, critic_hidden_sizes)
        self.update(self.critic_local, self.critic_target, 1.)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=critic_lr,
                                           weight_decay=critic_weight_decay)

        self.use_actor = True

        # Noise Process
        self.noise_scale = 0.
        self.noise = OUNoise(action_size)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise_scale=0.0):
        """
        Returns action for given state using current policy.
        """
        states = torch.from_numpy(state[np.newaxis]).float()

        # calculate actions
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states)
        self.actor_local.train()
        actions = actions.cpu().numpy().squeeze()

        # add noise
        actions += noise_scale * self.noise.sample()

        return np.clip(actions, -1,
                       1) if self.is_action_continuous else np.argmax(actions)

    def learn(self,
              states,
              actions,
              rewards,
              next_states,
              dones,
              full_actions_predicted,
              critic_full_next_actions,
              gamma=0.99):
        """
        Update policy and value parameters.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :param states: Full states for training, of size (BATCHES, NUM_AGENTS, STATE_SIZE)
        :param actions: Full actions for training, of size (BATCHES, NUM_AGENTS, ACTION_SIZE)
        :param rewards: Full rewards for training, of size (BATCHES, NUM_AGENTS)
        :param next_states: Full next states for training, of size (BATCHES, NUM_AGENTS, STATE_SIZE)
        :param dones: Full done flags for training, of size (BATCHES, NUM_AGENTS)
        :param full_actions_predicted: Actions predicted by every agent's local actor for the current states (list of per-agent tensors)
        :param critic_full_next_actions: Next actions from every agent's target actor (list of per-agent tensors, concatenated to size (BATCHES, NUM_AGENTS * ACTION_SIZE))
        :param gamma: discount factor
        """
        full_states = states.view(-1, self.full_state_size)
        full_actions = actions.view(states.shape[0], -1).float()
        full_next_states = next_states.view(-1, self.full_state_size)
        critic_full_next_actions = torch.cat(critic_full_next_actions,
                                             dim=1).float().to(DEVICE)

        actor_rewards = rewards[:, self.id].view(-1, 1)
        actor_dones = dones[:, self.id].view(-1, 1)

        # ---------------------------- update critic ---------------------------- #
        q_next = self.critic_target.forward(full_next_states,
                                            critic_full_next_actions)

        q_target = actor_rewards + gamma * q_next * (1 - actor_dones)

        q_expected = self.critic_local(full_states, full_actions)

        # Compute critic loss
        critic_loss = F.mse_loss(q_expected, q_target.detach())

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        if self.use_actor:
            # detach actions from other agents
            full_actions_predicted = [
                actions if i == self.id else actions.detach()
                for i, actions in enumerate(full_actions_predicted)
            ]
            full_actions_predicted = torch.cat(full_actions_predicted,
                                               dim=1).float().to(DEVICE)

            # Compute actor loss
            actor_loss = -self.critic_local.forward(
                full_states, full_actions_predicted).mean()

            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
        else:
            actor_loss = torch.tensor(0)

        return actor_loss.cpu().item(), critic_loss.cpu().item()

    def update(self, source, target, tau=0.01):
        """
        Update target model parameters:
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param source: PyTorch model to copy parameters from
        :param target: PyTorch model to copy parameters to
        :param tau: interpolation parameter
        """
        for param, target_param in zip(source.parameters(),
                                       target.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) +
                                    param.data * tau)
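
A sketch of how a two-agent wrapper might call learn() above. The tensor layout follows learn()'s docstring; the wrapper function itself, the zero-based agent ids used for slicing, and the discount value are assumptions.

def maddpg_learn_step(agents, states, actions, rewards, next_states, dones,
                      gamma=0.99):
    """Run one learning step for every agent on a shared sampled batch."""
    # each target actor proposes a next action from its own observation slice
    critic_full_next_actions = [
        agent.actor_target(next_states[:, agent.id, :]) for agent in agents
    ]
    losses = []
    for agent in agents:
        # current actions from every local actor; learn() detaches the others
        full_actions_predicted = [
            a.actor_local(states[:, a.id, :]) for a in agents
        ]
        losses.append(
            agent.learn(states, actions, rewards, next_states, dones,
                        full_actions_predicted, critic_full_next_actions,
                        gamma))
    return losses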