class Agent():
    """Interacts with and learns from the environment"""

    def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
        """DQN agent.

        Args:
          state_size (int): dimension of each state
          action_size (int): dimension of each action (or the number of action choices)
          fc1_units (int): number of units in the first hidden layer
          fc2_units (int): number of units in the second hidden layer
          device (torch.device): device on which the Q-networks are placed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize qnetwork_target parameters to qnetwork_local (tau=1 makes this a full copy)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

        # Initialize the time step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
          state (array_like): current state
          eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Set qnetwork_local to evaluation mode
        self.qnetwork_local.eval()

        # This operation should not be included in gradient calculation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Set back qnetwork_local to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
          experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
          gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states from the observed rewards
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update the target network -----
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
          local_model (torch.nn.Module): weights will be copied from
          target_model (torch.nn.Module): weights will be copied to
          tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
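A minimal usage sketch for the agent above, assuming a Gym-style environment with the classic (next_state, reward, done, info) step API; the environment, episode counts, and epsilon schedule are illustrative, not part of the original code.

# --- Usage sketch (assumes a Gym-style env; hyperparameters are illustrative) ---
def train(agent, env, n_episodes=500, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store and (maybe) learn
            state = next_state
            score += reward
            if done:
                break
        eps = max(eps_end, eps_decay * eps)                       # decay exploration
        scores.append(score)
    return scores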
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network (moved to the same device the states are sent to in act())
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss (standard DQN update)
        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #3
class Agent:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.q_local = QNetwork(state_size, action_size, seed)
        self.q_target = QNetwork(state_size, action_size, seed)
        self.optimizer = optim.Adam(self.q_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_size = 0

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.t_size = (self.t_size + 1) % UPDATE_EVERY
        if self.t_size == 0:
            if len(self.memory) > BATCH_SIZE:
                e = self.memory.sample()
                self.learn(e)

    def act(self, state, epsilon):
        state = torch.from_numpy(state).float().unsqueeze(0)  # Convert the state to a batched tensor
        self.q_local.eval()  # Set q_local to evaluation mode
        # (equivalent to q_local.train(False))
        with torch.no_grad():  # Action values are not part of any gradient computation
            action_values = self.q_local(state)
        self.q_local.train()  # Set q_local back to training mode

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences

        # TD target (Double DQN): select the best next actions with the local
        # network and evaluate them with the (detached) target network
        best_actions = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
        evaluations = self.q_target(next_states).detach().gather(1, best_actions)
        Q_target = rewards + evaluations * gamma * (~dones)

        # Currently predicted Q value
        Q_expected = self.q_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.q_local, self.q_target)

    def soft_update(self, local_model, target_model, tau=TAU):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, lr_decay=0.985):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay (float): multiplicative decay factor for the learning-rate scheduler
        """
        print("Running on: " + str(device))

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if USING_DUELING:
            self.qnetwork_local = DuelQNetwork(state_size, action_size,
                                               seed).to(device)
            self.qnetwork_target = DuelQNetwork(state_size, action_size,
                                                seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if USING_DOUBLE_DQN:
            # Select the actions with the highest Q-value using the local model
            next_actions_local = self.qnetwork_local(next_states).detach().max(
                dim=1, keepdim=True)[1]
            # Evaluate the selected actions with the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, next_actions_local)

        else:
            # Max predicted values for the next state from the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Q expected values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimizing loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
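The dueling variant above relies on a DuelQNetwork defined elsewhere; as an assumption, a minimal sketch of such a dueling head (layer sizes and class name are illustrative) shows the decomposition Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) it is usually built on.

# --- Minimal dueling-head sketch (assumption: the real DuelQNetwork lives elsewhere) ---
import torch
import torch.nn as nn

class DuelQNetworkSketch(nn.Module):
    def __init__(self, state_size, action_size, seed, fc_units=64):
        super().__init__()
        torch.manual_seed(seed)
        self.feature = nn.Sequential(nn.Linear(state_size, fc_units), nn.ReLU())
        self.value = nn.Linear(fc_units, 1)                 # state value V(s)
        self.advantage = nn.Linear(fc_units, action_size)   # advantages A(s, a)

    def forward(self, state):
        x = self.feature(state)
        value = self.value(x)
        advantage = self.advantage(x)
        # Subtract the mean advantage so V and A remain identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)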
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, double_dqn=True):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            double_dqn (bool): use the Double DQN target instead of the vanilla DQN target
        """
        states, actions, rewards, next_states, dones = experiences

        if double_dqn:
            # ---------------
            #   double DQN
            # ---------------

            # get the Q values for best actions in observations
            # based off the current Q network
            # max(Q(s', a', theta_i)) wrt a'
            Q_local_values = self.qnetwork_local(next_states).detach()
            _, a_prime = Q_local_values.max(1)

            # get Q values from frozen network (i.e. target network) for next state and chosen action
            # Q(s',argmax(Q(s',a', theta_i), theta_i_frozen)) (argmax wrt a')
            Q_target_values = self.qnetwork_target(next_states).detach()
            Q_target_s_a_prime = Q_target_values.gather(
                1, a_prime.unsqueeze(1))
            #Q_target_s_a_prime = Q_target_s_a_prime.squeeze()
            #print('Q_target_s_a_prime', Q_target_s_a_prime.size())

            # Compute Q targets for next states
            Q_target_s_a_prime = rewards + (gamma * Q_target_s_a_prime *
                                            (1 - dones))
            #print('Q_target_s_a_prime2', Q_target_s_a_prime.size())

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)
            #print('Q_expected', Q_expected.size())

            # Compute loss
            loss = F.mse_loss(Q_expected, Q_target_s_a_prime)
            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        else:
            # ---------------
            #   regular DQN
            # ---------------
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
            #print('Q_targets_next', Q_targets_next.size())

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
            #print('Q_targets', Q_targets.size())

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)
            #print('Q_expected', Q_expected.size())
            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
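The learn() method above switches between the vanilla DQN target and the Double DQN target; the following self-contained sketch computes both on a synthetic batch (all tensors and sizes are made up) to make the difference concrete.

# --- Sketch: vanilla vs. Double DQN targets on a synthetic batch ---
import torch

batch, n_actions, gamma = 4, 3, 0.99
q_local_next = torch.randn(batch, n_actions)    # stands in for qnetwork_local(next_states)
q_target_next = torch.randn(batch, n_actions)   # stands in for qnetwork_target(next_states)
rewards = torch.randn(batch, 1)
dones = torch.zeros(batch, 1)

# Vanilla DQN: the target network both selects and evaluates the next action.
dqn_next = q_target_next.max(1)[0].unsqueeze(1)
dqn_target = rewards + gamma * dqn_next * (1 - dones)

# Double DQN: the local network selects the action, the target network evaluates it.
a_prime = q_local_next.max(1)[1].unsqueeze(1)
ddqn_next = q_target_next.gather(1, a_prime)
ddqn_target = rewards + gamma * ddqn_next * (1 - dones)

print(dqn_target.squeeze().tolist(), ddqn_target.squeeze().tolist())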
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 lr_decay=0.9999,
                 double_dqn=False,
                 duel_dqn=False,
                 prio_exp=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay (float): decay factor for the learning rate
            double_dqn (bool): use a Double DQN target
            duel_dqn (bool): use a dueling Q-network architecture
            prio_exp (bool): use prioritized experience replay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.lr_decay = lr_decay
        self.DOUBLE_DQN = double_dqn
        self.DUEL_DQN = duel_dqn
        self.PRIORITISED_EXPERIENCE = prio_exp

        # Determine Deep Q-Network for use
        if self.DUEL_DQN:
            self.qnetwork_local = DuelQNetwork(state_size, action_size,
                                               seed).to(device)
            self.qnetwork_target = DuelQNetwork(state_size, action_size,
                                                seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        # Initialize Optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Determine if Prioritized Experience will be used
        if self.PRIORITISED_EXPERIENCE:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_anneal=1.0001)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        if self.PRIORITISED_EXPERIENCE:
            states, actions, rewards, next_states, dones, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        if self.DOUBLE_DQN:
            # Select max Action for Next State from Local NN
            max_action = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            # Evaluate max Action with Target NN
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, max_action)
        else:
            # Get Max Predicted Q values for next state from Target NN
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Predicted Q values for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get Expected Q values from Local NN
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.PRIORITISED_EXPERIENCE:
            td_error = (Q_expected - Q_targets).squeeze_()  # Compute TD Error
            td_error_detached = td_error.detach()

            self.memory.update_probabilities(
                td_error_detached)  # Update Probabilities

            loss = ((td_error**2) * weights).mean()  # Compute Weighted Loss
        else:
            loss = F.mse_loss(Q_expected, Q_targets)  # Compute Loss

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- Update Target Network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
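The prioritized-replay branch above weights each squared TD error by an importance-sampling weight; the PrioritizedReplayBuffer itself is defined elsewhere. A small sketch on synthetic tensors shows that this weighted loss coincides with F.mse_loss whenever every weight equals 1.

# --- Sketch: importance-sampling-weighted loss on synthetic tensors ---
import torch
import torch.nn.functional as F

q_expected = torch.randn(5, 1)
q_targets = torch.randn(5, 1)
td_error = (q_expected - q_targets).squeeze()

uniform_weights = torch.ones(5)
weighted_loss = ((td_error ** 2) * uniform_weights).mean()
plain_mse = F.mse_loss(q_expected, q_targets)
assert torch.isclose(weighted_loss, plain_mse)   # identical when all weights are 1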
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(
                state)  # same as self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        # "*** YOUR CODE HERE ***"
        qs_local = self.qnetwork_local.forward(states)
        qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long),
                             actions.reshape(BATCH_SIZE)]
        qsa_local = qsa_local.reshape((BATCH_SIZE, 1))
        # print(qsa_local.shape)

        # # DQN Target
        # qs_target = self.qnetwork_target.forward(next_states)
        # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target
        # #print(qsa_target.shape, TD_target.shape, rewards.shape)

        # # Double DQN Target ver 1
        # qs_target = self.qnetwork_target.forward(next_states)
        # if random.random() > 0.5:
        #     _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        #     qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)]
        # else:
        #     _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning)
        #     #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]
        #     ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]

        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target

        # Double DQN Target ver 2 (based upon double dqn paper)
        qs_target = self.qnetwork_target.forward(next_states)
        _, qsa_local_argmax_a = torch.max(
            qs_local, dim=1)  # using the greedy policy (q-learning)
        qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long),
                               qsa_local_argmax_a.reshape(BATCH_SIZE)]

        qsa_target = qsa_target * (
            1 - dones.reshape(BATCH_SIZE)
        )  # target qsa value is zero when episode is complete
        qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        TD_target = rewards + gamma * qsa_target

        # print(qsa_target.shape, TD_target.shape, rewards.shape)

        # #Udacity's approach
        # # Get max predicted Q values (for next states) from target model
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # # Compute Q targets for current states
        # TD_target = rewards + (gamma * Q_targets_next * (1 - dones))
        # # Get expected Q values from local model
        # qsa_local = self.qnetwork_local(states).gather(1, actions)

        # diff = qsa_local - TD_target
        # loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar
        loss = F.mse_loss(
            qsa_local, TD_target)  # much faster than the above loss function
        # print(loss)
        # minimize the loss
        self.optimizer.zero_grad()  # clears the gradients
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
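The learn() method above picks Q(s, a) by indexing with torch.arange instead of the gather call used in the other examples; a short sketch on a synthetic batch (sizes are illustrative) confirms the two are equivalent.

# --- Sketch: arange-indexing vs. gather on a synthetic batch ---
import torch

batch_size, n_actions = 4, 3
qs = torch.randn(batch_size, n_actions)                   # stands in for qnetwork_local(states)
actions = torch.randint(0, n_actions, (batch_size, 1))    # column vector of action indices

via_gather = qs.gather(1, actions)
via_indexing = qs[torch.arange(batch_size, dtype=torch.long),
                  actions.reshape(batch_size)].reshape(batch_size, 1)
assert torch.equal(via_gather, via_indexing)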
Example #8
class DQNAgent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        :param state_size: (int) dimension of each state
        :param action_size: (int) dimension of each action
        :param seed: (int) random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=PARAM.LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, PARAM.BUFFER_SIZE,
                                   PARAM.BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """
        Adds the current state-action value to the memory and lets the agent learn if UPDATE_EVERY many steps are taken
        and the memory has more entries then BATCH_SIZE.

        :param state:       current state
        :param action:      taken action
        :param reward:      received reward
        :param next_state:  next state seen after action
        :param done:        boolean if the episode ended after the action
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > PARAM.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, PARAM.GAMMA)

    def act(self, state, eps=0.):
        """
        Returns actions for given state as per current policy.

        :param state: (array_like) current state
        :param eps: (float) epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def get_dqg_target(self, next_states, rewards, gamma, dones):
        """
        Gets the state-action value of the target network. That is, the current estimate of the target network for the
        next state including the seen reward.

        :param next_states: next state for each entry in the sampled mini batch
        :param rewards:     rewards seen for each sample in the mini batch
        :param gamma:       decay factor for current estimate
        :param dones:       indicator if the episode ended for each sample in the mini batch
        :return:
        """
        # Get predicted Q values
        qtarget_values = self.qnetwork_target(next_states).detach()

        # get max of it
        best_qtarget_value = qtarget_values.max(1)

        # reduce one dimension
        best_qtarget_value = best_qtarget_value[0]

        # reshape to 2d matrix with one value in it for 1st dimension (so difference can be calculated)
        # >>> torch.unsqueeze(x, 1)
        # tensor([[ 1],
        #        [ 2],
        #        [ 3],
        #        [ 4]])
        best_qtarget_value = best_qtarget_value.unsqueeze(1)

        # use vector formulation of:
        # if dones == 1:
        #    Q_targets = rewards
        # else:
        #    Q_targets = rewards + (gamma * best_qtarget_value)
        q_targets = rewards + (gamma * best_qtarget_value * (1 - dones))

        return q_targets

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences:  (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples
        :param gamma: (float) discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        Q_targets = self.get_dqg_target(next_states, rewards, gamma, dones)

        # Get expected Q values
        q_exp = self.qnetwork_local(states)

        # gathers the Q values along dimension 1, using the actions as indices
        # >>> t = torch.tensor([[1,2],[3,4]])
        # >>> torch.gather(t, 1, torch.tensor([[0],[1]]))
        # tensor([[ 1],
        #        [ 4]])
        q_exp = q_exp.gather(1, actions)

        # compute loss
        loss = F.mse_loss(q_exp, Q_targets)

        # reset optimizer gradient
        self.optimizer.zero_grad()
        # do backpropagation
        loss.backward()
        # do optimize step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: (float) interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
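Example #8 above reads its hyperparameters from a PARAM object defined elsewhere; the real module is not shown here. As an assumption, a minimal stand-in could look like the following, using the default values that appear in Example #10 below.

# --- Minimal stand-in for the PARAM namespace (assumption; values mirror Example #10) ---
from types import SimpleNamespace

PARAM = SimpleNamespace(
    BUFFER_SIZE=int(1e5),   # replay buffer size
    BATCH_SIZE=64,          # minibatch size
    GAMMA=0.99,             # discount factor
    TAU=1e-3,               # soft-update interpolation factor
    LR=5e-4,                # learning rate
    UPDATE_EVERY=4,         # how many environment steps between learning updates
)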
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, buffer_size, batch_size,
                 lr, tau, sequential_sampling_fre):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            lr (float): learning rate
            tau (float): for soft update of target parameters
            sequential_sampling_fre (int): ratio of random sampling to sequential sampling
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.tau = tau

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   sequential_sampling_fre)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: select the best next actions with the local model,
        # then evaluate them with the target model
        Q_local_argmax = self.qnetwork_local(next_states).max(1)[1].unsqueeze(
            1)
        Q_targets_next_states = self.qnetwork_target(
            next_states).detach().gather(1, Q_local_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next_states * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 model='DQN',
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 pretrained_model_file=None):

        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (str): currently supports 'DQN' and 'DDQN'
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            pretrained_model_file (str): filepath to .pth file with pretrained model weights
        """
        if model not in ('DQN', 'DDQN'):
            raise ValueError('Current model supports DQN or DDQN')

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.model = model

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        if pretrained_model_file:
            weights = torch.load(pretrained_model_file)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        if self.model == 'DQN':
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        elif self.model == 'DDQN':
            argmax_actions = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, argmax_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
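The pretrained_model_file argument above allows an agent to be warm-started from saved weights; a short usage sketch (the checkpoint path and seed are illustrative) shows how the two pieces fit together.

# --- Usage sketch: save the trained weights, then rebuild an agent from them ---
def save_and_reload(agent, path='checkpoint.pth'):
    torch.save(agent.qnetwork_local.state_dict(), path)
    return Agent(state_size=agent.state_size, action_size=agent.action_size,
                 seed=0, model=agent.model, pretrained_model_file=path)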
Example #11
class agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, duel, fc1_units, fc2_units,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            duel (bool): use the dueling network architecture
            fc1_units (int): number of nodes in the first hidden layer
            fc2_units (int): number of nodes in the second hidden layer
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Choose between the regular Q-Network or the dueling architecture
        #if(duel):
        #   self.qnetwork_local  = Duel_QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        #    self.qnetwork_target = Duel_QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        #else:
        #    self.qnetwork_local  = QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        #   self.qnetwork_target = QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Visualize network
        print(self.qnetwork_local)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state_size, state, action, reward, next_state, done, dqn):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                if dqn:
                    self.DQN_learn(experiences, state_size, GAMMA)
                else:
                    self.DDQN_learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().to(device)
        # eval() notifies the layers defined in model.py that the network is in evaluation mode
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def DQN_learn(self, experiences, state_size, gamma):
        """Learn using the DQN algorithm.
           Update value parameters using given batch of experience tuples.

           Params
           ======
           experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
           gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        states = states.view(BATCH_SIZE, 4, state_size[0], state_size[1])
        next_states = next_states.view(BATCH_SIZE, 4, state_size[0],
                                       state_size[1])
        #
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss using element-wise mean squared error.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def DDQN_learn(self, experiences, gamma):
        """DDQN version
        Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model

        # DQN
        #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        #DDQN
        Q_local_argmax = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, Q_local_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #12
class SAC(object):
    def __init__(self):

        self.gamma = 0.99
        self.tau = 0.005
        self.alpha = 0.2
        self.lr = 0.003

        self.target_update_interval = 1
        self.device = torch.device("cpu")

        # 8 phases
        self.num_inputs = 8
        self.num_actions = 1
        self.hidden_size = 256

        self.critic = QNetwork(self.num_inputs, self.num_actions,
                               self.hidden_size).to(self.device)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(self.num_inputs, self.num_actions,
                                      self.hidden_size).to(self.device)
        # Copy the parameters of critic to critic_target
        hard_update(self.critic_target, self.critic)

        self.target_entropy = -torch.Tensor([1.0]).to(self.device).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)

        self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(self.num_inputs, self.num_actions,
                                     self.hidden_size).to(self.device)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state):
        state = torch.FloatTensor(state).to(self.device)  # TODO
        _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]
        # action is a CUDA tensor, you should do .detach().cpu().numpy(), when
        # you need a numpy

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)
        action_batch = np.expand_dims(action_batch, axis=1)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
        # Unsqueeze: add one dimension to the index

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf_loss = qf1_loss + qf2_loss

        self.critic_optimizer.zero_grad()
        # Clear the cumulative grad
        qf_loss.backward()
        # Get grad via backward()
        self.critic_optimizer.step()
        # Update the para via grad

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # automatic_entropy_tuning:
        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()  # TODO

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        # Create a dir package in the current location
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        # state_dict() stores the parameters of layers and optimizers which have grad
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

    def get_alpha(self):
        return self.alpha
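
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): a toy run of the automatic
# entropy tuning step used in update_parameters(),
#   alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean().
# When the policy entropy (roughly -log_pi) is below the target, the gradient
# pushes log_alpha up, so alpha grows and entropy is rewarded more.
# All tensors below are dummies created only for this sketch.
import torch
from torch.optim import Adam

toy_target_entropy = -1.0
toy_log_alpha = torch.zeros(1, requires_grad=True)
toy_alpha_optimizer = Adam([toy_log_alpha], lr=1e-2)

toy_log_pi = torch.full((8, 1), 2.0)  # pretend the policy is too deterministic
for _ in range(100):
    toy_alpha_loss = -(toy_log_alpha *
                       (toy_log_pi + toy_target_entropy).detach()).mean()
    toy_alpha_optimizer.zero_grad()
    toy_alpha_loss.backward()
    toy_alpha_optimizer.step()

print(toy_log_alpha.exp().item())  # alpha has grown above its initial value 1.0
# ---------------------------------------------------------------------------
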
class soft_actor_critic_agent(object):
    def __init__(self, num_inputs, action_space, \
                 device, hidden_size, seed, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha

        self.device = device
        self.seed = seed
        self.seed = torch.manual_seed(seed)

        torch.cuda.manual_seed(seed)
        #torch.cuda.manual_seed_all(seed)
        #torch.backends.cudnn.deterministic=True

        self.critic = QNetwork(seed, num_inputs, action_space.shape[0],
                               hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0],
                                      hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0], \
                                         hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        # Update the critic on the summed twin-Q loss (as in the SAC class above)
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Sample from the policy only after the critic step, so the policy-loss
        # graph is not built on critic parameters that are then modified in place
        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        soft_update(self.critic_target, self.critic, self.tau)
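
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the examples above): a toy computation of the
# soft Q target used in both SAC classes,
#   y = r + mask * gamma * (min(Q1', Q2') - alpha * log_pi'),
# where mask is 0 at terminal transitions. All tensors are random placeholders.
import torch

torch.manual_seed(0)
toy_batch = 4
toy_gamma, toy_alpha = 0.99, 0.2
toy_reward = torch.randn(toy_batch, 1)
toy_mask = torch.tensor([[1.], [1.], [0.], [1.]])  # third transition is terminal
toy_qf1_next, toy_qf2_next = torch.randn(toy_batch, 1), torch.randn(toy_batch, 1)
toy_next_log_pi = torch.randn(toy_batch, 1)

toy_min_qf_next = torch.min(toy_qf1_next, toy_qf2_next) - toy_alpha * toy_next_log_pi
toy_next_q_value = toy_reward + toy_mask * toy_gamma * toy_min_qf_next
print(toy_next_q_value[2] == toy_reward[2])  # terminal row keeps only the reward
# ---------------------------------------------------------------------------
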
Example #14
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        # TODO: initialize action-value function Q with random weights theta
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        # TODO: initialize target action-value function Qhat with weights theta_=theta
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # TODO: initialize replay memory D to capacity N (circular queue)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # TODO: set s_t1=s_t,a_t,x_t1 and preprocess f_t1=f(s_t1)
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # DONE. TODO: if episode terminates at step j+1, set y_j = r_j
        # else set y_j = r_j + gamma*max(Qhat(f_j1,a_;theta_))
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:  # DONE. TODO: every C steps reset Qhat = Q

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # DONE. TODO: sample random minibatch of transitions (f_j,a_j,r_j,f_j1) from D
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        # Unpack the experiences tuple
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # DONE. TODO: perform a gradient descent step on (y_j - Q(f_j,a_j;theta))^2 with
        # respect to the network parameters theta
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
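
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): a minimal numeric check of
# the soft update θ_target ← τ*θ_local + (1 - τ)*θ_target used throughout these
# agents, on two one-parameter "networks" built only for this sketch.
import torch
import torch.nn as nn

toy_local = nn.Linear(1, 1, bias=False)
toy_target = nn.Linear(1, 1, bias=False)
with torch.no_grad():
    toy_local.weight.fill_(1.0)
    toy_target.weight.fill_(0.0)

toy_tau = 0.1
for t_param, l_param in zip(toy_target.parameters(), toy_local.parameters()):
    t_param.data.copy_(toy_tau * l_param.data + (1.0 - toy_tau) * t_param.data)

print(toy_target.weight.item())  # 0.1: the target moved 10% of the way to local
# ---------------------------------------------------------------------------
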
Example #15
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every LEARN_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Learn every UPDATE_EVERY time steps.
        # self.t_step = (self.t_step + 1) % LEARN_EVERY
        # if self.t_step == 0:
        self.t_step += 1
        if done:
            for _ in range(self.t_step // SOFT_UPDATE_EVERY):
                # If enough samples are available in memory, get random subset and learn
                if len(self.memory) > BATCH_SIZE:
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                    # you can use learn_DDQN to enable double q-learning. but on lunarlander, at least,
                    # I don't see any benefit
                    # self.learn_DDQN(experiences, GAMMA)
            self.t_step = 0

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values for next states from the local network
        # (this variant bootstraps from qnetwork_local; the target network is only
        # used in learn_DDQN below)
        Q_targets_next = self.qnetwork_local(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # print(loss)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def learn_DDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get index of maximum value for next state from Q_expected
        Q_argmax = self.qnetwork_local(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # print (self.qnetwork_local(states).detach())
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        # print (Q_targets_next.shape)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # print (Q_targets.shape)
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # print (Q_expected.shape)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #16
class Agent():
    """
    Deep Reinforcement Learning agent that interacts with and learns from the environment.
    Uses the Double DQN algorithm (see https://arxiv.org/abs/1509.06461) with a Dueling DQN
    model (see https://arxiv.org/abs/1511.06581).
    """
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Use Double DQN: Get predicted actions from local network model
        local_actions = self.qnetwork_local(next_states).detach().argmax(
            dim=1).unsqueeze(1)
        # Get predicted Q values (for next states) from target model using predicted actions
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, local_actions).detach()

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
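
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the QNetwork used by this
# Dueling DQN agent is defined elsewhere; this is one common way such a dueling
# head is written, V(s) + A(s,a) - mean_a A(s,a). It is an assumed illustration,
# not the example's actual model.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyDuelingQNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.feature = nn.Linear(state_size, hidden)
        self.value = nn.Linear(hidden, 1)                # state-value stream V(s)
        self.advantage = nn.Linear(hidden, action_size)  # advantage stream A(s, a)

    def forward(self, state):
        x = F.relu(self.feature(state))
        v = self.value(x)
        a = self.advantage(x)
        # Subtract the mean advantage so V and A are identifiable
        return v + a - a.mean(dim=1, keepdim=True)

toy_q = ToyDuelingQNetwork(8, 4)
print(toy_q(torch.randn(2, 8)).shape)  # torch.Size([2, 4])
# ---------------------------------------------------------------------------
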
Example #17
class Agent(object):

    def __init__(self, n_states, n_actions, hidden_dim, lr, device):
        """Agent class that choose action and train

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """

        self.device = device

        self.q_local = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device)
        
        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=lr)
        
        self.n_states = n_states
        self.n_actions = n_actions
        

        #  ReplayMemory: trajectory is saved here
        self.replay_memory = ReplayMemory(10000)
        

    def get_action(self, state, eps, check_eps=True):
        """Returns an action

        Args:
            state : 2-D tensor of shape (n, input_dim)
            eps (float): eps-greedy for exploration

        Returns: int: action index
        """
        global steps_done
        sample = random.random()

        if not check_eps or sample > eps:
            with torch.no_grad():
                # Variable() is a no-op wrapper in current PyTorch; kept from the original snippet
                return self.q_local(Variable(state).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=self.device)


    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): batch of `Transition`
        gamma (float): Discount rate of Q_target
        """
        
        if len(self.replay_memory.memory) < BATCH_SIZE:
            return
            
        transitions = self.replay_memory.sample(BATCH_SIZE)
        
        batch = Transition(*zip(*transitions))
                        
        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)
        
            
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to the network q_local (current estimate)
        Q_expected = self.q_local(states).gather(1, actions)     

        Q_targets_next = self.q_target(next_states).detach().max(1)[0] 

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1-dones))
        
        self.q_local.train(mode=True)        
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
        # backpropagation of loss to NN        
        loss.backward()
        self.optim.step()
               
        
    def soft_update(self, local_model, target_model, tau):
        """ tau (float): interpolation parameter"""
        
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)     
            
    def hard_update(self, local, target):
        for target_param, param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(param.data)            
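
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): how the
# Transition(*zip(*transitions)) pattern in learn() turns a list of per-step
# tuples into per-field batches. The Transition namedtuple here is a stand-in
# with the field names the example accesses (state, action, reward, next_state,
# done); the actual class is defined elsewhere.
from collections import namedtuple
import torch

ToyTransition = namedtuple('ToyTransition',
                           ('state', 'action', 'reward', 'next_state', 'done'))

toy_transitions = [
    ToyTransition(torch.randn(1, 4), torch.tensor([[0]]), torch.tensor([1.0]),
                  torch.randn(1, 4), torch.tensor([0.0])),
    ToyTransition(torch.randn(1, 4), torch.tensor([[1]]), torch.tensor([0.0]),
                  torch.randn(1, 4), torch.tensor([1.0])),
]

toy_batch = ToyTransition(*zip(*toy_transitions))  # fields become tuples of tensors
toy_states = torch.cat(toy_batch.state)            # shape (2, 4)
toy_actions = torch.cat(toy_batch.action)          # shape (2, 1)
print(toy_states.shape, toy_actions.shape)
# ---------------------------------------------------------------------------
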
Example #18
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # initialize local and target Q-Networks
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # initialize time step
        self.t_step = 0

        # initialize parameters
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.update_every = UPDATE_EVERY
    
    def step(self, state, action, reward, next_state, done):
        # store experience tuple in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        
        # perform learning step every UPDATE_EVERY time steps
        self.t_step += 1
        is_time_to_update_weights = (self.t_step % UPDATE_EVERY) == 0
        if is_time_to_update_weights:
            # if enough samples in replay_buffer,
            # get random batch and perform one learning step
            if len(self.replay_buffer) > BATCH_SIZE:
                experiences = self.replay_buffer.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, epsilon=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # epsilon-greedy action selection
        action_values = action_values.cpu().data.numpy()[0]
        optimal_action = np.argmax(action_values)
        random_action = np.random.choice(np.arange(self.action_size))
        action = np.random.choice([optimal_action, random_action],
                                  p=[1-epsilon, epsilon])
        return np.int32(action)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # get max predicted Q values for next states from target models
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # target network soft update
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
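
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): act() draws between the
# greedy action and one uniformly random action with probabilities (1-eps, eps),
# so the greedy action keeps a small extra chance of being picked by the random
# branch too. A quick empirical check with toy values made up for this sketch.
import numpy as np

rng = np.random.default_rng(0)
toy_action_size, toy_eps, toy_optimal_action = 4, 0.2, 2
toy_counts = np.zeros(toy_action_size)
for _ in range(10000):
    toy_random_action = rng.integers(toy_action_size)
    toy_action = rng.choice([toy_optimal_action, toy_random_action],
                            p=[1 - toy_eps, toy_eps])
    toy_counts[toy_action] += 1

# The greedy action ends up with probability (1 - eps) + eps / action_size,
# every other action with eps / action_size.
print(toy_counts / toy_counts.sum())
# ---------------------------------------------------------------------------
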
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, lr_decay=0.9999):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay (float): multiplicative factor of learning rate decay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        print("Running on: "+str(device))
        
        # Q-Network
        hidden_layers = [128, 32]
        
        if USE_DUELING_NETWORK:
            hidden_state_value = [64, 32]
            
            self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, hidden_layers, hidden_state_value).to(device)

            self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, hidden_layers, hidden_state_value).to(device)
            self.qnetwork_target.eval()
            
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_layers).to(device)

            self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_layers).to(device)
            self.qnetwork_target.eval()
            
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)


        # Replay memory
        if USE_PRIORITIZED_REPLAY:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device,
                                                  alpha=0.6, beta=0.4, beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            return np.argmax(action_values.cpu().data.numpy())
        
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        ## Compute and minimize the loss

        with torch.no_grad():
            ### Use of Double DQN method
            if USE_DOUBLE_DQN:
                ## Select the greedy actions using the QNetwork Local
                # calculate the pair action/reward for each of the next_states
                next_action_rewards_local = self.qnetwork_local(next_states)
                # select the action with the maximum reward for each of the next actions
                greedy_actions_local = next_action_rewards_local.max(dim=1, keepdim=True)[1]

                ## Get the rewards for the greedy actions using the QNetwork Target
                # calculate the pair action/reward for each of the next_states
                next_action_rewards_target = self.qnetwork_target(next_states)
                # get the target reward for each of the greedy actions selected following the local network
                target_rewards = next_action_rewards_target.gather(1, greedy_actions_local)
                
            ### Use of Fixed Q-Target
            else:
                # calculate the pair action/reward for each of the next_states
                next_action_rewards = self.qnetwork_target(next_states)
                # select the maximum reward for each of the next actions
                target_rewards = next_action_rewards.max(dim=1, keepdim=True)[0]
                
            
            ## Calculate the discounted target rewards
            target_rewards = rewards + (gamma * target_rewards * (1 - dones))
            
        # calculate the pair action/rewards for each of the states
        expected_action_rewards = self.qnetwork_local(states) # shape: [batch_size, action_size]
        # get the reward for each of the actions
        expected_rewards = expected_action_rewards.gather(1, actions) # shape: [batch_size, 1]

        if USE_PRIORITIZED_REPLAY:
            # TD error, with the gradient flowing through expected_rewards
            td_error = target_rewards - expected_rewards

            # new priorities are the absolute TD errors (no in-place ops on the
            # loss graph, so backward() below stays valid)
            with torch.no_grad():
                self.memory.update_priorities(td_error.abs().squeeze(1))

            # importance-sampling-weighted squared TD error
            loss = (w * td_error.squeeze(1).pow(2)).mean()
        else:
            # calculate the loss
            loss = F.mse_loss(expected_rewards, target_rewards)

        # perform the back-propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
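
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): a toy version of the
# prioritized-replay branch of learn() - the loss is the importance-sampling-
# weighted squared TD error, and |TD error| is what would be fed back as the new
# priority. All tensors are random placeholders created for this sketch.
import torch

torch.manual_seed(0)
toy_expected = torch.randn(8, 1, requires_grad=True)
toy_target = torch.randn(8, 1)
toy_w = torch.rand(8)  # importance-sampling weights from the buffer

toy_td_error = toy_target - toy_expected
toy_new_priorities = toy_td_error.abs().squeeze(1).detach()  # would go to update_priorities
toy_loss = (toy_w * toy_td_error.squeeze(1).pow(2)).mean()
toy_loss.backward()
print(toy_new_priorities.shape, toy_loss.item())
# ---------------------------------------------------------------------------
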
Example #20
class SAC(object):
    """
    SAC class from Haarnoja et al. (2018)
    We leave the option to use automatic_entropy_tuning to avoid hand-tuning the entropy coefficient alpha
    """
    def __init__(self, num_inputs, action_space, args):
        #self.n_flow = args.n_flows
        #assert self.n_flow == 0
        self.num_inputs = num_inputs
        #self.flow_family = args.flow_family
        self.num_layers = args.num_layers
        self.args = args

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, self.num_layers,
                                     args).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        """
        Select action for a state
        (Train) Sample an action from NF{N(mu(s),Sigma(s))}
        (Eval) Pass mu(s) through NF{}
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            action, _, _, _, _ = self.policy.evaluate(state, eval=True)

        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, memory, batch_size, updates):
        """
        Update parameters of SAC-NF
        Exactly like SAC, but keep two separate Adam optimizers for the Gaussian policy AND the NF layers
        .backward() on them sequentially
        """
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # for visualization
        info = {}
        ''' update critic '''
        with torch.no_grad():
            next_state_action, next_state_log_pi, _, _, _ = self.policy.evaluate(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        # update the critic first (sequential backward on the two Q losses)
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        # evaluate the policy only after the critic step, so the policy-loss graph
        # is not built on critic parameters that are then modified in place
        pi, log_pi, _, _, _ = self.policy.evaluate(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        nf_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        # update target value functions
        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item(), info

    def save_model(self, info):
        """
        Save the weights of the network (actor and critic separately)
        """
        # policy
        save_checkpoint(
            {
                **info,
                'state_dict': self.policy.state_dict(),
                'optimizer': self.policy_optim.state_dict(),
            },
            self.args,
            filename='policy-ckpt.pth.tar')

        # critic
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic.state_dict(),
                'optimizer': self.critic_optim.state_dict(),
            },
            self.args,
            filename='critic-ckpt.pth.tar')
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic_target.state_dict(),
                #'optimizer' : self.critic_optim.state_dict(),
            },
            self.args,
            filename='critic_target-ckpt.pth.tar')

    def load_model(self, args):
        """
        Jointly or separately load actor and critic weights
        """
        # policy
        load_checkpoint(
            model=self.policy,
            optimizer=self.policy_optim,
            opt=args,
            device=self.device,
            filename='policy-ckpt.pth.tar',
        )

        # critic
        load_checkpoint(
            model=self.critic,
            optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic-ckpt.pth.tar',
        )
        load_checkpoint(
            model=self.critic_target,
            #optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic_target-ckpt.pth.tar',
        )
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        actions_value = self.qnetwork_local.forward(next_states)
        next_action = torch.unsqueeze(torch.max(actions_value, 1)[1], 1)
        next_q = self.qnetwork_target.forward(next_states).gather(
            1, next_action)
        Q_targets = rewards + GAMMA * next_q * (1 - dones)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #22
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=42,
                 hidden_layers=[32, 8]):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE, self.device, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_step, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_step, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % UPDATE_EVERY == 0:
            if self.memory.length > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, next_states, rewards, dones = experiences

        self.qnetwork_target.eval()
        with torch.no_grad():
            # Q values of the next states under the local network
            Q_local_next = self.qnetwork_local(next_states)
            # greedy action index per next state, then its Q value
            action_argmax = torch.max(Q_local_next, dim=1, keepdim=True)[1]
            Q_max_local_next = Q_local_next.gather(1, action_argmax)

            # note: the target network's values are computed here but never used;
            # this variant bootstraps from the local network instead
            Q_targets_next = self.qnetwork_target(next_states)

            # TD target for the current states (only non-terminal transitions bootstrap)
            targets = rewards + gamma * Q_max_local_next * (1 - dones)
        self.qnetwork_target.train()

        expected = self.qnetwork_local(states).gather(1, actions)
        loss = torch.sum((expected - targets)**2)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def train(self,
              env,
              brain_name,
              n_episodes=2000,
              timesteps=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        '''
        train the model network applying experience replay
        Params
        ======
            agent (Agent): agent that interacts with the environment
            n_episodes (int): number of games played
            timesteps (int): max number of steps to be played in the game
            eps_start (float): initial proportion of random actions in epsilon-greedy action selection
            eps_end (float): final proportion of random actions in epsilon-greedy action selection
            eps_decay (float): epsilon decay rate
        '''
        scores = []
        last_scores = deque(maxlen=100)
        eps = eps_start
        for i_episode in range(n_episodes):
            env_status = env.reset(train_mode=True)[brain_name]
            state = env_status.vector_observations[0]  #get state
            score = 0
            for _ in range(timesteps):
                action = self.act(state, eps).astype(int)
                env_status = env.step(action)[brain_name]
                next_state = env_status.vector_observations[0]
                reward = env_status.rewards[0]
                done = env_status.local_done[0]
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores.append(score)
            last_scores.append(score)
            eps = max(eps_end, eps * eps_decay)  #decreases epsilon
            print('\rEpisode {}\tScores mean: {:.2f}'.format(
                i_episode, np.mean(last_scores)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tLast 100 scores mean: {:.2f}'.format(
                    i_episode, np.mean(last_scores)))
            if np.mean(last_scores) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tScores mean: {:.2f}'
                    .format(i_episode - 100, np.mean(last_scores)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores
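
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the epsilon schedule used
# in train(), eps = max(eps_end, eps * eps_decay) per episode, is the same as
# eps_start * eps_decay**i clipped at eps_end. A quick check with the defaults.
toy_eps_start, toy_eps_end, toy_eps_decay = 1.0, 0.01, 0.995
toy_eps = toy_eps_start
for toy_episode in range(1, 1001):
    toy_eps = max(toy_eps_end, toy_eps * toy_eps_decay)
    if toy_episode in (100, 500, 1000):
        print(toy_episode, round(toy_eps, 4))
# roughly 0.6058 after 100 episodes, 0.0816 after 500, 0.01 (the floor) after 1000
# ---------------------------------------------------------------------------
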
Example #23
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values for next states from the target model (frozen weights)
        #
        #    next_states is 64x8
        #    self.qnetwork_target(next_states) is 64x4
        #    detach() returns a tensor copy detached from the graph (no gradient)
        #    max(1)[0] returns the max values along the given dim (max(1)[1] would give their indices)
        #    => This returns an array of 64 values
        #    unsqueeze(1) inserts a dimension of size one at the given position
        #    => This returns a 64x1 tensor
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (being trained)
        # x.gather(1, actions) selects, along dim 1, the Q value corresponding to the action
        # actually taken in each sampled transition (output shape: batch_size x 1)
        Q_expected = self.qnetwork_local(states).gather(1, actions)
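        # e.g., if a row of the local network output is [q0, q1, q2, q3] and the matching
        # entry of actions is 2, gather picks q2, so Q_expected has shape (batch_size, 1).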

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #24
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # ------------------- train with mini-batch sample of experiences ------------------- #
        if len(self.memory) > BATCH_SIZE:
            # If enough samples are available in memory, get random subset and learn
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
        
        # ------------------- update target network ----------------------------------------- #
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:             
            # If C (UPDATE_EVERY) steps have been reached, blend weights to the target network
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
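            # Note: unlike the other variants in this listing, this agent learns on every step
            # (once the buffer holds a batch) and uses UPDATE_EVERY only to schedule the soft
            # update of the target network.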

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        # - qnetwork_target : apply forward pass to the whole mini-batch
        # - detach : do not backpropagate through the target network
        # - max(1) : take the maximum over the action dimension (dim=1) for each sample
        # - [0].unsqueeze(1) : keep the max values and reshape them to a (batch_size, 1) column
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states (y)
        # - dones : detect if the episode has finished
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (Q(Sj, Aj, w))
        # - gather : for each sample select only the output value for action Aj
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Optimize over (yj-Q(Sj, Aj, w))^2
        # * compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # * minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()                            

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #25
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, mode='DQN'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        """ Set Tuning and Hyperparameters """

        self.mode = mode

        self.losses = []

        self.ddqn_enabled = False
        self.ddqn_counter = 0

        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        print("Parameters = {}".format(self.qnetwork_local.parameters()))
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def process_action(self, value):
        L = [
            np.array([1, 0, 0]),
            np.array([-1, 0, 0]),
            np.array([0, 1, 0]),
            np.array([0, 0, 1])
        ]
        return L[value]
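        # Presumably maps the discrete action index returned by act() to the continuous
        # control vector the environment expects, e.g.
        #   env.step(self.process_action(self.act(state, eps)))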

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get predicted Q values (for next states) from the target model

        if self.mode == 'DQN':
            # Vanilla DQN: take the maximum target-network Q value over next actions
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        if self.mode == 'DDQN':
            # Double DQN: select the greedy next action with the local network,
            # then evaluate that action with the target network
            argmax_actions_locals_next = self.qnetwork_local(
                next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, argmax_actions_locals_next)
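        # Decoupling action selection (local network) from action evaluation (target network)
        # is what reduces the overestimation bias of the plain max operator in Double DQN
        # (van Hasselt et al., 2016).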

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.losses.append(float(loss))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #26
class Agent():
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        # the state size and action size will be used to generate the Q Network
        self.state_size = state_size
        self.action_size = action_size
        # random.seed(seed) seeds the random number generator; reusing the same initial
        # seed reproduces the same sequence of random numbers
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Replay_Buffer(action_size, Buffer_Size, Batch_Size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def select_act(self, state, eps=0.):

        " selects action based on state and epsilon"

        # get the state array from env, convert to tensor

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # unsqueeze(0) adds a singleton batch dimension at position 0
        # (the network expects batched input)
        # to(device) moves the tensor to the target device memory, cpu or cuda

        ## put network in eval mode
        self.qnetwork_local.eval()

        # Forward pass to get the action values; the index of the maximum is the greedy action.
        # torch.no_grad() disables gradient tracking for this block (no backward pass needed).
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return np.random.randint(self.action_size)  # select a random action

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_next_states = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # detach returns a new tensor detached from the current graph
        # the output layer is (batch_size, action_size), i.e. (64, 4); max(1) finds the max along the second dim
        # the resulting tensor is (64,), so we add a singleton dimension back with unsqueeze(1)
        # Q_next_states holds the max action value over the four actions for each of the 64 next states

        Q_target = rewards + (gamma * Q_next_states * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # gather selects, along dim 1 of the (64, 4) local Q output, the value at the index given
        # by the actions tensor, i.e. the Q value of the action actually taken in each sampled
        # transition, so the output is (64, 1).

        # These expected values from qnetwork_local are compared with Q_target to compute the loss,
        # and the local network parameters are then updated to minimize that loss.

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (
            self.t_step + 1
        ) % UPDATE_EVERY  # self.t_step will increase by 1 after every step() call
        # that means every time step
        if self.t_step == 0:
            if len(self.memory) > Batch_Size:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        #self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR, momentum=0.95)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences, idxs, ws = self.memory.sample()
                self.learn(experiences, idxs, ws, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, idxs, ws, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Q values of the taken actions, from the local network (the one being trained).
        # Note: despite its name, next_action_values_local holds Q(s, a) for the current states.
        next_action_values_local = self.qnetwork_local(states).gather(1, actions)
        # Double DQN change: select the maximizing next actions with the local network and
        # evaluate them with the target network; these values are used to build the target below.
        local_max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        next_action_values_target = self.qnetwork_target(next_states).detach().gather(1, local_max_actions)
        
        
        y = rewards + (gamma * next_action_values_target*(1 - dones))
        # The local network is updated; the target network provides the (fixed) targets
        ws = torch.from_numpy(ws.astype(float)).float().to(device)
        loss = F.mse_loss(ws*next_action_values_local, ws*y)
        errors = np.abs(y.cpu().data.numpy() - next_action_values_local.cpu().data.numpy())
        self.memory.memory.update_batch(idxs, errors)
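        # The importance-sampling weights ws correct the bias introduced by prioritized
        # (non-uniform) sampling, and the absolute TD errors are written back as updated priorities.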

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        # Copy from local to target network parameters
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def adjust_learning_rate(self, episode, val):
        print("adjusting learning rate!")
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = val
Example #28
class Agent:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """
        Returns action for given state as per current policy
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values for next states from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDQNPERAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 tor_dstate,
                 srpt_pens,
                 lrn_rate,
                 hsize1,
                 hsize2,
                 seed=0):
        """Initialize a DDQN Agent object with PER (Prioritized Experience Replay) support.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            tor_dstate (float): tolerance for deciding whether two states are the same
            srpt_pens (array_like): penalty (negative reward) values for undesirable actions
            lrn_rate (float): learning rate for Q-Network training
            hsize1 (int): size of the first hidden layer of the Q-Network
            hsize2 (int): size of the second hidden layer of the Q-Network 
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.tor_dstate = tor_dstate
        self.srpt_pens = srpt_pens
        self.lrn_rate = lrn_rate

        self.hsize1 = hsize1
        self.hsize2 = hsize2

        self.seed = seed
        if seed is not None: random.seed(seed)

        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        self.aug_state_size = state_size + len(srpt_pens) * action_size
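        # e.g., with state_size=37, action_size=4 and two penalty values, the augmented state has
        # 37 + 2*4 = 45 entries; each appended block of 4 one-hot flags marks the action to be
        # penalized at that repeat-detection distance.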

        # Set up Q-Networks.
        self.qnetwork_local = QNetwork(self.aug_state_size, action_size,
                                       hsize1, hsize2, seed).to(device)
        self.qnetwork_local.initialize_weights(
        )  # initialize network with random weights
        self.qnetwork_target = QNetwork(self.aug_state_size,
                                        action_size,
                                        hsize1,
                                        hsize2,
                                        seed=None).to(device)
        self.qnetwork_target.update_weights(
            self.qnetwork_local)  # copy network weights to target network
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=lrn_rate)

        # Store trained Q-model when the environment is solved.
        self.qnetwork_solved = None

        # Set up experience replay memory.
        self.ebuffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize interval steps.
        self.l_step = 0  # for learning every LEARN_EVERY time steps
        self.t_step = 0  # for updating target network every UPDATE_EVERY learnings

    def reset_episode(self, state, srpt_det=0):
        """Re-initialize buffers after environment reset for a new episode.
        
        Params
        ======
            state (array_like): initial state after environment reset
            srpt_det (int): number of repeated state types to be checked for post-processing
        """
        self.srpt_det = 0
        if len(self.srpt_pens) == 0:
            # State repeat detection for post-processing is active only when state repeat penalty option is off.
            self.srpt_det = srpt_det
        else:
            # This is used to signal self.step() hasn't been run yet.
            self.next_aug_state = None

        if len(self.srpt_pens) > 0 or self.srpt_det > 0:
            self.state_buffer = deque(maxlen=2)
            buffer_size = 2 * (max(len(self.srpt_pens), self.srpt_det) - 1)
            self.smsta_buffer = deque(maxlen=max(2, buffer_size))

            # The initial state will be pushed to the buffer again and be compared to this state in the process of
            # selecting the first action. So add 1 to the initial state here to ensure the states are different
            # enough for the first comparison.
            self.state_buffer.append(np.array(state) + 1)

            # Any position and orientation can be the initial simulated state here. It is like putting in a
            # coordinate system (origin and x-direction) for a 2-D plane and all the other simulated states
            # in the episode will be specified based on this reference coordinate system.
            self.smsta_buffer.append((np.array([0, 0]), 0))

    def step(self, state, action, reward, next_state, done):
        """Update replay memory and parameters of Q-Network by training.
        
        Params
        ======
            state (array_like): starting state of the step
            action (int): action performed in the step
            reward (float): reward from the action
            next_state (array_like): resulting state of the action in the step
            done (bool): indicator for whether next_state is terminal (i.e., end of episode) or not
        """
        if len(self.srpt_pens) > 0:
            # Augment state vector and modify reward using state repeat penalty values.
            self.state_buffer.append(np.array(next_state))
            self.next_aug_state = self.augment_state(next_state)
            state = self.aug_state
            next_state = self.next_aug_state
            reward = self.modify_reward(reward, state, action)

        # Save experience in replay memory.
        self.ebuffer.add(state, action, reward, next_state, done)

        # Learn every LEARN_EVERY steps after memory reaches batch_size.
        if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
            self.l_step += 1
            self.l_step %= LEARN_EVERY
            if self.l_step == 0:
                experiences, weights = self.ebuffer.sample()
                self.learn(experiences, weights, GAMMA)

    def augment_state(self, state):
        """Augment state vector to penalize undesirable actions.
        
        Params
        ======
            state (array_like): original state vector to be augmented
        Returns
        ======
            aug_state (numpy.ndarray): augmented state vector
        """
        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        aug_state = np.concatenate(
            (state, np.zeros((len(self.srpt_pens) * self.action_size, ))))

        # Detect the situation where the two preceding observed states (not augmented) are essentially
        # the same, which indicates the agent is either stuck at a wall or in some kind of undesirable
        # blind spot. The next action to avoid (i.e., to be penalized) is the one that would keep the
        # agent stuck or in the blind spot.
        avoid_action = self.get_avoid_action()
        if avoid_action != ACT_INVALID:
            aug_state[self.state_size + avoid_action] = 1
        if avoid_action != ACT_INVALID or len(self.srpt_pens) == 1:
            return aug_state

        # If agent is not stuck or in blind spot and there are more penalty values, continue to check
        # state repeats separated by more than two actions. Assuming NUM_ORIS is even, states separated
        # by odd number of actions won't repeat. So only even number of actions needs to be checked.
        for action in range(self.action_size):
            nxt_sta = self.sim_step(action)
            for act_cnt in range(2, 2 * len(self.srpt_pens), 2):
                if self.is_state_repeated(act_cnt, nxt_sta):
                    aug_state[self.state_size +
                              (act_cnt // 2) * self.action_size +
                              action] = 1  # signal undesirable action
                    break

        return aug_state

    def modify_reward(self, reward, aug_state, action):
        """Modify reward to penalized undesirable action.
        
        Params
        ======
            reward (float): original reward
            aug_state (numpy.ndarray): augmented state vector
            action (int): action performed
        Returns
        ======
            reward (float): modified reward
        """
        # Penalize undesirable action when it doesn't earn a reward or cause a penalty. If it earns a positive
        # reward or causes a more negative reward, leave the reward unchanged.
        if reward <= 0:
            for i, penalty in enumerate(self.srpt_pens):
                if aug_state[self.state_size + i * self.action_size +
                             action] > 0:  # action is undesirable
                    reward = min(reward, penalty)
                    break
        return reward

    def sim_step(self, action):
        """Advance simulated state (position and orientation) for one step by the action.
        
        Params
        ======
            action (int): action to advance the simulated state
        Returns
        ======
            pos, ori (numpy.ndarray, int): resulting simulated state
        """
        # An action can either be a move or turn (but not both) with the type of actions (including non-actions)
        # identified by the action code.
        pos, ori = self.smsta_buffer[-1]
        act_code = ACT_CODES[action]
        pos = pos + act_code[0] * ORIVEC_TABLE[ori]
        ori = (ori + act_code[1]) % NUM_ORIS
        return pos, ori
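        # For example, assuming ACT_CODES encodes a forward move as (1, 0) and a turn as (0, ±1),
        # a move advances pos one unit along ORIVEC_TABLE[ori] and leaves ori unchanged, while a
        # turn leaves pos fixed and steps ori around the NUM_ORIS discrete headings.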

    def is_state_repeated(self, act_cnt, nxt_sta):
        """Check whether the next state repeats the past state separated by the specified number of actions.
        
        Params
        ======
            act_cnt (int): number of actions separating the past state to be checked and the next state
            nxt_sta (numpy.ndarray, int): next state resulting from an action
        Returns
        ======
            repeated (bool): indicator for repeated state
        """
        repeated = False
        if act_cnt <= len(self.smsta_buffer):
            chk_sta = self.smsta_buffer[-act_cnt]  # past state to be checked
            if chk_sta[1] == nxt_sta[1]:
                if np.linalg.norm(nxt_sta[0] - chk_sta[0]) <= self.tor_dstate:
                    repeated = True
        return repeated

    def act(self, state, eps=0.0):
        """Select action for given state as per epsilon-greedy current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for adjusting epsilon-greedy action selection
        Returns
        ======
            action (int): the chosen action
        """
        # If the agent is in testing mode, self.step() won't be invoked and some of the operations done there
        # need to be done here.
        if (len(self.srpt_pens) > 0
                and self.next_aug_state is None) or self.srpt_det > 0:
            # Push the current state into the state buffer for comparison with the previous state,
            # if it has not already been pushed by self.step() during agent training.
            self.state_buffer.append(np.array(state))

        if len(self.srpt_pens) > 0:
            if self.next_aug_state is None:
                self.aug_state = self.augment_state(state)
            else:
                self.aug_state = self.next_aug_state
            state = self.aug_state

        if self.srpt_det == 0:  # no checking for repeated states (observed or simulated)
            # Randomly select action.
            action = random.choice(np.arange(self.action_size))

            # Epsilon-greedy action selection.
            if random.random() >= eps:
                state = torch.from_numpy(state).float().to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action = self.qnetwork_local(
                        state).squeeze().argmax().cpu().item()

            if len(self.srpt_pens) > 0:
                # Update simulated state buffer with result of chosen action.
                nxt_sta = self.sim_step(action)
                self.smsta_buffer.append(nxt_sta)

            return action

        # This is the implementation of the post-processing of the epsilon-greedy policy to avoid repeated
        # states within a short series of actions. This option is set in self.reset_episode() for each episode
        # and is only active when the option of penalizing undesirable actions, which is set for the class
        # object, is disabled (len(self.srpt_pens) == 0). To accommodate the post-processing of the selected
        # actions, the random policy is modified to randomly assign rankings to all the available actions.

        # Randomly assign rankings to action candidates.
        ranked_actions = np.random.permutation(self.action_size)

        # Epsilon-greedy action selection.
        if random.random() >= eps:
            state = torch.from_numpy(state).float().to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                neg_act_qvals = -self.qnetwork_local(state).squeeze()
            ranked_actions = neg_act_qvals.argsort().cpu().numpy().astype(int)

        # Post-process ranked action candidates to remove undesirable action.
        avoid_action = self.get_avoid_action()
        action = self.select_nosrpt_action(avoid_action, ranked_actions)

        return action

    def get_avoid_action(self):
        """Avoid action that will keep the agent stucked or in a blind spot. 
        
        Returns
            avoid_action (int): next action to avoid
        """
        avoid_action = ACT_INVALID  # used to signal the agent is not stuck or in a blind spot
        if np.linalg.norm(self.state_buffer[1] -
                          self.state_buffer[0]) <= self.tor_dstate:
            sim_sta0 = self.smsta_buffer[-2]
            sim_sta1 = self.smsta_buffer[-1]
            if sim_sta0[1] == sim_sta1[1]:  # action is not a turn, so it must be a move
                # Agent is stuck at a wall.
                dpos = sim_sta1[0] - sim_sta0[0]
                mcode = np.around(np.dot(
                    dpos, ORIVEC_TABLE[sim_sta0[1]])).astype(int)  # dot(mcode*(cos, sin), (cos, sin)) = mcode
                avoid_action = AVOID_MOVE_TABLE[mcode + 1]
                # It is reasonable to backtrack to get unstuck, so keep only the last state (the one
                # the agent is stuck in) as the new reference; as a reference, it can be any state.
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
            else:  # action is a turn
                # Agent is in a blind spot (turned, but observed the same state).
                tcode = sim_sta1[1] - sim_sta0[1]
                avoid_action = AVOID_TURN_TABLE[(tcode + 1) % NUM_ORIS]
                # It is reasonable to backtrack to get out of the blind spot, so keep only the
                # last two states, which represent the blind spot, as the new reference.
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
                self.smsta_buffer.append(sim_sta1)
        return avoid_action

    def select_nosrpt_action(self, avoid_action, ranked_actions):
        """Select action that avoids repeated state (i.e., loops) by a short series of actions.
        
        Params
        ======
            avoid_action (int): action to avoid if agent is stuck or in blind spot
            ranked_actions (array like): action candidates ranked by decreasing Q-values
        Returns
        ======
            action (int): the selected action
        """
        action = ranked_actions[0]
        if action == avoid_action: action = ranked_actions[1]
        nxt_sta = self.sim_step(action)

        # If a repeated observed state from a single action was detected (signaled by avoid_action != ACT_INVALID),
        # the action selected to avoid the repeated state is used, since it is more important to free an agent
        # that is stuck or in a blind spot than to look further back for repeated simulated states. So the check
        # for simulated states repeated after 2 or more actions only occurs when avoid_action == ACT_INVALID.
        if avoid_action == ACT_INVALID and self.srpt_det > 1:
            act_heapq = []
            for action in ranked_actions:
                nxt_sta = self.sim_step(action)
                for act_cnt in range(
                        2, 2 * self.srpt_det, 2
                ):  # assuming NUM_ORIS is even, only check even number of actions
                    if self.is_state_repeated(act_cnt, nxt_sta):
                        # Simulated state repeated, go checking next action.
                        heapq.heappush(act_heapq, [-act_cnt, action, nxt_sta])
                        break
                else:
                    # No repeated state detected, action is found.
                    break
            else:
                # No action can satisfy all the no repeated state conditions, select the action that repeats the
                # state separated by most actions (i.e., long loop is more acceptable than short loop).
                action, nxt_sta = heapq.heappop(act_heapq)[1:]

        # Update the simulated state buffer with the result of the chosen action.
        self.smsta_buffer.append(nxt_sta)
        return action

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple (s, a, r, s', done) of batched experience data
            is_weights (torch.Tensor): importance sampling weights for the batched experiences
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN method for obtaining target Q-values.
        self.qnetwork_local.eval()
        with torch.no_grad():
            maxq_actions = self.qnetwork_local(next_states).max(
                1)[1].unsqueeze(1)
            qouts_next_states = self.qnetwork_target(next_states).gather(
                1, maxq_actions).squeeze()
        qouts_target = rewards + gamma * qouts_next_states * (1 - dones)

        # Obtain current Q-values and its difference from the target Q-values.
        self.qnetwork_local.train()
        qouts_states = self.qnetwork_local(states).gather(1, actions).squeeze()
        delta_qouts = qouts_states - qouts_target

        # Calculate the importance-sampling-weighted sum of squared TD errors.
        wsqr_loss = is_weights * delta_qouts**2  # weighted squared loss
        loss_sum = wsqr_loss.sum()

        # Update model parameters by minimizing the loss sum.
        self.optimizer.zero_grad()
        loss_sum.backward()
        self.optimizer.step()

        # Update priorities of the replay memory.
        neg_prios = -torch.abs(delta_qouts.detach())
        self.ebuffer.update_priorities(neg_prios.cpu().numpy())

        # Update target network.
        self.t_step += 1
        self.t_step %= UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.update_weights(self.qnetwork_local, TAU)

    def update_beta(self, beta):
        """Update importance sampling weights for memory buffer with new Beta.

        Params
        ======
            beta (float): new Beta value
        """
        if beta != self.ebuffer.beta:
            self.ebuffer.beta = beta
            if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
                self.ebuffer.update_is_weights()

    def copy_solved_qnet(self):
        """Copy current local Q-Network to solved Q-Network while local Q-Network will continue the training."""
        if self.qnetwork_solved is None:
            self.qnetwork_solved = QNetwork(self.aug_state_size,
                                            self.action_size,
                                            self.hsize1,
                                            self.hsize2,
                                            seed=None).to(device)
        self.qnetwork_solved.update_weights(
            self.qnetwork_local
        )  # copy local network weights to solved network

    def save_qnet(self, model_name):
        """Save Q-Network parameters into file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # Save CPU version since it can be used with or without GPU.
        if self.qnetwork_solved is not None:
            torch.save(self.qnetwork_solved.cpu().state_dict(),
                       model_name + '.pth')
            self.qnetwork_solved = self.qnetwork_solved.to(device)
        else:
            torch.save(self.qnetwork_local.cpu().state_dict(),
                       model_name + '.pth')
            self.qnetwork_local = self.qnetwork_local.to(device)

    def load_qnet(self, model_name):
        """Load Q-Network parameters from file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # The saved QNetwork is always the CPU version.
        qnetwork_loaded = QNetwork(self.aug_state_size,
                                   self.action_size,
                                   self.hsize1,
                                   self.hsize2,
                                   seed=None)
        qnetwork_loaded.load_state_dict(torch.load(model_name + '.pth'))
        self.qnetwork_local.update_weights(qnetwork_loaded.to(
            device))  # copy loaded network weights to local network
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute and minimize the loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #31
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 double_dqn=False,
                 dueling=False,
                 per=False,
                 per_args=(0.2, 0.01, 2e-5)):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): whether to implement Double DQN (default=False)
            dueling (bool): whether to implement Dueling DQN
            per (bool): whether to implement Prioritized Experience Replay
            per_args (tuple): a,beta,beta_increment for PER
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.per = per
        self.gamma = GAMMA

        # output name for checkpoint
        self.output_name = ''
        self.output_name += '_double' if double_dqn else ''
        self.output_name += '_dueling' if dueling else ''
        self.output_name += '_per' if per else ''

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       dueling=dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        dueling=dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, *per_args)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
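        # A hypothetical construction and training call for this variant, e.g.:
        #   agent = Agent(state_size=37, action_size=4, double_dqn=True, dueling=True, per=True)
        #   scores, avg_scores = agent.train(env)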

    def train(self,
              env,
              n_episodes=1000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Deep Q-Learning.

        Params
        ======
            env (UnityEnvironment): Bananas environment
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        # list containing scores from each episode
        scores = []
        # list containing window averaged scores
        avg_scores = []
        # last 100 scores
        scores_window = deque(maxlen=100)
        # initialize epsilon
        eps = eps_start
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                # get the next state
                next_state = env_info.vector_observations[0]
                # get the reward
                reward = env_info.rewards[0]
                # see if episode has finished
                done = env_info.local_done[0]
                self.step((state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break
            # save most recent score
            scores_window.append(score)
            scores.append(score)
            avg_scores.append(np.mean(scores_window))
            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(),
                           f'./checkpoints/checkpoint{self.output_name}.pth')
                break
        return scores, avg_scores

    def step(self, experience):
        """Save experience in replay memory and learn.
        
        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using given batch of experience tuples.
        """
        # if using PER
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = self.memory.sample()

        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # if Double DQN
        if self.double_dqn:
            # Get predicted Q values (for next actions chosen by local model) from target model
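            # Double DQN: the local network selects the next action and the
            # target network evaluates it, which reduces the overestimation
            # bias of the plain max operator.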
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(
                    1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, next_actions)

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
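        # (1 - dones) zeroes the bootstrap term for terminal transitions, so
        # the target reduces to the immediate reward at the end of an episode.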
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
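        # With PER, each sample's squared TD error is scaled by its
        # importance-sampling weight to correct the bias introduced by
        # non-uniform (prioritized) sampling.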
        if self.per:
            is_weights = torch.FloatTensor(is_weights).reshape(-1, 1).to(device)
            loss = (is_weights *
                    F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if PER, update priority
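        # New priorities are derived from the absolute TD errors, so
        # transitions with larger errors are replayed more often later on.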
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
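

# --------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original agent). The Banana
# executable path, the 37/4 state/action sizes, and the Agent constructor
# arguments below are assumptions; adjust them to match the __init__ defined
# above and your local Unity build.
# --------------------------------------------------------------------------
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from unityagents import UnityEnvironment

    # Hypothetical path to the Unity Banana build.
    env = UnityEnvironment(file_name='./Banana_Linux/Banana.x86_64')

    # Hypothetical constructor call; pass whichever extension flags
    # (double DQN, dueling, PER) your __init__ actually exposes.
    agent = Agent(state_size=37, action_size=4, seed=0)

    # Train and collect per-episode scores plus the 100-episode moving average.
    scores, avg_scores = agent.train(env, n_episodes=1800, eps_decay=0.995)
    env.close()

    # Plot the learning curve.
    plt.plot(scores, label='score')
    plt.plot(avg_scores, label='100-episode average')
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.legend()
    plt.show()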