class Agent():
    """Interacts with and learns from the environment"""

    def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
        """DQN agent.

        Args:
          state_size (int): dimension of each state
          action_size (int): dimension of each action (or the number of action choices)
          fc1_units (int): number of units in the first hidden layer
          fc2_units (int): number of units in the second hidden layer
          device (torch.device): device on which the Q-networks are placed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize qnetwork_target parameters to qnetwork_local (tau=1 makes this a full copy)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

        # Initialize the time step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
          state (array_like): current state
          eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Set qnetwork_local to evaluation mode
        self.qnetwork_local.eval()

        # This operation should not be included in gradient calculation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Set back qnetwork_local to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
          experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
          gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states from the observed rewards
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update the target network -----
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
          local_model (torch.nn.Module): weights will be copied from
          target_model (torch.nn.Module): weights will be copied to
          tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
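A minimal usage sketch for the agent above, assuming a Gym-style environment with the classic (next_state, reward, done, info) step API; the environment, episode counts, and epsilon schedule are illustrative, not part of the original code.

# --- Usage sketch (assumes a Gym-style env; hyperparameters are illustrative) ---
def train(agent, env, n_episodes=500, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store and (maybe) learn
            state = next_state
            score += reward
            if done:
                break
        eps = max(eps_end, eps_decay * eps)                       # decay exploration
        scores.append(score)
    return scores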
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network (moved to the same device the states are sent to in act())
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss (standard DQN update)
        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #3
class Agent:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.q_local = QNetwork(state_size, action_size, seed)
        self.q_target = QNetwork(state_size, action_size, seed)
        self.optimizer = optim.Adam(self.q_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_size = 0

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.t_size = (self.t_size + 1) % UPDATE_EVERY
        if self.t_size == 0:
            if len(self.memory) > BATCH_SIZE:
                e = self.memory.sample()
                self.learn(e)

    def act(self, state, epsilon):
        state = torch.from_numpy(state).float().unsqueeze(0)  # Convert the state to a batched tensor
        self.q_local.eval()  # Set q_local to evaluation mode
        # (equivalent to q_local.train(False))
        with torch.no_grad():  # Action values are not part of any gradient computation
            action_values = self.q_local(state)
        self.q_local.train()  # Set q_local back to training mode

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences

        # TD target (Double DQN): select the best next actions with the local
        # network and evaluate them with the (detached) target network
        best_actions = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
        evaluations = self.q_target(next_states).detach().gather(1, best_actions)
        Q_target = rewards + evaluations * gamma * (~dones)

        # Currently predicted Q value
        Q_expected = self.q_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.q_local, self.q_target)

    def soft_update(self, local_model, target_model, tau=TAU):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, lr_decay=0.985):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay (float): multiplicative decay factor for the learning-rate scheduler
        """
        print("Running on: " + str(device))

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if USING_DUELING:
            self.qnetwork_local = DuelQNetwork(state_size, action_size,
                                               seed).to(device)
            self.qnetwork_target = DuelQNetwork(state_size, action_size,
                                                seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if USING_DOUBLE_DQN:
            # Select the actions with the highest Q-value using the local model
            next_actions_local = self.qnetwork_local(next_states).detach().max(
                dim=1, keepdim=True)[1]
            # Evaluate the selected actions with the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, next_actions_local)

        else:
            # Max predicted values for the next state from the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Q expected values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimizing loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
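The dueling variant above relies on a DuelQNetwork defined elsewhere; as an assumption, a minimal sketch of such a dueling head (layer sizes and class name are illustrative) shows the decomposition Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) it is usually built on.

# --- Minimal dueling-head sketch (assumption: the real DuelQNetwork lives elsewhere) ---
import torch
import torch.nn as nn

class DuelQNetworkSketch(nn.Module):
    def __init__(self, state_size, action_size, seed, fc_units=64):
        super().__init__()
        torch.manual_seed(seed)
        self.feature = nn.Sequential(nn.Linear(state_size, fc_units), nn.ReLU())
        self.value = nn.Linear(fc_units, 1)                 # state value V(s)
        self.advantage = nn.Linear(fc_units, action_size)   # advantages A(s, a)

    def forward(self, state):
        x = self.feature(state)
        value = self.value(x)
        advantage = self.advantage(x)
        # Subtract the mean advantage so V and A remain identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)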
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, double_dqn=True):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            double_dqn (bool): use the Double DQN target instead of the vanilla DQN target
        """
        states, actions, rewards, next_states, dones = experiences

        if double_dqn:
            # ---------------
            #   double DQN
            # ---------------

            # get the Q values for best actions in observations
            # based off the current Q network
            # max(Q(s', a', theta_i)) wrt a'
            Q_local_values = self.qnetwork_local(next_states).detach()
            _, a_prime = Q_local_values.max(1)

            # get Q values from frozen network (i.e. target network) for next state and chosen action
            # Q(s',argmax(Q(s',a', theta_i), theta_i_frozen)) (argmax wrt a')
            Q_target_values = self.qnetwork_target(next_states).detach()
            Q_target_s_a_prime = Q_target_values.gather(
                1, a_prime.unsqueeze(1))
            #Q_target_s_a_prime = Q_target_s_a_prime.squeeze()
            #print('Q_target_s_a_prime', Q_target_s_a_prime.size())

            # Compute Q targets for next states
            Q_target_s_a_prime = rewards + (gamma * Q_target_s_a_prime *
                                            (1 - dones))
            #print('Q_target_s_a_prime2', Q_target_s_a_prime.size())

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)
            #print('Q_expected', Q_expected.size())

            # Compute loss
            loss = F.mse_loss(Q_expected, Q_target_s_a_prime)
            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        else:
            # ---------------
            #   regular DQN
            # ---------------
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
            #print('Q_targets_next', Q_targets_next.size())

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
            #print('Q_targets', Q_targets.size())

            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states).gather(1, actions)
            #print('Q_expected', Q_expected.size())
            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
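The learn() method above switches between the vanilla DQN target and the Double DQN target; the following self-contained sketch computes both on a synthetic batch (all tensors and sizes are made up) to make the difference concrete.

# --- Sketch: vanilla vs. Double DQN targets on a synthetic batch ---
import torch

batch, n_actions, gamma = 4, 3, 0.99
q_local_next = torch.randn(batch, n_actions)    # stands in for qnetwork_local(next_states)
q_target_next = torch.randn(batch, n_actions)   # stands in for qnetwork_target(next_states)
rewards = torch.randn(batch, 1)
dones = torch.zeros(batch, 1)

# Vanilla DQN: the target network both selects and evaluates the next action.
dqn_next = q_target_next.max(1)[0].unsqueeze(1)
dqn_target = rewards + gamma * dqn_next * (1 - dones)

# Double DQN: the local network selects the action, the target network evaluates it.
a_prime = q_local_next.max(1)[1].unsqueeze(1)
ddqn_next = q_target_next.gather(1, a_prime)
ddqn_target = rewards + gamma * ddqn_next * (1 - dones)

print(dqn_target.squeeze().tolist(), ddqn_target.squeeze().tolist())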
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 lr_decay=0.9999,
                 double_dqn=False,
                 duel_dqn=False,
                 prio_exp=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay (float): decay factor for the learning rate
            double_dqn (bool): use a Double DQN target
            duel_dqn (bool): use a dueling Q-network architecture
            prio_exp (bool): use prioritized experience replay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.lr_decay = lr_decay
        self.DOUBLE_DQN = double_dqn
        self.DUEL_DQN = duel_dqn
        self.PRIORITISED_EXPERIENCE = prio_exp

        # Determine Deep Q-Network for use
        if self.DUEL_DQN:
            self.qnetwork_local = DuelQNetwork(state_size, action_size,
                                               seed).to(device)
            self.qnetwork_target = DuelQNetwork(state_size, action_size,
                                                seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)

        # Initialize Optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Determine if Prioritized Experience will be used
        if self.PRIORITISED_EXPERIENCE:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_anneal=1.0001)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        if self.PRIORITISED_EXPERIENCE:
            states, actions, rewards, next_states, dones, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        if self.DOUBLE_DQN:
            # Select max Action for Next State from Local NN
            max_action = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            # Evaluate max Action with Target NN
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, max_action)
        else:
            # Get Max Predicted Q values for next state from Target NN
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Predicted Q values for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get Expected Q values from Local NN
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.PRIORITISED_EXPERIENCE:
            td_error = (Q_expected - Q_targets).squeeze_()  # Compute TD Error
            td_error_detached = td_error.detach()

            self.memory.update_probabilities(
                td_error_detached)  # Update Probabilities

            loss = ((td_error**2) * weights).mean()  # Compute Weighted Loss
        else:
            loss = F.mse_loss(Q_expected, Q_targets)  # Compute Loss

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- Update Target Network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
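The prioritized-replay branch above weights each squared TD error by an importance-sampling weight; the PrioritizedReplayBuffer itself is defined elsewhere. A small sketch on synthetic tensors shows that this weighted loss coincides with F.mse_loss whenever every weight equals 1.

# --- Sketch: importance-sampling-weighted loss on synthetic tensors ---
import torch
import torch.nn.functional as F

q_expected = torch.randn(5, 1)
q_targets = torch.randn(5, 1)
td_error = (q_expected - q_targets).squeeze()

uniform_weights = torch.ones(5)
weighted_loss = ((td_error ** 2) * uniform_weights).mean()
plain_mse = F.mse_loss(q_expected, q_targets)
assert torch.isclose(weighted_loss, plain_mse)   # identical when all weights are 1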
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(
                state)  # same as self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        # "*** YOUR CODE HERE ***"
        qs_local = self.qnetwork_local.forward(states)
        qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long),
                             actions.reshape(BATCH_SIZE)]
        qsa_local = qsa_local.reshape((BATCH_SIZE, 1))
        # print(qsa_local.shape)

        # # DQN Target
        # qs_target = self.qnetwork_target.forward(next_states)
        # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target
        # #print(qsa_target.shape, TD_target.shape, rewards.shape)

        # # Double DQN Target ver 1
        # qs_target = self.qnetwork_target.forward(next_states)
        # if random.random() > 0.5:
        #     _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        #     qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)]
        # else:
        #     _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning)
        #     #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]
        #     ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]

        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target

        # Double DQN Target ver 2 (based upon double dqn paper)
        qs_target = self.qnetwork_target.forward(next_states)
        _, qsa_local_argmax_a = torch.max(
            qs_local, dim=1)  # using the greedy policy (q-learning)
        qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long),
                               qsa_local_argmax_a.reshape(BATCH_SIZE)]

        qsa_target = qsa_target * (
            1 - dones.reshape(BATCH_SIZE)
        )  # target qsa value is zero when episode is complete
        qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        TD_target = rewards + gamma * qsa_target

        # print(qsa_target.shape, TD_target.shape, rewards.shape)

        # #Udacity's approach
        # # Get max predicted Q values (for next states) from target model
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # # Compute Q targets for current states
        # TD_target = rewards + (gamma * Q_targets_next * (1 - dones))
        # # Get expected Q values from local model
        # qsa_local = self.qnetwork_local(states).gather(1, actions)

        # diff = qsa_local - TD_target
        # loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar
        loss = F.mse_loss(
            qsa_local, TD_target)  # much faster than the above loss function
        # print(loss)
        # minimize the loss
        self.optimizer.zero_grad()  # clears the gradients
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
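The learn() method above picks Q(s, a) by indexing with torch.arange instead of the gather call used in the other examples; a short sketch on a synthetic batch (sizes are illustrative) confirms the two are equivalent.

# --- Sketch: arange-indexing vs. gather on a synthetic batch ---
import torch

batch_size, n_actions = 4, 3
qs = torch.randn(batch_size, n_actions)                   # stands in for qnetwork_local(states)
actions = torch.randint(0, n_actions, (batch_size, 1))    # column vector of action indices

via_gather = qs.gather(1, actions)
via_indexing = qs[torch.arange(batch_size, dtype=torch.long),
                  actions.reshape(batch_size)].reshape(batch_size, 1)
assert torch.equal(via_gather, via_indexing)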
Example #8
class DQNAgent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        :param state_size: (int) dimension of each state
        :param action_size: (int) dimension of each action
        :param seed: (int) random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=PARAM.LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, PARAM.BUFFER_SIZE,
                                   PARAM.BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """
        Adds the current state-action value to the memory and lets the agent learn if UPDATE_EVERY many steps are taken
        and the memory has more entries then BATCH_SIZE.

        :param state:       current state
        :param action:      taken action
        :param reward:      received reward
        :param next_state:  next state seen after action
        :param done:        boolean if the episode ended after the action
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > PARAM.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, PARAM.GAMMA)

    def act(self, state, eps=0.):
        """
        Returns actions for given state as per current policy.

        :param state: (array_like) current state
        :param eps: (float) epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def get_dqg_target(self, next_states, rewards, gamma, dones):
        """
        Gets the state-action value of the target network. That is, the current estimate of the target network for the
        next state including the seen reward.

        :param next_states: next state for each entry in the sampled mini batch
        :param rewards:     rewards seen for each sample in the mini batch
        :param gamma:       decay factor for current estimate
        :param dones:       indicator if the episode ended for each sample in the mini batch
        :return:
        """
        # Get predicted Q values
        qtarget_values = self.qnetwork_target(next_states).detach()

        # get max of it
        best_qtarget_value = qtarget_values.max(1)

        # reduce one dimension
        best_qtarget_value = best_qtarget_value[0]

        # reshape to 2d matrix with one value in it for 1st dimension (so difference can be calculated)
        # >>> torch.unsqueeze(x, 1)
        # tensor([[ 1],
        #        [ 2],
        #        [ 3],
        #        [ 4]])
        best_qtarget_value = best_qtarget_value.unsqueeze(1)

        # use vector formulation of:
        # if dones == 1:
        #    Q_targets = rewards
        # else:
        #    Q_targets = rewards + (gamma * best_qtarget_value)
        q_targets = rewards + (gamma * best_qtarget_value * (1 - dones))

        return q_targets

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences:  (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples
        :param gamma: (float) discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        Q_targets = self.get_dqg_target(next_states, rewards, gamma, dones)

        # Get expected Q values
        q_exp = self.qnetwork_local(states)

        # gathers the Q values along dimension 1, using the actions as indices
        # >>> t = torch.tensor([[1,2],[3,4]])
        # >>> torch.gather(t, 1, torch.tensor([[0],[1]]))
        # tensor([[ 1],
        #        [ 4]])
        q_exp = q_exp.gather(1, actions)

        # compute loss
        loss = F.mse_loss(q_exp, Q_targets)

        # reset optimizer gradient
        self.optimizer.zero_grad()
        # do backpropagation
        loss.backward()
        # do optimize step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: (float) interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
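Example #8 above reads its hyperparameters from a PARAM object defined elsewhere; the real module is not shown here. As an assumption, a minimal stand-in could look like the following, using the default values that appear in Example #10 below.

# --- Minimal stand-in for the PARAM namespace (assumption; values mirror Example #10) ---
from types import SimpleNamespace

PARAM = SimpleNamespace(
    BUFFER_SIZE=int(1e5),   # replay buffer size
    BATCH_SIZE=64,          # minibatch size
    GAMMA=0.99,             # discount factor
    TAU=1e-3,               # soft-update interpolation factor
    LR=5e-4,                # learning rate
    UPDATE_EVERY=4,         # how many environment steps between learning updates
)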
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, buffer_size, batch_size,
                 lr, tau, sequential_sampling_fre):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            lr (float): learning rate
            tau (float): for soft update of target parameters
            sequential_sampling_fre (int): ratio of random sampling to sequential sampling
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.tau = tau

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   sequential_sampling_fre)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: select the best next actions with the local model,
        # then evaluate them with the target model
        Q_local_argmax = self.qnetwork_local(next_states).max(1)[1].unsqueeze(
            1)
        Q_targets_next_states = self.qnetwork_target(
            next_states).detach().gather(1, Q_local_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next_states * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 model='DQN',
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 pretrained_model_file=None):

        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (str): currently supports 'DQN' and 'DDQN'
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            pretrained_model_file (str): filepath to .pth file with pretrained model weights
        """
        if model not in ('DQN', 'DDQN'):
            raise ValueError('Current model supports DQN or DDQN')

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.model = model

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        if pretrained_model_file:
            weights = torch.load(pretrained_model_file)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        if self.model == 'DQN':
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        elif self.model == 'DDQN':
            argmax_actions = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, argmax_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
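The pretrained_model_file argument above allows an agent to be warm-started from saved weights; a short usage sketch (the checkpoint path and seed are illustrative) shows how the two pieces fit together.

# --- Usage sketch: save the trained weights, then rebuild an agent from them ---
def save_and_reload(agent, path='checkpoint.pth'):
    torch.save(agent.qnetwork_local.state_dict(), path)
    return Agent(state_size=agent.state_size, action_size=agent.action_size,
                 seed=0, model=agent.model, pretrained_model_file=path)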
Example #11
class agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, duel, fc1_units, fc2_units,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            duel (bool): use the dueling network architecture
            fc1_units (int): number of nodes in the first hidden layer
            fc2_units (int): number of nodes in the second hidden layer
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Choose between the regular Q-Network or the dueling architecture
        #if(duel):
        #   self.qnetwork_local  = Duel_QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        #    self.qnetwork_target = Duel_QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        #else:
        #    self.qnetwork_local  = QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        #   self.qnetwork_target = QNetwork(state_size, action_size,fc1_units,fc2_units, seed).to(device)
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Visualize network
        print(self.qnetwork_local)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state_size, state, action, reward, next_state, done, dqn):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                if dqn:
                    self.DQN_learn(experiences, state_size, GAMMA)
                else:
                    self.DDQN_learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().to(device)
        # eval() notifies the layers defined in model.py that the network is in evaluation mode
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def DQN_learn(self, experiences, state_size, gamma):
        """Learn using the DQN algorithm.
           Update value parameters using given batch of experience tuples.

           Params
           ======
           experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
           gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        states = states.view(BATCH_SIZE, 4, state_size[0], state_size[1])
        next_states = next_states.view(BATCH_SIZE, 4, state_size[0],
                                       state_size[1])
        #
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss using element-wise mean squared error.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def DDQN_learn(self, experiences, gamma):
        """DDQN version
        Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model

        # DQN
        #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        #DDQN
        Q_local_argmax = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, Q_local_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #12
class SAC(object):
    def __init__(self):

        self.gamma = 0.99
        self.tau = 0.005
        self.alpha = 0.2
        self.lr = 0.003

        self.target_update_interval = 1
        self.device = torch.device("cpu")

        # 8 phases
        self.num_inputs = 8
        self.num_actions = 1
        self.hidden_size = 256

        self.critic = QNetwork(self.num_inputs, self.num_actions,
                               self.hidden_size).to(self.device)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(self.num_inputs, self.num_actions,
                                      self.hidden_size).to(self.device)
        # Copy the parameters of critic to critic_target
        hard_update(self.critic_target, self.critic)

        self.target_entropy = -torch.Tensor([1.0]).to(self.device).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)

        self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(self.num_inputs, self.num_actions,
                                     self.hidden_size).to(self.device)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state):
        state = torch.FloatTensor(state).to(self.device)  # TODO
        _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]
        # action is a CUDA tensor, you should do .detach().cpu().numpy(), when
        # you need a numpy

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)
        action_batch = np.expand_dims(action_batch, axis=1)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
        # Unsqueeze: add one dimension to the index

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf_loss = qf1_loss + qf2_loss

        self.critic_optimizer.zero_grad()
        # Clear the cumulative grad
        qf_loss.backward()
        # Get grad via backward()
        self.critic_optimizer.step()
        # Update the para via grad

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # automatic_entropy_tuning:
        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()  # TODO

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        # Create a dir package in the current location
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        # state_dict() stores the parameters of layers and optimizers which have grad
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

    def get_alpha(self):
        return self.alpha
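
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): a toy run of the automatic
# entropy tuning step used in update_parameters(),
#   alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean().
# When the policy entropy (roughly -log_pi) is below the target, the gradient
# pushes log_alpha up, so alpha grows and entropy is rewarded more.
# All tensors below are dummies created only for this sketch.
import torch
from torch.optim import Adam

toy_target_entropy = -1.0
toy_log_alpha = torch.zeros(1, requires_grad=True)
toy_alpha_optimizer = Adam([toy_log_alpha], lr=1e-2)

toy_log_pi = torch.full((8, 1), 2.0)  # pretend the policy is too deterministic
for _ in range(100):
    toy_alpha_loss = -(toy_log_alpha *
                       (toy_log_pi + toy_target_entropy).detach()).mean()
    toy_alpha_optimizer.zero_grad()
    toy_alpha_loss.backward()
    toy_alpha_optimizer.step()

print(toy_log_alpha.exp().item())  # alpha has grown above its initial value 1.0
# ---------------------------------------------------------------------------
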
class soft_actor_critic_agent(object):
    def __init__(self, num_inputs, action_space, \
                 device, hidden_size, seed, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha

        self.device = device
        self.seed = seed
        self.seed = torch.manual_seed(seed)

        torch.cuda.manual_seed(seed)
        #torch.cuda.manual_seed_all(seed)
        #torch.backends.cudnn.deterministic=True

        self.critic = QNetwork(seed, num_inputs, action_space.shape[0],
                               hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0],
                                      hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0], \
                                         hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        # Update the critic on the summed twin-Q loss (as in the SAC class above)
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Sample from the policy only after the critic step, so the policy-loss
        # graph is not built on critic parameters that are then modified in place
        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        soft_update(self.critic_target, self.critic, self.tau)
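
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the examples above): a toy computation of the
# soft Q target used in both SAC classes,
#   y = r + mask * gamma * (min(Q1', Q2') - alpha * log_pi'),
# where mask is 0 at terminal transitions. All tensors are random placeholders.
import torch

torch.manual_seed(0)
toy_batch = 4
toy_gamma, toy_alpha = 0.99, 0.2
toy_reward = torch.randn(toy_batch, 1)
toy_mask = torch.tensor([[1.], [1.], [0.], [1.]])  # third transition is terminal
toy_qf1_next, toy_qf2_next = torch.randn(toy_batch, 1), torch.randn(toy_batch, 1)
toy_next_log_pi = torch.randn(toy_batch, 1)

toy_min_qf_next = torch.min(toy_qf1_next, toy_qf2_next) - toy_alpha * toy_next_log_pi
toy_next_q_value = toy_reward + toy_mask * toy_gamma * toy_min_qf_next
print(toy_next_q_value[2] == toy_reward[2])  # terminal row keeps only the reward
# ---------------------------------------------------------------------------
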
Example #14
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        # TODO: initialize action-value function Q with random weights theta
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        # TODO: initialize target action-value function Qhat with weights theta_=theta
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # TODO: initialize replay memory D to capacity N (circular queue)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # TODO: set s_t1=s_t,a_t,x_t1 and preprocess f_t1=f(s_t1)
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # DONE. TODO: if episode terminates at step j+1, set y_j = r_j
        # else set y_j = r_j + gamma*max(Qhat(f_j1,a_;theta_))
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:  # DONE. TODO: every C steps reset Qhat = Q

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # DONE. TODO: sample random minibatch of transitions (f_j,a_j,r_j,f_j1) from D
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        # Unpack the experiences tuple
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # DONE. TODO: perform a gradient descent step on (y_j - Q(f_j,a_j;theta))^2 with
        # respect to the network parameters theta
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
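
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): a minimal numeric check of
# the soft update θ_target ← τ*θ_local + (1 - τ)*θ_target used throughout these
# agents, on two one-parameter "networks" built only for this sketch.
import torch
import torch.nn as nn

toy_local = nn.Linear(1, 1, bias=False)
toy_target = nn.Linear(1, 1, bias=False)
with torch.no_grad():
    toy_local.weight.fill_(1.0)
    toy_target.weight.fill_(0.0)

toy_tau = 0.1
for t_param, l_param in zip(toy_target.parameters(), toy_local.parameters()):
    t_param.data.copy_(toy_tau * l_param.data + (1.0 - toy_tau) * t_param.data)

print(toy_target.weight.item())  # 0.1: the target moved 10% of the way to local
# ---------------------------------------------------------------------------
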
Example #15
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every LEARN_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Learn every UPDATE_EVERY time steps.
        # self.t_step = (self.t_step + 1) % LEARN_EVERY
        # if self.t_step == 0:
        self.t_step += 1
        if done:
            for _ in range(self.t_step // SOFT_UPDATE_EVERY):
                # If enough samples are available in memory, get random subset and learn
                if len(self.memory) > BATCH_SIZE:
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                    # you can use learn_DDQN to enable double q-learning. but on lunarlander, at least,
                    # I don't see any benefit
                    # self.learn_DDQN(experiences, GAMMA)
            self.t_step = 0

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values for next states from the local network
        # (this variant bootstraps from qnetwork_local; the target network is only
        # used in learn_DDQN below)
        Q_targets_next = self.qnetwork_local(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # print(loss)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def learn_DDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get index of maximum value for next state from Q_expected
        Q_argmax = self.qnetwork_local(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # print (self.qnetwork_local(states).detach())
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        # print (Q_targets_next.shape)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # print (Q_targets.shape)
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # print (Q_expected.shape)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #16
class Agent():
    """
    Deep Reinforcement Learning agent that interacts with and learns from the environment.
    Uses the Double DQN algorithm (see https://arxiv.org/abs/1509.06461) with a Dueling DQN
    model (see https://arxiv.org/abs/1511.06581).
    """
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Use Double DQN: Get predicted actions from local network model
        local_actions = self.qnetwork_local(next_states).detach().argmax(
            dim=1).unsqueeze(1)
        # Get predicted Q values (for next states) from target model using predicted actions
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, local_actions).detach()

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
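
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the QNetwork used by this
# Dueling DQN agent is defined elsewhere; this is one common way such a dueling
# head is written, V(s) + A(s,a) - mean_a A(s,a). It is an assumed illustration,
# not the example's actual model.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyDuelingQNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.feature = nn.Linear(state_size, hidden)
        self.value = nn.Linear(hidden, 1)                # state-value stream V(s)
        self.advantage = nn.Linear(hidden, action_size)  # advantage stream A(s, a)

    def forward(self, state):
        x = F.relu(self.feature(state))
        v = self.value(x)
        a = self.advantage(x)
        # Subtract the mean advantage so V and A are identifiable
        return v + a - a.mean(dim=1, keepdim=True)

toy_q = ToyDuelingQNetwork(8, 4)
print(toy_q(torch.randn(2, 8)).shape)  # torch.Size([2, 4])
# ---------------------------------------------------------------------------
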
Example #17
class Agent(object):

    def __init__(self, n_states, n_actions, hidden_dim, lr, device):
        """Agent class that choose action and train

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """

        self.device = device

        self.q_local = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device)
        
        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=lr)
        
        self.n_states = n_states
        self.n_actions = n_actions
        

        #  ReplayMemory: trajectory is saved here
        self.replay_memory = ReplayMemory(10000)
        

    def get_action(self, state, eps, check_eps=True):
        """Returns an action

        Args:
            state : 2-D tensor of shape (n, input_dim)
            eps (float): eps-greedy for exploration

        Returns: int: action index
        """
        global steps_done
        sample = random.random()

        if not check_eps or sample > eps:
            with torch.no_grad():
                # Variable() is a no-op wrapper in current PyTorch; kept from the original snippet
                return self.q_local(Variable(state).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=self.device)


    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): batch of `Transition`
        gamma (float): Discount rate of Q_target
        """
        
        if len(self.replay_memory.memory) < BATCH_SIZE:
            return
            
        transitions = self.replay_memory.sample(BATCH_SIZE)
        
        batch = Transition(*zip(*transitions))
                        
        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)
        
            
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to the network q_local (current estimate)
        Q_expected = self.q_local(states).gather(1, actions)     

        Q_targets_next = self.q_target(next_states).detach().max(1)[0] 

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1-dones))
        
        self.q_local.train(mode=True)        
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
        # backpropagation of loss to NN        
        loss.backward()
        self.optim.step()
               
        
    def soft_update(self, local_model, target_model, tau):
        """ tau (float): interpolation parameter"""
        
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)     
            
    def hard_update(self, local, target):
        for target_param, param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(param.data)            
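
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): how the
# Transition(*zip(*transitions)) pattern in learn() turns a list of per-step
# tuples into per-field batches. The Transition namedtuple here is a stand-in
# with the field names the example accesses (state, action, reward, next_state,
# done); the actual class is defined elsewhere.
from collections import namedtuple
import torch

ToyTransition = namedtuple('ToyTransition',
                           ('state', 'action', 'reward', 'next_state', 'done'))

toy_transitions = [
    ToyTransition(torch.randn(1, 4), torch.tensor([[0]]), torch.tensor([1.0]),
                  torch.randn(1, 4), torch.tensor([0.0])),
    ToyTransition(torch.randn(1, 4), torch.tensor([[1]]), torch.tensor([0.0]),
                  torch.randn(1, 4), torch.tensor([1.0])),
]

toy_batch = ToyTransition(*zip(*toy_transitions))  # fields become tuples of tensors
toy_states = torch.cat(toy_batch.state)            # shape (2, 4)
toy_actions = torch.cat(toy_batch.action)          # shape (2, 1)
print(toy_states.shape, toy_actions.shape)
# ---------------------------------------------------------------------------
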
Example #18
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # initialize local and target Q-Networks
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # initialize time step
        self.t_step = 0

        # initialize parameters
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.update_every = UPDATE_EVERY
    
    def step(self, state, action, reward, next_state, done):
        # store experience tuple in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        
        # perform learning step every UPDATE_EVERY time steps
        self.t_step += 1
        is_time_to_update_weights = (self.t_step % UPDATE_EVERY) == 0
        if is_time_to_update_weights:
            # if enough samples in replay_buffer,
            # get random batch and perform one learning step
            if len(self.replay_buffer) > BATCH_SIZE:
                experiences = self.replay_buffer.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, epsilon=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # epsilon-greedy action selection
        action_values = action_values.cpu().data.numpy()[0]
        optimal_action = np.argmax(action_values)
        random_action = np.random.choice(np.arange(self.action_size))
        action = np.random.choice([optimal_action, random_action],
                                  p=[1-epsilon, epsilon])
        return np.int32(action)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # get max predicted Q values for next states from target models
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # target network soft update
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
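
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): act() draws between the
# greedy action and one uniformly random action with probabilities (1-eps, eps),
# so the greedy action keeps a small extra chance of being picked by the random
# branch too. A quick empirical check with toy values made up for this sketch.
import numpy as np

rng = np.random.default_rng(0)
toy_action_size, toy_eps, toy_optimal_action = 4, 0.2, 2
toy_counts = np.zeros(toy_action_size)
for _ in range(10000):
    toy_random_action = rng.integers(toy_action_size)
    toy_action = rng.choice([toy_optimal_action, toy_random_action],
                            p=[1 - toy_eps, toy_eps])
    toy_counts[toy_action] += 1

# The greedy action ends up with probability (1 - eps) + eps / action_size,
# every other action with eps / action_size.
print(toy_counts / toy_counts.sum())
# ---------------------------------------------------------------------------
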
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, lr_decay=0.9999):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            lr_decay (float): multiplicative factor of learning rate decay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        print("Running on: "+str(device))
        
        # Q-Network
        hidden_layers = [128, 32]
        
        if USE_DUELING_NETWORK:
            hidden_state_value = [64, 32]
            
            self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, hidden_layers, hidden_state_value).to(device)

            self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, hidden_layers, hidden_state_value).to(device)
            self.qnetwork_target.eval()
            
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_layers).to(device)

            self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_layers).to(device)
            self.qnetwork_target.eval()
            
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)


        # Replay memory
        if USE_PRIORITIZED_REPLAY:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device,
                                                  alpha=0.6, beta=0.4, beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            return np.argmax(action_values.cpu().data.numpy())
        
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        ## Compute and minimize the loss

        with torch.no_grad():
            ### Use of Double DQN method
            if USE_DOUBLE_DQN:
                ## Select the greedy actions using the QNetwork Local
                # calculate the pair action/reward for each of the next_states
                next_action_rewards_local = self.qnetwork_local(next_states)
                # select the action with the maximum reward for each of the next actions
                greedy_actions_local = next_action_rewards_local.max(dim=1, keepdim=True)[1]

                ## Get the rewards for the greedy actions using the QNetwork Target
                # calculate the pair action/reward for each of the next_states
                next_action_rewards_target = self.qnetwork_target(next_states)
                # get the target reward for each of the greedy actions selected following the local network
                target_rewards = next_action_rewards_target.gather(1, greedy_actions_local)
                
            ### Use of Fixed Q-Target
            else:
                # calculate the pair action/reward for each of the next_states
                next_action_rewards = self.qnetwork_target(next_states)
                # select the maximum reward for each of the next actions
                target_rewards = next_action_rewards.max(dim=1, keepdim=True)[0]
                
            
            ## Calculate the discounted target rewards
            target_rewards = rewards + (gamma * target_rewards * (1 - dones))
            
        # calculate the pair action/rewards for each of the states
        expected_action_rewards = self.qnetwork_local(states) # shape: [batch_size, action_size]
        # get the reward for each of the actions
        expected_rewards = expected_action_rewards.gather(1, actions) # shape: [batch_size, 1]

        if USE_PRIORITIZED_REPLAY:
            # TD error, with the gradient flowing through expected_rewards
            td_error = target_rewards - expected_rewards

            # new priorities are the absolute TD errors (no in-place ops on the
            # loss graph, so backward() below stays valid)
            with torch.no_grad():
                self.memory.update_priorities(td_error.abs().squeeze(1))

            # importance-sampling-weighted squared TD error
            loss = (w * td_error.squeeze(1).pow(2)).mean()
        else:
            # calculate the loss
            loss = F.mse_loss(expected_rewards, target_rewards)

        # perform the back-propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
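
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): a toy version of the
# prioritized-replay branch of learn() - the loss is the importance-sampling-
# weighted squared TD error, and |TD error| is what would be fed back as the new
# priority. All tensors are random placeholders created for this sketch.
import torch

torch.manual_seed(0)
toy_expected = torch.randn(8, 1, requires_grad=True)
toy_target = torch.randn(8, 1)
toy_w = torch.rand(8)  # importance-sampling weights from the buffer

toy_td_error = toy_target - toy_expected
toy_new_priorities = toy_td_error.abs().squeeze(1).detach()  # would go to update_priorities
toy_loss = (toy_w * toy_td_error.squeeze(1).pow(2)).mean()
toy_loss.backward()
print(toy_new_priorities.shape, toy_loss.item())
# ---------------------------------------------------------------------------
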
Example #20
class SAC(object):
    """
    SAC class from Haarnoja et al. (2018)
    We leave the option to use automatic_entropy_tuning to avoid hand-tuning the entropy coefficient alpha
    """
    def __init__(self, num_inputs, action_space, args):
        #self.n_flow = args.n_flows
        #assert self.n_flow == 0
        self.num_inputs = num_inputs
        #self.flow_family = args.flow_family
        self.num_layers = args.num_layers
        self.args = args

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, self.num_layers,
                                     args).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        """
        Select action for a state
        (Train) Sample an action from NF{N(mu(s),Sigma(s))}
        (Eval) Pass mu(s) through NF{}
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            action, _, _, _, _ = self.policy.evaluate(state, eval=True)

        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, memory, batch_size, updates):
        """
        Update parameters of SAC-NF
        Exactly like SAC, but keep two separate Adam optimizers for the Gaussian policy AND the NF layers
        .backward() on them sequentially
        """
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # for visualization
        info = {}
        ''' update critic '''
        with torch.no_grad():
            next_state_action, next_state_log_pi, _, _, _ = self.policy.evaluate(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        # update the critic first (sequential backward on the two Q losses)
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        # evaluate the policy only after the critic step, so the policy-loss graph
        # is not built on critic parameters that are then modified in place
        pi, log_pi, _, _, _ = self.policy.evaluate(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        nf_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        # update target value functions
        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item(), info

    def save_model(self, info):
        """
        Save the weights of the network (actor and critic separately)
        """
        # policy
        save_checkpoint(
            {
                **info,
                'state_dict': self.policy.state_dict(),
                'optimizer': self.policy_optim.state_dict(),
            },
            self.args,
            filename='policy-ckpt.pth.tar')

        # critic
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic.state_dict(),
                'optimizer': self.critic_optim.state_dict(),
            },
            self.args,
            filename='critic-ckpt.pth.tar')
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic_target.state_dict(),
                #'optimizer' : self.critic_optim.state_dict(),
            },
            self.args,
            filename='critic_target-ckpt.pth.tar')

    def load_model(self, args):
        """
        Jointly or separately load actor and critic weights
        """
        # policy
        load_checkpoint(
            model=self.policy,
            optimizer=self.policy_optim,
            opt=args,
            device=self.device,
            filename='policy-ckpt.pth.tar',
        )

        # critic
        load_checkpoint(
            model=self.critic,
            optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic-ckpt.pth.tar',
        )
        load_checkpoint(
            model=self.critic_target,
            #optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic_target-ckpt.pth.tar',
        )
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        actions_value = self.qnetwork_local.forward(next_states)
        next_action = torch.unsqueeze(torch.max(actions_value, 1)[1], 1)
        next_q = self.qnetwork_target.forward(next_states).gather(
            1, next_action)
        Q_targets = rewards + GAMMA * next_q * (1 - dones)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #22
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=42,
                 hidden_layers=[32, 8]):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE, self.device, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_step, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_step, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % UPDATE_EVERY == 0:
            if self.memory.length > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, next_states, rewards, dones = experiences

        self.qnetwork_target.eval()
        with torch.no_grad():
            # Q values of the next states under the local network
            Q_local_next = self.qnetwork_local(next_states)
            # greedy action index per next state, then its Q value
            action_argmax = torch.max(Q_local_next, dim=1, keepdim=True)[1]
            Q_max_local_next = Q_local_next.gather(1, action_argmax)

            # note: the target network's values are computed here but never used;
            # this variant bootstraps from the local network instead
            Q_targets_next = self.qnetwork_target(next_states)

            # TD target for the current states (only non-terminal transitions bootstrap)
            targets = rewards + gamma * Q_max_local_next * (1 - dones)
        self.qnetwork_target.train()

        expected = self.qnetwork_local(states).gather(1, actions)
        loss = torch.sum((expected - targets)**2)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def train(self,
              env,
              brain_name,
              n_episodes=2000,
              timesteps=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        '''
        train the model network applying experience replay
        Params
        ======
            agent (Agent): agent that interacts with the environment
            n_episodes (int): number of games played
            timesteps (int): max number of steps to be played in the game
            eps_start (float): initial proportion of random actions in epsilon-greedy action selection
            eps_end (float): final proportion of random actions in epsilon-greedy action selection
            eps_decay (float): epsilon decay rate
        '''
        scores = []
        last_scores = deque(maxlen=100)
        eps = eps_start
        for i_episode in range(n_episodes):
            env_status = env.reset(train_mode=True)[brain_name]
            state = env_status.vector_observations[0]  #get state
            score = 0
            for _ in range(timesteps):
                action = self.act(state, eps).astype(int)
                env_status = env.step(action)[brain_name]
                next_state = env_status.vector_observations[0]
                reward = env_status.rewards[0]
                done = env_status.local_done[0]
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores.append(score)
            last_scores.append(score)
            eps = max(eps_end, eps * eps_decay)  #decreases epsilon
            print('\rEpisode {}\tScores mean: {:.2f}'.format(
                i_episode, np.mean(last_scores)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tLast 100 scores mean: {:.2f}'.format(
                    i_episode, np.mean(last_scores)))
            if np.mean(last_scores) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tScores mean: {:.2f}'
                    .format(i_episode - 100, np.mean(last_scores)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores
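
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the epsilon schedule used
# in train(), eps = max(eps_end, eps * eps_decay) per episode, is the same as
# eps_start * eps_decay**i clipped at eps_end. A quick check with the defaults.
toy_eps_start, toy_eps_end, toy_eps_decay = 1.0, 0.01, 0.995
toy_eps = toy_eps_start
for toy_episode in range(1, 1001):
    toy_eps = max(toy_eps_end, toy_eps * toy_eps_decay)
    if toy_episode in (100, 500, 1000):
        print(toy_episode, round(toy_eps, 4))
# roughly 0.6058 after 100 episodes, 0.0816 after 500, 0.01 (the floor) after 1000
# ---------------------------------------------------------------------------
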
Example #23
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values for next states from the target model (frozen weights)
        #
        #    next_states is 64x8
        #    self.qnetwork_target(next_states) is 64x4
        #    detach() returns a tensor copy detached from the graph (no gradient)
        #    max(1)[0] returns the max values along the given dim (max(1)[1] would give their indices)
        #    => This returns an array of 64 values
        #    unsqueeze(1) inserts a dimension of size one at the given position
        #    => This returns a 64x1 tensor
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (being trained)
        # x.gather(1, actions) selects, along dim 1, the Q value corresponding to the action
        # actually taken in each sampled transition (output shape: batch_size x 1)
        Q_expected = self.qnetwork_local(states).gather(1, actions)
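        # e.g., if a row of the local network output is [q0, q1, q2, q3] and the matching
        # entry of actions is 2, gather picks q2, so Q_expected has shape (batch_size, 1).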

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #24
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # ------------------- train with mini-batch sample of experiences ------------------- #
        if len(self.memory) > BATCH_SIZE:
            # If enough samples are available in memory, get random subset and learn
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
        
        # ------------------- update target network ----------------------------------------- #
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:             
            # If C (UPDATE_EVERY) steps have been reached, blend weights to the target network
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
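            # Note: unlike the other variants in this listing, this agent learns on every step
            # (once the buffer holds a batch) and uses UPDATE_EVERY only to schedule the soft
            # update of the target network.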

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        # - qnetwork_target : apply forward pass to the whole mini-batch
        # - detach : do not backpropagate through the target network
        # - max(1) : take the maximum over the action dimension (dim=1) for each sample
        # - [0].unsqueeze(1) : keep the max values and reshape them to a (batch_size, 1) column
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states (y)
        # - dones : detect if the episode has finished
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (Q(Sj, Aj, w))
        # - gather : for each sample select only the output value for action Aj
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Optimize over (yj-Q(Sj, Aj, w))^2
        # * compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # * minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()                            

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #25
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, mode='DQN'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        """ Set Tuning and Hyperparameters """

        self.mode = mode

        self.losses = []

        self.ddqn_enabled = False
        self.ddqn_counter = 0

        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        print("Parameters = {}".format(self.qnetwork_local.parameters()))
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def process_action(self, value):
        L = [
            np.array([1, 0, 0]),
            np.array([-1, 0, 0]),
            np.array([0, 1, 0]),
            np.array([0, 0, 1])
        ]
        return L[value]
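        # Presumably maps the discrete action index returned by act() to the continuous
        # control vector the environment expects, e.g.
        #   env.step(self.process_action(self.act(state, eps)))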

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get predicted Q values (for next states) from the target model

        if self.mode == 'DQN':
            # Vanilla DQN: take the maximum target-network Q value over next actions
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        if self.mode == 'DDQN':
            # Double DQN: select the greedy next action with the local network,
            # then evaluate that action with the target network
            argmax_actions_locals_next = self.qnetwork_local(
                next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, argmax_actions_locals_next)
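        # Decoupling action selection (local network) from action evaluation (target network)
        # is what reduces the overestimation bias of the plain max operator in Double DQN
        # (van Hasselt et al., 2016).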

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.losses.append(float(loss))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #26
class Agent():
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        # the state size and action size will be used to generate the Q Network
        self.state_size = state_size
        self.action_size = action_size
        # random.seed(seed) seeds the random number generator; reusing the same initial
        # seed reproduces the same sequence of random numbers
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Replay_Buffer(action_size, Buffer_Size, Batch_Size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def select_act(self, state, eps=0.):

        " selects action based on state and epsilon"

        # get the state array from env, convert to tensor

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # unsqueeze(0) adds a singleton batch dimension at position 0
        # (the network expects batched input)
        # to(device) moves the tensor to the target device memory, cpu or cuda

        ## put network in eval mode
        self.qnetwork_local.eval()

        # Forward pass to get the action values; the index of the maximum is the greedy action.
        # torch.no_grad() disables gradient tracking for this block (no backward pass needed).
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return np.random.randint(self.action_size)  # select a random action

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_next_states = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # detach returns a new tensor detached from the current graph
        # the output layer is (batch_size, action_size), i.e. (64, 4); max(1) finds the max along the second dim
        # the resulting tensor is (64,), so we add a singleton dimension back with unsqueeze(1)
        # Q_next_states holds the max action value over the four actions for each of the 64 next states

        Q_target = rewards + (gamma * Q_next_states * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # gather selects, along dim 1 of the (64, 4) local Q output, the value at the index given
        # by the actions tensor, i.e. the Q value of the action actually taken in each sampled
        # transition, so the output is (64, 1).

        # These expected values from qnetwork_local are compared with Q_target to compute the loss,
        # and the local network parameters are then updated to minimize that loss.

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (
            self.t_step + 1
        ) % UPDATE_EVERY  # self.t_step will increase by 1 after every step() call
        # that means every time step
        if self.t_step == 0:
            if len(self.memory) > Batch_Size:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        #self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR, momentum=0.95)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences, idxs, ws = self.memory.sample()
                self.learn(experiences, idxs, ws, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, idxs, ws, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Q values of the taken actions, from the local network (the one being trained).
        # Note: despite its name, next_action_values_local holds Q(s, a) for the current states.
        next_action_values_local = self.qnetwork_local(states).gather(1, actions)
        # Double DQN change: select the maximizing next actions with the local network and
        # evaluate them with the target network; these values are used to build the target below.
        local_max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        next_action_values_target = self.qnetwork_target(next_states).detach().gather(1, local_max_actions)
        
        
        y = rewards + (gamma * next_action_values_target*(1 - dones))
        # The local network is updated; the target network provides the (fixed) targets
        ws = torch.from_numpy(ws.astype(float)).float().to(device)
        loss = F.mse_loss(ws*next_action_values_local, ws*y)
        errors = np.abs(y.cpu().data.numpy() - next_action_values_local.cpu().data.numpy())
        self.memory.memory.update_batch(idxs, errors)
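        # The importance-sampling weights ws correct the bias introduced by prioritized
        # (non-uniform) sampling, and the absolute TD errors are written back as updated priorities.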

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        # Copy from local to target network parameters
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def adjust_learning_rate(self, episode, val):
        print("adjusting learning rate!")
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = val
Example #28
class Agent:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """
        Returns action for given state as per current policy
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values for next states from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDQNPERAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 tor_dstate,
                 srpt_pens,
                 lrn_rate,
                 hsize1,
                 hsize2,
                 seed=0):
        """Initialize a DDQN Agent object with PER (Prioritized Experience Replay) support.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            tor_dstate (float): tolerance for deciding whether two states are the same
            srpt_pens (array_like): penalty (negative reward) values for undesirable actions
            lrn_rate (float): learning rate for Q-Network training
            hsize1 (int): size of the first hidden layer of the Q-Network
            hsize2 (int): size of the second hidden layer of the Q-Network 
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.tor_dstate = tor_dstate
        self.srpt_pens = srpt_pens
        self.lrn_rate = lrn_rate

        self.hsize1 = hsize1
        self.hsize2 = hsize2

        self.seed = seed
        if seed is not None: random.seed(seed)

        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        self.aug_state_size = state_size + len(srpt_pens) * action_size
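        # e.g., with state_size=37, action_size=4 and two penalty values, the augmented state has
        # 37 + 2*4 = 45 entries; each appended block of 4 one-hot flags marks the action to be
        # penalized at that repeat-detection distance.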

        # Set up Q-Networks.
        self.qnetwork_local = QNetwork(self.aug_state_size, action_size,
                                       hsize1, hsize2, seed).to(device)
        self.qnetwork_local.initialize_weights(
        )  # initialize network with random weights
        self.qnetwork_target = QNetwork(self.aug_state_size,
                                        action_size,
                                        hsize1,
                                        hsize2,
                                        seed=None).to(device)
        self.qnetwork_target.update_weights(
            self.qnetwork_local)  # copy network weights to target network
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=lrn_rate)

        # Store trained Q-model when the environment is solved.
        self.qnetwork_solved = None

        # Set up experience replay memory.
        self.ebuffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize interval steps.
        self.l_step = 0  # for learning every LEARN_EVERY time steps
        self.t_step = 0  # for updating target network every UPDATE_EVERY learnings

    def reset_episode(self, state, srpt_det=0):
        """Re-initialize buffers after environment reset for a new episode.
        
        Params
        ======
            state (array_like): initial state after environment reset
            srpt_det (int): number of repeated state types to be checked for post-processing
        """
        self.srpt_det = 0
        if len(self.srpt_pens) == 0:
            # State repeat detection for post-processing is active only when state repeat penalty option is off.
            self.srpt_det = srpt_det
        else:
            # This is used to signal self.step() hasn't been run yet.
            self.next_aug_state = None

        if len(self.srpt_pens) > 0 or self.srpt_det > 0:
            self.state_buffer = deque(maxlen=2)
            buffer_size = 2 * (max(len(self.srpt_pens), self.srpt_det) - 1)
            self.smsta_buffer = deque(maxlen=max(2, buffer_size))

            # The initial state will be pushed to the buffer again and be compared to this state in the process of
            # selecting the first action. So add 1 to the initial state here to ensure the states are different
            # enough for the first comparison.
            self.state_buffer.append(np.array(state) + 1)

            # Any position and orientation can be the initial simulated state here. It is like putting in a
            # coordinate system (origin and x-direction) for a 2-D plane and all the other simulated states
            # in the episode will be specified based on this reference coordinate system.
            self.smsta_buffer.append((np.array([0, 0]), 0))

    def step(self, state, action, reward, next_state, done):
        """Update replay memory and parameters of Q-Network by training.
        
        Params
        ======
            state (array_like): starting state of the step
            action (int): action performed in the step
            reward (float): reward from the action
            next_state (array_like): resulting state of the action in the step
            done (bool): indicator for whether next_state is terminal (i.e., end of episode) or not
        """
        if len(self.srpt_pens) > 0:
            # Augment state vector and modify reward using state repeat penalty values.
            self.state_buffer.append(np.array(next_state))
            self.next_aug_state = self.augment_state(next_state)
            state = self.aug_state
            next_state = self.next_aug_state
            reward = self.modify_reward(reward, state, action)

        # Save experience in replay memory.
        self.ebuffer.add(state, action, reward, next_state, done)

        # Learn every LEARN_EVERY steps after memory reaches batch_size.
        if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
            self.l_step += 1
            self.l_step %= LEARN_EVERY
            if self.l_step == 0:
                experiences, weights = self.ebuffer.sample()
                self.learn(experiences, weights, GAMMA)

    def augment_state(self, state):
        """Augment state vector to penalize undesirable actions.
        
        Params
        ======
            state (array_like): original state vector to be augmented
        Returns
        ======
            aug_state (numpy.ndarray): augmented state vector
        """
        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        aug_state = np.concatenate(
            (state, np.zeros((len(self.srpt_pens) * self.action_size, ))))

        # Detect the situation where the two preceding observed states (not augmented) are essentially
        # the same, which indicates the agent is either stuck at a wall or in some kind of undesirable
        # blind spot. The next action to avoid (i.e., to be penalized) is the one that would keep the
        # agent stuck or in the blind spot.
        avoid_action = self.get_avoid_action()
        if avoid_action != ACT_INVALID:
            aug_state[self.state_size + avoid_action] = 1
        if avoid_action != ACT_INVALID or len(self.srpt_pens) == 1:
            return aug_state

        # If agent is not stuck or in blind spot and there are more penalty values, continue to check
        # state repeats separated by more than two actions. Assuming NUM_ORIS is even, states separated
        # by odd number of actions won't repeat. So only even number of actions needs to be checked.
        for action in range(self.action_size):
            nxt_sta = self.sim_step(action)
            for act_cnt in range(2, 2 * len(self.srpt_pens), 2):
                if self.is_state_repeated(act_cnt, nxt_sta):
                    aug_state[self.state_size +
                              (act_cnt // 2) * self.action_size +
                              action] = 1  # signal undesirable action
                    break

        return aug_state

    def modify_reward(self, reward, aug_state, action):
        """Modify reward to penalized undesirable action.
        
        Params
        ======
            reward (float): original reward
            aug_state (numpy.ndarray): augmented state vector
            action (int): action performed
        Returns
        ======
            reward (float): modified reward
        """
        # Penalize undesirable action when it doesn't earn a reward or cause a penalty. If it earns a positive
        # reward or causes a more negative reward, leave the reward unchanged.
        if reward <= 0:
            for i, penalty in enumerate(self.srpt_pens):
                if aug_state[self.state_size + i * self.action_size +
                             action] > 0:  # action is undesirable
                    reward = min(reward, penalty)
                    break
        return reward

    def sim_step(self, action):
        """Advance simulated state (position and orientation) for one step by the action.
        
        Params
        ======
            action (int): action to advance the simulated state
        Returns
        ======
            pos, ori (numpy.ndarray, int): resulting simulated state
        """
        # An action can either be a move or turn (but not both) with the type of actions (including non-actions)
        # identified by the action code.
        pos, ori = self.smsta_buffer[-1]
        act_code = ACT_CODES[action]
        pos = pos + act_code[0] * ORIVEC_TABLE[ori]
        ori = (ori + act_code[1]) % NUM_ORIS
        return pos, ori
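        # For example, assuming ACT_CODES encodes a forward move as (1, 0) and a turn as (0, ±1),
        # a move advances pos one unit along ORIVEC_TABLE[ori] and leaves ori unchanged, while a
        # turn leaves pos fixed and steps ori around the NUM_ORIS discrete headings.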

    def is_state_repeated(self, act_cnt, nxt_sta):
        """Check whether the next state repeats the past state separated by the specified number of actions.
        
        Params
        ======
            act_cnt (int): number of actions separating the past state to be checked and the next state
            nxt_sta (numpy.ndarray, int): next state resulting from an action
        Returns
        ======
            repeated (bool): indicator for repeated state
        """
        repeated = False
        if act_cnt <= len(self.smsta_buffer):
            chk_sta = self.smsta_buffer[-act_cnt]  # past state to be checked
            if chk_sta[1] == nxt_sta[1]:
                if np.linalg.norm(nxt_sta[0] - chk_sta[0]) <= self.tor_dstate:
                    repeated = True
        return repeated

    def act(self, state, eps=0.0):
        """Select action for given state as per epsilon-greedy current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for adjusting epsilon-greedy action selection
        Returns
        ======
            action (int): the chosen action
        """
        # If the agent is in testing mode, self.step() won't be invoked and some of the operations done there
        # need to be done here.
        if (len(self.srpt_pens) > 0
                and self.next_aug_state is None) or self.srpt_det > 0:
            # Push the current state into the state buffer for comparison with the previous state,
            # if it has not already been pushed by self.step() during agent training.
            self.state_buffer.append(np.array(state))

        if len(self.srpt_pens) > 0:
            if self.next_aug_state is None:
                self.aug_state = self.augment_state(state)
            else:
                self.aug_state = self.next_aug_state
            state = self.aug_state

        if self.srpt_det == 0:  # no checking for repeated states (observed or simulated)
            # Randomly select action.
            action = random.choice(np.arange(self.action_size))

            # Epsilon-greedy action selection.
            if random.random() >= eps:
                state = torch.from_numpy(state).float().to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action = self.qnetwork_local(
                        state).squeeze().argmax().cpu().item()

            if len(self.srpt_pens) > 0:
                # Update simulated state buffer with result of chosen action.
                nxt_sta = self.sim_step(action)
                self.smsta_buffer.append(nxt_sta)

            return action

        # This is the implementation of the post-processing of the epsilon-greedy policy to avoid repeated
        # states within a short series of actions. This option is set in self.reset_episode() for each episode
        # and is only active when the option of penalizing undesirable actions, which is set for the class
        # object, is disabled (len(self.srpt_pens) == 0). To accommodate the post-processing of the selected
        # actions, the random policy is modified to randomly assign rankings to all the available actions.

        # Randomly assign rankings to action candidates.
        ranked_actions = np.random.permutation(self.action_size)

        # Epsilon-greedy action selection.
        if random.random() >= eps:
            state = torch.from_numpy(state).float().to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                neg_act_qvals = -self.qnetwork_local(state).squeeze()
            ranked_actions = neg_act_qvals.argsort().cpu().numpy().astype(int)

        # Post-process ranked action candidates to remove undesirable action.
        avoid_action = self.get_avoid_action()
        action = self.select_nosrpt_action(avoid_action, ranked_actions)

        return action

    def get_avoid_action(self):
        """Avoid action that will keep the agent stucked or in a blind spot. 
        
        Returns
            avoid_action (int): next action to avoid
        """
        avoid_action = ACT_INVALID  # used to signal the agent is not stuck or in a blind spot
        if np.linalg.norm(self.state_buffer[1] -
                          self.state_buffer[0]) <= self.tor_dstate:
            sim_sta0 = self.smsta_buffer[-2]
            sim_sta1 = self.smsta_buffer[-1]
            if sim_sta0[1] == sim_sta1[1]:  # action is not a turn, so it must be a move
                # Agent is stuck at a wall.
                dpos = sim_sta1[0] - sim_sta0[0]
                mcode = np.around(np.dot(
                    dpos, ORIVEC_TABLE[sim_sta0[1]])).astype(int)  # dot(mcode*(cos, sin), (cos, sin)) = mcode
                avoid_action = AVOID_MOVE_TABLE[mcode + 1]
                # It is reasonable to backtrack to get unstuck, so keep only the last state (the one
                # the agent is stuck in) as the new reference; as a reference, it can be any state.
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
            else:  # action is a turn
                # Agent is in a blind spot (turned, but observed the same state).
                tcode = sim_sta1[1] - sim_sta0[1]
                avoid_action = AVOID_TURN_TABLE[(tcode + 1) % NUM_ORIS]
                # It is reasonable to backtrack to get out of the blind spot, so keep only the
                # last two states, which represent the blind spot, as the new reference.
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
                self.smsta_buffer.append(sim_sta1)
        return avoid_action

    def select_nosrpt_action(self, avoid_action, ranked_actions):
        """Select action that avoids repeated state (i.e., loops) by a short series of actions.
        
        Params
        ======
            avoid_action (int): action to avoid if agent is stuck or in blind spot
            ranked_actions (array like): action candidates ranked by decreasing Q-values
        Returns
        ======
            action (int): the selected action
        """
        action = ranked_actions[0]
        if action == avoid_action: action = ranked_actions[1]
        nxt_sta = self.sim_step(action)

        # If a repeated observed state from a single action was detected (signaled by avoid_action != ACT_INVALID),
        # the action selected to avoid the repeated state is used, since it is more important to free an agent
        # that is stuck or in a blind spot than to look further back for repeated simulated states. So the check
        # for simulated states repeated after 2 or more actions only occurs when avoid_action == ACT_INVALID.
        if avoid_action == ACT_INVALID and self.srpt_det > 1:
            act_heapq = []
            for action in ranked_actions:
                nxt_sta = self.sim_step(action)
                for act_cnt in range(
                        2, 2 * self.srpt_det, 2
                ):  # assuming NUM_ORIS is even, only check even number of actions
                    if self.is_state_repeated(act_cnt, nxt_sta):
                        # Simulated state repeated, go checking next action.
                        heapq.heappush(act_heapq, [-act_cnt, action, nxt_sta])
                        break
                else:
                    # No repeated state detected, action is found.
                    break
            else:
                # No action can satisfy all the no repeated state conditions, select the action that repeats the
                # state separated by most actions (i.e., long loop is more acceptable than short loop).
                action, nxt_sta = heapq.heappop(act_heapq)[1:]

        # Update the simulated state buffer with the result of the chosen action.
        self.smsta_buffer.append(nxt_sta)
        return action

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple (s, a, r, s', done) of batched experience data
            is_weights (torch.Tensor): importance sampling weights for the batched experiences
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN method for obtaining target Q-values.
        self.qnetwork_local.eval()
        with torch.no_grad():
            maxq_actions = self.qnetwork_local(next_states).max(
                1)[1].unsqueeze(1)
            qouts_next_states = self.qnetwork_target(next_states).gather(
                1, maxq_actions).squeeze()
        qouts_target = rewards + gamma * qouts_next_states * (1 - dones)

        # Obtain current Q-values and its difference from the target Q-values.
        self.qnetwork_local.train()
        qouts_states = self.qnetwork_local(states).gather(1, actions).squeeze()
        delta_qouts = qouts_states - qouts_target

        # Calculate the importance-sampling-weighted sum of squared TD errors.
        wsqr_loss = is_weights * delta_qouts**2  # weighted squared loss
        loss_sum = wsqr_loss.sum()

        # Update model parameters by minimizing the loss sum.
        self.optimizer.zero_grad()
        loss_sum.backward()
        self.optimizer.step()

        # Update priorities of the replay memory.
        neg_prios = -torch.abs(delta_qouts.detach())
        self.ebuffer.update_priorities(neg_prios.cpu().numpy())

        # Update target network.
        self.t_step += 1
        self.t_step %= UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.update_weights(self.qnetwork_local, TAU)

    def update_beta(self, beta):
        """Update importance sampling weights for memory buffer with new Beta.

        Params
        ======
            beta (float): new Beta value
        """
        if beta != self.ebuffer.beta:
            self.ebuffer.beta = beta
            if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
                self.ebuffer.update_is_weights()

    def copy_solved_qnet(self):
        """Copy current local Q-Network to solved Q-Network while local Q-Network will continue the training."""
        if self.qnetwork_solved is None:
            self.qnetwork_solved = QNetwork(self.aug_state_size,
                                            self.action_size,
                                            self.hsize1,
                                            self.hsize2,
                                            seed=None).to(device)
        self.qnetwork_solved.update_weights(
            self.qnetwork_local
        )  # copy local network weights to solved network

    def save_qnet(self, model_name):
        """Save Q-Network parameters into file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # Save CPU version since it can be used with or without GPU.
        if self.qnetwork_solved is not None:
            torch.save(self.qnetwork_solved.cpu().state_dict(),
                       model_name + '.pth')
            self.qnetwork_solved = self.qnetwork_solved.to(device)
        else:
            torch.save(self.qnetwork_local.cpu().state_dict(),
                       model_name + '.pth')
            self.qnetwork_local = self.qnetwork_local.to(device)

    def load_qnet(self, model_name):
        """Load Q-Network parameters from file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # The saved QNetwork is always the CPU version.
        qnetwork_loaded = QNetwork(self.aug_state_size,
                                   self.action_size,
                                   self.hsize1,
                                   self.hsize2,
                                   seed=None)
        qnetwork_loaded.load_state_dict(torch.load(model_name + '.pth'))
        self.qnetwork_local.update_weights(qnetwork_loaded.to(
            device))  # copy loaded network weights to local network
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute and minimize the loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #31
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 double_dqn=False,
                 dueling=False,
                 per=False,
                 per_args=(0.2, 0.01, 2e-5)):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): whether to implement Double DQN (default=False)
            dueling (bool): whether to implement Dueling DQN
            per (bool): whether to implement Prioritized Experience Replay
            per_args (tuple): a,beta,beta_increment for PER
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.per = per
        self.gamma = GAMMA

        # output name for checkpoint
        self.output_name = ''
        self.output_name += '_double' if double_dqn else ''
        self.output_name += '_dueling' if dueling else ''
        self.output_name += '_per' if per else ''

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       dueling=dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        dueling=dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, *per_args)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
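        # A hypothetical construction and training call for this variant, e.g.:
        #   agent = Agent(state_size=37, action_size=4, double_dqn=True, dueling=True, per=True)
        #   scores, avg_scores = agent.train(env)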

    def train(self,
              env,
              n_episodes=1000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Deep Q-Learning.

        Params
        ======
            env (UnityEnvironment): Bananas environment
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        # list containing scores from each episode
        scores = []
        # list containing window averaged scores
        avg_scores = []
        # last 100 scores
        scores_window = deque(maxlen=100)
        # initialize epsilon
        eps = eps_start
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                # get the next state
                next_state = env_info.vector_observations[0]
                # get the reward
                reward = env_info.rewards[0]
                # see if episode has finished
                done = env_info.local_done[0]
                self.step((state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break
            # save most recent score
            scores_window.append(score)
            scores.append(score)
            avg_scores.append(np.mean(scores_window))
            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(),
                           f'./checkpoints/checkpoint{self.output_name}.pth')
                break
        return scores, avg_scores

    def step(self, experience):
        """Save experience in replay memory and learn.
        
        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using given batch of experience tuples.
        """
        # if using PER
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = self.memory.sample()

        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # if Double DQN
        if self.double_dqn:
            # Get predicted Q values (for next actions chosen by local model) from target model
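            # Double DQN: the local network selects the next action and the
            # target network evaluates it, which reduces the overestimation
            # bias of the plain max operator.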
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(
                    1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, next_actions)

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
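        # (1 - dones) zeroes the bootstrap term for terminal transitions, so
        # the target reduces to the immediate reward at the end of an episode.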
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
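        # With PER, each sample's squared TD error is scaled by its
        # importance-sampling weight to correct the bias introduced by
        # non-uniform (prioritized) sampling.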
        if self.per:
            is_weights = torch.FloatTensor(is_weights).reshape(-1, 1).to(device)
            loss = (is_weights *
                    F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if PER, update priority
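        # New priorities are derived from the absolute TD errors, so
        # transitions with larger errors are replayed more often later on.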
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
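

# --------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original agent). The Banana
# executable path, the 37/4 state/action sizes, and the Agent constructor
# arguments below are assumptions; adjust them to match the __init__ defined
# above and your local Unity build.
# --------------------------------------------------------------------------
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from unityagents import UnityEnvironment

    # Hypothetical path to the Unity Banana build.
    env = UnityEnvironment(file_name='./Banana_Linux/Banana.x86_64')

    # Hypothetical constructor call; pass whichever extension flags
    # (double DQN, dueling, PER) your __init__ actually exposes.
    agent = Agent(state_size=37, action_size=4, seed=0)

    # Train and collect per-episode scores plus the 100-episode moving average.
    scores, avg_scores = agent.train(env, n_episodes=1800, eps_decay=0.995)
    env.close()

    # Plot the learning curve.
    plt.plot(scores, label='score')
    plt.plot(avg_scores, label='100-episode average')
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.legend()
    plt.show()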