Example No. 1
    def __init__(self,
                 state_size,
                 action_size,
                 gamma=0.99,
                 lr=5e-4,
                 buffer_size=int(1e5),
                 batch_size=64,
                 tau=1e-3):
        # defining local and target networks
        self.qnet_local = Qnetwork(state_size, action_size).to(device)
        self.qnet_target = Qnetwork(state_size, action_size).to(device)

        # set local and target parameters equal to each other
        self.soft_update(tau=1.0)

        # experience replay buffer
        self.memory = ReplayBuffer(buffer_size, batch_size)

        # defining variables
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.tau = tau

        self.t_step = 0

        # optimizer
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)
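All of the examples on this page assume a Qnetwork module and a device handle that are defined elsewhere in their respective projects. A minimal sketch of what they might look like, assuming a plain fully connected network (an illustration only, not the networks these authors actually used):

import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class Qnetwork(nn.Module):
    """Hypothetical fully connected Q-network mapping a state to one Q-value per action."""

    def __init__(self, state_size, action_size, seed=None, hidden=64):
        super().__init__()
        if seed is not None:
            torch.manual_seed(seed)  # some examples pass a seed, others do not
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)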
Example No. 2
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.Qlocal = Qnetwork(self.state_size,
                               self.action_size).to(device)  # Local Network
        self.Qtarget = Qnetwork(self.state_size,
                                self.action_size).to(device)  # Target Network
        self.optim = optim.Adam(self.Qlocal.parameters(), lr)
        self.buffer = replay_buffer(buffer_max_size,
                                    batch_size)  # replay buffer
        self.t_step = 0  # used in updating the target network weights from local network
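Example No. 2 (and most of the examples below) reads hyperparameters such as lr, buffer_max_size and batch_size from module scope rather than taking them as constructor arguments. A hypothetical constants block, with values mirroring the argument defaults spelled out in Examples No. 1 and 12, might look like this:

# Hypothetical module-level hyperparameters assumed by several excerpts below;
# the values mirror the defaults of Examples No. 1 and 12.
BUFFER_SIZE = int(1e5)  # replay buffer capacity
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # soft-update interpolation rate
LR = 5e-4               # Adam learning rate
UPDATE_EVERY = 4        # learn every UPDATE_EVERY steps (the cadence used in Example No. 14)

# lower-case aliases used by some of the excerpts
lr, gamma, tau = LR, GAMMA, TAU
buffer_size = buffer_max_size = BUFFER_SIZE
batch_size, update_every = BATCH_SIZE, UPDATE_EVERY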
Example No. 3
    def __init__(self, state_size, action_size, seed):

        self.state_size = state_size
        self.action_size = action_size
        #self.device=device
        self.seed = random.seed(seed)

        self.q_network_local = Qnetwork(state_size, action_size,
                                        seed).to(device)
        self.q_network_target = Qnetwork(state_size, action_size,
                                         seed).to(device)
        self.optimizer = optim.Adam(self.q_network_local.parameters(), lr=LR)

        ####REPLAY MEMORY########
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_step = 0
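This example also depends on a ReplayBuffer class that is not shown. A minimal uniform-sampling sketch matching the (action_size, buffer_size, batch_size, seed) constructor used here (the prioritized variant needed by Example No. 12 is sketched further down):

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Hypothetical fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed=0):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)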
Example No. 4
    def __init__(self, seed, state_size, action_size, net_type="dqn"):
        """if net_type is dqn, perform deep Q network; if ddqn, perform double deep Q network"""
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.net_type = net_type
        # replay buffer
        self.memory = replaybuffer(action_size, BATCH_SIZE, seed)
        # define target and local Q network
        self.qnetwork_local = Qnetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = Qnetwork(state_size, action_size,
                                        seed).to(device)
        # define optimizer for qnetwork_local
        self.optim = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # define time step for soft updating cycle
        self.time_step = 0
Example No. 5
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        #self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = Qnetwork(state_size, action_size).to(device)
        self.qnetwork_target = Qnetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example No. 6
    def __init__(self, state_size, action_size, double=False, duel=False):

        self.state_size = state_size
        self.action_size = action_size
        self.discounted_factor = 0.99
        self.learning_rate = 0.001

        self.double = double

        # Define Model
        if duel:
            self.local_model = Duel_Qnetwork(state_size,
                                             action_size).to(device)
            self.target_model = Duel_Qnetwork(state_size,
                                              action_size).to(device)
        else:
            self.local_model = Qnetwork(state_size, action_size).to(device)
            self.target_model = Qnetwork(state_size, action_size).to(device)

        # Define optimizer
        self.optimizer = optim.Adam(self.local_model.parameters(),
                                    lr=self.learning_rate)

        # Define Buffer
        self.buffer = Replay_buffer(action_size,
                                    buffer_size=BUFFER_SIZE,
                                    batch_size=BATCH_SIZE)

        # time_step, local_model update, target_model update
        self.t_step = 0
        self.target_update_t = 0
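Example No. 6 switches to a Duel_Qnetwork when duel=True, but that class is not shown on this page. A minimal sketch of a dueling architecture with the same two-argument constructor, recombining the streams as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) (an illustration of the idea, not this author's exact network):

import torch.nn as nn
import torch.nn.functional as F


class Duel_Qnetwork(nn.Module):
    """Hypothetical dueling Q-network: shared trunk, then separate value and advantage heads."""

    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden)
        self.value = nn.Linear(hidden, 1)                 # V(s)
        self.advantage = nn.Linear(hidden, action_size)   # A(s, a)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        v = self.value(x)
        a = self.advantage(x)
        # subtracting the mean advantage keeps the V/A decomposition identifiable
        return v + a - a.mean(dim=1, keepdim=True)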
Example No. 7
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        #Qnetwork
        self.Qnetwork_local = Qnetwork(state_size, action_size,
                                       seed).to(device)
        self.Qnetwork_target = Qnetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.Qnetwork_local.parameters(), lr=lr)

        #replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        #init time step
        self.t_step = 0
Example No. 8
class smart_agent():
    def __init__(self, state_size, action_size, seed):

        self.state_size = state_size
        self.action_size = action_size
        #self.device=device
        self.seed = random.seed(seed)

        self.q_network_local = Qnetwork(state_size, action_size,
                                        seed).to(device)
        self.q_network_target = Qnetwork(state_size, action_size,
                                         seed).to(device)
        self.optimizer = optim.Adam(self.q_network_local.parameters(), lr=LR)

        ####REPLAY MEMORY########
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                xp = self.memory.sample()
                self.learn(xp, GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_network_local.eval()

        with torch.no_grad():
            action_value = self.q_network_local(state)
        self.q_network_local.train()

        #Epsilon greedy selection
        if random.random() > eps:
            return np.argmax(action_value.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, xp, gamma):
        state, action, reward, next_state, done = xp
        q_target_next = self.q_network_target(next_state).detach().max(
            1)[0].unsqueeze(1)
        q_target = reward + (gamma * q_target_next * (1 - done))

        q_expected = self.q_network_local(state).gather(1, action)

        #MSE LOSS
        loss = F.mse_loss(q_expected, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.q_network_local, self.q_network_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
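Example No. 8 is a complete agent (step, act, learn, soft_update). A hypothetical driver loop showing how such an agent is typically used, assuming a Gym-style env object and an epsilon schedule (neither appears in the excerpt):

# Hypothetical training loop for smart_agent; `env` is assumed to be a Gym-style
# environment whose reset() returns a state and whose step(action) returns
# (next_state, reward, done, info). The sizes below are placeholders.
agent = smart_agent(state_size=8, action_size=4, seed=0)

eps = 1.0
for episode in range(1000):
    state = env.reset()
    score, done = 0.0, False
    while not done:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store and maybe learn
        state = next_state
        score += reward
    eps = max(0.01, eps * 0.995)  # decay epsilon between episodes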
Example No. 9
class Agent:
    def __init__(self, seed, state_size, action_size, net_type="dqn"):
        """if net_type is dqn, perform deep Q network; if ddqn, perform double deep Q network"""
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.net_type = net_type
        # replay buffer
        self.memory = replaybuffer(action_size, BATCH_SIZE, seed)
        # define target and local Q network
        self.qnetwork_local = Qnetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = Qnetwork(state_size, action_size,
                                        seed).to(device)
        # define optimizer for qnetwork_local
        self.optim = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # define time step for soft updating cycle
        self.time_step = 0

    def collect(self, state, action, reward, next_state, done):
        # collect the new sample
        self.memory.add(state, action, reward, next_state, done)
        # use time step to decide if it needs to learn or not
        self.time_step = (self.time_step + 1) % UPDATE_EVERY
        if self.time_step == 0:
            if len(self.memory) >= BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, ALPHA)

    def act(self, state):
        state = torch.from_numpy(state).unsqueeze(0).float().to(device)
        # get action_values
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_vals = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # use epsilon_greedy policies to decide which action to take
        policy = np.ones(self.action_size) * (EPSILON / self.action_size)
        best = torch.argmax(action_vals).item()
        policy[best] = 1 - EPSILON + (EPSILON / self.action_size)
        return np.random.choice(np.arange(self.action_size), p=policy)

    def learn(self, experiences, alpha):

        states, actions, rewards, next_states, dones = experiences
        # parameter learning for local network
        if self.net_type == "dqn":
            TD_target = rewards + GAMMA * (self.qnetwork_target(
                next_states).detach().max(1)[0].unsqueeze(1)) * (1 - dones)
        elif self.net_type == "ddqn":
            best = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            TD_target = rewards + GAMMA * (self.qnetwork_target(
                next_states).detach().gather(1, best)) * (1 - dones)
        TD_estimate = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(TD_target, TD_estimate)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        # parameter soft updating for target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, alpha)

    def soft_update(self, local_network, target_network, alpha):

        for local_params, target_params in zip(local_network.parameters(),
                                               target_network.parameters()):
            target_params.data.copy_(alpha * local_params.data +
                                     (1 - alpha) * target_params.data)
Example No. 10
class agent():
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        #Qnetwork
        self.Qnetwork_local = Qnetwork(state_size, action_size,
                                       seed).to(device)
        self.Qnetwork_target = Qnetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.Qnetwork_local.parameters(), lr=lr)

        #replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        #init time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        #save step to replay memory
        self.memory.add(state, action, reward, next_state, done)

        #learn every update_every t_steps
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            #check if enough samples are in memory if there are then learn
            if len(self.memory) > batch_size:
                exps = self.memory.sample()
                self.learn(exps, gamma)

    def act(self, state, eps=0.):
        '''Returns actions for a given state based on the current policy
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        '''

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.Qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.Qnetwork_local(state)
        self.Qnetwork_local.train()

        #epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, exps, gamma):
        """Update value parameters using a batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, done = exps

        #get max predicted Q values for next state from target model
        Q_targets_next = self.Qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        #compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - done))

        #calculate expected Q values from local model
        Q_expected = self.Qnetwork_local(states).gather(1, actions)

        #compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        #minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        #update target network
        self.soft_update(self.Qnetwork_local, self.Qnetwork_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example No. 11
class DQNAgent():
    def __init__(self, state_size, action_size, double=False, duel=False):

        self.state_size = state_size
        self.action_size = action_size
        self.discounted_factor = 0.99
        self.learning_rate = 0.001

        self.double = double

        # Define Model
        if duel:
            self.local_model = Duel_Qnetwork(state_size,
                                             action_size).to(device)
            self.target_model = Duel_Qnetwork(state_size,
                                              action_size).to(device)
        else:
            self.local_model = Qnetwork(state_size, action_size).to(device)
            self.target_model = Qnetwork(state_size, action_size).to(device)

        # Define optimizer
        self.optimizer = optim.Adam(self.local_model.parameters(),
                                    lr=self.learning_rate)

        # Define Buffer
        self.buffer = Replay_buffer(action_size,
                                    buffer_size=BUFFER_SIZE,
                                    batch_size=BATCH_SIZE)

        # time_step, local_model update, target_model update
        self.t_step = 0
        self.target_update_t = 0

    def get_action(self, state, eps=0.0):
        """state (numpy.ndarray)"""
        state = torch.from_numpy(state.reshape(
            1, self.state_size)).float().to(device)

        self.local_model.eval()
        with torch.no_grad():
            action_values = self.local_model(state)  # .detach().cpu()
        self.local_model.train()

        # epsilon greedy policy
        if random.random() < eps:
            return np.random.randint(self.action_size)
        else:
            action = np.argmax(action_values.cpu().data.numpy())

            return int(action)

    def append_sample(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)

        self.t_step += 1
        if self.t_step % LOCAL_UPDATE == 0:
            """If there are enough experiences"""
            if len(self.buffer) > BATCH_SIZE:
                experiences = self.buffer.sample()
                self.learn(experiences)

                # self.target_update_t += 1
                # if self.target_update_t % TARGET_UPDATE == 0:
                self.soft_target_model_update(TAU)

    def learn(self, experiences):
        """experiences ;tensor  """
        states, actions, rewards, next_states, dones = experiences

        pred_q = self.local_model(states).gather(1, actions)

        if self.double:
            _, argmax_actions = torch.max(
                self.local_model.forward(next_states).detach(),
                1,
                keepdim=True)
            pred_next_q = self.target_model.forward(next_states).gather(
                1, argmax_actions)
        else:
            pred_next_q, _ = torch.max(
                self.target_model.forward(next_states).detach(),
                1,
                keepdim=True)

        target_q = rewards + (
            (1 - dones) * self.discounted_factor * pred_next_q)
        loss = F.mse_loss(target_q, pred_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_target_model_update(self, tau):
        for target_param, local_param in zip(self.target_model.parameters(),
                                             self.local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example No. 12
class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 gamma=0.99,
                 lr=5e-4,
                 buffer_size=int(1e5),
                 batch_size=64,
                 tau=1e-3):
        # defining local and target networks
        self.qnet_local = Qnetwork(state_size, action_size).to(device)
        self.qnet_target = Qnetwork(state_size, action_size).to(device)

        # set local and target parameters equal to each other
        self.soft_update(tau=1.0)

        # experience replay buffer
        self.memory = ReplayBuffer(buffer_size, batch_size)

        # defining variables
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.tau = tau

        self.t_step = 0

        # optimizer
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

    def step(self, state, action, reward, next_state, done):
        """ saves the step info in the memory buffer and perform a learning iteration
        Input : 
            state,action,reward,state,done : non-batched numpy arrays
        
        Output : 
            none
        """
        # add sample to the memory buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        # use replay buffer to learn if it has enough samples
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """ perform a learning iteration by using sampled experience batch
        Input : 
            experience : tuple from the memory buffer
            states, actions, rewards, next_states, dones = experiences
            eg : states.shape = [N,state_size]
        Output : 
            none
        """
        states, actions, rewards, next_states, dones, wj, choose = experiences
        #states, actions, rewards, next_states, dones = experiences

        # set optimizer gradient to zero
        self.optimizer.zero_grad()

        # predicted action value
        q_pred = self.qnet_local.forward(states).gather(1, actions)

        # target action value
        ## use double DQNs, refer https://arxiv.org/abs/1509.06461
        next_action_local = self.qnet_local.forward(next_states).max(1)[1]
        q_target = rewards + self.gamma * (
            1 - dones) * self.qnet_target.forward(next_states)[
                range(self.batch_size), next_action_local].unsqueeze(1)

        # compute td error
        td_error = q_target - q_pred
        # update td error in Replay buffer
        self.memory.update_td_error(choose,
                                    td_error.detach().cpu().numpy().squeeze())

        # defining loss
        loss = ((wj * td_error)**2).mean()

        # running backprop and optimizer step
        loss.backward()
        self.optimizer.step()

        # run soft update
        self.soft_update(self.tau)

    def act(self, state, eps=0.):
        """ return the local model's predicted action for the given state
        Input : 
            state : [state_size]
        
        Output : 
            action : scalar action as action space is discrete with dim = 1
        """
        state = torch.from_numpy(state).float().unsqueeze(dim=0).to(
            device)  # converts numpy array to torch tensor

        self.qnet_local.eval()  # put net in test mode
        with torch.no_grad():
            max_action = np.argmax(
                self.qnet_local(state)[0].cpu().data.numpy())
        self.qnet_local.train()  # put net back in train mode

        rand_num = np.random.rand(
        )  # sample a random number uniformly between 0 and 1

        # implementing epsilon greedy policy
        if rand_num < eps:
            return np.random.randint(self.action_size)
        else:
            return max_action

    def soft_update(self, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        """
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
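Unlike the other examples, the learn() method of Example No. 12 unpacks importance-sampling weights wj and sampled indices choose, and calls memory.update_td_error, so its ReplayBuffer must be a prioritized variant. The class itself is not shown; a rough sketch of the interface the agent appears to expect, assuming proportional prioritization, could be:

import numpy as np


class PrioritizedReplayBuffer:
    """Hypothetical prioritized buffer exposing the interface used by Example No. 12:
    sample() also returns importance-sampling weights and the sampled indices, and
    update_td_error() refreshes the priorities of the sampled transitions."""

    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4, eps=1e-5):
        self.capacity, self.batch_size = buffer_size, batch_size
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data, self.priorities = [], []

    def add(self, state, action, reward, next_state, done):
        self.priorities.append(max(self.priorities, default=1.0))  # new samples get max priority
        self.data.append((state, action, reward, next_state, done))
        if len(self.data) > self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)

    def sample(self):
        p = np.asarray(self.priorities) ** self.alpha
        p /= p.sum()
        choose = np.random.choice(len(self.data), self.batch_size, p=p)
        wj = (len(self.data) * p[choose]) ** (-self.beta)
        wj /= wj.max()  # normalized importance-sampling weights
        states, actions, rewards, next_states, dones = map(
            np.vstack, zip(*[self.data[i] for i in choose]))
        # conversion of the stacked arrays to tensors on `device` is omitted here
        return states, actions, rewards, next_states, dones, wj.reshape(-1, 1), choose

    def update_td_error(self, choose, td_errors):
        for i, delta in zip(choose, np.abs(td_errors) + self.eps):
            self.priorities[i] = float(delta)

    def __len__(self):
        return len(self.data)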
Example No. 13
def updateTargetGraph(tfVars, tau):
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars // 2]):
        op_holder.append(
            tfVars[idx + total_vars // 2].assign((var.value() * tau) + (
                (1 - tau) * tfVars[idx + total_vars // 2].value())))
    return op_holder


def updateTarget(op_holder, sess):
    for op in op_holder:
        sess.run(op)


# training
tf.reset_default_graph()
mainQN = Qnetwork(h_size, env)
targetQN = Qnetwork(h_size, env)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

trainables = tf.trainable_variables()

targetOps = updateTargetGraph(trainables, tau)

myBuffer = experience_buffer()

# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps
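Example No. 13 is TensorFlow 1.x graph-construction code and stops right after setting up the epsilon schedule; the ops it builds only do something when run inside a session. A hypothetical continuation (the episode loop itself is elided):

# Hypothetical continuation of the TF1 snippet above; the e-greedy action
# selection, buffer updates, and the training of mainQN are elided.
with tf.Session() as sess:
    sess.run(init)                  # initialize mainQN and targetQN variables
    updateTarget(targetOps, sess)   # nudge targetQN toward mainQN via the soft-update ops
    # inside the episode/step loop (not shown):
    #   - pick an action e-greedily with probability `e`, store the transition in myBuffer
    #   - periodically train mainQN on a sampled batch from myBuffer
    #   - call updateTarget(targetOps, sess) again and decay e by stepDrop toward endE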
Example No. 14
class Agent():
    '''
    Agent interacts with the env and learns the optimal policy by learning the optimal value function
    '''
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.Qlocal = Qnetwork(self.state_size,
                               self.action_size).to(device)  # Local Network
        self.Qtarget = Qnetwork(self.state_size,
                                self.action_size).to(device)  # Target Network
        self.optim = optim.Adam(self.Qlocal.parameters(), lr)
        self.buffer = replay_buffer(buffer_max_size,
                                    batch_size)  # replay buffer
        self.t_step = 0  # used in updating the target network weights from local network

    def learn(self, exp, gamma):
        '''
        Takes an experience batch and gamma, and trains the local network to predict better Q-values;
        bootstraps the target from the Q-value of the next state.
        '''
        state, action, reward, next_state, done = exp
        index = self.Qlocal(next_state).detach().max(1)[1].unsqueeze(
            1)  # double Q-learning: pick the argmax action with the local network
        q_val = self.Qtarget(next_state).detach().gather(
            1, index)  # evaluate that action with the target network
        y_onehot = torch.zeros(batch_size, self.action_size).to(
            device)  # one-hot mask selecting the chosen action
        y_onehot.scatter_(1, action, 1)  # create the one-hot vector
        Q_val_n = reward + (gamma * q_val * (1 - done)
                            )  # estimated target Q-value for the state
        Q_target = y_onehot * Q_val_n  # target-network Q-value for the chosen action
        pre = self.Qlocal(state)  # Q-values estimated by the local network
        Q_local = y_onehot * pre  # local-network Q-value for the chosen action

        loss = F.mse_loss(Q_local, Q_target)  # Loss function
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        self.update(
            self.Qlocal, self.Qtarget, tau
        )  # updating the Target network weight with Local network weight

    def step(self, state, action, reward, next_state, done):
        '''
        Interacts with the env to get a one-step experience and updates the replay buffer;
        trains the local network once every four interactions with the env.
        '''
        self.buffer.add(state, action, reward, next_state,
                        done)  # Adding to replay buffer
        self.t_step += 1
        if (self.t_step % 4 == 0):  # train once every four steps
            if (len(self.buffer) > batch_size):
                experiences = self.buffer.sample()
                self.learn(experiences, gamma)

    def act(self, state, eps):
        '''
        Given the state, returns the action chosen by the e-greedy policy
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.Qlocal.eval()  # network in eval mode to calculate the q value
        with torch.no_grad():
            action_values = self.Qlocal(
                state)  # Q value estimate for given state
        self.Qlocal.train()  # network in train mode
        if random.random(
        ) > eps:  # e-greedy policy for choosing the action from q values
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def update(self, local_model, target_model, tau):
        '''
        Soft-updates the target network toward the local network weights
        '''
        for l, t in zip(local_model.parameters(), target_model.parameters()):
            t.data.copy_(t.data * (1.0 - tau) + l.data * tau)
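Example No. 14 (and Example No. 15 below) pick out the Q-value of the taken action by multiplying with a one-hot mask, whereas the other examples call gather. Both select the same entries; a small sanity check with made-up numbers:

import torch

q = torch.randn(4, 3)             # fake Q-values: batch of 4 states, 3 actions
a = torch.randint(0, 3, (4, 1))   # actions that were taken

gathered = q.gather(1, a)                          # the gather style used by most examples

onehot = torch.zeros(4, 3).scatter_(1, a, 1)
masked = (onehot * q).sum(dim=1, keepdim=True)     # the one-hot style of Examples No. 14 and 15

assert torch.allclose(gathered, masked)

The only practical difference is that taking the MSE over the full masked matrices averages over batch_size * action_size entries instead of batch_size, which rescales the loss (and the gradients) by a constant factor.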
Example No. 15
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        #self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = Qnetwork(state_size, action_size).to(device)
        self.qnetwork_target = Qnetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > 1000:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # compute a soft (log-sum-exp) backup for the target and minimize the masked MSE loss
        nxt_pred = 0.04 * torch.log(
            torch.exp(self.qnetwork_target(next_states) *
                      25.0).sum(-1)).detach().unsqueeze(1)
        y_onehot = torch.zeros(BATCH_SIZE, self.action_size).to(device)
        y_onehot.scatter_(1, actions, 1)
        Q_val_n = rewards + (gamma * nxt_pred * (1 - dones))
        Q_target = y_onehot * Q_val_n
        pre = self.qnetwork_local(states)
        Q_local = y_onehot * pre
        loss = F.mse_loss(Q_local, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
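Instead of the hard max over next-state Q-values that the other examples use, Example No. 15 bootstraps from 0.04 * log(sum_a exp(25 * Q_target(s', a))), a log-sum-exp ("soft max") with inverse temperature 25 that smoothly approximates, and slightly upper-bounds, the hard max. A quick numeric check with arbitrary values:

import torch

q = torch.tensor([[1.00, 0.50, 0.90]])

hard_max = q.max(dim=-1)[0]
soft_max = 0.04 * torch.log(torch.exp(q * 25.0).sum(dim=-1))  # same form as in learn() above
# torch.logsumexp(q * 25.0, dim=-1) * 0.04 is the numerically stable equivalent

print(hard_max.item(), soft_max.item())  # ~1.000 vs ~1.003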