Example No. 1
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values for next states from the target model (frozen weights)
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (being trained)
        # x.gather(1, actions) returns a tensor (on the same device as the input) that selects,
        # for each row, the Q value in the column given by the action that was taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
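For context, a minimal training-loop sketch that drives this agent is shown below; it assumes a classic Gym-style environment and that QNetwork, ReplayBuffer and the module constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR, UPDATE_EVERY, device) are defined as in the surrounding project. The environment name, episode count and epsilon schedule are illustrative only.

# Hypothetical usage sketch, not part of the example above.
import gym

env = gym.make('CartPole-v1')                      # illustrative environment
agent = Agent(state_size=4, action_size=2, seed=0)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)             # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store and maybe learn
        state = next_state
    eps = max(0.01, eps * 0.995)                   # decay exploration over episodes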
Example No. 2
class DQNAgent:

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def __init__(self,
                 osize,
                 asize,
                 seed,
                 buffersize=int(1e6),
                 gamma=0.99,
                 epsilon=0.05,
                 epsilondecay=1e-6,  # per-step epsilon decay rate
                 epsilonmin=0.1,
                 minibatchsize=128,
                 lr=0.01,
                 tau=0.01):
        """
        Initialize DQN agent parameters.
        """

        # initialize agent parameters
        self.osize = osize
        self.asize = asize
        self.gamma = gamma
        self.epsilon0 = epsilon
        self.epsilon = epsilon
        self.epsilondecay = epsilondecay
        self.epsilonmin = epsilonmin
        self.minibatchsize = minibatchsize
        self.lr = lr
        self.tau = tau
        self.stepcount = 0
        self.loss_log = []

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # create local and target Q networks
        self.Q = QNetwork(osize, asize).to(self.device)
        self.targetQ = QNetwork(osize, asize).to(self.device)

        # initialize optimizer
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # initialize experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)

    def step(self, state, action, reward, next_state, done):
        """
        Step the agent, and learn if necessary.
        """

        # add experience to replay
        self.replay.add(state, action, reward, next_state, done)

        # learn from experiences
        if len(self.replay) > self.minibatchsize:
            # create mini batch for learning
            experiences = self.replay.sample(self.device)
            # train the agent
            self.learn(experiences)

        # increase step count
        self.stepcount += 1

        # decay epsilon multiplicatively (epsilondecay is treated as a per-step decay rate)
        decayed_epsilon = self.epsilon * (1 - self.epsilondecay)
        self.epsilon = max(self.epsilonmin, decayed_epsilon)

    def get_action(self, state):
        """
        Get an epsilon greedy action.
        """

        # convert network input to torch variable
        x = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # obtain network output
        self.Q.eval()
        with torch.no_grad():  # skip gradient tracking, which speeds up inference
            y = self.Q(x)
        self.Q.train()

        # select action
        if random.random() > self.epsilon:
            # epsilon greedy action
            action = np.argmax(
                y.cpu().data.numpy())  # action is actually action index
        else:
            # random action selection
            action = np.random.choice(np.arange(self.asize))

        return action

    def learn(self, experiences):
        """
        Learn using Double DQN algorithm.
        """

        # unpack experience
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: pick the best next action with the local network...
        a_max = torch.argmax(self.Q(next_states).detach(), dim=1, keepdim=True)

        # ...and evaluate that action with the target network
        target_q = self.targetQ(next_states).detach().gather(1, a_max)

        # calculate target and local Qs
        target = rewards + self.gamma * target_q * (1 - dones)
        local = self.Q(states).gather(1, actions)

        # calculate loss
        loss = F.mse_loss(local, target)
        self.loss_log.append(loss.cpu().data.numpy())

        # perform gradient descent step
        self.optimizer.zero_grad()  # reset the gradients to zero
        loss.backward()
        self.optimizer.step()

        # soft update target network
        for target_params, params in zip(self.targetQ.parameters(),
                                         self.Q.parameters()):
            target_params.data.copy_(self.tau * params +
                                     (1 - self.tau) * target_params.data)
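The DQNAgent above only assumes that ExperienceReplay exposes add(), __len__() and sample(device) returning batched tensors. A minimal sketch of that interface is given below; the field names and the (batch, 1) tensor shapes are assumptions for illustration, not the original implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ExperienceReplay:
    """Minimal replay buffer sketch matching the interface DQNAgent expects."""
    def __init__(self, asize, buffersize, minibatchsize, seed):
        self.asize = asize                       # kept only to mirror the constructor call above
        self.memory = deque(maxlen=buffersize)
        self.minibatchsize = minibatchsize
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def __len__(self):
        return len(self.memory)

    def sample(self, device):
        batch = random.sample(self.memory, self.minibatchsize)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones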
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, clip=CLIP, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            clip (float): gradient norm clipping (`None` to disable)
        """
        if kwds:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        if clip: assert isinstance(clip, (int, float)) and clip >= 0

        self.state_size          = state_size
        self.action_size         = action_size
        self.seed                = random.seed(seed)
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.clip                = clip

        # Q-Network
        self.qnetwork_local  = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # keepdim keeps the (batch, 1) shape so the target broadcasts with rewards and dones
            target = rewards + gamma * self.qnetwork_target(next_states).max(dim=1, keepdim=True)[0] * (1 - dones)

        pred = self.qnetwork_local(states)

        loss = F.mse_loss(pred.gather(dim=1, index=actions), target)

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
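This variant's constructor defaults all point at module-level constants. A plausible definition of those constants is sketched below; the values are placeholders, not the original project's tuning.

import torch

SEED         = 0
BUFFER_SIZE  = int(1e5)   # replay buffer size
BATCH_SIZE   = 64         # minibatch size
START_SINCE  = 64         # samples to collect before learning starts
GAMMA        = 0.99       # discount factor
T_UPDATE     = 4          # target-network update interval
TAU          = 1e-3       # soft-update interpolation factor
LR           = 5e-4       # learning rate
WEIGHT_DECAY = 0          # optimizer weight decay
UPDATE_EVERY = 4          # learning interval
CLIP         = 1.0        # gradient norm clipping (None disables)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')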
Example No. 4
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        # Target or w-
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if (PRIORITIZED_REPLY_ENABLED):
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.B = .001

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random
            #  subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done)
                tuples
            gamma (float): discount factor
        """
        if (PRIORITIZED_REPLY_ENABLED):
            states, actions, rewards, next_states, dones, a_probs = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        if (DOUBLE_DQN_ENABLED):
            # Double DQN: use the local parameters to pick the next best action,
            # then evaluate Q(s', a) for that action with the target network
            with torch.no_grad():
                Q_targets_next = self.qnetwork_local(next_states).detach()
            # argmax over actions for each row of the batch
            best_actions = torch.argmax(Q_targets_next, dim=1, keepdim=True)
            # gather the target network's Q values for those actions, shape (batch, 1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, best_actions)
        else:
            # Get max predicted Q values (for next states) from target model
            # execute prediction for next states
            Q_targets_next = self.qnetwork_target(next_states)
            # Detaches the Tensor from the graph that created it, making it
            # a leaf.
            Q_targets_next = Q_targets_next.detach()
            # Returns the maximum value of each row of the input tensor in
            # the given dimension dim. The second return value is the index
            # location of each maximum value found (argmax).
            Q_targets_next = Q_targets_next.max(1)[0]
            # Insert a dimension of size one so the shape goes from
            # torch.Size([64]) to torch.Size([64, 1])
            Q_targets_next = Q_targets_next.unsqueeze(1)

        # Compute Q targets for current states:
        # reward plus the discounted next-state value, but only if the episode is not done
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        # do a forward pass
        Q_expected = self.qnetwork_local(states)
        # Gathers values along an axis specified by dim.
        # Before gather  torch.Size([64, 4])
        Q_expected = Q_expected.gather(1, actions)
        # After gather  torch.Size([64, 1])

        # Calculate the loss
        if (PRIORITIZED_REPLY_ENABLED):
            # TD errors (plus a small constant) become the new priorities
            td_error = (Q_expected - Q_targets).detach().abs() + E_REPLAY
            # importance-sampling weights: (1 / (N * P(i)))**beta
            imp_sample_weight = torch.tensor(
                ((1 / np.array(a_probs)) * (1 / BUFFER_SIZE))**self.B).float().to(device)
            for i in range(len(states)):
                self.memory.update(states[i], actions[i], rewards[i],
                                   next_states[i], dones[i], td_error[i])
            loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
            imp_sample_weight = torch.unsqueeze(imp_sample_weight, 1)
            # anneal beta towards 1
            if self.B < 0.998:
                self.B += .001
            loss = torch.mean(loss * imp_sample_weight)
        else:
            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
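The prioritized-replay branch above weights each sampled transition by (1 / (N * P(i)))**beta and anneals beta toward 1. A standalone illustration of that computation is below; the buffer size and sampling probabilities are made up, and the max-normalization is a common convention rather than something the example above does.

import numpy as np

buffer_size = 1000
a_probs = np.array([0.004, 0.001, 0.0005])   # sampling probabilities P(i) of three transitions
beta = 0.4

weights = (1.0 / (buffer_size * a_probs)) ** beta
weights = weights / weights.max()            # scale so the largest weight is 1
print(weights)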
Example No. 5
class Agent():
    """ Interacts with and learns from the environment """
    def __init__(self,
                 state_size=4 * 4,
                 action_size=4,
                 seed=42,
                 fc1_units=256,
                 fc2_units=256,
                 fc3_units=256,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE,
                 lr=LR,
                 use_expected_rewards=True,
                 predict_steps=2,
                 gamma=GAMMA,
                 tau=TAU):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            fc*_units (int): size of the respective layer
            buffer_size (int): number of steps to save in replay buffer
            batch_size (int): self-explanatory
            lr (float): learning rate
            use_expected_rewards (bool): whether to predict the weighted sum of future rewards or just for current step
            predict_steps (int): for how many steps to predict the expected rewards
            
        """
        self.tau = tau
        self.gamma = gamma

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)
        self.batch_size = batch_size
        self.losses = []
        self.use_expected_rewards = use_expected_rewards
        self.current_iteration = 0

        # Game scores
        self.scores_list = []
        self.last_n_scores = deque(maxlen=50)
        self.mean_scores = []
        self.max_score = 0
        self.min_score = 1000
        self.best_score_board = []

        # Rewards
        self.total_rewards_list = []
        self.last_n_total_rewards = deque(maxlen=50)
        self.mean_total_rewards = []
        self.max_total_reward = 0
        self.best_reward_board = []

        # Max cell value on game board
        self.max_vals_list = []
        self.last_n_vals = deque(maxlen=50)
        self.mean_vals = []
        self.max_val = 0
        self.best_val_board = []

        # Number of steps per episode
        self.max_steps_list = []
        self.last_n_steps = deque(maxlen=50)
        self.mean_steps = []
        self.max_steps = 0
        self.total_steps = 0
        self.best_steps_board = []

        self.actions_avg_list = []
        self.actions_deque = {
            0: deque(maxlen=50),
            1: deque(maxlen=50),
            2: deque(maxlen=50),
            3: deque(maxlen=50)
        }

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       fc1_units=fc1_units,
                                       fc2_units=fc2_units,
                                       fc3_units=fc3_units).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        fc1_units=fc1_units,
                                        fc2_units=fc2_units,
                                        fc3_units=fc3_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        # decay the learning rate by a factor of 0.9999 every 1000 scheduler steps
        self.lr_decay = optim.lr_scheduler.StepLR(self.optimizer, 1000, 0.9999)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # Initialize time step
        self.t_step = 0
        self.steps_ahead = predict_steps

    def save(self, name):
        """Saves the state of the model and stats
        
        Params
        ======
            name (str): name of the agent version used in dqn function
        """

        torch.save(self.qnetwork_local.state_dict(),
                   base_dir + '/network_local_%s.pth' % name)
        torch.save(self.qnetwork_target.state_dict(),
                   base_dir + '/network_target_%s.pth' % name)
        torch.save(self.optimizer.state_dict(),
                   base_dir + '/optimizer_%s.pth' % name)
        torch.save(self.lr_decay.state_dict(),
                   base_dir + '/lr_schd_%s.pth' % name)
        state = {
            'state_size': self.state_size,
            'action_size': self.action_size,
            'seed': self.seed,
            'batch_size': self.batch_size,
            'losses': self.losses,
            'use_expected_rewards': self.use_expected_rewards,
            'current_iteration': self.current_iteration,

            # Game scores
            'scores_list': self.scores_list,
            'last_n_scores': self.last_n_scores,
            'mean_scores': self.mean_scores,
            'max_score': self.max_score,
            'min_score': self.min_score,
            'best_score_board': self.best_score_board,

            # Rewards
            'total_rewards_list': self.total_rewards_list,
            'last_n_total_rewards': self.last_n_total_rewards,
            'mean_total_rewards': self.mean_total_rewards,
            'max_total_reward': self.max_total_reward,
            'best_reward_board': self.best_reward_board,

            # Max cell value on game board
            'max_vals_list': self.max_vals_list,
            'last_n_vals': self.last_n_vals,
            'mean_vals': self.mean_vals,
            'max_val': self.max_val,
            'best_val_board': self.best_val_board,

            # Number of steps per episode
            'max_steps_list': self.max_steps_list,
            'last_n_steps': self.last_n_steps,
            'mean_steps': self.mean_steps,
            'max_steps': self.max_steps,
            'total_steps': self.total_steps,
            'best_steps_board': self.best_steps_board,
            'actions_avg_list': self.actions_avg_list,
            'actions_deque': self.actions_deque,
            # Replay buffer
            'memory': self.memory.dump(),
            # Initialize time step
            't_step': self.t_step,
            'steps_ahead': self.steps_ahead
        }

        with open(base_dir + '/agent_state_%s.pkl' % name, 'wb') as f:
            pickle.dump(state, f)

    def load(self, name):
        """Saves the state of the model and stats
        
        Params
        ======
            name (str): name of the agent version used in dqn function
        """
        self.qnetwork_local.load_state_dict(
            torch.load(base_dir + '/network_local_%s.pth' % name))
        self.qnetwork_target.load_state_dict(
            torch.load(base_dir + '/network_target_%s.pth' % name))
        self.optimizer.load_state_dict(
            torch.load(base_dir + '/optimizer_%s.pth' % name))
        self.lr_decay.load_state_dict(
            torch.load(base_dir + '/lr_schd_%s.pth' % name))

        with open(base_dir + '/agent_state_%s.pkl' % name, 'rb') as f:
            state = pickle.load(f)

        self.state_size = state['state_size']
        self.action_size = state['action_size']
        self.seed = state['seed']
        random.seed(self.seed)
        np.random.seed(self.seed)
        self.batch_size = state['batch_size']
        self.losses = state['losses']
        self.use_expected_rewards = state['use_expected_rewards']
        self.current_iteration = state['current_iteration']

        # Game scores
        self.scores_list = state['scores_list']
        self.last_n_scores = state['last_n_scores']
        self.mean_scores = state['mean_scores']
        self.max_score = state['max_score']
        self.min_score = state.get('min_score', state['max_score'])
        self.best_score_board = state['best_score_board']

        # Rewards
        self.total_rewards_list = state['total_rewards_list']
        self.last_n_total_rewards = state['last_n_total_rewards']
        self.mean_total_rewards = state['mean_total_rewards']
        self.max_total_reward = state['max_total_reward']
        self.best_reward_board = state['best_reward_board']

        # Max cell value on game board
        self.max_vals_list = state['max_vals_list']
        self.last_n_vals = state['last_n_vals']
        self.mean_vals = state['mean_vals']
        self.max_val = state['max_val']
        self.best_val_board = state['best_val_board']

        # Number of steps per episode
        self.max_steps_list = state['max_steps_list']
        self.last_n_steps = state['last_n_steps']
        self.mean_steps = state['mean_steps']
        self.max_steps = state['max_steps']
        self.total_steps = state['total_steps']
        self.best_steps_board = state['best_steps_board']

        self.actions_avg_list = state['actions_avg_list']
        self.actions_deque = state['actions_deque']

        # Replay buffer
        self.memory.load(state['memory'])

        # Initialize time step
        self.t_step = state['t_step']
        self.steps_ahead = state['steps_ahead']

    def step(self, state, action, reward, next_state, done, error,
             action_dist):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done, error,
                        action_dist, None)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        return action_values.cpu().data.numpy()

    def learn(self,
              learn_iterations,
              mode='board_max',
              save_loss=True,
              gamma=GAMMA,
              weight=None):

        if self.use_expected_rewards:
            self.memory.calc_expected_rewards(self.steps_ahead, weight)

        self.memory.add_episode_experiences()

        losses = []

        if len(self.memory) > self.batch_size:
            if learn_iterations is None:
                learn_iterations = self.learn_iterations

            for i in range(learn_iterations):

                states, actions, rewards, next_states, dones = self.memory.sample(
                    mode=mode)

                # Get expected Q values from local model
                Q_expected = self.qnetwork_local(states).gather(1, actions)

                # Compute loss
                loss = F.mse_loss(Q_expected, rewards)

                losses.append(loss.detach().cpu().numpy())

                # Minimize the loss
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            self.lr_decay.step()

            if save_loss:
                self.losses.append(np.mean(losses))
        else:
            self.losses.append(0)

    def soft_update(self, local_model, target_model, tau):
        """NOT USED ANYMORE
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
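A hedged usage sketch for the save/load pair above, assuming base_dir exists and the agent was built with the defaults shown; the name 'v1' is just a tag.

agent = Agent(state_size=16, action_size=4, seed=42)
# ... train for a while ...
agent.save('v1')       # writes the networks, optimizer, scheduler and stats under base_dir

restored = Agent(state_size=16, action_size=4, seed=42)
restored.load('v1')    # reloads the same files and the pickled statistics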
Example No. 6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss = torch.nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss:
        # 1. get the target Q values
        # 2. get the current Q values
        # 3. compute the loss
        # 4. update the weights with the Adam optimizer (remember to zero the gradients first)
        # the current_q_values tensor will have a shape of (batch_size, 1)
        current_q_values = self.qnetwork_local(states).gather(1, actions)
        target_q_values = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(-1)
        # do not bootstrap from terminal next states
        td_target = rewards + (gamma * target_q_values * (1 - dones))
        output = self.loss(current_q_values, td_target)

        # zero the gradients before the backward pass so they do not accumulate across batches
        self.optimizer.zero_grad()

        # calculate the gradients of the loss with respect to the weights
        output.backward()

        # update the weights with the Adam optimizer
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
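A small numeric check of the TD target used in the learn method above, on toy tensors; it shows how the (1 - dones) factor zeroes the bootstrap term for terminal transitions.

import torch

rewards = torch.tensor([[1.0], [0.5]])
target_q_values = torch.tensor([[2.0], [3.0]])   # max_a Q_target(s', a)
dones = torch.tensor([[0.0], [1.0]])             # the second transition is terminal
gamma = 0.99

td_target = rewards + gamma * target_q_values * (1 - dones)
print(td_target)   # tensor([[2.9800], [0.5000]])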
Example No. 7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Parameters:
        ==========
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network (local and target)
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Parameters:
        ==========
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
 
        Parameters:
        ==========
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Forward pass and loss
        output = self.qnetwork_local(states).gather(1, actions)
        # MSE loss between the predicted Q values and the TD targets
        self.criterion = nn.MSELoss()
        loss = self.criterion(output,
                              self.targets(gamma, rewards, next_states, dones))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def targets(self, gamma, rewards, next_states, dones):

        with torch.no_grad():
            q = self.qnetwork_target(next_states)

        y = rewards + torch.mul(torch.max(q, dim=1, keepdim=True)[0],
                                gamma) * (1 - dones)

        return y

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Parameters:
        ==========
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        
        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # DataParallel splits each batch along dim 0 across the GPUs,
            # e.g. [30, ...] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            self.qnetwork_local = nn.DataParallel(self.qnetwork_local)

        if torch.cuda.is_available():
            print("using GPU!")
            self.qnetwork_local.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()             # one gradient-descent update of the local network

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                          

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # use helper calc_loss function & not this directly
        self.criterion = nn.MSELoss()

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # first unsqueeze to make it a batch with one sample in it
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # set the mode back to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            # the model output is the Q value for each action 0..(n-1); output[i] corresponds to action i
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    # helper for learn func
    # y_j does not depend on the weight parameters that gradient descent will be training
    def calc_y_j(self, r_j, dones, gamma, target_out):
        # dones_flags folds the terminal mask and gamma together: 0 if the episode terminates at j+1,
        # gamma otherwise, so y_j only bootstraps on non-terminal transitions
        # use [[x] for x in y] list comprehensions so the tensors are batch_size x 1, like r_j
        # use .to(device) to move the tensors to the GPU
        dones_flags = torch.Tensor([[0.] if done else [gamma] for done in dones]).float().to(device)
        max_q_target_out = torch.Tensor([[torch.max(q_for_all_actions)] for q_for_all_actions in target_out]).float().to(device)
        
        #  RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead.
        #dones_flags = torch.from_numpy(np.vstack([0 if done == True else gamma for done in dones])).float().to(device)
        #max_q_target_out = torch.from_numpy(np.vstack([torch.max(q_for_all_actions) for q_for_all_actions in target_out])).float().to(device)
        
        y_j = r_j + dones_flags * max_q_target_out
        return y_j
    
    # helper for learn func
    def calc_loss(self, y_j, pred_out, actions):
        # need pred_out_actions_taken to be a tensor & built by combining (concatenating) other tensors to maintain gradients
        # actions is batch_size by 1- only have to iterate through rows
        for i in range(actions.shape[0]):
            # action taken. is an index for which col to look at in pred_out (pred_out is batch_size by n_actions]
            action_ind = actions[i, 0].item()
            if i == 0:
                # take the selected Q value from 0-dimensional to 2-dimensional (1 x 1)
                pred_out_actions_taken = pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)
            else: 
                # concat along dim 0 -> vertically stack rows
                pred_out_actions_taken = torch.cat((pred_out_actions_taken, pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)), dim=0)
            
        # loss is MSE between pred_out_actions_taken (input) and y_j (target) 
        return self.criterion(pred_out_actions_taken, y_j)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        # experiences arrive as torch tensors
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        
        # make sure to zero the gradients
        self.optimizer.zero_grad()
        
        # q_network_local model output from forward pass
        pred_out = self.qnetwork_local(states)
        target_out = self.qnetwork_target(next_states)
        
        # compute the loss for q_network_local vs q_network_target 
        y_j = self.calc_y_j(rewards, dones, gamma, target_out)
        # calc gradient & take step down the gradient
        loss = self.calc_loss(y_j, pred_out, actions)
        loss.backward()
        self.optimizer.step()
                      

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
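The row-by-row selection loop in calc_loss above can be written as a single gather call; a small equivalent on toy tensors is shown below.

import torch

pred_out = torch.tensor([[0.1, 0.9], [0.7, 0.3]])   # batch_size x n_actions
actions = torch.tensor([[1], [0]])                  # batch_size x 1 action indices
pred_out_actions_taken = pred_out.gather(1, actions)
print(pred_out_actions_taken)   # tensor([[0.9000], [0.7000]])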
Example No. 10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, 64,
                                       64).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, 64,
                                        64).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss_fn = torch.nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0


#         torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), clip_value = 1)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss (Double DQN)

        #         # Get max predicted Q values (for next states) from target model
        #         Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        #         # Compute Q targets for current states
        #         Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        #         # Get expected Q values from local model
        #         Q_expected = self.qnetwork_local(states).gather(1, actions)

        #         # Compute loss
        #         loss = F.mse_loss(Q_expected, Q_targets)
        #         # Minimize the loss
        #         self.optimizer.zero_grad()
        #         loss.backward()
        #         self.optimizer.step()

        optimizer = self.optimizer
        loss_fn = self.loss_fn

        ## detach/no_grad is required as we're not learning qnetwork_target's weights
        #         with torch.no_grad():
        #             Q_target = rewards + gamma * (torch.max(self.qnetwork_target(next_states), dim=1)[0].view(64,1))*(1 - dones)
        #             Q_target[dones == True] = rewards[dones == True]
        #         Q_pred = torch.max(self.qnetwork_local(states), dim=1)[0].view(64,1)

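        # Double DQN target implemented below:
        #     y = r + gamma * Q_target(s', argmax_a Q_local(s', a)) * (1 - done)
        # selecting the greedy next action with the online network and
        # evaluating it with the target network reduces the overestimation
        # bias of plain DQN.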
        ## Double DQN
        # argmax over the local (online) network's Q values for the next states
        best_actions_by_local_nn = torch.max(
            self.qnetwork_local(next_states).detach(), dim=1)[1].unsqueeze(1)
        # evaluate those actions with the target network
        action_values_by_target_nn = self.qnetwork_target(
            next_states).detach().gather(1, best_actions_by_local_nn)
        Q_target = rewards + gamma * action_values_by_target_nn * (1 - dones)

        Q_pred = self.qnetwork_local(states).gather(1, actions)

        optimizer.zero_grad()
        loss = loss_fn(Q_pred, Q_target)
        loss.backward()
        optimizer.step()

        #         print("Loss=", loss.item())
        #         print("Loss=", loss,
        #               "Local params L2=", torch.norm(self.qnetwork_local.parameters(), 2),
        #               "Local params grad L2=", torch.norm(self.qnetwork_local.parameters().grad, 2))

        #         with torch.no_grad():
        #             for param in self.qnetwork_local.parameters():
        #                 param -= learning_rate * param.grad

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 11
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   seed, ALPHA)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Initialize learning step for updating beta
        self.learn_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get prioritized subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, BETA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, beta):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            beta (float): initial value for beta, which controls how much importance weights affect learning
        """
        states, actions, rewards, next_states, dones, probabilities, indices = experiences

        if double_dqn:
            # Get the Q values for each next_state, action pair from the
            # local/online/behavior Q network:
            Q_targets_next_local = self.qnetwork_local(
                next_states).detach()
            # Get the corresponding best action for those next_states:
            _, a_prime = Q_targets_next_local.max(1)

            # Get the Q values from the target Q network but following a_prime,
            # which belongs to the local network, not the target network:
            Q_targets_next = self.qnetwork_target(next_states).detach()
            Q_targets_next = Q_targets_next.gather(1, a_prime.unsqueeze(1))

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(
                next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute and update new priorities
        new_priorities = (abs(Q_expected - Q_targets) +
                          EPSILON_PER).detach()
        self.memory.update_priority(new_priorities, indices)

        # Update beta parameter (b). By default beta will reach 1 after
        # 25,000 training steps (~325 episodes in the Banana environment):
        b = min(1.0, beta + self.learn_step * (1.0 - beta) / BETA_ITERS)
        self.learn_step += 1

        # Compute and apply importance sampling weights to TD Errors
        ISweights = (((1 / len(self.memory)) * (1 / probabilities))**b)
        max_ISweight = torch.max(ISweights)
        ISweights /= max_ISweight
        Q_targets *= ISweights
        Q_expected *= ISweights

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
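
# A small standalone sketch of the importance-sampling weights computed in
# learn() above; the buffer size and sampling probabilities below are made-up
# illustrative values, not taken from the example.
import torch

N = 4                                                       # replay-buffer size
probabilities = torch.tensor([[0.4], [0.3], [0.2], [0.1]])  # sampling P(i)
beta = 0.5
is_weights = ((1 / N) * (1 / probabilities))**beta          # w_i = (1/(N*P(i)))^beta
is_weights /= is_weights.max()                              # normalize by max_j w_j
# transitions that are sampled most often receive the smallest weights, which
# corrects the bias introduced by prioritized (non-uniform) sampling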
Exemplo n.º 12
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 hidden_layers,
                 buffer_size=int(1e6),
                 batch_size=32,
                 gamma=.99,
                 tau=1,
                 lr=2.5e-4,
                 update_local=4,
                 update_target=10000,
                 ddqn=False,
                 seed=1):
        """Initialize Agent object

        Params
        ======
            state_size (int): Dimension of states
            action_size (int): Dimension of actions
            hidden_layers (list of ints): number of nodes in the hidden layers
            buffer_size (int): size of replay buffer
            batch_size (int): size of sample
            gamma (float): discount factor
            tau (float): (soft) update of target parameters
            lr (float): learning rate
            update_local (int): update local after every x steps
            update_target (int): update target after every x steps
            ddqn (boolean): Double Deep Q-Learning
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Hyperparameters
        self.buffer_size = buffer_size  # replay buffer
        self.batch_size = batch_size  # minibatch size
        self.gamma = gamma  # discount factor
        self.tau = tau  # (soft) update of target parameters
        self.lr = lr  # learning rate
        self.update_local = update_local  # update local network after every x steps
        self.update_target = update_target  # update target network with local network weights

        # Q Network
        self.qnet_local = \
            QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnet_target = \
            QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=lr)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # Initialize time step
        self.t_step = 0

        # Double Deep Q-Learning flag
        self.ddqn = ddqn

    def step(self, state, action, reward, next_state, done):

        # Save experience in replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE LOCAL time steps
        self.t_step += 1
        if self.t_step % self.update_local == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                sample = self.memory.sample()
                if self.t_step % self.update_target == 0:
                    do_target_update = True
                else:
                    do_target_update = False
                self.__learn(sample, self.gamma, do_target_update)

    def act(self, state, epsilon=0):
        """Returns action given a state according to local Q Network (current policy)

        Params
        ======
            state (array_like): current state
            epsilon (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(state)
        self.qnet_local.train()

        # Epsilon greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def __learn(self, sample, gamma, do_target_update):
        """Update value parameters using given batch of sampled experiences tuples

        Params
        ======
            sample (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = sample

        if not self.ddqn:

            # Get max predicted Q values (for next states) from target model
            Q_targets_next = \
                self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)

        else:
            # Get actions (for next states) with max Q values from local net
            next_actions = \
                self.qnet_local(next_states).detach().max(1)[1].unsqueeze(1)

            # Get predicted Q values from target model
            Q_targets_next = \
                self.qnet_target(next_states).gather(1, next_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnet_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        if do_target_update:
            self.__target_net_update(self.qnet_local, self.qnet_target,
                                     self.tau)

    def __target_net_update(self, local_net, target_net, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param \
            in zip(target_net.parameters(), local_net.parameters()):
            target_param.data.\
                copy_(tau*local_param.data + (1.0 - tau)*target_param.data)

    def get_info(self):
        output = """
            Replay Buffer size: {} \n
            Batch size: {} \n
            Discount factor: {} \n
            tau: {} \n
            Learning Rate: {} \n
            Update local network after every {} steps \n
            Update target network with local network parameters after every {} steps \n
            DDQN: {}
        """
        print(
            output.format(self.buffer_size, self.batch_size, self.gamma,
                          self.tau, self.lr, self.update_local,
                          self.update_target, self.ddqn))
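
# A minimal usage sketch for the agent above (hypothetical: the environment
# object `env`, its API and the chosen sizes/hyperparameters are assumptions,
# not part of the original example):
#
#     agent = Agent(state_size=8, action_size=4, hidden_layers=[64, 64],
#                   update_local=4, update_target=1000, ddqn=True)
#     agent.get_info()
#     state = env.reset()
#     while True:
#         action = agent.act(state, epsilon=0.1)
#         next_state, reward, done = env.step(action)
#         agent.step(state, action, reward, next_state, done)
#         state = next_state
#         if done:
#             break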
Exemplo n.º 13
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        # Note that both training and the target-network update happen every C
        # iterations here; in the original algorithm, training happens every
        # iteration while only the target update happens every C iterations.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                # ------------------- train with mini-batch sample of experiences ------------------- #
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        # - qnetwork_target : apply a forward pass for the whole mini-batch
        # - detach : do not backpropagate through the target network
        # - max(1) : maximum over the action dimension for each sample
        # - [0].unsqueeze(1) : keep the max values and reshape them into a
        #   column vector of shape (batch_size, 1)
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states (y)
        # - dones : detect if the episode has finished
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (Q(Sj, Aj, w))
        # - gather : for each sample select only the output value for action Aj
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Optimize over (yj-Q(Sj, Aj, w))^2
        # * compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # * minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
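
# For reference, the update implemented in learn() above is the standard DQN
# update:
#     y_j  = r_j + gamma * max_a Q_target(S_{j+1}, a; w-) * (1 - done_j)
#     loss = mean_j ( y_j - Q_local(S_j, A_j; w) )^2
# where w- denotes the target-network weights, which are soft-updated towards
# the local weights with rate TAU after every learning step.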
Exemplo n.º 14
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, checkpoint_path=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        # Load the model only when the checkpoint is available
        if checkpoint_path is not None:
            self.qnetwork_local.load_state_dict(torch.load(checkpoint_path))
            print("Checkpoint loaded successfully")

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        self.optimizer.zero_grad()
        #target
        with torch.no_grad():
            #Double DQN
            ddqn_max_indices = self.qnetwork_local(next_states).max(dim=1)[1]
            target_op = self.qnetwork_target(next_states)
            target_op = target_op.gather(1, ddqn_max_indices.view(-1, 1))
            '''
            # DQN
                target_op = self.qnetwork_target(next_states).max(dim=1)[0].view(-1,1)
            '''
            targets = rewards + target_op * (1 - dones) * gamma

        predictions = self.qnetwork_local(states)
        predictions = predictions.gather(1, actions.view(-1, 1))
        loss = torch.nn.MSELoss()(predictions, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
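
# The checkpoint consumed by __init__ above can be produced after training
# with a call such as the following (the file name is only an example):
#
#     torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
#
# and restored later with Agent(state_size, action_size, seed,
# checkpoint_path='checkpoint.pth').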
Exemplo n.º 15
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, compute_weights=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.compute_weights = compute_weights

        # Algorithms to enable during training
        self.PrioritizedReplayBuffer = True  # Use False to enable uniform sampling
        self.HardTargetUpdate = True  # Use False to enable soft target update

        # building the policy and target Q-networks for the agent, such that the target Q-network is kept frozen to avoid the training instability issues
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)  # main policy network
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)  # target network
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.criterion = nn.MSELoss()

        # Replay memory
        # building the experience replay memory used to avoid training instability issues
        # Below: PER
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, seed,
                                   compute_weights)

        # Below: Uniform by method defined in this script
        #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                sampling = self.memory.sample()
                self.learn(sampling, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, state, eps=0.):
        """A function to select an action based on the Epsilon greedy policy. Epislon percent of times the agent will select a random
        action while 1-Epsilon percent of the time the agent will select the action with the highest Q value as predicted by the
        neural network.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Calculate the action values (Q values) for the current state
        self.qnetwork_local.eval()  # eval() disables dropout/batch-norm updates
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch the network back to training mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, sampling, gamma):
        """Update value parameters using given batch of experience tuples.
        Function for training the neural network; it updates the weights of the network.

        Params
        ======
            sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, indices = sampling

        # Max predicted Q values for the next states from the target network
        q_target = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # TD targets for the current states
        expected_values = rewards + gamma * q_target * (1 - dones)
        # Q values predicted by the local network for the taken actions
        output = self.qnetwork_local(states).gather(1, actions)
        # Compute the loss (mean squared error between predictions and targets)
        loss = F.mse_loss(output, expected_values)
        if self.compute_weights:
            with torch.no_grad():
                weight = sum(np.multiply(weights, loss.data.cpu().numpy()))
            loss *= weight
        # Minimizing the loss by optimizer
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities ------------------- #
        delta = abs(expected_values - output.detach()).cpu().numpy()
        #print("delta", delta)
        self.memory.update_priorities(delta, indices)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    # def hard_update(self):
    # """ This hard_update method performs direct update of target network
    # weight update from local network weights instantly"""

    # Write the algorithm here
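
    # A minimal sketch of the hard update described in the stub above (added
    # as an illustration; the original example leaves the method to be
    # written): copy the local-network weights into the target network.
    def hard_update(self):
        """Hard update: θ_target = θ_local (direct copy of the weights)."""
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)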

    def load_models(self, policy_net_filename, target_net_filename):
        """ Function to load the parameters of the policy and target models """
        print('Loading model...')
        self.qnetwork_local.load_model(policy_net_filename)
        self.qnetwork_target.load_model(target_net_filename)
Exemplo n.º 16
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, compute_weights=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.compute_weights = compute_weights

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.criterion = nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, seed,
                                   compute_weights)
        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                sampling = self.memory.sample()
                self.learn(sampling, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, sampling, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, indices = sampling

        # Compute and minimize the loss
        q_target = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        expected_values = rewards + gamma * q_target * (1 - dones)
        output = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(output, expected_values)
        if self.compute_weights:
            with torch.no_grad():
                weight = sum(np.multiply(weights, loss.data.cpu().numpy()))
            loss *= weight
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities ------------------- #
        delta = abs(expected_values - output.detach()).cpu().numpy()
        #print("delta", delta)
        self.memory.update_priorities(delta, indices)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 17
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64,
                 gamma=0.99, tau=1e-3, lr=5e-4, update_every=4, 
                 double_dqn=False, dueling_dqn=False, prioritized_replay=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            double_dqn (bool): use double Q-network when 'True'
            dueling_dqn (bool): use dueling Q-network when 'True'
            prioritized_replay (bool): use prioritized replay buffer when 'True'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.double_dqn = double_dqn
        self.dueling_dqn = dueling_dqn
        self.prioritized_replay = prioritized_replay

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, prioritized_replay=prioritized_replay)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for a given state as per the current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using a given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        if self.prioritized_replay:
            states, actions, rewards, next_states, dones, indices, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q-values (for next states) from target model
        if self.double_dqn:
            # Use local model to choose an action, and target model to evaluate that action
            Q_local_max = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_local_max)
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q-targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q-values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        if self.prioritized_replay:
            priorities = np.sqrt(loss.detach().cpu().data.numpy())
            self.memory.update_priorities(indices, priorities)
            loss = loss * weights
            loss = loss.mean()
            
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): where weights will be copied from
            target_model (PyTorch model): where weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
Exemplo n.º 18
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, temperature,
                 type_of_update, type_of_loss):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.type_of_loss = type_of_loss
        print(self.type_of_loss)
        self.type_of_update = type_of_update
        print(self.type_of_update)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.temperature = temperature
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.total_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        self.total_step = (self.total_step + 1) % TARGET_UPDATE
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                #                import pdb
                #                 pdb.set_trace()
                #                 whole_experiences = list(self.memory[:])
                #                pdb.set_trace()
                #sample_weights = self.learn_sample_weights(self.memory.memory, GAMMA)

                #sample_weights=torch.nn.functional.softmax(sample_weights/self.temperature)
                #                 if min(sample_weights)<10*(-6):
                #                     addition=min(sample_weights)
                #                 else:
                #                     addition=10*(-6)
                #                 sample_weights=sample_weights+addition/100.0

                #sample_weights=sample_weights.detach().numpy()

                #experiences = self.memory.sample(sample_weights)
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
                if self.type_of_update == 'hard':
                    if self.total_step == 0:
                        self.hard_update(self.qnetwork_local,
                                         self.qnetwork_target)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))


#     def learn_sample_weights(self, experiences, gamma):
#         states, actions, rewards, next_states, dones = turn_experiences_into_subcategories(experiences)

#         # Get max predicted Q values (for next states) from target model
#         Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
#         # Compute Q targets for current states
#         Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

#         # Get expected Q values from local model
#         Q_expected = self.qnetwork_local(states).gather(1, actions)

#         sample_weights=abs(Q_targets-Q_expected).reshape(-1,)
#         return sample_weights

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        if self.type_of_update == 'same':
            Q_targets_next = self.qnetwork_local(next_states).detach().max(
                1)[0].unsqueeze(1)
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        #import pdb
        #pdb.set_trace()
        sample_weights = abs(Q_targets - Q_expected).reshape(-1, )

        # Compute loss
        if self.type_of_loss == 'mse':
            loss = F.mse_loss(Q_expected, Q_targets)
        elif self.type_of_loss == 'huber':
            loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.type_of_update == 'soft':
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def hard_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 19
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, lr_decay=0.9999):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if USE_DUELING_NETWORK:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed, [128, 32],
                                                  [64, 32]).to(device)

            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed, [128, 32],
                                                   [64, 32]).to(device)
            self.qnetwork_target.eval()

        else:
            self.qnetwork_local = QNetwork(state_size,
                                           action_size,
                                           seed,
                                           fc1_units=128,
                                           fc2_units=32).to(device)

            self.qnetwork_target = QNetwork(state_size,
                                            action_size,
                                            seed,
                                            fc1_units=128,
                                            fc2_units=32).to(device)
            self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Replay memory
        if USE_PRIORITIZED_REPLAY_BUFFER:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        if USE_DOUBLE_DQN:
            self.qnetwork_local.eval()
            Q_local = self.qnetwork_local(next_states)
            greedy_actions = torch.argmax(Q_local, axis=1).unsqueeze(1)
            self.qnetwork_local.train()

            Q_targets_next = self.qnetwork_target(next_states)
            Q_targets_next = Q_targets_next.gather(1, greedy_actions)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if USE_PRIORITIZED_REPLAY_BUFFER:
            Q_targets.sub_(Q_expected)
            Q_targets.squeeze_()
            Q_targets.pow_(2)

            with torch.no_grad():
                td_error = Q_targets.detach()
                td_error.pow_(0.5)
                self.memory.update_priorities(td_error)

            Q_targets.mul_(w)
            loss = Q_targets.mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Back-propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
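
# Note on the scheduler above: optim.lr_scheduler.ExponentialLR multiplies the
# learning rate by lr_decay at every lr_scheduler.step() call, so after n
# learning steps the effective learning rate is LR * lr_decay**n (with the
# default lr_decay=0.9999, roughly 0.6 * LR after 5,000 learning steps).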
Exemplo n.º 20
0
class ddqn_dual_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
       
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = dqn_ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        #detach: Returns a new Tensor, detached from the current graph.
        #The result will never require gradient.
        yj = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1)
        # print("shape", self.qnetwork_target.forward(next_states).detach())
        Q_targets = rewards + gamma * yj * (1.0 - dones)
 
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)
 
        # Compute loss: Mean Square Error by element
        loss = F.mse_loss(Q_expected, Q_targets)
  
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
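Every example ends with the same soft_update rule. The short standalone check below (plain PyTorch, toy one-weight models, illustrative tau) shows the target weights moving a fraction tau of the way toward the local weights on each call:

import torch
import torch.nn as nn

local_model = nn.Linear(1, 1, bias=False)
target_model = nn.Linear(1, 1, bias=False)
with torch.no_grad():
    local_model.weight.fill_(1.0)
    target_model.weight.fill_(0.0)

tau = 0.1
for step in range(1, 4):
    # same rule as soft_update above: theta_target <- tau*theta_local + (1 - tau)*theta_target
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
    print(f"after {step} soft update(s): target weight = {target_model.weight.item():.3f}")

# prints 0.100, 0.190, 0.271: the target weight approaches the local weight geometrically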
Exemplo n.º 21
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()  # switch the local net to eval mode
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch the local net back to train mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # Get max predicted Q values (for next states) from target model
        target_q_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        """
        # disregard action, get best value! 
        # why so many next states? answer: the qnetwork will return each corresponding next states action, the max will pick from each the           best action
        
        # explanation on detach (https://discuss.pytorch.org/t/detach-no-grad-and-requires-grad/16915/7)
        """
        # Compute Q targets for current states
        target_q = rewards + (gamma * target_q_next * (1 - dones))

        # Get expected Q values from local model
        expected_q = self.qnetwork_local(states).gather(1, actions)
        """
        this uses gather instead of detach like target since it only give a s*** to action taken
        # explanation on gather (https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms)
        """
        # Compute loss
        loss = F.mse_loss(expected_q, target_q)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
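The gather usage referenced in the comments above is easiest to see on toy tensors. A minimal standalone illustration of why expected Q values use gather (the action that was actually taken) while targets use max (the best available action):

import torch

# Q values for a batch of 3 states and 4 actions
q_values = torch.tensor([[0.1, 0.5, 0.2, 0.9],
                         [1.0, 0.3, 0.8, 0.4],
                         [0.0, 0.7, 0.6, 0.2]])
# the action actually taken in each state (shape [batch_size, 1])
actions = torch.tensor([[2], [1], [0]])

# gather picks, for each row, the Q value in the column given by `actions`
print(q_values.gather(1, actions))       # tensor([[0.2000], [0.3000], [0.0000]])

# max is what the *target* side uses: the best value regardless of the action taken
print(q_values.max(1)[0].unsqueeze(1))   # tensor([[0.9000], [1.0000], [0.7000]])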
Exemplo n.º 22
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, enable_curiosity):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.enable_curiosity = enable_curiosity

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        # Curiosity Elements
        self.fwd_model = FwdModel(state_size, action_size, seed).to(device)
        self.inverse_model = InverseModel(state_size, action_size,
                                          seed).to(device)

        ##Optimizer
        params_to_opt = list(self.qnetwork_local.parameters()) + list(
            self.fwd_model.parameters()) + list(
                self.inverse_model.parameters())
        self.optimizer = optim.Adam(params_to_opt, lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        self.loss_list = []

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        ##Now get the result of evaluating the forward model
        ##FIXME: do I need to normalize state? Probably!
        act_onehot = torch.FloatTensor(BATCH_SIZE, self.action_size).to(device)
        act_onehot.zero_()
        act_onehot.scatter_(1, actions, 1)
        ns_expected = self.fwd_model(states, act_onehot)

        ##Now get the result of evaluating the inverse model
        a_expected = self.inverse_model(states, next_states)

        # Compute loss
        # extrinsic (DQN) loss
        loss1 = F.mse_loss(Q_expected, Q_targets)

        #inverse model loss
        criterion = torch.nn.CrossEntropyLoss()
        loss2 = criterion(a_expected, torch.squeeze(actions))

        #forward model loss
        loss3 = F.mse_loss(ns_expected, next_states)

        if self.enable_curiosity:
            loss1 = loss1 * EXTRINSIC_WEIGHT
            loss2 = loss2 * INVERSE_WEIGHT
            loss3 = loss3 * FORWARD_WEIGHT

            loss = loss1 + loss2 + loss3
        else:
            loss = loss1

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.loss_list.append((loss1, loss2, loss3))

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
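The curiosity example above one-hot encodes the sampled actions with scatter_ before feeding them to the forward model. The standalone sketch below reproduces that encoding on toy data; the F.one_hot variant is shown only as an equivalent alternative, not as what the example itself uses:

import torch
import torch.nn.functional as F

batch_size, action_size = 4, 3
actions = torch.tensor([[2], [0], [1], [2]])   # shape [batch_size, 1], as sampled from the buffer

# scatter_-based one-hot encoding, as in the curiosity agent above
act_onehot = torch.zeros(batch_size, action_size)
act_onehot.scatter_(1, actions, 1)
print(act_onehot)

# equivalent alternative using F.one_hot on the flattened action indices
print(F.one_hot(actions.squeeze(1), num_classes=action_size).float())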
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        print("Running on: " + str(device))

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)

        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## DONE: compute and minimize the loss
        "*** YOUR CODE HERE ***"

        with torch.no_grad():
            # calculate the target rewards for the next_states
            target_rewards = self.qnetwork_target(next_states)
            # select the maximum reward for each next_state
            target_rewards = target_rewards.max(1)[0]
            # change shape: [batch_size] --> [batch_size, 1]
            target_rewards = target_rewards.unsqueeze(1)
            # calculate the discounted target rewards
            target_rewards = rewards + (gamma * target_rewards * (1 - dones))

        # calculate the expected rewards for each action for the states
        expected_rewards = self.qnetwork_local(
            states)  # shape: [batch_size, action_size]
        # get the reward for the action selected for each state
        expected_rewards = expected_rewards.gather(
            1, actions)  # shape: [batch_size, 1]

        # calculate the loss
        loss = F.mse_loss(expected_rewards, target_rewards)

        # perform the back-propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 24
0
class Vanilla():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 hidden_layers,
                 seed,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE,
                 gamma=GAMMA,
                 lr=LR,
                 update_every=UPDATE_EVERY):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.hidden_layers = hidden_layers
        self.buffer_size = int(buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.update_every = update_every

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Is the line below required? Don't think so; it looks like a no-op ...
                #action_values = self.qnetwork_local(experiences[0])
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # "unsqueeze" set the batch_size dim which is one here
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        # Should we multiply by weights now -- TEMP
        loss = F.mse_loss(Q_expected, Q_targets)

        # Somewhere here update priorities -- TEMP

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 25
0
class AbstractAgent(metaclass=ABCMeta):
    """Abstract Base Agent"""
    def __init__(self, state_size, action_size, memory, seed, configs):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.lr = configs['LR']
        self.update_every = configs['UPDATE_EVERY']
        self.batch_size = configs['BATCH_SIZE']
        self.gamma = configs['GAMMA']
        self.tau = configs['TAU']

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        # ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.memory = memory
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Loss
        self.criterion = nn.MSELoss()

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self._learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    @abstractmethod
    def _learn(self, experiences, gamma):
        raise NotImplementedError
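AbstractAgent leaves _learn abstract. The class below is not part of the original examples: it is a minimal concrete subclass sketched here for illustration, assuming the attributes created in AbstractAgent.__init__ above (qnetwork_local, qnetwork_target, optimizer, criterion, tau) and filling _learn with the standard DQN update used elsewhere in this document.

class DQNAgent(AbstractAgent):
    """Concrete agent using the plain DQN target (illustrative sketch)."""

    def _learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Max predicted Q values for the next states from the (detached) target network
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Expected Q values of the actions actually taken, from the local network
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # MSE loss via the criterion created in AbstractAgent.__init__
        loss = self.criterion(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Soft-update the target network toward the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)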
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(
                state)  #same as self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        #"*** YOUR CODE HERE ***"
        qs_local = self.qnetwork_local.forward(states)
        qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long),
                             actions.reshape(BATCH_SIZE)]
        qsa_local = qsa_local.reshape((BATCH_SIZE, 1))
        #print(qsa_local.shape)

        # # DQN Target
        # qs_target = self.qnetwork_target.forward(next_states)
        # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target
        # #print(qsa_target.shape, TD_target.shape, rewards.shape)

        # # Double DQN Target ver 1
        # qs_target = self.qnetwork_target.forward(next_states)
        # if random.random() > 0.5:
        #     _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        #     qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)]
        # else:
        #     _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning)
        #     #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]
        #     ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]

        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target

        # Double DQN Target ver 2 (based upon double dqn paper)
        qs_target = self.qnetwork_target.forward(next_states)
        _, qsa_local_argmax_a = torch.max(
            qs_local, dim=1)  #using the greedy policy (q-learning)
        qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long),
                               qsa_local_argmax_a.reshape(BATCH_SIZE)]

        qsa_target = qsa_target * (
            1 - dones.reshape(BATCH_SIZE)
        )  #target qsa value is zero when episode is complete
        qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        TD_target = rewards + gamma * qsa_target

        #print(qsa_target.shape, TD_target.shape, rewards.shape)

        # #Udacity's approach
        # # Get max predicted Q values (for next states) from target model
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # # Compute Q targets for current states
        # TD_target = rewards + (gamma * Q_targets_next * (1 - dones))
        # # Get expected Q values from local model
        # qsa_local = self.qnetwork_local(states).gather(1, actions)

        #diff = qsa_local - TD_target
        #loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar
        loss = F.mse_loss(qsa_local,
                          TD_target)  #much faster than the above loss function
        #print(loss)
        #minimize the loss
        self.optimizer.zero_grad()  #clears the gradients
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
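The example above selects the Q values of the taken actions with arange-based fancy indexing rather than gather, which most of the other examples use. A small standalone check (toy tensors only) that the two are equivalent:

import torch

BATCH_SIZE = 3
q_values = torch.tensor([[0.1, 0.5, 0.2],
                         [1.0, 0.3, 0.8],
                         [0.0, 0.7, 0.6]])
actions = torch.tensor([[1], [2], [0]])  # shape [BATCH_SIZE, 1]

# arange-based fancy indexing, as in the example above
via_indexing = q_values[torch.arange(BATCH_SIZE, dtype=torch.long),
                        actions.reshape(BATCH_SIZE)].reshape((BATCH_SIZE, 1))

# gather, as in most of the other examples
via_gather = q_values.gather(1, actions)

print(torch.equal(via_indexing, via_gather))  # True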
Exemplo n.º 27
0
class AgentUniform():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, hidden_layers, lr=5e-4):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        self.gamma = GAMMA

        # Q-Network
        self.lr = lr
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed,
                                       hidden_layers).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed,
                                        hidden_layers).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        self.checkpoint = {
            "input_size":
            self.state_size,
            "output_size":
            self.action_size,
            "hidden_layers":
            [each.out_features for each in self.qnetwork_local.hidden_layers],
            "state_dict":
            self.qnetwork_local.state_dict()
        }
        self.checkpointfile = 'vanilla_dpq.pth'

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn NUM_LEARNS times per every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) >= MIN_BUF_SIZE:
            for i in range(NUM_LEARNS):
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int32)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int32)

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_qtarget(self):
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)

    def set_lr(self, lr):
        self.lr = lr

    def load_model(self, filepath):
        checkpoint = torch.load(filepath)

        self.qnetwork_local = QNetwork(checkpoint["input_size"],
                                       checkpoint["output_size"], self.seed,
                                       checkpoint["hidden_layers"])
        self.qnetwork_local.load_state_dict(checkpoint["state_dict"])

    def get_gamma(self):
        return self.gamma

    def save_model(self):
        torch.save(self.checkpoint, self.checkpointfile)
Exemplo n.º 28
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        h_size = 128
        self.qnetwork_local = QNetwork(state_size, action_size, h_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, h_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.criterion = torch.nn.MSELoss()
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # target Q values for the next states: keep the best action value per state
        with torch.no_grad():
            action_values = self.qnetwork_target(next_states)
            max_idx = torch.argmax(action_values, 1).unsqueeze(1)
            y_targets = action_values.gather(1, max_idx)

        values = rewards + (1 - dones) * gamma * (
            y_targets)  # if done (=1) then we just use reward value

        y_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(y_expected, values)

        self.optimizer.zero_grad()
        # back prop
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 29
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, fc1_units = 64, fc2_units = 64):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            fc1_units (int): fully connected layer 1 size
            fc2_units (int): fully connected layer 2 size
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = random.randint(1,1000)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device)
        print(self.qnetwork_local.parameters)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
#             return np.argmax(action_values.cpu().data.numpy()).astype('int') #int32 here makes unity env happy
            return int(np.argmax(action_values.cpu().data.numpy()))   #try casting this to native python int to make unity happy
        else:
#             return random.choice(np.arange(self.action_size)).astype('int')
            return int(random.choice(np.arange(self.action_size)))    #try casting this to native python int to make unity happy

########################
# The commented return lines are believed to have caused errors on the Udacity reviewer's hardware. The int64 that comes out of np.argmax
# by default appears to have caused the same issues on my own hardware. Casting to int32 fixed that for me. Now trying a native Python int
# (see the small dtype demo after this class).
########################

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
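The dtype issue noted in the comment block above (np.argmax returning a NumPy integer that some environments reject) can be reproduced directly; a tiny standalone check, assuming only NumPy:

import numpy as np

action_values = np.array([[0.1, 0.9, 0.3, 0.2]], dtype=np.float32)
a = np.argmax(action_values)

print(type(a))                    # a NumPy integer scalar (e.g. numpy.int64 on most platforms)
print(type(a.astype(np.int32)))   # numpy.int32 after the explicit cast
print(type(int(a)))               # <class 'int'>, a native Python int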
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## Compute and minimize the loss (filled in with the standard DQN update
        ## used throughout the other examples)
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class DDQNAgentPrioExpReplay:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=PARAM.LR)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(action_size, 20000,
                                              PARAM.BATCH_SIZE, 0,
                                              PARAM.PROBABILITY_EXPONENT)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.eps = 1

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY
        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > PARAM.BATCH_SIZE:
                experiences, experience_indices, importance_weights = self.memory.sample(
                )
                self.learn(experiences, experience_indices, importance_weights,
                           PARAM.GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        self.eps = eps
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def get_ddqn_targets(self, next_states, rewards, gamma, dones):
        # get best action according to online value function approximation
        q_online = self.qnetwork_local(next_states).detach()
        q_online = q_online.argmax(1).unsqueeze(1)  # shape: [batch_size, 1]

        # get value of the target network at the position of the best online action;
        # gather pairs each row with its own greedy action (index_select would apply
        # the same column indices to every row, which is not what we want here)
        q_target = self.qnetwork_target(next_states).detach()
        q_target = q_target.gather(1, q_online)  # shape: [batch_size, 1]

        # calculate more correct q-value given the current reward
        Q_targets = rewards + (gamma * q_target * (1 - dones))

        return Q_targets

    def learn(self, experiences, experience_indices, importance_weights,
              gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences
        Q_targets = self.get_ddqn_targets(next_states, rewards, gamma, dones)

        # Get expected Q values
        q_exp = self.qnetwork_local(states)
        # print(q_exp)

        # gathers the Q values along dimension 1 according to the actions, which are used as indices
        # >>> t = torch.tensor([[1,2],[3,4]])
        # >>> torch.gather(t, 1, torch.tensor([[0],[1]]))
        # tensor([[ 1],
        #        [ 4]])
        q_exp = q_exp.gather(1, actions)
        # print(q_exp)

        error = torch.abs(q_exp - Q_targets)

        with torch.no_grad():
            # update priority
            # we need ".cpu()" here because the values need to be copied to memory before converting them to numpy,
            # else they are just present in the GPU
            errors = np.squeeze(error.cpu().data.numpy())
            self.memory.set_priorities(experience_indices, errors)

        # compute loss
        squared_error = torch.mul(error, error)
        with torch.no_grad():
            w = torch.from_numpy(
                importance_weights**(1 - self.eps)).float().to(device)
            w = w.detach()

        squared_error = torch.squeeze(squared_error)
        weighted_squared_error = torch.mul(squared_error, w)
        loss = torch.mean(weighted_squared_error)
        #loss = F.mse_loss(q_exp, Q_targets)

        # reset optimizer gradient
        self.optimizer.zero_grad()
        # do backpropagation
        loss.backward()
        # do optimize step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        # according to the algorithm in
        # https://proceedings.neurips.cc/paper/2010/file/091d584fced301b442654dd8c23b3fc9-Paper.pdf
        # one should update randomly in either direction
        #update_direction = np.random.binomial(1, 0.5)
        #if update_direction == 0:
        #    self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU)
        #else:
        #    self.soft_update(self.qnetwork_target, self.qnetwork_local, PARAM.TAU)

        self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
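None of the examples show the driver loop that actually calls act() and step(). The sketch below is a generic epsilon-greedy training loop of the kind these agents are usually trained with; it assumes a classic Gym-style env (reset() returning a state, step() returning a 4-tuple), and the parameter names (n_episodes, eps_decay, ...) are illustrative rather than taken from any example above.

from collections import deque
import numpy as np

def train(agent, env, n_episodes=2000, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Generic epsilon-greedy training loop for the agents above (Gym-style env assumed)."""
    scores, scores_window, eps = [], deque(maxlen=100), eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)        # environment transition
            agent.step(state, action, reward, next_state, done)   # store and (maybe) learn
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
        scores_window.append(score)
        eps = max(eps_end, eps_decay * eps)                       # decay exploration
        if i_episode % 100 == 0:
            print(f"Episode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}")
    return scores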