Example #1
class DDQN(nn.Module):
    def __init__(self, obs, ac, config):

        super().__init__()

        self.q = QNetwork(obs, ac)
        self.target = QNetwork(obs, ac)

        self.target.load_state_dict(self.q.state_dict())

        self.target_net_update_freq = config.target_net_update_freq
        self.update_counter = 0

    def get_action(self, x):

        with torch.no_grad():
            a = self.q(x).max(1)[1]

        return a.item()

    def update_policy(self, adam, memory, params):

        b_states, b_actions, b_rewards, b_next_states, b_masks = memory.sample(
            params.batch_size)

        states = torch.tensor(b_states).float()
        actions = torch.tensor(b_actions).long().reshape(-1, 1)
        rewards = torch.tensor(b_rewards).float().reshape(-1, 1)
        next_states = torch.tensor(b_next_states).float()
        masks = torch.tensor(b_masks).float().reshape(-1, 1)

        # Q-values for the actions actually taken
        current_q_values = self.q(states).gather(1, actions)

        # Bootstrapped target from the target network (no gradients needed).
        # Note: despite the class name, this is the plain DQN target -- the max
        # is taken over the target network's own Q-values.
        with torch.no_grad():
            max_next_q_vals = self.target(next_states).max(1)[0].reshape(-1, 1)

        # 0.99 is the hard-coded discount factor in this example
        expected_q_vals = rewards + max_next_q_vals * 0.99 * masks
        loss = F.mse_loss(current_q_values, expected_q_vals)

        adam.zero_grad()
        loss.backward()

        for p in self.q.parameters():
            p.grad.data.clamp_(-1., 1.)
        adam.step()

        self.update_counter += 1
        if self.update_counter % self.target_net_update_freq == 0:
            self.update_counter = 0
            self.target.load_state_dict(self.q.state_dict())
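
# The QNetwork wrapped by DDQN above is not shown. A minimal sketch that is
# compatible with the way it is used (constructed as QNetwork(obs, ac) and
# returning one Q-value per action) could look like the following; the hidden
# sizes are an assumption, not part of the original example.
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    def __init__(self, obs_dim, n_actions, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.out = nn.Linear(hidden, n_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)  # shape: [batch, n_actions]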
Example #2
class DQNAgent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        self.policy_network = QNetwork(state_size, action_size).to(device)
        self.target_network = QNetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=LR)

        self.eps = EPS_START
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        self.learn_count = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.store_transition(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample(BATCH_SIZE, device)
            self.learn(experiences)

    def act(self, state):
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_size)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            action_values = self.policy_network(state)
            return torch.argmax(action_values).item()

    def update_eps(self):
        self.eps = max(EPS_END, EPS_DECAY * self.eps)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_current = self.policy_network(states).gather(1, actions)

        Q_targets_next = self.target_network(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        loss = F.mse_loss(Q_current, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.learn_count += 1
        if self.learn_count % SYNC_TARGET_EVERY == 0:
            self.target_network.load_state_dict(
                self.policy_network.state_dict())
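
# ReplayBuffer is referenced by DQNAgent above but not defined here. A minimal
# sketch that matches the interface actually used (store_transition, __len__,
# and sample(batch_size, device) returning batched tensors) might look like
# this; the tensor shapes are chosen to fit the gather/broadcast calls above.
import random
from collections import deque

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def store_transition(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        return len(self.buffer)

    def sample(self, batch_size, device):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.tensor(np.array(states), dtype=torch.float32, device=device)
        actions = torch.tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device).unsqueeze(1)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=device)
        dones = torch.tensor(dones, dtype=torch.float32, device=device).unsqueeze(1)
        return states, actions, rewards, next_states, dones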
Example #3
class SAC(object):
    def __init__(self, num_inputs, action_space, config):

        self.gamma = config['gamma']
        self.tau = config['tau']
        self.alpha = config['alpha']

        self.policy_type = config['policy']
        self.target_update_interval = config['target_update_interval']
        self.automatic_entropy_tuning = config['automatic_entropy_tuning']

        self.device = torch.device(
            'cuda:' + str(config['cuda'])) if torch.cuda.is_available(
            ) and config['cuda'] >= 0 else torch.device('cpu')

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               config['hidden_size']).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=config['lr'])

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      config['hidden_size']).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=config['lr'])

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         config['hidden_size'],
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=config['lr'])

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval == False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # Optimize both Q-functions with a single backward pass
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self, save_path=None, env_name=None, suffix=None):
        if save_path is None:
            save_path = './models/'
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        actor_path = '{}actor_{}_{}'.format(save_path, env_name, suffix)
        critic_path = '{}critic_{}_{}'.format(save_path, env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
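
# hard_update and soft_update are assumed by the SAC class above but not
# shown. They are the standard target-network copy operations; a minimal
# sketch:
def hard_update(target, source):
    # Copy the source parameters into the target exactly.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # Polyak averaging: theta_target = tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)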
Example #4
class AgentPriority():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers,
                 lr=5e-4,
                 alpha=0.5,
                 beta=0.4):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list[int, int, ...]): size of hidden layers
            lr (float): learning rate
            alpha (float (0<=alpha<=1)): parameter alpha for priority
            beta (float (0<=beta<=1)): parameter for importance sampling weight
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.lr = lr
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed,
                                       hidden_layers).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed,
                                        hidden_layers).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.alpha = alpha
        self.beta = beta
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   self.alpha, self.beta)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # discount
        self.gamma = GAMMA

        self.checkpoint = {
            "input_size":
            self.state_size,
            "output_size":
            self.action_size,
            "hidden_layers":
            [each.out_features for each in self.qnetwork_local.hidden_layers],
            "state_dict":
            self.qnetwork_local.state_dict()
        }
        self.checkpointfile = 'priority_ddqn.pth'

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        delta = self.comp_delta(state, action, reward, next_state, done)
        self.memory.add(state, action, reward, next_state, done, delta)

        # Learn NUM_LEARNS times every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) >= MIN_BUF_SIZE:
            self.memory.set_priority_params(self.alpha, self.beta)
            for i in range(NUM_LEARNS):
                if i % SORT_EVERY == 0:
                    # Sort memory based on delta every SORT_EVERY learnings
                    self.memory.argsort_deltas()

                    # Update q_target with q_local
                    self.update_qtarget()

                    # If PARAMETER_ANNEALING is set to True, anneal alpha & beta.
                    if PARAMETER_ANNEALING:
                        self.parameter_anneal()

                experiences, weights, mem_idxs = self.memory.sample()
                self.learn(experiences, weights, mem_idxs)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int32)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int32)

    def learn(self, experiences, weights, mem_idxs):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            mem_idxs (list of ints): indices in the replay buffer corresponding to
                                     the given experiences (used to update delta)
        """
        states, actions, rewards, next_states, dones = experiences

        # Get argmax of Q values (for next states) from Q_local model
        Q_local_actions = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)

        # Evaluate that actions with Q_target model
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, Q_local_actions).detach()

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update deltas in self.memory
        deltas = (Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_deltas(deltas, mem_idxs)

        # Compute importance-sampling-weighted MSE loss (weight the squared TD
        # error once, rather than squaring the weights as well)
        loss = (weights * (Q_expected - Q_targets).pow(2)).mean()
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_qtarget(self):
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)

    def comp_delta(self, state, action, reward, next_state, done):
        """Compute delta (the TD error) for a single experience:
        delta = reward + gamma * max_a Q_target(next_state, a) - Q_local(state, action)
        """
        state_ts = torch.from_numpy(np.expand_dims(state,
                                                   0)).float().to(device)
        action_ts = torch.from_numpy(np.array([[action]])).long().to(device)
        reward_ts = torch.from_numpy(np.array([[reward]])).float().to(device)
        next_state_ts = torch.from_numpy(np.expand_dims(next_state,
                                                        0)).float().to(device)
        done_ts = torch.from_numpy(np.array([[int(done)]])).float().to(device)

        Q_targets_next = self.qnetwork_target(next_state_ts).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = reward_ts + (self.gamma * Q_targets_next * (1 - done_ts))
        Q_expected = self.qnetwork_local(state_ts).gather(1, action_ts)

        delta = (Q_targets - Q_expected).detach().cpu().numpy()[0, 0]
        return delta

    def get_gamma(self):
        return self.gamma

    def save_model(self):
        torch.save(self.checkpoint, self.checkpointfile)

    def set_lr(self, lr):
        self.lr = lr

    def load_model(self, filepath):
        checkpoint = torch.load(filepath)

        self.qnetwork_local = QNetwork(checkpoint["input_size"],
                                       checkpoint["output_size"], self.seed,
                                       checkpoint["hidden_layers"])
        self.qnetwork_local.load_state_dict(checkpoint["state_dict"])

    def set_uniform_sampling(self):
        """ Set alpha to 0.0 and beta to 1.0 so that the agent
        becomes equivalent to the uniform sampling.
        """
        self.alpha = 0.0
        self.beta = 1.0
        self.memory.set_priority_params(self.alpha, self.beta)

    def parameter_anneal(self):
        self.alpha = max(0.0, self.alpha - ALPHA_ANNEALING)
        self.beta = min(1.0, self.beta + BETA_ANNEALING)
        self.memory.set_priority_params(self.alpha, self.beta)
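
# The prioritized ReplayBuffer used by AgentPriority is not shown. The core
# sampling rule it relies on -- probabilities proportional to |delta|^alpha and
# importance-sampling weights (N * P(i))^(-beta), normalized by the largest
# weight -- can be sketched as follows; this is an illustrative helper, not the
# original buffer implementation.
import numpy as np


def priority_sample(deltas, batch_size, alpha=0.5, beta=0.4, eps=1e-5):
    priorities = (np.abs(deltas) + eps) ** alpha
    probs = priorities / priorities.sum()
    idxs = np.random.choice(len(deltas), size=batch_size, p=probs)
    weights = (len(deltas) * probs[idxs]) ** (-beta)
    weights /= weights.max()  # scale so the largest weight is 1
    return idxs, weights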
Example #5
class Agent():
    """ Creates an agent that interacts with a Unity-ML Environment
        using a Deep Q-learning model (in pytorch).
    """
    def __init__(self,
                 n_state,
                 n_actions,
                 n_hidden=32,
                 n_layers=2,
                 seed=333,
                 snapshotfile="snapshot.pth"):
        """ Initialize the agent.

        Args:
            n_state     (int):  Number of features that represent the state
            n_actions   (int):  Number of actions available to agent
            n_hidden    (int):  Number of units in hidden neural net layers
            n_layers    (int):  Number of layers for neural network
            seed        (int):  Set the random seed (for reproducibility)
            snapshotfile (str): Filepath to use for saving weights
        """
        self.n_state = n_state
        self.n_actions = n_actions
        self.seed = random.seed(seed)
        self.snapshotfile = snapshotfile

        # Deep Q-Network
        self.qnetwork_local = QNetwork(n_state, n_actions, seed,
                                       n_hidden=n_hidden).to(device)
        self.qnetwork_target = QNetwork(n_state, n_actions, seed,
                                        n_hidden=n_hidden).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss_func = torch.nn.MSELoss(reduction='mean')

        # Experience Replay Memory
        self.memory = ReplayBuffer(n_actions, EXPERIENCE_MEMORY_SIZE,
                                   BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # TODO: have the is_training attribute control eval and train
        #       mode in the PyTorch network
        self.is_training = True

    def memorize_and_learn_step(self, state, action, reward, next_state, done):
        """ Given (S, A, R, S') and a done flag, saves the experience to
            memory, and occasionally samples from memorized experiences and
            learns from those memories.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Once every UPDATE_EVERY steps, randomly sample memories to learn from
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def choose_action(self, state, epsilon=0.0):
        """ Given an environment state, it returns an action using epsilon
            greedy policy.

        Args:
            state   (array_like): current state
            epsilon (float)     : probability of choosing a random action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():  # temporarily set requires_grad flag to false
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.n_actions))

    def learn(self, experiences, gamma):
        """ Update the weights of the neural network representing the Q values,
            given a batch of experience tuples.

        Args:
            experiences (tuple of torch.Variable): tuple with the following
                        torch tensors
                        (states, actions, rewards, next_states, dones)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Q_TARGET
        next_logits = self.qnetwork_target(
            next_states).detach()  # no need to calculate gradients, so detach
        q_next = torch.max(next_logits, dim=1, keepdim=True)[0]
        # where dones=1, it  will ignore q_next, and just use current reward
        q_target = rewards + ((1 - dones) * (gamma * q_next))

        # Q_CURRENT - based on action taken in experience
        current_logits = self.qnetwork_local(states)
        q_pred = torch.gather(current_logits, 1, actions)

        # LOSS
        loss = self.loss_func(q_pred, q_target)
        # loss = F.mse_loss(q_pred, q_target)

        # OPTIMIZE WEIGHTS
        self.optimizer.zero_grad()  # zero the parameter gradients
        loss.backward()
        self.optimizer.step()

        # UPDATE TARGET NETWORK
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Performs a soft update on the target Q network weights, by
            shifting them slightly towards the local Q network by a factor of
            `tau`.

            θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model  (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def snapshot(self, file=None):
        """ Takes a snapshot file of the neural netowrk weights """
        file = self.snapshotfile if file is None else file
        torch.save(self.qnetwork_local.state_dict(), file)

    def load_snapshot(self, file=None):
        """ Loads the neural network weights from a file """
        file = self.snapshotfile if file is None else file
        self.qnetwork_local.load_state_dict(torch.load(file))
        self.qnetwork_target.load_state_dict(torch.load(file))
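
# A minimal training loop for the Agent above, assuming an OpenAI-Gym-style
# environment with a flat observation vector, a discrete action space, and the
# classic 4-tuple step() API. The episode counts and epsilon schedule are
# placeholders, not values from the original example.
def train_agent(env, agent, n_episodes=500, max_t=1000,
                eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.choose_action(state, epsilon=eps)
            next_state, reward, done, _ = env.step(action)
            agent.memorize_and_learn_step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        eps = max(eps_end, eps_decay * eps)  # decay exploration
        scores.append(score)
    agent.snapshot()  # save the trained weights
    return scores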
Example #6
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs,
                                      args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs,
                                             args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            # deterministic policy: no entropy bonus, so alpha is fixed at 0
            self.alpha = 0.
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha *
                               (log_pi + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            vf = self.value(
                state_batch
            )  # separate function approximator for the soft value can stabilize training.
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    vf_next_target)
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            with torch.no_grad():
                next_state_action, _, _, _, _ = self.policy.sample(
                    next_state_batch)
                # Use a target critic network for the deterministic policy and
                # drop the value network entirely.
                qf1_next_target, qf2_next_target = self.critic_target(
                    next_state_batch, next_state_action)
                min_qf_next_target = torch.min(qf1_next_target,
                                               qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    min_qf_next_target)

        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            vf_target = min_qf_pi - (self.alpha * log_pi)
            value_loss = F.mse_loss(
                vf, vf_target.detach()
            )  # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # Regularization Loss
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()

        # policy_loss += mean_loss + std_loss

        # Optimize both Q-functions with a single backward pass
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.).to(self.device)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), qf1_loss.item(), qf2_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
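
# ValueNetwork is referenced above but not defined. A minimal sketch of a
# state-value approximator V(s) that fits the calls self.value(state_batch)
# and self.value_target(next_state_batch); the two hidden layers are an
# assumption.
import torch.nn as nn
import torch.nn.functional as F


class ValueNetwork(nn.Module):
    def __init__(self, num_inputs, hidden_dim):
        super().__init__()
        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        return self.linear3(x)  # V(s), shape [batch, 1]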
Example #7
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS,
                 a=A, initial_beta=INIT_BETA, n_multisteps=N_STEPS, clip=CLIP, initial_sigma=INIT_SIGMA, linear_type=LINEAR, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            clip (float): gradient norm clipping (`None` to disable)
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
        """
        if kwds != {}:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        if clip: assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')

        self.state_size          = state_size
        self.action_size         = action_size
        self.seed                = random.seed(seed)
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.priority_eps        = priority_eps
        self.a                   = a
        self.beta                = initial_beta
        self.n_multisteps        = n_multisteps
        self.clip                = clip
        self.initial_sigma       = initial_sigma
        self.linear_type         = linear_type.strip().lower()

        # Q-Network
        self.qnetwork_local  = QNetwork(state_size, action_size, linear_type, initial_sigma, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, linear_type, initial_sigma, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, n_multisteps, gamma, a, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            target = rewards + gamma * (1 - dones) * self.qnetwork_target(next_states)\
                                                         .gather(dim=1, index=self.qnetwork_local(next_states)\
                                                                                  .argmax(dim=1, keepdim=True))

        pred = self.qnetwork_local(states)

        diff = target.sub(pred.gather(dim=1, index=actions))
        new_priorities = diff.detach().abs().add(self.priority_eps).cpu().numpy().reshape((-1,))
        loss = diff.pow(2).mul(is_weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
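
# The linear_type='noisy' option above refers to NoisyNet-style exploration
# inside the (unshown) QNetwork. A factorized-Gaussian noisy linear layer in
# the spirit of Fortunato et al. (2017) is sketched below; the initialization
# constants follow the paper, and this is an illustrative sketch, not the
# layer used by the original code.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class NoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, sigma_init=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma_init = sigma_init
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('weight_eps', torch.empty(out_features, in_features))
        self.register_buffer('bias_eps', torch.empty(out_features))
        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        bound = 1.0 / math.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-bound, bound)
        self.bias_mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features))
        self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features))

    @staticmethod
    def _scaled_noise(size):
        x = torch.randn(size)
        return x.sign() * x.abs().sqrt()

    def reset_noise(self):
        eps_in = self._scaled_noise(self.in_features)
        eps_out = self._scaled_noise(self.out_features)
        self.weight_eps.copy_(torch.outer(eps_out, eps_in))
        self.bias_eps.copy_(eps_out)

    def forward(self, x):
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_eps
            bias = self.bias_mu + self.bias_sigma * self.bias_eps
        else:
            weight, bias = self.weight_mu, self.bias_mu
        return F.linear(x, weight, bias)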
Example #8
class SAC(object):
    def __init__(self, num_inputs, action_space, variant):

        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         self.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              self.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state, evaluate=False):

        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        #sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        # Sample a batch of actions and the corresponding log_pi
        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # Optimize both Q-functions with a single backward pass
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            # alpha_tlogs = self.alpha.clone()
        else:
            alpha_loss = torch.tensor(0.0).to(self.device)

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item()

    def save_model(self,
                   env_name,
                   suffix=".pkl",
                   actor_path=None,
                   critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)

        print("Saving models to {} and {}".format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    def load_model(self, actor_path, critic_path):
        print('loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
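
# GaussianPolicy is assumed by the SAC classes above but not shown. A common
# implementation is a tanh-squashed Gaussian whose sample() returns
# (action, log_prob, deterministic_mean_action), with the usual log-prob
# correction for the squashing. The layer sizes, clamping bounds, and epsilon
# below are assumptions for this sketch, not the original code.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

LOG_SIG_MIN, LOG_SIG_MAX, EPSILON = -20, 2, 1e-6


class GaussianPolicy(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
        super().__init__()
        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.mean_linear = nn.Linear(hidden_dim, num_actions)
        self.log_std_linear = nn.Linear(hidden_dim, num_actions)
        # rescale the tanh output to the environment's action range
        if action_space is None:
            self.register_buffer('action_scale', torch.tensor(1.0))
            self.register_buffer('action_bias', torch.tensor(0.0))
        else:
            self.register_buffer('action_scale', torch.FloatTensor(
                (action_space.high - action_space.low) / 2.0))
            self.register_buffer('action_bias', torch.FloatTensor(
                (action_space.high + action_space.low) / 2.0))

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        mean = self.mean_linear(x)
        log_std = torch.clamp(self.log_std_linear(x), LOG_SIG_MIN, LOG_SIG_MAX)
        return mean, log_std

    def sample(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # reparameterization trick
        y_t = torch.tanh(x_t)
        action = y_t * self.action_scale + self.action_bias
        log_prob = normal.log_prob(x_t)
        # correction for the tanh squashing
        log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + EPSILON)
        log_prob = log_prob.sum(1, keepdim=True)
        mean_action = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_prob, mean_action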
Example #9
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 double_dqn=False,
                 dueling_network=False,
                 prioritized_replay=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): use Double DQN method
            dueling_network (bool): use Dueling Network
            prioritized_replay (bool): use Prioritized Replay Buffer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.dueling_network = dueling_network
        self.double_dqn = double_dqn
        self.prioritized_replay = prioritized_replay

        random.seed(seed)

        # Q-Network
        self.hidden_layers = [128, 32]

        if self.dueling_network:
            self.hidden_state_value_layers = [64, 32]

            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target.eval()
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           self.hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            self.hidden_layers).to(device)
            self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, LR_DECAY)

        # Replay memory
        if prioritized_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load(self, filepath):
        # load weights from file
        state_dict = torch.load(filepath)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_local.eval()

    def save(self, filepath):
        # Save weights to file
        torch.save(self.qnetwork_local.state_dict(), filepath)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() >= eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            return np.argmax(action_values.cpu().data.numpy()).astype(int)

        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        with torch.no_grad():
            # Use of Double DQN method
            if self.double_dqn:
                # Select the greedy actions (maximum Q target for next states) from local model
                greedy_actions = self.qnetwork_local(next_states).max(
                    dim=1, keepdim=True)[1]

                # Get the Q targets (for next states) for the greedy actions from target model
                q_targets_next = self.qnetwork_target(next_states).gather(
                    1, greedy_actions)

            # Use of Fixed Q-Target
            else:
                # Get max predicted Q values (for next states) from target model
                q_targets_next = self.qnetwork_target(next_states).max(
                    dim=1, keepdim=True)[0]

        # Compute Q targets for current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(
            1, actions)  # shape: [batch_size, 1]

        # Compute loss
        if self.prioritized_replay:
            # Importance-sampling-weighted squared TD error; the absolute TD
            # error is also fed back to the buffer as the new priorities.
            td_error = q_targets - q_expected
            with torch.no_grad():
                self.memory.update_priorities(td_error.abs().squeeze())
            loss = (w * td_error.pow(2).squeeze()).mean()
        else:
            loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
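
# DuelingQNetwork is referenced above but not shown. A minimal dueling
# architecture (separate advantage and state-value streams combined with a
# mean-subtracted advantage) that matches the constructor call above could
# look like this; the exact layer wiring is an assumption.
import torch
import torch.nn as nn
import torch.nn.functional as F


class DuelingQNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed, hidden_layers,
                 hidden_state_value_layers):
        super().__init__()
        torch.manual_seed(seed)
        # advantage stream A(s, a)
        self.adv_fc1 = nn.Linear(state_size, hidden_layers[0])
        self.adv_fc2 = nn.Linear(hidden_layers[0], hidden_layers[1])
        self.adv_out = nn.Linear(hidden_layers[1], action_size)
        # state-value stream V(s)
        self.val_fc1 = nn.Linear(state_size, hidden_state_value_layers[0])
        self.val_fc2 = nn.Linear(hidden_state_value_layers[0],
                                 hidden_state_value_layers[1])
        self.val_out = nn.Linear(hidden_state_value_layers[1], 1)

    def forward(self, state):
        adv = F.relu(self.adv_fc1(state))
        adv = F.relu(self.adv_fc2(adv))
        adv = self.adv_out(adv)
        val = F.relu(self.val_fc1(state))
        val = F.relu(self.val_fc2(val))
        val = self.val_out(val)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return val + adv - adv.mean(dim=1, keepdim=True)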
Example #10
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=42,
                 hidden_layers=[32, 8]):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE, self.device, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_step, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_step, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % UPDATE_EVERY == 0:
            if self.memory.length > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, next_states, rewards, dones = experiences

        self.qnetwork_target.eval()
        with torch.no_grad():
            # select the greedy next action with the local (online) network
            Q_local_next = self.qnetwork_local(next_states)
            action_argmax = torch.max(Q_local_next, dim=1, keepdim=True)[1]

            # evaluate that action with the target network (Double-DQN-style target);
            # gather picks, for each row, the Q value of the selected action
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, action_argmax)

            # q-target for the current state; (1 - dones) drops the bootstrap term on terminal steps
            targets = rewards + gamma * Q_targets_next * (1 - dones)
        self.qnetwork_target.train()

        expected = self.qnetwork_local(states).gather(1, actions)
        loss = torch.sum((expected - targets)**2)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def train(self,
              env,
              brain_name,
              n_episodes=2000,
              timesteps=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        '''
        Train the Q-network using experience replay.
        Params
        ======
            env (UnityEnvironment): environment the agent interacts with
            brain_name (str): name of the Unity brain to control
            n_episodes (int): number of episodes to play
            timesteps (int): max number of steps per episode
            eps_start (float): initial proportion of random actions for epsilon-greedy selection
            eps_end (float): final proportion of random actions for epsilon-greedy selection
            eps_decay (float): epsilon decay rate
        '''
        scores = []
        last_scores = deque(maxlen=100)
        eps = eps_start
        for i_episode in range(n_episodes):
            env_status = env.reset(train_mode=True)[brain_name]
            state = env_status.vector_observations[0]  #get state
            score = 0
            for _ in range(timesteps):
                action = self.act(state, eps).astype(int)
                env_status = env.step(action)[brain_name]
                next_state = env_status.vector_observations[0]
                reward = env_status.rewards[0]
                done = env_status.local_done[0]
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores.append(score)
            last_scores.append(score)
            eps = max(eps_end, eps * eps_decay)  #decreases epsilon
            print('\rEpisode {}\tScores mean: {:.2f}'.format(
                i_episode, np.mean(last_scores)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tLast 100 scores mean: {:.2f}'.format(
                    i_episode, np.mean(last_scores)))
            if np.mean(last_scores) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tScores mean: {:.2f}'
                    .format(i_episode - 100, np.mean(last_scores)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores
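# Hedged, self-contained illustration of a Double-DQN-style next-state target as used in
# learn() above: the greedy action is selected with the online network and evaluated with
# the target network, in contrast with the vanilla max-over-target value. Tensor shapes
# and values below are assumptions chosen only for illustration.
import torch

batch, n_actions = 4, 3
q_local_next = torch.randn(batch, n_actions)    # Q_local(s', a) for every action a
q_target_next = torch.randn(batch, n_actions)   # Q_target(s', a) for every action a

# vanilla DQN: both selection and evaluation use the target network
vanilla_next = q_target_next.max(dim=1, keepdim=True)[0]

# Double-DQN-style: select with the online network, evaluate with the target network
a_star = q_local_next.argmax(dim=1, keepdim=True)
double_next = q_target_next.gather(1, a_star)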
示例#11
0
class SAC(object):
    """
    SAC class from Haarnoja et al. (2018)
    We leave the option to use automatice_entropy_tuning to avoid selecting entropy rate alpha
    """
    def __init__(self, num_inputs, action_space, args):
        #self.n_flow = args.n_flows
        #assert self.n_flow == 0
        self.num_inputs = num_inputs
        #self.flow_family = args.flow_family
        self.num_layers = args.num_layers
        self.args = args

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, self.num_layers,
                                     args).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        """
        Select action for a state
        (Train) Sample an action from NF{N(mu(s),Sigma(s))}
        (Eval) Pass mu(s) through NF{}
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            action, _, _, _, _ = self.policy.evaluate(state, eval=True)

        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, memory, batch_size, updates):
        """
        Update parameters of SAC-NF
        Exactly like SAC, but keep two separate Adam optimizers for the Gaussian policy AND the NF layers
        .backward() on them sequentially
        """
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # for visualization
        info = {}
        ''' update critic '''
        with torch.no_grad():
            next_state_action, next_state_log_pi, _, _, _ = self.policy.evaluate(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]

        pi, log_pi, _, _, _ = self.policy.evaluate(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # nf_loss mirrors policy_loss but is never used below; the separate NF update
        # described in the docstring is not included in this snippet
        nf_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # update
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()  #retain_graph=True)
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        # update target value functions
        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item(), info

    def save_model(self, info):
        """
        Save the weights of the network (actor and critic separately)
        """
        # policy
        save_checkpoint(
            {
                **info,
                'state_dict': self.policy.state_dict(),
                'optimizer': self.policy_optim.state_dict(),
            },
            self.args,
            filename='policy-ckpt.pth.tar')

        # critic
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic.state_dict(),
                'optimizer': self.critic_optim.state_dict(),
            },
            self.args,
            filename='critic-ckpt.pth.tar')
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic_target.state_dict(),
                #'optimizer' : self.critic_optim.state_dict(),
            },
            self.args,
            filename='critic_target-ckpt.pth.tar')

    def load_model(self, args):
        """
        Jointly or separately load actor and critic weights
        """
        # policy
        load_checkpoint(
            model=self.policy,
            optimizer=self.policy_optim,
            opt=args,
            device=self.device,
            filename='policy-ckpt.pth.tar',
        )

        # critic
        load_checkpoint(
            model=self.critic,
            optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic-ckpt.pth.tar',
        )
        load_checkpoint(
            model=self.critic_target,
            #optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic_target-ckpt.pth.tar',
        )
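# Hedged sketch: hard_update() and soft_update() are called by 示例#11 and 示例#12 but are
# not defined in these snippets. The definitions below only mirror how they are called
# (target first, source second) and are assumptions, not the original authors' code.
def hard_update(target, source):
    """Copy all parameters from source into target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak averaging: θ_target ← τ*θ_source + (1 - τ)*θ_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)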
示例#12
0
class SAC(object):
    def __init__(self):

        self.gamma = 0.99
        self.tau = 0.005
        self.alpha = 0.2
        self.lr = 0.003

        self.target_update_interval = 1
        self.device = torch.device("cpu")

        # 8 phases
        self.num_inputs = 8
        self.num_actions = 1
        self.hidden_size = 256

        self.critic = QNetwork(self.num_inputs, self.num_actions,
                               self.hidden_size).to(self.device)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(self.num_inputs, self.num_actions,
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
        # Copy the parameters of critic to critic_target

        self.target_entropy = -torch.Tensor([1.0]).to(self.device).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)

        self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(self.num_inputs, self.num_actions,
                                     self.hidden_size).to(self.device)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state):
        state = torch.FloatTensor(state).to(self.device)  # TODO
        _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]
        # if action lives on the GPU, .detach().cpu().numpy() is needed to get a NumPy array

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)
        action_batch = np.expand_dims(action_batch, axis=1)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
        # Unsqueeze: add one dimension to the index

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf_loss = qf1_loss + qf2_loss

        self.critic_optimizer.zero_grad()
        # Clear the cumulative grad
        qf_loss.backward()
        # Get grad via backward()
        self.critic_optimizer.step()
        # Update the para via grad

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # automatic_entropy_tuning:
        alpha_loss = -(self.log_alpha *
                       (log_pi + self.target_entropy).detach()).mean()  # TODO

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        # Create a models/ directory in the current location if it does not exist
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        # state_dict() holds the module's learnable parameters (and buffers); optimizer state is saved separately
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

    def get_alpha(self):
        return self.alpha
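# Hedged side note: the hard-coded target entropy above follows the usual
# "minus the action dimensionality" heuristic (also used in 示例#11), which for this
# single continuous action reduces to -1.0. The snippet below is illustration only.
import numpy as np

action_shape = (1,)                                # the single continuous action above
target_entropy = -float(np.prod(action_shape))     # == -1.0, matching the hard-coded value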
示例#13
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 double_dqn=False,
                 dueling=False,
                 per=False,
                 per_args=(0.2, 0.01, 2e-5)):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): whether to implement Double DQN (default=False)
            dueling (bool): whether to implement Dueling DQN
            per (bool): whether to implement Prioritized Experience Replay
            per_args (tuple): (alpha, beta, beta_increment) for PER
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.per = per
        self.gamma = GAMMA

        # output name for checkpoint
        self.output_name = ''
        self.output_name += '_double' if double_dqn else ''
        self.output_name += '_dueling' if dueling else ''
        self.output_name += '_per' if per else ''

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       dueling=dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        dueling=dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, *per_args)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def train(self,
              env,
              n_episodes=1000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Deep Q-Learning.

        Params
        ======
            env (UnityEnvironment): Bananas environment
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        # list containing scores from each episode
        scores = []
        # list containing window averaged scores
        avg_scores = []
        # last 100 scores
        scores_window = deque(maxlen=100)
        # initialize epsilon
        eps = eps_start
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                # get the next state
                next_state = env_info.vector_observations[0]
                # get the reward
                reward = env_info.rewards[0]
                # see if episode has finished
                done = env_info.local_done[0]
                self.step((state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break
            # save most recent score
            scores_window.append(score)
            scores.append(score)
            avg_scores.append(np.mean(scores_window))
            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(),
                           f'./checkpoints/checkpoint{self.output_name}.pth')
                break
        return scores, avg_scores

    def step(self, experience):
        """Save experience in replay memory and learn.
        
        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using given batch of experience tuples.
        """
        # if using PER
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = self.memory.sample(
            )

        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # if Double DQN
        if self.double_dqn:
            # Get predicted Q values (for next actions chosen by local model) from target model
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(
                    1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, next_actions)

        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if self.per:
            # weight the per-sample squared TD errors by the importance-sampling weights
            is_weights = torch.FloatTensor(is_weights).reshape(-1, 1).to(Q_expected.device)
            loss = (is_weights *
                    F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if PER, update priority
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
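# Hedged, self-contained sketch of the PER-weighted loss used in learn() above: per-sample
# squared TD errors are weighted by importance-sampling weights before averaging. Shapes
# and weight values are assumptions for illustration only.
import torch
import torch.nn.functional as F

q_expected = torch.randn(4, 1)
q_targets = torch.randn(4, 1)
is_weights = torch.tensor([[0.8], [1.0], [0.5], [0.9]])

elementwise = F.mse_loss(q_expected, q_targets, reduction='none')  # per-sample squared errors
weighted_loss = (is_weights * elementwise).mean()                  # scalar used for backward()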
示例#14
0
class Agent():
    def __init__(self, state_size, action_size, seed):
        """
        Initialize an agent object
        
        Params
        ======
          state_size(int) => dimensions of each state
          action_size (int) => dimension of each action
          seed (int) => random seed
        """
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0
        
    
    def act(self, state, eps=0.):
        """Returns action for a given state as per current policy.
        
        Params
        ======
          state (array) => current state
          eps (float) => epsilon for epsilon greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
                
    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_model(self, path):
        """Save current model.
        
        Params
        ======
          path (string) => file path where model will be saved.
        """
        torch.save(self.qnetwork_local.state_dict(), path)
    
    def restore_model(self, path):
        """Restore model from a file.
        
        Params
        ======
          path (string) => file path from where to load the model.
        """
        self.qnetwork_local.load_state_dict(torch.load(path))
        self.qnetwork_target.load_state_dict(torch.load(path))
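# Hedged usage sketch: how the Agent above is typically driven. It assumes the
# QNetwork / ReplayBuffer / device / hyper-parameter constants used by the class are
# defined as in the rest of this example; DummyEnv, the sizes and the file name are
# made up purely for illustration.
import numpy as np

class DummyEnv:
    """Tiny stand-in environment so the snippet stays self-contained."""
    def reset(self):
        return np.zeros(8, dtype=np.float32)

    def step(self, action):
        next_state = np.random.randn(8).astype(np.float32)
        reward = float(np.random.rand())
        done = bool(np.random.rand() < 0.05)
        return next_state, reward, done

env = DummyEnv()
agent = Agent(state_size=8, action_size=4, seed=0)
state, eps = env.reset(), 1.0
for _ in range(200):
    action = agent.act(state, eps)
    next_state, reward, done = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = env.reset() if done else next_state
    eps = max(0.01, 0.995 * eps)
agent.save_model('checkpoint.pth')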
示例#15
0
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, double_dqn=True):
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        self.t_step = 0

    def save(self, path, *data):
        torch.save(self.qnetwork_local.state_dict(),
                   path / "model_checkpoint.local")
        torch.save(self.qnetwork_target.state_dict(),
                   path / "model_checkpoint.target")
        torch.save(self.optimizer.state_dict(),
                   path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(
                torch.load(path / 'model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(
                torch.load(path / 'model_checkpoint.target'))
            self.optimizer.load_state_dict(
                torch.load(path / 'model_checkpoint.optimizer'))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except FileNotFoundError:
            print("No checkpoint file was found")
            return defaults

    def step(self, state, action, reward, next_state, done, train=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if train and len(self.memory) > BATCH_SIZE and self.t_step == 0:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # select the best next action with the local network, evaluate it with the target network
            Q_best_action = self.qnetwork_local(next_states).detach().max(1)[1]
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, Q_best_action.unsqueeze(-1))
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
示例#16
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """

        checkpoint = {
            'input_size':
            self.state_size,
            'output_size':
            self.action_size,
            'hidden_layers':
            [each.out_features for each in self.qnetwork_local.hidden_layers],
            'state_dict':
            self.qnetwork_local.state_dict()
        }

        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's Q-Network.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from. 
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_hidden_layers = [
            each.out_features for each in self.qnetwork_local.hidden_layers
        ]
        if not checkpoint['hidden_layers'] == my_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: hidden layers {checkpoint['hidden_layers']} don't match agent's hidden layers {my_hidden_layers}"
            )
            return None
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
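# Hedged usage sketch: round-tripping the checkpoint format produced by save() above.
# The sizes and the file name are illustrative assumptions.
trained = Agent(state_size=37, action_size=4, seed=0)
# ... training happens here ...
trained.save('dqn_checkpoint.pth')

fresh = Agent(state_size=37, action_size=4, seed=0)
fresh.load_weights('dqn_checkpoint.pth')  # verifies input/output/hidden sizes before loading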
示例#17
0
def train_dqn(options):
    max_episode = options.max_episode

    flappyBird = game.GameState()
    print(f'FPS {flappyBird.FPS}')

    rpm = ReplayMemory(options.rpm_size, options)  # experience replay buffer for DQN

    model = QNetwork()
    if options.resume and options.ckpt_path is not None:
        print('load previous model weights: {}'.format(options.ckpt_path))
        episode, epsilon = load_checkpoint(options.ckpt_path, model)
    else:
        epsilon = options.init_e
        episode = 0

    if options.cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=options.lr)
    algorithm = DQN(model, optimizer, epsilon, options)

    # pre-fill the replay buffer so that early training has enough sample diversity
    while len(rpm) < options.rpm_size/4:
        run_episode(algorithm, flappyBird, rpm, options)

    print(f'observation done {len(rpm)}')

    # start training
    logname = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    logger = get_logger(f'log/{logname}.log')
    best_reward = 0
    max_score = 0
    begin = time.time()
    while episode < max_episode:  # train for max_episode episodes; evaluation runs do not count toward the total
        # train part

        reward, loss, score = run_episode(algorithm, flappyBird, rpm, options)
        algorithm.epsilon = max(algorithm.final_e, algorithm.epsilon - algorithm.e_decrement)
        episode += 1
        max_score = max(max_score, score)

        if (episode)%10 == 0:
            logger.info('episode:[{}/{}]\tscore:{:.3f}\ttrain_reward:{:.5f}\tloss:{:.5f}'.format(
                episode, max_episode, score, reward, loss))
        
        # test part
        if (episode)%options.evaluate_freq == 0:
            eval_reward, score = evaluate(flappyBird, algorithm, options)
            mid = time.time()
            elapsed = round(mid-begin)
            logger.info('episode:[{}/{}]\tscore:{:.3f}\tepsilon:{:.5f}\ttest_reward:{:.5f}\t{}:{}'.format(
                episode, max_episode, score, algorithm.epsilon, eval_reward, elapsed//60, elapsed%60))
            if eval_reward > best_reward:
                best_reward = eval_reward
                save_path = f'ckpt/best_{score}.ckpt'
                save_checkpoint({
                    'episode': episode,
                    'epsilon': algorithm.epsilon,
                    'state_dict': model.state_dict(),
                    }, False, save_path
                )

        if (episode)%1000 == 0:
            save_path = f'ckpt/episode_{episode}.ckpt'
            save_checkpoint({
                'episode': episode,
                'epsilon': algorithm.epsilon,
                'state_dict': model.state_dict(),
                }, False, save_path
            )

    # training finished, save the final model
    save_path = f'ckpt/final_{episode}_{score}.ckpt'
    save_checkpoint({
        'episode': episode,
        'epsilon': algorithm.epsilon,
        'state_dict': model.state_dict(),
        }, False, save_path)

    mid = time.time()
    elapsed = round(mid-begin)
    logger.info('training completed, {} episodes, {}m {}s'.format(max_episode, elapsed//60, elapsed%60))
    print(f'max_score {max_score}')
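# Hedged sketch: save_checkpoint() and load_checkpoint() are called above but not defined
# in this snippet. The definitions below only mirror how they are called (a state dict plus
# metadata, an is_best flag that is ignored here, and a path) and are assumptions rather
# than the original helpers.
import torch

def save_checkpoint(state, is_best, path):
    """Persist episode/epsilon/model weights to `path` (is_best is ignored in this sketch)."""
    torch.save(state, path)

def load_checkpoint(path, model):
    """Restore model weights and return the (episode, epsilon) stored alongside them."""
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    return ckpt['episode'], ckpt['epsilon']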
示例#18
0
class DQNAgent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory=None,
                 device='cpu',
                 weights_filename=None,
                 params=None,
                 train_mode=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string between cuda:0 and cpu
            weights_filename (str): file name having weights of local Q network to load
            params (dict): hyper-parameters
            train_mode (bool): True if it is train mode, otherwise False
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.lr = params['lr']
        self.update_every = params['update_every']
        self.seed = random.seed(params['seed'])

        # Q-Network
        if train_mode:
            drop_p = params['drop_p']
        else:
            drop_p = 0

        self.qnetwork_local = QNetwork(state_size, action_size, params['seed'],
                                       params['hidden_layers'],
                                       drop_p).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        params['seed'],
                                        params['hidden_layers'],
                                        drop_p).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = memory

        # Load weight file
        if weights_filename:
            self.qnetwork_local.load_state_dict(torch.load(weights_filename))

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def store_weights(self, filename):
        """Store weights of Q local network

        Params
        ======
            filename (str): string of filename to store weights of Q local network
        """
        torch.save(self.qnetwork_local.state_dict(), filename)

    def step(self, state, action, reward, next_state, done):
        """This defines an agent to do whenever moving.

        Params
        ======
            state (array_like): current state
            action (int): current action
            reward (float): reward on next state
            next_state (array_like): next state
            done (bool): flag to indicate whether this episode is done
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.memory.get_batch_size():
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        # q_targets_next = max_a' Q(next state, a'; theta dash)
        # qnetwork_target(next_states): Q values indexed by [state in batch][action]
        #   .detach(): detach from the current graph (no gradient flows through the target network)
        #   .max(1): returns a (max Q values, argmax actions) pair along the action dimension
        #   .max(1)[0]: keep only the max Q value for each next state
        #   .unsqueeze(1): from 1d-array to 2d-matrix ([a, b, c] -> [[a], [b], [c]])
        q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        # If done, q_targets = rewards.
        # Otherwise, q_targets = rewards + gamma * q_targets_next
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_locals = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = f.mse_loss(q_locals, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()  # Clear gradients
        loss.backward()  # Calculate gradients
        self.optimizer.step()  # Update the parameters using the gradients

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
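# Hedged worked example of the target construction commented in learn() above,
# q_targets = r + gamma * max_a' Q_target(s', a') * (1 - done), on tiny hand-written
# tensors; the numbers are illustrative only.
import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [0.0]])
dones = torch.tensor([[0.0], [1.0]])                 # the second transition ends the episode
q_next = torch.tensor([[0.5, 2.0], [3.0, 1.0]])      # Q_target(s', a) for two actions

q_targets_next = q_next.max(1)[0].unsqueeze(1)       # -> [[2.0], [3.0]]
q_targets = rewards + gamma * q_targets_next * (1 - dones)
# -> [[1.0 + 0.99*2.0], [0.0]] = [[2.98], [0.0]]; the terminal transition keeps only its reward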
示例#19
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def dqn(self,
            env,
            brain_name,
            n_episodes=2000,
            max_t=1000,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995):
        """Deep Q-Learning.
    
        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []  # list containing scores from each episode
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = eps_start  # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(
                train_mode=True)[brain_name]  # reset the environment in train mode
            state = env_info.vector_observations[0]  # get the current state
            score = 0  # reset the score
            for t in range(max_t):
                action = self.act(state, eps).astype(
                    int)  # choose action based on epsilon-greedy policy
                env_info = env.step(action)[
                    brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                self.step(state, action, reward, next_state,
                          done)  # make the agent take a step
                state = next_state  # update the state
                score += reward  # add the reward to the score
                if done:  # exit loop if episode finished
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores

    def test(self, env, brain_name):
        self.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
        # load environment variables
        # action_size, state_size = info.getInfo()
        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        score = 0  # initialize the score
        while True:
            action = self.act(state).astype(int)  # select an action
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            score += reward  # update the score
            state = next_state  # roll over the state to next time step
            if done:  # exit loop if episode finished
                break

        return score
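
A brief usage sketch for the training and evaluation methods above. It assumes the Udacity Banana-collector setup this snippet appears to target; UnityEnvironment, the 'Banana.app' path, the 37/4 state/action sizes, and the Agent constructor signature are all placeholders inferred from context, not taken from the original project.

from unityagents import UnityEnvironment
import matplotlib.pyplot as plt

env = UnityEnvironment(file_name='Banana.app')   # placeholder path to the compiled environment
brain_name = env.brain_names[0]

agent = Agent(state_size=37, action_size=4)      # the class these methods belong to (assumed signature)
scores = agent.dqn(env, brain_name, n_episodes=2000)

plt.plot(scores)                                 # per-episode learning curve
plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()

print('Evaluation score: {:.2f}'.format(agent.test(env, brain_name)))
env.close()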
Example #20
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, layer_spec, seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.layer_spec = layer_spec
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       layer_spec).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        layer_spec).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # (Prioritized) experience replay setup
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.min_prio = MIN_PRIO
        self.alpha = ALPHA
        self.beta = INIT_BETA
        self.beta_increment = BETA_INC
        if USE_PER:
            self.memory = PrioritizedReplayBuffer(size=self.buffer_size,
                                                  alpha=self.alpha)
        else:
            self.memory = DequeReplayBuffer(action_size=self.action_size,
                                            buffer_size=self.buffer_size,
                                            batch_size=self.batch_size,
                                            seed=42)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # print info about Agent
        print('Units in the hidden layers are {}.'.format(str(layer_spec)))
        print('Using Double-DQN is \"{}\".'.format(str(USE_DDQN)))
        print('Using prioritized experience replay is \"{}\".'.format(
            str(USE_PER)))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.beta = min(1., self.beta + self.beta_increment)
                experiences = self.memory.sample(self.batch_size,
                                                 beta=self.beta)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        # Get TD step from experiences
        states, actions, rewards, next_states, dones, weights, idxes = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # DOUBLE DQN: Select action based on _local, evaluate action based on _target
        if USE_DDQN:
            Q_action_select = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, Q_action_select)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute (PER-weighted) MSE loss
        if USE_PER:
            TD_error = Q_targets - Q_expected
            weighted_TD_error = weights * (TD_error**2)
            loss = torch.mean(weighted_TD_error)
            # Update priorities in Replay Buffer
            prio_updates = np.abs(
                TD_error.detach().squeeze(1).cpu().numpy()) + self.min_prio
            self.memory.update_priorities(idxes, prio_updates.tolist())
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft-update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_checkpoint(self):
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'layer_spec': self.layer_spec,
            'state_dict': self.qnetwork_local.state_dict()
        }
        torch.save(checkpoint, 'checkpoint.pth')
        print('Checkpoint successfully saved.')

    def load_checkpoint(self, filepath='checkpoint.pth'):
        checkpoint = torch.load(filepath)
        self.qnetwork_local = QNetwork(checkpoint['input_size'],
                                       checkpoint['output_size'],
                                       checkpoint['layer_spec']).to(device)
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        print('Checkpoint successfully loaded.')

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
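
The agent above relies on a PrioritizedReplayBuffer with add / sample(batch_size, beta) / update_priorities / __len__ that is not shown. Below is a minimal, self-contained sketch of that interface; it is a plain proportional-priority list (not the sum-tree a real project would use), and all names, tensor shapes, and return values are inferred from how the agent calls the buffer rather than taken from the original code.

import numpy as np
import torch


class SimplePrioritizedReplayBuffer:
    """Proportional-priority replay kept deliberately small for illustration."""

    def __init__(self, size, alpha):
        self.size = size
        self.alpha = alpha
        self.data = []
        self.priorities = []
        self.pos = 0

    def add(self, state, action, reward, next_state, done):
        max_prio = max(self.priorities, default=1.0)   # new samples get the current max priority
        if len(self.data) < self.size:
            self.data.append((state, action, reward, next_state, done))
            self.priorities.append(max_prio)
        else:                                          # overwrite the oldest slot once full
            self.data[self.pos] = (state, action, reward, next_state, done)
            self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.size

    def sample(self, batch_size, beta):
        prios = np.asarray(self.priorities) ** self.alpha
        probs = prios / prios.sum()
        idxes = np.random.choice(len(self.data), batch_size, p=probs)
        weights = (len(self.data) * probs[idxes]) ** (-beta)   # importance-sampling weights
        weights = weights / weights.max()                      # normalize for stability
        s, a, r, s2, d = zip(*[self.data[i] for i in idxes])
        as_t = lambda x, dt: torch.tensor(np.asarray(x), dtype=dt)
        return (as_t(s, torch.float32), as_t(a, torch.int64).reshape(-1, 1),
                as_t(r, torch.float32).reshape(-1, 1), as_t(s2, torch.float32),
                as_t(d, torch.float32).reshape(-1, 1),
                as_t(weights, torch.float32).reshape(-1, 1), idxes.tolist())

    def update_priorities(self, idxes, new_prios):
        for i, p in zip(idxes, new_prios):
            self.priorities[i] = float(p)

    def __len__(self):
        return len(self.data)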
Example #21
def main(args):
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)
    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01)

    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):

        s = torch.Tensor(s[None,:])
        _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
            scheduler.update(t)
        return best_action

    def train(batch):
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        value = (q(s) * a).sum(dim=-1)
        next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()

    q_target.load_state_dict(q.state_dict())

    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1

        state = next_state.copy()
        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        if t % args.update_every == 0:
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    env = Monitor(env, directory=path)

    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break
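
The main() function above relies on several helpers (Transition, Memory, LinearSchedule, plus QNetwork and one_hot) that are not shown. Below is a minimal, self-contained sketch of the first three, with every signature inferred from how main() calls them rather than from the original project; note that value is modeled as a plain attribute updated by update(t), since get_action() reads scheduler.value without calling it (unlike, e.g., the baselines LinearSchedule, where value is a method).

import random
from collections import deque, namedtuple

# Field order matches the memory.push(state, action, next_state, reward, done) call in main()
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))


class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p
        self.value = initial_p            # current exploration rate, read directly by get_action()

    def update(self, t):
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        self.value = self.initial_p + frac * (self.final_p - self.initial_p)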
Example #22
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS,
                 a=A, initial_beta=INIT_BETA, n_multisteps=N_STEPS,
                 v_min=V_MIN, v_max=V_MAX, clip=CLIP, n_atoms=N_ATOMS,
                 initial_sigma=INIT_SIGMA, linear_type=LINEAR, factorized=FACTORIZED, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            v_min (float): minimum reward support value
            v_max (float): maximum reward support value
            clip (float): gradient norm clipping (`None` to disable)
            n_atoms (int): number of atoms in the discrete support distribution
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
            factorized (bool): whether to use factorized gaussian noise in noisy layers
        """
        if kwds != {}:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        assert isinstance(v_min, (int, float)) and isinstance(v_max, (int, float)) and v_min < v_max
        if clip: assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(n_atoms, int) and n_atoms > 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')
        assert isinstance(factorized, bool)

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_size          = state_size
        self.action_size         = action_size
        self.seed                = seed
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.priority_eps        = priority_eps
        self.a                   = a
        self.beta                = initial_beta
        self.n_multisteps        = n_multisteps
        self.v_min               = v_min
        self.v_max               = v_max
        self.clip                = clip
        self.n_atoms             = n_atoms
        self.initial_sigma       = initial_sigma
        self.linear_type         = linear_type.strip().lower()
        self.factorized          = factorized

        # Distribution
        self.supports = torch.linspace(v_min, v_max, n_atoms, device=device)
        self.delta_z  = (v_max - v_min) / (n_atoms - 1)

        # Q-Network
        self.qnetwork_local  = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, n_multisteps, gamma, a)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            z_probs       = F.softmax(self.qnetwork_local(state), dim=-1)
            action_values = self.supports.mul(z_probs).sum(dim=-1, keepdim=False)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            rows         = tuple(range(next_states.size(0)))
            a_argmax     = F.softmax(self.qnetwork_local(next_states), dim=2)\
                               .mul(self.supports)\
                               .sum(dim=2, keepdim=False)\
                               .argmax(dim=1, keepdim=False)
            p            = F.softmax(self.qnetwork_target(next_states)[rows, a_argmax], dim=1)
            tz_projected = torch.clamp(rewards + (1 - dones) * gamma * self.supports, min=self.v_min, max=self.v_max)
            # """
            b            = (tz_projected - self.v_min) / self.delta_z
            u            = b.ceil()
            l            = b.floor()
            u_updates    = b - l + u.eq(l).type(u.dtype) # fixes the problem when having b == u == l
            l_updates    = u - b
            indices_flat = torch.cat((u.long(), l.long()), dim=1)
            indices_flat = indices_flat.add(
                               torch.arange(start=0,
                                            end=b.size(0) * b.size(1),
                                            step=b.size(1),
                                            dtype=indices_flat.dtype,
                                            layout=indices_flat.layout,
                                            device=indices_flat.device).unsqueeze(1)
                           ).view(-1)
            updates_flat = torch.cat((u_updates.mul(p), l_updates.mul(p)), dim=1).view(-1)
            target_distributions = torch.zeros_like(p)
            target_distributions.view(-1).index_add_(0, indices_flat, updates_flat)
            """
            b = ((tz_projected - V_MIN) / self.delta_z).t() # transpose for later for-loop convenience
            u = b.ceil()
            l = b.floor()
            u_updates = b - l + u.eq(l).type(u.dtype)
            l_updates = u - b
            target_distributions = torch.zeros_like(p)
            for u_indices, l_indices, u_update, l_update, prob in zip(u.long(), l.long(), u_updates, l_updates, p.t()):
                target_distributions[rows, u_indices] += u_update * prob
                target_distributions[rows, l_indices] += l_update * prob
            """

        pred_distributions = self.qnetwork_local(states)
        pred_distributions = pred_distributions.gather(dim=1, index=actions.unsqueeze(1).expand(-1, -1, pred_distributions.size(2))).squeeze(1)

        """
        cross_entropy = target_distributions.mul(pred_distributions.exp().sum(dim=-1, keepdim=True).log() - pred_distributions).sum(dim=-1, keepdim=False)
        new_priorities = cross_entropy.detach().add(self.priority_eps).cpu().numpy()
        loss = cross_entropy.mul(is_weights.view(-1)).mean()
        """
        kl_divergence = F.kl_div(F.log_softmax(pred_distributions, dim=-1), target_distributions, reduction='none').sum(dim=-1, keepdim=False)  # 'none' keeps per-element values (reduce=False is deprecated)
        new_priorities = kl_divergence.detach().add(self.priority_eps).cpu().numpy()
        loss = kl_divergence.mul(is_weights.view(-1)).mean()
#         """

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
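
For reference, the distributional target constructed inside learn() above is the categorical (C51) projection. Restated compactly in the notation of the code, where z_j are the support atoms, Δz the atom spacing, and the discount is the n-step target_discount returned by the replay buffer (typically γⁿ for n_multisteps):

    T̂z_j = clamp(r + γⁿ·(1 − done)·z_j, v_min, v_max)
    b_j = (T̂z_j − v_min) / Δz,    l = ⌊b_j⌋,    u = ⌈b_j⌉
    m_l += p_j(s', a*)·(u − b_j),    m_u += p_j(s', a*)·(b_j − l)

Here a* is chosen by the online network (double-DQN style action selection) and m is the target distribution accumulated via index_add_. The loss is the KL divergence between m and the online distribution for the taken action, and each sample's KL value is reused as its new priority.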
Example #23
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
            else:
                pass

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.sample(state)
            if self.policy_type == "Gaussian":
                action = torch.tanh(action)
            else:
                pass
        #action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1)
        mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1)
        """
        Use two Q-functions to mitigate positive bias in the policy improvement step that is known
        to degrade performance of value-based methods. Two Q-functions also significantly speed
        up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(
            state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.sample(
            state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                """
                Alpha Loss
                """
                alpha_loss = -(
                    self.log_alpha *
                    (log_prob + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.)
                alpha_logs = self.alpha  # For TensorboardX logs
            """
            Including a separate function approximator for the soft value can stabilize training.
            """
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_value).detach()
        else:
            """
            There is no need in principle to include a separate function approximator for the state value.
            We use a target critic network for the deterministic policy and remove the value network entirely.
            """
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _, = self.policy.sample(
                next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(
                next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_critic).detach()
        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize training and is convenient to 
            train simultaneously with the other networks
            Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            pass
        """
        Reparameterization trick is used to get a low variance estimator
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()

        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
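
The SAC examples above and below call hard_update / soft_update without defining them. Below is a minimal sketch of the conventional helpers, assuming the (target, source[, tau]) argument order used at the call sites; the original projects may implement them differently.

def hard_update(target, source):
    # copy the source parameters into the target network verbatim
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # Polyak averaging: θ_target ← τ*θ_source + (1 − τ)*θ_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)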
Example #24
File: dqn.py  Project: xuezzee/-
class Agent:
    def __init__(self, state_size, action_size, num_agents, double_dqn=False):
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=4000, gamma=0.98, last_epoch=-1)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0

    def reset(self):
        self.finished = [False] * self.num_agents


    # Decide on an action to take in the environment

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return torch.argmax(action_values).item()
        else:
            return torch.randint(self.action_size, ()).item()


    # Record the results of the agent's action and update the model

    def step(self, handle, state, action, reward, next_state, agent_done):
        if not self.finished[handle]:
            # Save experience in replay memory
            self.memory.push(state, action, reward, next_state, agent_done)
            self.finished[handle] = agent_done

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 1: # 320
            self.learn(*self.memory.sample(BATCH_SIZE, device))


    def learn(self, states, actions, rewards, next_states, dones):
        self.qnetwork_local.train()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN: choose the action with the online network, evaluate it with the target network
            Q_best_action = self.qnetwork_local(next_states).detach().argmax(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, Q_best_action.unsqueeze(-1))
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network parameters to `tau * local.parameters() + (1 - tau) * target.parameters()`
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)


    # Checkpointing methods

    def save(self, path, *data):
        torch.save(self.qnetwork_local.state_dict(), path / 'model_checkpoint.local')
        torch.save(self.qnetwork_target.state_dict(), path / 'model_checkpoint.target')
        torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(torch.load(path / 'model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(torch.load(path / 'model_checkpoint.target'))
            self.optimizer.load_state_dict(torch.load(path / 'model_checkpoint.optimizer'))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except:
            print("No checkpoint file was found")
            return defaults
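
A note on the learning-rate schedule used above: StepLR(step_size=4000, gamma=0.98) multiplies the optimizer's learning rate by 0.98 after every 4000 scheduler steps, and lr_scheduler.step() is called once per learn() call here. After 100,000 learning updates that is 25 decays, so the learning rate has shrunk to roughly 0.98²⁵ ≈ 0.60 of its initial value.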
Example #25
File: sac.py  Project: dmitrySorokin/SAC
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        critic_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, device='cpu'):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(
                torch.load(actor_path, map_location=torch.device(device)))
        if critic_path is not None:
            self.critic.load_state_dict(
                torch.load(critic_path, map_location=torch.device(device)))
Example #26
class DQN_Agent:
    def __init__(self, state_size, action_size, seed=42):
        self.action_size = action_size

        # Q-Network
        self.q_eval = QNetwork(state_size, action_size, seed).to(device)
        self.q_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.RMSprop(self.q_eval.parameters(), lr=LR)

        # Replay Buffer
        self.memory = ReplayBuffer(seed=seed)

        self.step_count = 0
        self.seed = random.seed(seed)
    
    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_eval.eval()
        with torch.no_grad():
            q_values = self.q_eval(state)
        self.q_eval.train()

        epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.step_count / EPS_DECAY)
        if random.random() > epsilon:
            # greedy
            return np.argmax(q_values.cpu().data.numpy())
        else:
            # explore
            return random.choice(np.arange(self.action_size))
        
    def step(self, state, action, reward, next_state, done):       
        self.memory.push(state, action, reward, next_state, done)

        loss_value = None
        if len(self.memory) >= BATCH_SIZE:
            # sample transitions from replay buffer
            states, actions, rewards, next_states, dones = self.memory.sample()
 
            #  r                                   if done
            #  r + max_a \gamma Q(s, a; \theta')   if not done
            q_next_values = self.q_target(next_states).detach().max(1)[0].unsqueeze(1)
            q_learning_targets = rewards + GAMMA * q_next_values * (1 - dones)

            # Q(s, a; \theta)
            q_values = self.q_eval(states).gather(1, actions)

            # perform gradient descent on the loss
            loss = F.mse_loss(q_values, q_learning_targets)
            loss_value = loss.data.item()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # update target Q-Network
            self.update_target()

        self.step_count += 1
        return loss_value
    
    def update_target(self):
        if self.step_count % UPDATE_TARGET_STEPS == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())
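
A note on the exploration schedule above: unlike the per-episode multiplicative decay of the other examples, this agent anneals epsilon with the global step count, ε(t) = EPS_END + (EPS_START − EPS_END)·exp(−t / EPS_DECAY). As an illustration only (the actual constants are defined outside this snippet), with EPS_START = 1.0, EPS_END = 0.05 and EPS_DECAY = 200 the value is ε ≈ 0.05 + 0.95·e⁻¹ ≈ 0.40 after 200 steps and ≈ 0.056 after 1000 steps.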
Example #27
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, double_dqn=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def save(self, filename):
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(torch.load(filename + ".local"))
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(torch.load(filename + ".target"))

    def step(self, state, action, reward, next_state, done, train=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                if train:
                    self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):

        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN
            q_best_action = self.qnetwork_local(next_states).max(1)[1]
            Q_targets_next = self.qnetwork_target(next_states).gather(1, q_best_action.unsqueeze(-1))
        else:
            # DQN
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
Example #28
class Agent():
    """
    Initialize Agent, including:
        DQN Hyperparameters
        Local and Target State-Action Policy Networks
        Replay Memory Buffer from the ReplayBuffer class (defined below)
    """
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """
        DQN Agent Parameters
        ====== 
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): either 'DQN' for vanilla DQN learning (default) or 'DDQN' for Double-DQN.
            replay_memory_size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): discount factor applied to future rewards (typically .95 to .995)
            learning_rate (float): rate of model learning (typically 1e-4 to 1e-3)
            target_tau (float): soft-update interpolation factor for the target network
            update_rate (int): perform a learning step every `update_rate` environment steps
            seed (int): random seed for initializing training.
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed:
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated toward the local network at a slower (target_tau) rate.
        # The slower modulation of the target network weights serves to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done, update=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                if update:
                    self.learn(experiences, self.gamma)

########################################################
# ACT() method
#

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

########################################################
# LEARN() method
# Update value parameters using given batch of experience tuples.

    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the local model network
        Qsa = self.network(states).gather(1, actions)

        if (self.dqn_type == 'DDQN'):
            # Double DQN: select the next action with the online network, evaluate it with the target network
            # Qsa_targets = r + γ * Q_target(s', argmax_a Q_network(s', a))
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)

        else:
            #Regular (Vanilla) DQN
            #************************
            # Get max Q values for (s',a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """

    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        if not os.path.exists('./save/dqn/'):
            os.makedirs('./save/dqn/')
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        f_path = './save/dqn/dqn_param_' + str(
            iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')
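
For comparison, the two targets computed in learn() above are:

    Vanilla DQN:  Qsa_targets = r + γ·max_a' Q_target(s', a')·(1 − done)
    Double DQN:   Qsa_targets = r + γ·Q_target(s', argmax_a' Q_network(s', a'))·(1 − done)

Selecting the action with the online network while evaluating it with the target network is what reduces the maximization bias of the vanilla update.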
Example #29
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 gamma=0.99,
                 learning_rate=5e-4,
                 use_RB=True,
                 RB_size=int(1e5),
                 RB_batch_size=64,
                 use_TM=True,
                 TM_update_every=4,
                 use_DDQN=True,
                 use_PER=False,
                 PER_epsilon=0.01,
                 PER_alpha=0.5,
                 PER_beta=0.4,
                 PER_beta_increment=0.001,
                 use_DUELING=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size                  (int)   : dimension of each state
            action_size                 (int)   : dimension of each action
            seed                        (int)   : random seed
            gamma                       (float) : discount factor
            learning_rate               (float) : learning rate of the model

            use_RB                      (bool)  : Use a replay buffer
            RB_size                     (int)   : replay buffer size
            RB_batch_size               (int)   : minibatch size of the learning

            use_TM                      (bool)  : Use a target model
            TM_update_every             (int)   : sync the target model every TM_update_every learning steps

            use_DDQN                    (bool)  : Use Double DQN (requires a target model)

            use_PER                     (bool)  : Use a prioritized replay buffer
            PER_epsilon                 (float) : Small value added to priorities to avoid zero sampling probabilities
            PER_alpha                   (float) : Exponent used to compute the sampling probabilities
                                                  [0-1]: 0 => uniform sampling, 1 => fully prioritized
            PER_beta                    (float) : Importance-sampling exponent; initial value, annealed toward 1
            PER_beta_increment          (float) : Amount added to beta at each sampling

            use_DUELING                 (bool)  : Use a dueling network architecture
        """
        # Sanity-check parameter combinations
        assert not use_PER or use_RB, "PER requires a replay buffer (use_RB=True)"
        assert not use_DDQN or use_TM, "Double DQN requires a target model (use_TM=True)"

        self.state_size = state_size
        self.action_size = action_size

        self.gamma = gamma

        # Q-Network
        self.qnetwork_policy = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        use_DUELING=use_DUELING).to(device)
        self.optimizer = optim.Adam(self.qnetwork_policy.parameters(),
                                    lr=learning_rate)

        self.use_DDQN = use_DDQN
        self.use_TM = use_TM
        if use_TM:
            self.qnetwork_target = QNetwork(state_size,
                                            action_size,
                                            seed,
                                            use_DUELING=use_DUELING).to(device)
            self.TM_update_every = TM_update_every

        # Initialize time step
        self.t_step = 0

        # Replay memory
        self.use_RB = use_RB
        self.RB_batch_size = RB_batch_size
        self.use_PER = use_PER
        if use_PER:
            self.memory = ReplayBufferPER(RB_size,
                                          RB_batch_size,
                                          seed,
                                          epsilon=PER_epsilon,
                                          alpha=PER_alpha,
                                          beta=PER_beta,
                                          beta_increment=PER_beta_increment)
        elif use_RB:
            self.memory = ReplayBuffer(RB_size, RB_batch_size, seed)

        # Init the seed
        random.seed(seed)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_policy.eval()
            with torch.no_grad():
                action_values = self.qnetwork_policy(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save the experience in replay memory, if one is used
        if self.use_PER:
            # PER needs the TD error of this experience to seed its initial priority
            Q_target, Q_expected = self._QValues([(state, action, reward,
                                                   next_state, done)])
            error = (Q_target - Q_expected).cpu().squeeze().data.item()

            self.memory.add(error, (state, action, reward, next_state, done))
        elif self.use_RB:
            self.memory.add((state, action, reward, next_state, done))
        else:
            self.experiences = [(state, action, reward, next_state, done)]

        # One more step.
        self.t_step += 1

        # Learn if there is no replay buffer, or once enough samples are available in memory
        if not self.use_RB or len(self.memory) > self.RB_batch_size:
            self._learn()

    def _QValues(self, batch):
        """Execute a forward path for the QNetworks to get the QValues (expected and target)
           So the TD error can be computed or used to learn

           Params
           ======

           batch : Array of tuple <state, action, reward, next_state, done>
        """

        # Transpose the batch so that each row holds one field (states, actions, ...)
        mini_batch = np.array(batch, dtype=object).transpose()

        states = torch.Tensor(np.vstack(mini_batch[0])).float().to(device)
        actions = torch.Tensor(np.vstack(mini_batch[1])).long().to(device)
        rewards = torch.Tensor(np.vstack(mini_batch[2])).float().to(device)
        next_states = torch.Tensor(np.vstack(mini_batch[3])).float().to(device)
        dones = torch.Tensor(np.vstack(
            mini_batch[4]).astype(int)).float().to(device)

        # Forward passes on the next states (no grad). The policy network is needed
        # when it selects the next action (Double DQN) or when there is no target model.
        if not self.use_TM or self.use_DDQN:
            self.qnetwork_policy.eval()
            with torch.no_grad():
                action_values_policy = self.qnetwork_policy(next_states)

        if self.use_TM:
            self.qnetwork_target.eval()
            with torch.no_grad():
                action_values_target = self.qnetwork_target(next_states)

        if self.use_TM:
            if self.use_DDQN:
                Q_targets_next = action_values_target.gather(
                    dim=1,
                    index=action_values_policy.max(dim=1, keepdim=True)[1])
            else:
                Q_targets_next = action_values_target.max(dim=1,
                                                          keepdim=True)[0]
        else:
            Q_targets_next = action_values_policy.max(dim=1, keepdim=True)[0]
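        # Note (added): the branches above implement the two standard targets:
        #   vanilla DQN : y = r + gamma * max_a' Q_target(s', a')
        #   Double DQN  : y = r + gamma * Q_target(s', argmax_a' Q_policy(s', a'))
        # (falling back to the policy network when no target model is used)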

        # Zero out the bootstrap term for terminal transitions
        Q_targets_next = Q_targets_next * (1 - dones)

        # Compute the Q targets for current states
        Q_targets = rewards + self.gamma * Q_targets_next

        # Get the Q values from policy model
        self.qnetwork_policy.train()
        Q_expected = self.qnetwork_policy(states).gather(dim=1, index=actions)

        return Q_targets, Q_expected

    def _learn(self):
        """Update value parameters using given a batch of experience tuples."""

        if self.use_PER:
            experiences, indexes, IS_weights = self.memory.sample()
            IS_weights = torch.Tensor(np.vstack(IS_weights)).float().to(device)
        elif self.use_RB:
            experiences = self.memory.sample()
        else:
            experiences = self.experiences

        # Get the Qvalues for those experiences
        Q_targets, Q_expected = self._QValues(experiences)

        if self.use_PER:
            # Update priorities of the replay buffer
            errors = (Q_targets - Q_expected).cpu().squeeze().data.numpy()
            self.memory.update_priorities(indexes, errors)

            # Update Qs with the importance-sampling weight correction
            Q_expected *= IS_weights**0.5
            Q_targets *= IS_weights**0.5
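            # Note (added): multiplying both tensors by sqrt(w_i) before the MSE is
            # equivalent to weighting each squared TD error by the importance-sampling
            # weight w_i, i.e. loss = mean(w_i * (Q_expected_i - Q_target_i)^2)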

        # Loss computation
        loss = F.mse_loss(Q_expected, Q_targets)
        #loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.use_TM:
            self.t_step %= self.TM_update_every
            if self.t_step == 0:
                self.qnetwork_target.load_state_dict(
                    self.qnetwork_policy.state_dict())

    def save_weights(self, file='checkpoint.pth'):
        """Save the agent network weights in a checkpoint file"""
        torch.save(self.qnetwork_policy.state_dict(), file)

    def load_weights(self, file='checkpoint.pth'):
        """Load the agent network weights from a checkpoint file"""
        self.qnetwork_policy.load_state_dict(torch.load(file, map_location=device))
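
# A minimal training-loop sketch (not part of the original example) showing how the
# Agent above is typically driven. It assumes a gymnasium environment and that the
# QNetwork / ReplayBuffer classes the Agent depends on are available; treat it as an
# illustration of the act -> step -> epsilon-decay cycle, not as the author's script.
import gymnasium as gym

env = gym.make("CartPole-v1")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              use_PER=False)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(500):
    state, _ = env.reset()
    done, score = False, 0.0
    while not done:
        action = int(agent.act(state, eps))
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.step(state, action, reward, next_state, done)
        state, score = next_state, score + reward
    eps = max(eps_end, eps_decay * eps)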
Example #30
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, filepath):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.avarage_score = 0
        self.start_epoch = 0
        # Seed the RNG first so the derived seed is reproducible
        random.seed(seed)
        self.seed = random.randint(0, seed)
        print("seed ", seed, "  self.seed ", self.seed)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       self.seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        self.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        if filepath:
            self.load_model(filepath)

        # Replay memory
        print("buffer size ", BUFFER_SIZE)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   self.seed)
        print("memory ", self.memory)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            #print("experiences ",experiences)
            self.learn_DDQN(experiences, GAMMA)
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                self.update_network(self.qnetwork_local, self.qnetwork_target)
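        # Note (added): this variant learns on every environment step once the buffer
        # holds more than BATCH_SIZE transitions, and syncs the target network with a
        # hard copy (update_network) every UPDATE_EVERY learning steps; the soft_update
        # alternative is left commented out in learn()/learn_DDQN().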

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Double DQN: the local network selects the greedy next action,
        # the target network evaluates it
        Q_targets_next_argmax = self.qnetwork_local(next_states).squeeze(
            0).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).squeeze(
            0).detach().gather(1, Q_targets_next_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_model(self, filepath, epoch, score, last=False):
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers':
            [each.in_features for each in self.qnetwork_local.hidden_layers],
            'state_dict': self.qnetwork_local.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epoch': epoch,
            'avarage_score': score
        }
        checkpoint['hidden_layers'].append(
            self.qnetwork_local.hidden_layers[-1].out_features)
        torch.save(checkpoint, filepath)
        if last:
            torch.save(self.qnetwork_local.state_dict(),
                       '{}_state_dict_{}.pt'.format(last, epoch))
        #print("checkpoint['hidden_layers'] ",checkpoint['hidden_layers'])

    def load_model(self, filepath):
        print("seed ", self.seed)
        if os.path.isfile(filepath):
            print("=> loading checkpoint '{}'".format(filepath))
            checkpoint = torch.load(filepath, map_location=device)
            print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])
            self.qnetwork_local = QNetwork(
                checkpoint['input_size'], checkpoint['output_size'], self.seed,
                checkpoint['hidden_layers']).to(device)
            self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_local.to(device)
            self.qnetwork_target = QNetwork(
                checkpoint['input_size'], checkpoint['output_size'], self.seed,
                checkpoint['hidden_layers']).to(device)
            self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_target.to(device)
            # Rebuild the optimizer over the newly created network's parameters
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
            if 'optimizer_state_dict' in checkpoint:
                self.optimizer.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(device)
                print(self.optimizer)
            if 'epoch' in checkpoint:
                self.start_epoch = checkpoint['epoch']
            if 'avarage_score' in checkpoint:
                self.avarage_score = checkpoint['avarage_score']

            print(self.qnetwork_target)
            print(self.optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(filepath))

    def update_network(self, local_model, target_model):
        for target, local in zip(target_model.parameters(),
                                 local_model.parameters()):
            target.data.copy_(local.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
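
# A small, self-contained check (not part of the original example) of the soft-update
# rule above: with tau = 1.0 the target network becomes an exact copy of the local
# network (what the hard update in update_network() does), while a small tau only
# nudges the target toward the local weights. Uses throwaway nn.Linear models.
import torch
import torch.nn as nn

local_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)

def polyak_update(local_model, target_model, tau):
    # θ_target = τ*θ_local + (1 - τ)*θ_target, applied parameter-wise
    for t, l in zip(target_model.parameters(), local_model.parameters()):
        t.data.copy_(tau * l.data + (1.0 - tau) * t.data)

polyak_update(local_net, target_net, tau=1.0)
assert all(torch.equal(t, l)
           for t, l in zip(target_net.parameters(), local_net.parameters()))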