class A2CAgent:
    def __init__(self,
                 replay_size,
                 memory_size=10000,
                 prioritized=False,
                 load_models=False,
                 actor_model_file='',
                 critic_model_file='',
                 is_eval=False):
        self.state_size = 2
        self.action_size = 3
        self.step = 0
        self.replay_size = replay_size
        self.replay_queue = deque(maxlen=self.replay_size)
        self.memory_size = memory_size
        self.prioritized = prioritized
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)

        # Hyperparameters for learning
        self.value_size = 1
        self.layer_size = 16
        self.discount_factor = 0.99
        self.actor_learning_rate = 0.0005
        self.critic_learning_rate = 0.005
        self.is_eval = is_eval

        # Create actor and critic neural networks
        self.actor = self.build_actor()
        self.critic = self.build_critic()
        #self.actor.summary()

        if load_models:
            if actor_model_file:
                self.actor.load_weights(actor_model_file)
            if critic_model_file:
                self.critic.load_weights(critic_model_file)

    # The actor takes a state and outputs probabilities of each possible action
    def build_actor(self):

        layer1 = Dense(self.layer_size,
                       input_dim=self.state_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        layer2 = Dense(self.layer_size,
                       input_dim=self.layer_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        # Use softmax activation so that the sum of probabilities of the actions becomes 1
        layer3 = Dense(self.action_size,
                       activation='softmax',
                       kernel_initializer='he_uniform')  # self.action_size = 3

        actor = Sequential(layers=[layer1, layer2, layer3])

        # Print a summary of the network
        actor.summary()

        # We use categorical crossentropy loss since we have a probability distribution
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_learning_rate))
        return actor

    # The critic takes a state and outputs the predicted value of the state
    def build_critic(self):

        layer1 = Dense(self.layer_size,
                       input_dim=self.state_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        layer2 = Dense(self.layer_size,
                       input_dim=self.layer_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        layer3 = Dense(self.value_size,
                       activation='linear',
                       kernel_initializer='he_uniform')  # self.value_size = 1

        critic = Sequential(layers=[layer1, layer2, layer3])

        # Print a summary of the network
        critic.summary()

        critic.compile(loss='mean_squared_error',
                       optimizer=Adam(lr=self.critic_learning_rate))
        return critic

    def act(self, state):
        # Get probabilities for each action
        policy = self.actor.predict(np.array([state]), batch_size=1).flatten()

        # Sample an action from the policy during training,
        # act greedily during evaluation
        if not self.is_eval:
            return np.random.choice(self.action_size, p=policy)
        else:
            return np.argmax(policy)

    def store_transition(self, s, a, r, s_, dd):
        if self.prioritized:  # prioritized replay
            # 7-element transition: s (2), a, r, s_ (2), done
            transition = np.hstack((s, [a, r], s_, dd))
            # new transitions are stored with the current maximum priority
            self.memory.store(transition)
        else:
            transition = np.hstack((s, [a, r], s_, dd))
            self.replay_queue.append(transition)

    def expReplay(self, batch_size=64, lr=1, factor=0.95):
        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
        else:
            batch_memory = random.sample(self.replay_queue, batch_size)

        s_prevBatch = np.array([replay[[0, 1]] for replay in batch_memory])
        a = np.array([replay[[2]] for replay in batch_memory])
        r = np.array([replay[[3]] for replay in batch_memory])
        s_currBatch = np.array([replay[[4, 5]] for replay in batch_memory])
        d = np.array([replay[[6]] for replay in batch_memory])

        td_error = np.zeros((d.shape[0], ), dtype=float)
        for i in range(d.shape[0]):
            action = int(a[i, 0])
            reward = r[i, 0]
            done = int(d[i, 0]) == 1

            q_prev = self.critic.predict(np.array([s_prevBatch[i, :]]))[0, 0]
            q_curr = self.critic.predict(np.array([s_currBatch[i, :]]))[0, 0]

            # Bootstrap from the next state only if the episode has not ended
            q_real = reward if done else reward + factor * q_curr

            # One-hot advantage vector: only the taken action gets a non-zero target
            advantages = np.zeros((1, self.action_size))
            advantages[0, action] = q_real - q_prev

            if self.prioritized:
                td_error[i] = abs(q_real - q_prev)

            self.actor.fit(np.array([s_prevBatch[i, :]]),
                           advantages,
                           epochs=1,
                           verbose=0)
            self.critic.fit(np.array([s_prevBatch[i, :]]),
                            np.array([[q_real]]),
                            epochs=1,
                            verbose=0)

        if self.prioritized:
            self.memory.batch_update(tree_idx, td_error)
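
# The Memory class used by A2CAgent (and by DoubleDQN further below) via
# store() / sample() / batch_update() is not part of this listing.  The class
# below is only a minimal sketch of a compatible prioritized replay buffer:
# the name SimpleProportionalMemory, the alpha/beta/epsilon defaults and the
# flat priority array (used here instead of the usual SumTree) are assumptions
# made for illustration, not the original implementation.
import numpy as np


class SimpleProportionalMemory:
    def __init__(self, capacity, alpha=0.6, beta=0.4, epsilon=0.01):
        self.capacity = capacity
        self.alpha = alpha        # how strongly priorities bias sampling
        self.beta = beta          # strength of the importance-sampling correction
        self.epsilon = epsilon    # keeps every priority strictly positive
        self.data = np.zeros((capacity,), dtype=object)
        self.priorities = np.zeros((capacity,), dtype=float)
        self.write = 0
        self.n_entries = 0

    def store(self, transition):
        # New transitions get the current maximum priority so they are
        # sampled at least once before their TD error is known
        max_p = self.priorities[:self.n_entries].max() if self.n_entries else 1.0
        self.data[self.write] = transition
        self.priorities[self.write] = max_p
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def sample(self, batch_size):
        # Sample indices with probability proportional to priority ** alpha
        probs = self.priorities[:self.n_entries] ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(self.n_entries, batch_size, p=probs)
        batch = np.array([self.data[i] for i in idx])
        # Importance-sampling weights, normalised by their maximum
        weights = (self.n_entries * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        return idx, batch, weights.reshape(-1, 1)

    def batch_update(self, idx, abs_errors):
        # Replace the priorities of the sampled transitions with their
        # new absolute TD errors
        self.priorities[idx] = np.abs(abs_errors) + self.epsilon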
class Agent:
    """
    Interacts with and learns from the environment.
    Learns using a Double Deep Q-Network with prioritised experience replay.
    Two models are instantiated: one that is evaluated and updated during training (qnetwork_local)
    and one that provides the target values in the learning algorithm (qnetwork_target).
    """

    BUFFER_SIZE = int(1e5)  # prioritised experience replay buffer size
    BATCH_SIZE = 64  # minibatch size
    TAU = 1e-3  # for soft update of target parameters
    LR = 5e-4  # learning rate
    UPDATE_EVERY = 4  # how often to update the network
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def __init__(self,
                 state_size: int = 37,
                 action_size: int = 4,
                 seed: int = 44,
                 gamma: float = 0.99,
                 tau: float = 1e-3):
        """
        Initialize an Agent object.

        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param seed: random seed for network initialisation
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.tau = tau

        self.max_w = 0

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.LR)

        # Prioritised Experience Replay memory
        self.memory = Memory(self.BUFFER_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self,
             state: np.ndarray,
             action: int,
             reward: float,
             next_state: np.ndarray,
             done: bool,
             gamma: Optional[float] = None,
             tau: Optional[float] = None):
        """
        An agent step takes the current experience and stores it in the replay memory, then samples from the memory and
        calls the learning algorithm.

        :param state: the state vector
        :param action: the action performed on the state
        :param reward: the reward given upon performing the action
        :param next_state: the next state after doing the action
        :param done: True if the episode has ended
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        gamma_value = gamma if gamma is not None else self.gamma
        tau_value = tau if tau is not None else self.tau

        self.memory.add((state, action, reward, next_state,
                         done))  # Save experience in replay memory

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.tree.n_entries > self.BATCH_SIZE:
                experiences, idxs, importance_weights = self.memory.sample(
                    self.BATCH_SIZE)
                self.learn(experiences, idxs, importance_weights, gamma_value,
                           tau_value)

    def act(self, state: np.ndarray, eps: float = 0.0):
        """
        Returns the action for the given state as per the current policy. Uses the local copy of the model.

        :param state: current state
        :param eps: epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.int32(np.argmax(action_values.cpu().data.numpy()))
        else:
            return np.int32(random.choice(np.arange(self.action_size)))

    def learn(self, experiences: Tuple[torch.Tensor, torch.Tensor,
                                       torch.Tensor, torch.Tensor,
                                       torch.Tensor], indices: np.ndarray,
              importance_weights: torch.Tensor, gamma: float, tau: float):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences: tuple of (s, a, r, s', done) tuples
        :param indices:
            indices of the SumTree that contain the priority values for these experiences. Used for updating the
            priority values after error has been found
        :param importance_weights: the weighting that each experience carries when used in updating the network
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        states, actions, rewards, next_states, dones = experiences

        # For Double-DQN, select the next action with the local (online) model
        next_action = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)
        # Evaluate the selected action with the target model
        q_targets_next = self.qnetwork_target(next_states).gather(
            1, next_action).detach()
        # Compute Q targets for current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # Absolute TD errors become the new priorities
        error = torch.abs(q_targets - q_expected).detach().cpu().numpy()

        # update priorities
        self.memory.batch_update(indices, error)

        # Element-wise MSE weighted by the importance-sampling weights
        elementwise_loss = F.mse_loss(q_expected, q_targets, reduction='none')
        loss = (importance_weights * elementwise_loss).mean()
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network with model parameters approaching those of the local network.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    @staticmethod
    def soft_update(local_model: torch.nn.Module,
                    target_model: torch.nn.Module, tau: float):
        """
        Soft update model parameters. Every learning step the target network is updated to bring its parameters nearer
        by a factor TAU to those of the improving local network.

        If TAU = 1 the target network becomes a copy of the local network.
        If TAU = 0 the target network is not updated.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: weights will be copied from
        :param target_model: weights will be copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
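
# The QNetwork used by the Agent above is not included in this listing.  The
# class below is only a minimal sketch of a network with a compatible
# constructor signature QNetwork(state_size, action_size, seed); the two
# hidden layers of width 64 are an assumption for illustration, not the
# original architecture.
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Sketch of a compatible Q-network; the real architecture is defined elsewhere."""

    def __init__(self, state_size, action_size, seed, hidden_size=64):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        # Map a batch of states to one Q-value per action (no output activation)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


# Hypothetical usage with the Agent above (dimensions follow its defaults):
# agent = Agent(state_size=37, action_size=4, seed=44)
# action = agent.act(state, eps=0.1)
# agent.step(state, action, reward, next_state, done)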
class DoubleDQN(object):
    def __init__(self, replay_size, memory_size=10000, prioritized=False):
        self.step = 0
        self.replay_size = replay_size
        self.replay_queue = deque(maxlen=self.replay_size)
        self.memory_size = memory_size
        self.tau = 1e-2  # soft-update rate for the target network (MountainCar-v0)
        self.model = self.create_model()
        self.prioritized = prioritized
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)

    def create_model(self):

        STATE_DIM, ACTION_DIM = 2, 3
        model = models.Sequential([
            layers.Dense(100, input_dim=STATE_DIM, activation='relu'),
            layers.Dense(ACTION_DIM, activation="linear")
        ])
        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.Adam(0.001))
        return model

    def act(self, s, epsilon=0.1):
        # Epsilon-greedy: the exploration probability decays as self.step grows
        if np.random.uniform() < epsilon - self.step * 0.0002:
            return np.random.choice([0, 1, 2])
        return np.argmax(self.model.predict(np.array([s]))[0])

    def save_model(self, file_path='MountainCar-v0-Ddqn.h5'):
        self.model.save(file_path)
        print('model saved')

    def store_transition(self, s, a, r, s_, dd):
        if self.prioritized:  # prioritized replay
            # 7-element transition: s (2), a, r, s_ (2), done
            transition = np.hstack((s, [a, r], s_, dd))
            # new transitions are stored with the current maximum priority
            self.memory.store(transition)
        else:
            transition = np.hstack((s, [a, r], s_, dd))
            self.replay_queue.append(transition)

    def expReplay(self, batch_size=64, lr=1, factor=0.95):

        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
        else:
            batch_memory = random.sample(self.replay_queue, batch_size)

        s_batch = np.array([replay[[0, 1]] for replay in batch_memory])
        a = np.array([replay[[2]] for replay in batch_memory])
        r = np.array([replay[[3]] for replay in batch_memory])
        next_s_batch = np.array([replay[[4, 5]] for replay in batch_memory])
        d = np.array([replay[[6]] for replay in batch_memory])

        Q = self.model.predict(s_batch)
        Q_next = self.model.predict(next_s_batch)
        Q_targ = self.target_model.predict(next_s_batch)

        # Update Q targets: the online model selects the best next action,
        # the target model evaluates it (Double-DQN)
        td_error = np.zeros((d.shape[0], ), dtype=float)
        for i in range(d.shape[0]):
            action = int(a[i, 0])
            old_q = Q[i, action]
            if int(d[i, 0]) == 1:
                Q[i, action] = r[i, 0]
            else:
                next_best_action = np.argmax(Q_next[i, :])
                Q[i, action] = r[i, 0] + factor * Q_targ[i, next_best_action]

            if self.prioritized:
                td_error[i] = abs(old_q - Q[i, action])

        if self.prioritized:
            self.memory.batch_update(tree_idx, td_error)

        self.model.fit(s_batch, Q, verbose=0)

    def transfer_weights(self):
        """ Transfer Weights from Model to Target at rate Tau
        """
        W = self.model.get_weights()
        tgt_W = self.target_model.get_weights()
        for i in range(len(W)):
            tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i]
        self.target_model.set_weights(tgt_W)
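
# A minimal sketch of how the DoubleDQN class above might be driven.  The
# classic gym API for MountainCar-v0, the episode count, the warm-up
# threshold and the update cadence are all assumptions for illustration.
import gym

if __name__ == '__main__':
    env = gym.make('MountainCar-v0')   # 2-dimensional state, 3 actions
    agent = DoubleDQN(replay_size=2000, memory_size=10000, prioritized=True)

    for episode in range(200):
        s = env.reset()
        done = False
        while not done:
            a = agent.act(s)
            s_, r, done, _ = env.step(a)      # classic (pre-0.26) gym API
            agent.store_transition(s, a, r, s_, float(done))
            agent.step += 1                   # drives the epsilon schedule in act()

            # Learn once enough transitions have been collected
            if agent.step > 200 and agent.step % 5 == 0:
                agent.expReplay(batch_size=64)
                agent.transfer_weights()      # soft-update the target network
            s = s_

    agent.save_model()
    env.close()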