import random
from collections import deque

import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


class A2CAgent:
    def __init__(self, replay_size, memory_size=10000, prioritized=False,
                 load_models=False, actor_model_file='', critic_model_file='',
                 is_eval=False):
        self.state_size = 2
        self.action_size = 3
        self.step = 0
        self.replay_size = replay_size
        self.replay_queue = deque(maxlen=self.replay_size)
        self.memory_size = memory_size
        self.prioritized = prioritized
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)

        # Hyperparameters for learning
        self.value_size = 1
        self.layer_size = 16
        self.discount_factor = 0.99
        self.actor_learning_rate = 0.0005
        self.critic_learning_rate = 0.005
        self.is_eval = is_eval

        # Create the actor and critic neural networks
        self.actor = self.build_actor()
        self.critic = self.build_critic()

        if load_models:
            if actor_model_file:
                self.actor.load_weights(actor_model_file)
            if critic_model_file:
                self.critic.load_weights(critic_model_file)

    # The actor takes a state and outputs a probability for each possible action
    def build_actor(self):
        layer1 = Dense(self.layer_size, input_dim=self.state_size,
                       activation='relu', kernel_initializer='he_uniform')
        layer2 = Dense(self.layer_size, input_dim=self.layer_size,
                       activation='relu', kernel_initializer='he_uniform')
        # Softmax activation so that the action probabilities sum to 1
        layer3 = Dense(self.action_size, activation='softmax',
                       kernel_initializer='he_uniform')
        actor = Sequential(layers=[layer1, layer2, layer3])
        # Print a summary of the network
        actor.summary()
        # Categorical cross-entropy loss, since the output is a probability distribution
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(learning_rate=self.actor_learning_rate))
        return actor

    # The critic takes a state and outputs the predicted value of that state
    def build_critic(self):
        layer1 = Dense(self.layer_size, input_dim=self.state_size,
                       activation='relu', kernel_initializer='he_uniform')
        layer2 = Dense(self.layer_size, input_dim=self.layer_size,
                       activation='relu', kernel_initializer='he_uniform')
        layer3 = Dense(self.value_size, activation='linear',
                       kernel_initializer='he_uniform')
        critic = Sequential(layers=[layer1, layer2, layer3])
        # Print a summary of the network
        critic.summary()
        critic.compile(loss='mean_squared_error',
                       optimizer=Adam(learning_rate=self.critic_learning_rate))
        return critic

    def act(self, state):
        # Get the probability of each action from the actor
        policy = self.actor.predict(np.array([state]), batch_size=1).flatten()
        if not self.is_eval:
            # Sample an action from the policy distribution
            return np.random.choice(self.action_size, 1, p=policy).take(0)
        else:
            # During evaluation, act greedily
            return np.argmax(policy)

    def store_transition(self, s, a, r, s_, dd):
        # Flatten (s, a, r, s', done) into a single 7-element transition vector
        transition = np.hstack((s, [a, r], s_, dd))
        if self.prioritized:
            # Newly arrived transitions get the highest priority
            self.memory.store(transition)
        else:
            self.replay_queue.append(transition)

    def expReplay(self, batch_size=64, lr=1, factor=0.95):
        # (lr is unused; kept for call-site compatibility)
        if self.prioritized:
            # (ISWeights are sampled but not used to weight the updates here)
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
        else:
            batch_memory = random.sample(self.replay_queue, batch_size)

        s_prevBatch = np.array([replay[[0, 1]] for replay in batch_memory])
        a = np.array([replay[[2]] for replay in batch_memory])
        r = np.array([replay[[3]] for replay in batch_memory])
        s_currBatch = np.array([replay[[4, 5]] for replay in batch_memory])
        d = np.array([replay[[6]] for replay in batch_memory])

        td_error = np.zeros((d.shape[0],), dtype=float)
        for i in range(d.shape[0]):
            q_prev = float(self.critic.predict(np.array([s_prevBatch[i, :]]))[0, 0])
            q_curr = float(self.critic.predict(np.array([s_currBatch[i, :]]))[0, 0])
            if int(d[i]) == 1:
                # Terminal transition: the target is the reward alone
                q_realP = float(r[i])
            else:
                q_realP = float(r[i]) + factor * q_curr
            # One-hot advantage vector for the action that was taken
            advantages = np.zeros((1, self.action_size))
            advantages[0, int(a[i])] = q_realP - q_prev
            if self.prioritized:
                td_error[i] = abs(advantages[0, int(a[i])])
            self.actor.fit(np.array([s_prevBatch[i, :]]), advantages,
                           epochs=1, verbose=0)
            self.critic.fit(np.array([s_prevBatch[i, :]]),
                            np.array([[q_realP]]), epochs=1, verbose=0)
        if self.prioritized:
            self.memory.batch_update(tree_idx, td_error)
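# ---------------------------------------------------------------------------
# A2CAgent above (and DoubleDQN below) rely on a Memory class that is not
# defined in this file: they call Memory(capacity=...), memory.store(t),
# memory.sample(n) -> (tree_idx, batch, ISWeights) and
# memory.batch_update(tree_idx, td_errors). What follows is a minimal sketch
# of a compatible SumTree-backed proportional prioritised replay buffer; the
# constants alpha, beta and per_epsilon are conventional choices, not values
# taken from the original code. (The PyTorch Agent below expects a slightly
# different interface -- memory.add() and a different return order from
# sample() -- so this sketch matches only the two Keras agents.)

import numpy as np


class SumTree:
    """Binary tree whose internal nodes store the sum of their children's
    priorities; the leaves hold the transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # priorities
        self.data = np.zeros(capacity, dtype=object)  # transitions
        self.write = 0
        self.n_entries = 0

    def add(self, priority, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        while idx != 0:  # propagate the change up to the root
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def get(self, s):
        idx = 0
        while idx < self.capacity - 1:  # descend until a leaf is reached
            left = 2 * idx + 1
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = left + 1
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]


class Memory:
    alpha = 0.6         # how strongly priorities shape the sampling (assumed)
    beta = 0.4          # importance-sampling correction strength (assumed)
    per_epsilon = 0.01  # keeps every priority strictly positive (assumed)

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.max_priority = 1.0

    def store(self, transition):
        # New transitions get the current maximum priority so they are
        # replayed at least once
        self.tree.add(self.max_priority ** self.alpha, transition)

    def sample(self, n):
        # Stratified sampling: one draw from each of n equal segments of the
        # total priority mass
        idxs, batch, priorities = [], [], []
        segment = self.tree.tree[0] / n
        for i in range(n):
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get(s)
            idxs.append(idx)
            batch.append(data)
            priorities.append(p)
        probs = np.maximum(np.array(priorities) / self.tree.tree[0], 1e-8)
        weights = (self.tree.n_entries * probs) ** -self.beta
        weights /= weights.max()  # normalise so the largest weight is 1
        return np.array(idxs), np.array(batch), weights

    def batch_update(self, tree_idx, td_errors):
        # Priority is the absolute TD error, offset so it is never zero
        priorities = (np.abs(td_errors) + self.per_epsilon) ** self.alpha
        self.max_priority = max(self.max_priority, float(priorities.max()))
        for idx, p in zip(tree_idx, priorities):
            self.tree.update(int(idx), p)
# ---------------------------------------------------------------------------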
import random
from typing import Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# QNetwork and Memory (a SumTree-backed prioritised replay buffer) are
# defined elsewhere in this project.


class Agent:
    """
    Interacts with and learns from the environment. Learns using a Deep
    Q-Network with prioritised experience replay. Two models are
    instantiated: one used during evaluation and updating (qnetwork_local)
    and one that provides the target values in the learning algorithm
    (qnetwork_target).
    """
    BUFFER_SIZE = int(1e5)  # prioritised experience replay buffer size
    BATCH_SIZE = 64         # minibatch size
    TAU = 1e-3              # for soft update of target parameters
    LR = 5e-4               # learning rate
    UPDATE_EVERY = 4        # how often to update the network

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def __init__(self, state_size: int = 37, action_size: int = 4,
                 seed: int = 44, gamma: float = 0.99, tau: float = 1e-3):
        """
        Initialize an Agent object.

        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param seed: random seed for network initialisation
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.tau = tau
        self.max_w = 0

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        # Prioritised experience replay memory
        self.memory = Memory(self.BUFFER_SIZE)
        # Initialize the time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state: np.ndarray, action: int, reward: float,
             next_state: np.ndarray, done: bool,
             gamma: Optional[float] = None, tau: Optional[float] = None):
        """
        An agent step takes the current experience and stores it in the
        replay memory, then samples from the memory and calls the learning
        algorithm.

        :param state: the state vector
        :param action: the action performed in the state
        :param reward: the reward given for performing the action
        :param next_state: the next state after performing the action
        :param done: True if the episode has ended
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        gamma_value = gamma if gamma is not None else self.gamma
        tau_value = tau if tau is not None else self.tau

        # Save the experience in replay memory
        self.memory.add((state, action, reward, next_state, done))

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if self.memory.tree.n_entries > self.BATCH_SIZE:
                experiences, idxs, importance_weights = self.memory.sample(self.BATCH_SIZE)
                self.learn(experiences, idxs, importance_weights,
                           gamma_value, tau_value)

    def act(self, state: np.ndarray, eps: float = 0.0):
        """
        Returns the action for the given state as per the current policy.
        Uses the local copy of the model.

        :param state: current state
        :param eps: epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.int32(np.argmax(action_values.cpu().data.numpy()))
        else:
            return np.int32(random.choice(np.arange(self.action_size)))

    def learn(self,
              experiences: Tuple[torch.Tensor, torch.Tensor, torch.Tensor,
                                 torch.Tensor, torch.Tensor],
              indices: np.ndarray, importance_weights: torch.Tensor,
              gamma: float, tau: float):
        """
        Update value parameters using the given batch of experience tuples.

        :param experiences: tuple of (s, a, r, s', done) tuples
        :param indices: indices of the SumTree that contain the priority
            values for these experiences; used for updating the priorities
            once the error has been found
        :param importance_weights: the weight each experience carries when
            used to update the network
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: the local model selects the action with the highest
        # Q-value for the next states...
        next_action = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        # ...and the target model evaluates that action
        q_targets_next = self.qnetwork_target(next_states).gather(1, next_action)
        # Compute Q targets for the current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get the expected Q values from the local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # Update the priorities with the absolute TD error
        error = torch.abs(q_targets - q_expected).detach().cpu().numpy()
        self.memory.batch_update(indices, error)

        # Per-sample squared error, weighted by the importance-sampling
        # weights (reduction='none' keeps one loss term per experience so
        # each can be weighted individually)
        elementwise_loss = F.mse_loss(q_expected, q_targets, reduction='none')
        loss = (importance_weights * elementwise_loss).mean()

        # Minimise the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Move the target network's parameters towards those of the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    @staticmethod
    def soft_update(local_model: torch.nn.Module,
                    target_model: torch.nn.Module, tau: float):
        """
        Soft update of the model parameters. At every learning step the
        target network is updated to bring its parameters nearer, by a
        factor tau, to those of the improving local network. If tau = 1 the
        target network becomes a copy of the local network; if tau = 0 the
        target network is not updated at all.

            θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: model whose weights are copied from
        :param target_model: model whose weights are copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
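# ---------------------------------------------------------------------------
# A minimal sketch of how this Agent might be driven, assuming a Gym-style
# environment whose reset()/step() follow the pre-0.26 Gym API. The episode
# count and epsilon schedule are illustrative, not taken from the original
# code.

def train(env, agent, n_episodes=1000, eps_start=1.0, eps_end=0.01,
          eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            # Stores the transition and, every UPDATE_EVERY steps, learns
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
        eps = max(eps_end, eps_decay * eps)  # anneal exploration
        scores.append(score)
    return scores
# ---------------------------------------------------------------------------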
import random
from collections import deque

import numpy as np
from tensorflow.keras import layers, models, optimizers


class DoubleDQN(object):
    def __init__(self, replay_size, memory_size=10000, prioritized=False):
        self.step = 0
        self.replay_size = replay_size
        self.replay_queue = deque(maxlen=self.replay_size)
        self.memory_size = memory_size
        self.tau = 1e-2  # soft-update rate for MountainCar-v0
        self.model = self.create_model()
        self.prioritized = prioritized
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)

    def create_model(self):
        STATE_DIM, ACTION_DIM = 2, 3
        model = models.Sequential([
            layers.Dense(100, input_dim=STATE_DIM, activation='relu'),
            layers.Dense(ACTION_DIM, activation='linear')
        ])
        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.Adam(0.001))
        return model

    def act(self, s, epsilon=0.1):
        # Epsilon-greedy: the exploration rate decays with the step count
        if np.random.uniform() < epsilon - self.step * 0.0002:
            return np.random.choice([0, 1, 2])
        return np.argmax(self.model.predict(np.array([s]))[0])

    def save_model(self, file_path='MountainCar-v0-Ddqn.h5'):
        print('model saved')
        self.model.save(file_path)

    def store_transition(self, s, a, r, s_, dd):
        # Flatten (s, a, r, s', done) into a single 7-element transition vector
        transition = np.hstack((s, [a, r], s_, dd))
        if self.prioritized:
            # Newly arrived transitions get the highest priority
            self.memory.store(transition)
        else:
            self.replay_queue.append(transition)

    def expReplay(self, batch_size=64, lr=1, factor=0.95):
        # (lr is unused; kept for call-site compatibility)
        if self.prioritized:
            # (ISWeights are sampled but not used to weight the updates here)
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
        else:
            batch_memory = random.sample(self.replay_queue, batch_size)

        s_batch = np.array([replay[[0, 1]] for replay in batch_memory])
        a = np.array([replay[[2]] for replay in batch_memory])
        r = np.array([replay[[3]] for replay in batch_memory])
        next_s_batch = np.array([replay[[4, 5]] for replay in batch_memory])
        d = np.array([replay[[6]] for replay in batch_memory])

        Q = self.model.predict(s_batch)
        Q_next = self.model.predict(next_s_batch)
        Q_targ = self.target_model.predict(next_s_batch)

        # Update the Q values towards the Double-DQN targets
        td_error = np.zeros((d.shape[0],), dtype=float)
        for i in range(d.shape[0]):
            old_q = Q[i, int(a[i])]
            if int(d[i]) == 1:
                # Terminal transition: the target is the reward alone
                Q[i, int(a[i])] = r[i]
            else:
                # The online model selects the action; the target model evaluates it
                next_best_action = np.argmax(Q_next[i, :])
                Q[i, int(a[i])] = r[i] + factor * Q_targ[i, next_best_action]
            if self.prioritized:
                td_error[i] = abs(old_q - Q[i, int(a[i])])

        if self.prioritized:
            self.memory.batch_update(tree_idx, td_error)

        self.model.fit(s_batch, Q, verbose=0)

    def transfer_weights(self):
        """Transfer weights from the model to the target at rate tau."""
        W = self.model.get_weights()
        tgt_W = self.target_model.get_weights()
        for i in range(len(W)):
            tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i]
        self.target_model.set_weights(tgt_W)
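# ---------------------------------------------------------------------------
# A minimal sketch of a training loop for DoubleDQN on MountainCar-v0,
# assuming gym (pre-0.26 API) is installed. The warm-up threshold, episode
# count and update cadence are illustrative assumptions.

import gym

if __name__ == '__main__':
    env = gym.make('MountainCar-v0')
    agent = DoubleDQN(replay_size=2000, prioritized=True)
    for episode in range(300):
        s = env.reset()
        done = False
        while not done:
            a = agent.act(s)
            s_, r, done, _ = env.step(a)
            agent.store_transition(s, a, r, s_, float(done))
            agent.step += 1
            if agent.step > 200:          # warm up the buffer before replaying
                agent.expReplay(batch_size=64)
                agent.transfer_weights()  # soft-update the target network
            s = s_
    agent.save_model()
# ---------------------------------------------------------------------------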