# imports used throughout the snippets below
import numpy as np
import torch as T

# Note: the Agent base class that DQNAgent / DDQNAgent extend is expected to provide the
# same members as the standalone DDQNAgent further down (memory, sample_memory,
# replace_target_network, decrement_epsilon, lr, n_actions, env_name, algo, chkpt_dir, ...);
# it is not the simple Agent class at the end of this file.
class DQNAgent(Agent):
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()
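# The DeepQNetwork used by the agents in this file is not defined in these snippets.
# Below is a minimal sketch of one possible implementation, assuming an Atari-style
# convolutional architecture; only the constructor signature (lr, n_actions, name,
# input_dims, chkpt_dir) and the attributes the agents rely on (device, optimizer, loss,
# forward, save_checkpoint, load_checkpoint) are taken from the code above -- the layer
# sizes, optimizer choice, and loss function are assumptions, not the original network.
import os
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class DeepQNetwork(nn.Module):
    def __init__(self, lr, n_actions, name, input_dims, chkpt_dir):
        super(DeepQNetwork, self).__init__()
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name)

        # three convolutional layers over stacked frames (input_dims = (channels, H, W))
        self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, 3, stride=1)

        fc_input_dims = self.calculate_conv_output_dims(input_dims)
        self.fc1 = nn.Linear(fc_input_dims, 512)
        self.fc2 = nn.Linear(512, n_actions)

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def calculate_conv_output_dims(self, input_dims):
        # pass a dummy observation through the conv layers to get the flattened size
        state = T.zeros(1, *input_dims)
        dims = self.conv3(self.conv2(self.conv1(state)))
        return int(np.prod(dims.size()))

    def forward(self, state):
        conv = F.relu(self.conv1(state))
        conv = F.relu(self.conv2(conv))
        conv = F.relu(self.conv3(conv))
        # flatten conv output to (batch_size, n_filters * H * W) for the linear layers
        flat = conv.view(conv.size()[0], -1)
        flat = F.relu(self.fc1(flat))
        actions = self.fc2(flat)
        return actions

    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))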
class DDQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
           self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
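# The ReplayBuffer used by DDQNAgent above is likewise not defined in these snippets.
# This is a minimal numpy-based sketch, assuming only the interface the agent actually
# uses: a mem_cntr attribute, store_transition(state, action, reward, state_, done),
# and sample_buffer(batch_size) returning (states, actions, rewards, states_, dones).
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        # n_actions is accepted only to match the constructor call above; a simple
        # discrete-action buffer does not need it
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        # overwrite the oldest transition once the buffer is full
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # sample uniformly from the filled portion of the buffer, without replacement
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones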
class DQNAgent(Agent):
    ''' Agent based on the Deep Q-Network (DQN) algorithm. '''

    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # define the Q-evaluation network and the target Q-network for the agent
        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        # we never perform gradient descent / backpropagation on the q_next network;
        # its weights are only ever copied over from q_eval
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # zero out gradients from the previous learning step
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        # calculate the predicted and target Q-values for the batch of states.
        # q_next (dims: batch_size x n_actions) is what the target network says about
        # the values of the new states that result from the agent's actions; we want
        # the values of the maximal actions for this particular set of states, which
        # we get by taking the max along the action dimension.
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]  # [0] is the max value, [1] its index

        # use the done flags as a mask: terminal states have no future value
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()                  # backpropagate the loss
        self.q_eval.optimizer.step()     # step the optimizer to update the weights

        # track learning steps so the target network is replaced at the right frequency
        self.learn_step_counter += 1
        self.decrement_epsilon()
class DDQNAgent(Agent):
    ''' Agent based on the Double Deep Q-Network (Double DQN) algorithm. '''

    def __init__(self, *args, **kwargs):
        super(DDQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # zero out gradients from the previous learning step
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        # Double DQN: the online network picks the action for the next state,
        # the target network supplies that action's value
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)
        max_actions = T.argmax(q_eval, dim=1)

        # use the done flags as a mask: terminal states have no future value
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()                  # backpropagate the loss
        self.q_eval.optimizer.step()     # step the optimizer to update the weights

        # track learning steps so the target network is replaced at the right frequency
        self.learn_step_counter += 1
        self.decrement_epsilon()
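# The only difference between DQNAgent and DDQNAgent is how the bootstrap target is
# built. Below is a small self-contained illustration with random tensors (toy values,
# not part of the agents above): plain DQN takes the max over the target network's
# action values, while Double DQN selects the argmax action with the online network and
# evaluates it with the target network, which reduces overestimation bias.
batch_size, n_actions = 4, 6
gamma = 0.99
rewards = T.rand(batch_size)
dones = T.tensor([False, False, True, False])
q_next_target = T.rand(batch_size, n_actions)   # stands in for q_next.forward(states_)
q_next_online = T.rand(batch_size, n_actions)   # stands in for q_eval.forward(states_)
indices = T.arange(batch_size)

# DQN target: max over the target network's action values
dqn_bootstrap = q_next_target.max(dim=1)[0]
dqn_bootstrap[dones] = 0.0
dqn_target = rewards + gamma * dqn_bootstrap

# Double DQN target: online network picks the action, target network values it
max_actions = T.argmax(q_next_online, dim=1)
ddqn_bootstrap = q_next_target[indices, max_actions]
ddqn_bootstrap[dones] = 0.0
ddqn_target = rewards + gamma * ddqn_bootstrap

print(dqn_target, ddqn_target)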
class Agent():
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-5):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0

        self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions,
                                   input_dims=input_dims,
                                   fc1_dims=256, fc2_dims=256)

        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # np.bool was removed from recent NumPy releases; use np.bool_ instead
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        # wrap around to the start of the buffer once the memory is full
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation]).to(self.Q_eval.device).float()
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return

        self.Q_eval.optimizer.zero_grad()

        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
        action_batch = self.action_memory[batch]

        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0

        q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min \
            else self.eps_min
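# A hypothetical minimal training loop showing how the simple Agent above would be
# driven. The environment (CartPole-v1), hyperparameters, and episode count are
# illustrative assumptions, not taken from the snippets, and the loop assumes the
# classic gym (< 0.26) reset()/step() API that the agents here are written against.
# The DeepQNetwork this Agent expects is the fully connected (fc1_dims/fc2_dims)
# variant, not the convolutional one sketched earlier.
import gym

if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    agent = Agent(gamma=0.99, epsilon=1.0, lr=1e-3,
                  input_dims=env.observation_space.shape,
                  batch_size=64, n_actions=env.action_space.n)

    scores = []
    for episode in range(500):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()
            observation = observation_
        scores.append(score)
        avg_score = np.mean(scores[-100:])
        print(f'episode {episode} score {score:.1f} avg score {avg_score:.1f}')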