class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
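# The agents in this listing construct a ReplayBuffer (most of them as
# ReplayBuffer(mem_size, input_dims, n_actions)) and rely on store_transition(),
# sample_buffer(batch_size) and a mem_cntr attribute, but the buffer itself is
# not shown. Below is a minimal sketch of such a buffer for the single-action
# case; the numpy-array layout and dtypes are assumptions, not the original
# implementation.
import numpy as np


class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        # Overwrite the oldest entry once the buffer is full.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # Sample uniformly from the filled portion of the buffer.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]
        return states, actions, rewards, states_, dones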
class Agent(): def __init__(self, input_dims, n_actions): self.epsilon = Config.epsilon self.n_actions = n_actions self.input_dims = input_dims self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(input_dims, n_actions) def store_transition(self, state, action, reward, state_new, done): self.memory.store_transition(state, action, reward, state_new, done) def choose_action(self, observation): raise NotImplementedError def replace_target_network(self): if self.learn_step_counter % Config.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decay_epsilon(self): self.epsilon = max(self.epsilon * Config.eps_decay, Config.eps_min) def sample_memory(self): state, action, reward, new_state, done = self.memory.sample_buffer() states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def learn(self): raise NotImplementedError def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint()
class DQN(): def __init__(self, states, actions, alpha, gamma, epsilon, epsilon_min, epsilon_decay, replay_buffer_sz, batch, path, path_pred): self.Q = Network(states.shape, actions, alpha, path) self.Q_pred = Network(states.shape, actions, alpha, path_pred) # self.memory = deque(maxlen=replay_buffer_sz) self.memory = ReplayBuffer(replay_buffer_sz, states.shape, actions) self.batch = batch self.learn_cnt = 0 self.gamma = gamma self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = epsilon_decay self.actions = actions self.Q.path = path self.Q_pred.path = path_pred def e_greedy_policy(self, s): p = random.random() s = torch.tensor([s], dtype=torch.float).to(self.Q.device) # s = torch.unsqueeze(axis=0) a = torch.argmax(self.Q.forward(s)).item() if ( p > self.epsilon) else np.random.randint(0, self.actions) return a def sample_memory(self): state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch) states = torch.tensor(state).to(self.Q.device) rewards = torch.tensor(reward).to(self.Q.device) dones = torch.tensor(done).to(self.Q.device) actions = torch.tensor(action).to(self.Q.device) states_ = torch.tensor(new_state).to(self.Q.device) return states, actions, rewards, states_, dones def store(self, s, a, r, ns, done): # self.memory.append([s,a,r,ns,done]) self.memory.store_transition(s, a, r, ns, done) def update_target_network(self): self.Q_pred.load_state_dict(self.Q.state_dict()) def save_models(self): self.Q.save_checkpoint(self.Q.path) self.Q_pred.save_checkpoint(self.Q_pred.path) def load_models(self): self.Q.load_checkpoint() self.Q_pred.load_checkpoint() def learn(self): if self.memory.mem_cntr < self.batch: return self.Q.optimizer.zero_grad() if (self.learn_cnt >= 1000): #only update network after 1000 steps self.learn_cnt = 0 self.update_target_network() states, actions, rewards, states_, dones = self.sample_memory() indices = np.arange(self.batch) q_pred = self.Q.forward(states)[indices, actions] q_next = self.Q_pred.forward(states_).max(dim=1)[0] q_next[dones] = 0.0 q_target = rewards + self.gamma * q_next loss = self.Q.loss(q_target, q_pred).to(self.Q.device) loss.backward() self.Q.optimizer.step() self.learn_cnt += 1 self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
class DDQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, action_joint_dim,
                 mem_size, batch_size, eps_min, eps_dec, replace, prioritized=False,
                 prob_alpha=0.6, beta=0.4, beta_increment=1e-4, temperature=0.1,
                 tau=1e-5):
        """
        Double Deep Q-Learning agent.

        Args:
            gamma: Discount factor for the reward. 0 gives myopic behaviour, 1 far-sighted behaviour.
            epsilon: Exploration/exploitation rate. 0 means full exploitation.
            lr: Learning rate; larger values take larger steps along the loss gradient.
            n_actions: Number of possible actions.
            input_dims: Dimensions of the state (typically an image), channel first (CHANN, HEIGHT, WIDTH).
            action_joint_dim: Number of joints for the multi-agent case, normally the number of agents.
            mem_size: Size of the replay buffer.
            batch_size: Number of past experiences used to train the Q-network.
            eps_min: Minimum value of epsilon.
            eps_dec: Epsilon decay applied every epoch.
            replace: Number of epochs between copies of the behavioural network into the target network.
        """
        # Training hyperparameters #
        self.gamma = gamma
        self.epsilon = epsilon
        self.beta = beta
        self.beta_increment = beta_increment
        self.prob_alpha = prob_alpha
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.update_target_count = replace
        self.action_space = [i for i in range(n_actions)]
        self.action_joint_dim = action_joint_dim
        self.prioritized = prioritized
        self.temperature = temperature
        self.tau = tau
        self.mem_size = mem_size

        if not self.prioritized:
            self.memory = ReplayBuffer(mem_size, input_dims, action_joint_dim)
        else:
            self.memory = PrioritizedReplayBuffer(mem_size, input_dims,
                                                  action_joint_dim, self.prob_alpha)

        # Behavioural (online) and target networks #
        self.q_eval = DeepQNetwork(self.lr, num_agents=action_joint_dim,
                                   action_size=n_actions, input_size=input_dims)
        self.q_eval.cuda()
        self.q_next = DeepQNetwork(self.lr, num_agents=action_joint_dim,
                                   action_size=n_actions, input_size=input_dims)
        self.q_next.cuda()

    def reset_memory(self):
        self.memory = ReplayBuffer(self.mem_size, self.input_dims, self.action_joint_dim)

    def store_transition(self, state, action, reward, next_state, done):
        """
        Store an (s, a, r, s') transition in the replay buffer.

        Args:
            state: State of the experience.
            action: Action joint performed in the given state.
            reward: 1D reward array obtained from (s, a), one component per agent.
            next_state: The next state produced by (s, a).
            done: Whether the state is terminal. Normally 0 because the task is non-episodic.
        """
        # Store the (s, a, r, s') tuple plus the done flag in memory. #
        # They are kept as numpy arrays and converted to tensors when sampled. #
        self.memory.store_transition(state, action, reward, next_state, done)

    def sample_memory(self):
        """
        Extract 'self.batch_size' experiences (s, a, r, s') from the replay memory.

        Returns:
            The BATCH_SIZE sampled (s, a, r, s') experiences.
        """
        # Sample a batch of experiences #
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        with T.no_grad():
            # Convert them to PyTorch tensors #
            states = T.tensor(state, device=self.q_eval.device)
            rewards = T.tensor(reward, device=self.q_eval.device)
            dones = T.tensor(done, device=self.q_eval.device)
            actions = T.tensor(action, device=self.q_eval.device)
            next_state = T.tensor(new_state, device=self.q_eval.device)

        return states, actions, rewards, next_state, dones

    def prioritized_sample_memory(self):
        """
        Extract 'self.batch_size' experiences (s, a, r, s') from the prioritized replay memory.

        Returns:
            The BATCH_SIZE sampled (s, a, r, s') experiences plus their indices and importance weights.
        """
        # Sample a batch of experiences #
        state, action, reward, new_state, done, indices, weight = \
            self.memory.sample_buffer(self.batch_size, self.beta)

        with T.no_grad():
            # Convert them to PyTorch tensors #
            states = T.tensor(state, device=self.q_eval.device)
            rewards = T.tensor(reward, device=self.q_eval.device)
            dones = T.tensor(done, device=self.q_eval.device)
            actions = T.tensor(action, device=self.q_eval.device)
            next_state = T.tensor(new_state, device=self.q_eval.device)
            weights = T.tensor(weight, device=self.q_eval.device)

        return states, actions, rewards, next_state, dones, indices, weights

    # Epsilon-greedy policy #
    def choose_action(self, observation, mode='egreedy'):
        """
        Behaviour policy. With mode='egreedy' the agent explores with probability epsilon
        and exploits with probability (1 - epsilon); with mode='softmax' it samples from a
        Boltzmann distribution over the Q-values.

        Args:
            observation: The state (typically an image), a matrix of shape (N_CHANN, HEIGHT, WIDTH).
        Returns:
            An action joint (1D array) with the selected actions.
        """
        if mode == 'egreedy':
            if np.random.random() > self.epsilon:
                with T.no_grad():
                    state = T.tensor([observation], dtype=T.float, device=self.q_eval.device)
                    Q = self.q_eval.forward(state)
                    action_joint = []
                    # When exploiting, a = argmax_a Q(s, a) for every agent
                    for i in range(self.action_joint_dim):
                        action_joint.append(T.argmax(Q.narrow(1, i * self.n_actions, self.n_actions)).item())
                    return action_joint
            else:
                action_joint = np.random.choice(self.action_space, size=self.action_joint_dim)
                return action_joint
        elif mode == 'softmax':
            with T.no_grad():
                state = T.tensor([observation], dtype=T.float, device=self.q_eval.device)
                Q = self.q_eval.forward(state)
                action_joint = []
                # Softmax (Boltzmann) policy #
                for i in range(self.action_joint_dim):
                    probs = T.softmax(Q.narrow(1, i * self.n_actions, self.n_actions) / self.temperature, dim=1)
                    categ = T.distributions.Categorical(probs)
                    action_joint.append(categ.sample().item())
                return action_joint
        else:
            # An assert on a non-empty string never fails, so raise instead.
            raise ValueError('Choose a valid policy mode: (egreedy/softmax)')

    # This method is called from outside the class so DDQN and plain DQN can be alternated #
    def replace_target_network(self, epoch):
        """
        Copy the behavioural network into the target network.

        Args:
            epoch: The current epoch.
        """
        # Soft-update variant (Polyak averaging with rate tau), kept for reference:
        # for target_param, param in zip(self.q_next.parameters(), self.q_eval.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
        if epoch % self.update_target_count == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def learn(self, mask=None):
        """
        Learning function. It predicts the return with the target network and takes a
        gradient-descent step. The Q-values are computed from temporal-difference targets,
        and the gradients are accumulated across the agents.
        """
        # If there are fewer stored experiences than the batch size, skip training.
        if self.memory.mem_cntr < self.batch_size:
            return

        if mask is None:
            mask = np.zeros(shape=self.action_joint_dim)

        self.q_eval.optimizer.zero_grad()
        self.q_next.optimizer.zero_grad()

        if not self.prioritized:
            states, actions, rewards, next_states, dones = self.sample_memory()
        else:
            states, actions, rewards, next_states, dones, batches, weights = self.prioritized_sample_memory()
            prior = T.tensor(np.zeros(shape=batches.shape), device=self.q_eval.device)

        indices = np.arange(self.batch_size)

        Q_pred = self.q_eval(states)
        Q_next = self.q_next(next_states)
        Q_eval = self.q_eval(next_states)

        for i in range(self.action_joint_dim):
            if mask[i] == 1:
                # If the mask is 1, this agent does not learn at all #
                continue
            q_pred = Q_pred.narrow(1, i * self.n_actions, self.n_actions)[indices, actions[:, i]]
            max_actions = T.argmax(Q_eval.narrow(1, i * self.n_actions, self.n_actions), dim=1).detach()
            # Evaluate the per-sample argmax actions with the target network
            # (indexing with max_actions[i] would reuse one action for the whole batch).
            q_target = rewards[indices, i] + \
                self.gamma * Q_next.narrow(1, i * self.n_actions, self.n_actions)[indices, max_actions].detach()
            # The target is detached; its parameters are not subject to training #
            if not self.prioritized:
                loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
                loss.backward(retain_graph=True)
            else:
                loss = self.q_eval.loss2(q_target, q_pred).to(self.q_eval.device) * weights
                prior += loss.data.detach()
                loss = loss.mean()
                loss.backward(retain_graph=True)

        self.q_eval.optimizer.step()

        if self.prioritized:
            self.memory.update_priorities(batches, prior.cpu().numpy())

    def decrement_epsilon(self):
        """
        Decrement 'self.epsilon' by 'self.eps_dec', clipping it to 'self.eps_min'.
        """
        if self.epsilon >= self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def increment_beta(self):
        if self.beta >= 1:
            self.beta = 1
        else:
            self.beta += self.beta_increment

    def decrement_temperature(self):
        if self.temperature < 0.005:
            self.temperature = 0.005
        else:
            self.temperature = self.temperature - 2e-4
class Agent(object): ''' Agent base class ''' def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.replace_target_cnt = replace self.algo = algo self.env_name = env_name self.chkpt_dir = chkpt_dir self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 # agents memory self.memory = ReplayBuffer(mem_size, input_dims, n_actions) def choose_action( self, observation ): # Example: epsilon-greedy behavior policy for action selection raise NotImplementedError def store_transition(self, state, action, reward, state_, done): ''' Storing transitions in the agent's memory, sampling those transitions and converting imput into tensors, and decay epsilon, and replacing target network. ''' self.memory.store_transition(state, action, reward, state_, done) def sample_memory(self): state, action, reward, new_state, done = self.memory.sample_buffer( self.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def replace_target_network(self): if self.learn_step_counter % self.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): if self.epsilon > self.eps_min: self.epsilon = self.epsilon - self.eps_dec else: self.epsilon = self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint() def learn(self): raise NotImplementedError
class DuelingDDQNAgent(object): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000, algo=None, env_name=None, chkpt_dir='models/'): self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.replace_target_cnt = replace self.algo = algo self.env_name = env_name self.chkpt_dir = chkpt_dir self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + '_q_eval', chkpt_dir=self.chkpt_dir) self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + '_q_next', chkpt_dir=self.chkpt_dir) def store_transition(self, state, action, reward, state_, done): self.memory.store_transition(state, action, reward, state_, done) def sample_memory(self): state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def choose_action(self, observation): if np.random.random() > self.epsilon: state = np.array([observation], copy=False, dtype=np.float32) state_tensor = T.tensor(state).to(self.q_eval.device) _, advantages = self.q_eval.forward(state_tensor) action = T.argmax(advantages).item() else: action = np.random.choice(self.action_space) return action def replace_target_network(self): if self.replace_target_cnt is not None and \ self.learn_step_counter % self.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): self.epsilon = self.epsilon - self.eps_dec \ if self.epsilon > self.eps_min else self.eps_min def learn(self): if self.memory.mem_cntr < self.batch_size: return self.q_eval.optimizer.zero_grad() self.replace_target_network() states, actions, rewards, states_, dones = self.sample_memory() indices = np.arange(self.batch_size) V_s, A_s = self.q_eval.forward(states) V_s_, A_s_ = self.q_next.forward(states_) V_s_eval, A_s_eval = self.q_eval.forward(states_) q_pred = T.add(V_s, (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True))) q_eval = T.add(V_s_eval, (A_s_eval - A_s_eval.mean(dim=1, keepdim=True))) max_actions = T.argmax(q_eval, dim=1) q_next[dones] = 0.0 q_target = rewards + self.gamma * q_next[indices, max_actions] loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decrement_epsilon() def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint()
class Agent(object): def __init__(self, n_actions, input_dims): self.n_actions = n_actions self.input_dims = input_dims self.epsilon = Config.epsilon self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(input_dims, n_actions) name_root = Config.env_name + '_' + Config.algo self.q_eval = Network(self.n_actions, input_dims=self.input_dims, name=name_root + '_q_eval') self.q_next = Network(self.n_actions, input_dims=self.input_dims, name=name_root + '_q_next') def store_transition(self, state, action, reward, state_new, done): self.memory.store_transition(state, action, reward, state_new, done) def sample_memory(self): state, action, reward, new_state, done = self.memory.sample_buffer( Config.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def choose_action(self, observation): if np.random.random() > self.epsilon: state = np.array([observation], copy=False, dtype=np.float32) state_tensor = T.tensor(state).to(self.q_eval.device) _, advantages = self.q_eval.forward(state_tensor) action = T.argmax(advantages).item() else: action = np.random.choice(self.action_space) return action def replace_target_network(self): if Config.replace_target_cnt is not None and self.learn_step_counter % Config.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decay_epsilon(self): new_value = self.epsilon * Config.eps_decay if new_value > Config.eps_min: self.epsilon = new_value def learn(self): if self.memory.mem_cntr < Config.batch_size: return self.q_eval.optimizer.zero_grad() self.replace_target_network() states, actions, rewards, states_new, dones = self.sample_memory() indices = np.arange(Config.batch_size) Value_stream, Advantage_stream = self.q_eval.forward(states) Value_stream_new, Advantage_stream_new = self.q_next.forward( states_new) Value_stream_eval, Advantage_stream_eval = self.q_eval.forward( states_new) q_pred = T.add(Value_stream, (Advantage_stream - Advantage_stream.mean(dim=1, keepdim=True)))[indices, actions] q_next = T.add(Value_stream_new, (Advantage_stream_new - Advantage_stream_new.mean(dim=1, keepdim=True))) q_eval = T.add(Value_stream_eval, (Advantage_stream_eval - Advantage_stream_eval.mean(dim=1, keepdim=True))) max_actions = T.argmax(q_eval, dim=1) q_next[dones] = 0.0 q_target = rewards + Config.gamma * q_next[indices, max_actions] loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decay_epsilon() def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint()
class DQNAgent(): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, chkpt_dir, eps_min=0.01, eps_dec=5e-7, replace=1000, algo=None, env_name=None): self.gamma = gamma # 0.99 self.epsilon = epsilon # 1.0 self.lr = lr # 0.0001 self.n_actions = n_actions # 6 self.input_dims = input_dims # (4, 84, 84) self.batch_size = batch_size # 32 self.eps_min = eps_min # 0.1 self.eps_dec = eps_dec # 1e-05 self.replace_target_cnt = replace # 1000 self.algo = algo # 'DQNAgent' self.env_name = env_name # 'PongNoFrameskip-v4' self.chkpt_dir = chkpt_dir # .\\models\\ self.action_space = [i for i in range(self.n_actions) ] # [0, 1, 2, 3, 4, 5] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + '_q_eval', chkpt_dir=self.chkpt_dir) self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + '_q_next', chkpt_dir=self.chkpt_dir) def choose_action(self, observation): if np.random.random() > self.epsilon: state = T.tensor([observation], dtype=T.float).to(self.q_eval.device) actions = self.q_eval.forward(state) action = T.argmax(actions).item() else: action = np.random.choice(self.action_space) return action def store_transition(self, state, action, reward, state_, done): self.memory.store_transition(state, action, reward, state_, done) def sample_memory(self): state, action, reward, new_state, done = self.memory.sample_buffer( self.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def replace_target_network(self): if self.learn_step_counter % self.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict( )) # load_state_dict and state_dict are inbuilt functions of torch def decrement_epsilon(self): self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint() def learn(self): if self.memory.mem_cntr < self.batch_size: return self.q_eval.optimizer.zero_grad() self.replace_target_network() states, actions, rewards, states_, dones = self.sample_memory() indices = np.arange(self.batch_size) q_pred = self.q_eval.forward( states )[indices, actions] # self.q_eval.forward(states).shape = (32, 6), q_pred.shape = 32 q_next = self.q_next.forward(states_).max( dim=1 )[0] # self.q_next.forward(states_).shape = (32, 6), q_next.shape = 32 temp_dones = dones.bool() q_next[temp_dones] = 0.0 # as reward for terminal state is 0 q_target = rewards + self.gamma * q_next loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decrement_epsilon()
class Agent(): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=.01, eps_dec=5e-7, replace_count=1000, algorithm=None, env_name=None, checkpoint_dir='/checkpoints'): self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.replace_count = replace_count self.algorithm = algorithm self.env_name = env_name self.checkpoint_dir = checkpoint_dir self.action_space = [i for i in range(self.n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) print(type(self).__name__) self.q_eval = object self.q_policy = object def store_transition(self, current_state, action, reward, next_state, done): self.memory.store_transition(current_state, action, reward, next_state, done) def sample_memory(self): current_state, action, reward, next_state, done = self.memory.sample_buffer(self.batch_size) current_states = T.tensor(current_state).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) next_states = T.tensor(next_state).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) return current_states, actions, rewards, next_states, dones def update_policy_network(self): if(self.learn_step_counter % self.replace_count == 0): self.q_policy.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_policy.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_policy.load_checkpoint() def pre_learn(self): if self.memory.mem_idx < self.batch_size: return self.q_eval.optimizer.zero_grad() self.update_policy_network() current_states, actions, rewards, next_states, dones = self.sample_memory() indices = np.arange(self.batch_size) return current_states, actions, rewards, next_states, dones, indices def post_learn(self, q_target, q_pred): loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decrement_epsilon() def choose_action(self, observation, network): raise NotImplementedError def learn(self): raise NotImplementedError
class DQNAgent(object): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.replace_target_cnt = replace self.algo = algo self.env_name = env_name self.chkpt_dir = chkpt_dir self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + '_q_eval', chkpt_dir=self.chkpt_dir) self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + '_q_next', chkpt_dir=self.chkpt_dir) #self.dim_bechir = self.q_eval.calculate_output_bechir(self.input_dims) def choose_action(self, observation): """ Choose an action through an epsilon-greedy approach. :param observation: state features as provided by gym environment. :return: action """ if np.random.random() > self.epsilon: # Convert state to Pytorch tensor and send to q_eval.device state = T.tensor([observation], dtype=T.float).to(self.q_eval.device) # Get actions values from q_eval network actions = self.q_eval.forward(state) # Get action with highest value action = T.argmax(actions).item() else: # Select random action from action space action = np.random.choice(self.action_space) return action def store_transition(self, state, action, reward, state_, done): self.memory.store_transition(state, action, reward, state_, done) def sample_memory(self): state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def replace_target_network(self): if self.learn_step_counter % self.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): self.epsilon = self.epsilon - self.eps_dec \ if self.epsilon > self.eps_min else self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint() def learn(self): # If memory counter has not reached batch size simply return if self.memory.mem_cntr < self.batch_size: return # reset gradients for the main network's optimizer self.q_eval.optimizer.zero_grad() # Call function to update target network weights every n steps self.replace_target_network() # Sample environment transitions from the replay buffer states, actions, rewards, states_, dones = self.sample_memory() # Get Q(s,a) for the actions performed by the agent. # Because we processed a batch of states, we need to index the result of the forward function by the indices of # the states (from 0 to batch_size) followed by the index of the action performed by the agent. 
indices = np.arange(self.batch_size) q_pred = self.q_eval.forward(states)[indices, actions] # Get max Q(s', a') from target network q_next = self.q_next.forward(states_).max(dim=1)[0] # Set Q(s', a') to zero for terminal states q_next[dones] = 0.0 # Compute the q_target as r + gamma * Q(s',a') q_target = rewards + self.gamma * q_next # Compute the loss tensor and move it to q_eval.device loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) # Backpropagate loss and optimize network parameters loss.backward() self.q_eval.optimizer.step() # Increment training counter self.learn_step_counter += 1 # Decrement epsilon for epsilon-greedy action selection self.decrement_epsilon()
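# A minimal sketch of how an agent with the interface above (choose_action,
# store_transition, learn, save_models) is typically driven. The environment,
# hyperparameter values, episode count and the classic Gym step/reset API are
# illustrative assumptions; the network inside DeepQNetwork must match
# env.observation_space.shape for this to run.
import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50000, batch_size=32, eps_min=0.1, eps_dec=1e-5,
                 replace=1000, algo='DQNAgent', env_name='CartPole-v1')

for episode in range(500):
    observation = env.reset()
    done, score = False, 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()  # samples a batch and takes one gradient step
        observation = observation_
        score += reward
    print('episode', episode, 'score', score, 'epsilon', agent.epsilon)

agent.save_models()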
class DDQNAgent(object): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, chkpt_name, eps_min, eps_dec, replace, logging_dir): self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.replace_target_cnt = replace self.chkpt_dir = chkpt_name self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = FullyConnectedNet(self.lr, self.n_actions, input_dims=self.input_dims, chkpt_name=self.chkpt_dir, name='q_eval', logging_dir=logging_dir) self.q_next = FullyConnectedNet(self.lr, self.n_actions, input_dims=self.input_dims, name='q_next', chkpt_name=self.chkpt_dir, logging_dir=logging_dir) def select_action(self, observation): if np.random.random() > self.epsilon: state = T.tensor([observation], dtype=T.float).to(self.q_eval.device) actions = self.q_eval.forward(state) action = T.argmax(actions).item() else: action = np.random.choice(self.action_space) return action def store_transition(self, state, action, reward, state_, done): self.memory.store_transition(state, action, reward, state_, done) def sample_memory(self): state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) states_ = T.tensor(new_state).to(self.q_eval.device) return states, actions, rewards, states_, dones def replace_target_network(self): if self.learn_step_counter % self.replace_target_cnt == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): self.epsilon = self.epsilon - self.eps_dec \ if self.epsilon > self.eps_min else self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint() def learn(self, freeze): if self.memory.mem_cntr < self.batch_size: return self.q_eval.optimizer.zero_grad() self.replace_target_network() #self.q_next.load_state_dict(self.q_eval.state_dict()) states, actions, rewards, states_, dones = self.sample_memory() indices = np.arange(self.batch_size) q_pred = self.q_eval.forward(states)[indices, actions] q_next = self.q_next.forward(states_) q_eval = self.q_eval.forward(states_) max_actions = T.argmax(q_eval, dim=1) q_next[dones] = 0.0 q_target = rewards + self.gamma * q_next[indices, max_actions] loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decrement_epsilon() if freeze: #self.q_eval.conv1.weight.requires_grad = False #self.q_eval.conv2.weight.requires_grad = False #self.q_eval.conv3.weight.requires_grad = False #print('conv nets are frozen') pass
class DDQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DDQN(self.lr, self.n_actions, input_dims=self.input_dims,
                           name=self.env_name + '_' + self.algo + '_q_eval',
                           chkpt_dir=self.chkpt_dir)
        self.q_next = DDQN(self.lr, self.n_actions, input_dims=self.input_dims,
                           name=self.env_name + '_' + self.algo + '_q_next',
                           chkpt_dir=self.chkpt_dir)
        # Create the optimizer once; building it inside learn() would reset its
        # running statistics on every training step.
        self.optimizer = keras.optimizers.RMSprop(learning_rate=self.lr)

    def choose_action(self, observation):
        if tf.random.uniform([1]) > self.epsilon:
            actions = self.q_eval.call(tf.expand_dims(observation, axis=0))
            # take the greedy action as a plain Python int
            action = int(tf.argmax(actions, axis=1)[0])
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memory.sample_buffer(
            self.batch_size)
        return states, actions, rewards, new_states, dones

    def replace_target_network(self):
        # use modulo (not bitwise AND) to test the update interval, and copy
        # the weights instead of aliasing the two models
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = tf.range(self.batch_size)

        with tf.GradientTape() as tape:
            # predicted q values of the actions taken from memory
            q_pred = tf.gather_nd(self.q_eval.call(states),
                                  list(zip(indices, actions)))
            # forward pass of the next states through the target network
            q_next = self.q_next.call(states_)
            # forward pass of the next states through the online network
            q_eval = self.q_eval.call(states_)
            # actions with the highest q values according to the online network
            max_actions = tf.math.argmax(q_eval, axis=1)
            # zero out the q values of terminal states
            q_next = q_next * tf.expand_dims(1.0 - tf.cast(dones, tf.float32), -1)
            # evaluate the selected actions with the target network
            gather = tf.gather_nd(
                q_next, list(zip(indices, tf.cast(max_actions, dtype=tf.int32))))
            # q value update
            q_target = rewards + self.gamma * tf.cast(gather, dtype=tf.float32)
            # loss calculation
            loss = keras.losses.MSE(q_target, q_pred)

        gradient = tape.gradient(loss, self.q_eval.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradient, self.q_eval.trainable_variables))
        self.decrement_epsilon()
        self.learn_step_counter += 1
class SACAgent(): def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0, \ n_actions=2, max_size=1000000, layer1_size=400, \ layer2_size=300, batch_size=100, reward_scale=2, path_dir='model/sac'): self.gamma = gamma self.tau = tau self.memory = ReplayBuffer(max_size, input_dims, n_actions) self.batch_size = batch_size self.n_actions = n_actions self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size, n_actions=n_actions, name='_actor', max_action=max_action, chkpt_dir=path_dir) self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size, n_actions=n_actions, name='_critic_1', chkpt_dir=path_dir) self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size, n_actions=n_actions, name='_critic_2', chkpt_dir=path_dir) self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size, name='_value', chkpt_dir=path_dir) self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size, name='_target_value', chkpt_dir=path_dir) self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, observation): state = T.Tensor([observation]).to(self.actor.device) actions, _ = self.actor.sample_normal(state, reparameterize=False) return actions.cpu().detach().numpy()[0] def remember(self, state, action, reward, new_state, done): self.memory.store_transition(state, action, reward, new_state, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau target_value_params = self.target_value.named_parameters() value_params = self.value.named_parameters() target_value_state_dict = dict(target_value_params) value_state_dict = dict(value_params) for name in value_state_dict: value_state_dict[name] = tau * value_state_dict[name].clone() + ( 1 - tau) * target_value_state_dict[name].clone() self.target_value.load_state_dict(value_state_dict) def learn(self): if self.memory.mem_cntr < self.batch_size: return state, action, reward, new_state, done = self.memory.sample_buffer( self.batch_size) reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device) done = T.tensor(done).to(self.critic_1.device) state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device) state = T.tensor(state, dtype=T.float).to(self.critic_1.device) action = T.tensor(action, dtype=T.float).to(self.critic_1.device) value = self.value(state).view(-1) value_ = self.target_value(state_).view(-1) value_[done] = 0.0 actions, log_probs = self.actor.sample_normal(state, reparameterize=False) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1.forward(state, actions) q2_new_policy = self.critic_2.forward(state, actions) critic_value = T.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) self.value.optimizer.zero_grad() value_target = critic_value - log_probs value_loss = 0.5 * F.mse_loss(value, value_target) value_loss.backward(retain_graph=True) self.value.optimizer.step() actions, log_probs = self.actor.sample_normal(state, reparameterize=True) log_probs = log_probs.view(-1) q1_new_policy = self.critic_1.forward(state, actions) q2_new_policy = self.critic_2.forward(state, actions) critic_value = T.min(q1_new_policy, q2_new_policy) critic_value = critic_value.view(-1) actor_loss = log_probs - critic_value actor_loss = T.mean(actor_loss) self.actor.optimizer.zero_grad() actor_loss.backward(retain_graph=True) self.actor.optimizer.step() q_hat = self.scale * reward + self.gamma * value_ q1_old_policy = self.critic_1.forward(state, action).view(-1) q2_old_policy = self.critic_2.forward(state, 
action).view(-1) critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat) critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat) self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() critic_loss = critic_1_loss + critic_2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.update_network_parameters() def save_models(self, episode): self.actor.save_checkpoint(episode) self.value.save_checkpoint(episode) self.target_value.save_checkpoint(episode) self.critic_1.save_checkpoint(episode) self.critic_2.save_checkpoint(episode) def load_models(self, episode): self.actor.load_checkpoint(episode) self.value.load_checkpoint(episode) self.target_value.load_checkpoint(episode) self.critic_1.load_checkpoint(episode) self.critic_2.load_checkpoint(episode)
class DQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        # return the device tensor 'dones', not the raw numpy array 'done'
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
class Agent(): def __init__(self, input_dims, n_actions, lr, mem_size, batch_size, epsilon, gamma=0.99, eps_dec=5e-7, eps_min=0.01, replace=1000, algo=None, env_name=None, checkpoint_dir='tmp/dqn'): self.lr = lr self.batch_size = batch_size self.input_dims = input_dims self.n_actions = n_actions self.gamma = gamma self.epsilon = epsilon self.eps_dec = eps_dec self.eps_min = eps_min self.replace = replace self.algo = algo self.env_name = env_name self.checkpoint_dir = checkpoint_dir self.action_space = [i for i in range(self.n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + " " + self.algo + "_q_eval", checkpoint_dir=self.checkpoint_dir) self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + " " + self.algo + "_q_next", checkpoint_dir=self.checkpoint_dir) def choose_action(self, observation): if np.random.random() > self.epsilon: state = T.tensor([observation], dtype=T.float).to( self.q_eval.device) # converting observation to tensor, # and observation is in the list because our convolution expects an input tensor of shape batch size # by input dims. q_values = self.q_eval.forward(state) action = T.argmax(q_values).item() else: action = np.random.choice(self.action_space) return action def store_transition(self, state, action, reward, resulted_state, done): self.memory.store_transition(state, action, reward, resulted_state, done) def sample_memory(self): state, action, reward, resulted_state, done = self.memory.sample_buffer( self.batch_size) state = T.tensor(state).to(self.q_eval.device) reward = T.tensor(reward).to(self.q_eval.device) done = T.tensor(done).to(self.q_eval.device) action = T.tensor(action).to(self.q_eval.device) resulted_state = T.tensor(resulted_state).to(self.q_eval.device) return state, reward, done, action, resulted_state def replace_target_network(self): if self.learn_step_counter % self.replace == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): if self.epsilon > self.eps_min: self.epsilon -= self.eps_dec else: self.epsilon = self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint() def learn(self): if self.memory.mem_counter < self.batch_size: return self.q_eval.optimizer.zero_grad() self.replace_target_network() state, reward, done, action, resulted_state = self.sample_memory() indexes = np.arange(self.batch_size, dtype=np.longlong) action = action.long() done = done.bool() prediction = self.q_eval.forward(state)[ indexes, action] # dims: batch_size * n_actions next_result = self.q_next.forward(resulted_state).max(dim=1)[0] next_result[done] = 0.0 # for terminal states, target should be reward target = reward + self.gamma * next_result loss = self.q_eval.loss(target, prediction).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decrement_epsilon()
class DQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DQN(self.lr, self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + self.algo + '_q_eval',
                          chkpt_dir=self.chkpt_dir)
        self.q_next = DQN(self.lr, self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + self.algo + '_q_next',
                          chkpt_dir=self.chkpt_dir)
        # Create the optimizer once; building it inside learn() would reset its
        # running statistics on every training step.
        self.optimizer = keras.optimizers.RMSprop(learning_rate=self.lr)

    def choose_action(self, observation):
        if tf.random.uniform([1]) > self.epsilon:
            actions = self.q_eval.call(tf.expand_dims(observation, axis=0))
            # take the greedy action as a plain Python int
            action = int(tf.argmax(actions, axis=1)[0])
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memory.sample_buffer(
            self.batch_size)
        return states, actions, rewards, new_states, dones

    def replace_target_network(self):
        # use modulo (not bitwise AND) to test the update interval, and copy
        # the weights instead of aliasing the two models
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = tf.range(self.batch_size)

        with tf.GradientTape() as tape:
            # predicted q values of the actions taken from memory
            q_pred = tf.gather_nd(self.q_eval.call(states),
                                  list(zip(indices, actions)))
            # max q value of the next states according to the target network
            q_next = tf.math.reduce_max(self.q_next.call(states_), axis=1)
            # zero out the q values of terminal states
            q_next = q_next * (1.0 - tf.cast(dones, tf.float32))
            q_target = rewards + self.gamma * q_next
            loss = keras.losses.MSE(q_target, q_pred)

        gradient = tape.gradient(loss, self.q_eval.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradient, self.q_eval.trainable_variables))
        self.decrement_epsilon()
        self.learn_step_counter += 1
class Agent(object): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000, algo=None, env_name=None, checkpoint_dir='tmp/dqn'): self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.replace_target_counter = replace self.algo = algo self.env_name = env_name self.checkpoint_dir = checkpoint_dir self.action_space = [i for i in range(self.n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + "_q_eval", checkpoint_dir=self.checkpoint_dir) self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims, name=self.env_name + '_' + self.algo + "_q_next", checkpoint_dir=self.checkpoint_dir) def store_transition(self, state, action, reward, resulted_state, done): self.memory.store_transition(state, action, reward, resulted_state, done) def sample_memory(self): state, action, reward, resulted_state, done = self.memory.sample_buffer( self.batch_size) states = T.tensor(state).to(self.q_eval.device) rewards = T.tensor(reward).to(self.q_eval.device) dones = T.tensor(done).to(self.q_eval.device) actions = T.tensor(action).to(self.q_eval.device) resulted_states = T.tensor(resulted_state).to(self.q_eval.device) return states, actions, rewards, resulted_states, dones def choose_action(self, observation): if np.random.random() > self.epsilon: state = T.tensor([observation], dtype=T.float).to( self.q_eval.device) # converting observation to tensor, # and observation is in the list because our convolution expects an input tensor of shape batch size # by input dims. _, advantages = self.q_eval.forward(state) action = T.argmax(advantages).item() else: action = np.random.choice(self.action_space) return action def replace_target_network(self): if self.replace_target_counter is not None and \ self.learn_step_counter % self.replace_target_counter == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_epsilon(self): if self.epsilon > self.eps_min: self.epsilon = self.epsilon - self.eps_dec else: self.epsilon = self.eps_min def learn(self): if self.memory.mem_counter < self.batch_size: return self.q_eval.optimizer.zero_grad() self.replace_target_network() states, actions, rewards, resulted_states, dones = self.sample_memory() indexes = np.arange(self.batch_size) V_states, A_states = self.q_eval.forward(states) q_pred = T.add( V_states, (A_states - A_states.mean(dim=1, keepdim=True)))[indexes, actions] V_resulted_states, A_resulted_states = self.q_next.forward( resulted_states) q_next = T.add( V_resulted_states, (A_resulted_states - A_resulted_states.mean(dim=1, keepdim=True))).max(dim=1)[0] q_next[dones] = 0.0 target = rewards + self.gamma * q_next loss = self.q_eval.loss(target, q_pred).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.decrement_epsilon() def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint()
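# The dueling agents above expect a network whose forward pass returns the
# separate value and advantage streams (V, A), plus optimizer, loss, device and
# checkpoint helpers. Below is a minimal fully connected sketch with that
# interface for 1-D observations; the layer sizes and the choice of
# RMSprop/MSELoss are assumptions, not the original architecture.
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class DuelingDeepQNetwork(nn.Module):
    def __init__(self, lr, n_actions, input_dims, name, chkpt_dir):
        super(DuelingDeepQNetwork, self).__init__()
        self.checkpoint_file = os.path.join(chkpt_dir, name)
        self.fc1 = nn.Linear(input_dims[0], 512)
        self.V = nn.Linear(512, 1)          # state-value stream
        self.A = nn.Linear(512, n_actions)  # advantage stream
        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        # shared hidden layer, then the two dueling heads
        hidden = F.relu(self.fc1(state))
        V = self.V(hidden)
        A = self.A(hidden)
        return V, A

    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))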