def execute(times):
    for t in range(times):
        print(t)
        # INITIALIZATION
        state = init_state
        numMov = 0        # Number of movements
        coinObtAt = 0     # Number of movements needed to pick up the coin
        coinObt = False   # Whether the coin has been picked up
        done = False      # Whether the episode has finished (final state reached)
        observaciones = deque()
        for o in range(OBSERVACIONES):
            action, next = DQN.take_action(state)
            GUI.visualize(state, next, o, t)
            if next == end_state:
                reward = 5 / (numMov + 1)
                done = True
            elif next == coin_pos and not coinObt:
                coinObt = True
                reward = 15 / (numMov + 1)
                coinObtAt = numMov + 1
            else:
                reward = 0
            observaciones.append((state, action, next, reward, done))
            state = next[:]
            numMov += 1
            if done:
                GUI.visualize(state, init_state, o, t)
                print('Finished in %i movements, coin obtained in %i' % (numMov, coinObtAt))
                state = init_state
                numMov = 0
                coinObtAt = 0
                coinObt = False
                done = False
                DQN.updateExploration(-0.005)
        print('Observation finished')
        GUI.visualize(state, init_state, o, t)
        DQN.learn(observaciones, BATCH_SIZE)
def __init__(
        self,
        n_actions,
        n_features,
        n_episodes,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=300,
        memory_size=500,
        batch_size=32,
        e_greedy_increment=None,
        output_graph=False,
):
    self.n_actions = n_actions
    self.n_features = n_features
    self.n_episodes = n_episodes
    self.lr = learning_rate
    self.gamma = reward_decay
    self.epsilon_max = e_greedy
    self.replace_target_iter = replace_target_iter
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.epsilon_increment = e_greedy_increment
    self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

    # total learning steps taken so far
    self.learn_step_counter = 0

    # initialize zero memory [s, a, r, s_]
    self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

    # consists of [target_net, evaluate_net]
    self.target_net = DQN(n_features, n_actions)
    self.eval_net = DQN(n_features, n_actions)
    # self.eval_net.load_state_dict(torch.load('/content/params.pkl'))

    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.SGD(self.eval_net.parameters(), lr=self.lr)
def execute(times):
    for t in range(times):
        print(t)
        # INITIALIZATION
        state = init_state
        numMov = 0        # Number of movements
        coinObtAt = 0     # Number of movements needed to pick up the coin
        coinObt = False   # Whether the coin has been picked up
        done = False      # Whether the episode has finished (final state reached)
        batch_con_recompensa = False
        observaciones = deque()
        tabla = DQN.calcularTabla()
        GUI.actualizar(tabla)
        for o in range(OBSERVACIONES):
            action, next = DQN.take_action(state)
            GUI.visualize(state, next, o, t, coinObt)
            # if next[:2] == end_state:
            if next == end_state:
                reward = 25  # / (numMov + 1)
                done = True
                batch_con_recompensa = True
            elif next[:2] == coin_pos and not coinObt:
                coinObt = True
                next[2] = 1
                reward = 15  # / (numMov + 1)
                coinObtAt = numMov + 1
                batch_con_recompensa = True
            else:
                reward = 0
            observaciones.append((state, action, next, reward, done))
            state = next[:]
            numMov += 1
            if done:
                GUI.visualize(state, init_state, o, t, coinObt)
                print('Finished in %i movements, coin obtained in %i' % (numMov, coinObtAt))
                state = init_state
                numMov = 0
                coinObtAt = 0
                coinObt = False
                done = False
                DQN.updateExploration(-0.01)
        print('Observation finished')
        GUI.visualize(state, init_state, o, t, coinObt)
        if batch_con_recompensa:
            print("Learning...")
            DQN.learn(observaciones)
class Train():
    def __init__(
            self,
            n_actions,
            n_features,
            n_episodes,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.n_episodes = n_episodes
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning steps taken so far
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consists of [target_net, evaluate_net]
        self.target_net = DQN(n_features, n_actions)
        self.eval_net = DQN(n_features, n_actions)
        # self.eval_net.load_state_dict(torch.load('/content/params.pkl'))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.eval_net.parameters(), lr=self.lr)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # replace the oldest memory with the new transition
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        # add a batch dimension before feeding the observation into the network
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            # forward the observation and get the Q value for every action
            actions_value = self.eval_net(
                Variable(torch.from_numpy(observation).float())).cpu().detach().numpy()
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # check whether to replace the target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            # self.sess.run(self.target_replace_op)
            self.target_net.load_state_dict(self.eval_net.state_dict())
            # print('\ntarget_params_replaced\n')

        # sample a batch of transitions from memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        self.s = Variable(torch.from_numpy(
            batch_memory[:, :self.n_features]).float(), requires_grad=True)
        self.a = Variable(
            torch.from_numpy(batch_memory[:, self.n_features]).long())
        self.r = Variable(
            torch.from_numpy(batch_memory[:, self.n_features + 1]).float())
        self.s_ = Variable(
            torch.from_numpy(batch_memory[:, -self.n_features:]).float())

        current_Q_values = self.eval_net(self.s).gather(
            1, self.a.unsqueeze(1)).view(-1)
        next_Q_values = self.target_net(self.s_).detach().max(1)[0]
        # Compute the target of the current Q values
        target_Q_values = self.r + (self.gamma * next_Q_values)
        # Compute the Bellman error
        loss = self.criterion(current_Q_values, target_Q_values)

        self.optimizer.zero_grad()
        # run the backward pass
        loss.backward()
        # Perform the update
        self.optimizer.step()

        # increase epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
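The Train class above builds two copies of a network called DQN (eval_net and target_net), but the network definition itself does not appear in this listing. Below is a minimal sketch of what such a module could look like, assuming a small fully connected architecture; the hidden-layer size and two-layer layout are illustrative assumptions, not the project's actual values.

import torch
import torch.nn as nn

class DQN(nn.Module):
    # Hypothetical network: n_features state inputs, one Q-value output per action.
    def __init__(self, n_features, n_actions, n_hidden=50):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_actions),
        )

    def forward(self, x):
        # Returns a (batch, n_actions) tensor of Q-value estimates.
        return self.net(x)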
if __name__ == '__main__':
    DQN = DQN(acciones, alpha, gamma, exploration)
    GUI = Maze(8, 8, coin_pos, end_state)
    execute(1000)