class DeepQLearner(object):
    def __init__(self, obs_shape, action_shape, params):
        # initialization method; self is a reference to the object itself
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        #self.MAX_NUM_EPISODES = self.params['max_num_episodes']
        #self.STEPS_PER_EPISODE = self.params['steps_per_episode']
        self.action_shape = action_shape
        if len(obs_shape) == 1:  ## The observation space has a single dimension
            self.DQN = SLP
        elif len(obs_shape) == 3:  ## The observation is an image / 3D
            self.DQN = CNN
        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)  # lr = learning rate
        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape, device).to(device)
        self.policy = self.epsilon_greedy_Q  # action policy
        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0
        self.memory = ExperienceMemory(capacity=int(self.params['experience_memory_size']))

    def get_action(self, obs):
        obs = np.array(obs)
        obs = obs / 255.0  # this scales the values into the range [0, 1]
        if len(obs.shape) == 3:  # we have an image
            if obs.shape[2] < obs.shape[0]:  # W x H x C -> reorder to Channels x Height x Width
                obs = obs.reshape(obs.shape[2], obs.shape[1], obs.shape[0])  # reorder the dimensions
            obs = np.expand_dims(obs, 0)  # instead of a bare array we get a batch dimension
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num), self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))  # torch.max picks the best next-state value
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)  # value loss function
        # Learning step:
        self.Q_optimizer.zero_grad()
        td_error.backward()  # backward pass
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()  # update the internal weights

    def replay_experience(self, batch_size=None):
        """
        Replays experience sampled at random from the stored memory
        :param batch_size: size of the sample to draw from memory
        :return:
        """
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1  # training-step counter

    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: fragment of past memories
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)
        if self.params['use_target_network']:
            # Every `target_network_update_frequency` steps, copy Q's state dict into the
            # target network so what has been learned so far is preserved.
            if self.step_num % self.params['target_network_update_frequency'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q_target(next_obs_batch).max(1)[0].data.numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q(next_obs_batch).detach().max(1)[0].data.numpy()  # ~done_batch: the bootstrap term is only added if the episode has not finished
        td_target = torch.from_numpy(td_target)  # convert to a tensor so we can operate on it
        #td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()  # take an optimizer step so the network learns

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {"Q": self.Q.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, file_name)
        print("Agent state saved to: ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Q model from", file_name,
              "which so far has a best mean reward of: ", self.best_mean_reward,
              " and a best reward of: ", self.best_reward)
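# The DeepQLearner above reads all of its hyper-parameters from the `params` dictionary.
# A minimal sketch of such a dictionary, listing the keys the class actually accesses;
# the concrete values below are illustrative assumptions, not the course's official settings.
example_params = {
    "gamma": 0.98,                            # discount factor
    "learning_rate": 1e-4,                    # Adam learning rate
    "use_target_network": True,               # enables self.Q_target
    "epsilon_max": 1.0,                       # initial exploration rate
    "epsilon_min": 0.05,                      # final exploration rate
    "epsilon_decay_final_step": 1000000,      # step at which epsilon reaches its minimum
    "experience_memory_size": 1000000,        # replay-memory capacity
    "replay_batch_size": 32,                  # default batch size for replay_experience()
    "clip_reward": True,                      # np.sign() is applied to rewards if True
    "target_network_update_frequency": 2000,  # steps between Q -> Q_target copies
    "test": False,                            # disables exploration when True
    "save_dir": "./trained_models/",          # used by save()
    "load_dir": "./trained_models/",          # used by load()
}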
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.to(torch.device('cpu')).numpy())
        self.step_num += 1  ## IN THE VIDEO WE FORGOT TO INCREMENT THE STEP BY ONE
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        """
        Replays experience sampled at random from the stored memory
        :param batch_size: size of the sample to draw from memory
        :return:
        """
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: fragment of past memories
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)
        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
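# A minimal usage sketch for the SwallowQLearner above; this loop is an assumption, not part
# of the original listing. It only calls methods defined by the class, assumes the classic
# gym API (4-tuple from env.step), and supplies the module-level constants MAX_NUM_EPISODES
# and STEPS_PER_EPISODE that the constructor already references.
import gym

MAX_NUM_EPISODES = 10000
STEPS_PER_EPISODE = 300

if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    agent = SwallowQLearner(env)
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        total_reward = 0.0
        for step in range(STEPS_PER_EPISODE):
            action = agent.get_action(obs)
            next_obs, reward, done, _ = env.step(action)
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
            total_reward += reward
            if done:
                break
        print("Episode {} finished with reward {}".format(episode, total_reward))
    env.close()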
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.Q = SLP(self.obs_shape, self.action_shape, self.device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.to(self.device).cpu().numpy())
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)
        if str(self.device) == "cuda":
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        torch.max(self.Q(next_obs_batch).detach(), 1)[0].data.tolist()
        else:
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
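# Both SwallowQLearner variants rely on an Experience namedtuple and an ExperienceMemory
# object with a sample() method. The sketch below is only consistent with that usage; the
# real utils module shipped with the course may differ, and the store() method name is an
# assumption since only sample() appears in the classes above.
import random
from collections import namedtuple

Experience = namedtuple("Experience", ["obs", "action", "reward", "next_obs", "done"])


class ExperienceMemory(object):
    def __init__(self, capacity=int(1e6)):
        self.capacity = capacity
        self.mem_idx = 0   # index of the next write position
        self.memory = []

    def store(self, experience):
        # Circular buffer: overwrite the oldest entry once capacity is reached.
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.mem_idx % self.capacity] = experience
        self.mem_idx += 1

    def sample(self, batch_size):
        assert batch_size <= len(self.memory), "Not enough experiences in memory"
        return random.sample(self.memory, batch_size)

    def get_size(self):
        return len(self.memory)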
class DeepQLearner(object):
    def __init__(self, obs_shape, action_shape, params):
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape
        if len(obs_shape) == 1:  ## The observation space has a single dimension
            self.DQN = SLP
        elif len(obs_shape) == 3:  ## The observation is an image / 3D
            self.DQN = CNN
        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape, device).to(device)
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0
        self.memory = ExperienceMemory(capacity=int(self.params['experience_memory_size']))

    def get_action(self, obs):
        obs = np.array(obs)
        obs = obs / 255.0
        if len(obs.shape) == 3:  # we have an image
            if obs.shape[2] < obs.shape[0]:  # W x H x C -> C x H x W
                obs = obs.reshape(obs.shape[2], obs.shape[1], obs.shape[0])
            obs = np.expand_dims(obs, 0)
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num), self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)
        if torch.cuda.is_available():
            if self.params['use_target_network']:
                td_target = reward_batch + ~done_batch * \
                            np.tile(self.gamma, len(next_obs_batch)) * \
                            torch.max(self.Q_target(next_obs_batch).detach(), 1)[0].data.tolist()
            else:
                td_target = reward_batch + ~done_batch * \
                            np.tile(self.gamma, len(next_obs_batch)) * \
                            torch.max(self.Q(next_obs_batch).detach(), 1)[0].data.tolist()
        else:
            if self.params['use_target_network']:
                if self.step_num % self.params['target_network_update_frequency'] == 0:
                    self.Q_target.load_state_dict(self.Q.state_dict())
                td_target = reward_batch + ~done_batch * \
                            np.tile(self.gamma, len(next_obs_batch)) * \
                            self.Q_target(next_obs_batch).max(1)[0].data.numpy()
            else:
                td_target = reward_batch + ~done_batch * \
                            np.tile(self.gamma, len(next_obs_batch)) * \
                            self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {"Q": self.Q.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, file_name)
        print("Agent state saved to: ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Q model from", file_name,
              "which so far has a best mean reward of: ", self.best_mean_reward,
              " and a best reward of: ", self.best_reward)
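# A tiny numeric illustration (made-up values) of the ~done_batch trick used in
# learn_from_batch_experience above: inverting the boolean done flags zeroes out the
# bootstrap term gamma * max_a Q(s', a) for terminal transitions.
import numpy as np

reward_batch = np.array([1.0, 0.0, 1.0])
done_batch = np.array([False, True, False])
next_q_max = np.array([2.0, 5.0, 3.0])   # hypothetical max_a Q(s', a) values
gamma = 0.98

td_target = reward_batch + ~done_batch * np.tile(gamma, len(next_q_max)) * next_q_max
print(td_target)   # [2.96 0.   3.94] -> the terminal transition keeps only its reward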
class Deep_Q_Learner(object):
    def __init__(self, state_shape, action_shape, params):
        """
        self.Q is the Action-Value function. This agent represents Q using a
        Neural Network. If the input is a single-dimensional vector, it uses a
        Single-Layer-Perceptron; if the input is a 3-dimensional image, it uses
        a Convolutional-Neural-Network.

        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration
                       parameters and hyper-parameters
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']  # Agent's discount factor
        self.learning_rate = self.params['lr']  # Agent's Q-learning rate
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")
        self.training_steps_completed = 0  # Number of training batch steps completed so far

        if len(self.state_shape) == 1:  # Single dimensional observation/state space
            self.DQN = SLP
        elif len(self.state_shape) == 3:  # 3D/image observation/state
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape, device).to(device)
        # self.policy is the policy followed by the agent. This agent follows
        # an epsilon-greedy policy w.r.t. its Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))  # Initialize an Experience memory with 1M capacity

    def get_action(self, observation):
        if len(observation.shape) == 3:  # Single image (not a batch)
            if observation.shape[2] < observation.shape[0]:  # Probably observation is in W x H x C format
                # Reshape to C x H x W format as per PyTorch's convention
                observation = observation.reshape(observation.shape[2],
                                                  observation.shape[1],
                                                  observation.shape[0])
            observation = np.expand_dims(observation, 0)  # Create a batch dimension
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        # Decay Epsilon/exploration as per schedule
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num), self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([i for i in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(observation).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, s, a, r, s_next, done):
        # TD(0) Q-learning
        if done:  # End of episode
            td_target = r + 0.0  # Set the value of the terminal state to zero
        else:
            td_target = r + self.gamma * torch.max(self.Q(s_next))
        td_error = td_target - self.Q(s)[a]
        # Update Q estimate
        #self.Q(s)[a] = self.Q(s)[a] + self.learning_rate * td_error
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0  # Scale/Divide by max limit of obs's dtype. 255 for uint8
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0  # Scale/Divide by max limit of obs's dtype. 255 for uint8
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.training_steps_completed % self.params['target_network_update_freq'] == 0:
                # The *update_freq is the Num steps after which target net is updated.
                # A schedule can be used instead to vary the update freq.
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q_target(next_obs_batch).max(1)[0].data
        else:
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q(next_obs_batch).detach().max(1)[0].data

        td_target = td_target.to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1  # Increment the number of training batch steps completed

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {"Q": self.Q.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, file_name)
        print("Agent's state saved to ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Q model state from", file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)
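# Deep_Q_Learner picks SLP for 1-D observations and CNN for image observations. Below is a
# minimal sketch of what such an SLP (single-hidden-layer Q-network) could look like, kept
# consistent with how the agents call it (a numpy observation in, one Q-value per action out).
# The actual class used in the course/book may differ in hidden size and device handling.
import numpy as np
import torch


class SLP(torch.nn.Module):
    """Single hidden-layer perceptron mapping an observation to one Q-value per action."""

    def __init__(self, input_shape, output_shape, device=torch.device("cpu")):
        super(SLP, self).__init__()
        self.device = device
        self.input_shape = input_shape[0]
        self.hidden_shape = 40
        self.linear1 = torch.nn.Linear(self.input_shape, self.hidden_shape)
        self.out = torch.nn.Linear(self.hidden_shape, output_shape)

    def forward(self, x):
        x = torch.from_numpy(np.array(x)).float().to(self.device)
        x = torch.nn.functional.relu(self.linear1(x))
        return self.out(x)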
class DeepQLearner:
    def __init__(self, state_shape, action_shape, params, writer, device="cpu"):
        """
        self.Q is the Action-value function. This agent represents Q using a
        Neural Network. If the input is a single-dimensional vector, use a
        Single Layer Perceptron; if the input is a 3-dimensional image, use a
        Convolutional Neural Network.

        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration
                       parameters and hyper-parameters
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float('inf')
        self.best_reward = -float('inf')
        self.training_steps_completed = 0
        self.writer = writer
        self.device = device

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, self.device).to(self.device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape, self.device).to(self.device)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params['epsilon_max']
        self.epsilon_min = params['epsilon_min']
        self.epsilon_decay = LinearDecayScheduler(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(capacity=int(self.params['experience_memory_capacity']))

    def get_action(self, obs):
        obs = np.array(obs)
        obs = obs / 255.0
        if len(obs.shape) == 3:
            if obs.shape[2] < obs.shape[0]:
                obs = obs.reshape(obs.shape[2], obs.shape[1], obs.shape[0])
            obs = np.expand_dims(obs, 0)
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        self.writer.add_scalar('DQL/epsilon', self.epsilon_decay(self.step_num), self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params['test']:
            action = random.choice([i for i in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.to(self.device).numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = td_target - self.Q(obs)[action]
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        if self.params['clip_rewards']:
            # Clip the rewards
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.step_num % self.params['target_network_update_freq'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q_target(next_obs_batch).max(1)[0].data.to(self.device).numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q(next_obs_batch).detach().max(1)[0].data.to(self.device).numpy()

        td_target = torch.from_numpy(td_target).to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.writer.add_scalar('DQL/td_error', td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1

    def save(self, env_name):
        if not exists(self.params['save_dir']):
            makedirs(self.params['save_dir'])
        file_name = join(self.params['save_dir'], 'DQL_' + env_name + '.ptm')
        agent_state = {'Q': self.Q.state_dict(),
                       'best_mean_reward': self.best_mean_reward,
                       'best_reward': self.best_reward}
        torch.save(agent_state, file_name)
        print(f"Agent's state saved to {file_name}")

    def load(self, env_name):
        file_name = self.params['load_dir'] + 'DQL_' + env_name + '.ptm'
        agent_state = torch.load(file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state['Q'])
        self.Q.to(self.device)
        self.best_mean_reward = agent_state['best_mean_reward']
        self.best_reward = agent_state['best_reward']
        print(f'Loaded Q model state from {file_name} which fetched a best mean reward of '
              f'{self.best_mean_reward:.3f} and an all time best reward of {self.best_reward}')
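# A small usage sketch (an assumption, not part of the original listing) of the save()/load()
# pair above: persist the agent after an improvement and restore it later. The environment id,
# reward value, and the pre-built `agent` are placeholders.
env_name = "PongNoFrameskip-v4"   # placeholder environment id

mean_reward = 18.5                # hypothetical running mean of recent episode rewards
if mean_reward > agent.best_mean_reward:
    agent.best_mean_reward = mean_reward
    agent.save(env_name)          # writes <save_dir>/DQL_<env_name>.ptm

# Later, or in a separate test run:
agent.load(env_name)              # restores the Q weights and the best-reward statistics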
class DeepQLearner(object):
    def __init__(self, obs_shape, action_shape, hidden_shape, params):
        self.params = params
        self.gamma = self.params["gamma"]
        self.delta = self.params["delta"]
        self.learning_rate = self.params["learning_rate"]
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape
        self.Q = CNN(obs_shape, action_shape, hidden_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params["epsilon_max"]
        self.epsilon_min = self.params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params["epsilon_decay_final_step"])
        self.memory = ExperienceMemory(self.params["memory"])
        self.total_trainings = 0
        self.step_num = 0

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).detach().numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + torch.tensor(0.0, requires_grad=True)
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[0][action], td_target)
        #print(td_target.item(), self.Q(obs)[action].item(), td_error.item())
        #print(reward, td_target.item(), td_error.item())
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        """
        Replays experience sampled at random from the stored memory
        :param batch_size: size of the sample to draw from memory
        :return:
        """
        batch_size = batch_size if batch_size is not None else self.params["replay_batch_size"]
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1
        #print("Replaying {} episodes".format(batch_size))

    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: fragment of past memories
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)
        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        action_idx = torch.from_numpy(action_batch)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()

    def save(self, env_name):
        file_name = self.params["save_dir"] + "DQL_" + env_name + ".ptm"
        agent_state = {"Q": self.Q.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward,
                       "total_trainings": self.total_trainings}
        torch.save(agent_state, file_name)
        print("NN saved to: ", file_name)

    def load(self, env_name):
        file_name = self.params["load_dir"] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        #self.Q.eval()
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        self.total_trainings = agent_state["total_trainings"]
        print("NN loaded from: {} \nBest mean reward: {:.3f}\nBest reward: {:.3f}\nTrains: {}"
              .format(file_name, self.best_mean_reward, self.best_reward, self.total_trainings))
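# Every agent in this collection delegates exploration decay to LinearDecaySchedule (spelled
# LinearDecayScheduler in one variant). A minimal sketch consistent with how it is called
# above, i.e. schedule(step_num) -> epsilon; the real implementation in the course's utils
# package may differ.
class LinearDecaySchedule(object):
    def __init__(self, initial_value, final_value, max_steps):
        assert initial_value > final_value, "initial_value should be > final_value"
        self.initial_value = initial_value
        self.final_value = final_value
        self.decay_factor = (initial_value - final_value) / max_steps

    def __call__(self, step_num):
        current_value = self.initial_value - self.decay_factor * step_num
        return max(current_value, self.final_value)


# Example: epsilon decays linearly from 1.0 to 0.05 over 10,000 steps, then stays at 0.05.
epsilon = LinearDecaySchedule(initial_value=1.0, final_value=0.05, max_steps=10000)
print(epsilon(0), epsilon(5000), epsilon(20000))   # 1.0, 0.525, 0.05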
class Deep_Q_Learner(object):
    def __init__(self, state_shape, action_shape, params):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape, device).to(device)
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params["epsilon_max"]
        self.epsilon_min = params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0
        self.memory = ExperienceMemory(capacity=int(self.params['experience_memory_capacity']))

    def get_action(self, observation):
        observation = np.array(observation)
        observation = observation / 255
        if len(observation.shape) == 3:
            if observation.shape[2] < observation.shape[0]:
                observation = observation.reshape(observation.shape[2],
                                                  observation.shape[1],
                                                  observation.shape[0])
            observation = np.expand_dims(observation, 0)
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num), self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([i for i in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(observation).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, obs_next, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + (self.gamma * torch.max(self.Q(obs_next)))
        td_error = td_target - self.Q(obs)[action]
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        if self.params["clip_rewards"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.step_num % self.params['target_network_update_freq'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q_target(next_obs_batch).max(1)[0].data.cpu().numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q(next_obs_batch).max(1)[0].data.cpu().numpy()

        td_target = torch.from_numpy(td_target).to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {"Q": self.Q.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, file_name)
        print("Agent's state saved to ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Q model state from", file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)
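# The Deep_Q_Learner/DeepQLearner variants above reference module-level `device` and `writer`
# objects that are not defined in these snippets. A sketch of how they could be set up; the
# log directory name is an assumption, and the original code may use tensorboardX instead of
# torch.utils.tensorboard.
import torch
from torch.utils.tensorboard import SummaryWriter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter("logs/DQL_runs")   # assumed log directory

# The agents then log scalars such as:
#   writer.add_scalar("DQL/epsilon", value, step_num)
#   writer.add_scalar("DQL/td_error", value, step_num)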
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        # initialization method; self is a reference to the object itself
        self.obs_shape = environment.observation_space.shape  # keep the space values (size, high and low)
        self.action_shape = environment.action_space.n  # number of actions
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)  # lr = learning rate
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q  # action policy
        self.memory = ExperienceMemory(capacity=int(1e6))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):  # random number
            action = random.choice([a for a in range(self.action_shape)])  # pick one of the available actions at random
        else:
            action = np.argmax(self.Q(obs).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, next_obs):
        #discrete_obs = self.discretize(obs)
        #discrete_next_obs = self.discretize(next_obs)
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))  # torch.max picks the best next-state value
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)  # value loss function
        # Learning step:
        self.Q_optimizer.zero_grad()
        td_error.backward()  # backward pass
        self.Q_optimizer.step()  # update the internal weights

    def replay_experience(self, batch_size):
        """
        Replays experience sampled at random from the stored memory
        :param batch_size: size of the sample to take from memory
        :return:
        """
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: fragment of past memories
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)
        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()  # ~done_batch: the bootstrap term is only added if the episode has not finished
        td_target = torch.from_numpy(td_target)  # convert to a tensor so we can operate on it
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()  # take an optimizer step so the network learns
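# A tiny illustration (made-up numbers) of the gather() call used in learn_from_batch_experience:
# it selects, for every row of the batch, the Q-value of the action that was actually taken,
# so that value can be compared against the TD target.
import torch

q_values = torch.tensor([[0.1, 0.9],
                         [0.4, 0.2],
                         [0.3, 0.7]])            # Q(s, a) for a batch of 3 states, 2 actions
action_idx = torch.tensor([1, 0, 1])             # actions taken in each transition
chosen_q = q_values.gather(1, action_idx.view(-1, 1))
print(chosen_q.squeeze(1))                       # tensor([0.9000, 0.4000, 0.7000])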