def learn_from_batch_experience(self, experiences):
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs)
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    next_obs_batch = np.array(batch_xp.next_obs)
    done_batch = np.array(batch_xp.done)
    if str(self.device) == "cuda":
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            torch.max(self.Q(next_obs_batch).detach(), 1)[0].data.tolist()
    else:
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
    td_target = torch.from_numpy(td_target)
    td_target = td_target.to(self.device)
    action_idx = torch.from_numpy(action_batch).to(self.device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
        td_target.float().unsqueeze(1))
    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    self.Q_optimizer.step()
def learn_from_batch_experience(self, experiences):
    """
    Updates the deep neural network based on what was learned from
    a set of previous experiences
    :param experiences: fragment of past memories
    :return:
    """
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs)
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    next_obs_batch = np.array(batch_xp.next_obs)
    done_batch = np.array(batch_xp.done)
    td_target = reward_batch + ~done_batch * \
        np.tile(self.gamma, len(next_obs_batch)) * \
        self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
    td_target = torch.from_numpy(td_target)
    td_target = td_target.to(self.device)
    action_idx = torch.from_numpy(action_batch).to(self.device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
        td_target.float().unsqueeze(1))
    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    self.Q_optimizer.step()
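# All of these variants unpack the batch with Experience(*zip(*experiences)),
# which transposes a list of per-step records into a single record of batched
# fields. A minimal sketch of the Experience record they assume; the field
# names match the attributes accessed above (obs, action, reward, next_obs, done):

from collections import namedtuple

Experience = namedtuple("Experience", ["obs", "action", "reward", "next_obs", "done"])

# Toy usage showing the transpose trick:
batch = [Experience(1, 0, 1.0, 2, False), Experience(2, 1, 0.0, 3, True)]
batch_xp = Experience(*zip(*batch))
print(batch_xp.obs)   # (1, 2)
print(batch_xp.done)  # (False, True)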
def learn_from_batch_experience(self, experiences):
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs) / 255.0  # scale like next_obs_batch below
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    if self.params['clip_rewards']:
        # Clip the rewards
        reward_batch = np.sign(reward_batch)
    next_obs_batch = np.array(batch_xp.next_obs) / 255.0
    done_batch = np.array(batch_xp.done)
    if self.params['use_target_network']:
        if self.step_num % self.params['target_network_update_freq'] == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q_target(next_obs_batch).max(1)[0].data.cpu().numpy()  # move to CPU before converting to NumPy
    else:
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q(next_obs_batch).detach().max(1)[0].data.cpu().numpy()
    td_target = torch.from_numpy(td_target).to(self.device)
    action_idx = torch.from_numpy(action_batch).to(self.device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
        td_target.float().unsqueeze(1))
    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    self.writer.add_scalar('DQL/td_error', td_error.mean(), self.step_num)
    self.Q_optimizer.step()
def learn_from_batch_experience(self, experiences):
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs) / 255.0  # Scale/Divide by max limit of obs's dtype. 255 for uint8
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    # Clip the rewards
    if self.params["clip_rewards"]:
        reward_batch = np.sign(reward_batch)
    next_obs_batch = np.array(batch_xp.next_obs) / 255.0  # Scale/Divide by max limit of obs's dtype. 255 for uint8
    done_batch = np.array(batch_xp.done)

    if self.params['use_target_network']:
        # if self.training_steps_completed % self.params['target_network_update_freq'] == 0:
        if self.step_num % self.params['target_network_update_freq'] == 0:
            # The *update_freq is the num steps after which the target net is updated.
            # A schedule can be used instead to vary the update freq.
            self.Q_target.load_state_dict(self.Q.state_dict())
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q_target(next_obs_batch).max(1)[0].data
    else:
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q(next_obs_batch).detach().max(1)[0].data
    td_target = td_target.to(device)
    action_idx = torch.from_numpy(action_batch).to(device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
        td_target.float().unsqueeze(1))

    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
    self.Q_optimizer.step()
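# The variants above round-trip the TD target through NumPy (and some branch
# on the device to do it), which is where most of their device bugs live.
# Below is a sketch of the same Bellman backup,
#   y = r + gamma * (1 - done) * max_a Q_target(s', a),
# computed entirely in torch so no .numpy()/.from_numpy() conversion or
# CUDA/CPU branching is needed. The helper name and signature are hypothetical,
# not part of the original code.

import torch

def compute_td_target(Q_target, reward_batch, next_obs_batch, done_batch, gamma, device):
    # Hypothetical helper: keeps every intermediate on `device` from start to finish.
    rewards = torch.as_tensor(reward_batch, dtype=torch.float32, device=device)
    not_done = torch.as_tensor(~done_batch, dtype=torch.float32, device=device)
    next_obs = torch.as_tensor(next_obs_batch, dtype=torch.float32, device=device)
    with torch.no_grad():  # the target must not propagate gradients
        next_q_max = Q_target(next_obs).max(1)[0]
    return rewards + gamma * not_done * next_q_max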
def learn_from_batch_experience(self, experiences):
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs) / 255.0
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    if self.params["clip_reward"]:
        reward_batch = np.sign(reward_batch)
    next_obs_batch = np.array(batch_xp.next_obs) / 255.0
    done_batch = np.array(batch_xp.done)

    if self.params['use_target_network']:
        # Sync the target network here so it happens on both the CUDA and CPU paths
        if self.step_num % self.params['target_network_update_frequency'] == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())

    if torch.cuda.is_available():
        if self.params['use_target_network']:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                torch.max(self.Q_target(next_obs_batch).detach(), 1)[0].data.tolist()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                torch.max(self.Q(next_obs_batch).detach(), 1)[0].data.tolist()
    else:
        if self.params['use_target_network']:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).max(1)[0].data.numpy()  # keep NumPy so torch.from_numpy works below
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
    td_target = torch.from_numpy(td_target)
    td_target = td_target.to(device)
    action_idx = torch.from_numpy(action_batch).to(device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
        td_target.float().unsqueeze(1))
    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    self.Q_optimizer.step()
def learn_from_batch_experience(self, experiences):
    """
    Updates the deep neural network based on what was learned from
    a set of previous experiences
    :param experiences: fragment of past memories
    :return:
    """
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs) / 255.0
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    if self.params["clip_reward"]:
        reward_batch = np.sign(reward_batch)
    next_obs_batch = np.array(batch_xp.next_obs) / 255.0
    done_batch = np.array(batch_xp.done)
    if self.params['use_target_network']:
        if self.step_num % self.params['target_network_update_frequency'] == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            torch.max(self.Q_target(next_obs_batch).detach(), 1)[0].data.tolist()
        td_target = torch.from_numpy(td_target)
    else:
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            torch.max(self.Q(next_obs_batch).detach(), 1)[0].data.tolist()
        td_target = torch.from_numpy(td_target)
    td_target = td_target.to(device)
    action_idx = torch.from_numpy(action_batch).to(device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
        td_target.float().unsqueeze(1))
    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    self.Q_optimizer.step()
def learn_from_batch_experience(self, experiences):
    """
    Updates the deep neural network based on what was learned from
    a set of previous experiences
    :param experiences: fragment of past memories
    :return:
    """
    batch_xp = Experience(*zip(*experiences))
    obs_batch = np.array(batch_xp.obs) / 255.0
    action_batch = np.array(batch_xp.action)
    reward_batch = np.array(batch_xp.reward)
    if self.params["clip_reward"]:
        reward_batch = np.sign(reward_batch)
    next_obs_batch = np.array(batch_xp.next_obs) / 255.0
    done_batch = np.array(batch_xp.done)
    if self.params['use_target_network']:
        # Every target_network_update_frequency steps, refresh the target
        # network's state dict with what the main network has learned so far
        if self.step_num % self.params['target_network_update_frequency'] == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q_target(next_obs_batch).max(1)[0].data.numpy()
    else:
        # ~done_batch: only add the bootstrap term if the episode has not terminated
        td_target = reward_batch + ~done_batch * \
            np.tile(self.gamma, len(next_obs_batch)) * \
            self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
    td_target = torch.from_numpy(td_target)  # convert to a tensor so we can operate on it
    # td_target = td_target.to(self.device)
    action_idx = torch.from_numpy(action_batch).to(device)
    td_error = torch.nn.functional.mse_loss(
        self.Q(obs_batch).gather(1, action_idx.view(-1, 1).long()),
        td_target.float().unsqueeze(1))
    self.Q_optimizer.zero_grad()
    td_error.mean().backward()
    self.Q_optimizer.step()  # take one optimization step so the network learns
        self.Q_optimizer.step()


if __name__ == "__main__":
    environment = gym.make("CartPole-v0")
    agent = SwallowQLearner(environment)
    first_episode = True
    episode_rewards = list()
    for episode in range(MAX_NUM_EPISODES):
        obs = environment.reset()
        total_reward = 0.0
        for step in range(STEPS_PER_EPISODE):
            # environment.render()
            action = agent.get_action(obs)
            next_obs, reward, done, info = environment.step(action)
            agent.memory.store(Experience(obs, action, reward, next_obs, done))
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
            total_reward += reward
            if done:
                if first_episode:
                    max_reward = total_reward
                    first_episode = False
                episode_rewards.append(total_reward)
                if total_reward > max_reward:
                    max_reward = total_reward
                print("\nEpisode #{} finished after {} steps. Reward = {}, "
                      "Mean reward = {}, Best reward = {}"
                      .format(episode, step + 1, total_reward,
                              np.mean(episode_rewards), max_reward))
                break
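# Both training loops push records with agent.memory.store(...); the batches
# consumed by learn_from_batch_experience would come from sampling that memory.
# A minimal sketch of such a buffer; the class name, default capacity, and
# uniform sampling strategy are assumptions, not from the original code.

import random

class SimpleReplayMemory:
    """Minimal sketch of the replay buffer the loops above assume:
    store() collects experiences, sample() feeds learn_from_batch_experience."""

    def __init__(self, capacity=100000):  # assumed default capacity
        self.capacity = capacity
        self.buffer = []

    def store(self, experience):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)  # drop the oldest experience once full
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform random sample, as in vanilla experience replay
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)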
print("ERROR: no existe ningún modelo entrenado para este entorno. Empezamos desde cero") episode = 0 while global_step_num < agent_params['max_training_steps']: obs = environment.reset() total_reward = 0.0 done = False step = 0 while not done: # mientras no haya terminado if env_conf['render'] or args.render: environment.render() action = agent.get_action(obs) next_obs, reward, done, info = environment.step(action) agent.memory.store(Experience(obs, action, reward, next_obs, done))# implementamos la experiencia en memoria obs = next_obs total_reward += reward step += 1 global_step_num += 1 if done is True: episode += 1 episode_rewards.append(total_reward) if total_reward > agent.best_reward: # si tenemos un mejor recompenza agent.best_reward = total_reward if np.mean(episode_rewards) > previous_checkpoint_mean_ep_rew: num_improved_episodes_before_checkpoint += 1