Example #1
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape

        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
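The constructor above only shows how LinearDecaySchedule is used: it is built from initial_value, final_value and max_steps, and later called as self.epsilon_decay(self.step_num) to obtain the current epsilon. A minimal sketch consistent with that usage (not necessarily the original helper) could look like this:

class LinearDecaySchedule(object):
    def __init__(self, initial_value, final_value, max_steps):
        assert initial_value > final_value, "initial_value should be > final_value"
        self.initial_value = initial_value
        self.final_value = final_value
        # Per-step decrement so that final_value is reached after max_steps calls
        self.decay_factor = (initial_value - final_value) / max_steps

    def __call__(self, step_num):
        # Linear decay, clamped at final_value
        current_value = self.initial_value - self.decay_factor * step_num
        return max(current_value, self.final_value)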
Example #2
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.Q = SLP(self.obs_shape, self.action_shape, self.device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)

        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))
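This version adds an ExperienceMemory, but the examples only reveal its constructor argument (capacity) and its sample(batch_size) method, plus the Experience tuple with fields obs, action, reward, next_obs and done that is unpacked later. A hypothetical minimal implementation along those lines (store and get_size are assumptions, not shown in the examples):

import random
from collections import namedtuple

Experience = namedtuple("Experience", ["obs", "action", "reward", "next_obs", "done"])

class ExperienceMemory(object):
    def __init__(self, capacity=int(1e6)):
        self.capacity = capacity
        self.mem_idx = 0   # next write position in the cyclic buffer
        self.memory = []

    def store(self, experience):
        # Append until full, then overwrite the oldest entries
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.mem_idx % self.capacity] = experience
        self.mem_idx += 1

    def sample(self, batch_size):
        assert batch_size <= len(self.memory), "Not enough experiences in memory"
        return random.sample(self.memory, batch_size)

    def get_size(self):
        return len(self.memory)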
Example #3
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape

        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q

    def discretize(self, obs):
        # Requires self.obs_low and self.bin_width, which are not set in __init__ above
        return tuple(((obs - self.obs_low) / self.bin_width).astype(int))

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(obs).data.to(torch.device('cpu')).numpy())
        self.step_num += 1  # advance the epsilon decay schedule
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()
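A possible way to train this agent, assuming a classic Gym control task such as CartPole-v0 and the old 4-tuple env.step API; MAX_NUM_EPISODES and STEPS_PER_EPISODE are the globals the class already refers to, with example values chosen here:

import gym

MAX_NUM_EPISODES = 100000
STEPS_PER_EPISODE = 300

if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    agent = SwallowQLearner(env)
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        total_reward = 0.0
        for step in range(STEPS_PER_EPISODE):
            action = agent.get_action(obs)
            next_obs, reward, done, info = env.step(action)
            agent.learn(obs, action, reward, next_obs)  # one TD update per step
            obs = next_obs
            total_reward += reward
            if done:
                break
        print("Episode #{} ended with reward {}".format(episode, total_reward))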
Example #4
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape

        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)

        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q

        self.memory = ExperienceMemory(capacity=int(1e5))
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(obs).data.to(torch.device('cpu')).numpy())
        self.step_num += 1  # In the video we forgot to increment the step counter by one
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        """
        Vuelve a jugar usando la experiencia aleatoria almacenada
        :param batch_size: Tamaño de la muestra a tomar de la memoria
        :return: 
        """
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        """
        Actualiza la red neuronal profunda en base a lo aprendido en el conjunto de experiencias anteriores
        :param experiences: fragmento de recuerdos anteriores
        :return: 
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
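A possible driver loop for this replay-enabled version: each transition is stored in agent.memory and a random batch is replayed at the end of every episode. The environment name, the batch size, and the memory's store/get_size methods are assumptions (only sample appears in the example), matching the hypothetical ExperienceMemory sketched earlier:

import gym

MAX_NUM_EPISODES = 100000
STEPS_PER_EPISODE = 300
REPLAY_BATCH_SIZE = 32

if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    agent = SwallowQLearner(env)
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        for step in range(STEPS_PER_EPISODE):
            action = agent.get_action(obs)
            next_obs, reward, done, info = env.step(action)
            # Store the transition so it can be replayed later
            agent.memory.store(Experience(obs, action, reward, next_obs, done))
            obs = next_obs
            if done:
                break
        # Learn from a random batch of past transitions once enough are stored
        if agent.memory.get_size() > REPLAY_BATCH_SIZE:
            agent.replay_experience(REPLAY_BATCH_SIZE)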
Example #5
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.Q = SLP(self.obs_shape, self.action_shape, self.device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)

        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.cpu().numpy())
        self.step_num += 1  # advance the epsilon decay schedule
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        if str(self.device) == "cuda":
            td_target =  reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                    torch.max(self.Q(next_obs_batch).detach(),1)[0].data.tolist()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()

        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
Example #6
class ShallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape

        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)

        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q

        # Experience replay memory
        self.memory = ExperienceMemory(capacity=int(1e5))
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(obs).data.to(torch.device('cpu')).numpy())
        self.step_num += 1  # advance the epsilon decay schedule
        return action

    def learn(self, obs, action, reward, next_obs):

        # TD target: r + gamma * max_a Q(s', a)
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)

        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        """
        Vuelve a jugar usando la experiencia aleatoria almacenada
        :param batch_size: Tamano de la muestra a tomar de la memoria
        :return:
        """
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        """
        Actualiza la red neuronal profunda en base a lo aprendido en el conjunto de experiencias anteriores
        :param experiences: fragmento de recuerdos anteriores
        :return:
        """
        # el * pasa la referencia y no el valor
        #extraer las obs, acciones, recompensas, las siguientes obs, done
        batch_exp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_exp.obs)
        action_batch = np.array(batch_exp.action)
        reward_batch = np.array(batch_exp.reward)
        next_obs_batch = np.array(batch_exp.next_obs)
        done_batch = np.array(
            batch_exp.done)  # whether each experience ended the episode

        # Compute the temporal-difference target in vectorized form.
        # ~done_batch is 0 when done is True, which zeroes out the bootstrap term on the right
        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()  # converted to a NumPy array for the multiplication; reverted below with torch.from_numpy

        # Compute the mean squared error
        td_target = torch.from_numpy(
            td_target)  # convert from NumPy to a tensor
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1)
        )  # .long() converts action_idx.view(-1, 1) to the LongTensor index type that gather expects

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
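All of the examples above depend on an SLP network that is never shown; presumably a shallow perceptron mapping observations to one Q-value per action. Below is a minimal sketch consistent with how it is called (built from obs_shape/action_shape, optionally a device, and fed NumPy observations both singly and in batches); the hidden width is an arbitrary choice:

import numpy as np
import torch

class SLP(torch.nn.Module):
    def __init__(self, input_shape, output_shape, device=torch.device("cpu")):
        super(SLP, self).__init__()
        self.device = device
        self.input_size = input_shape[0]   # e.g. (4,) for CartPole observations
        self.hidden_size = 40              # hidden width is an assumption
        self.linear1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, output_shape)
        self.to(device)

    def forward(self, x):
        # Accept NumPy observations (single or batched), as the agents above pass them in
        x = torch.from_numpy(np.array(x, dtype=np.float32)).to(self.device)
        x = torch.nn.functional.relu(self.linear1(x))
        return self.out(x)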