Example #1
File: dqn_agent.py  Project: dbueno96/DQN
class DQNAgent(BaseAgent):
    #Initialize the parent class with the config object, a History,
    #a replay memory, and a neural network together with its architecture.
    def __init__(self, config):
        super(DQNAgent, self).__init__(config)
        self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(self.env_wrapper.action_space.n, config)
        self.net.build()
        self.net.add_summary(["average_reward", "average_loss", "average_q", "ep_max_reward", "ep_min_reward", "ep_num_game", "learning_rate"], ["ep_rewards", "ep_actions"])

    #Process an observation made by the agent.
    def observe(self):
        reward = max(self.min_reward, min(self.max_reward, self.env_wrapper.reward)) #Clip the reward for this state to lie between -1 and 1
        screen = self.env_wrapper.screen #Take the current screen from the wrapper
        self.history.add(screen) #Add the screen to the history
        self.replay_memory.add(screen, reward, self.env_wrapper.action, self.env_wrapper.terminal) #Add the transition to the replay memory
        if self.i < self.config.epsilon_decay_episodes: #Decay epsilon according to the number of timesteps elapsed
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start: #Update the network weights according to the values in the config object
            state, action, reward, state_, terminal = self.replay_memory.sample_batch() #Sample a batch from the replay memory
            q, loss = self.net.train_on_batch_target(state, action, reward, state_, terminal, self.i) #Train the network on the sampled batch
            self.total_q += q #Accumulate the q value
            self.total_loss += loss #Accumulate the loss
            self.update_count += 1 #Count this training update
        if self.i % self.config.update_freq == 0: #Check whether the target network needs to be updated
            self.net.update_target() #Update the target network

    #Define the policy under which the agent selects an action.
    def policy(self):
        if np.random.rand() < self.epsilon: #If a random number falls below epsilon
            return self.env_wrapper.random_step() #Take a random action
        else: #Otherwise
            state = self.history.get()/255.0
            a = self.net.q_action.eval({
                self.net.state : [state]
            }, session=self.net.sess)
            return a[0] #Return the action with the best expected future return according to the network

    #Run the agent's training process.
    def train(self, steps):
        render = False #Set the rendering flag to False
        self.env_wrapper.new_random_game() #Start a game and take it to a random state
        num_game, self.update_count, ep_reward = 0,0,0. #Initialize the counters to 0
        total_reward, self.total_loss, self.total_q = 0.,0.,0. #Initialize the accumulators to 0
        ep_rewards, actions = [], [] #Initialize empty lists
        t = 0

        for _ in range(self.config.history_len): #Iterate over the history length
            self.history.add(self.env_wrapper.screen) #Fill the history with copies of the first screen
        for self.i in tqdm(range(self.i, steps)): #Iterate over the training timesteps
            action = self.policy() #Select the agent's action according to the policy
            self.env_wrapper.act(action) #Execute the action in the environment
            self.observe() #Process the observation
            if self.env_wrapper.terminal: #If a terminal state is reached
                t = 0 #Reset the episode timestep counter
                self.env_wrapper.new_random_game() #Start a new game and take it to a random state
                num_game += 1 #Increment the number of games played
                ep_rewards.append(ep_reward) #Store the episode reward
                ep_reward = 0. #Reset the reward for the next episode
            else: #If the state is not terminal
                ep_reward += self.env_wrapper.reward #Accumulate the episode reward
                t += 1 #Count the elapsed timestep
            actions.append(action) #Store the executed action
            total_reward += self.env_wrapper.reward #Accumulate the total reward

            
            if self.i >= self.config.train_start:  #If more timesteps have elapsed than config.train_start
                if self.i % self.config.test_step == self.config.test_step -1: #Every test_step timesteps, log summaries
                    avg_reward = total_reward / self.config.test_step #Average reward over the test steps
                    avg_loss = self.total_loss / self.update_count #Average loss over the network updates
                    avg_q = self.total_q / self.update_count #Average q value over the network updates

                    try:
                        max_ep_reward = np.max(ep_rewards) #Maximum episode reward
                        min_ep_reward = np.min(ep_rewards) #Minimum episode reward
                        avg_ep_reward = np.mean(ep_rewards) #Average episode reward
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0 #If there were no finished episodes, fall back to 0

                    sum_dict = { #Build a dictionary with the summary information
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    self.net.inject_summary(sum_dict, self.i) #Inject the dictionary into the network to create a tf.Summary object
                    num_game = 0 #Reset the values accumulated for the summary
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

            if self.i % 500000 == 0 and self.i > 0: #Every 500,000 timesteps, save a checkpoint of the network
                j = 0
                self.save()
            if self.i % 100000 == 0: #Every 100,000 timesteps, render 1,000 steps of training
                j = 0
                render = True

            # if render:
            #     self.env_wrapper.env.render()
            #     j += 1
            #     if j == 1000:
            #         render = False


    #Play episodes using the trained neural network.
    def play(self, episodes, net_path):
        self.net.restore_session(path=net_path) #Load the network weights
        self.env_wrapper.new_game() #Start a game in a random state
        i = 0
        for _ in range(self.config.history_len): #Iterate over the history length
            self.history.add(self.env_wrapper.screen) #Fill the history with the first screens
        episode_steps = 0
        while i < episodes: 
            a = self.net.q_action.eval({
                self.net.state : [self.history.get()/255.0]
            }, session=self.net.sess) #Compute the action with the best expected return according to the network
            action = a[0] #Take the first element, which holds the chosen action
            self.env_wrapper.act_play(action) #Execute the action in the environment and check for lost lives
            self.history.add(self.env_wrapper.screen) #Store the new screen
            episode_steps += 1
            if episode_steps > self.config.max_steps: #If the maximum number of steps per episode is reached
                self.env_wrapper.terminal = True #Mark the episode as terminal
            if self.env_wrapper.terminal: #If the episode has ended
                episode_steps = 0  #Reset the step counter for the new episode
                i += 1 #Count the episode just played
                self.env_wrapper.new_play_game() #Start a new game in a random state
                for _ in range(self.config.history_len): #Iterate over the history length
                    screen = self.env_wrapper.screen
                    self.history.add(screen) #Store the screen in the history
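
A minimal usage sketch for this example, assuming the project's Config class exposes the fields read above (train_start, train_freq, history_len, test_step, ...); the import path, the step count, and the checkpoint path below are placeholders rather than the project's actual values:

    from config import Config  # hypothetical import path

    config = Config()                   # hypothetical config carrying the fields used above
    agent = DQNAgent(config)            # builds the network, history, and replay memory
    agent.train(steps=5000000)          # run the training loop for 5M timesteps
    agent.play(episodes=10, net_path='./checkpoints')  # placeholder checkpoint path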
Example #2
class DQNAgent(BaseAgent):

    def __init__(self, config):
        super(DQNAgent, self).__init__(config)
        self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(self.env_wrapper.action_space.n, config)
        self.net.build()
        self.net.add_summary(["average_reward", "average_loss", "average_q", "ep_max_reward", "ep_min_reward", "ep_num_game", "learning_rate"], ["ep_rewards", "ep_actions"])

    def observe(self):
        reward = max(self.min_reward, min(self.max_reward, self.env_wrapper.reward))
        screen = self.env_wrapper.screen
        self.history.add(screen)
        self.replay_memory.add(screen, reward, self.env_wrapper.action, self.env_wrapper.terminal)
        if self.i < self.config.epsilon_decay_episodes:
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start:
            state, action, reward, state_, terminal = self.replay_memory.sample_batch()
            q, loss= self.net.train_on_batch_target(state, action, reward, state_, terminal, self.i)
            self.total_q += q
            self.total_loss += loss
            self.update_count += 1
        if self.i % self.config.update_freq == 0:
            self.net.update_target()

    def policy(self):
        if np.random.rand() < self.epsilon:
            return self.env_wrapper.random_step()
        else:
            state = self.history.get()/255.0
            a = self.net.q_action.eval({
                self.net.state : [state]
            }, session=self.net.sess)
            return a[0]


    def train(self, steps):
        render = False
        self.env_wrapper.new_random_game()
        num_game, self.update_count, ep_reward = 0,0,0.
        total_reward, self.total_loss, self.total_q = 0.,0.,0.
        ep_rewards, actions = [], []
        t = 0

        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.screen)
        for self.i in tqdm(range(self.i, steps)):
            action = self.policy()
            self.env_wrapper.act(action)
            self.observe()
            if self.env_wrapper.terminal:
                t = 0
                self.env_wrapper.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += self.env_wrapper.reward
                t += 1
            actions.append(action)
            total_reward += self.env_wrapper.reward

            if self.i >= self.config.train_start:
                if self.i % self.config.test_step == self.config.test_step -1:
                    avg_reward = total_reward / self.config.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    sum_dict = {
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    self.net.inject_summary(sum_dict, self.i)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

            if self.i % 500000 == 0 and self.i > 0:
                j = 0
                self.save()
            if self.i % 100000 == 0:
                j = 0
                render = True

            if render:
                self.env_wrapper.env.render()
                j += 1
                if j == 1000:
                    render = False

    def play(self, episodes, net_path):
        self.net.restore_session(path=net_path)
        self.env_wrapper.new_game()
        i = 0
        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.screen)
        episode_steps = 0
        while i < episodes:
            a = self.net.q_action.eval({
                self.net.state : [self.history.get()/255.0]
            }, session=self.net.sess)
            action = a[0]
            self.env_wrapper.act_play(action)
            self.history.add(self.env_wrapper.screen)
            episode_steps += 1
            if episode_steps > self.config.max_steps:
                self.env_wrapper.terminal = True
            if self.env_wrapper.terminal:
                episode_steps = 0
                i += 1
                self.env_wrapper.new_play_game()
                for _ in range(self.config.history_len):
                    screen = self.env_wrapper.screen
                    self.history.add(screen)
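
The policy() method in these examples implements epsilon-greedy action selection over the network's Q-values. Below is a stand-alone sketch of the same rule; the Q-values and the number of actions are invented for the illustration and do not come from the project:

    import numpy as np

    def epsilon_greedy(q_values, epsilon, n_actions, rng=np.random):
        # With probability epsilon, explore with a uniformly random action;
        # otherwise exploit the greedy (argmax) action, as policy() does.
        if rng.rand() < epsilon:
            return rng.randint(n_actions)
        return int(np.argmax(q_values))

    q_values = np.random.randn(4)   # fake Q-values for 4 actions
    print(epsilon_greedy(q_values, epsilon=0.1, n_actions=4))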
Example #3
class DQNAgent(BaseAgent):
    def __init__(self, config):
        super(DQNAgent, self).__init__(config)
        self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(4, config)
        self.net.build()
        self.net.add_summary([
            "average_reward", "average_loss", "average_q", "ep_max_reward",
            "ep_avg_reward", "ep_min_reward", "ep_num_game", "learning_rate"
        ], ["ep_rewards", "ep_actions"])

    def observe(self):
        reward = max(self.min_reward,
                     min(self.max_reward, self.env_wrapper.reward))
        color = self.env_wrapper.color
        self.history.add(color)
        self.replay_memory.add(color, reward, self.env_wrapper.action,
                               self.env_wrapper.terminal)
        if self.i < self.config.epsilon_decay_episodes:
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start:
            #print('----> i',self.i)
            ### Training starts only after train_start=20K steps, mem_size is 800K, train_freq is 8;
            ### I guess that's why it's okay not to worry about sampling with repetitions
            state, action, reward, state_, terminal = self.replay_memory.sample_batch(
            )
            q, loss = self.net.train_on_batch_target(state, action, reward,
                                                     state_, terminal, self.i)
            ### self.i is passed to implement lr decay
            self.total_q += q
            self.total_loss += loss
            self.update_count += 1
        if self.i % self.config.update_freq == 0:
            self.net.update_target()

    def policy(self):
        if np.random.rand() < self.epsilon:
            return self.env_wrapper.random_step()
        else:
            state = self.history.get()
            a = self.net.q_action.eval({self.net.state: [state]},
                                       session=self.net.sess)
            return a[0]

    def train(self, steps):
        f = open('dqn2.txt', 'w')
        render = False
        self.env_wrapper.new_random_game()
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []
        t = 0
        print(self.net.number_of_trainable_parameters())

        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.color)
            ### So, the first state is just the first color, repeated a number of times
        for self.i in tqdm(range(self.i, steps)):
            #take action, observe
            action = self.policy()
            self.env_wrapper.act(action)
            self.observe()
            if self.env_wrapper.terminal:
                t = 0
                self.env_wrapper.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += self.env_wrapper.reward
                t += 1
            actions.append(action)
            total_reward += self.env_wrapper.reward
            #print(self.i,action,total_reward, self.env_wrapper.terminal)
            #total_reward, max_ep_reward, min_ep_reward, avg_ep_reward keep track of reward earned every self.config.test_step=5000 steps
            if self.i >= self.config.train_start:
                if self.i % self.config.test_step == self.config.test_step - 1:
                    avg_reward = total_reward / self.config.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    sum_dict = {
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'ep_avg_reward': avg_ep_reward,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    self.net.inject_summary(sum_dict, self.i)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
                    f.write(str(avg_ep_reward))
            if self.i % 50000 == 0 and self.i > 0:
                j = 0
                print('saving..')
                self.save()
                #play_score = self.play(episodes=self.config.num_episodes_for_play_scores_summary, net_path=self.net.dir_model)
                #self.net.inject_summary({'play_score':play_score}, self.i)
            if self.i % 100000 == 0:
                j = 0
                render = True

            if render:
                #self.env_wrapper.env.render()
                j += 1
                if j == 1000:
                    render = False
        f.close()

    def play(self, episodes, net_path, verbose=False, print_average=True):
        d = []
        self.net.restore_session(path=net_path)
        self.env_wrapper.new_game()
        i = 0
        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.color)
        episode_steps = 0
        ###EDIT (LJ): added rewards calculation
        episode_reward = 0
        actions_list = []
        while i < episodes:
            #Choose action:
            a = self.net.q_action.eval({self.net.state: [self.history.get()]},
                                       session=self.net.sess)
            action = a[0]
            actions_list.append(action)
            #Take Action
            self.env_wrapper.act_play(action)
            self.history.add(self.env_wrapper.color)
            episode_steps += 1
            episode_reward += self.env_wrapper.reward
            if episode_steps > self.config.max_steps:
                self.env_wrapper.terminal = True
            if self.env_wrapper.terminal:
                if verbose:
                    print('episode terminated in ' + str(episode_steps) +
                          ' steps with reward ' + str(episode_reward))
                    print('ACTIONS TAKEN:')
                    print(actions_list)
                actions_list = []
                d.append(episode_reward)
                episode_steps = 0
                episode_reward = 0
                i += 1
                self.env_wrapper.new_play_game()
                for _ in range(self.config.history_len):
                    color = self.env_wrapper.color
                    self.history.add(color)
        if verbose:
            print('ALL, AVERAGE:', [d, sum(d) / len(d)])
        if print_average:
            print('AVERAGE:', sum(d) / len(d))
        return sum(d) / len(d)
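
All three examples clip the raw environment reward at the top of observe() with max(self.min_reward, min(self.max_reward, reward)). The sketch below shows the equivalent operation with np.clip; the [-1, 1] bounds are the values commonly used for DQN and are assumed here, since min_reward and max_reward actually come from the agent's configuration:

    import numpy as np

    def clip_reward(reward, min_reward=-1.0, max_reward=1.0):
        # Equivalent to max(min_reward, min(max_reward, reward)) in observe().
        return float(np.clip(reward, min_reward, max_reward))

    assert clip_reward(7.0) == 1.0     # large positive rewards are capped at 1
    assert clip_reward(-3.5) == -1.0   # large negative rewards are capped at -1
    assert clip_reward(0.25) == 0.25   # rewards inside the range pass through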