Example #1
import numpy as np
import pygame
from random import randint
from keras.utils import to_categorical  # tensorflow.keras.utils on newer installs

# DQNAgent, Memory, Game, initialize_game, display, game_settings, get_record
# and plot_seaborn are assumed to be defined elsewhere in the surrounding project.


class ExperienceReplayDQNAgent(DQNAgent):
    def __init__(self):
        super().__init__()
        # Replay memory holding up to 1000 transitions
        self.memory = Memory(1000)

    def remember(self, state, action, reward, next_state, done):
        # Store the transition as a single tuple in the replay memory
        self.memory.store((state, action, reward, next_state, done))

    def replay_new(self, memory):
        # Sample a prioritized minibatch; the importance-sampling weights
        # returned by the memory are not used by this agent.
        idx, minibatch, ISWeights = memory.sample(1000)
        abs_errors = []
        for sample in minibatch:
            # Each row of the batch wraps the stored transition tuple.
            state, action, reward, next_state, done = sample[0]
            target = reward
            if not done:
                # Bellman target: reward plus the discounted best Q-value
                # of the next state.
                target = reward + self.gamma * np.amax(
                    self.model.predict(np.array([next_state]))[0])
            target_f = self.model.predict(np.array([state]))
            # Absolute TD error at the chosen action, used to update priorities.
            abs_errors.append(np.abs(target_f[0][np.argmax(action)] - target))
            target_f[0][np.argmax(action)] = target
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
        # Update the priorities of all sampled transitions in a single call.
        memory.batch_update(idx, np.array(abs_errors))

    def train_short_memory(self, state, action, reward, next_state, done):
        # Online update on the single transition that was just observed.
        target = reward
        if not done:
            target = reward + self.gamma * np.amax(
                self.model.predict(next_state.reshape((1, 11)))[0])
        # Only the Q-value of the action actually taken is moved towards the target.
        target_f = self.model.predict(state.reshape((1, 11)))
        target_f[0][np.argmax(action)] = target
        self.model.fit(state.reshape((1, 11)), target_f, epochs=1, verbose=0)

    def run(self, mode_file):
        pygame.init()
        counter_games = 0
        score_plot = []
        counter_plot = []
        record = 0
        while counter_games < 200:
            # Initialize classes
            game = Game(440, 440, mode_file)
            player1 = game.player
            food1 = game.food

            # Perform first move
            initialize_game(player1, game, food1, self)
            if game_settings['display_option']:
                display(player1, food1, game, record)

            while not game.crash:
                # agent.epsilon is set to give randomness to actions
                self.epsilon = 80 - counter_games

                # get old state
                state_old = self.get_state(game, player1, food1)

                # perform random actions based on agent.epsilon, or choose the action
                if randint(0, 200) < self.epsilon:
                    final_move = to_categorical(randint(0, 2), num_classes=3)
                else:
                    # predict action based on the old state
                    prediction = self.model.predict(state_old.reshape((1, 11)))
                    final_move = to_categorical(np.argmax(prediction[0]),
                                                num_classes=3)

                # perform new move and get new state
                player1.do_move(final_move, player1.x, player1.y, game, food1,
                                self)
                state_new = self.get_state(game, player1, food1)

                # set reward for the new state
                reward = self.set_reward(player1, game.crash)

                # train short-term memory based on the new action and state
                self.train_short_memory(state_old, final_move, reward,
                                        state_new, game.crash)

                # store the new data in the long-term replay memory
                self.remember(state_old, final_move, reward, state_new,
                              game.crash)
                record = get_record(game.score, record)
                if game_settings['display_option']:
                    display(player1, food1, game, record)
                    pygame.time.wait(game_settings['speed'])

            self.replay_new(self.memory)
            counter_games += 1
            print('Game', counter_games, '      Score:', game.score)
            score_plot.append(game.score)
            counter_plot.append(counter_games)
        self.model.save_weights('weights.hdf5')
        # from google.colab import files
        # files.download("weights.hdf5")
        plot_seaborn(counter_plot, score_plot)
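
The replay_new method above relies on a prioritized-replay Memory object that is not part of this example; only its interface can be inferred from the calls made here: store() appends a transition, sample(n) returns sampled indices, a batch whose rows wrap the stored tuples, and importance-sampling weights, and batch_update() re-prioritizes the sampled transitions from their absolute TD errors. The following is a minimal, list-based sketch of such an interface under a proportional-prioritization assumption; the internals and the alpha / per_epsilon parameters are illustrative, not taken from the original project.

import numpy as np


class Memory:
    """Simplified stand-in for a prioritized experience-replay buffer."""

    def __init__(self, capacity, alpha=0.6, per_epsilon=0.01):
        self.capacity = capacity
        self.alpha = alpha              # how strongly priorities bias sampling
        self.per_epsilon = per_epsilon  # keeps every priority strictly positive
        self.data = []
        self.priorities = []

    def store(self, transition):
        # New transitions get the current maximum priority so that they are
        # sampled at least once before their TD error is known.
        max_p = max(self.priorities) if self.priorities else 1.0
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(transition)
        self.priorities.append(max_p)

    def sample(self, n):
        n = min(n, len(self.data))
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), n, p=probs, replace=False)
        # Each batch row wraps the stored tuple, matching the sample[0]
        # indexing used in replay_new above.
        batch = [[self.data[i]] for i in idx]
        is_weights = (len(self.data) * probs[idx]) ** -1.0
        is_weights /= is_weights.max()
        return idx, batch, is_weights

    def batch_update(self, idx, abs_errors):
        # Priorities are proportional to the absolute TD errors.
        for i, err in zip(idx, np.atleast_1d(abs_errors)):
            self.priorities[i] = float(err) + self.per_epsilon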
Example #2
import copy
import time

# CNN, Agent, Memory, BreakoutWrapper and the params manager are assumed to be
# project-specific classes imported from the surrounding code base.


class BreakOutPlayer:
    def __init__(self, paramsManager):
        self.paramsManager = paramsManager
        # Replay memory split into "good" and "bad" transitions, sized from the config
        self.memory = Memory(
            self.paramsManager.get_params()["agent"]["GOOD_MEMORIES_SIZE"],
            self.paramsManager.get_params()["agent"]["BAD_MEMORIES_SIZE"],
            self.paramsManager.get_params()["agent"]["MINI_BATCH_SIZE"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_WIDTH"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_HEIGHT"],
            self.paramsManager.get_params()["environment"]
            ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"])
        print("[i] Creating main convolutional neural network")
        self.main_cnn = CNN()
        print("[i] Creating target convolutional neural network")
        self.target_cnn = copy.deepcopy(self.main_cnn)
        print("[!] Creating the agent")
        self.main_cnn.cuda()
        self.target_cnn.cuda()
        self.agent = Agent(
            self.main_cnn, self.target_cnn,
            self.paramsManager.get_params()["agent"]["EPSILON_MAX"],
            self.paramsManager.get_params()["agent"]
            ["NUMBER_OF_FRAMES_WITH_CONSTANT_EPSILON"],
            self.paramsManager.get_params()["agent"]["FIRST_EPSILON_DECAY"],
            self.paramsManager.get_params()["agent"]
            ["FRAMES_TO_FIRST_EPSILON_DECAY"],
            self.paramsManager.get_params()["agent"]["FINAL_EPSILON_VALUE"],
            self.paramsManager.get_params()["agent"]
            ["FRAMES_TO_FINAL_EPSILON"],
            self.paramsManager.get_params()["agent"]
            ["EXPLORATION_PROBABILITY_DURING_EVALUATION"],
            self.paramsManager.get_params()["agent"]["LEARNING_RATE"])
        self.breakout_wrapper = BreakoutWrapper(
            self.paramsManager.get_params()["environment"]["NAME"],
            self.paramsManager.get_params()["agent"]["NO_OP_STEPS"],
            self.paramsManager.get_params()["environment"]
            ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_WIDTH"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_HEIGHT"],
            self.paramsManager.get_params()["environment"]["RENDER"])

    def train(self):
        frame_number = 0
        rewards = []
        # Stores the mean rewards of each epoch
        epochs_means = []
        # While we are training
        while frame_number < self.paramsManager.get_params(
        )["agent"]["MAX_FRAMES"]:
            #########################
            ####### TRAINING ########
            #########################
            # Epoch counter
            epoch_counter = 0
            # Stores the epoch rewards
            epoch_rewards = []
            # While we aren't in the evaluation phase yet
            while epoch_counter < self.paramsManager.get_params(
            )["agent"]["EVAL_FREQUENCY"]:
                # Resetting the env
                done_life_lost = self.breakout_wrapper.reset(evaluation=False)
                # Other params
                total_episode_reward = 0
                current_ale_lives = 5
                perform_fire = True
                for i in range(self.paramsManager.get_params()["agent"]
                               ["MAX_EPISODE_LENGTH"]):
                    # Print the separator defined in the JSON config
                    print(self.paramsManager.get_params()["environment"]
                          ["SEPARATOR"])
                    # If it's necessary to FIRE (at the start or after losing a life)
                    if perform_fire:
                        chosen_action = 1
                    else:
                        chosen_action = self.agent.get_action(
                            frame_number,
                            self.breakout_wrapper.actual_state,
                            evaluation=False)
                    # We take the step. A dying penalty is added by the breakout_wrapper
                    processed_new_frame, reward, done, done_life_lost, _, info = self.breakout_wrapper.step(
                        chosen_action,
                        self.paramsManager.get_params()["agent"]
                        ["DYING_REWARD"], current_ale_lives)
                    print("[i] Action performed: ", chosen_action,
                          ". Reward: ", reward, ".Frame number: ",
                          frame_number)
                    # If we already have rewards:
                    if len(rewards) != 0:
                        print("[i] Mean Training Reward: %.3f" %
                              (sum(rewards) / len(rewards)))
                    if len(epoch_rewards) != 0:
                        print("[i] Mean Epoch Reward: %.3f" %
                              (sum(epoch_rewards) / len(epoch_rewards)))
                    frame_number += 1
                    epoch_counter += 1
                    total_episode_reward += reward
                    if self.paramsManager.get_params()["agent"]["CLIP_REWARD"]:
                        self.memory.store(processed_new_frame, chosen_action,
                                          self.clip_reward(reward),
                                          done_life_lost)
                    else:
                        self.memory.store(processed_new_frame, chosen_action,
                                          reward, done_life_lost)
                    # If it's time to learn from the replay memory
                    if frame_number % self.paramsManager.get_params()["agent"][
                            "UPDATE_FREQUENCY"] == 0 and frame_number > self.paramsManager.get_params(
                            )["agent"]["REPLAY_MEMORY_START_FRAME"]:
                        losses = self.agent.learn(
                            self.memory,
                            self.paramsManager.get_params()["agent"]["GAMMA"],
                            self.paramsManager.get_params()["agent"]
                            ["MINI_BATCH_SIZE"])
                    if frame_number % self.paramsManager.get_params()["agent"][
                            "NETWORK_UPDATE_FREQ"] == 0 and frame_number > self.paramsManager.get_params(
                            )["agent"]["REPLAY_MEMORY_START_FRAME"]:
                        self.agent.updateNetworks()
                    if info["ale.lives"] < current_ale_lives:
                        perform_fire = True
                        current_ale_lives = info["ale.lives"]
                    elif info["ale.lives"] == current_ale_lives:
                        perform_fire = False
                    if done:
                        done = False
                        perform_fire = True
                        break
                rewards.append(total_episode_reward)
                epoch_rewards.append(total_episode_reward)
            #########################
            ####### SAVE INFO #######
            #########################
            epochs_means.append(sum(epoch_rewards) / len(epoch_rewards))
            print("============ EPOCH %d FINISHED ============" %
                  len(epochs_means))
            # Rewrite the results file with the mean reward of every epoch so far
            with open("results.txt", "w") as results_file:
                for idx, mean in enumerate(epochs_means):
                    print("Epoch number: %d. Mean reward: %.3f" % (idx, mean))
                    results_file.write("Epoch number: %d. Mean reward: %.3f\n" %
                                       (idx, mean))
            time.sleep(10)

    def clip_reward(self, r):
        # Clip the reward to its sign: +1, 0 or -1
        if r > 0:
            return 1
        elif r == 0:
            return 0
        else:
            return -1
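
A hedged usage sketch for Example #2: the only requirement the class places on its constructor argument is an object exposing get_params() that returns the nested "agent" and "environment" dictionaries used above. The ParamsManager class name and the parameters.json path below are assumptions for illustration, not taken from the original example.

if __name__ == "__main__":
    # Hypothetical: build the params manager from a JSON config file; the real
    # project may construct it differently.
    paramsManager = ParamsManager("parameters.json")
    player = BreakOutPlayer(paramsManager)
    # Trains until MAX_FRAMES is reached, writing per-epoch mean rewards to results.txt.
    player.train()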