Example #1
import random

# Agent, RogueBox, UIManager, Logger, Log and the LOG_LEVEL_* constants are
# assumed to come from the surrounding project's modules.


class RandomAgent(Agent):
    def __init__(self, configs):
        self.rb = RogueBox(configs)
        self._pending_action_timer = None
        self.ui = UIManager.init(configs["userinterface"], self.rb)
        self.l = Logger(log_depth=configs["verbose"],
                        log_targets=["file", "ui"],
                        ui=self.ui)
        self.ui.on_key_press(self._keypress_callback)
        self._timer_value = 100
        self._pending_action_timer = self.ui.on_timer_end(
            self._timer_value, self._act_callback)

    def run(self):
        self.ui.start_ui()

    def act(self):
        actions = self.rb.get_actions()
        action = random.choice(actions)
        logs = [
            Log("random_action_time",
                "Action ({}) time".format(action),
                LOG_LEVEL_MORE,
                mean=10)
        ]
        self.l.start_log_timer(logs)
        reward, _, __ = self.rb.send_command(action)
        self.l.stop_log_timer(logs)
        logs = [
            Log("action_state",
                "My previous state: \n {}".format(self.ui.read_rogue()),
                LOG_LEVEL_ALL),
            Log("chosen_action",
                "My chosen action: {} got reward: {}".format(action, reward),
                LOG_LEVEL_MORE),
        ]
        self.l.log(logs)
        return reward

    def _keypress_callback(self, event):
        if event.char == 'q' or event.char == 'Q':
            self.rb.quit_the_game()
            exit()
        elif event.char == 'r' or event.char == 'R':
            # we need to stop the agent from acting
            # or it will try to write to a closed pipe
            self.ui.cancel_timer(self._pending_action_timer)
            self.rb.reset()
            self._pending_action_timer = self.ui.on_timer_end(
                self._timer_value, self._act_callback)

    def _act_callback(self):
        reward = self.act()
        self.ui.draw_from_rogue()
        if not self.rb.game_over():
            # renew the callback
            self._pending_action_timer = self.ui.on_timer_end(
                self._timer_value, self._act_callback)
        else:
            self.ui.cancel_timer(self._pending_action_timer)
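
A minimal sketch of how RandomAgent might be launched, assuming a configs dict that carries the keys read above ("userinterface", "verbose") plus whatever RogueBox expects; the concrete values below are illustrative only:

configs = {
    "userinterface": "tk",  # illustrative UI backend name
    "verbose": 1,           # illustrative log depth
}
agent = RandomAgent(configs)
agent.run()  # starts the UI loop; the agent then acts on every timer tick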
Example #2
import csv
import os
import random

import matplotlib.pyplot as plt
import numpy as np

# LearnerAgent, RogueBox, UIManager, Logger, Log and the LOG_LEVEL_* constants
# are assumed to come from the surrounding project's modules.


class QLearnerAgent(LearnerAgent):
    def __init__(self, configs):
        import models, history

        # class instances
        self.rb = RogueBox(configs)
        self.model_manager = getattr(models, configs["model_manager"])(self.rb)
        self.history_manager = getattr(history,
                                       configs["history_manager"])(self)
        # configs
        self.configs = configs
        self.configs["iteration"] = 1
        self.configs["actions"] = self.rb.get_actions()
        self.configs["actions_num"] = len(self.configs["actions"])
        # gui stuff
        ui = None
        log_targets = []
        if configs["logsonfile"]:
            log_targets.append("file")
        if self.configs["gui"]:
            self.ui = UIManager.init(configs["userinterface"], self.rb)
            self._pending_action = None
            ui = self.ui
            log_targets.append("ui")
            self.l = Logger(log_depth=configs["verbose"],
                            log_targets=log_targets,
                            ui=ui)
        else:
            log_targets.append("terminal")
            self.l = Logger(log_depth=configs["verbose"],
                            log_targets=log_targets)
        # state
        self.state = self.model_manager.reshape_initial_state(
            self.rb.compute_state())
        self.old_state = self.state
        # model
        self.model = self.model_manager.build_model()
        self.target_model = self.model_manager.build_model()
        self.target_model.set_weights(self.model.get_weights())
        # resume from file
        # load weights, transitions history and parameters from assets, if any
        self._load_progress()

    def _load_progress(self):
        # model weights
        if os.path.isfile("assets/weights.h5"):
            print("loading weights...")
            self.model.load_weights("assets/weights.h5")
            self.target_model.set_weights(self.model.get_weights())
            print("weights loaded!")

        # transitions history
        if self.configs["save_history"]:
            self.history_manager.load_history_from_file("assets/history.pkl")

        # parameters
        # only float parameters can be loaded this way for now
        if os.path.isfile("assets/parameters.csv"):
            print("loading parameters...")
            with open("assets/parameters.csv") as parameters:
                reader = csv.reader(parameters)
                for row in reader:
                    try:
                        # try conversion from string
                        self.configs[row[0]] = float(row[1])
                    except ValueError:
                        print("the parameter", row[0],
                              " is not a float castable value")
            print("parameters loaded!")

    def _save_progress(self):
        print("saving...")
        if not os.path.exists("assets"):
            os.makedirs("assets")

        print("saving weights...")
        self.model.save_weights("assets/weights.h5", overwrite=True)

        if self.configs["save_history"]:
            self.history_manager.save_history_on_file("assets/history.pkl")

        print("saving parameters...")
        with open("assets/parameters.csv", "w") as parameters:
            writer = csv.writer(parameters)
            writer.writerow(["epsilon", self.configs["epsilon"]])
            writer.writerow(["iteration", self.configs["iteration"]])
        print("done saving!")

    def _reinit(self):
        self.state = self.model_manager.reshape_initial_state(
            self.rb.compute_state())
        self.old_state = self.state

    def predict(self):
        """Return the index of the action to take,
        chosen epsilon-greedily from the model's Q-values."""
        # choose an action epsilon-greedily
        if random.random() <= self.configs["epsilon"]:
            action_index = random.randrange(self.configs["actions_num"])
        else:
            q = self.model.predict(self.state)
            logs = [
                Log("actions_array", "This is the action array: {}".format(q),
                    LOG_LEVEL_MORE)
            ]
            actions = self.configs["actions"]
            if self.configs["only_legal_actions"]:
                legal_actions = self.rb.get_legal_actions()
                for action in actions:
                    if action not in legal_actions:
                        q[(0, actions.index(action))] = -np.inf
            logs += [
                Log("legal_actions_array",
                    "This is the legal action array: {}".format(q),
                    LOG_LEVEL_MORE)
            ]
            self.l.log(logs)
            action_index = np.argmax(q)
        return action_index

    def act(self, action_index):
        action = self.configs["actions"][action_index]
        reward, new_state, terminal = self.rb.send_command(action)
        logs = [
            Log("action_reward",
                "Sent action: {} got reward: {}".format(action, reward),
                LOG_LEVEL_MORE)
        ]
        self.l.log(logs)
        self.old_state = self.state
        self.state = self.model_manager.reshape_new_state(
            self.old_state, new_state)
        return reward, terminal

    def observe(self):
        timer_log = [
            Log("Observe_time", "Ten observe done", LOG_LEVEL_MORE, mean=10)
        ]
        self.l.start_log_timer(timer_log)
        minibatch = self.history_manager.pick_batch(self.configs["batchsize"])
        inputs = np.zeros((self.configs["batchsize"], ) + self.state.shape[1:])
        targets = np.zeros(
            (self.configs["batchsize"], self.configs["actions_num"]))

        # Now we do the experience replay
        for i in range(self.configs["batchsize"]):
            old_state = minibatch[i][0]
            action_index = minibatch[i][1]
            reward = minibatch[i][2]
            new_state = minibatch[i][3]
            terminal = minibatch[i][4]

            inputs[i] = old_state
            targets[i] = self.model.predict(old_state)

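            # Bellman target: y = r for terminal transitions, otherwise
            # y = r + gamma * max_a' Q_target(s', a')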
            if terminal:
                targets[i, action_index] = reward
            else:
                Q_new_state = self.target_model.predict(new_state)
                targets[i, action_index] = reward + self.configs[
                    "gamma"] * np.max(Q_new_state)

        loss = self.model.train_on_batch(inputs, targets)
        loss_log = [
            Log("loss_value", "Loss for this iteration: {}".format(loss),
                LOG_LEVEL_SOME)
        ]
        self.l.log(loss_log)
        self.l.stop_log_timer(timer_log)
        return loss

    def plot(self, frame):
        # WARNING: works only with 3-layer states
        # makes sense only with non-reshaped, memoryless states
        # generate heatmap
        heatmap_start = [
            Log(
                "heatmap_start",
                "Generating heatmap for iteration {} ...".format(
                    self.configs["iteration"]), LOG_LEVEL_SOME)
        ]
        self.l.log(heatmap_start)
        heatmap_time = [
            Log("heatmap_time",
                "Generating heatmap took",
                LOG_LEVEL_MORE,
                mean=1)
        ]
        self.l.start_log_timer(heatmap_time)

        heatmap = np.zeros((22, 80))
        best_actions = np.full((22, 80), -1)
        passable_pos = np.argwhere(frame[0] == 255)

        for i, j in passable_pos:
            player_layer = np.zeros((22, 80))
            player_layer[i][j] = 255
            temp = np.stack((frame[0], player_layer, frame[2]))
            q = self.model.predict(
                self.model_manager.reshape_new_state(temp, temp))
            heatmap[i][j] = q.max()
            best_actions[i][j] = q[0].argmax()

        heatmap = np.ma.masked_where(heatmap == 0, heatmap)
        mn = heatmap.min()
        mx = heatmap.max()
        heatmap_start = [
            Log("minmax", "heatmap min: {} max: {}".format(mn, mx),
                LOG_LEVEL_SOME)
        ]
        self.l.log(heatmap_start)

        arrows = ['←', '↓', '↑', '→']

        cmap = plt.cm.hot_r
        cmap.set_bad(color="green")
        fig, ax = plt.subplots(figsize=(11, 5))
        ax.imshow(heatmap,
                  cmap=cmap,
                  interpolation='nearest',
                  vmin=mn,
                  vmax=mx)
        for i, j in passable_pos:
            ax.text(j,
                    i,
                    '%s' % arrows[best_actions[i][j]],
                    ha='center',
                    va='center')
        # make sure the output directory exists before saving the figure
        os.makedirs("plots", exist_ok=True)
        fig.savefig("plots/heatmap-iteration-%s.png" %
                    self.configs["iteration"])

    def train(self):
        if self.configs["gui"]:
            self._pending_action = self.ui.on_timer_end(
                100, lambda: self._train_callback(1))
            self.ui.on_key_press(self._train_key_callback)
            self.ui.start_ui()
        else:
            while True:
                self._train_step(self.configs["iteration"])
                self.configs["iteration"] += 1

    def _train_step(self, iteration):
        action_index = self.predict()
        self._train_evaluation_hook_before_action()
        reward, terminal = self.act(action_index)
        self._train_evaluation_hook_after_action()
        item_added = self.history_manager.update_history(
            action_index, reward, terminal)
        if iteration % 10 == 0:
            log_iteration = [
                Log("iteration",
                    "Iteration number: {}".format(self.configs["iteration"]),
                    LOG_LEVEL_SOME)
            ]
            log_iteration += [
                Log("hist",
                    "History size: {}".format(self.history_manager.hist_len()),
                    LOG_LEVEL_SOME)
            ]
            self.l.log(log_iteration)
        # Begin training only when we have enough history
        if (self.history_manager.hist_len() >= self.configs["minhist"]
                and item_added):
            self.observe()
            # anneal epsilon
            if self.configs["epsilon"] > self.configs["final_epsilon"]:
                self.configs["epsilon"] -= (self.configs["initial_epsilon"] - self.configs["final_epsilon"]) / \
                                              self.configs["explore_steps"]
            logs = [
                Log("epsilon", "{}".format(self.configs["epsilon"]),
                    LOG_LEVEL_ALL)
            ]
            self.l.log(logs)
            if iteration % 100000 == 0:
                self._save_progress()
                # plotting is disabled because it's not compatible with every state
                # uncomment the next line if needed
                # self.plot(self.state[0])
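            # periodically sync the target network with the online model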
            if iteration % 10000 == 0:
                self.target_model.set_weights(self.model.get_weights())
        if terminal:
            self._train_evaluation_hook_game_over()
            self.rb.reset()
            self._reinit()

    def run(self):
        # don't act randomly
        self.configs["epsilon"] = 0
        if self.configs["gui"]:
            self.ui.on_key_press(self._play_key_callback)
            self._pending_action = self.ui.on_timer_end(
                100, self._run_callback)
            self.ui.start_ui()
        else:
            terminal = False
            while not terminal:
                terminal = self._run_step()

    def _run_step(self):
        action_index = self.predict()
        reward, terminal = self.act(action_index)
        return terminal

    def _train_key_callback(self, event):
        """Callback for keys pressed during learning"""
        if event.char == 'q' or event.char == 'Q':
            self.rb.quit_the_game()
            exit()

    def _play_key_callback(self, event):
        """Callback for keys pressed during playing"""
        if event.char == 'q' or event.char == 'Q':
            self.rb.quit_the_game()
            exit()
        elif event.char == 'r' or event.char == 'R':
            # we need to stop the agent from acting
            # or it will try to write to a closed pipe
            self.ui.cancel_timer(self._pending_action)
            self.rb.reset()
            self._reinit()
            self._pending_action = self.ui.on_timer_end(
                100, self._run_callback)

    def _train_callback(self, iteration):
        self._train_step(iteration)
        self.ui.draw_from_rogue()
        self.configs["iteration"] += 1
        self._pending_action = self.ui.on_timer_end(
            self.configs["gui_delay"],
            lambda: self._train_callback(self.configs["iteration"]))

    def _run_callback(self):
        self._run_step()
        self.ui.draw_from_rogue()
        if not self.rb.game_over():
            # renew the callback
            self._pending_action = self.ui.on_timer_end(
                self.configs["gui_delay"], self._run_callback)
        else:
            self.ui.cancel_timer(self._pending_action)

    # evaluation hooks
    def _train_evaluation_hook_before_action(self):
        pass

    def _train_evaluation_hook_after_action(self):
        pass

    def _train_evaluation_hook_game_over(self):
        pass
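
A minimal sketch of how QLearnerAgent might be driven, assuming a configs dict with the keys the class reads above; every value (and both manager class names) is illustrative rather than taken from the project:

configs = {
    "model_manager": "SomeModelManager",      # hypothetical class in models.py
    "history_manager": "SomeHistoryManager",  # hypothetical class in history.py
    "userinterface": "tk",                    # illustrative UI backend name
    "gui": False,
    "gui_delay": 100,
    "logsonfile": True,
    "verbose": 1,
    "save_history": False,
    "only_legal_actions": True,
    "epsilon": 1.0,
    "initial_epsilon": 1.0,
    "final_epsilon": 0.05,
    "explore_steps": 100000,
    "gamma": 0.99,
    "batchsize": 32,
    "minhist": 1000,
}

agent = QLearnerAgent(configs)
agent.train()  # train until interrupted; progress is saved every 100000 iterations
# agent.run()  # or play greedily (epsilon is forced to 0) with the current weights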