import csv
import os
import random

import numpy as np
import matplotlib.pyplot as plt

# NOTE: the imports above cover the standard-library and third-party names
# used in this listing; the project-specific classes (Agent, LearnerAgent,
# RogueBox, UIManager, Logger, Log and the LOG_LEVEL_* constants) are assumed
# to be provided by the surrounding package.


class RandomAgent(Agent):
    def __init__(self, configs):
        self.rb = RogueBox(configs)
        self._pending_action_timer = None
        self.ui = UIManager.init(configs["userinterface"], self.rb)
        self.l = Logger(log_depth=configs["verbose"],
                        log_targets=["file", "ui"],
                        ui=self.ui)
        self.ui.on_key_press(self._keypress_callback)
        self._timer_value = 100
        self._pending_action_timer = self.ui.on_timer_end(
            self._timer_value, self._act_callback)

    def run(self):
        self.ui.start_ui()

    def act(self):
        """Send a uniformly random action to Rogue and return its reward."""
        actions = self.rb.get_actions()
        action = random.choice(actions)
        logs = [
            Log("random_action_time", "Action ({}) time".format(action),
                LOG_LEVEL_MORE, mean=10)
        ]
        self.l.start_log_timer(logs)
        reward, _, __ = self.rb.send_command(action)
        self.l.stop_log_timer(logs)
        logs = [
            Log("action_state",
                "My previous state: \n {}".format(self.ui.read_rogue()),
                LOG_LEVEL_ALL),
            Log("chosen_action",
                "My chosen action: {} got reward: {}".format(action, reward),
                LOG_LEVEL_MORE),
        ]
        self.l.log(logs)
        return reward

    def _keypress_callback(self, event):
        if event.char == 'q' or event.char == 'Q':
            self.rb.quit_the_game()
            exit()
        elif event.char == 'r' or event.char == 'R':
            # we need to stop the agent from acting,
            # or it will try to write to a closed pipe
            self.ui.cancel_timer(self._pending_action_timer)
            self.rb.reset()
            self._pending_action_timer = self.ui.on_timer_end(
                self._timer_value, self._act_callback)

    def _act_callback(self):
        reward = self.act()
        self.ui.draw_from_rogue()
        if not self.rb.game_over():
            # renew the callback
            self._pending_action_timer = self.ui.on_timer_end(
                self._timer_value, self._act_callback)
        else:
            self.ui.cancel_timer(self._pending_action_timer)
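
# A minimal usage sketch, illustrative only and not part of the original
# agents: RandomAgent itself reads only the "userinterface" and "verbose"
# keys of `configs`; whatever further keys RogueBox expects are
# project-specific, so the values below are placeholders.
def run_random_agent_demo():
    demo_configs = {
        "userinterface": "tk",       # hypothetical UI backend name
        "verbose": LOG_LEVEL_MORE,   # log depth handed to Logger
    }
    # start the UI loop; the agent then acts on every timer tick
    RandomAgent(demo_configs).run()
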
class QLearnerAgent(LearnerAgent):
    def __init__(self, configs):
        import models, history
        # class instances
        self.rb = RogueBox(configs)
        self.model_manager = getattr(models, configs["model_manager"])(self.rb)
        self.history_manager = getattr(history, configs["history_manager"])(self)
        # configs
        self.configs = configs
        self.configs["iteration"] = 1
        self.configs["actions"] = self.rb.get_actions()
        self.configs["actions_num"] = len(self.configs["actions"])
        # gui stuff
        ui = None
        log_targets = []
        if configs["logsonfile"]:
            log_targets.append("file")
        if self.configs["gui"]:
            self.ui = UIManager.init(configs["userinterface"], self.rb)
            self._pending_action = None
            ui = self.ui
            log_targets.append("ui")
            self.l = Logger(log_depth=configs["verbose"],
                            log_targets=log_targets, ui=ui)
        else:
            log_targets.append("terminal")
            self.l = Logger(log_depth=configs["verbose"],
                            log_targets=log_targets)
        # state
        self.state = self.model_manager.reshape_initial_state(
            self.rb.compute_state())
        self.old_state = self.state
        # model
        self.model = self.model_manager.build_model()
        self.target_model = self.model_manager.build_model()
        self.target_model.set_weights(self.model.get_weights())
        # resume from file:
        # load weights, transitions history and parameters from assets, if any
        self._load_progress()

    def _load_progress(self):
        # model weights
        if os.path.isfile("assets/weights.h5"):
            print("loading weights...")
            self.model.load_weights("assets/weights.h5")
            self.target_model.set_weights(self.model.get_weights())
            print("weights loaded!")
        # transitions history
        if self.configs["save_history"]:
            self.history_manager.load_history_from_file("assets/history.pkl")
        # parameters
        # only floats can be loaded like this for now
        if os.path.isfile("assets/parameters.csv"):
            print("loading parameters...")
            with open("assets/parameters.csv") as parameters:
                reader = csv.reader(parameters)
                for row in reader:
                    try:
                        # try conversion from string
                        self.configs[row[0]] = float(row[1])
                    except ValueError:
                        print("the parameter", row[0],
                              "is not a float castable value")
            print("parameters loaded!")

    def _save_progress(self):
        print("saving...")
        if not os.path.exists("assets"):
            os.makedirs("assets")
        print("saving weights...")
        self.model.save_weights("assets/weights.h5", overwrite=True)
        if self.configs["save_history"]:
            self.history_manager.save_history_on_file("assets/history.pkl")
        print("saving parameters...")
        with open("assets/parameters.csv", "w") as parameters:
            writer = csv.writer(parameters)
            writer.writerow(["epsilon", self.configs["epsilon"]])
            writer.writerow(["iteration", self.configs["iteration"]])
        print("done saving!")

    def _reinit(self):
        self.state = self.model_manager.reshape_initial_state(
            self.rb.compute_state())
        self.old_state = self.state

    def predict(self):
        """Return the index of the action to take, chosen epsilon-greedily
        from the model's Q-values (illegal actions are masked out when
        "only_legal_actions" is set)."""
        if random.random() <= self.configs["epsilon"]:
            # explore: pick a random action index
            action_index = random.randrange(self.configs["actions_num"])
        else:
            # exploit: pick the action with the highest predicted Q-value
            q = self.model.predict(self.state)
            logs = [
                Log("actions_array",
                    "This is the action array: {}".format(q), LOG_LEVEL_MORE)
            ]
            actions = self.configs["actions"]
            if self.configs["only_legal_actions"]:
                legal_actions = self.rb.get_legal_actions()
                for action in actions:
                    if action not in legal_actions:
                        q[(0, actions.index(action))] = -np.inf
                logs += [
                    Log("legal_actions_array",
                        "This is the legal action array: {}".format(q),
                        LOG_LEVEL_MORE)
                ]
            self.l.log(logs)
            action_index = np.argmax(q)
        return action_index
    def act(self, action_index):
        """Send the selected action to Rogue and update the internal state;
        return the reward and the terminal flag."""
        action = self.configs["actions"][action_index]
        reward, new_state, terminal = self.rb.send_command(action)
        logs = [
            Log("action_reward",
                "Sent action: {} got reward: {}".format(action, reward),
                LOG_LEVEL_MORE)
        ]
        self.l.log(logs)
        self.old_state = self.state
        self.state = self.model_manager.reshape_new_state(
            self.old_state, new_state)
        return reward, terminal

    def observe(self):
        """Sample a minibatch of transitions and perform one experience-replay
        update, bootstrapping from the target network."""
        timer_log = [
            Log("Observe_time", "Ten observes done", LOG_LEVEL_MORE, mean=10)
        ]
        self.l.start_log_timer(timer_log)
        minibatch = self.history_manager.pick_batch(self.configs["batchsize"])
        inputs = np.zeros((self.configs["batchsize"],) + self.state.shape[1:])
        targets = np.zeros(
            (self.configs["batchsize"], self.configs["actions_num"]))
        # now we do the experience replay
        for i in range(self.configs["batchsize"]):
            old_state = minibatch[i][0]
            action_index = minibatch[i][1]
            reward = minibatch[i][2]
            new_state = minibatch[i][3]
            terminal = minibatch[i][4]
            inputs[i] = old_state
            targets[i] = self.model.predict(old_state)
            if terminal:
                targets[i, action_index] = reward
            else:
                Q_new_state = self.target_model.predict(new_state)
                targets[i, action_index] = reward + \
                    self.configs["gamma"] * np.max(Q_new_state)
        loss = self.model.train_on_batch(inputs, targets)
        loss_log = [
            Log("loss_value", "Loss for this iteration: {}".format(loss),
                LOG_LEVEL_SOME)
        ]
        self.l.log(loss_log)
        self.l.stop_log_timer(timer_log)
        return loss

    def plot(self, frame):
        """Plot a heatmap of the maximum Q-value for every passable position,
        annotated with the best action in each cell."""
        # WARNING: works only with 3-layer states;
        # makes sense only with non-reshaped, memoryless states
        # generate heatmap
        heatmap_start = [
            Log("heatmap_start",
                "Generating heatmap for iteration {} ...".format(
                    self.configs["iteration"]), LOG_LEVEL_SOME)
        ]
        self.l.log(heatmap_start)
        heatmap_time = [
            Log("heatmap_time", "Generating heatmap took",
                LOG_LEVEL_MORE, mean=1)
        ]
        self.l.start_log_timer(heatmap_time)
        heatmap = np.zeros((22, 80))
        best_actions = np.full((22, 80), -1)
        passable_pos = np.argwhere(frame[0] == 255)
        for i, j in passable_pos:
            # place the player on each passable tile and query the model
            player_layer = np.zeros((22, 80))
            player_layer[i][j] = 255
            temp = np.stack((frame[0], player_layer, frame[2]))
            q = self.model.predict(
                self.model_manager.reshape_new_state(temp, temp))
            heatmap[i][j] = q.max()
            best_actions[i][j] = q[0].argmax()
        heatmap = np.ma.masked_where(heatmap == 0, heatmap)
        mn = heatmap.min()
        mx = heatmap.max()
        heatmap_minmax = [
            Log("minmax", "heatmap min: {} max: {}".format(mn, mx),
                LOG_LEVEL_SOME)
        ]
        self.l.log(heatmap_minmax)
        arrows = ['←', '↓', '↑', '→']
        cmap = plt.cm.hot_r
        cmap.set_bad(color="green")
        fig, ax = plt.subplots(figsize=(11, 5))
        ax.imshow(heatmap, cmap=cmap, interpolation='nearest',
                  vmin=mn, vmax=mx)
        for i, j in passable_pos:
            ax.text(j, i, '%s' % arrows[best_actions[i][j]],
                    ha='center', va='center')
        fig.savefig("plots/heatmap-iteration-%s.png" %
                    self.configs["iteration"])

    def train(self):
        if self.configs["gui"]:
            self._pending_action = self.ui.on_timer_end(
                100, lambda: self._train_callback(1))
            self.ui.on_key_press(self._train_key_callback)
            self.ui.start_ui()
        else:
            while True:
                self._train_step(self.configs["iteration"])
                self.configs["iteration"] += 1

    def _train_step(self, iteration):
        action_index = self.predict()
        self._train_evaluation_hook_before_action()
        reward, terminal = self.act(action_index)
        self._train_evaluation_hook_after_action()
        item_added = self.history_manager.update_history(
            action_index, reward, terminal)
        if iteration % 10 == 0:
            log_iteration = [
                Log("iteration",
                    "Iteration number: {}".format(self.configs["iteration"]),
                    LOG_LEVEL_SOME)
            ]
            log_iteration += [
                Log("hist",
{}".format(self.history_manager.hist_len()), LOG_LEVEL_SOME) ] self.l.log(log_iteration) # Begin training only when we have enough history if self.history_manager.hist_len( ) >= self.configs["minhist"] and item_added: self.observe() # anneal epsilon if self.configs["epsilon"] > self.configs["final_epsilon"]: self.configs["epsilon"] -= (self.configs["initial_epsilon"] - self.configs["final_epsilon"]) / \ self.configs["explore_steps"] logs = [ Log("epsilon", "{}".format(self.configs["epsilon"]), LOG_LEVEL_ALL) ] self.l.log(logs) if iteration % 100000 == 0: self._save_progress() #plottin is disabled because its not compatible with every state #uncomment the next line if needed #self.plot(self.state[0]) if iteration % 10000 == 0: self.target_model.set_weights(self.model.get_weights()) if terminal: self._train_evaluation_hook_game_over() self.rb.reset() self._reinit() def run(self): # dont act randomly self.configs["epsilon"] = 0 if self.configs["gui"]: self.ui.on_key_press(self._play_key_callback) self._pending_action = self.ui.on_timer_end( 100, self._run_callback) self.ui.start_ui() else: terminal = False while not terminal: self._run_step() def _run_step(self): action_index = self.predict() reward, terminal = self.act(action_index) def _train_key_callback(self, event): """Callback for keys pressed during learning""" if event.char == 'q' or event.char == 'Q': self.rb.quit_the_game() exit() def _play_key_callback(self, event): """Callback for keys pressed during playing""" if event.char == 'q' or event.char == 'Q': self.rb.quit_the_game() exit() elif event.char == 'r' or event.char == 'R': # we need to stop the agent from acting # or it will try to write to a closed pipe self.ui.cancel_timer(self._pending_action) self.rb.reset() self._reinit() self._pending_action = self.ui.on_timer_end( 100, self._run_callback) def _train_callback(self, iteration): self._train_step(iteration) self.ui.draw_from_rogue() self.configs["iteration"] += 1 self._pending_action = self.ui.on_timer_end( self.configs["gui_delay"], lambda: self._train_callback(self.configs["iteration"])) def _run_callback(self): self._run_step() self.ui.draw_from_rogue() if not self.rb.game_over(): # renew the callback self._pending_action = self.ui.on_timer_end( self.configs["gui_delay"], self._run_callback) else: self.ui.cancel_timer(self._pending_action) # evaluation hooks def _train_evaluation_hook_before_action(self): pass def _train_evaluation_hook_after_action(self): pass def _train_evaluation_hook_game_over(self): pass