def _build_heuristic_distance(self):
    """ Build the heuristic map used for searching by Dijkstra's Algorithm """
    goal = self.goal
    frontier = PriorityQueueImproved('min',
                                     f=self.heuristic_distance.__getitem__)
    # For all exit positions (we don't care about other players)
    for pos in self.exit_positions[self.colour]:
        # Set the initial heuristic to 1, and add the position to the frontier
        self.heuristic_distance[pos] = 1
        frontier.append(pos)
    # While the search is not finished
    while frontier:
        pos = frontier.pop()
        q, r = pos
        # Explore all spaces reachable from the current position
        cost = self.heuristic_distance[pos]
        for dq, dr in self.moves:
            for move in range(1, 3):
                next_pos = (q + dq * move, r + dr * move)
                # Skip positions that are off the board or occupied;
                # otherwise update them with cost + 1
                if (not State.inboard(next_pos)
                        or next_pos in goal.pos_to_piece):
                    continue
                # Get the current value in the dictionary
                h_val = self.heuristic_distance.get(next_pos, None)
                # Not yet navigated to, or can be improved
                if h_val is None or h_val > cost + 1:
                    # Update the dictionary entry
                    self.heuristic_distance[next_pos] = cost + 1
                    # Update the value in the queue
                    frontier.append(next_pos)
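# The routine above assumes PriorityQueueImproved('min', f=key) behaves like a
# keyed min-priority queue: append() pushes or re-prioritises an item and pop()
# returns the item whose key value is currently smallest. Below is a minimal
# sketch of such a queue (illustrative only, not the project's actual
# implementation), using heapq with lazy deletion of stale entries.
import heapq


class KeyedMinQueue:
    """Illustrative stand-in for PriorityQueueImproved('min', f=key)."""

    def __init__(self, key):
        self.key = key    # maps an item to its current priority
        self.heap = []    # (priority, item) pairs, possibly stale

    def append(self, item):
        # Push with the item's current priority; older copies with an
        # outdated priority are filtered out lazily.
        heapq.heappush(self.heap, (self.key(item), item))

    def pop(self):
        while self.heap:
            priority, item = heapq.heappop(self.heap)
            # Only return entries whose priority is still up to date
            if priority == self.key(item):
                return item
        raise IndexError("pop from an empty queue")

    def __bool__(self):
        # Drop stale entries so truthiness reflects live items only
        while self.heap and self.heap[0][0] != self.key(self.heap[0][1]):
            heapq.heappop(self.heap)
        return bool(self.heap)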
def convert_action_perspective(self, action, convert_to):
    """
    Further converts an action already translated by convert_action_to
    into the corresponding player's perspective
    :param action: the action to translate. Format (fr, to, type)
    :param convert_to: whose perspective the action should be converted to
    :return: the converted action
    """
    # No change needed for a pass
    if action == "PASS":
        return action
    fr, to, move = action
    if convert_to == "player":
        new_fr = State.rotate_pos(self.colour, "red", fr)
        new_to = State.rotate_pos(self.colour, "red", to)
    elif convert_to == "referee":
        new_fr = State.rotate_pos("red", self.colour, fr)
        new_to = State.rotate_pos("red", self.colour, to)
    else:
        raise ValueError(convert_to + " mode is not valid")
    return new_fr, new_to, move
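# Hedged usage sketch: the action tuple below is made up, and `player` is a
# hypothetical Player instance. The only property relied on is that the
# "player" and "referee" modes apply inverse rotations, so converting an
# action to the player's perspective and back should recover the original.
action = ((0, -3), (1, -3), "MOVE")
as_player = player.convert_action_perspective(action, "player")
as_referee = player.convert_action_perspective(as_player, "referee")
assert as_referee == action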
def __init__(self, colour, search_algorithm=None, game_type=Game,
             evaluator=player_evaluator, initial_state=None):
    """
    This method is called once at the beginning of the game to initialise
    your player. You should use this opportunity to set up your own internal
    representation of the game state, and any other information about the
    game state you would like to maintain for the duration of the game.

    The parameter colour will be a string representing the player your
    program will play as (Red, Green or Blue). The value will be one of the
    strings "red", "green", or "blue" correspondingly.

    You can pass any valid board state to the Agent (e.g. to run UCT from it);
    however, the Agent assumes that the states it is given are valid.
    """
    self.colour = colour
    self.search_algorithm = search_algorithm
    self.code_map = State.code_map
    self.rev_code_map = State.rev_code_map
    # Cycle the players:
    # player:: red:   red -> red,   green -> green, blue -> blue
    # player:: green: red -> blue,  green -> red,   blue -> green
    # player:: blue:  red -> green, green -> blue,  blue -> red
    self.referee_to_player_mapping = State.perspective_mapping[colour]
    self.player_to_referee_mapping = {
        value: key for key, value in self.referee_to_player_mapping.items()
    }
    # The initial player is red; convert it to the rotated perspective
    state = State(self.start_config,
                  colour=self.referee_to_player_mapping["red"])
    # The colour of the game is different from the colour of the state
    self.game = game_type("red", state)
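# The comment in __init__ describes the perspective rotation as a fixed
# three-cycle per player. A hypothetical reconstruction of what
# State.perspective_mapping amounts to under that description (illustrative
# literal, not necessarily the project's actual table):
perspective_mapping = {
    "red":   {"red": "red",   "green": "green", "blue": "blue"},
    "green": {"red": "blue",  "green": "red",   "blue": "green"},
    "blue":  {"red": "green", "green": "blue",  "blue": "red"},
}

# For a green player, the referee's "red" is seen as "blue", and the
# inverse mapping sends it back:
referee_to_player = perspective_mapping["green"]
player_to_referee = {v: k for k, v in referee_to_player.items()}
assert referee_to_player["red"] == "blue"
assert player_to_referee["blue"] == "red"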
node_type = SimpleRLNode2

policy = "greedy"
# policy = "choice"

# debug = 0.001
# debug = 0.1
debug = 0

# explore = 0
# explore = 0.1
# explore = 0.2
# explore = 0.5
explore = 1

# theta = 0.05
# theta = 0.01
theta = 0.005
# theta = 0.001
# theta = 0.0005

gamma = 1
# gamma = 0.99

initial_state = State(Player.start_config, "red")
game = Game("red", initial_state)
agent.td_train(game, initial_state, debug=debug, node_type=node_type,
               policy=policy, explore=explore, theta=theta, gamma=gamma)
def parse_state(file_name):
    # Use a context manager so the file handle is closed after parsing
    with open(file_name) as f:
        pos_dict, colour, completed = JsonParser(json.load(f)).parse()
    return State(pos_dict, colour, completed)
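# Hedged usage sketch: the path below is a placeholder, and the file must
# follow whatever JSON layout JsonParser expects.
state = parse_state("boards/initial.json")
print(state.colour, len(state.pos_to_piece))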
def change_state_color(state, color):
    return State(state.pos_to_piece, color, state.completed)
def td_train(self, game, initial_state=None, explore=0.1, n=1000,
             theta=0.05, checkpoint_interval=20, gamma=0.9,
             node_type=InitialRLNode, policy="choice", debug=0):
    # TODO make it possible to plug in other agents
    self.network.learning_rate = theta
    initial_node = node_type(initial_state, rewards=(0, 0, 0))

    if policy == "greedy":
        policy = self.greedy_policy
    elif policy == "choice":
        policy = self.choice_policy
    else:
        raise ValueError("Invalid policy")

    # Generate episodes of the game based on the current policy
    # -> Update the value for each player
    losses = []
    episodes = []
    count = 0
    length = 0

    for i in range(n):
        node = initial_node
        # We record three players simultaneously
        loss = 0
        episode_actions = []
        episode_states = []
        episode_rewards = []

        # while not game.terminal_state(node.state):
        while True:
            # TODO replace this by any policy for bootstrapping
            # TODO use the current value to compute!
            current_colour = node.state.colour
            current_code = node.state.code_map[current_colour]

            # Rotate the state so that the current colour becomes red
            current_state = node.state.red_perspective(current_colour)

            # Get the results
            action, next_node = policy(game, current_state,
                                       explore=explore,
                                       node_type=node_type, train=True)

            # Update
            # Here is the model assumption.
            # The player's turn (whose turn it is to choose an action):
            # --------------------
            # g b r g b r g ...
            # 1 2 3 4 5 6 7
            # In reality, we should be computing three values for each
            # node. But we cheat here. We only compute the value wrt the
            # current actor: the values are like this (for r only).
            #                 * here     now
            #                 v          v'      v''
            #                                      o
            #                                     /
            #     o - o - o - o                  o
            #     ^            \                /
            #  Other            o - o - o - o
            #  branch                        \
            #  unknown                        o
            # p_s[r]: [0 1 2 *]
            # We say that vt'' -> vt', and vt' +

            # (Experimental, try to solve the after-state problem)
            # # Update estimation of v' based on v''
            # # Then update the estimation v based on v'
            # # Get y
            # # 1. Get current state (already have)
            # # 2. Get feature vector
            # current_state_vector = self.feature_extractor(current_state)
            # # 3. Compute v'
            # v_prime = \
            #     self.network.forward(np.array([current_state_vector]))
            # ### THERE is no reward here!
            # # 4. Get y from v'
            # y = v_prime
            #
            # # Get X
            # # 1. Get prev state
            # prev_state = player_states[current_code][1]
            # # 2. Get feature vector as X
            # prev_state_vector = \
            #     self.feature_extractor(prev_state)
            # X = np.array([prev_state_vector])
            # # Backward propagation
            # self.network.backward(X, y)

            # Update the estimation of previous state v and v'
            # Get y
            # 1. Get next state
            next_state = next_node.state
            # 2. Get feature vector
            next_state_vector = self.feature_extractor(next_state)
            # 3. Compute v'
            v_prime = \
                self.network.forward(np.array([next_state_vector]))
            # 4. Get reward
            reward = next_node.rewards[0]
            # 5. Get gamma * v' + reward as y
            y = gamma * v_prime + reward

            # Get X
            # 1. Get current state
            # 2. Get feature vector as X
            current_state_vector = self.feature_extractor(current_state)
            X = np.array([current_state_vector])

            # Backward propagation
            y_hat_old = self.network.forward(X)
            # if y[0][0] != y_hat_old[0][0]:
            #     print("====================")
            #     print(y_hat_old, y)
            self.network.backward(X, y)
            y_hat = self.network.forward(X)
            # if y[0][0] != y_hat[0][0]:
            #     print(y_hat, y)
            #     assert abs(y[0][0] - y_hat[0][0]) < \
            #         abs(y[0][0] - y_hat_old[0][0])
            #     print("====================")
            loss += self.network.loss.compute(y_hat, y)
            count += 1

            if game.terminal_state(next_node.state):
                break

            fr, to, move = action
            fr = State.rotate_pos("red", current_colour, fr)
            to = State.rotate_pos("red", current_colour, to)
            action = (fr, to, move)

            node = next_node
            # Back to the original perspective
            node.original_perspective(current_colour)

            episode_actions.append(action)
            episode_states.append(node.state)
            episode_rewards.append(node.rewards)
            if debug:
                print(node)
                sleep(debug)

        print(len(episode_states))
        print(f"Episode: {i}")
        # Store them for now
        episodes.append((episode_states, episode_actions, episode_rewards))
        length += len(episode_states)
        if i % checkpoint_interval == checkpoint_interval - 1:
            losses.append((i, loss))
            print(f"Episode: {i}\n"
                  f"  loss={loss / count}\n"
                  f"  average episode={length / checkpoint_interval}")
            count = 0
            length = 0
            self.network.save_checkpoint()

    self.network.save_final()
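# The inner loop above performs a one-step temporal-difference (TD(0)) update:
# the network is regressed toward the bootstrapped target
# y = reward + gamma * V(s'), evaluated at the current state's features.
# Below is a minimal, self-contained sketch of the same update with a
# hand-rolled linear value function. It is illustrative only; the project's
# self.network and feature_extractor are assumed to provide the equivalent
# forward/backward behaviour.
import numpy as np


class LinearValue:
    """Tiny linear value function V(s) = w . phi(s), trained by TD(0)."""

    def __init__(self, n_features, learning_rate=0.005):
        self.w = np.zeros(n_features)
        self.learning_rate = learning_rate

    def forward(self, phi):
        return float(self.w @ phi)

    def backward(self, phi, target):
        # Semi-gradient TD(0): move V(s) toward the bootstrapped target
        error = target - self.forward(phi)
        self.w += self.learning_rate * error * phi


def td0_update(value, phi_s, phi_next, reward, gamma=0.9):
    """One TD(0) step, mirroring the y = reward + gamma * V(s') target."""
    target = reward + gamma * value.forward(phi_next)
    value.backward(phi_s, target)


# Example: two toy feature vectors and a reward of 1.0 for the transition
value = LinearValue(n_features=3, learning_rate=0.1)
phi_s = np.array([1.0, 0.0, 0.5])
phi_next = np.array([0.0, 1.0, 0.5])
td0_update(value, phi_s, phi_next, reward=1.0, gamma=0.9)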
def __init__(self, colour, state):
    super().__init__(state, State({}, "red"))
    # self.evaluator = evaluator
    self.colour = colour
    self._build_heuristic_distance()