def convert_action_perspective(self, action, convert_to):
    """
    Convert an action (already in the internal format produced by
    convert_action_to) between the referee's perspective and this
    player's perspective.
    :param action: the action to translate, in the format (fr, to, type)
    :param convert_to: whose perspective to convert to, "player" or "referee"
    :return: the action in the requested perspective
    """
    # A pass needs no coordinate rotation
    if action == "PASS":
        return action

    fr, to, move = action
    if convert_to == "player":
        new_fr = State.rotate_pos(self.colour, "red", fr)
        new_to = State.rotate_pos(self.colour, "red", to)
    elif convert_to == "referee":
        new_fr = State.rotate_pos("red", self.colour, fr)
        new_to = State.rotate_pos("red", self.colour, to)
    else:
        raise ValueError(convert_to + " mode is not valid")
    return new_fr, new_to, move
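# A minimal usage sketch for convert_action_perspective. The instance name
# `player` and the coordinates are hypothetical; the only assumptions are
# that the agent's colour is one of "red"/"green"/"blue" and that actions
# use the (fr, to, type) format described in the docstring above.
#
#   action = ((-3, 0), (-2, 0), "MOVE")                   # referee perspective
#   own_view = player.convert_action_perspective(action, "player")
#   back = player.convert_action_perspective(own_view, "referee")
#   assert back == action                                 # the two rotations are inverses
#   player.convert_action_perspective("PASS", "player")   # "PASS" is returned unchanged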
def td_train(self, game, initial_state=None, explore=0.1, n=1000,
             theta=0.05, checkpoint_interval=20, gamma=0.9,
             node_type=InitialRLNode, policy="choice", debug=0):
    # TODO make it possible to plug in other agents
    self.network.learning_rate = theta
    initial_node = node_type(initial_state, rewards=(0, 0, 0))

    if policy == "greedy":
        policy = self.greedy_policy
    elif policy == "choice":
        policy = self.choice_policy
    else:
        raise ValueError("Invalid policy")

    # Generate episodes of the game based on the current policy,
    # then update the value estimate for each player.
    losses = []
    episodes = []
    count = 0
    length = 0
    for i in range(n):
        node = initial_node
        # We record all three players simultaneously
        loss = 0
        episode_actions = []
        episode_states = []
        episode_rewards = []
        # while not game.terminal_state(node.state):
        while True:
            # TODO replace this by any policy for bootstrapping
            # TODO use the current value to compute!
            current_colour = node.state.colour
            current_code = node.state.code_map[current_colour]
            # Rotate the state so that the current colour becomes red
            current_state = node.state.red_perspective(current_colour)
            # Ask the policy for an action and the resulting node
            action, next_node = policy(game, current_state,
                                       explore=explore,
                                       node_type=node_type, train=True)

            # Update
            # Here is the model assumption. The turn order (whose turn it
            # is to choose an action) cycles:
            #     g b r g b r g ...
            #     1 2 3 4 5 6 7
            # In reality we should be computing three values (one per
            # player) for each node, but we cheat here and only compute
            # the value with respect to the current actor. For red, the
            # successive red-to-move states carry values v, v', v'':
            # v'' is used to update v', and reward + gamma * v' is used
            # to update v.
            # p_s[r]: [0 1 2 *]
            #
            # (Experimental, try to solve the after-state problem)
            # # Update estimation of v' based on v''
            # # Then update the estimation of v based on v'
            # # Get y
            # # 1. Get current state (already have)
            # # 2. Get feature vector
            # current_state_vector = self.feature_extractor(current_state)
            # # 3. Compute v'
            # v_prime = \
            #     self.network.forward(np.array([current_state_vector]))
            # # NOTE: there is no reward term here!
            # # 4. Get y from v'
            # y = v_prime
            #
            # # Get X
            # # 1. Get prev state
            # prev_state = player_states[current_code][1]
            # # 2. Get feature vector as X
            # prev_state_vector = \
            #     self.feature_extractor(prev_state)
            # X = np.array([prev_state_vector])
            # # Backward propagation
            # self.network.backward(X, y)

            # Update the estimation of the previous state v from v'
            # Get y
            # 1. Get next state
            next_state = next_node.state
            # 2. Get feature vector
            next_state_vector = self.feature_extractor(next_state)
            # 3. Compute v'
            v_prime = \
                self.network.forward(np.array([next_state_vector]))
            # 4. Get reward
            reward = next_node.rewards[0]
            # 5. Use reward + gamma * v' as the TD target y
            y = gamma * v_prime + reward

            # Get X
            # 1. Get current state (already have)
            # 2. Get feature vector as X
            current_state_vector = self.feature_extractor(current_state)
            X = np.array([current_state_vector])

            # Backward propagation
            y_hat_old = self.network.forward(X)
            # if y[0][0] != y_hat_old[0][0]:
            #     print("====================")
            #     print(y_hat_old, y)
            self.network.backward(X, y)
            y_hat = self.network.forward(X)
            # if y[0][0] != y_hat[0][0]:
            #     print(y_hat, y)
            #     assert abs(y[0][0] - y_hat[0][0]) < \
            #         abs(y[0][0] - y_hat_old[0][0])
            #     print("====================")
            loss += self.network.loss.compute(y_hat, y)
            count += 1

            if game.terminal_state(next_node.state):
                break

            # Rotate the action back to the referee's perspective
            fr, to, move = action
            fr = State.rotate_pos("red", current_colour, fr)
            to = State.rotate_pos("red", current_colour, to)
            action = (fr, to, move)

            node = next_node
            # Back to the original perspective
            node.original_perspective(current_colour)
            episode_actions.append(action)
            episode_states.append(node.state)
            episode_rewards.append(node.rewards)
            if debug:
                print(node)
                sleep(debug)

        print(len(episode_states))
        print(f"Episode: {i}")
        # Store them for now
        episodes.append((episode_states, episode_actions, episode_rewards))
        length += len(episode_states)

        if i % checkpoint_interval == checkpoint_interval - 1:
            losses.append((i, loss))
            print(f"Episode: {i}\n"
                  f"    loss={loss / count}\n"
                  f"    average episode={length / checkpoint_interval}")
            count = 0
            length = 0
            self.network.save_checkpoint()

    self.network.save_final()
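# A minimal, self-contained sketch of the single TD(0) step performed inside
# the training loop above, using a plain linear value function in place of
# self.network. Every name here (td_step, phi_s, lr, ...) is hypothetical;
# only the target y = reward + gamma * V(s') mirrors the code above.
#
#   import numpy as np
#
#   def td_step(weights, phi_s, phi_s_next, reward, gamma=0.9, lr=0.05):
#       """One semi-gradient TD(0) update of V(s) = weights . phi(s)."""
#       v_next = weights @ phi_s_next        # bootstrapped value V(s')
#       y = reward + gamma * v_next          # TD target, as in the loop above
#       td_error = y - weights @ phi_s       # y - V(s)
#       new_weights = weights + lr * td_error * phi_s
#       return new_weights, 0.5 * td_error ** 2
#
#   w = np.zeros(3)
#   w, step_loss = td_step(w,
#                          np.array([1.0, 0.0, 1.0]),   # phi(current state)
#                          np.array([0.0, 1.0, 1.0]),   # phi(next state)
#                          reward=1.0)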