    def convert_action_perspective(self, action, convert_to):
        """
        Convert an action (already in the coordinate format produced by
        convert_action_to) between the referee's perspective and this
        player's own (red) perspective.
        :param action: the action to translate. Format (fr, to, type)
        :param convert_to: whose perspective to convert to;
            either "player" or "referee"
        :return: the action in the requested perspective
        """
        # No conversion is needed for a pass
        if action == "PASS":
            return action
        fr, to, move = action
        if convert_to == "player":
            new_fr = State.rotate_pos(self.colour, "red", fr)
            new_to = State.rotate_pos(self.colour, "red", to)
        elif convert_to == "referee":
            new_fr = State.rotate_pos("red", self.colour, fr)
            new_to = State.rotate_pos("red", self.colour, to)
        else:
            raise ValueError(f"{convert_to} is not a valid mode")
        return new_fr, new_to, move
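
    # A minimal usage sketch of the conversion above (illustrative only:
    # `agent` and `referee_action` are hypothetical, and it assumes that
    # State.rotate_pos with its first two arguments swapped performs the
    # inverse rotation):
    #
    #   internal = agent.convert_action_perspective(referee_action, "player")
    #   external = agent.convert_action_perspective(internal, "referee")
    #   # external should equal referee_action again; "PASS" passes through
    #   # unchanged in both directions
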
    def td_train(self, game, initial_state=None, explore=0.1, n=1000,
                 theta=0.05, checkpoint_interval=20, gamma=0.9,
                 node_type=InitialRLNode, policy="choice", debug=0):
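        """
        Train the value network with TD self-play for n episodes.
        theta sets the network learning rate, gamma is the discount factor
        for the TD target and explore is the exploration rate passed to the
        policy; policy selects between the "greedy" and "choice" policies.
        A checkpoint is saved every checkpoint_interval episodes.
        """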
        # TODO make it possible to plug in other agents
        self.network.learning_rate = theta
        initial_node = node_type(initial_state, rewards=(0, 0, 0))

        if policy == "greedy":
            policy = self.greedy_policy
        elif policy == "choice":
            policy = self.choice_policy
        else:
            raise ValueError("Invalid policy")

        # Generate episodes of the game based on the current policy
        # -> Update the value for each player
        losses = []

        episodes = []
        count = 0
        length = 0
        loss = 0
        for i in range(n):
            node = initial_node
            # We record all three players simultaneously

            episode_actions = []
            episode_states = []
            episode_rewards = []

            # while not game.terminal_state(node.state):
            while True:

                # TODO replace this by any policy for bootstrapping
                # TODO use the current value to compute!

                current_colour = node.state.colour
                current_code = node.state.code_map[current_colour]

                # Rotate the state to make current colour be red
                current_state = node.state.red_perspective(current_colour)
                # Get the results
                action, next_node = policy(game, current_state,
                                           explore=explore,
                                           node_type=node_type, train=True)

                # Update
                # Here is the model assumption
                # The player's turn (whose time it is to choose an action)
                # --------------------
                # g   b   r   g   b   r   g ...
                # 1   2   3   4   5   6   7
                # In reality, we should be computing three values for each
                # node, but we cheat here and only compute the value w.r.t.
                # the current actor. The values look like this (for r only):
                #                     * here now
                #         v   v'      v''
                #             o
                #           /
                # o - o - o - o           o
                #   ^       \           /
                # Other       o - o - o - o
                # branch                \
                # unknown                 o
                # p_s[r]:[0   1   2   *]
                # We say that vt'' -> vt', and vt' + reward -> vt.
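                # Put differently, this amounts to a one-step TD update on the
                # red-perspective value function: the estimate of the current
                # state is moved toward
                #     target = reward + gamma * V(next_state)
                # which is the y computed below.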

                # # (Experimental, try to solve the after state problem)
                # # Update estimation of v' based on v''
                # # Then update the estimation v based on v'
                # # Get y
                # # 1. Get current state (already have)
                # # 2. Get feature vector
                # current_state_vector = self.feature_extractor(current_state)
                # # 3. Compute v'
                # v_prime = \
                #     self.network.forward(np.array([current_state_vector]))
                # ### THERE is no reward here!
                # # 4. Get y from v'
                # y = v_prime
                #
                # # Get X
                # # 1. Get prev state
                # prev_state = player_states[current_code][1]
                # # 2. Get feature vector as X
                # prev_state_vector = \
                #     self.feature_extractor(prev_state)
                # X = np.array([prev_state_vector])
                # # Backward propagation
                # self.network.backward(X, y)

                # Update the estimate of the current state's value v toward
                # reward + gamma * v' (the value of the next state)
                # Get y
                # 1. Get next state
                next_state = next_node.state
                # 2. Get feature vector
                next_state_vector = self.feature_extractor(next_state)
                # 3. Compute v'
                v_prime = \
                    self.network.forward(np.array([next_state_vector]))
                # 4. Get reward
                reward = next_node.rewards[0]
                # 5. Get v' + reward as y
                y = gamma * v_prime + reward
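                # Illustrative numbers only (not from a real game): with
                # gamma = 0.9, reward = 1 and the network estimating
                # v_prime = 0.5 for the next state, the target is
                #     y = 0.9 * 0.5 + 1 = 1.45
                # and the fit below nudges the current state's prediction
                # toward that value.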
                # Get X
                # 1. Get current state
                # 2. Get feature vector as X
                current_state_vector = self.feature_extractor(current_state)
                X = np.array([current_state_vector])
                # Backward propagation
                y_hat_old = self.network.forward(X)
                self.network.backward(X, y)
                # After the update the new prediction y_hat should be closer
                # to y than the old prediction y_hat_old was
                y_hat = self.network.forward(X)
                loss += self.network.loss.compute(y_hat, y)
                count += 1

                if game.terminal_state(next_node.state):
                    break

                # Rotate the action back from the red perspective to the
                # actual board perspective
                fr, to, move = action
                fr = State.rotate_pos("red", current_colour, fr)
                to = State.rotate_pos("red", current_colour, to)
                action = (fr, to, move)

                node = next_node
                # Back to original perspective
                node.original_perspective(current_colour)

                episode_actions.append(action)
                episode_states.append(node.state)
                episode_rewards.append(node.rewards)

                if debug:
                    print(node)
                    sleep(debug)

            print(f"Episode {i}: {len(episode_states)} states")

            # Store them for now
            episodes.append((episode_states, episode_actions, episode_rewards))
            length += len(episode_states)

            if i % checkpoint_interval == checkpoint_interval - 1:
                losses.append((i, loss/count))
                print(f"Episode: {i}\n"
                      f"        average loss={loss/count}\n"
                      f"        average episode length="
                      f"{length/checkpoint_interval}")
                count = 0
                length = 0
                loss = 0
                self.network.save_checkpoint()
        self.network.save_final()
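
    # A hypothetical training call; `agent`, `Game` and `initial` are
    # stand-ins for whatever agent instance, game object and starting state
    # the caller provides, not names defined in this module:
    #
    #   agent.td_train(Game(), initial_state=initial, explore=0.1, n=1000,
    #                  theta=0.05, checkpoint_interval=20, gamma=0.9,
    #                  policy="choice")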