def __init__(self, cube: PuzzleCube, model: CubeModel):
        """
        :param cube: The starting puzzle cube.
        :param model: The trained model to use.
        """

        assert (model._model is not None), "model must be loaded"
        history_length = model._model.history
        blank_history = tuple(None for _ in range(history_length - 1))
        internal_state = (cube._inner_cube, ) + blank_history
        initial_state = State(_internal_state=internal_state)

        self.model_policy_value = model._function()
        self.state = initial_state  # type: State
        self.solution_length = 0
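
The constructor stores the callable returned by model._function() as self.model_policy_value. None of the snippets on this page call it directly, so its exact signature is not shown; the self-play code below suggests a policy over the 12 face turns plus a scalar value (see the commented-out np.random.choice(12, p=probs) line). A minimal stand-in under that assumption, useful only for wiring things up:

import numpy as np

def uniform_policy_value(state_input_array):
    # Hypothetical stand-in for the callable returned by model._function():
    # a uniform prior over the 12 quarter-turn moves and a neutral value.
    # The (policy, value) return shape is an assumption, not taken from the
    # snippets on this page.
    policy = np.full(12, 1.0 / 12)
    value = 0.0
    return policy, value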
Example #2
    def __init__(self, cube: PuzzleCube, model: CubeModel):
        """
        :param cube: The starting puzzle cube.
        :param model: The trained model to use.
        """

        # assert (model._model is not None), "model must be loaded"
        # history_length = model._model.history
        history_length = 8
        blank_history = tuple(None for _ in range(history_length - 1))
        internal_state = (cube._inner_cube, ) + blank_history
        initial_state = State(_internal_state=internal_state)

        self._mcts_agent = MCTSAgent(model._function(),
                                     initial_state,
                                     max_depth=100)
Example #3
def cube_to_initial_mcts_state(cube: PuzzleCube, history: int) -> State:
    blank_history = tuple(None for _ in range(history - 1))
    internal_state = (cube._inner_cube, ) + blank_history
    return State(_internal_state=internal_state)
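
All three constructors above build the initial MCTS state the same way: the current cube position followed by history - 1 blank slots. A self-contained sketch of just that padding pattern, with a namedtuple standing in for the project's State class:

from collections import namedtuple

# Stand-in for the project's State; only the field layout matters here.
State = namedtuple("State", ["_internal_state"])

def initial_mcts_state(inner_cube, history):
    # Pad the single known position with history - 1 empty slots so the
    # state always carries a fixed-length history window.
    blank_history = tuple(None for _ in range(history - 1))
    return State(_internal_state=(inner_cube,) + blank_history)

print(initial_mcts_state("scrambled-cube", history=4))
# State(_internal_state=('scrambled-cube', None, None, None))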
Example #4
    def play_game(self,
                  model_policy_value,
                  state=None,
                  distance=None,
                  evaluation_game=False):
        if distance is None:
            # choose distance
            lower_dist = int(self.training_distance_level)
            prob_of_increase = self.training_distance_level - lower_dist
            distance = lower_dist + np.random.choice(
                2, p=[1 - prob_of_increase, prob_of_increase])

            lower_dist_win_rate = 0. if self.recent_games[
                lower_dist] == 0 else self.recent_wins[
                    lower_dist] / self.recent_games[lower_dist]
            upper_dist_win_rate = 0. if self.recent_games[
                lower_dist + 1] == 0 else self.recent_wins[
                    lower_dist + 1] / self.recent_games[lower_dist + 1]

            print(
                "(DB) distance:", distance,
                "(level: {:.2f} win rates: {}: {:.2f} {}: {:.2f})".format(
                    self.training_distance_level, lower_dist,
                    lower_dist_win_rate, lower_dist + 1, upper_dist_win_rate))
        if state is None:
            state = State()
            while state.done():
                state.reset_and_randomize(distance)

        mcts = MCTSAgent(
            model_policy_value,
            state,
            max_depth=self.max_depth,
            transposition_table=self.prebuilt_transposition_table.copy(),
            c_puct=self.exploration,
            gamma=self.decay)

        counter = 0
        win = True
        while not mcts.is_terminal():
            print("(DB) step:", counter)

            mcts.search(steps=self.max_steps)

            # find next state
            probs = mcts.action_probabilities(inv_temp=10)
            action = np.argmax(probs)
            #action = np.random.choice(12, p=probs)

            shortest_path = mcts.stats('shortest_path')

            if not evaluation_game:
                # record stats
                self.self_play_stats['_game_id'].append(self.game_number)
                self.self_play_stats['_step_id'].append(counter)
                #self.self_play_stats['state']  # find a better representation of the state (that is easy to import)
                self.self_play_stats['shortest_path'].append(shortest_path)
                self.self_play_stats['action'].append(action)
                self.self_play_stats['value'].append(mcts.stats('value'))

                self.self_play_stats['prior'].append(mcts.stats('prior'))
                self.self_play_stats['prior_dirichlet'].append(
                    mcts.stats('prior_dirichlet'))
                self.self_play_stats['visit_counts'].append(
                    mcts.stats('visit_counts'))
                self.self_play_stats['total_action_values'].append(
                    mcts.stats('total_action_values'))

                # training data (also recorded in stats)
                self.training_data_states.append(
                    mcts.initial_node.state.input_array())

                policy = mcts.action_probabilities(inv_temp=10)
                self.training_data_policies.append(policy)
                self.self_play_stats['updated_policy'].append(policy)

                self.training_data_values.append(
                    0)  # updated below if the game is won
                self.self_play_stats['updated_value'].append(0)

            # prepare for next state
            counter += 1
            if shortest_path < 0 or counter >= self.max_game_length:
                win = False
                break
            mcts.advance_to_action(action)

        # update training values based on game results
        if not evaluation_game:
            if win:
                value = 1
                for i in range(counter):
                    value *= self.decay
                    self.training_data_values[-(i + 1)] = value
                    self.self_play_stats['updated_value'][-(i + 1)] = value

            # record game stats
            self.game_stats['_game_id'].append(self.game_number)
            self.game_stats['distance_level'].append(
                self.training_distance_level)
            self.game_stats['training_distance'].append(distance)
            self.game_stats['max_game_length'].append(self.max_game_length)
            self.game_stats['win'].append(win)
            self.game_stats['total_steps'].append(counter if win else -1)

        # set up for next game
        self.game_number += 1
        if win:
            print("(DB)", "win")
        else:
            print("(DB)", "lose")

        if not evaluation_game:
            self.recent_wins[distance] += win
            self.recent_games[distance] += 1

            # update difficulty
            upper_dist = 0
            while True:
                upper_dist += 1
                if self.recent_wins[
                        upper_dist] <= self.win_rate_target * self.recent_games[
                            upper_dist]:
                    break
            if upper_dist <= self.min_distance:
                self.training_distance_level = float(self.min_distance)
            else:
                lower_dist = upper_dist - 1
                lower_dist_win_rate = 0. if self.recent_games[lower_dist] == 0 \
                                        else self.recent_wins[lower_dist] / self.recent_games[lower_dist]
                upper_dist_win_rate = 0. if self.recent_games[lower_dist+1] == 0 \
                                        else self.recent_wins[lower_dist+1] / self.recent_games[lower_dist+1]
                # notice that we won't divide by zero here since upper_dist_win_rate < lower_dist_win_rate
                self.training_distance_level = lower_dist + (
                    lower_dist_win_rate - self.win_rate_target) / (
                        lower_dist_win_rate - upper_dist_win_rate)

        return state, distance, win
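
The difficulty update at the end of play_game linearly interpolates between the two neighbouring scramble distances so that the target win rate lands at the fractional position. A self-contained sketch of that formula (the helper name is ours, not the project's):

def interpolated_distance_level(lower_dist, lower_win_rate, upper_win_rate,
                                win_rate_target):
    # Linear interpolation: the fractional part is where win_rate_target sits
    # between the win rates at lower_dist and lower_dist + 1. The search loop
    # above guarantees upper_win_rate <= win_rate_target < lower_win_rate, so
    # the denominator is positive.
    return lower_dist + (lower_win_rate - win_rate_target) / (
        lower_win_rate - upper_win_rate)

# e.g. 80% wins at distance 6, 40% at distance 7, targeting 50%:
# 6 + (0.8 - 0.5) / (0.8 - 0.4) = 6.75
print(interpolated_distance_level(6, 0.8, 0.4, 0.5))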
Example #5
def random_state(distance, history):
    # Re-sample until the randomly scrambled state is not already solved.
    state = State(random_depth=distance, history=history)
    while state.done():
        state = State(random_depth=distance, history=history)
    return state
Example #6
def random_state(distance, history):
    # Variant of the above: reuse a single State object and keep
    # re-randomizing it in place until it is no longer solved.
    state = State(history=history)
    while state.done():
        state.reset_and_randomize(distance)
    return state
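
Both random_state variants rely on the same rejection loop: keep scrambling until the state is no longer solved. A toy, self-contained analogue of that loop, with the solved state modelled as the value 0:

import random

def random_nonzero(distance, rng=None):
    # Toy analogue of the loops above: re-randomize until the result is not
    # the "solved" value (0), mirroring `while state.done(): ...`.
    rng = rng or random.Random()
    value = 0
    while value == 0:
        value = rng.randint(-distance, distance)
    return value

print(random_nonzero(3))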