def __init__(self, cube: PuzzleCube, model: CubeModel):
    """
    :param cube: The starting puzzle cube.
    :param model: The trained model to use.
    """
    assert model._model is not None, "model must be loaded"

    history_length = model._model.history
    blank_history = tuple(None for _ in range(history_length - 1))
    internal_state = (cube._inner_cube, ) + blank_history
    initial_state = State(_internal_state=internal_state)

    self.model_policy_value = model._function()
    self.state = initial_state  # type: State
    self.solution_length = 0

def __init__(self, cube: PuzzleCube, model: CubeModel):
    """
    :param cube: The starting puzzle cube.
    :param model: The trained model to use.
    """
    # assert (model._model is not None), "model must be loaded"
    # history_length = model._model.history
    history_length = 8  # history length hardcoded here (the assertion-based version above reads it from the model)
    blank_history = tuple(None for _ in range(history_length - 1))
    internal_state = (cube._inner_cube, ) + blank_history
    initial_state = State(_internal_state=internal_state)
    self._mcts_agent = MCTSAgent(model._function(), initial_state, max_depth=100)

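# A minimal sketch (not part of the original code) of how the hardcoded history length
# above could fall back to the model's own setting once a model is loaded;
# `model._model.history` is the attribute read by the assertion-based constructor above,
# and `_resolve_history_length` is a hypothetical helper name.
def _resolve_history_length(model: CubeModel, default: int = 8) -> int:
    if getattr(model, "_model", None) is not None:
        return model._model.history
    return default
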
def cube_to_initial_mcts_state(cube: PuzzleCube, history: int) -> State:
    """Pad the cube with a blank history to form the initial MCTS state."""
    blank_history = tuple(None for _ in range(history - 1))
    internal_state = (cube._inner_cube, ) + blank_history
    return State(_internal_state=internal_state)

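# Usage sketch: the helper above collapses the history-padding boilerplate from the
# constructors to a single call. `make_agent` is a hypothetical wrapper written for
# illustration only; the MCTSAgent arguments mirror the constructor above.
def make_agent(cube: PuzzleCube, model: CubeModel, history_length: int = 8) -> MCTSAgent:
    initial_state = cube_to_initial_mcts_state(cube, history_length)
    return MCTSAgent(model._function(), initial_state, max_depth=100)
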
def play_game(self, model_policy_value, state=None, distance=None, evaluation_game=False):
    if distance is None:
        # choose distance
        lower_dist = int(self.training_distance_level)
        prob_of_increase = self.training_distance_level - lower_dist
        distance = lower_dist + np.random.choice(2, p=[1 - prob_of_increase, prob_of_increase])

        lower_dist_win_rate = 0. if self.recent_games[lower_dist] == 0 \
            else self.recent_wins[lower_dist] / self.recent_games[lower_dist]
        upper_dist_win_rate = 0. if self.recent_games[lower_dist + 1] == 0 \
            else self.recent_wins[lower_dist + 1] / self.recent_games[lower_dist + 1]
        print("(DB) distance:", distance,
              "(level: {:.2f} win rates: {}: {:.2f} {}: {:.2f})".format(
                  self.training_distance_level, lower_dist, lower_dist_win_rate,
                  lower_dist + 1, upper_dist_win_rate))

    if state is None:
        state = State()
        while state.done():
            state.reset_and_randomize(distance)

    mcts = MCTSAgent(model_policy_value, state,
                     max_depth=self.max_depth,
                     transposition_table=self.prebuilt_transposition_table.copy(),
                     c_puct=self.exploration,
                     gamma=self.decay)

    counter = 0
    win = True
    while not mcts.is_terminal():
        print("(DB) step:", counter)

        mcts.search(steps=self.max_steps)

        # find next state
        probs = mcts.action_probabilities(inv_temp=10)
        action = np.argmax(probs)
        # action = np.random.choice(12, p=probs)

        shortest_path = mcts.stats('shortest_path')

        if not evaluation_game:
            # record stats
            self.self_play_stats['_game_id'].append(self.game_number)
            self.self_play_stats['_step_id'].append(counter)
            # self.self_play_stats['state']  # find a better representation of the state (that is easy to import)
            self.self_play_stats['shortest_path'].append(shortest_path)
            self.self_play_stats['action'].append(action)
            self.self_play_stats['value'].append(mcts.stats('value'))
            self.self_play_stats['prior'].append(mcts.stats('prior'))
            self.self_play_stats['prior_dirichlet'].append(mcts.stats('prior_dirichlet'))
            self.self_play_stats['visit_counts'].append(mcts.stats('visit_counts'))
            self.self_play_stats['total_action_values'].append(mcts.stats('total_action_values'))

            # training data (also recorded in stats)
            self.training_data_states.append(mcts.initial_node.state.input_array())

            policy = mcts.action_probabilities(inv_temp=10)
            self.training_data_policies.append(policy)
            self.self_play_stats['updated_policy'].append(policy)

            self.training_data_values.append(0)  # updated if the game is a success
            self.self_play_stats['updated_value'].append(0)

        # prepare for next state
        counter += 1
        if shortest_path < 0 or counter >= self.max_game_length:
            win = False
            break
        mcts.advance_to_action(action)

    # update training values based on game results
    if not evaluation_game:
        if win:
            value = 1
            for i in range(counter):
                value *= self.decay
                self.training_data_values[-(i + 1)] = value
                self.self_play_stats['updated_value'][-(i + 1)] = value

        # record game stats
        self.game_stats['_game_id'].append(self.game_number)
        self.game_stats['distance_level'].append(self.training_distance_level)
        self.game_stats['training_distance'].append(distance)
        self.game_stats['max_game_length'].append(self.max_game_length)
        self.game_stats['win'].append(win)
        self.game_stats['total_steps'].append(counter if win else -1)

    # set up for next game
    self.game_number += 1
    if win:
        print("(DB)", "win")
    else:
        print("(DB)", "lose")

    if not evaluation_game:
        self.recent_wins[distance] += win
        self.recent_games[distance] += 1

        # update difficulty
        upper_dist = 0
        while True:
            upper_dist += 1
            if self.recent_wins[upper_dist] <= self.win_rate_target * self.recent_games[upper_dist]:
                break

        if upper_dist <= self.min_distance:
            self.training_distance_level = float(self.min_distance)
        else:
            lower_dist = upper_dist - 1
            lower_dist_win_rate = 0. if self.recent_games[lower_dist] == 0 \
                else self.recent_wins[lower_dist] / self.recent_games[lower_dist]
            upper_dist_win_rate = 0. if self.recent_games[lower_dist + 1] == 0 \
                else self.recent_wins[lower_dist + 1] / self.recent_games[lower_dist + 1]
            # note: no division by zero here, since upper_dist_win_rate <= win_rate_target < lower_dist_win_rate
            self.training_distance_level = lower_dist + \
                (lower_dist_win_rate - self.win_rate_target) / (lower_dist_win_rate - upper_dist_win_rate)

    return state, distance, win

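# Standalone sketch of the difficulty update at the end of play_game, extracted here for
# illustration only (same logic, not a replacement): find the first distance whose
# empirical win rate has dropped to the target, then linearly interpolate between it and
# the previous distance.
def updated_distance_level(recent_wins, recent_games, win_rate_target, min_distance):
    upper_dist = 0
    while True:
        upper_dist += 1
        if recent_wins[upper_dist] <= win_rate_target * recent_games[upper_dist]:
            break
    if upper_dist <= min_distance:
        return float(min_distance)
    lower_dist = upper_dist - 1
    # lower_dist failed the test above, so it has at least one recorded game
    lower_rate = recent_wins[lower_dist] / recent_games[lower_dist]
    upper_rate = 0. if recent_games[upper_dist] == 0 else recent_wins[upper_dist] / recent_games[upper_dist]
    # no division by zero: lower_rate > win_rate_target >= upper_rate
    return lower_dist + (lower_rate - win_rate_target) / (lower_rate - upper_rate)
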
def random_state(distance, history):
    state = State(random_depth=distance, history=history)
    while state.done():
        state = State(random_depth=distance, history=history)
    return state

def random_state(distance, history):
    state = State(history=history)
    while state.done():
        state.reset_and_randomize(distance)
    return state

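# Usage sketch: feed a freshly scrambled state into play_game via its `state` and
# `distance` arguments. The `trainer` object (whatever holds play_game) and the
# `model_policy_value` callable are hypothetical instances here.
def play_scrambled_game(trainer, model_policy_value, distance, history=8):
    state = random_state(distance, history)
    return trainer.play_game(model_policy_value, state=state, distance=distance)
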