# Imports used by the code in this section; Hex, Actor and MCTS are assumed to
# come from the surrounding project.
import itertools
import os
import pickle
import random
import time
from collections import deque

import numpy as np


def play_series(x):
    # The argument is unused; `rollouts`, `games_per_series` and
    # `replay_save_interval` are module-level constants defined elsewhere.
    game = Hex()
    actor = Actor(game, [],
                  replay_file='model/replays_expert.txt',
                  rp_save_interval=replay_save_interval)
    mcts = MCTS(game, simulations=rollouts)
    for i in range(games_per_series):
        print(f'Starting game {i + 1}')
        state = game.get_initial_state()
        mcts.set_state(state)
        while not game.is_finished(state):
            move, probabilities = mcts.select_move(True)
            padded_probs = np.pad(
                probabilities,
                (0, game.num_possible_moves() - len(probabilities)),
                'constant')
            actor.add_to_replay_buffer(state, padded_probs)
            state = game.get_outcome_state(state, move)
            mcts.set_state(state)
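
# Usage sketch (illustrative only): the constant values below are assumptions,
# not the settings used to generate the expert replays.
#
#   rollouts = 1000
#   games_per_series = 50
#   replay_save_interval = 250
#   play_series(0)
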
class ActorTrainer:
    def __init__(self,
                 game,
                 checkpoint_directory,
                 actor=None,
                 network_save_interval=100,
                 rollouts=100,
                 start_game=0,
                 replay_save_interval=250,
                 replay_limit=20000,
                 minibatch_size=50,
                 replay_file=None,
                 test_games=50,
                 nn_steps=1):
        self.game = game
        self.checkpoint_directory = checkpoint_directory
        self.network_save_interval = network_save_interval
        self.mcts = MCTS(game,
                         simulations=rollouts,
                         default_policy=self.create_default_policy())
        self.game_count = start_game
        self.replay_save_interval = replay_save_interval
        self.replay_buffer = deque(maxlen=replay_limit)
        self.rp_count = 0
        self.minibatch_size = minibatch_size
        self.test_games = test_games
        self.nn_steps = nn_steps

        if replay_file == 'auto':
            self.replay_file = f'{checkpoint_directory}/replays.txt'
        else:
            self.replay_file = replay_file

        if not os.path.exists(checkpoint_directory):
            os.makedirs(checkpoint_directory)

        if actor:
            self.actor = actor
            self.save_actor_to_file()
        else:
            self.actor = self.load_actor_from_file()

        if start_game > 0:
            self.actor.load_checkpoint(
                f'{checkpoint_directory}/game_{start_game}')

        if replay_save_interval > replay_limit:
            raise ValueError(
                f'replay_save_interval ({replay_save_interval}) must be smaller '
                f'than replay_limit ({replay_limit})')

        if replay_file is not None and replay_file != 'auto':
            try:
                self.load_replays()
            except FileNotFoundError:
                pass

        if start_game == 0:
            self.actor.save_checkpoint(checkpoint_directory + '/game_0')
            self.actor.save_checkpoint(checkpoint_directory + '/best')
            with open(checkpoint_directory + '/best.txt', 'w') as f:
                f.write(str(0))

    def train(self, num_games):
        for i in range(num_games):
            self.game_count += 1
            game_start_time = time.time()

            print(f'[GAME {self.game_count}] Initializing state')
            state = self.game.get_initial_state()
            self.mcts.set_state(state)

            print(f'[GAME {self.game_count}] Simulating game')
            while not self.game.is_finished(state):
                move, probabilities = self.mcts.select_move(True)
                padded_probs = np.pad(
                    probabilities,
                    (0, self.game.num_possible_moves() - len(probabilities)),
                    'constant')
                self.add_to_replay_buffer(state, padded_probs)
                state = self.game.get_outcome_state(state, move)
                self.mcts.set_state(state)

            print(f'[GAME {self.game_count}] Training neural network')
            for j in range(self.nn_steps):
                self.train_network()

            if self.game_count % self.network_save_interval == 0:
                print(f'[GAME {self.game_count}] Saving neural network checkpoint')
                self.actor.save_checkpoint(
                    f'{self.checkpoint_directory}/game_{self.game_count}')

            if self.test_against_best():
                print(f'[GAME {self.game_count}] New best found - saving checkpoint')

            print(f'[GAME {self.game_count}] Time elapsed: '
                  f'{time.time() - game_start_time:.2f}')
            print()

    def test_against_best(self):
        if self.test_games <= 0:
            return False
        print(f'[GAME {self.game_count}] Testing against best model...', end='')
        best_actor = self.load_actor_from_file()
        best_actor.load_checkpoint(f'{self.checkpoint_directory}/best')
        starting = True
        wins = 0
        for i in range(self.test_games):
            turn = starting
            state = self.game.get_initial_state()
            while not self.game.is_finished(state):
                if turn:
                    move = self.actor.select_move(state)
                else:
                    move = best_actor.select_move(state)
                state = self.game.get_outcome_state(state, move[0])
                turn = not turn
            result = self.game.evaluate_state(state)
            if (result == 1 and starting) or (result == -1 and not starting):
                wins += 1
            starting = not starting
        print(f'won {wins}/{self.test_games}')
        if wins > self.test_games / 2:
            self.actor.save_checkpoint(self.checkpoint_directory + '/best')
            with open(self.checkpoint_directory + '/best.txt', 'w') as f:
                f.write(str(self.game_count))
            return True
        return False

    def train_network(self):
        minibatch = random.sample(
            self.replay_buffer,
            min(self.minibatch_size, len(self.replay_buffer)))
        for i in range(len(minibatch)):
            minibatch[i] = (self.game.format_for_nn(
                minibatch[i][0], format=self.actor.format), minibatch[i][1])
        self.actor.network.train(minibatch=minibatch)

    def create_default_policy(self):
        def actor_default_policy(state, moves):
            move = self.actor.select_move(state, stochastic=True)
            return move

        return actor_default_policy

    def add_to_replay_buffer(self, state, probabilities):
        self.replay_buffer.append((state, probabilities))
        self.rp_count += 1
        if (self.replay_save_interval != -1
                and self.rp_count % self.replay_save_interval == 0
                and self.rp_count != 0):
            replays = len(self.replay_buffer)
            self.save_replays(
                itertools.islice(self.replay_buffer,
                                 replays - self.replay_save_interval, replays))

    def save_replays(self, replays):
        if self.replay_file is None:
            return
        with open(self.replay_file, 'a') as f:
            for replay in replays:
                state_string = ','.join(map(str, replay[0][0])) + ',' + str(
                    replay[0][1])
                probs_string = ','.join(map(str, replay[1]))
                rp_string = state_string + ';' + probs_string
                f.write(rp_string + '\n')

    def load_replays(self):
        with open(self.replay_file, 'r') as f:
            for line in f:
                state, probs = line.split(';')
                state = list(map(int, state.split(',')))
                player = state[-1]
                board = state[:-1]
                probs = list(map(float, probs.split(',')))
                self.replay_buffer.append(((board, player), probs))

    def load_actor_from_file(self):
        with open(f'{self.checkpoint_directory}/actor_params.txt') as f:
            lines = f.read().split('\n')
            format = lines[0]
            optimizer = 'adam'
            if len(lines) > 1:
                optimizer = lines[1]
        with open(f'{self.checkpoint_directory}/actor_layers.bin', 'rb') as f:
            layers = pickle.load(f)
        return Actor(self.game, layers, format=format, optimizer=optimizer)

    def save_actor_to_file(self):
        with open(f'{self.checkpoint_directory}/actor_params.txt', 'w') as f:
            f.write(self.actor.format + '\n')
            f.write(self.actor.optimizer)
        with open(f'{self.checkpoint_directory}/actor_layers.bin', 'wb') as f:
            pickle.dump(self.actor.layers, f)
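
# Usage sketch for ActorTrainer (illustrative values, not the settings used for
# the shipped checkpoints). Hex() and Actor(game, []) mirror the constructors
# used in play_series above; 'model/example_run' is a placeholder directory.
#
#   game = Hex()
#   trainer = ActorTrainer(game, 'model/example_run', actor=Actor(game, []),
#                          rollouts=100, replay_file='auto', test_games=50)
#   trainer.train(num_games=200)
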
class BasicClientActor(BasicClientActorAbs):
    def __init__(self, ip_address=None, verbose=True, auto_test=False):
        self.series_id = -1
        self.starting_player = -1
        self.game_count = 0
        self.series_count = 0
        self.series_game_count = 0
        BasicClientActorAbs.__init__(self,
                                     ip_address,
                                     verbose=verbose,
                                     auto_test=auto_test)
        trainer = ActorTrainer(self.hex,
                               'model/1000x500x100-200',
                               start_game=250)
        # self.actor = trainer.actor
        self.actor = MCTS(self.hex, simulations=100)

    def handle_get_action(self, state):
        """
        Here you will use the neural net that you trained using MCTS to select a
        move for your actor on the current board. Remember to use the correct
        player_number for YOUR actor! The default action is to select a random
        empty cell on the board. This should be modified.
        :param state: The current board in the form
                      (1 or 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                      where 1 or 2 indicates the number of the current player.
                      If you are player 2 in the current series, for example, then
                      you will see a 2 here throughout the entire series, whereas
                      player 1 will see a 1.
        :return: Your actor's selected action as a tuple (row, column)
        """
        current_player = state[0] - 1
        board = list(state[1:])
        state = (board, current_player)
        # next_move = self.actor.select_move(state)[0][0]
        self.actor.set_state(state)
        next_move = self.actor.select_move()[0]
        return next_move

    def handle_series_start(self, unique_id, series_id, player_map, num_games,
                            game_params):
        """
        Set the player_number of our actor, so that we can tell our MCTS which
        actor we are.
        :param unique_id: integer identifier for the player within the whole tournament database
        :param series_id: (1 or 2) indicating which player this will be for the ENTIRE series
        :param player_map: a list of tuples: (unique-id series-id) for all players in a series
        :param num_games: number of games to be played in the series
        :param game_params: important game parameters. For Hex = list with one item = board size (e.g. 5)
        :return:
        """
        self.series_id = series_id
        self.series_count += 1
        print(f'Series {self.series_count} starting')
        print(f'Series ID: {series_id}')
        self.series_game_count = 0

    def handle_game_start(self, start_player):
        """
        :param start_player: The starting player number (1 or 2) for this particular game.
        :return:
        """
        self.starting_player = start_player
        self.game_count += 1
        self.series_game_count += 1
        print(f'Game {self.game_count} starting. '
              f'(Game {self.series_game_count} in series.)')

    def handle_game_over(self, winner, end_state):
        """
        Here you can decide how to handle what happens when a game finishes. The
        default action is to print the winner and the end state.
        :param winner: Winner ID (1 or 2)
        :param end_state: Final state of the board.
        :return:
        """
        print()
        print("Game over, these are the stats:")
        print('Winner: ' + str(winner))
        print('End state:')
        self.print_state(end_state)

    def handle_series_over(self, stats):
        """
        Here you can handle the series end in any way you want; the initial
        handling just prints the stats.
        :param stats: The actor statistics for a series = list of tuples
                      [(unique_id, series_id, wins, losses)...]
:return: """ ############################# # # # YOUR CODE HERE # # ############################# print("Series ended, these are the stats:") print(f'Series ID: {self.series_id}') for stat in stats: if stat[1] == self.series_id: # Found my stats print( f'Won {stat[2]}/{stat[2] + stat[3]} ({stat[2]/(stat[2]+stat[3]):.0%})' ) print() # print(str(stats)) def handle_tournament_over(self, score): """ Here you can decide to do something when a tournament ends. The default action is to print the received score. :param score: The actor score for the tournament :return: """ ############################# # # # YOUR CODE HERE # # ############################# print("Tournament over. Your score was: " + str(score)) def handle_illegal_action(self, state, illegal_action): """ Here you can handle what happens if you get an illegal action message. The default is to print the state and the illegal action. :param state: The state :param illegal_action: The illegal action :return: """ ############################# # # # YOUR CODE HERE # # ############################# print("An illegal action was attempted:") print('State: ' + str(state)) print('Action: ' + str(illegal_action))