@classmethod
def generate_supervised_training_data(cls, games, labeling_strategy):
    """
    Generates training data by applying random moves to a board and labeling each sample with
    the move that the labeling_strategy would have taken given the board.

    :param games: The number of games to be simulated
    :param labeling_strategy: The strategy used to label each sample. The label equals labeling_strategy.get_move(board)
    :return: a list of tuples (board_sample, move_label)
    """
    labeling_strategy.color = cls.config.BLACK

    generator = RandomPlayer()
    color_iterator = TicTacToeBaseExperiment.AlternatingColorIterator()

    start = datetime.now()
    training_set = []
    for game in range(games):
        board = TicTacToeBoard()
        for i in range(9):
            # Generate a training pair: the current board and the labeling strategy's move for it
            expert_move = labeling_strategy.get_move(board)
            training_set.append((board.copy(), expert_move))

            # Prepare the next sample by applying a random move with alternating colors
            move = generator.get_move(board)
            board.apply_move(move, next(color_iterator))

    print("Generated %s training pairs from %s games in %s" % (len(training_set), games, datetime.now() - start))
    return training_set
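# A minimal usage sketch for generate_supervised_training_data (hypothetical: it assumes the
# method is exposed on TicTacToeBaseExperiment and that ExperiencedPlayer is importable here,
# as it is elsewhere in this codebase):
#
#     training_set = TicTacToeBaseExperiment.generate_supervised_training_data(
#         games=100,
#         labeling_strategy=ExperiencedPlayer(deterministic=True, block_mid=True))
#     boards, labels = zip(*training_set)  # unzip the (board_sample, move_label) tuples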
def test_DummyUpdate(self):
    board = TicTacToeBoard()
    value_function = PGStrategy(lr=0.001, weight_decay=0.003)
    random_player = RandomPlayer()

    # Evaluate the empty board, then alternate random moves and re-evaluate after each one
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))

    move = random_player.get_move(board)
    board.apply_move(move, config.BLACK)
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))

    move = random_player.get_move(board)
    board.apply_move(move, config.WHITE)
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))
def evaluate_against_base_players(player, evaluation_players=None, silent=True):
    """
    Standardized evaluation against base players.

    :param player: The player to be evaluated
    :param evaluation_players: A list of players against which the player should be evaluated.
                               Defaults to [RandomPlayer(), NovicePlayer(), ExperiencedPlayer(), ExpertPlayer()].
    :param silent: Flag controlling whether output is written to the console
    :return: a tuple (score, results, results_overview) where score is the average score over all
             evaluation games (a scalar in (-1, 1)), results is a list of tuples (name, score) where
             'score' is the score in (-1, 1) achieved against the player named 'name', and
             results_overview is a list of (name, Counter) outcome counts.
    """
    # Avoid a mutable default argument: build the default opponents fresh on each call
    if evaluation_players is None:
        evaluation_players = [RandomPlayer(), NovicePlayer(), ExperiencedPlayer(), ExpertPlayer()]

    # Store original training flags so they can be restored after evaluation
    if isinstance(player, LearningPlayer):
        training_values = player.strategy.train, player.strategy.model.training
        player.strategy.train, player.strategy.model.training = False, False

    results = []
    for e_player in evaluation_players:
        simulation = TicTacToe([player, e_player])
        rewards, losses = simulation.run_simulations(config.EVALUATION_GAMES)
        results.append([str(e_player), rewards])

        if not silent:
            print_results(player, e_player, rewards)

    # Restore original training flags
    if isinstance(player, LearningPlayer):
        player.strategy.train, player.strategy.model.training = training_values

    avg_results = [(result[0], np.mean(result[1])) for result in results]
    # Insert the average overall score as the first element of the results
    avg_results.insert(0, ("Total Score", np.mean([res[1] for res in avg_results])))

    results_overview = deepcopy(results)
    total = Counter()
    for entry in results_overview:
        entry[1] = Counter(entry[1])
        total += entry[1]
    # Insert the combined outcome counts as the first element of the overview
    results_overview.insert(0, ("[Total Score]", total))

    if not silent:
        print("Overall score: %s" % avg_results[0][1])

    return avg_results[0][1], avg_results, results_overview
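# A minimal usage sketch for evaluate_against_base_players (hypothetical: `agent` stands for
# any trained player instance from this codebase, e.g. an FCReinforcePlayer):
#
#     score, avg_results, overview = evaluate_against_base_players(agent, silent=False)
#     for name, avg in avg_results:
#         print("%s: %s" % (name, avg))  # "Total Score" first, then per-opponent averages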
def run(self, lr, silent=False):
    self.player1 = self.pretrained_player if self.pretrained_player else FCBaseLinePlayer(lr=lr)

    if self.opponent is not None:
        self.player2 = self.opponent
        self.simulation = TicTacToe([self.player1, self.player2])

    games_per_evaluation = self.games // self.evaluations
    start_time = datetime.now()
    for episode in range(1, self.evaluations + 1):
        if self.opponent is None:
            # Sample a fresh traditional opponent for each training episode
            self.player2 = choice((RandomPlayer(), NovicePlayer(), ExperiencedPlayer()))  # choice((RandomPlayer(), ExpertPlayer()))
            self.simulation = TicTacToe([self.player1, self.player2])

        # Train
        self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode
        results, losses = self.simulation.run_simulations(games_per_evaluation)
        self.add_loss(np.mean(losses))
        self.add_results(("Training Results", np.mean(results)))

        # Evaluate
        self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
        if self.opponent is None:
            score, results, overview = evaluate_against_base_players(self.player1)
        else:
            score, results, overview = evaluate_against_base_players(self.player1, evaluation_players=[self.opponent])
        self.add_results(results)

        if not silent:
            if Printer.print_episode(episode * games_per_evaluation, self.games, datetime.now() - start_time):
                overview = format_overview(overview)
                self.plot_and_save(
                    "%s vs TRADITIONAL OPPONENT" % self.player1,
                    "Train %s vs %s\nGames: %s Evaluations: %s\nTime: %s"
                    % (self.player1, self.opponent, episode * games_per_evaluation, self.evaluations, config.time_diff(start_time)))

    self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(self.player1, silent=False)
    return self
def run(self, lr, termination_criterion, silent=False):
    self.player = FCReinforcePlayer(lr=lr)
    self.player.color = config.BLACK

    generator = RandomPlayer()

    print("Pretraining %s on legal moves" % str(self.player))
    losses, rewards = [], []
    start = datetime.now()
    for game in range(1, self.max_games + 1):
        loss, reward = self.__run_episode__(generator)
        losses.append(loss)
        rewards.append(reward)

        if not silent:
            if Printer.print_episode(game, self.max_games, datetime.now() - start):
                plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (self.player.__class__.__name__, LAYERS, lr)
                plot_info = "%s Games - Final reward: %s \nTime: %s" % (game, reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        # Checkpoint the player every 10% of max_games
        if (100 * game / self.max_games) % 10 == 0:
            self.save_player(self.player, "using %s layers pretrained on legal moves for %s games lr: %s" % (LAYERS, self.max_games, lr))

        # Terminate once the last `termination_criterion` games all achieved the maximum reward
        if game > termination_criterion and sum(rewards[-termination_criterion:]) / termination_criterion == 1:
            print("Reached training goal: %s games with only legal moves played -> terminating training." % termination_criterion)
            self.save_player(self.player, "using %s layers pretrained on legal moves for %s games lr: %s" % (LAYERS, self.max_games, lr))
            return losses, rewards

    print("Reached max training games (%s) -> terminating training" % self.max_games)
    self.save_player(self.player, "using %s layers pretrained on legal moves for %s games lr: %s" % (LAYERS, self.max_games, lr))
    return losses, rewards
def test_neverLose(self):
    GAMES = 10000
    player1 = SearchPlayer()
    player2 = SearchPlayer()
    random_player = RandomPlayer()

    # A perfect player playing itself must always draw
    simulation = TicTacToe([player1, player2])
    results, losses = simulation.run_simulations(GAMES // 100)
    self.assertEqual(len(results), results.count(0), "Perfect player mirror match resulted in a result other than draw")
    print("\nFirst 20 results: %s against self" % results[:20])

    # A perfect player must never lose to a random player
    simulation = TicTacToe([player1, random_player])
    results, losses = simulation.run_simulations(GAMES)
    self.assertEqual(0, results.count(-1), "Perfect player lost against random")
    print("First 20 results: %s against random player" % results[:20])
    print("Win rate: %s vs random player" % (sum(results) / len(results)))
def test_ConvReinforcePlayer(self):
    fc_player = FCReinforcePlayer(lr=1e-4)
    random_player = RandomPlayer()
    simulation = TicTacToe([fc_player, random_player])
    simulation.run_simulations(100)
def test_DummyTrainReinforcePlayer(self):
    player1 = FCReinforcePlayer(lr=0.001)
    player2 = RandomPlayer()
    simulation = TicTacToe([player1, player2])
    simulation.run_simulations(10)
def run(self, lr, silent=False):
    EVALUATION_GAMES = 10

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    generator = RandomPlayer()
    color_iterator = self.AlternatingColorIterator()

    validation_set = self.generate_supervised_training_data(EVALUATION_GAMES, ExperiencedPlayer(deterministic=True, block_mid=True))

    print("Training ReinforcedPlayer supervised continuously with LR: %s" % lr)
    start = datetime.now()
    for game in range(self.games):
        rewards = []
        board = TicTacToeBoard()
        for i in range(9):
            expert_move = expert.get_move(board)
            player_move = player.get_move(board)

            # Reward the player for matching the expert's move
            reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
            rewards.append(reward)

            # Prepare the next sample by applying a random move with alternating colors
            move = generator.get_move(board)
            board.apply_move(move, next(color_iterator))

        average_reward = sum(rewards) / len(rewards)
        player.strategy.rewards = rewards
        loss = player.strategy.update()
        del rewards[:]

        self.add_results([("Losses", loss), ("Reward", average_reward)])

        if game % self.evaluation_period == 0:
            test_rewards = []
            for board, expert_move in validation_set:
                # Evaluation mode
                player.strategy.train, player.strategy.model.training = False, False
                strategy_move = player.get_move(board)
                player.strategy.train, player.strategy.model.training = True, True

                test_reward = config.LABEL_WIN if expert_move == strategy_move else config.LABEL_LOSS
                test_rewards.append(test_reward)

            average_test_reward = sum(test_rewards) / len(test_rewards)
            del test_rewards[:]
            self.add_results(("Test reward", average_test_reward))

        if not silent:
            if Printer.print_episode(game + 1, self.games, datetime.now() - start):
                plot_name = "Supervised Continuous training of %s" % player
                plot_info = "%s Games - Final reward: %s \nTime: %s" % (game + 1, average_reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward