def run(self, lr, milestones=False, silent=False):
    self.player1 = self.pretrained_player if self.pretrained_player else FCBaseLinePlayer(lr=lr)

    # Player 2 has the same start conditions as Player 1 but does not train
    self.player2 = self.player1.copy(shared_weights=True)
    self.player2.strategy.train = False

    games_per_evaluation = self.games // self.evaluations
    start_time = datetime.now()
    for episode in range(1, self.evaluations + 1):
        # If milestones exist, use one of them as the opponent with probability 0.2
        if self.milestones and random() < 0.2:
            self.player2 = choice(self.milestones)
        else:
            self.player2 = self.player1.copy(shared_weights=True)
        self.player2.strategy.train = False
        # The opponent may change between episodes, so rebuild the simulation each time
        self.simulation = TicTacToe([self.player1, self.player2])

        # train
        self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode
        results, losses = self.simulation.run_simulations(games_per_evaluation)
        self.add_loss(np.mean(losses))
        self.add_results(("Self", np.mean(results)))

        # evaluate
        self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
        score, results, overview = evaluate_against_base_players(self.player1)
        self.add_results(results)

        if not silent and Printer.print_episode(episode * games_per_evaluation, self.games,
                                                datetime.now() - start_time):
            self.plot_and_save(
                "%s vs SELF" % (self.player1.__str__() + (" milestones" if milestones else "")),
                "Train %s vs Self version of self\nGames: %s Evaluations: %s\nTime: %s"
                % (self.player1, episode * games_per_evaluation, self.evaluations,
                   config.time_diff(start_time)))

        # Each time another fifth of the training run is completed, save a milestone copy
        if milestones and (5 * episode) % self.evaluations == 0:
            self.milestones.append(self.player1.copy(shared_weights=False))
            self.milestones[-1].strategy.train = False

    self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
        self.player1, silent=False)
    return self
def run(self, lr, silent=False):
    self.player1 = self.pretrained_player if self.pretrained_player else FCBaseLinePlayer(lr=lr)

    if self.opponent is not None:
        self.player2 = self.opponent
        self.simulation = TicTacToe([self.player1, self.player2])

    games_per_evaluation = self.games // self.evaluations
    start_time = datetime.now()
    for episode in range(1, self.evaluations + 1):
        if self.opponent is None:
            self.player2 = choice((RandomPlayer(), NovicePlayer(), ExperiencedPlayer()))  # choice((RandomPlayer(), ExpertPlayer()))
            self.simulation = TicTacToe([self.player1, self.player2])

        # train
        self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode
        results, losses = self.simulation.run_simulations(games_per_evaluation)
        self.add_loss(np.mean(losses))
        self.add_results(("Training Results", np.mean(results)))

        # evaluate
        self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
        if self.opponent is None:
            score, results, overview = evaluate_against_base_players(self.player1)
        else:
            score, results, overview = evaluate_against_base_players(self.player1, evaluation_players=[self.opponent])
        self.add_results(results)

        if not silent:
            if Printer.print_episode(episode * games_per_evaluation, self.games,
                                     datetime.now() - start_time):
                overview = format_overview(overview)
                self.plot_and_save(
                    "%s vs TRADITIONAL OPPONENT" % (self.player1),
                    "Train %s vs %s\nGames: %s Evaluations: %s\nTime: %s"
                    % (self.player1, self.opponent, episode * games_per_evaluation,
                       self.evaluations, config.time_diff(start_time)))

    self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
        self.player1, silent=False)
    return self
def run(self, lr, termination_criterion, silent=False):
    self.player = FCReinforcePlayer(lr=lr)
    self.player.color = config.BLACK

    generator = RandomPlayer()

    print("Pretraining %s on legal moves" % self.player.__str__())

    losses, rewards = [], []
    start = datetime.now()
    for game in range(1, self.max_games + 1):
        loss, reward = self.__run_episode__(generator)
        losses.append(loss)
        rewards.append(reward)

        if not silent:
            if Printer.print_episode(game, self.max_games, datetime.now() - start):
                plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                    self.player.__class__.__name__, LAYERS, lr)
                plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                    game, reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        # Save a checkpoint every 10% of the maximum number of games
        if (100 * game / self.max_games) % 10 == 0:
            self.save_player(
                self.player,
                "using %s layers pretrained on legal moves for %s games lr: %s" % (LAYERS, self.max_games, lr))

        # Stop early once the last `termination_criterion` games contained only legal moves (average reward of 1)
        if game > termination_criterion and sum(rewards[-termination_criterion:]) / termination_criterion == 1:
            print("Reached training goal: %s games with only legal moves played -> terminating training."
                  % termination_criterion)
            self.save_player(
                self.player,
                "using %s layers pretrained on legal moves for %s games lr: %s" % (LAYERS, self.max_games, lr))
            return losses, rewards

    print("Reached max training_games (%s) -> terminating training" % self.max_games)
    self.save_player(
        self.player,
        "using %s layers pretrained on legal moves for %s games lr: %s" % (LAYERS, self.max_games, lr))
    return losses, rewards
def run(self, lr, silent=False):
    self.player1 = self.pretrained_player if self.pretrained_player else FCReinforcePlayer(lr=lr)

    # Player 2 has the same start conditions as Player 1 but does not train
    self.player2 = self.player1.copy(shared_weights=False)
    self.player2.strategy.train = False

    games_per_evaluation = self.games // self.evaluations
    self.replacements = []
    start_time = datetime.now()
    for episode in range(1, self.evaluations + 1):
        # train
        self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode
        self.simulation = TicTacToe([self.player1, self.player2])
        results, losses = self.simulation.run_simulations(games_per_evaluation)
        self.add_results(("Losses", np.mean(losses)))

        # evaluate
        self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
        score, results, overview = evaluate_against_base_players(self.player1)
        self.add_loss(np.mean(losses))
        self.add_results(results)

        if not silent and Printer.print_episode(episode * games_per_evaluation, self.games,
                                                datetime.now() - start_time):
            self.plot_and_save(
                "%s vs BEST" % (self.player1),
                "Train %s vs Best version of self\nGames: %s Evaluations: %s\nTime: %s"
                % (self.player1, episode * games_per_evaluation, self.evaluations,
                   config.time_diff(start_time)))

        # Replace the frozen "best" opponent whenever the training player beats it
        if evaluate_against_each_other(self.player1, self.player2):
            self.player2 = self.player1.copy(shared_weights=False)
            self.player2.strategy.train, self.player2.strategy.model.training = False, False
            self.replacements.append(episode)

    print("Best player replaced after episodes: %s" % self.replacements)
    self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
        self.player1, silent=False)
    return self
def run(self, lr, silent=False):
    EVALUATION_GAMES = 10

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    generator = RandomPlayer()
    color_iterator = self.AlternatingColorIterator()

    validation_set = self.generate_supervised_training_data(
        EVALUATION_GAMES, ExperiencedPlayer(deterministic=True, block_mid=True))

    print("Training ReinforcedPlayer supervised continuously with LR: %s" % lr)
    start = datetime.now()
    for game in range(self.games):
        rewards = []
        board = TicTacToeBoard()

        for i in range(9):
            expert_move = expert.get_move(board)
            player_move = player.get_move(board)

            reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
            rewards.append(reward)

            # prepare for next sample
            move = generator.get_move(board)
            board.apply_move(move, next(color_iterator))

        average_reward = sum(rewards) / len(rewards)
        player.strategy.rewards = rewards
        loss = player.strategy.update()
        del rewards[:]

        self.add_results([("Losses", loss), ("Reward", average_reward)])

        if game % self.evaluation_period == 0:
            test_rewards = []
            for board, expert_move in validation_set:
                # Evaluation mode
                player.strategy.train, player.strategy.model.training = False, False
                strategy_move = player.get_move(board)
                player.strategy.train, player.strategy.model.training = True, True

                test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                test_rewards.append(test_reward)

            average_test_reward = sum(test_rewards) / len(test_rewards)
            del test_rewards[:]
            self.add_results(("Test reward", average_test_reward))

        if not silent:
            if Printer.print_episode(game + 1, self.games, datetime.now() - start):
                plot_name = "Supervised Continuous training of %s" % (player)
                plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                    game + 1, average_reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward
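# `self.AlternatingColorIterator()` used above is provided by the base experiment class and is
# not shown in this listing. Below is a minimal sketch of what such an iterator presumably looks
# like, assuming only the config.BLACK / config.WHITE constants used throughout these experiments;
# the real implementation may differ.
import itertools


class AlternatingColorIterator:
    """Yield config.BLACK and config.WHITE alternately, forever."""

    def __init__(self):
        self._colors = itertools.cycle((config.BLACK, config.WHITE))

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._colors)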
from datetime import datetime
from random import random

import TicTacToe.config as conf  # assumed module path for the shared experiment config
from TicTacToe.experiments.ticTacToeBaseExperiment import TicTacToeBaseExperiment
from TicTacToe.environment.evaluation import evaluate_against_each_other
from TicTacToe.players.basePlayers import RandomPlayer, ExperiencedPlayer, ExpertPlayer


class EvaluatePlayer(TicTacToeBaseExperiment):
    config = conf

    def __init__(self):
        super(EvaluatePlayer, self).__init__()

    def reset(self):
        super().__init__()

    def run(self, player1, player2):
        evaluate_against_each_other(player1, player2, silent=False)


if __name__ == '__main__':
    START_TIME = datetime.now()

    LR = 1e-5 + random() * 1e-9
    PLAYER1 = ExperiencedPlayer()
    PLAYER2 = ExpertPlayer()  # RandomPlayer()

    experiment = EvaluatePlayer()
    experiment.run(player1=PLAYER1, player2=PLAYER2)

    print("\n| Evaluation completed, took %s |" % conf.time_diff(START_TIME))
def run(self, lr, silent=False):
    print("Training PGStrategy supervised on %s games for %s Episodes - LR: %s"
          % (self.games, self.episodes, lr))
    TEST_GAMES = 1

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    training_set = self.generate_supervised_training_data(self.games, expert)
    test_set = self.generate_supervised_training_data(TEST_GAMES, expert)

    start = datetime.now()
    for episode in range(self.episodes):
        rewards = []
        test_rewards = []

        for board, expert_move in training_set:
            # Training mode
            player.strategy.train, player.strategy.model.training = True, True
            strategy_move = player.get_move(board)

            reward = config.LABEL_WIN if expert_move == strategy_move else config.LABEL_LOSS
            rewards.append(reward)

        average_reward = sum(rewards) / len(rewards)
        player.strategy.rewards = rewards
        loss = player.strategy.update()

        for board, expert_move in test_set:
            # Evaluation mode
            player.strategy.train, player.strategy.model.training = False, False
            strategy_move = player.get_move(board)

            test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
            test_rewards.append(test_reward)

        average_test_reward = sum(test_rewards) / len(test_rewards)

        self.add_results([("Losses", loss), ("Average reward", average_reward),
                          ("Average test reward", average_test_reward)])

        if not silent:
            if Printer.print_episode(episode + 1, self.episodes, datetime.now() - start):
                plot_name = "Supervised on %s games lr: %s" % (self.games, lr)
                plot_info = "Lr: %s - %s Games - %s Episodes\nFinal Scores: %s / %s \nTime: %s" % (
                    lr, self.games, episode + 1, '{:.2f}'.format(average_reward),
                    '{:.2f}'.format(average_test_reward), config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward, average_test_reward