Example no. 1
    def run(self, lr, termination_criterion, silent=False):
        self.player = FCReinforcePlayer(lr=lr)
        self.player.color = config.BLACK

        generator = RandomPlayer()
        print("Pretraining %s on legal moves" % self.player.__str__())

        losses, rewards = [], []
        start = datetime.now()
        for game in range(1, self.max_games + 1):
            loss, reward = self.__run_episode__(generator)
            losses.append(loss)
            rewards.append(reward)

            if not silent:
                if Printer.print_episode(game, self.max_games,
                                         datetime.now() - start):
                    plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                        self.player.__class__.__name__, LAYERS, lr)
                    plot_info = "%sGames - Final reward: %s \nTime: %s" % (
                        game, reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)
                    if (100 * game / self.max_games) % 10 == 0:
                        self.save_player(
                            self.player,
                            "using %s layers pretrained on legal moves for %s games lr: %s"
                            % (LAYERS, self.max_games, lr))

            if game > termination_criterion and sum(rewards[
                    -termination_criterion:]) / termination_criterion == 1:
                print(
                    "Reached training goal: %s games with only legal moves played -> terminating training."
                    % termination_criterion)
                self.save_player(
                    self.player,
                    "using %s layers pretrained on legal moves for %s games lr: %s"
                    % (LAYERS, self.max_games, lr))
                return losses, rewards

        print("Reached max training_games (%s) -> terminating training" %
              self.max_games)
        self.save_player(
            self.player,
            "using %s layers pretrained on legal moves for %s games lr: %s" %
            (LAYERS, self.max_games, lr))
        return losses, rewards
Example no. 2
    def run(self, lr, silent=False):

        self.player1 = self.pretrained_player if self.pretrained_player else FCReinforcePlayer(
            lr=lr)

        if self.opponent is not None:
            self.player2 = self.opponent
            self.simulation = TicTacToe([self.player1, self.player2])

        games_per_evaluation = self.games // self.evaluations
        start_time = datetime.now()
        for episode in range(1, self.evaluations + 1):

            if self.opponent is None:
                self.player2 = choice(
                    (RandomPlayer(), NovicePlayer(), ExperiencedPlayer()
                     ))  # choice((RandomPlayer(), ExpertPlayer()))
                self.simulation = TicTacToe([self.player1, self.player2])

            # train
            self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode

            results, losses = self.simulation.run_simulations(
                games_per_evaluation)
            self.add_loss(np.mean(losses))
            self.add_results(("Training Results", np.mean(results)))

            # evaluate
            self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
            if self.opponent is None:
                score, results, overview = evaluate_against_base_players(
                    self.player1)
            else:
                score, results, overview = evaluate_against_base_players(
                    self.player1, evaluation_players=[self.opponent])
            self.add_results(results)

            if not silent:
                if Printer.print_episode(episode * games_per_evaluation,
                                         self.games,
                                         datetime.now() - start_time):
                    self.plot_and_save(
                        "%s vs TRADITIONAL OPPONENT" % (self.player1),
                        "Train %s vs %s\nGames: %s Evaluations: %s\nTime: %s" %
                        (self.player1, self.opponent,
                         episode * games_per_evaluation, self.evaluations,
                         config.time_diff(start_time)))

        self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
            self.player1, silent=False)
        return self
Example no. 3
    def run(self, lr, silent=False):
        self.player1 = self.pretrained_player if self.pretrained_player else FCReinforcePlayer(
            lr=lr)

        # Player 2 has the same start conditions as Player 1 but does not train
        self.player2 = self.player1.copy(shared_weights=False)
        self.player2.strategy.train = False

        games_per_evaluation = self.games // self.evaluations
        self.replacements = []
        start_time = datetime.now()
        for episode in range(1, self.evaluations + 1):
            # train
            self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode

            self.simulation = TicTacToe([self.player1, self.player2])
            results, losses = self.simulation.run_simulations(
                games_per_evaluation)
            self.add_results(("Losses", np.mean(losses)))

            # evaluate
            self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
            score, results, overview = evaluate_against_base_players(
                self.player1)
            self.add_loss(np.mean(losses))
            self.add_results(results)

            if not silent and Printer.print_episode(
                    episode * games_per_evaluation, self.games,
                    datetime.now() - start_time):
                self.plot_and_save(
                    "%s vs BEST" % (self.player1),
                    "Train %s vs Best version of self\nGames: %s Evaluations: %s\nTime: %s"
                    % (self.player1, episode * games_per_evaluation,
                       self.evaluations, config.time_diff(start_time)))

            # promote a frozen copy of player1 to be the new best opponent when the head-to-head evaluation favors it
            if evaluate_against_each_other(self.player1, self.player2):
                self.player2 = self.player1.copy(shared_weights=False)
                self.player2.strategy.train, self.player2.strategy.model.training = False, False
                self.replacements.append(episode)

        print("Best player replaced after episodes: %s" % self.replacements)
        self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
            self.player1, silent=False)
        return self
Example no. 4
    def run(self):
        if VS_TRADITIONAL:
            # ACTOR CRITIC
            if AC:
                for player in [FCACPlayer(LR)]:
                    experiment = TrainACPlayerVsTraditionalOpponent(
                        games=GAMES,
                        evaluations=EVALUATIONS,
                        pretrained_player=player,
                        opponent=None)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()

            # BASELINE
            if BASELINE:
                for player in [FCBaseLinePlayer(LR)]:
                    experiment = TrainBaselinePlayerVsTraditionalOpponent(
                        games=GAMES,
                        evaluations=EVALUATIONS,
                        pretrained_player=player,
                        opponent=None)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()

            # REINFORCE
            if REINFORCE:
                for player in [FCReinforcePlayer(LR)]:
                    experiment = TrainReinforcePlayerVsTraditionalOpponent(
                        games=GAMES,
                        evaluations=EVALUATIONS,
                        pretrained_player=player,
                        opponent=None)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()

        if VS_BEST:
            # ACTOR CRITIC
            if AC:
                for player in [FCACPlayer(LR)]:
                    experiment = TrainACPlayerVsBest(games=GAMES,
                                                     evaluations=EVALUATIONS,
                                                     pretrained_player=player)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()

            # BASELINE
            if BASELINE:
                for player in [FCBaseLinePlayer(LR)]:
                    experiment = TrainBaselinePlayerVsBest(
                        games=GAMES,
                        evaluations=EVALUATIONS,
                        pretrained_player=player)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()
                    experiment.run(lr=LR, milestones=True)

            # REINFORCE
            if REINFORCE:
                for player in [FCReinforcePlayer(LR)]:
                    experiment = TrainReinforcePlayerVsBest(
                        games=GAMES,
                        evaluations=EVALUATIONS,
                        pretrained_player=player)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()

        if VS_SELF:
            # ACTOR CRITIC
            """ Not yet implemented
            if AC:
                for player in [FCACPlayer(LR)]:
                    experiment = TrainACPlayerVsSelf(games=GAMES, evaluations=EVALUATIONS, pretrained_player=player)
                    print("\n|| ----- Running %s with %s ----- ||" % (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()
            """

            # BASELINE
            if BASELINE:
                for player in [FCBaseLinePlayer(LR)]:
                    experiment = TrainBaselinePlayerVsSelf(
                        games=GAMES,
                        evaluations=EVALUATIONS,
                        pretrained_player=player)
                    print("\n|| ----- Running %s with %s ----- ||" %
                          (experiment, player))
                    experiment.run(lr=LR)
                    experiment.reset()
                    experiment.run(lr=LR, milestones=True)

            # REINFORCE
            """ Not yet implemented
Example no. 5
    def test_ConvReinforcePlayer(self):
        fc_player = FCReinforcePlayer(lr=1e-4)
        random_player = RandomPlayer()

        simulation = TicTacToe([fc_player, random_player])
        simulation.run_simulations(100)
Example no. 6
    def test_DummyTrainReinforcePlayer(self):
        player1 = FCReinforcePlayer(lr=0.001)
        player2 = RandomPlayer()

        simulation = TicTacToe([player1, player2])
        simulation.run_simulations(10)
Example no. 7
    def test_CreateReinforcementPlayer(self):
        FCReinforcePlayer(lr=0.001)
Example no. 8
    def test_evaluation(self):
        p1 = ttt_players.RandomPlayer()
        evaluate_against_base_players(p1, silent=False)

        p2 = FCReinforcePlayer(lr=1e-5)
        evaluate_against_base_players(p2, silent=False)
Example no. 9
    def run(self, lr, silent=False):

        EVALUATION_GAMES = 10

        player = FCReinforcePlayer(lr=lr)
        player.color = config.BLACK

        expert = ExperiencedPlayer(deterministic=True, block_mid=True)
        expert.color = config.BLACK

        generator = RandomPlayer()
        color_iterator = self.AlternatingColorIterator()

        validation_set = self.generate_supervised_training_data(
            EVALUATION_GAMES,
            ExperiencedPlayer(deterministic=True, block_mid=True))

        print("Training ReinforcedPlayer supervised continuously with LR: %s" %
              lr)
        start = datetime.now()
        for game in range(self.games):
            rewards = []
            board = TicTacToeBoard()

            for i in range(9):
                expert_move = expert.get_move(board)
                player_move = player.get_move(board)

                reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
                rewards.append(reward)

                # prepare for next sample
                move = generator.get_move(board)
                board.apply_move(move, next(color_iterator))

            average_reward = sum(rewards) / len(rewards)
            player.strategy.rewards = rewards
            loss = player.strategy.update()

            del rewards[:]
            self.add_results([("Losses", loss), ("Reward", average_reward)])

            if game % self.evaluation_period == 0:
                test_rewards = []
                for board, expert_move in validation_set:
                    # Evaluation mode
                    player.strategy.train, player.strategy.model.training = False, False
                    strategy_move = player.get_move(board)
                    player.strategy.train, player.strategy.model.training = True, True

                    test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                    test_rewards.append(test_reward)

                average_test_reward = sum(test_rewards) / len(test_rewards)
                del test_rewards[:]
                self.add_results(("Test reward", average_test_reward))

            if not silent:
                if Printer.print_episode(game + 1, self.games,
                                         datetime.now() - start):
                    plot_name = "Supervised Continuous training of %s" % (
                        player)
                    plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                        game + 1, average_reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        return average_reward
Example no. 10
class PretrainLegalMoves(TicTacToeBaseExperiment):
    """
    Trains a player on a continuously generated random data set so that it learns to play only legal moves.

    The data set is generated by a random player. Training terminates either after :param max_games games have been played or once the player has made no illegal move in the last :param termination_criterion games.
    """
    def __init__(self, max_games):
        super(PretrainLegalMoves, self).__init__()
        self.max_games = max_games

    def reset(self):
        self.__init__(self.max_games)
        return self

    def run(self, lr, termination_criterion, silent=False):
        self.player = FCReinforcePlayer(lr=lr)
        self.player.color = config.BLACK

        generator = RandomPlayer()
        print("Pretraining %s on legal moves" % self.player.__str__())

        losses, rewards = [], []
        start = datetime.now()
        for game in range(1, self.max_games + 1):
            loss, reward = self.__run_episode__(generator)
            losses.append(loss)
            rewards.append(reward)

            if not silent:
                if Printer.print_episode(game, self.max_games,
                                         datetime.now() - start):
                    plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                        self.player.__class__.__name__, LAYERS, lr)
                    plot_info = "%sGames - Final reward: %s \nTime: %s" % (
                        game, reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)
                    # save an intermediate checkpoint each time training progress hits an exact 10% multiple of max_games
                    if (100 * game / self.max_games) % 10 == 0:
                        self.save_player(
                            self.player,
                            "using %s layers pretrained on legal moves for %s games lr: %s"
                            % (LAYERS, self.max_games, lr))

            if game > termination_criterion and sum(rewards[
                    -termination_criterion:]) / termination_criterion == 1:
                print(
                    "Reached training goal: %s games with only legal moves played -> terminating training."
                    % termination_criterion)
                self.save_player(
                    self.player,
                    "using %s layers pretrained on legal moves for %s games lr: %s"
                    % (LAYERS, self.max_games, lr))
                return losses, rewards

        print("Reached max training_games (%s) -> terminating training" %
              self.max_games)
        self.save_player(
            self.player,
            "using %s layers pretrained on legal moves for %s games lr: %s" %
            (LAYERS, self.max_games, lr))
        return losses, rewards

    def __run_episode__(self, generator):
        player = self.player

        rewards = []
        color_iterator = self.AlternatingColorIterator()
        board = TicTacToeBoard()
        for i in range(9):
            player_move = player.get_move(board)

            # Win if predicted move is legal, loss otherwise
            reward = config.LABEL_WIN if player_move in board.get_valid_moves(
                player.color) else config.LABEL_LOSS
            rewards.append(reward)

            # prepare for next sample
            board.apply_move(generator.get_move(board),
                             next(color_iterator))

        loss = player.strategy.update()
        player.strategy.rewards = []

        average_reward = np.mean(rewards)
        del rewards[:]
        self.add_results([("Losses", loss), ("Score", average_reward)])

        return loss, average_reward
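
For reference, a minimal sketch of how this experiment might be driven, assuming the PretrainLegalMoves class above; the values for max_games, lr, and termination_criterion are purely illustrative, not taken from the source:

# Hypothetical driver for the PretrainLegalMoves experiment defined above
experiment = PretrainLegalMoves(max_games=100000)

# run() returns the per-game losses and rewards; it stops early once the last
# 1000 games contained only legal moves, otherwise after max_games games.
losses, rewards = experiment.run(lr=1e-4, termination_criterion=1000)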
Example no. 11
    def run(self, lr, silent=False):

        print(
            "Training PGStrategy supervised on %s games for %s Episodes - LR: %s"
            % (self.games, self.episodes, lr))
        TEST_GAMES = 1

        player = FCReinforcePlayer(lr=lr)
        player.color = config.BLACK

        expert = ExperiencedPlayer(deterministic=True, block_mid=True)
        expert.color = config.BLACK

        training_set = self.generate_supervised_training_data(
            self.games, expert)
        test_set = self.generate_supervised_training_data(TEST_GAMES, expert)

        start = datetime.now()
        for episode in range(self.episodes):
            rewards = []
            test_rewards = []

            for board, expert_move in training_set:
                # Training mode
                player.strategy.train, player.strategy.model.training = True, True

                strategy_move = player.get_move(board)
                reward = config.LABEL_WIN if expert_move == strategy_move else config.LABEL_LOSS
                rewards.append(reward)

            average_reward = sum(rewards) / len(rewards)
            player.strategy.rewards = rewards
            loss = player.strategy.update()

            for board, expert_move in test_set:
                # Evaluation mode
                player.strategy.train, player.strategy.model.training = False, False

                strategy_move = player.get_move(board)
                test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                test_rewards.append(test_reward)

            average_test_reward = sum(test_rewards) / len(test_rewards)

            self.add_results([("Losses", loss),
                              ("Average reward", average_reward),
                              ("Average test reward", average_test_reward)])

            if not silent:
                if Printer.print_episode(episode + 1, self.episodes,
                                         datetime.now() - start):
                    plot_name = "Supervised on %s games lr: %s" % (self.games,
                                                                   lr)
                    plot_info = "Lr: %s - %s Games - %s Episodes\nFinal Scores: %s / %s \nTime: %s" % (
                        lr, self.games, episode + 1,
                        '{:.2f}'.format(average_reward),
                        '{:.2f}'.format(average_test_reward),
                        config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        return average_reward, average_test_reward