Example #1
def play_multiple(player1, player2, times, n):
    """ Play `times` games of n x n Go and return (black_wins, white_wins). """
    black_wins = white_wins = 0
    for time in range(times):
        # print(f"game no.{time}")
        my_go = GO(n)
        result = my_go.play(player1, player2)
        if result == 1: black_wins += 1
        elif result == 2: white_wins += 1
        # print('Black player (X) | Wins:{0:.1f}% Loses:{1:.1f}%'.format(100*black_wins/(time+1), 100*white_wins/(time+1)))
        # print('White player (O) | Wins:{0:.1f}% Loses:{1:.1f}%'.format(100*white_wins/(time+1), 100*black_wins/(time+1)))
    return black_wins, white_wins
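
A minimal usage sketch for this helper, assuming GO, RandomPlayer, and GreedyPlayer (seen in Example #7) are already imported from the surrounding project:

# Usage sketch; module paths for the imports are project-specific and omitted here.
times, n = 100, 5
black_wins, white_wins = play_multiple(RandomPlayer(), GreedyPlayer(), times, n)
print('Black player (X) | Wins:{0:.1f}% Loses:{1:.1f}%'.format(
    100 * black_wins / times, 100 * white_wins / times))
print('White player (O) | Wins:{0:.1f}% Loses:{1:.1f}%'.format(
    100 * white_wins / times, 100 * black_wins / times))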
Example #2
def play_multiple(player1, player2, times, n):
    black_wins = white_wins = 0
    for time in range(times):
        my_go = GO(n)
        result = my_go.play(player1, player2)
        if player1.type == 'my':
            player1.learn(result, 1, my_go, time)
        if player2.type == 'my':
            player2.learn(result, 2, my_go, time)
        if result == 1: black_wins += 1
        elif result == 2: white_wins += 1
    return black_wins, white_wins
Example #3
def play_multiple(player1, player2, times, n):
    black_wins = white_wins = 0
    for time in range(times):
        # print("Played", time, "games")
        my_go = GO(n)
        result = my_go.play(player1, player2)
        if result == 1: black_wins += 1
        elif result == 2: white_wins += 1
        # if player1.type == 'my':
        #     player1.learn(my_go,result)
        # elif player2.type == 'my':
        #     player2.learn(my_go,result)

    return black_wins, white_wins
Example #4
 def self_play(self, player, verbose=False, temp=1e-3):
     """ Self-play using MCTS player.
     Params:
         player: MCTS player object.
         verbose: bool, show board or not.
         temp: float, controls the weight of exploration.
     Returns: tuple of winner and game data.
     """
     self.go = GO(self.size)
     states, mcts_probs, players = [], [], []
     if verbose:
         self.go.visualize_board()
     while True:
         piece_type = 1 if self.go.X_move else 2
         if self.go.game_end(piece_type):
             # determine the winner; stored states get +1/-1 from the mover's perspective
             winner = self.go.judge_winner()
             winners = np.zeros(len(players))
             winners[np.array(players) == winner] = 1.0
             winners[np.array(players) != winner] = -1.0
             # reset MCTS root node
             player.reset()
             if verbose:
                 print('Game ended.')
                 str_winner = 'X' if winner == 1 else 'O'
                 print(f'The winner is {str_winner}')
             return winner, zip(states, mcts_probs, winners)
         move, move_probs = player.get_move(self.go,
                                            temp=temp,
                                            return_prob=1)
         # store the data
         states.append(self.pv_net.get_state(self.go))
         mcts_probs.append(move_probs)
         players.append(piece_type)
         # perform a move
         i, j = move // self.go.size, move % self.go.size
         if not self.go.place_chess(i, j, piece_type):
             if verbose:
                 self.go.visualize_board()
             continue
         self.go.died_pieces = self.go.remove_died_pieces(3 - piece_type)
         self.go.X_move = not self.go.X_move
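
self_play() returns the absolute winner together with a zip of (state, MCTS probabilities, outcome) triples. A minimal consumption sketch, assuming a Train instance named trainer (hypothetical variable name); it mirrors buffer_selfplay_data() in Example #8:

# trainer is a hypothetical Train instance.
winner, game_data = trainer.self_play(trainer.mcts_player, verbose=True, temp=1.0)
for state, mcts_prob, z in game_data:
    # state: network input planes, mcts_prob: move-probability vector,
    # z: +1/-1 outcome from the perspective of the player who moved
    pass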
Example #5
 def __init__(self, saved_weights=None):
     """ Initialize Attributes. """
     # board attributes
     self.size = 5  # board (go) size
     self.go = GO(self.size)  # initialize the board (go)
     # training params
     # mine, adjust this manually to set training process
     # --------------------------------------------------------------------
     self.R = 200  # num of simulations (rollouts) for each move
     self.check_freq = 50  # how often (in selfplay batches) to evaluate performance
     self.game_batch_num = 5000  # total number of selfplay batches
     self.test_num = 50  # number of evaluation games per check
     # --------------------------------------------------------------------
     # github preset, do not change
     self.lr = 2e-3  # learning rate of the whole process
     self.lr_coef = 1.0  # adaptively adjust lr based on KL
     self.temp = 1.0  # the temperature param controlling exploration
     self.C = 5  # hyperparameter controls the weight of prior probs
     self.buffer_size = 10000  # buffer size for data retrieving
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)  # data buffer
     self.play_batch_size = 1  # batch size for playing each time
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02  # kl target
     self.best_win_ratio = 0.0  # best_win_ratio to compare models
     # set policy-value net
     self.pv_net = PolicyValueNet(self.size, saved_weights)
     # set my player
     self.mcts_player = MyPlayer(self.pv_net.policy,
                                 c=self.C,
                                 r=self.R,
                                 is_selfplay=True,
                                 is_train=True)
     # set opponent players to evaluate performance
     self.op_player_n = 0
     self.op_players = [
         RandomPlayer(),
         GreedyPlayer(),
         AggressivePlayer(),
         SmartPlayer()
     ]
Example #6
 def policy_evaluate(self, n_games=50):
     """ Evaluate the neural network policy.
     Params: n_games: int, number of games to be played.
     Returns: win_ratio: float, winning ratio of this evaluation.
     """
     p1 = MyPlayer(self.pv_net.policy, c=self.C, r=self.R, is_train=True)
     # p1 = MyPlayer(c=self.C, r=self.R)
     p2 = self.op_players[self.op_player_n]
     wins = defaultdict(int)
     for i in range(n_games):
         self.go = GO(self.size)
         if i % 2 == 0:
             winner = self.go.play(p1, p2, verbose=False)
             wins[p1.type if winner == 1 else p2.type] += 1
         else:
             winner = self.go.play(p2, p1, verbose=False)
             wins[p1.type if winner == 2 else p2.type] += 1
     win_ratio = 1.0 * wins[p1.type] / n_games
     print(" op_player: {}, win: {}, lose: {}, win_ratio:{}".format(
         p2.type, wins[p1.type], wins[p2.type], win_ratio))
     return win_ratio
Example #7
    n = args.size
    times = args.times
    players = [args.player1, args.player2]
    player_objs = []
    for player in players:
        if player.lower() == 'random':
            player_objs.append(RandomPlayer())
        elif player.lower() == 'manual':
            player_objs.append(ManualPlayer())
        elif player.lower() == 'greedy':
            player_objs.append(GreedyPlayer())
        elif player.lower() == 'my':
            player_objs.append(MyPlayer())
        else:
            print('Invalid player type. Options: manual, random, greedy, my')
            sys.exit()

    black_wins = white_wins = 0
    for time in range(times):
        player1, player2 = player_objs[0], player_objs[1]
        my_go = GO(n)
        result = my_go.play(player1, player2)
        if result == 1: black_wins += 1
        elif result == 2: white_wins += 1
    print()
    print('Black player (X) | Wins:{0:.1f}% Loses:{1:.1f}%'.format(
        100 * black_wins / times, 100 * white_wins / times))
    print('White player (O) | Wins:{0:.1f}% Loses:{1:.1f}%'.format(
        100 * white_wins / times, 100 * black_wins / times))
    print()
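
The block above reads args.size, args.times, args.player1, and args.player2 but does not show the parser. A minimal argparse sketch that would supply them; the flag names and defaults are assumptions, not the project's actual CLI:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--size', type=int, default=5, help='board size n')
parser.add_argument('--times', type=int, default=100, help='number of games to play')
parser.add_argument('--player1', default='random',
                    help='black player: manual, random, greedy, or my')
parser.add_argument('--player2', default='random',
                    help='white player: manual, random, greedy, or my')
args = parser.parse_args()  # assumed flag names; adjust to match the real script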
Example #8
class Train(object):
    """ Training Pipeline of Policy Value Neural Network. """
    def __init__(self, saved_weights=None):
        """ Initialize Attributes. """
        # board attributes
        self.size = 5  # board (go) size
        self.go = GO(self.size)  # initialize the board (go)
        # training params
        # mine, adjust this manually to set training process
        # --------------------------------------------------------------------
        self.R = 200  # num of simulations (rollouts) for each move
        self.check_freq = 50  # how often (in selfplay batches) to evaluate performance
        self.game_batch_num = 5000  # total number of selfplay batches
        self.test_num = 50  # number of evaluation games per check
        # --------------------------------------------------------------------
        # github preset, do not change
        self.lr = 2e-3  # learning rate of the whole process
        self.lr_coef = 1.0  # adaptively adjust lr based on KL
        self.temp = 1.0  # the temperature param controlling exploration
        self.C = 5  # hyperparameter controls the weight of prior probs
        self.buffer_size = 10000  # buffer size for data retrieving
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)  # data buffer
        self.play_batch_size = 1  # batch size for playing each time
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02  # kl target
        self.best_win_ratio = 0.0  # best_win_ratio to compare models
        # set policy-value net
        self.pv_net = PolicyValueNet(self.size, saved_weights)
        # set my player
        self.mcts_player = MyPlayer(self.pv_net.policy,
                                    c=self.C,
                                    r=self.R,
                                    is_selfplay=True,
                                    is_train=True)
        # set opponent players to evaluate performance
        self.op_player_n = 0
        self.op_players = [
            RandomPlayer(),
            GreedyPlayer(),
            AggressivePlayer(),
            SmartPlayer()
        ]

    def self_play(self, player, verbose=False, temp=1e-3):
        """ Self-play using MCTS player.
        Params:
            player: MCTS player object.
            verbose: bool, show board or not.
            temp: float, controls the weight of exploration.
        Returns: tuple of winner and game data.
        """
        self.go = GO(self.size)
        states, mcts_probs, players = [], [], []
        if verbose:
            self.go.visualize_board()
        while True:
            piece_type = 1 if self.go.X_move else 2
            if self.go.game_end(piece_type):
                # determine the winner; stored states get +1/-1 from the mover's perspective
                winner = self.go.judge_winner()
                winners = np.zeros(len(players))
                winners[np.array(players) == winner] = 1.0
                winners[np.array(players) != winner] = -1.0
                # reset MCTS root node
                player.reset()
                if verbose:
                    print('Game ended.')
                    str_winner = 'X' if winner == 1 else 'O'
                    print(f'The winner is {str_winner}')
                return winner, zip(states, mcts_probs, winners)
            move, move_probs = player.get_move(self.go,
                                               temp=temp,
                                               return_prob=1)
            # store the data
            states.append(self.pv_net.get_state(self.go))
            mcts_probs.append(move_probs)
            players.append(piece_type)
            # perform a move
            i, j = move // self.go.size, move % self.go.size
            if not self.go.place_chess(i, j, piece_type):
                if verbose:
                    self.go.visualize_board()
                continue
            self.go.died_pieces = self.go.remove_died_pieces(3 - piece_type)
            self.go.X_move = not self.go.X_move

    def get_aug_data(self, data):
        """ Augment data by rotating and flipping.
        Params: data: list of (state, probs, winner) tuples.
        Returns: aug_data: list of 4 augmented data of the original one.
        """
        aug_data = []
        for state, mcts_prob, winner in data:
            for i in range(1, 5):
                # rotate
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(mcts_prob.reshape(self.size, self.size)), i)
                aug_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                aug_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return aug_data

    def buffer_selfplay_data(self, n_games=1):
        """ Buffer selfplay data.
        Params: n_games: int, number of selfplay games.
        """
        for i in range(n_games):
            winner, data = self.self_play(self.mcts_player, temp=self.temp)
            data = list(data)[:]
            self.last_moves = len(data)
            data = self.get_aug_data(data)
            self.data_buffer.extend(data)

    def update_weights(self):
        """ Update neural network weights using training data.
        Returns: float loss and entropy.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        s_batch = [data[0] for data in mini_batch]
        mp_batch = [data[1] for data in mini_batch]
        w_batch = [data[2] for data in mini_batch]
        old_probs, old_val = self.pv_net.model.predict_on_batch(
            np.array(s_batch))
        for i in range(self.epochs):
            loss, entropy = self.pv_net.train_core(s_batch, mp_batch, w_batch,
                                                   self.lr * self.lr_coef)
            new_probs, new_val = self.pv_net.model.predict_on_batch(
                np.array(s_batch))
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_coef > 0.1:
            self.lr_coef /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_coef < 10:
            self.lr_coef *= 1.5
        explained_var_old = (1 -
                             np.var(np.array(w_batch) - old_val.flatten()) /
                             np.var(np.array(w_batch)))
        explained_var_new = (1 -
                             np.var(np.array(w_batch) - new_val.flatten()) /
                             np.var(np.array(w_batch)))
        print(("kl:{:.5f}, "
               "lr_coef:{:.3f}, "
               "loss:{}, "
               "entropy:{}, "
               "explained_var_old:{:.3f}, "
               "explained_var_new:{:.3f}").format(kl, self.lr_coef, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=50):
        """ Evaluate the neural network policy.
        Params: n_games: int, number of games to be played.
        Returns: win_ratio: float, winning ratio of this evaluation.
        """
        p1 = MyPlayer(self.pv_net.policy, c=self.C, r=self.R, is_train=True)
        # p1 = MyPlayer(c=self.C, r=self.R)
        p2 = self.op_players[self.op_player_n]
        wins = defaultdict(int)
        for i in range(n_games):
            self.go = GO(self.size)
            if i % 2 == 0:
                winner = self.go.play(p1, p2, verbose=False)
                wins[p1.type if winner == 1 else p2.type] += 1
            else:
                winner = self.go.play(p2, p1, verbose=False)
                wins[p1.type if winner == 2 else p2.type] += 1
        win_ratio = 1.0 * wins[p1.type] / n_games
        print(" op_player: {}, win: {}, lose: {}, win_ratio:{}".format(
            p2.type, wins[p1.type], wins[p2.type], win_ratio))
        return win_ratio

    def train(self):
        """ Training process. """
        try:
            for i in range(self.game_batch_num):
                self.buffer_selfplay_data(self.play_batch_size)
                print("Batch No.{} Moves:{}".format(i + 1, self.last_moves))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.update_weights()
                # check performance and save weights
                if (i + 1) % self.check_freq == 0:
                    print(f" Self-play batch: {i+1}. Games: {self.test_num}")
                    win_ratio = self.policy_evaluate(self.test_num)
                    self.pv_net.save_weights('./current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print(" New best policy weight. Saving... ")
                        self.best_win_ratio = win_ratio
                        # save best policy weights
                        self.pv_net.save_weights('./best_policy.model')
                        if self.best_win_ratio >= 0.98:
                            if self.op_player_n != 3:
                                self.op_player_n += 1
                                self.best_win_ratio = 0.0
                            else:
                                if self.test_num >= 100:
                                    break
                                self.test_num += 10
                                self.best_win_ratio = 0.95
        except KeyboardInterrupt:
            print('\n\rExiting...')
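
A minimal entry-point sketch for running the pipeline above; starting weights are optional, and the weight-file path shown is simply the one this class writes itself:

if __name__ == '__main__':
    # Train from scratch, or resume from previously saved weights, e.g.
    # Train(saved_weights='./best_policy.model')
    trainer = Train()
    trainer.train()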