Example #1
def linear_approximator_compete():
    policyA = PolicyValueNet(width, height, model_file=dnn_file)
    playerA = MCTSPlayer(policyA.policy_value_fn, c_puct=5, n_playout=400, save_tree_on_compete=False)

    policyB = PolicyValueNet(width, height, PolicyValueNetCls=LinearNet, model_file=lin_file)
    playerB = MCTSPlayer(policyB.policy_value_fn, c_puct=5, n_playout=400, save_tree_on_compete=False)

    run(playerA, playerB)
Example #2
def improvement_compete():

    policyA = PolicyValueNet(width, height, model_file=dnn_file)
    playerA = MCTSPlayer(policyA.policy_value_fn, c_puct=5, n_playout=400, save_tree_on_compete=True)

    policyB = PolicyValueNet(width, height, model_file=dnn_file)
    playerB = MCTSPlayer(policyB.policy_value_fn, c_puct=5, n_playout=400, save_tree_on_compete=False)

    run(playerA, playerB)
Example #3
    def __init__(self, init_model=None):
        # params of the board and the game
        self.writer = SummaryWriter()
        self.board_width = BOARD_SIZE
        self.board_height = BOARD_SIZE
        self.n_resnet = N_RESNET
        self.in_channel = IN_CHANNEL
        self.n_in_row = N_ROW
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board,
                         state_representation_channel=self.in_channel)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = SAVE_FREQ
        self.game_batch_num = TRAIN_EPOCH
        self.best_win_ratio = 0.0
        self.best_policy = 0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   self.n_resnet,
                                                   self.in_channel,
                                                   model_file=init_model,
                                                   use_gpu=USE_GPU)

        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   self.n_resnet,
                                                   self.in_channel,
                                                   use_gpu=USE_GPU)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Example #4
def run():
    n = N
    width, height = SIZE,SIZE

    # if MCTS_PURE:
    #     player_1 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
    #     print ("Benchmarking the following two models:"+MODEL_1+" Pure MCTS")
    # elif HUMAN:
    #     player_2=Human()
    #     print ("Benchmarking the following two models:"+MODEL_1+" Human")
    # else:
    #     print ("Benchmarking the following two models:"+MODEL_1+"  vs  "+MODEL_2)
    #     policy_2= PolicyValueNet(width, height, model_file=MODEL_2,state_representation_channel = 4)
    #     player_2 = MCTSPlayer(policy_2.policy_value_fn,c_puct=5,n_playout=400)  # set larger n_playout for better performance


    #
    policy_1= PolicyValueNet(width, height, model_file=MODEL_1,in_channel = 11,n_resnet=1)
    player_1 = MCTSPlayer(policy_1.policy_value_fn,
                             c_puct=5,
                             n_playout=400)  # set larger n_playout for better performance

    # player_1 = Human()
    player_2 = Human()


    win_ratio = policy_evaluate(player_1,player_2)
    print("The win ratio for "+MODEL_1+" is: ",str(100*win_ratio)+"%")
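Examples 4 and 25 call a policy_evaluate helper that is not shown on this page. The sketch below is a hypothetical reconstruction, reusing the Board/Game API and the defaultdict win-count pattern from Example 15; the board dimensions, the number of games, and the tie-counting rule are assumptions.

from collections import defaultdict

def policy_evaluate(player_1, player_2, n_games=10, width=8, height=8, n=5):
    # hypothetical helper, not from the original repository
    board = Board(width=width, height=height, n_in_row=n)
    game = Game(board)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = game.start_play(player_1, player_2,
                                 start_player=i % 2,  # alternate who moves first
                                 is_shown=0)
        win_cnt[winner] += 1
    # count a tie (winner == -1) as half a win for player_1
    return (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games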
Example #5
def run():
    # n = 5
    width, height = 9, 9  # width, height = 8, 8
    # model_file = './best_policy.pkl'
    # board = Board(width=width, height=height, n_in_row=n)
    board = Board(width=width, height=height)
    game = Game(board)

    # ############### human VS AI ###################
    # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

    # best_policy = PolicyValueNet(width, height, model_file = model_file)
    # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

    # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
    # try:
    #     policy_param = pickle.load(open(model_file, 'rb'))
    # policy_param = pickle.load(open(model_file, 'rb'),
    #                                encoding='bytes')  # To support python3
    best_policy = PolicyValueNet(width, height)
    mcts_player = MCTSPlayer(
        best_policy.policy_value_fn, c_puct=5,
        n_playout=400)  # set larger n_playout for better performance

    # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
    # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

    # human player, input your move in the format: 2,3
    human = Human()

    # set start_player=0 for human first
    game.start_play(human, mcts_player, start_player=0, is_shown=1)
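The Human player used by these scripts is not shown here. Below is a minimal hypothetical sketch that matches the "input your move in the format: 2,3" convention; the Board methods it relies on (location_to_move, availables) and the set_player_ind hook are assumptions about the surrounding project.

class Human:
    # hypothetical minimal human player; adapt to the actual Board API
    def __init__(self):
        self.player = None

    def set_player_ind(self, p):
        # Game.start_play is assumed to assign the player index (1 or 2) through this hook
        self.player = p

    def get_action(self, board):
        try:
            location = [int(s) for s in input("Your move: ").split(",")]
            move = board.location_to_move(location)
        except Exception:
            move = -1
        if move == -1 or move not in board.availables:
            print("invalid move")
            move = self.get_action(board)
        return move

    def __str__(self):
        return "Human {}".format(self.player)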
Example #6
def run_game(surface, omok, menu):
    omok.turn = black_stone
    omok.init_game()
    board = Board(width=board_size, height=board_size, n_in_row=5)
    game = Game(board)
    model_file = 'current_policy_15x15-self500.model'
    while True:

        for event in pygame.event.get():
            pygame.display.flip()
            if omok.turn == black_stone and event.type == MOUSEBUTTONUP:
                x, y = omok.check_board_black(event.pos)
                human = Human(x, y)
            elif omok.turn == white_stone:

                best_policy = PolicyValueNet(15, 15, model_file=model_file)
                mcts = MCTSPlayer(best_policy.policy_value_fn,
                                  c_puct=7,
                                  n_playout=800)
                move = game.start_play(human, mcts, start_player=1)
                x, y = divmod(move, board_size)  # convert the flat move index to (row, col)
                print(x, y)
                omok.check_board_white(x, y)
        # runs once per frame, after the event loop
        if omok.is_gameover:
            return

        pygame.display.update()
        fps_clock.tick(fps)
Example #7
    def loadAI(self, init_model):
        """Load the AI player."""

        # return
        # self.buffer_size = 10000
        # self.batch_size = 512  # mini-batch size for training
        # self.data_buffer = deque(maxlen=self.buffer_size)
        # the init_model argument is overridden below; only the last assignment takes effect
        init_model = "current_policy.model"
        init_model = "best_policy.model"
        init_model = 'best_policy_12000.pt'
        # init_model = 'best_policy200.pt'

        self.result = False

        # self.policy_value_net = PolicyValueNet(self.width,
        #                                        self.height,
        #                                        model_file=False)
        self.policy_value_net = PolicyValueNet(self.width,
                                               self.height,
                                               model_file=init_model,
                                               use_gpu=False)

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=0)
        self.board2 = Board2(width=self.width,
                             height=self.height,
                             n_in_row=self.n_in_row)
        self.board2.init_board(1)
        self.game = Game(self.board2)
Example #8
def run():
    # n = 5
    width, height = 9, 9    # width, height = 8, 8
    model_file = r'./current_policy_cpu.pkl'
    try:
        board = Board(width=width, height=height)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #9
def run():
    n = 5
    width, height = 15, 15
    model_file = 'best_policy_15_15_5_forbidden_pyt.model'
    try:
        board = Board(width=width, height=height, n_in_row=n, forbidden_check_level=-1)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        best_policy = PolicyValueNet(width, height, model_file = model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #10
def run():
    n = 5
    # the board size can be changed here; it must match the board size of the AI model
    width, height = 10, 10
    # load the AI model
    model_file = 'best_policy.model'
    try:
        # initialize the board
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the AI model
        best_policy = PolicyValueNet(width,
                                     height,
                                     model_file=model_file,
                                     use_gpu=False)
        # a larger n_playout plays better; is_selfplay is not needed since we are not training the AI
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # MCTS_Pure can also be used as the opponent, but it is far too weak
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # create the human player; enter a move position such as 3,3
        human = Human()

        # start_player=1 means the computer moves first, 0 means the human moves first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #11
def run(
        n_in_row,
        width,
        height,  # n in a row to win, board width, board height
        model_file,
        ai_first,  # model file to load, whether the AI moves first
        n_playout,
        use_gpu):  # number of MCTS playouts per AI move, whether to use the GPU
    try:
        board = Board(width=width, height=height, n_in_row=n_in_row)  # create a board

        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width,
                                     height,
                                     model_file=model_file,
                                     use_gpu=use_gpu)  # load the best policy network
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=n_playout)  # create an AI player
        main = UserInterface_GO_Human_vs_AI(
            mcts_player,
            board,
            width,
            height,
        )

        main.test()
        # set start_player=0 for human first


#        game.start_play(human, mcts_player, start_player=ai_first, is_shown=1)  # start the game
    except KeyboardInterrupt:
        print('\n\rquit')
Example #12
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.data_buffer = deque(maxlen=1000)
        self.batch_size = 10
        self.temp = 1.0  # the temperature param
        self.n_playout = 40  # num of simulations for each move
        self.c_puct = 5
        self.epochs = 50

        self.pure_mcts_playout_num = 2
        self.best_win_ratio = 0.0

        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Example #13
    def __init__(self, config: Config):
        # params of the game

        self.config = config
        self.game = Game.from_config(config)

        # training params

        self.buffer_size = 10000
        self.min_data_to_collect = 128
        self.batch_size = 128  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)

        self.save_freq = 20
        self.eval_freq = self.save_freq * 100
        self.num_total_iter = self.eval_freq * 4
        assert self.num_total_iter % self.save_freq == 0
        assert self.num_total_iter % self.eval_freq == 0

        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.policy_value_net = PolicyValueNet(self.config.size,
                                               model_file=config.model_file)
        self.mcts_player = MCTSPlayer(
            self.policy_value_net,
            c_puct=config.c_puct,
            n_playout=config.n_playout,
            is_selfplay=True,
        )
Example #14
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 8  #6
     self.board_height = 8  #6
     self.n_in_row = 5  #4
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     # c_puct is the MCTS parameter that controls the exploration-exploitation tradeoff:
     # larger values bias the search toward more uniform exploration, smaller values toward the most-visited branches
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.025
     # how often to evaluate the current policy: every 50 training iterations it is evaluated via self-play
     # if a better policy is found, save the current policy model
     self.check_freq = 50
     # number of training iterations
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     # number of pure MCTS playouts per evaluation, initialized to 1000 (increased as training progresses)
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         policy_param = pickle.load(open(init_model, 'rb'))
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                net_params=policy_param)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #15
def run():
    n = 6
    width, height = 9, 9
    model_file = 'best_policy.model'
    model_file_compare = 'compare.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        try:
            policy_param = pickle.load(open(model_file, 'rb'))
            policy_param_compare = pickle.load(open(model_file_compare, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')
            policy_param_compare = pickle.load(
                open(model_file_compare,
                     'rb'), encoding='bytes')  # To support python3

        # control group
        best_policy = PolicyValueNet(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=800)  # set larger n_playout for better performance

        # experimental group
        compare = PolicyValueNet(width, height, policy_param_compare)
        mcts_player_compare = MCTSPlayer(
            compare.policy_value_fn, c_puct=5,
            n_playout=800)  # set larger n_playout for better performance

        # evaluate the win rate
        win_cnt = defaultdict(int)
        for i in range(200):  # number of games to play
            winner = game.start_play(mcts_player_compare,
                                     mcts_player,
                                     start_player=i % 2,
                                     is_shown=1)
            win_cnt[winner] += 1

        print("win: {}, lose: {}, tie:{}".format(win_cnt[1], win_cnt[2],
                                                 win_cnt[-1]))

    except KeyboardInterrupt:
        print('\n\rquit')
Example #16
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 200
        self.game_batch_num = 5000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        #init_model = "best_policy.pt"
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model,
                                                   use_gpu=True)

        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   use_gpu=True)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Example #17
 def __init__(self, init_model=None):
     # board and game parameters
     self.board_width = 10
     self.board_height = 10
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training parameters
     self.learn_rate = 2e-3  # base learning rate
     self.lr_multiplier = 1.2  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature parameter
     self.n_playout = 400  # number of simulations for each move
     self.c_puct = 5  # tradeoff coefficient between exploitation and exploration
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)  # double-ended queue holding the self-play data
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02  # KL target used to stop an update early
     self.check_freq = 50  # every 50 iterations, check whether the saved policy-value net should be updated
     self.game_batch_num = 2000  # number of training epochs
     self.best_win_ratio = 0.0  # current best win ratio, used to decide whether a better model has been found
     # number of playouts for the weak (pure MCTS) AI that serves as the opponent for the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # set up the policy network from init_model
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model,
                                                use_gpu=True)
     else:
         # train a new policy network
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                use_gpu=True)
     # AI player; is_selfplay=1 because we are training through self-play
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
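The lr_multiplier and kl_targ fields above are normally consumed inside the training update: after each update the KL divergence between the old and new policy is measured, and the effective learning rate is scaled up or down to keep that divergence near kl_targ. A hedged sketch of that adjustment as a method on the trainer class (the 1.5 factors and the 0.1/10 bounds are assumptions, not taken from this page):

def adjust_learning_rate(self, kl):
    # hypothetical helper: scale the learning rate so the policy drift stays near kl_targ
    if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
        self.lr_multiplier /= 1.5  # the policy moved too far: slow down
    elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
        self.lr_multiplier *= 1.5  # the policy barely moved: speed up
    return self.learn_rate * self.lr_multiplier  # effective learning rate for the optimizer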
Example #18
    def __init__(self, init_model=None):
        """ init function for the class"""

        # params of the board and the game
        self.board_width = 6  # board width
        self.board_height = 6  # board height
        self.n_in_row = 4  # win by n in line (vertically, horizontally, diagonally)
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5  # a number in (0, inf) controlling the relative impact of value Q, and prior probability P, on this node's score.
        self.buffer_size = 10000  # buffer size for replaying experience
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)  # buffer
        self.play_batch_size = 1  # size of rollout for each episode
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02  # target of KL loss
        self.check_freq = 50  # frequency for check evaluation and save model
        self.game_batch_num = 1500  # number of training game loop
        self.best_win_ratio = 0.0  # best evaluated win ratio
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:  # load from existing file
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
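The data_buffer, play_batch_size, and temp fields defined by these trainer classes are typically filled by a self-play collection step that runs between updates. A hypothetical sketch as a trainer method, assuming Game exposes a start_self_play method that returns (winner, play_data):

def collect_selfplay_data(self, n_games=1):
    # hypothetical collection step; game.start_self_play and its return value are assumptions
    for _ in range(n_games):
        winner, play_data = self.game.start_self_play(self.mcts_player,
                                                      temp=self.temp)
        play_data = list(play_data)  # (state, mcts_probs, winner_z) tuples
        self.episode_len = len(play_data)
        self.data_buffer.extend(play_data)  # deque(maxlen=buffer_size) discards the oldest samples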
Example #19
def main():
    config = Config.from_args()
    _game = Game.from_config(config)
    policy_value_net = PolicyValueNet(config.size,
                                      model_file=config.model_file)
    mcts_player = MCTSPlayer(
        policy_value_net,
        c_puct=config.c_puct,
        n_playout=config.n_playout,
    )
    evaluate_policy(_game, mcts_player)
Example #20
def EMD_between_two_models_on_board(model1_name,
                                    input_plains_num_1,
                                    i1,
                                    model2_name,
                                    input_plains_num_2,
                                    i2,
                                    board1,
                                    board2,
                                    width=6,
                                    height=6,
                                    use_gpu=True):
    model_file_1 = f'/home/lirontyomkin/AlphaZero_Gomoku/models/{model1_name}/current_policy_{i1}.model'
    policy_1 = PolicyValueNet(width,
                              height,
                              model_file=model_file_1,
                              input_plains_num=input_plains_num_1,
                              use_gpu=use_gpu)

    model_file_2 = f'/home/lirontyomkin/AlphaZero_Gomoku/models/{model2_name}/current_policy_{i2}.model'
    policy_2 = PolicyValueNet(width,
                              height,
                              model_file=model_file_2,
                              input_plains_num=input_plains_num_2,
                              use_gpu=use_gpu)

    board_current_state1 = board1.current_state(last_move=True,
                                                is_random_last_turn=False)
    board_current_state2 = board2.current_state(last_move=True,
                                                is_random_last_turn=False)

    acts_policy1, probas_policy1 = zip(*policy_1.policy_value_fn(board1)[0])
    acts_policy2, probas_policy2 = zip(*policy_2.policy_value_fn(board2)[0])

    dist_matrix = generate_matrix_dist_metric(width)

    distance = emd(np.asarray(probas_policy1, dtype='float64'),
                   np.asarray(probas_policy2, dtype='float64'), dist_matrix)

    return distance
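generate_matrix_dist_metric is not shown in Example 20. pyemd's emd expects a (width*height) x (width*height) float64 ground-distance matrix alongside the two probability vectors, so a plausible sketch is the pairwise Euclidean distance between board cells:

import numpy as np

def generate_matrix_dist_metric(width, height=None):
    # hypothetical reconstruction of the helper assumed by Example 20
    height = width if height is None else height
    coords = np.array([(r, c) for r in range(height) for c in range(width)],
                      dtype='float64')
    diff = coords[:, None, :] - coords[None, :, :]  # pairwise coordinate differences
    return np.sqrt((diff ** 2).sum(axis=-1))  # (width*height, width*height) distance matrix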
Example #21
 def __init__(self, init_model=None):
     # params of the board and the game
     #width of chessboard
     self.board_width = 8  #6 #10
     #height of chessboard
     self.board_height = 8 #6 #10
     #conditions for victory
     self.n_in_row = 5     #4 #5
     self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params 
     self.learn_rate = 5e-3   #learning rate
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0 # the temperature param
     self.n_playout = 400 # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000 #The number of maximum elements in the queue
     self.batch_size = 512 # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size) # queue size      
     self.play_batch_size = 1 # collect a set of data if it self-play once
     self.epochs = 5 # num of train_steps for each update
     self.kl_targ = 0.025 #KL target
     #check frequency: evaluate the game and current AI model every 50 times of self-play
     #The evaluation method is to use the latest AI model and MCTs-pure AI (based on random roll out) to fight 10 rounds
     self.check_freq = 50  #50
     self.game_batch_num = 200 #the number of training batches
     self.best_win_ratio = 0.0 #historical best winning rate
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000  
     if init_model:
         # start training from an initial policy-value net
         # pickle.load(file) deserializes the object: the data in the file is parsed into a PyTorch object
         policy_param = pickle.load(open(init_model, 'rb'))  # 'rb' reads the file in binary mode
         self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width, self.board_height) 
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
Example #22
 def __init__(self,
              init_model=None,
              board_width=6,
              board_height=6,
              n_in_row=4,
              n_playout=400,
              use_gpu=False,
              is_shown=False,
              output_file_name="",
              game_batch_number=1500):
     # game and board parameters
     self.board_width = board_width
     self.board_height = board_height
     self.n_in_row = n_in_row
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training parameters
     self.learn_rate = 2e-3  # learning rate α: 0.002
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL divergence
     self.temp = 1.0  # temperature parameter t
     self.n_playout = n_playout  # number of simulated playouts per move
     self.c_puct = 5  # the c_puct constant
     self.buffer_size = 10000  # buffer size
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # train_steps per update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = game_batch_number  # number of training games
     self.best_win_ratio = 0.0
     # pure MCTS parameters
     # the pure MCTS player serves as an opponent for the model actually being trained
     self.pure_mcts_playout_num = 1000  # number of pure MCTS playouts
     self.use_gpu = use_gpu
     self.is_shown = is_shown
     self.output_file_name = output_file_name  # name of the output txt file
     # initialize the neural network
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height,
                                            model_file=init_model,
                                            use_gpu=self.use_gpu)
     #
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #23
def run():
    w = 12
    try:
        ai_player = AiEngine('pela', 'pbrain-pela.exe', w)

        best_policy = PolicyValueNet(
            w, w, 'result/pytorch_12_5/current_policy.model')
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=2000)

        game = Game(Board(width=w, height=w, n_in_row=5))
        game.start_play(ai_player, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #24
 def __init__(self, game_batch_num, model_file=None):
     # params of the board and the game
     self.size = BOARD_SIZE
     use_gpu = False
     board = Board(size=self.size, n_in_row=N_IN_ROW)
     self.game = Game(board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=10000)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = game_batch_num
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if model_file:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(size=self.size,
                                                model_file=model_file,
                                                use_gpu=use_gpu)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(size=self.size,
                                                use_gpu=use_gpu)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #25
def run():
    n = N
    width, height = SIZE, SIZE

    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        # print ("Benchmarking the following two models:"+MODEL_1+" Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        # print ("Benchmarking the following two models:"+MODEL_1+" Human")
    else:
        pass
        # print ("Benchmarking the following two models:"+MODEL_1+" "+MODEL_2)

    #
    # best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
    # player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
    #                          c_puct=5,
    #                          n_playout=400)  # set larger n_playout for better performance
    # player_1=Human()

    win_ratios = []
    game_batchs = range(50, 1501, 100)
    for game_batch in game_batchs:
        model = './models/iter_' + str(game_batch) + '.model'
        print(model)

        policy = PolicyValueNet(width, height, model_file=model)
        player_1 = MCTSPlayer(
            policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        win_ratio = policy_evaluate(player_1, player_2)
        win_ratios.append(win_ratio)
        print("The win ratio for " + model + " is: ",
              str(100 * win_ratio) + "%")

    print(list(zip(win_ratios, game_batchs)))  # materialize the zip object so the pairs are printed

    fig, ax = plt.subplots()
    ax.plot(game_batchs, win_ratios)

    ax.set(
        xlabel='iterations',
        ylabel='win ratios',
        title='Win ratio of models trained by 5 input states vs. MCTS player')
    ax.grid()

    fig.savefig("win_ratio.png")
Example #26
def run():
    n = 5
    width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    # model_file = 'best_policy_6_6_4.model'
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # human player, input your move in the format: 2,3
        human = Human()

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # Add FORBIDDEN move player
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3

        # ################ ORIGINAL POLICY and PLAYER ################
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        mcts_pure = MCTS_Pure(c_puct=5, n_playout=1000)

        # set start_player=0 for human first

        # game.start_play(human, mcts_player, start_player=1, is_shown=1)

        # ############## IMPLEMENTED PURE RL PLAYER ##############
        adv_player = QPlayer(board)
        # game.start_play(human, adv_player, start_player=1, is_shown=1)
        game.start_play(human, adv_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #27
 def __init__(self,
              init_model=None,
              board_width=6,
              board_height=6,
              n_in_row=4,
              n_playout=400,
              use_gpu=False,
              is_shown=False,
              output_file_name="",
              game_batch_number=1500):
     # params of the board and the game
     self.board_width = board_width
     self.board_height = board_height
     self.n_in_row = n_in_row
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = n_playout  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = game_batch_number
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     self.use_gpu = use_gpu
     self.is_shown = is_shown
     self.output_file_name = output_file_name
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height,
                                            model_file=init_model,
                                            use_gpu=self.use_gpu)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #28
def run():
    model_file = 'best_policy_pytorch.model'
    try:
        board = Board(size=BOARD_SIZE, n_in_row=N_IN_ROW)
        game = Game(board)

        # ############### human VS AI ###################
        best_policy = PolicyValueNet(BOARD_SIZE, model_file)

        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=800)  # set larger n_playout for better performance

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\nQuit')
Example #29
def run(n_in_row, width, height, model_file, ai_first, n_playout, use_gpu):
    try:
        board = Board(width=width, height=height, n_in_row=n_in_row)  # create a board
        game = Game(board)  # load a game

        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width,
                                     height,
                                     model_file=model_file,
                                     use_gpu=use_gpu)  # load the best policy network
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=n_playout)  # create an AI player
        human = Human()  # create a human player

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=ai_first,
                        is_shown=1)  # start the game
    except KeyboardInterrupt:
        print('\n\rquit')
Example #30
def run():
    n = 6
    width, height = 9, 9
    model_file = 'best_policy.model'  # load the model
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)      
        
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)  # set larger n_playout for better performance
        
        
        # human player, input your move in the format: 2,3
        human = Human()                   
        
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')