Example #1
def run():
    n = 5
    width, height = 9, 9
    iteration = 1000

    model_file = './model/current_policy_{}_{}_{}_iteration{}.model'.format(
        height, width, n, iteration)
    #model_file = './model/best_policy_{}_{}_{}.model'.format(height,width,n)
    try:
        board = Board(width=width, height=height, n_in_row=n)

        best_policy = PolicyValueNet(width, height, model_file=model_file)
        AI_player1 = MCTSPlayer(best_policy.policy_value_fn,
                                c_puct=5,
                                n_playout=400)
        AI_player2 = MCTSPlayer(best_policy.policy_value_fn,
                                c_puct=5,
                                n_playout=400)
        human = Human()

        game = Game("AlphaZero Gomoku", board, AI_player1, AI_player2)
        while True:
            game.play()
            pygame.display.update()

            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    exit()
                elif event.type == pygame.MOUSEBUTTONDOWN:
                    mouse_x, mouse_y = pygame.mouse.get_pos()
                    game.mouseClick(mouse_x, mouse_y)
                    game.check_buttons(mouse_x, mouse_y)

    except KeyboardInterrupt:
        print('\n\rquit')
Example #2
def run():
    model_file = './current_policy.model'
    best_policy = PolicyValueNet(6, 6, model_file)
    config = GameConfig()
    board = Board(config)
    game = Game(board)
    mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5,
                              n_playout=1000)
    mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1000)
    mcts_player3 = MCTS_Pure(c_puct=5, n_playout=1000)
    human = Human(config)
    human2 = Human(config)
    human3 = Human(config)
    game.start_play(mcts_player3, human, mcts_player2)
Example #3
 def policy_evaluate(self, n_games=10):
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct, n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         # the trained AI plays against the weak AI (pure MCTS); no visualization needed (is_shown=0), and the two sides alternate moving first (start_player=i % 2)
         winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i % 2, is_shown=0)
         win_cnt[winner] += 1
     # compute the win ratio; a tie counts as 0.5
     win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
             self.pure_mcts_playout_num,
             win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
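A quick standalone check of the scoring convention used by policy_evaluate above (winner == 1 counts as a win, winner == 2 as a loss, winner == -1 as a tie worth half a point):

from collections import defaultdict

def win_ratio_from_counts(win_cnt, n_games):
    # winner == 1: win for the evaluated player, winner == 2: loss,
    # winner == -1: tie, which counts as half a point
    return (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games

# e.g. 6 wins, 2 losses and 2 ties over 10 games give a win ratio of 0.7
counts = defaultdict(int, {1: 6, 2: 2, -1: 2})
print(win_ratio_from_counts(counts, 10))  # 0.7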
Example #4
def run():
    w = 12
    try:
        ai_player = AiEngine('pela', 'pbrain-pela.exe', w)

        best_policy = PolicyValueNet(
            w, w, 'result/pytorch_12_5/current_policy.model')
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=2000)

        game = Game(Board(width=w, height=w, n_in_row=5))
        game.start_play(ai_player, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #5
 def policy_evaluate(self, n_games=10, batch=0):
     """
     Evaluate the trained policy by playing games against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i%2, is_shown=0)
         win_cnt[winner] += 1
     win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1])/n_games
     print("batch_i:{}, num_playouts:{}, win: {}, lose: {}, tie:{}".format(batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     logging.debug("batch_i {} num_playouts {} win {} lose {} tie {}".format(batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #6
    def __init__(self, init_model=None, is_remote=False):
        # params of the board and the game
        self.board_width = 8
        self.board_height = 8
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.n_playout_self_play = 1000
        self.c_puct = 5
        self.buffer_size = 2000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 100
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.td_step = 2
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if is_remote:
            self.path = '/content/drive/My Drive/'
        else:
            self.path = './'

        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=self.path +
                                                   init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout_self_play,
                                      is_selfplay=1)
Example #7
def run():
    n = N
    width, height = SIZE, SIZE

    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        # print ("Benchmarking the following two models:"+MODEL_1+" Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        # print ("Benchmarking the following two models:"+MODEL_1+" Human")
    else:
        pass
        # print ("Benchmarking the following two models:"+MODEL_1+" "+MODEL_2)

    #
    # best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
    # player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
    #                          c_puct=5,
    #                          n_playout=400)  # set larger n_playout for better performance
    # player_1=Human()

    win_ratios = []
    game_batchs = range(50, 1501, 100)
    for game_batch in game_batchs:
        model = './models/iter_' + str(game_batch) + '.model'
        print(model)

        policy = PolicyValueNet(width, height, model_file=model)
        player_1 = MCTSPlayer(
            policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        win_ratio = policy_evaluate(player_1, player_2)
        win_ratios.append(win_ratio)
        print("The win ratio for " + model + " is: ",
              str(100 * win_ratio) + "%")

    print(list(zip(win_ratios, game_batchs)))

    fig, ax = plt.subplots()
    ax.plot(game_batchs, win_ratios)

    ax.set(
        xlabel='iterations',
        ylabel='win ratios',
        title='Win ratio of models trained by 5 input states vs. MCTS player')
    ax.grid()

    fig.savefig("win_ratio.png")
Example #8
    def get_action(self, board):
        print("AI's turn")
        try:
            model_file = './best_model_9_9_5.h5'
            best_policy = PolicyValueNet(9, 9, model_file=model_file)
            mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                     c_puct=5,
                                     n_playout=400)
            # compute the move here so that `move` is defined when the try
            # block succeeds (otherwise the check below raises a NameError)
            move = mcts_player.get_action(board)

        except Exception as e:
            print(e)
            move = -1
        if move == -1 or move not in board.availables:
            print(f"invalid move: {move}")
            move = self.get_action(board)
        return move
Example #9
def run(n_in_row, width, height, # n in a row needed to win, board width, board height
        model_file, ai_first, # model file to load, whether the AI moves first
        n_playout, use_gpu): # number of MCTS simulations per AI move, whether to use the GPU
    try:
        board = Board(width=width, height=height, n_in_row=n_in_row) # create a board

        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width, height, model_file=model_file, use_gpu=use_gpu) # load the best policy network
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=n_playout) # create an AI player
        main = UserInterface_GO_Human_vs_AI(mcts_player, board, width, height,)
        
        main.test()
        # set start_player=0 for human first
#        game.start_play(human, mcts_player, start_player=ai_first, is_shown=1) # start the game
    except KeyboardInterrupt:
        print('\n\rquit')
Example #10
 def __init__(self,
              init_model=None,
              board_width=6,
              board_height=6,
              n_in_row=4,
              n_playout=400,
              use_gpu=False,
              is_shown=False,
              output_file_name="",
              game_batch_number=1500):
     # params of the board and the game
     self.board_width = board_width
     self.board_height = board_height
     self.n_in_row = n_in_row
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = n_playout  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = game_batch_number
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     self.use_gpu = use_gpu
     self.is_shown = is_shown
     self.output_file_name = output_file_name
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height,
                                            model_file=init_model,
                                            use_gpu=self.use_gpu)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #11
def run():
    n = 5
    width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    # model_file = 'best_policy_6_6_4.model'
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # human player, input your move in the format: 2,3
        human = Human()

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # Add FORBIDDEN move player
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3

        # ################ ORIGINAL POLICY and PLAYER ################
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        mcts_pure = MCTS_Pure(c_puct=5, n_playout=1000)

        # set start_player=0 for human first

        # game.start_play(human, mcts_player, start_player=1, is_shown=1)

        # ############## IMPLEMENTED PURE RL PLAYER ##############
        adv_player = QPlayer(board)
        # game.start_play(human, adv_player, start_player=1, is_shown=1)
        game.start_play(human, adv_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #12
    def __init__(self, init_model):
        self.init_model = init_model
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if os.path.isdir(init_model):
            self.is_init = True
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.is_init = False
            os.makedirs(init_model, exist_ok=True)
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

        if not os.path.isdir(init_model + 'best'):
            os.makedirs(init_model + 'best', exist_ok=True)
Example #13
def run():
    # n = 5
    # width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    n = 4
    width, height = 6, 6
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # ===================================================================================
        # ===================================================================================
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        best_policy = PolicyValueNet(width, height, model_file = model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)
        
        # ===================================================================================

        # ===================================================================================
        # ===================================================================================
        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance
        # ===================================================================================



        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #14
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 7
     self.board_height = 7
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 1500  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 150000
     self.batch_size = 2048  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     if os.path.exists("data_buffer.pkl"):
         with open("data_buffer.pkl", "rb") as f:
             self.data_buffer = pickle.load(f)
             print("Load data, size = %d" % len(self.data_buffer))
     self.play_batch_size = 1
     self.epochs = 10  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 1500
     self.save_freq = 500
     self.game_batch_num = 10000
     self.best_win_ratio = 0.0
     self.episode_len = 0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #15
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 8  #6
     self.board_height = 8  #6
     self.n_in_row = 5  #4
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     # c_puct is the MCTS parameter controlling the exploration-exploitation tradeoff:
     # larger values bias the search toward more uniform exploration, smaller values toward the branches that already have the most visits
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.025
     # how often to evaluate the current policy's win ratio: every 50 training batches the current policy is evaluated against the pure-MCTS baseline
     # if a better policy is found, the current policy model is saved
     self.check_freq = 50
     # number of training iterations
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     # number of playouts for the pure-MCTS opponent, initialized to 1000 (increased further as training progresses)
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         policy_param = pickle.load(open(init_model, 'rb'))
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                net_params=policy_param)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
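The c_puct comment above can be made concrete with a minimal sketch of the PUCT-style selection score that this parameter scales. This assumes the standard AlphaZero formula; the exact expression lives inside the tree-node code, which these snippets do not show:

import math

def puct_score(q_value, prior_p, parent_visits, child_visits, c_puct=5):
    # Larger c_puct puts more weight on the prior/visit-count bonus u
    # (more uniform exploration); smaller c_puct leans on the current
    # action-value estimate Q, i.e. the branches already visited most.
    u = c_puct * prior_p * math.sqrt(parent_visits) / (1 + child_visits)
    return q_value + u

# a child with a strong prior but few visits gets a large exploration bonus
print(puct_score(q_value=0.1, prior_p=0.4, parent_visits=100, child_visits=2))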
Example #16
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 6    # board width
        self.board_height = 6   # board height
        self.n_in_row = 4       # win condition: how many stones in a row count as a win

        # instantiate a board with the given width, height, and win condition
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000

        # initialize the network and the search tree; the network is kept throughout training (when exactly the tree is reset is not obvious from here)
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
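The temperature parameter (self.temp) that appears in all of these pipelines is typically applied when MCTS visit counts are turned into move probabilities. A minimal sketch, assuming the softmax-over-log-visit-counts rule used by the reference AlphaZero-Gomoku implementation:

import numpy as np

def visits_to_probs(visit_counts, temp=1.0):
    # temp -> 0 plays the most-visited move almost deterministically;
    # temp = 1.0 samples moves in proportion to their visit counts.
    logits = (1.0 / temp) * np.log(np.asarray(visit_counts, dtype=np.float64) + 1e-10)
    logits -= logits.max()
    probs = np.exp(logits)
    return probs / probs.sum()

print(visits_to_probs([10, 30, 60], temp=1.0))  # ~[0.1, 0.3, 0.6]
print(visits_to_probs([10, 30, 60], temp=0.1))  # nearly one-hot on the last move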
Example #17
def run(width=8, height=8, num_in_row=5, model_file='best_policy_8_8_5.model'):

    try:
        board = Board(n_in_row=num_in_row)

        game = Game(board)

        policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')

        best_policy = PolicyValueNetPlay(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # set start_player=0 for human first
        game.start_play(Human(), mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #18
 def __init__(self, init_model=None, board_width=6, board_height=6,
              n_in_row=4, n_playout=400, use_gpu=False, is_shown=False,
              output_file_name="", game_batch_number=1500):
     # board and game params
     self.board_width = board_width
     self.board_height = board_height
     self.n_in_row = n_in_row
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3  # learning rate α: 0.002
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on the KL divergence
     self.temp = 1.0  # the temperature param t
     self.n_playout = n_playout  # number of simulated playouts per move
     self.c_puct = 5  # the c_puct constant
     self.buffer_size = 10000  # buffer size
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size) 
     self.play_batch_size = 1 
     self.epochs = 5  # train_steps per update
     self.kl_targ = 0.02
     self.check_freq = 100
     self.game_batch_num = game_batch_number  # number of training games
     self.best_win_ratio = 0.0
     # pure-MCTS parameters
     # the pure-MCTS player serves as the opponent for evaluating the trained model
     self.pure_mcts_playout_num = 7000  # number of pure-MCTS simulations
     self.use_gpu = use_gpu 
     self.is_shown = is_shown
     self.output_file_name = output_file_name  # name of the output txt file
     # initialize the neural network
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height,
                                            model_file=init_model,
                                            use_gpu=self.use_gpu
                                            )
     #
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
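The buffer parameters above (buffer_size, batch_size, data_buffer) follow the usual replay-buffer pattern: self-play records go into a bounded deque and each update samples a random mini-batch from it. A small self-contained sketch of that behaviour (the tuple contents are placeholders):

import random
from collections import deque

data_buffer = deque(maxlen=10000)            # oldest entries are dropped automatically
data_buffer.extend((i, None, None) for i in range(12000))  # pretend self-play records
print(len(data_buffer))                       # 10000 -- only the newest records remain

mini_batch = random.sample(data_buffer, 512)  # random mini-batch for one update
print(len(mini_batch))                        # 512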
Example #19
    def run(self):
        """run the training pipeline"""
        np.random.seed(0)
        try:
            for i in range(self.num_total_iter):
                n_data, n_game = self.collect_selfplay_data()

                print("iteration {}: total {} data collected from {} game(s)".
                      format(i, n_data, n_game))

                self.policy_value_net.set_train_mode()
                self.policy_update()
                self.data_buffer.clear()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.save_freq == 0:
                    print("saving current model at {}: file={}".format(
                        i + 1, self.config.get_current_model_name()))
                    self.policy_value_net.save_model(
                        self.config.get_current_model_name())
                if (i + 1) % self.eval_freq == 0:
                    print("evalutating current model: {}".format(i + 1))
                    current_mcts_player = MCTSPlayer(
                        self.policy_value_net,
                        c_puct=self.config.c_puct,
                        n_playout=self.config.n_playout,
                    )
                    win_ratio = evaluate.evaluate_policy(
                        self.game, current_mcts_player)
                    if win_ratio > self.best_win_ratio:
                        print(
                            "saving the new best policy at {}! win_ratio={}, file={}"
                            .format(
                                i,
                                win_ratio,
                                self.config.get_best_model_name(),
                            ))
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            self.config.get_best_model_name())
        except KeyboardInterrupt:
            print("\n\rquit")
Example #20
def run():
    n = 5
    width, height = 8, 8

    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)      
        
        ################ human VS AI ###################        
      
        best_policy = PolicyValueNet(width, height)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)  # set larger n_playout for better performance
        
        human = Human()                   
        
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #21
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 10
     self.board_height = 10
     self.actiondim = 24
     self.totaltime = 100
     #self.n_in_row = 4
     self.board = Board(width=self.board_width, height=self.board_height)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.025
     self.check_freq = 10
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 500
     if init_model:
         # start training from an initial policy-value net
         policy_param = pickle.load(open(init_model, 'rb'))
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                net_params=policy_param)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                self.actiondim,
                                                self.totaltime)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #22
def run():
    if (len(sys.argv)) != 2:
        print(sys.argv)
        print("Need to provide one argument, the model which to play with")
        sys.exit(0)

    n = 5
    width, height = 15, 15
    model_file = sys.argv[1]
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                         c_puct=5,
        #                         n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #23
    def __init__(self, init_model=None):
        """ init function for the class"""

        # params of the board and the game
        self.board_width = 6  # board width
        self.board_height = 6  # board height
        self.n_in_row = 4  # win by n in line (vertically, horizontally, diagonally)
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5  # a number in (0, inf) controlling the relative impact of value Q, and prior probability P, on this node's score.
        self.buffer_size = 10000  # buffer size for replaying experience
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)  # buffer
        self.play_batch_size = 1  # size of rollout for each episode
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02  # target of KL loss
        self.check_freq = 50  # frequency for check evaluation and save model
        self.game_batch_num = 1500  # number of training game loop
        self.best_win_ratio = 0.0  # best evaluated win ratio
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:  # load from existing file
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
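Several of these pipelines note that lr_multiplier is adaptively adjusted against kl_targ, but the rule itself is not shown in the snippets. A sketch of the adjustment used by the reference AlphaZero-Gomoku training loop (an assumption here):

def adjust_lr_multiplier(kl, kl_targ, lr_multiplier):
    # shrink the learning rate when an update moved the policy too far
    # (KL well above the target), and grow it when the update was timid
    if kl > kl_targ * 2 and lr_multiplier > 0.1:
        lr_multiplier /= 1.5
    elif kl < kl_targ / 2 and lr_multiplier < 10:
        lr_multiplier *= 1.5
    return lr_multiplier

print(adjust_lr_multiplier(kl=0.08, kl_targ=0.02, lr_multiplier=1.0))  # 0.666...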
Example #24
 def __init__(self, init_model=None):
     # board and game params
     self.board_width = 10
     self.board_height = 10
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3  # base learning rate
     self.lr_multiplier = 1.2  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # number of simulations per move
     self.c_puct = 5  # tradeoff coefficient between exploitation and exploration
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)  # deque gives a bounded double-ended queue
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02  # KL target for early stopping of an update
     self.check_freq = 50  # every 50 batches, check whether the saved policy-value net should be updated
     self.game_batch_num = 2000  # number of training batches to run
     self.best_win_ratio = 0.0  # current best win ratio, used to decide whether a better model has been found
     # number of playouts for the weak AI (pure MCTS) that serves as the opponent for the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # load the policy network from init_model
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model,
                                                use_gpu=True)
     else:
         # train a new policy network
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                use_gpu=True)
     # AI player: set is_selfplay=1 because we are training via self-play
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #25
def run():
    model_file = 'best_policy_pytorch.model'
    try:
        board = Board(size=BOARD_SIZE, n_in_row=N_IN_ROW)
        game = Game(board)

        # ############### human VS AI ###################
        best_policy = PolicyValueNet(BOARD_SIZE, model_file)

        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=800)  # set larger n_playout for better performance

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\nQuit')
Example #26
def run():
    n = 5
    width, height = 15, 15
    model_file = 'dist/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game_UI(board, is_shown=1)

        # ############### Human-machine ###################
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        human = Human()

        game.start_play_mouse(human, mcts_player, start_player=0, is_shown=1)

    except KeyboardInterrupt:
        print('\n\rquit')
Example #27
def run(n_in_row, width, height, model_file, ai_first, n_playout, use_gpu):
    try:
        board = Board(width=width, height=height, n_in_row=n_in_row)  # create a board
        game = Game(board)  # create a game

        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width,
                                     height,
                                     model_file=model_file,
                                     use_gpu=use_gpu)  # load the best policy network
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=n_playout)  # create an AI player
        human = Human()  # create a human player

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=ai_first,
                        is_shown=1)  # start the game
    except KeyboardInterrupt:
        print('\n\rquit')
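For a run() with this signature, a small command-line wrapper can supply the arguments; the flag names below are illustrative and not part of the original script:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Human vs AI Gomoku')
    parser.add_argument('--n-in-row', type=int, default=5)
    parser.add_argument('--width', type=int, default=8)
    parser.add_argument('--height', type=int, default=8)
    parser.add_argument('--model-file', default='best_policy_8_8_5.model')
    parser.add_argument('--ai-first', type=int, default=1)
    parser.add_argument('--n-playout', type=int, default=400)
    parser.add_argument('--use-gpu', action='store_true')
    args = parser.parse_args()
    run(args.n_in_row, args.width, args.height, args.model_file,
        args.ai_first, args.n_playout, args.use_gpu)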
Example #28
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 5
     self.board_height = 6
     self.board = Board()
     self.game = Game(self.board)
     # training params
     # learning rate: 0.002
     self.learn_rate = 2e-3
     # adjust the learning rate automatically; the KL divergence measures how close two probability distributions are, and within the allowed range the parameters that minimize it are the ones we want
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     #    self.check_freq = 50
     self.check_freq = 50
     #    self.game_batch_num = 1500
     self.game_batch_num = 50
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #29
def ai():
    try:
        board = Board(width=weight, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(weight, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy

        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(weight, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human(gui=ex)

        # set start_player=0 for human first
        if (humanFirst):
            start_player = 0
        else:
            start_player = 1
        ex.endTheGame(win=game.start_play(human,
                                          mcts_player,
                                          start_player=start_player,
                                          is_shown=0),
                      move=board.last_move)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #30
def run():
    n = 5
    width, height = 8, 8
    model_file = 'best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # pure MCTS player
        # set quick_play=True to enable a weaker but much faster roll-out player without MCTS
        pure_mcts_player = MCTS_Pure(c_puct=1, n_playout=600, quick_play=False)
        roll_out_player = MCTS_Pure(quick_play=True)

        # 1. run with two human players
        game.start_play_with_UI()

        # 2. run with the AlphaZero neural-network AI and my quick roll-out AI
        #game.start_play_with_UI(AI=mcts_player, AI2 = roll_out_player)

        # 3. run with the AlphaZero neural-network AI and my pure MCTS AI
        #game.start_play_with_UI(AI=mcts_player, AI2 = pure_mcts_player)

    except KeyboardInterrupt:
        print('\n\rquit')