def run():
    n = 5
    width, height = 9, 9
    iteration = 1000
    model_file = './model/current_policy_{}_{}_{}_iteration{}.model'.format(
        height, width, n, iteration)
    # model_file = './model/best_policy_{}_{}_{}.model'.format(height, width, n)
    try:
        board = Board(width=width, height=height, n_in_row=n)
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        AI_player1 = MCTSPlayer(best_policy.policy_value_fn,
                                c_puct=5,
                                n_playout=400)
        AI_player2 = MCTSPlayer(best_policy.policy_value_fn,
                                c_puct=5,
                                n_playout=400)
        human = Human()
        game = Game("AlphaZero Gomoku", board, AI_player1, AI_player2)
        while True:
            game.play()
            pygame.display.update()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    exit()
                elif event.type == pygame.MOUSEBUTTONDOWN:
                    mouse_x, mouse_y = pygame.mouse.get_pos()
                    game.mouseClick(mouse_x, mouse_y)
                    game.check_buttons(mouse_x, mouse_y)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    model_file = './current_policy.model'
    best_policy = PolicyValueNet(6, 6, model_file)
    config = GameConfig()
    board = Board(config)
    game = Game(board)
    mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5,
                              n_playout=1000)
    mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1000)
    mcts_player3 = MCTS_Pure(c_puct=5, n_playout=1000)
    human = Human(config)
    human2 = Human(config)
    human3 = Human(config)
    game.start_play(mcts_player3, human, mcts_player2)
def policy_evaluate(self, n_games=10):
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        # The trained AI plays against the weak pure-MCTS AI; no visualization
        # is needed (is_shown=0), and the two sides alternate playing black
        # (start_player=i % 2).
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    # Compute the win ratio; a tie counts as 0.5.
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num,
        win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
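# A minimal standalone sketch of the scoring convention assumed in
# policy_evaluate above: start_play is taken to return a winner id
# (1 for current_mcts_player, 2 for the pure-MCTS opponent, -1 for a tie),
# and a tie contributes 0.5 toward the win ratio.
from collections import defaultdict

def win_ratio_from_results(results):
    # results: iterable of winner ids, e.g. [1, 1, 2, -1, 1]
    win_cnt = defaultdict(int)
    for winner in results:
        win_cnt[winner] += 1
    return (win_cnt[1] + 0.5 * win_cnt[-1]) / len(results)

# win_ratio_from_results([1, 1, 2, -1, 1]) -> 0.7 (3 wins plus half a tie, over 5 games)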
def run():
    w = 12
    try:
        ai_player = AiEngine('pela', 'pbrain-pela.exe', w)
        best_policy = PolicyValueNet(
            w, w, 'result/pytorch_12_5/current_policy.model')
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=2000)
        game = Game(Board(width=w, height=w, n_in_row=5))
        game.start_play(ai_player, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def policy_evaluate(self, n_games=10, batch=0):
    """
    Evaluate the trained policy by playing games against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("batch_i:{}, num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        batch, self.pure_mcts_playout_num,
        win_cnt[1], win_cnt[2], win_cnt[-1]))
    logging.debug("batch_i {} num_playouts {} win {} lose {} tie {}".format(
        batch, self.pure_mcts_playout_num,
        win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def __init__(self, init_model=None, is_remote=False):
    # params of the board and the game
    self.board_width = 8
    self.board_height = 8
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.n_playout_self_play = 1000
    self.c_puct = 5
    self.buffer_size = 2000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 100
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.td_step = 2
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if is_remote:
        self.path = '/content/drive/My Drive/'
    else:
        self.path = './'
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=self.path + init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout_self_play,
                                  is_selfplay=1)
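# A hedged sketch of how lr_multiplier is commonly applied in the
# policy_update step of this family of training pipelines (the update itself
# is not shown above, so the kl argument here is an assumption): the effective
# learning rate is learn_rate * lr_multiplier, and the multiplier shrinks when
# the measured KL divergence between the old and new policy overshoots
# kl_targ, and grows when it undershoots.
def adjust_lr_multiplier(lr_multiplier, kl, kl_targ):
    if kl > kl_targ * 2 and lr_multiplier > 0.1:
        lr_multiplier /= 1.5  # update moved the policy too far: slow down
    elif kl < kl_targ / 2 and lr_multiplier < 10:
        lr_multiplier *= 1.5  # update barely moved the policy: speed up
    return lr_multiplier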
def run():
    n = N
    width, height = SIZE, SIZE
    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        # print("Benchmarking the following two models:" + MODEL_1 + " Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        # print("Benchmarking the following two models:" + MODEL_1 + " Human")
    else:
        pass
        # print("Benchmarking the following two models:" + MODEL_1 + " " + MODEL_2)
        # best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
        # player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
        #                       c_puct=5,
        #                       n_playout=400)  # set larger n_playout for better performance
        # player_1 = Human()
    win_ratios = []
    game_batchs = range(50, 1501, 100)
    for game_batch in game_batchs:
        model = './models/iter_' + str(game_batch) + '.model'
        print(model)
        policy = PolicyValueNet(width, height, model_file=model)
        player_1 = MCTSPlayer(
            policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        win_ratio = policy_evaluate(player_1, player_2)
        win_ratios.append(win_ratio)
        print("The win ratio for " + model + " is: ", str(100 * win_ratio) + "%")
    # wrap in list() so the pairs actually print under Python 3
    print(list(zip(win_ratios, game_batchs)))
    fig, ax = plt.subplots()
    ax.plot(game_batchs, win_ratios)
    ax.set(
        xlabel='iterations',
        ylabel='win ratios',
        title='Win ratio of models trained by 5 input states vs. MCTS player')
    ax.grid()
    fig.savefig("win_ratio.png")
def get_action(self, board): print("AI's turn") try: model_file = './best_model_9_9_5.h5' best_policy = PolicyValueNet(9, 9, model_file=model_file) mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400) except Exception as e: print(e) move = -1 if move == -1 or move not in board.availables: print(f"invalid move: {move}") move = self.get_action(board) return move
def run(n_in_row, width, height,   # n-in-a-row to win; board width and height
        model_file, ai_first,      # model file to load; whether the AI moves first
        n_playout, use_gpu):       # MCTS simulations per AI move; whether to use the GPU
    try:
        board = Board(width=width, height=height, n_in_row=n_in_row)  # create a board
        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width, height,
                                     model_file=model_file,
                                     use_gpu=use_gpu)  # load the best policy network
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=n_playout)  # create an AI player
        main = UserInterface_GO_Human_vs_AI(mcts_player, board, width, height)
        main.test()
        # set start_player=0 for human first
        # game.start_play(human, mcts_player, start_player=ai_first, is_shown=1)  # start the game
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None, board_width=6, board_height=6, n_in_row=4, n_playout=400, use_gpu=False, is_shown=False, output_file_name="", game_batch_number=1500): # params of the board and the game self.board_width = board_width self.board_height = board_height self.n_in_row = n_in_row self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row) self.game = Game(self.board) # training params self.learn_rate = 2e-3 self.lr_multiplier = 1.0 # adaptively adjust the learning rate based on KL self.temp = 1.0 # the temperature param self.n_playout = n_playout # num of simulations for each move self.c_puct = 5 self.buffer_size = 10000 self.batch_size = 512 # mini-batch size for training self.data_buffer = deque(maxlen=self.buffer_size) self.play_batch_size = 1 self.epochs = 5 # num of train_steps for each update self.kl_targ = 0.02 self.check_freq = 50 self.game_batch_num = game_batch_number self.best_win_ratio = 0.0 # num of simulations used for the pure mcts, which is used as # the opponent to evaluate the trained policy self.pure_mcts_playout_num = 1000 self.use_gpu = use_gpu self.is_shown = is_shown self.output_file_name = output_file_name self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, model_file=init_model, use_gpu=self.use_gpu) self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
def run():
    n = 5
    width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    # model_file = 'best_policy_6_6_4.model'
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # human player, input your move in the format: 2,3
        human = Human()
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # Add FORBIDDEN move player
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)
        # load the provided model (trained in Theano/Lasagne) into a MCTS player
        # written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        # ################ ORIGINAL POLICY and PLAYER ################
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        mcts_pure = MCTS_Pure(c_puct=5, n_playout=1000)
        # set start_player=0 for human first
        # game.start_play(human, mcts_player, start_player=1, is_shown=1)
        # ############## IMPLEMENTED PURE RL PLAYER ##############
        adv_player = QPlayer(board)
        game.start_play(human, adv_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model):
    self.init_model = init_model
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if os.path.isdir(init_model):
        self.is_init = True
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        self.is_init = False
        os.system('mkdir ' + init_model)
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
    if not os.path.isdir(init_model + 'best'):
        os.system('mkdir ' + init_model + 'best')
def run():
    # n = 5
    # width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    n = 4
    width, height = 6, 6
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)
        # load the provided model (trained in Theano/Lasagne) into a MCTS player
        # written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 7
    self.board_height = 7
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 1500  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 150000
    self.batch_size = 2048  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    if os.path.exists("data_buffer.pkl"):
        with open("data_buffer.pkl", "rb") as f:
            self.data_buffer = pickle.load(f)
        print("Load data, size = %d" % len(self.data_buffer))
    self.play_batch_size = 1
    self.epochs = 10  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 1500
    self.save_freq = 500
    self.game_batch_num = 10000
    self.best_win_ratio = 0.0
    self.episode_len = 0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
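# A minimal sketch of the matching save side for the data_buffer persistence
# loaded in the __init__ above (where in the pipeline the save happens is an
# assumption; a natural spot is right after each self-play batch):
import pickle

def save_data_buffer(data_buffer, path="data_buffer.pkl"):
    with open(path, "wb") as f:
        pickle.dump(data_buffer, f)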
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 8   # 6
    self.board_height = 8  # 6
    self.n_in_row = 5      # 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    # c_puct controls the exploration-exploitation tradeoff in MCTS: larger
    # values bias the search toward uniform exploration, smaller values
    # toward the branches that have already been visited most often
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    # how often to check the current policy's win ratio: evaluate the current
    # policy every 50 training batches and save the model whenever a better
    # policy is found
    self.check_freq = 50
    # number of training iterations
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as the
    # opponent to evaluate the trained policy; initialized to 1000 and
    # increased gradually as training progresses
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        policy_param = pickle.load(open(init_model, 'rb'))
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               net_params=policy_param)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
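# A minimal sketch of the PUCT score that the c_puct comment above refers to
# (the standard AlphaZero node-selection formula; the variable names here are
# illustrative, not taken from the code above): each child is scored by its
# mean action value Q plus an exploration bonus weighted by c_puct.
import math

def puct_score(q, prior, parent_visits, child_visits, c_puct=5):
    u = c_puct * prior * math.sqrt(parent_visits) / (1 + child_visits)
    return q + u  # the search descends into the child maximizing this score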
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6   # board width
    self.board_height = 6  # board height
    self.n_in_row = 4      # win condition: how many stones in a row win the game
    # instantiate a Board with the given size and win condition
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # initialize the network and the search tree; the network is kept for
    # the whole run, while when the tree gets reset is left unclear here
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run(width=8, height=8, num_in_row=5, model_file='best_policy_8_8_5.model'):
    try:
        board = Board(n_in_row=num_in_row)
        game = Game(board)
        policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')
        best_policy = PolicyValueNetPlay(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)
        # set start_player=0 for human first
        game.start_play(Human(), mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None, board_width=6, board_height=6, n_in_row=4, n_playout=400, use_gpu=False, is_shown=False, output_file_name="", game_batch_number=1500): # 游戏和棋盘参数 self.board_width = board_width self.board_height = board_height self.n_in_row = n_in_row self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row) self.game = Game(self.board) # 训练参数 self.learn_rate = 2e-3 #学习率α :0.002 self.lr_multiplier = 1.0 # 根据 KL散度 适应性的调整学习率 self.temp = 1.0 # 温度参数t self.n_playout = n_playout # 每次move的 模拟playout次数 self.c_puct = 5 #c_put常量 self.buffer_size = 10000 #缓冲区大小 self.batch_size = 512 # mini-batch size for training self.data_buffer = deque(maxlen=self.buffer_size) self.play_batch_size = 1 self.epochs = 5 # 每次 update 的 train_steps self.kl_targ = 0.02 self.check_freq = 100 self.game_batch_num = game_batch_number #训练局数 self.best_win_ratio = 0.0 # 纯蒙特卡索搜索训练参数 # 目的可以是作为真正训练的模型的对手 self.pure_mcts_playout_num = 7000 #纯蒙特卡洛搜索模拟次数 self.use_gpu = use_gpu self.is_shown = is_shown self.output_file_name = output_file_name #输出的txt文件名 #初始化神经网络 self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, model_file=init_model, use_gpu=self.use_gpu ) # self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
def run(self): """run the training pipeline""" np.random.seed(0) try: for i in range(self.num_total_iter): n_data, n_game = self.collect_selfplay_data() print("iteration {}: total {} data collected from {} game(s)". format(i, n_data, n_game)) self.policy_value_net.set_train_mode() self.policy_update() self.data_buffer.clear() # check the performance of the current model, # and save the model params if (i + 1) % self.save_freq == 0: print("saving current model at {}: file={}".format( i + 1, self.config.get_current_model_name())) self.policy_value_net.save_model( self.config.get_current_model_name()) if (i + 1) % self.eval_freq == 0: print("evalutating current model: {}".format(i + 1)) current_mcts_player = MCTSPlayer( self.policy_value_net, c_puct=self.config.c_puct, n_playout=self.config.n_playout, ) win_ratio = evaluate.evaluate_policy( self.game, current_mcts_player) if win_ratio > self.best_win_ratio: print( "saving the new best policy at {}! win_ratio={}, file={}" .format( i, win_ratio, self.config.get_best_model_name(), )) self.best_win_ratio = win_ratio # update the best_policy self.policy_value_net.save_model( self.config.get_best_model_name()) except KeyboardInterrupt: print("\n\rquit")
def run():
    n = 5
    width, height = 8, 8
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width, height)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # set larger n_playout for better performance
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 10
    self.board_height = 10
    self.actiondim = 24
    self.totaltime = 100
    # self.n_in_row = 4
    self.board = Board(width=self.board_width, height=self.board_height)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = 10
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as the
    # opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 500
    if init_model:
        # start training from an initial policy-value net
        policy_param = pickle.load(open(init_model, 'rb'))
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               net_params=policy_param)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               self.actiondim,
                                               self.totaltime)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    if len(sys.argv) != 2:
        print(sys.argv)
        print("Need to provide one argument, the model to play with")
        sys.exit(0)
    n = 5
    width, height = 15, 15
    model_file = sys.argv[1]
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)
        # load the provided model (trained in Theano/Lasagne) into a MCTS player
        # written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None): """ init function for the class""" # params of the board and the game self.board_width = 6 # board width self.board_height = 6 # board height self.n_in_row = 4 # win by n in line (vertically, horizontally, diagonally) self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row) self.game = Game(self.board) # training params self.learn_rate = 2e-3 self.lr_multiplier = 1.0 # adaptively adjust the learning rate based on KL self.temp = 1.0 # the temperature param self.n_playout = 400 # num of simulations for each move self.c_puct = 5 # a number in (0, inf) controlling the relative impact of value Q, and prior probability P, on this node's score. self.buffer_size = 10000 # buffer size for replaying experience self.batch_size = 512 # mini-batch size for training self.data_buffer = deque(maxlen=self.buffer_size) # buffer self.play_batch_size = 1 # size of rollout for each episode self.epochs = 5 # num of train_steps for each update self.kl_targ = 0.02 # target of KL loss self.check_freq = 50 # frequency for check evaluation and save model self.game_batch_num = 1500 # number of training game loop self.best_win_ratio = 0.0 # best evaluated win ratio # num of simulations used for the pure mcts, which is used as # the opponent to evaluate the trained policy self.pure_mcts_playout_num = 1000 if init_model: # load from existing file # start training from an initial policy-value net self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, model_file=init_model) else: # start training from a new policy-value net self.policy_value_net = PolicyValueNet(self.board_width, self.board_height) self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 10
    self.board_height = 10
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3  # base learning rate
    self.lr_multiplier = 1.2  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulated playouts per move
    self.c_puct = 5  # tradeoff coefficient between exploitation and exploration
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)  # a double-ended queue (deque)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02  # KL target for early stopping
    self.check_freq = 50  # every 50 batches, check whether to update the policy-value net
    self.game_batch_num = 2000  # number of training batches to run
    self.best_win_ratio = 0.0  # current best win ratio, used to decide whether a better model was found
    # number of playouts for the weak pure-MCTS AI that serves as the
    # opponent for evaluating the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # load the policy network from init_model
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model,
                                               use_gpu=True)
    else:
        # train a new policy network from scratch
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               use_gpu=True)
    # AI player; is_selfplay=1 because training proceeds by self-play
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
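# A minimal sketch of how the temperature param (temp above) shapes the
# self-play move distribution from MCTS visit counts, following standard
# AlphaZero practice (a softmax over log visit counts scaled by 1/temp;
# whether this exact form matches the code above is an assumption):
import numpy as np

def visit_probs(visits, temp=1.0):
    logits = (1.0 / temp) * np.log(np.asarray(visits, dtype=np.float64) + 1e-10)
    logits -= logits.max()  # subtract the max for numerical stability
    probs = np.exp(logits)
    return probs / probs.sum()

# temp=1.0 keeps move probabilities proportional to visit counts;
# temp -> 0 approaches greedy selection of the most-visited move.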
def run():
    model_file = 'best_policy_pytorch.model'
    try:
        board = Board(size=BOARD_SIZE, n_in_row=N_IN_ROW)
        game = Game(board)
        # ############### human VS AI ###################
        best_policy = PolicyValueNet(BOARD_SIZE, model_file)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=800)  # set larger n_playout for better performance
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\nQuit')
def run():
    n = 5
    width, height = 15, 15
    model_file = 'dist/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game_UI(board, is_shown=1)
        # ############### Human-machine ###################
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)
        human = Human()
        game.start_play_mouse(human, mcts_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run(n_in_row, width, height, model_file, ai_first, n_playout, use_gpu):
    try:
        board = Board(width=width, height=height, n_in_row=n_in_row)  # create a board
        game = Game(board)  # load a game
        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width, height,
                                     model_file=model_file,
                                     use_gpu=use_gpu)  # load the best policy network
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=n_playout)  # create an AI player
        human = Human()  # create a human player
        # set start_player=0 for human first
        game.start_play(human, mcts_player,
                        start_player=ai_first, is_shown=1)  # start the game
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 5
    self.board_height = 6
    self.board = Board()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3  # learning rate 0.002
    # adaptively adjust the learning rate based on KL divergence, which
    # measures how close two probability distributions are: within the
    # allowed range, the parameters that minimize the KL divergence are
    # the optimal ones we want
    self.lr_multiplier = 1.0
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    # self.game_batch_num = 1500
    self.game_batch_num = 50
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def ai():
    try:
        board = Board(width=weight, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # best_policy = PolicyValueNet(weight, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)
        # load the provided model (trained in Theano/Lasagne) into a MCTS player
        # written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(weight, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human(gui=ex)
        # set start_player=0 for human first
        if humanFirst:
            start_player = 0
        else:
            start_player = 1
        ex.endTheGame(win=game.start_play(human, mcts_player,
                                          start_player=start_player,
                                          is_shown=0),
                      move=board.last_move)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    n = 5
    width, height = 8, 8
    model_file = 'best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)
        # load the provided model (trained in Theano/Lasagne) into a MCTS player
        # written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # pure MCTS player;
        # set quick_play=True for a weaker but much faster roll-out player without MCTS
        pure_mcts_player = MCTS_Pure(c_puct=1, n_playout=600, quick_play=False)
        roll_out_player = MCTS_Pure(quick_play=True)
        # 1. run with two human players
        game.start_play_with_UI()
        # 2. run with the AlphaZero neural-network AI and the quick roll-out AI
        # game.start_play_with_UI(AI=mcts_player, AI2=roll_out_player)
        # 3. run with the AlphaZero neural-network AI and the pure MCTS AI
        # game.start_play_with_UI(AI=mcts_player, AI2=pure_mcts_player)
    except KeyboardInterrupt:
        print('\n\rquit')