def run():
    model_file = './current_policy.model'
    best_policy = PolicyValueNet(6, 6, model_file)
    config = GameConfig()
    board = Board(config)
    game = Game(board)
    mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5,
                              n_playout=1000)
    mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1000)
    mcts_player3 = MCTS_Pure(c_puct=5, n_playout=1000)
    human = Human(config)
    human2 = Human(config)
    human3 = Human(config)
    # note: start_play is called with three players here, unlike the
    # two-player variants elsewhere in this file; mcts_player1, human2 and
    # human3 are constructed but unused
    game.start_play(mcts_player3, human, mcts_player2)
def policy_evaluate(self, iteration, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2 + 1,
                                      is_shown=0,
                                      savefig=False)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts: {}, win: {}, lose: {}, tie: {}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    self.writer.add_text(
        tag='evaluation results',
        text_string=f"num_playouts: {self.pure_mcts_playout_num}, "
                    f"win: {win_cnt[1]}, lose: {win_cnt[2]}, tie: {win_cnt[-1]}",
        global_step=iteration + 1)
    return win_ratio
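# Sketch (not from the source): the evaluators in this file all share the
# same bookkeeping, where winner keys 1 and 2 are the two players and -1
# (or 0 in some variants) marks a tie, which scores half a win. The helper
# below is hypothetical and only illustrates that scoring rule.
def win_ratio_from_winners(winners, n_games):
    from collections import defaultdict
    win_cnt = defaultdict(int)
    for w in winners:
        win_cnt[w] += 1
    return 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games

# e.g. 6 wins, 3 losses, 1 tie over 10 games -> 0.65
assert win_ratio_from_winners([1] * 6 + [2] * 3 + [-1], 10) == 0.65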
def policy_evaluate(self, pure_mcts_playout_num, current_policy_value_net,
                    n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(
        current_policy_value_net.policy_value_fn,
        c_puct=self.c_puct,
        n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    board = Board(width=self.board_width,
                  height=self.board_height,
                  n_in_row=self.n_in_row)
    game = Game(board)
    logging.info('update process alphazero with pure mcts game start')
    for i in range(n_games):
        winner = game.start_play(current_mcts_player,
                                 pure_mcts_player,
                                 start_player=i % 2,
                                 is_shown=0)
        win_cnt[winner] += 1
    logging.info('update process alphazero with pure mcts game finished')
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    logging.info(
        "update process num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def run():
    n = N
    width, height = SIZE, SIZE

    # player_1 is the MODEL_1 player reported in the result print below
    best_policy_1 = PolicyValueNet(width, height, model_file=MODEL_1)
    player_1 = MCTSPlayer(best_policy_1.policy_value_fn,
                          c_puct=5,
                          n_playout=400)  # set larger n_playout for better performance
    # player_1 = Human()  # alternative: play MODEL_1's opponent by hand

    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        print("Benchmarking the following two models: " + MODEL_1 + " Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        print("Benchmarking the following two models: " + MODEL_1 + " Human")
    else:
        # player_2 must be defined on this path for the evaluation below
        print("Benchmarking the following two models: " + MODEL_1 + " " + MODEL_2)
        best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
        player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
                              c_puct=5,
                              n_playout=400)  # set larger n_playout for better performance

    result = policy_evaluate(player_1, player_2)
    print("The win ratio for " + MODEL_1 + " is: ", str(100 * result) + "%")
def single_game_play():
    board = Board(width=15, height=15, n_in_row=5)
    game = Game(board)
    temp = 1.0
    player = MCTS_Pure()
    winner, play_data = game.start_self_play(player, temp=temp)
    return winner, play_data
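# Hypothetical usage sketch for single_game_play() above: pool the per-move
# training tuples from a few self-play games into one batch.
# collect_selfplay_data is not part of the source.
def collect_selfplay_data(n_games=3):
    data = []
    for _ in range(n_games):
        winner, play_data = single_game_play()
        data.extend(play_data)  # play_data yields per-move training tuples
    return data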
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy network by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    send_msg("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    if not os.path.exists(self.evaluate_path):
        with open(self.evaluate_path, 'w') as f:
            # trailing newline so the first data row is not glued to the header
            f.write('i, num_playouts, win, lose, tie\n')
    with open(self.evaluate_path, 'a') as f:
        f.write(f'{self.i}, {self.pure_mcts_playout_num}, {win_cnt[1]}, '
                f'{win_cnt[2]}, {win_cnt[-1]}\n')
    return win_ratio
def policy_evaluate(self, current_batch, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    output = ("current_batch:{},num_playouts:{},win:{},lose:{},tie:{},"
              "win_ratio:{}").format(current_batch,
                                     self.pure_mcts_playout_num, win_cnt[1],
                                     win_cnt[2], win_cnt[-1], win_ratio)
    utils.log(output, SCORE_OUTPUT)
    return win_ratio
def policy_evaluate(self, n_games=10):
    '''
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    '''
    current_mcts_player = MCTSPlayer(
        policy_value_function=self.policy_value_net.policy_value_fn_random,
        action_fc=self.policy_value_net.action_fc_test,
        evaluation_fc=self.policy_value_net.evaluation_fc2_test,
        c_puct=5,
        n_playout=400,
        is_selfplay=False)
    test_player = MCTS_Pure(c_puct=5, n_playout=3000)
    win_cnt = defaultdict(int)
    # 5 games with the white stones, 5 games with the black stones
    for i in range(n_games):
        winner = self.game.start_play(player1=current_mcts_player,
                                      player2=test_player,
                                      start_player=i % 2,
                                      is_shown=0,
                                      print_prob=False)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def policy_evaluate(self, n_games=10):
    # print("_____policy__evaluation________")
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        print("winner", winner)
        win_cnt[winner] += 1
    # note: this variant records ties under key 0 rather than -1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
    print("win ratio =", win_ratio)
    print("num_playout:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
    return win_ratio
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    self.evaluate_game = Game(
        Board(width=self.config['board_width'],
              height=self.config['board_height'],
              n_in_row=self.config['n_in_row']))
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.config['c_puct'],
                                     n_playout=self.config['n_playout'])
    pure_mcts_player = MCTS_Pure(
        c_puct=5, n_playout=self.config['pure_mcts_playout_num'])
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.evaluate_game.start_play(current_mcts_player,
                                               pure_mcts_player,
                                               start_player=i % 2,
                                               is_shown=0)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.config['pure_mcts_playout_num'], win_cnt[1], win_cnt[2],
        win_cnt[-1]))
    return win_ratio
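# Illustrative config for the dict-driven evaluator above. The key names
# match the self.config lookups in that function; the values here are
# assumptions, not taken from the source.
config = {
    'board_width': 8,
    'board_height': 8,
    'n_in_row': 5,
    'c_puct': 5,
    'n_playout': 400,
    'pure_mcts_playout_num': 1000,
}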
def policy_evaluate(self, n_games=10, num=0, self_evaluate=0):
    '''
    Evaluate the trained policy by playing against the pure MCTS player,
    or by playing against itself:
      - pure MCTS is only for monitoring the progress of training
      - playing against itself (the last best net) evaluates the best model
        so as to collect data
    '''
    # fix the playout count to 400
    current_mcts_player = MCTSPlayer(
        policy_value_function=self.policy_value_net.policy_value_fn_random,
        action_fc=self.policy_value_net.action_fc_test,
        evaluation_fc=self.policy_value_net.evaluation_fc2_test,
        c_puct=self.c_puct,
        n_playout=400,
        is_selfplay=False)
    if self_evaluate:
        self.policy_value_net.load_numpy(
            self.policy_value_net.network_oppo_all_params)
        mcts_player_oppo = MCTSPlayer(
            policy_value_function=self.policy_value_net.policy_value_fn_random,
            action_fc=self.policy_value_net.action_fc_test_oppo,
            evaluation_fc=self.policy_value_net.evaluation_fc2_test_oppo,
            c_puct=self.c_puct,
            n_playout=400,
            is_selfplay=False)
    else:
        test_player = MCTS_Pure(c_puct=5,
                                n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        # `rank` is assumed to be a module-level worker id (e.g. an MPI rank)
        if self_evaluate:
            print('+' * 80 +
                  'rank: {}, epoch: {}, game: {}, now situation: {}, '
                  'self evaluating ...'.format(rank, num, i, win_cnt))
            winner = self.game.start_play(player1=current_mcts_player,
                                          player2=mcts_player_oppo,
                                          start_player=i % 2,
                                          is_shown=0,
                                          print_prob=False)
        else:
            print('+' * 80 +
                  'pure mcts playout: {}, rank: {}, epoch: {}, game: {} '
                  'evaluating ...'.format(self.pure_mcts_playout_num,
                                          rank, num, i))
            print()
            winner = self.game.start_play(player1=current_mcts_player,
                                          player2=test_player,
                                          start_player=i % 2,
                                          is_shown=0,
                                          print_prob=False)
        win_cnt[winner] += 1
    # win counts as 1, tie as 0.5
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    if self_evaluate:
        print("-" * 150 + "win: {}, lose: {}, tie:{}".format(
            win_cnt[1], win_cnt[2], win_cnt[-1]))
    else:
        print("-" * 80 + "num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def run():
    n = 5
    width, height = 8, 8
    model_file = 'best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a
        # MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # to support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # set larger n_playout for better performance

        # pure MCTS player
        # set quick_play=True to enable a weaker but much faster
        # roll-out player without MCTS
        pure_mcts_player = MCTS_Pure(c_puct=1, n_playout=600,
                                     quick_play=False)
        roll_out_player = MCTS_Pure(quick_play=True)

        # 1. run with two human players
        game.start_play_with_UI()
        # 2. run with the AlphaZero neural-network AI and the quick roll-out AI
        # game.start_play_with_UI(AI=mcts_player, AI2=roll_out_player)
        # 3. run with the AlphaZero neural-network AI and the pure MCTS AI
        # game.start_play_with_UI(AI=mcts_player, AI2=pure_mcts_player)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self):
    self.temp = 1e-3  # the temperature param
    self.n_playout = 200  # num of simulations for each move
    self.c_puct = 5
    self.board_width = 8
    self.board_height = 8
    self.model_path = os.path.join("./models/curr_model_100rollout.pt")
    # self.policy_value_net = PolicyValueNet(self.board_width,
    #                                        self.board_height,
    #                                        net_params=None)
    # self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
    #                               c_puct=self.c_puct,
    #                               n_playout=self.n_playout)
    self.mcts_player = MCTS_Pure(c_puct=5, n_playout=self.n_playout)
    self.env = gym.make("Reversi8x8-v0")
    self.init_model()
def run():
    n = 5
    width, height = 12, 12
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        mcts_player = MCTS_Pure(c_puct=5,
                                n_playout=10000)  # set larger n_playout for better performance
        human = Human()
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    # n = 5
    width, height = 5, 5
    # model_file = 'best_policy_8_8_5.model'
    try:
        # board = Board(width=width, height=height, n_in_row=n)
        board = Board(width=width, height=height)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a
        # MCTS player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # to support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # play with pure MCTS (much weaker even with a larger n_playout)
        mcts_player1 = MCTS_Pure(c_puct=5, n_playout=500)
        mcts_player2 = MCTS_Pure(c_puct=5, n_playout=500)

        # human player, input your move in the format: 2,3
        # human = Human()

        # set start_player=0 for human first
        game.start_play(mcts_player1, mcts_player2, start_player=1,
                        is_shown=0)
        # game.start_play(human, mcts_player2, start_player=1, is_shown=0)
    except KeyboardInterrupt:
        print('\n\rquit')
def policy_evaluate(self, n_games=10):
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        # the AI plays the weak AI (pure MCTS) with no visualization
        # (is_shown=0); start_player=i % 2 alternates who plays black
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    # compute the win ratio; a tie counts as 0.5
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def policy_evaluate(self, n_games=10, batch=0):
    """
    Evaluate the trained policy by playing games against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("batch_i:{}, num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
        win_cnt[-1]))
    logging.debug("batch_i {} num_playouts {} win {} lose {} tie {}".format(
        batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
        win_cnt[-1]))
    return win_ratio
def run():
    n = N
    width, height = SIZE, SIZE
    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        # print("Benchmarking the following two models: " + MODEL_1 + " Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        # print("Benchmarking the following two models: " + MODEL_1 + " Human")
    else:
        pass
        # print("Benchmarking the following two models: " + MODEL_1 + " " + MODEL_2)
        # best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
        # player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
        #                       c_puct=5,
        #                       n_playout=400)  # set larger n_playout for better performance
    # player_1 = Human()

    win_ratios = []
    game_batchs = range(50, 1501, 100)
    for game_batch in game_batchs:
        model = './models/iter_' + str(game_batch) + '.model'
        print(model)
        policy = PolicyValueNet(width, height, model_file=model)
        player_1 = MCTSPlayer(policy.policy_value_fn,
                              c_puct=5,
                              n_playout=400)  # set larger n_playout for better performance
        win_ratio = policy_evaluate(player_1, player_2)
        win_ratios.append(win_ratio)
        print("The win ratio for " + model + " is: ",
              str(100 * win_ratio) + "%")
    # wrap in list() so the pairs actually print under Python 3
    print(list(zip(win_ratios, game_batchs)))
    fig, ax = plt.subplots()
    ax.plot(game_batchs, win_ratios)
    ax.set(xlabel='iterations',
           ylabel='win ratios',
           title='Win ratio of models trained by 5 input states vs. MCTS player')
    ax.grid()
    fig.savefig("win_ratio.png")
def run():
    n = 5
    width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    # model_file = 'best_policy_6_6_4.model'
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # human player, input your move in the format: 2,3
        human = Human()

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # add FORBIDDEN move player
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a
        # MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # to support python3

        # ################ ORIGINAL POLICY and PLAYER ################
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        mcts_pure = MCTS_Pure(c_puct=5, n_playout=1000)

        # set start_player=0 for human first
        # game.start_play(human, mcts_player, start_player=1, is_shown=1)

        # ############## IMPLEMENTED PURE RL PLAYER ##############
        adv_player = QPlayer(board)
        game.start_play(human, adv_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def run(states, sensible_moves, currentPlayer, lastMove):
    n = 5
    width, height = 8, 8
    board = Board(width=width, height=height, n_in_row=n)
    board.init_board()
    board.states = states
    board.availables = sensible_moves
    board.current_player = currentPlayer
    board.last_move = lastMove
    # best_policy = PolicyValueNetNumpy(width, height, policy_param)
    # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
    #                          c_puct=5, n_playout=400)
    # use pure MCTS only; the n_playout parameter sets the number of searches
    mcts_player = MCTS_Pure(c_puct=5, n_playout=4000)
    nextmove = mcts_player.get_action(board)
    return nextmove
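# Hypothetical driver for run() above: reconstruct a position on the 8x8
# board where player 1 has just played move 27 and ask for player 2's reply.
# `states` maps move index -> player, matching Board.states in this codebase;
# the position itself is made up for illustration.
states = {27: 1}
sensible_moves = [m for m in range(8 * 8) if m not in states]
next_move = run(states, sensible_moves, currentPlayer=2, lastMove=27)
print(next_move)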
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.chess_mcts_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.mcts_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.run_game.play_game(current_mcts_player,
                                         pure_mcts_player)
        print(winner)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.mcts_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in tqdm(range(n_games), ascii=True, desc='Policy Evaluate'):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=self.is_shown)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def policy_evaluate(self, n_games=10):
    """
    Evaluate the current policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=self.is_shown)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
def play(self):
    model_file = "current.model"
    best_policy = PolicyValueNet(self.width, self.height, model_file)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5,
                             n_playout=300)
    pure_player = MCTS_Pure(c_puct=5, n_playout=300)
    human1 = Human()
    human2 = Human()
    # self.show()
    win_cnt = defaultdict(int)
    for i in range(10):
        winner = self.start_play(mcts_player, pure_player,
                                 start_player=(i % 2), is_shown=1)
        win_cnt[winner] += 1
    print("win", win_cnt[1], "lose", win_cnt[2], "tie", win_cnt[0])
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_player = self.player
    win_ratios = {}
    for playout_num in self.pure_mcts_playout_num:
        pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        win_ratios[str(playout_num)] = win_ratio
    return win_ratios
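# Hypothetical follow-up for the multi-strength evaluator above: it returns
# one ratio per pure-MCTS playout budget, so a caller (here a made-up
# `trainer` instance) might report the results like this.
ratios = trainer.policy_evaluate(n_games=10)
for playout_num, ratio in sorted(ratios.items(), key=lambda kv: int(kv[0])):
    print("pure MCTS with {} playouts -> win ratio {:.2f}".format(
        playout_num, ratio))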
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = 0
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      pure_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        # only count games the current player actually won; incrementing
        # unconditionally would always report a 100% win ratio
        if winner == 1:
            win_cnt += 1
    win_ratio = win_cnt / n_games
    print("num_playouts:{}, win: {}".format(self.pure_mcts_playout_num,
                                            win_cnt))
    return win_ratio
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing games against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        print("train-policy_evaluate: game = %d" % i)
        winner = start_play(self.board,
                            current_mcts_player,
                            pure_mcts_player,
                            startPlayer=i % 2)
        win_cnt[winner] += 1
    # this variant records ties under key 0 rather than -1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
    return win_ratio
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing games against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(
        self.policy_value_net.policy_value_func,
        c_puct=self.c_puct,
        n_play_out=self.n_play_out)
    pure_mcts_player = MCTS_Pure(c_puct=5,
                                 n_play_out=self.pure_mcts_play_out_number)
    win_cnt = defaultdict(int)
    # starmap unpacks each (player1, player2, i) tuple into positional
    # arguments; plain map would pass the whole tuple as a single argument
    # (assumes self.pool is a multiprocessing.Pool)
    results = self.pool.starmap(self.game.start_play,
                                [(current_mcts_player, pure_mcts_player, i)
                                 for i in range(n_games)])
    for winner in results:
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print_log("number_play_outs:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_play_out_number, win_cnt[1], win_cnt[2], win_cnt[-1]))
    return win_ratio
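# Minimal, self-contained sketch (not from the source) of the map/starmap
# distinction relied on above: Pool.map passes each tuple as one argument,
# while Pool.starmap unpacks it into positional arguments.
from multiprocessing import Pool

def dummy_game(p1, p2, i):
    return (i % 2) + 1  # pretend the starting player always wins

if __name__ == '__main__':
    with Pool(2) as pool:
        winners = pool.starmap(dummy_game,
                               [(None, None, i) for i in range(4)])
    print(winners)  # [1, 2, 1, 2]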
def parse_agent(agent_type, filename):
    if agent_type == 'mcts_a0':
        model_file = 'best_policy_8_8_5.model'
        if filename:
            model_file = filename
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a
        # MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # to support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        player = MCTSPlayer(best_policy.policy_value_fn,
                            c_puct=5,
                            n_playout=400)  # set larger n_playout for better performance
    elif agent_type == 'mcts_pure':
        player = MCTS_Pure(c_puct=5, n_playout=1000)
    elif agent_type == 'minmax':
        player = Minimax()
    elif agent_type == 'dqn':
        model_file = 'output/v_1/epoch_100/agent_2.pkl'
        if filename:
            model_file = filename
        player = DQNPlayer(model_file)
    elif agent_type == 'human':
        player = Human()
    else:
        player = Human()
        print('Illegal Agent Type. Defaulting to human player.')
    return player
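# Hypothetical usage of parse_agent above; note that `width` and `height`
# in the 'mcts_a0' branch are assumed to be module-level globals, since the
# function reads them without defining them locally.
player_1 = parse_agent('mcts_pure', None)
player_2 = parse_agent('human', None)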