def __init__(self):
    # game (Gomoku) settings
    self.board_width, self.board_height = 9, 9
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.batch_size = 512  # mini-batch size: sample 512 examples from the buffer
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 500  # check and save the model every this many batches (originally 100)
    self.game_batch_num = 3000  # maximum number of training batches
    self.train_num = 0  # current training batch count
    # start training the policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Quoridor()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 1000  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 1000  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 64
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 2000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(model_file=init_model, use_gpu=True)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    self.board = Board()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    # self.board_height,
    self.temp = 1.0  # the temperature param
    self.n_playout = 1600  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 15000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model, use_gpu=True)
    else:
        self.policy_value_net = PolicyValueNet(use_gpu=True)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
    print("init done")
def __init__(self, init_model=None):
    # board params
    self.game = Quoridor()
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate
    self.temp = 1.0
    self.n_playout = 400
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 128  # was set to 1 while testing
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet()
    # set up the computer (MCTS) player
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self):
    # params of the board and the game
    self.board_width = 5
    self.board_height = 5
    self.game = Game()
    # training params
    self.learn_rate = 0.001
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 500  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 128  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 100
    self.game_batch_num = 2000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 3000
    # start training from a new policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the game
    self.width = 4
    self.height = 4
    self.game = Game()
    # params of training
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 300
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 64
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 5000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 500
    if init_model:
        self.policy_value_net = PolicyValueNet(self.width, self.height,
                                               model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet(self.width, self.height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run(self):
    """run the training pipeline"""
    try:
        for i in range(self.game_batch_num):
            self.collect_selfplay_data(self.play_batch_size)
            print("batch i:{}, episode_len:{}".format(
                i+1, self.episode_len))
            if len(self.data_buffer) > self.batch_size:
                loss, entropy = self.policy_update()
            # check the performance of the current model,
            # and save the model params
            if (i+1) % self.check_freq == 0:
                print("current self-play batch: {}".format(i+1))
                # win_ratio = self.policy_evaluate(n_games=1)
                self.policy_value_net.save_model('./best_policy.model')
                self.policy_value_net = PolicyValueNet('./best_policy.model',
                                                       use_gpu=True)
                # if win_ratio > self.best_win_ratio:
                #     print("New best policy!!!!!!!!")
                #     self.best_win_ratio = win_ratio
                #     # update the best_policy
                #     self.policy_value_net.save_model('./best_policy.model')
                #     if (self.best_win_ratio == 1.0 and
                #             self.pure_mcts_playout_num < 5000):
                #         self.pure_mcts_playout_num += 1000
                #         self.best_win_ratio = 0.0
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self):
    # params of the board and the game
    self.board_width = BOARD_SIZE
    self.board_height = BOARD_SIZE
    self.board = Board()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 300  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = 1
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.episode_len = 0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 300
    # start training from a given policy-value net
    # policy_param = pickle.load(open('current_policy.model', 'rb'))
    # self.policy_value_net = PolicyValueNet(self.board_width, self.board_height,
    #                                        net_params=policy_param)
    # start training from a new policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    self.game = Quoridor()
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 200
    self.c_puct = 5
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.kl_targ = 0.02
    self.check_freq = 10
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    self.old_probs = 0
    self.new_probs = 0
    self.first_trained = False
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.n = 8
    self.board = Board(self.n)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_play_out = 400  # number of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.epochs = 5  # number of train_steps for each update
    self.kl_target = 0.025
    self.check_freq = 50
    self.game_batch_number = 10000
    self.best_win_ratio = 0.0
    self.episode_length = 0
    self.pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    # number of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.last_batch_number = 0
    self.pure_mcts_play_out_number = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.n, model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.n)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_func,
                                  c_puct=self.c_puct,
                                  n_play_out=self.n_play_out,
                                  is_self_play=1)
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.game = Game()
        # training params
        self.config = TrainConfig()
        self.greedy_config = TrainGreedyConfig()
        self.data_buffer = deque(maxlen=self.config.buffer_size)
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.config.c_puct,
                                      n_playout=self.config.n_playout,
                                      is_selfplay=1)
        self.mcts_player_greedy = MCTSPlayerGreedy(
            self.policy_value_net.policy_value_fn,
            c_puct=self.greedy_config.c_puct,
            n_playout=self.greedy_config.n_playout,
            is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(
                self.mcts_player, temp=self.config.temp,
                greedy_player=self.mcts_player_greedy, who_greedy="B")
            play_data = list(play_data)
            # augment the data
            play_data = symmetry_board_moves(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        state_batch = [data[0] for data in self.data_buffer]
        mcts_probs_batch = [data[1] for data in self.data_buffer]
        winner_batch = [data[2] for data in self.data_buffer]
        self.policy_value_net.train(state_batch, mcts_probs_batch, winner_batch,
                                    self.config.epochs)
        self.policy_value_net.save_model("model.h5")

    def run(self):
        """run the training pipeline"""
        try:
            self.collect_selfplay_data(self.config.play_batch_size)
            self.policy_update()
        except KeyboardInterrupt:
            print('\n\rquit')

    def summary(self):
        self.policy_value_net.model.summary()
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # add output log
    self.formatter = logging.Formatter(
        '%(asctime)s [%(module)s] %(levelname)s: %(message)s',
        '%Y-%m-%d %H:%M:%S')
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(level=logging.INFO)
    self.handler = logging.FileHandler("output.log")
    self.handler.setLevel(logging.INFO)
    self.handler.setFormatter(self.formatter)
    self.console = logging.StreamHandler()
    self.console.setLevel(logging.INFO)
    self.console.setFormatter(self.formatter)
    self.logger.addHandler(self.handler)
    self.logger.addHandler(self.console)
    if init_model:
        if os.path.exists(init_model):
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.logger.error("{} does not exist!\n".format(init_model))
            # __init__ cannot return a value, so fail loudly instead
            raise FileNotFoundError(init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def smart_worker_train():
    pvnet = PolicyValueNet(board_n, model_filename)
    server = SmartServer(pvnet)
    # print("Training")
    # server.train_fn(*server.mem.get_history())
    # print("Done")
    while True:
        server.train()
        pvnet.save_model(model_filename)
def __init__(self):
    # params of the board and the game
    self.board_width = w
    self.board_height = h
    self.n_in_row = l
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = p
    self.game_batch_num = r
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    print(datetime.datetime.now(a),
          "init....{}x{}x{}".format(self.board_width,
                                    self.board_height,
                                    self.n_in_row))
    # start training from a given policy-value net
    if os.path.isfile('current_policy_{}_{}_{}.model'.format(
            self.board_width, self.board_height, self.n_in_row)):
        print("load old AI model ",
              'current_policy_{}_{}_{}.model'.format(self.board_width,
                                                     self.board_height,
                                                     self.n_in_row))
        policy_param = pickle.load(
            open('current_policy_{}_{}_{}.model'.format(self.board_width,
                                                        self.board_height,
                                                        self.n_in_row), 'rb'),
            encoding='bytes')
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               net_params=policy_param)
    else:
        # start training from a new policy-value net
        print("init new AI model")
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def simple_train():
    # board = Board(board_n, win)
    # game = Game()
    pvnet = PolicyValueNet(board_n, model_filename)
    # mcts_player = MCTSPlayer(pvnet.get_pvnet_fn())
    # bh, ph, vh = game.selfplay(board, mcts_player)
    # bh, ph, vh = game.selfplay(board, HumanPlayer())
    # print(vh)
    while True:
        train(pvnet, config.train_config['train_samples'])
        pvnet.save_model(model_filename)
def main():
    human = HumanWASDPlayer()
    play(human, human)
    pvnet = PolicyValueNet(board_n, model_filename)
    mem = Memory()
    while True:
        mcts_player = MCTSPlayer(pvnet.get_pvnet_fn(), play_style=3)
        bh, ph, vh = play(mcts_player, human, mcts_player)
        mem.save_data((bh, ph, vh))
        mcts_player = MCTSPlayer(pvnet.get_pvnet_fn(), play_style=3)
        bh, ph, vh = play(human, mcts_player, mcts_player)
        mem.save_data((bh, ph, vh))
def run():
    n = 5
    width, height = 8, 8
    model_file = 'current_policy_10.21.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS
        # player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, model_file)
        mcts_player = MCTSPlayer_Alphago(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def RLput(board, who, n_playout=400):
    model_file = "./best_policy.model"
    # policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')
    best_policy = PolicyValueNet(board.width, board.height, model_file=model_file)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=n_playout)
    # set the current player; only needed when using do_move
    # board.set_current_player(who)
    # if this is the very first move, play a random spot near the center
    last = board.getLast()
    if last == [-1, -1]:
        row = random.randint(2, 5)
        col = random.randint(2, 5)
        if board[row][col] == 0:
            move = board.location_to_move((row, col))
            if board.do_move(move):
                return True
        return False
    # not the first move: let the MCTS player choose
    move = mcts_player.get_action(board)
    # print(board.current_player, who)
    # input("press any key to continue")
    return board.do_move(move)
def run():
    # play the game against a human
    game = Game()
    # note: the model after 1500 training batches is still weak; the value head
    # is probably not well trained yet and needs more training - 2018.7.11
    best_policy = PolicyValueNet(5, 5)
    mctsplayer = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=10000)
    puremctsplayer = PURE(c_puct=5, n_playout=10000)
    human = Human()
    '''
    # human first, red
    win = {1: 0, 2: 0}
    for i in range(50):
        a = time.time()
        winner = game.start_play(puremctsplayer, mctsplayer, 1, 2, (i % 2 + 1),
                                 is_show=1)
        if winner == 1:
            win[1] += 1
        else:
            win[2] += 1
        # print(i, 'winner is', 'red' if winner == 1 else 'blue')
        print(i, 'blue win rate:', win[2] / (i + 1))
        print(i, 'cost:', time.time() - a, 's')
    # print('win rating ...', win[2] / 100)
    '''
    '''
    import time
    a = time.time()
    game.start_self_play(mctsplayer, is_show=1)
    print(time.time() - a)
    '''
    game.start_play(human, mctsplayer, 1, 2, 1, is_show=1)
def run():
    n = 4
    width, height = 6, 6
    model_file = './models_664_origpure_nofrust/current_policy_3450.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in TensorFlow
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=800,
                                 is_selfplay=0,
                                 disp=True)

        # for MCTS without neural network
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=5)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def net_mcts_play(board_width, board_height, n_in_row):
    game_board = GomokuBoard(width=board_width, height=board_height,
                             n_in_row=n_in_row)
    brain = PolicyValueNet(board_width, board_height,
                           ".\\CurrentModel\\GomokuAi")
    net_player = MCTSPlayer(brain.policy_value, 2000)
    mcts_player = MCTSPlayer(rollout_policy_value, 10000)
    while True:
        action = mcts_player.get_action(game_board)
        game_board.move(action)
        end, winner = game_board.check_winner()
        game_board.dbg_print()
        if end:
            print(winner)
            break
        action, prob = net_player.get_action_prob(game_board)
        game_board.move(action)
        end, winner = game_board.check_winner()
        print(action, prob)
        game_board.dbg_print()
        if end:
            print(winner)
            break
def run():
    width, height, n = 6, 6, 4
    board = Board(width=width, height=height, n_in_row=n)
    best_policy = PolicyValueNet(width, height, 'current_policy.model')
    # set a larger n_playout below for better performance
    mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)
    board.init_board(0)
    p1, p2 = board.players
    mcts_player.set_player_ind(p2)
    players = {p1: 'Human Player', p2: mcts_player}
    app_tk = tk.Tk()
    app_tk.resizable(False, False)
    app_tk.geometry('{}x{}+{}+{}'.format(cell_size * width, cell_size * height,
                                         cell_size, cell_size))
    app_tk.title('Human VS AI - Gomoku')
    for x in range(width):
        cells_column = []
        gui_cells.append(cells_column)
        for y in range(height):
            cells_column.append(GuiCell(app_tk, board, players, (x, y)))
    app_tk.mainloop()
def run():
    n = 5
    width, height = 10, 10
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        graphic = Graphic()
        # graphic.run()
        print(1111)
        # thread1 = threading.Thread(target=graphic.run, args=())
        best_policy = PolicyValueNet(width, height,
                                     model_file='./model/' + model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=1200)
        human = Human(graphic)
        # set start_player=0 for human first
        thread2 = threading.Thread(target=game.start_play,
                                   args=(human, mcts_player, graphic, 1, 1))
        # game.start_play(human, mcts_player, graphic, start_player=0, is_shown=1)
        thread2.setDaemon(True)
        thread2.start()
        graphic.run()
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    # params of the board and the game
    # basic params
    self.board_width = 9
    self.board_height = 9
    self.n_in_row = 5
    # init the board and game
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 3e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1e-3  # the temperature param
    # self.n_playout = 400  # num of simulations for each move
    self.n_playout = 400
    self.c_puct = 3
    # a number in (0, inf) that controls how quickly exploration
    # converges to the maximum-value policy. A higher value means
    # relying on the prior more.
    self.buffer_size = 10000
    # self.batch_size = 512  # mini-batch size for training
    self.batch_size = 256
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 400
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6   # board width
    self.board_height = 6  # board height
    self.n_in_row = 4      # win condition: how many stones in a row win the game
    # instantiate a Board with the given width, height, and win condition
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # initialize the network and the search tree; the network is kept for the
    # whole run, while it is less clear at which points the tree gets reset
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_length = 6
    self.n_in_row = 4
    self.num_history = 2
    self.chess = chessboard(self.board_length, self.n_in_row)
    # training params
    self.learn_rate = 5e-4
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temperature = 1.0  # the temperature param
    self.cpuct = 5
    self.buffer_size = 10000
    self.batch_size = 512
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 10
    self.kl_targ = 0.02
    self.check_freq = 50
    self.best_win_ratio = 0.0
    self.game_batch_num = 4000
    self.loss_dict = {}
    self.loss_hold = 50
    self.real_mcts_simulation_times = 400
    self.pure_mcts_simulation_times = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_length,
                                               self.num_history,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_length,
                                               self.num_history)
    # TODO: decide whether self.chess needs to be deep-copied here
    self.mcts_player = real_mcts(self.chess,
                                 self.policy_value_net.policy_value,
                                 self.cpuct,
                                 self.real_mcts_simulation_times,
                                 self.temperature,
                                 self.num_history,
                                 True)
def __init__(self, init_model=None):
    # params of the board and the game
    self.game = Game()
    # training params
    self.config = TrainConfig()
    self.data_buffer = deque(maxlen=self.config.buffer_size)
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.config.c_puct,
                                  n_playout=self.config.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # board settings
    self.board_width = 8
    self.board_height = 8
    # self.n_in_row = 5
    self.board = chessboard(row=self.board_width, col=self.board_height)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 400  # number of simulations per move
    self.c_puct = 5
    self.buffer_size = 10000000
    self.batch_size = 512  # samples per mini-batch
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # number of training iterations per update
    self.kl_targ = 0.02
    self.check_freq = 2
    # number of self-play batches
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # pure Monte Carlo tree search, used as a baseline opponent
    self.pure_mcts_playout_num = 400
    # if a pre-trained model is given
    if init_model:
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model,
                                               use_gpu=True)
    else:
        # train from scratch
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               use_gpu=True)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def test_playout():
    from quoridor import Quoridor
    from policy_value_net import PolicyValueNet
    c_puct = 5
    n_playout = 400
    policy_value_net = PolicyValueNet(model_file=None, use_gpu=True)
    mcts = MCTS(policy_value_net.policy_value_fn,
                c_puct=c_puct, n_playout=n_playout)
    q = Quoridor()
    acts, act_probs = mcts.get_move_probs(q)
    print(acts)
    print(act_probs)
def __init__(self):
    # params of the board and the game
    self.board_width = 9
    self.board_height = 9
    self.board = Board(width=self.board_width, height=self.board_height)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 800  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_loss = None
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    init_model = 'checkpoint/current_policy.model'
    if os.path.isfile(init_model + '.index'):
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(np.flipud(
                    mcts_prob.reshape(self.board_height, self.board_width)), i)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch,
                mcts_probs_batch,
                winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(np.sum(old_probs * (
                np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}"
               ).format(kl,
                        self.lr_multiplier,
                        loss,
                        entropy,
                        explained_var_old,
                        explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i+1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i+1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i+1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model('./current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model('./best_policy.model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
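# The snippet below is a minimal usage sketch, not code taken from any of the
# sources above: it assumes the TrainPipeline class defined directly above and
# simply constructs it and calls run(); the __main__ guard and the commented
# resume path are illustrative assumptions.
if __name__ == '__main__':
    # to resume training, pass a saved model path instead, e.g.
    # training_pipeline = TrainPipeline(init_model='./current_policy.model')
    training_pipeline = TrainPipeline()
    training_pipeline.run()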