def __init__(self, init_model=None):
    """Create the game, the training hyper-parameters and the self-play agent.

    Args:
        init_model: Optional model file handed to ``PolicyValueNet`` as
            ``model_file``. When falsy, a new network is created instead.
    """
    # --- game environment ---
    self.board = CSB_Game()
    self.game = Game(self.board)

    # --- optimisation ---
    self.learn_rate = 0.001
    self.lr_multiplier = 1.0  # adapted from the measured KL divergence
    self.kl_targ = 0.02
    self.epochs = 20  # train steps per update
    self.batch_size = 50  # mini-batch size

    # --- self-play / search ---
    self.temp = 1.0  # temperature parameter
    self.n_playout = 50  # simulations per move
    self.c_puct = 5
    self.play_batch_size = 1

    # --- replay buffer ---
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)

    # --- schedule / evaluation ---
    self.check_freq = 100000000000000000000000
    self.game_batch_num = 200000000
    self.best_win_ratio = 0.0
    # Simulations used by the pure-MCTS opponent that the trained policy
    # is evaluated against.
    self.pure_mcts_playout_num = 1000

    # Resume from a stored network when one is given, else start fresh.
    net_kwargs = {'model_file': init_model} if init_model else {}
    self.policy_value_net = PolicyValueNet(**net_kwargs)

    self.mcts_player = MCTSPlayer(
        self.policy_value_net.policy_value_fn,
        c_puct=self.c_puct,
        n_playout=self.n_playout,
        is_selfplay=1,
    )
def __init__(self, flag_is_shown=False, flag_is_train=True):
    """Build the game, the network and the MCTS player.

    Args:
        flag_is_shown: Stored on the instance and forwarded to ``Game``.
        flag_is_train: Stored on the instance and forwarded to ``Game``.
            When false, previously saved parameters are loaded from
            ``./paras/policy.model``.
    """
    self.flag_is_shown = flag_is_shown
    self.flag_is_train = flag_is_train
    self.game = Game(flag_is_shown, flag_is_train)

    # The network's input shape is derived from the game's board size.
    input_shape = (4, self.game.board_width, self.game.board_height)
    self.NN = PolicyValueNet(input_shape)

    if not flag_is_train:
        # Outside training the stored parameters are used as-is.
        self.NN.load_model("./paras/policy.model")

    self.mcts_player = MCTSPlayer(self.NN.propagation)
def __init__(self, init_model=None):
    """Set up the board, training hyper-parameters, network and self-play agent.

    Args:
        init_model: Optional model file to resume from. When given, the
            network is loaded from it and ``best_win_ratio`` starts at 0.6;
            otherwise a new network is created.
    """
    # Leave one core free, but never drop below one worker: on a single-core
    # host ``cpu_count() - 1`` would be 0, which is not a usable pool size.
    self.n_workers = max(1, multiprocessing.cpu_count() - 1)
    self.worker_pool = None

    # params of the board and the game
    self.board_width = 15
    self.board_height = 15
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)

    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 36000
    self.batch_size = 1024  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.pre_data_size = 5
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 30  # value used with predefined data (otherwise 50)
    self.game_batch_num = 300
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    self.batch_i = 0

    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
        self.best_win_ratio = 0.6
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)

    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    """Host games between a remote ``Client`` and the MCTS AI until interrupted.

    Each round starts a game with the client moving first, reports the
    result through ``has_winner`` and then waits until the module-level
    ``BlockingThread`` flag becomes false (presumably cleared elsewhere in
    the module -- confirm) before starting the next game. ``winner`` and
    ``game`` are published as module globals.
    """
    global winner, game, BlockingThread

    board_size = 9
    win_length = 5
    weights_path = './current_model_9_9_5.h5'

    try:
        game = Game(Board(width=board_size, height=board_size,
                          n_in_row=win_length))

        # AI opponent backed by the stored policy-value network.
        policy_net = PolicyValueNet(board_size, board_size,
                                    model_file=weights_path)
        ai_player = MCTSPlayer(policy_net.policy_value_fn,
                               c_puct=5,
                               n_playout=400)

        while True:
            BlockingThread = True
            print("new game starts")

            # start_player=0 lets the human (client) move first
            winner = game.start_play(Client(),
                                     ai_player,
                                     start_player=0,
                                     is_shown=1,
                                     send_step=send_step)
            has_winner(winner)
            eventlet.sleep(1)
            print("game end")

            # Block here until the flag is released.
            while BlockingThread:
                eventlet.sleep(2)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model = None, last_iteration = None):
    """Configure a 9x9 five-in-a-row training run.

    Args:
        init_model: Optional model file to resume from; falsy means a new
            network is created.
        last_iteration: Stored as ``self.last_iteration`` when resuming
            from ``init_model``; ignored (0 is stored) otherwise.
    """
    # --- board and game ---
    self.board_width = 9
    self.board_height = 9
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)

    # --- optimisation ---
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adapted from the measured KL divergence
    self.kl_targ = 0.02
    self.epochs = 5  # train steps per update
    self.batch_size = 512  # mini-batch size

    # --- self-play / search ---
    self.temp = 1.0  # temperature parameter
    self.n_playout = 400  # simulations per move
    self.c_puct = 5
    self.play_batch_size = 1

    # --- replay buffer ---
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)

    # --- schedule / evaluation ---
    self.check_freq = 200
    self.game_batch_num = 1500
    self.best_win_ratio = 0.95
    # Simulations used by the pure-MCTS opponent that the trained policy
    # is evaluated against.
    self.pure_mcts_playout_num = 3500

    # Resume from a stored network when one is given, else start fresh.
    net_kwargs = {'model_file': init_model} if init_model else {}
    self.policy_value_net = PolicyValueNet(self.board_width,
                                           self.board_height,
                                           **net_kwargs)
    self.last_iteration = last_iteration if init_model else 0

    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
class TrainPipeline():
    """Episode loop that either trains the network by self-play or just plays."""

    save_ParaFreq = 200  # save the parameters every this many episodes
    MAX_EPISODES = 2000  # number of episodes run by train()

    def __init__(self, flag_is_shown=False, flag_is_train=True):
        """Build the game, the network and the MCTS player.

        Args:
            flag_is_shown: Stored on the instance and forwarded to ``Game``.
            flag_is_train: Stored on the instance and forwarded to ``Game``.
                When false, saved parameters are loaded from
                ``./paras/policy.model``.
        """
        self.flag_is_shown = flag_is_shown
        self.flag_is_train = flag_is_train
        self.game = Game(flag_is_shown, flag_is_train)

        input_shape = (4, self.game.board_width, self.game.board_height)
        self.NN = PolicyValueNet(input_shape)
        if not flag_is_train:
            self.NN.load_model("./paras/policy.model")

        self.mcts_player = MCTSPlayer(self.NN.propagation)

    def train(self):
        """Run ``MAX_EPISODES`` episodes of self-play training or plain play.

        In training mode each episode stores the self-play data in the
        network's buffer, updates the policy once the buffer holds more
        than one batch, and saves the parameters every ``save_ParaFreq``
        episodes. Outside training mode each episode just plays a game.
        """
        for episode in range(self.MAX_EPISODES):
            if not self.flag_is_train:
                self.game.start_play(self.mcts_player)
                continue

            _winner, play_data = self.game.start_self_play(self.mcts_player)
            self.NN.memory(play_data)

            buffered = len(self.NN.data_buffer)
            if buffered > self.NN.batch_size:
                self.NN.policy_update()
            else:
                # Still filling the buffer: report progress on the same
                # line as the episode number printed below.
                print("Collecting data: %d%%, "
                      % (buffered / self.NN.batch_size * 100), end="")

            # Periodically persist the model parameters.
            if (episode + 1) % self.save_ParaFreq == 0:
                self.NN.save_model('./paras/policy.model')
            print("episode = %d" % episode)
def single_game_play(num,initmode):
    """Play one self-play game in a worker and return its result.

    Args:
        num: Worker identifier; only used in the start-up message.
        initmode: Model file loaded into the network; when falsy a new
            network is created.

    Returns:
        The ``(winner, play_data)`` pair produced by ``start_self_play``.
    """
    print('Starting worker {} '.format(num))

    game = Game(Board(width=board_width,
                      height=board_height,
                      n_in_row=n_in_row))

    net_kwargs = {'model_file': initmode} if initmode else {}
    policy_value_net = PolicyValueNet(board_width, board_height, **net_kwargs)

    self_play_agent = MCTSPlayer(policy_value_net.policy_value_fn,
                                 c_puct=c_puct,
                                 n_playout=n_playout,
                                 is_selfplay=1)

    winner, play_data = game.start_self_play(self_play_agent, temp=temp)
    # play_data is deliberately not measured or logged here: it comes from
    # zip() and is an iterator, so len(list(play_data)) would consume it
    # before the caller could use it.
    return winner, play_data
def __init__(self, init_model='./current_policy.hdf5'):
    """Configure an 8x8 five-in-a-row training run.

    Args:
        init_model: Model file to resume from (defaults to
            ``./current_policy.hdf5``); pass a falsy value to train a new
            network from scratch.
    """
    # --- board parameters ---
    self.board_width = 8
    self.board_height = 8
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)

    # --- training parameters ---
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.kl_targ = 0.02
    self.epochs = 5  # training steps per update
    self.batch_size = 512

    # --- self-play / search ---
    self.temp = 1.0  # temperature parameter
    self.n_playout = 400  # simulations per move
    self.c_puct = 5
    self.play_batch_size = 1

    # --- replay buffer ---
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)

    # --- schedule / evaluation ---
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # Playouts of the pure MCTS used to evaluate the policy.
    self.pure_mcts_playout_num = 2000

    # Continue from an existing network when given, otherwise start fresh.
    net_kwargs = {'model_file': init_model} if init_model else {}
    self.policy_value_net = PolicyValueNet(self.board_width,
                                           self.board_height,
                                           **net_kwargs)

    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def get_action(self, board):
    """Return the AI's move for ``board``.

    The stored 9x9 model is loaded into an MCTS player which is asked for a
    move. A move is accepted only if it is not ``-1`` and is contained in
    ``board.availables``; otherwise the whole attempt is repeated a bounded
    number of times.

    Previously ``move`` was never assigned when the model loaded
    successfully (UnboundLocalError), and a persistent failure retried by
    unbounded recursion, reloading the model on every level.

    Args:
        board: Current board; must expose ``availables``.

    Returns:
        A move contained in ``board.availables``.

    Raises:
        RuntimeError: If no valid move was produced after all attempts.
    """
    max_attempts = 3
    move = -1
    for _ in range(max_attempts):
        print("AI's turn")
        try:
            model_file = './best_model_9_9_5.h5'
            best_policy = PolicyValueNet(9, 9, model_file=model_file)
            mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                     c_puct=5,
                                     n_playout=400)
            # NOTE(review): assumes MCTSPlayer exposes get_action(board)
            # like the other players handed to Game.start_play -- confirm.
            move = mcts_player.get_action(board)
        except Exception as e:
            # Best effort, as before: report the problem and retry.
            print(e)
            move = -1

        if move != -1 and move in board.availables:
            return move
        print(f"invalid move: {move}")

    raise RuntimeError(
        f"AI failed to produce a valid move after {max_attempts} attempts")
def run(self):
    """Play one game between ``self.client`` and the Braccio robot player.

    The stored 9x9 model drives a ``BraccioPlayer``; the robot side is
    initialised with ``self.testMode`` and ``self.who_first`` selects the
    starting player. When ``self.parent`` is set, it is told the winner via
    ``end_game``. OpenCV windows are always closed on the way out, whether
    the game ends normally, is interrupted, or fails.
    """
    n = 5
    width, height = 9, 9
    model_file = './best_model_9_9_5.h5'
    print('Game start.')
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = BraccioPlayer(best_policy.policy_value_fn,
                                    c_puct=5,
                                    n_playout=400)

        # Imported here as in the original code, right before the robot
        # is initialised.
        from braccio_player import init
        init(self.testMode)

        # the human side is the connected client
        human = self.client

        # start_player=0 means the human moves first
        winner = game.start_play(human,
                                 mcts_player,
                                 start_player=self.who_first,
                                 is_shown=1)
        print(f'[Play with Robot] winner: {winner}')
        if self.parent is not None:
            self.parent.end_game(winner)
    except KeyboardInterrupt:
        print('\n\rquit')
    finally:
        # Release the windows even if the game raised.
        cv2.destroyAllWindows()
def run():
    """Let two MCTS players sharing one stored 9x9 model play each other.

    The model file name is derived from the board size, the row length and
    the training iteration. The game is shown while it is played;
    Ctrl-C ends it quietly.
    """
    row_length = 5
    board_w, board_h = 9, 9
    iteration = 1000
    model_file = './model/current_policy_{}_{}_{}_iteration{}.model'.format(
        board_h, board_w, row_length, iteration)

    try:
        game = Game(Board(width=board_w, height=board_h, n_in_row=row_length))

        # Both AI players are driven by the same trained network.
        best_policy = PolicyValueNet(board_w, board_h, model_file=model_file)
        first_ai = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5,
                              n_playout=400)

        # A human player is still created although this AI-vs-AI setup
        # never hands it to the game.
        human = Human()

        second_ai = MCTSPlayer(best_policy.policy_value_fn,
                               c_puct=5,
                               n_playout=400)
        game.start_play(second_ai, first_ai, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    """Play a human-vs-AI game on a 15x15 board with the model given in argv.

    Exactly one command-line argument is expected: the model file for the
    policy-value network. On a usage error the arguments and a hint are
    printed and the process exits with status 1 (previously 0, which
    reported success).

    The former ``pickle.load(open(model_file, 'rb'))`` step was removed.
    Its result was unused, it leaked file handles, it used a bare
    ``except``, and it could crash on model files that are not plain
    pickles even though ``PolicyValueNet`` had already loaded them.
    """
    if len(sys.argv) != 2:
        print(sys.argv)
        print("Need to provide one argument, the model which to play with")
        sys.exit(1)

    n = 5
    width, height = 15, 15
    model_file = sys.argv[1]
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    """Play a human-vs-AI game on a 9x9 board with a stored model.

    The AI is an MCTS player guided by the policy-value network loaded from
    ``./n400-o/current_model_9_9_5_o_50.h5``. Ctrl-C ends the game quietly.
    """
    row_length = 5
    board_w, board_h = 9, 9
    weights_path = './n400-o/current_model_9_9_5_o_50.h5'

    try:
        game = Game(Board(width=board_w, height=board_h, n_in_row=row_length))

        # AI side: MCTS guided by the trained policy-value network.
        policy_net = PolicyValueNet(board_w, board_h, model_file=weights_path)
        ai_player = MCTSPlayer(policy_net.policy_value_fn,
                               c_puct=5,
                               n_playout=400)

        # Human side: moves are typed in the format "2,3".
        human_player = Human()

        # start_player=0 would let the human move first
        game.start_play(human_player, ai_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    """Run the pygame front end with two AI players sharing one model.

    The loop advances the game, refreshes the display and handles events:
    closing the window quits pygame and exits, and a mouse click is
    forwarded to the game's click and button handlers.
    """
    row_length = 5
    board_w, board_h = 9, 9
    iteration = 1000
    model_file = './model/current_policy_{}_{}_{}_iteration{}.model'.format(
        board_h, board_w, row_length, iteration)

    try:
        board = Board(width=board_w, height=board_h, n_in_row=row_length)
        best_policy = PolicyValueNet(board_w, board_h, model_file=model_file)

        AI_player1 = MCTSPlayer(best_policy.policy_value_fn,
                                c_puct=5,
                                n_playout=400)
        AI_player2 = MCTSPlayer(best_policy.policy_value_fn,
                                c_puct=5,
                                n_playout=400)
        # A human player is still created although it is not passed on.
        human = Human()

        game = Game("AlphaZero Gomoku", board, AI_player1, AI_player2)

        while True:
            game.play()
            pygame.display.update()

            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    exit()
                elif event.type == pygame.MOUSEBUTTONDOWN:
                    click_x, click_y = pygame.mouse.get_pos()
                    game.mouseClick(click_x, click_y)
                    game.check_buttons(click_x, click_y)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self):
    """Initial training configuration for the 9x9 five-in-a-row model.

    Note: the KL divergence is used to adapt the learning rate.
    """
    # --- run() ---
    self.game_batch_num = -1  # training iterations; negative = unlimited
    self.play_batch_size = 1  # self-play games per iteration
    # Samples per training batch; the policy is updated once data_buffer
    # holds more than this many samples.
    self.batch_size = 1024
    self.check_freq = 50  # play against pure MCTS every check_freq batches
    self.save_freq = 50  # save every save_freq batches

    # --- collect_selfplay_data() ---
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.kl_targ = 0.02

    # --- policy_update() ---
    self.epochs = 5  # epochs per update

    # --- board ---
    self.board_width = 9
    self.board_height = 9
    self.n_in_row = 5  # stones in a row needed to win
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)

    # --- keras ---
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # learning rate is adapted based on KL
    self.temp = 1.0  # temperature; too small makes training less thorough

    file_folder = './n400-o'
    model_tag = '9_9_5_o'
    self.current_model= f'{file_folder}/current_model_{model_tag}.h5'
    self.best_model= f'{file_folder}/best_model_{model_tag}.h5'

    # Resume from the current model only if its file already exists.
    init_model = self.current_model
    resume_from = init_model if os.path.exists(init_model) else None
    self.policy_value_net = PolicyValueNet(self.board_width,
                                           self.board_height,
                                           model_file=resume_from)

    self.progress = file_folder + '/progress.csv'
    self.evaluate_path = file_folder + '/evaluate.csv'
    self.history_path = file_folder + '/history.csv'
    self.history = []

    # --- MCTS ---
    self.c_puct = 5  # exploration preference of the search
    # When saving, a loss below this value increases the n_playout used
    # for training.
    self.loss_goal = 0
    # Simulations per move of the pure-MCTS opponent; raised as the model
    # gets stronger, in steps of pure_mcts_playout_num_upgrade.
    self.pure_mcts_playout_num = 1000
    self.pure_mcts_playout_num_upgrade = 1000
    self.best_win_ratio = 0.0
    # Simulations per move of the network-guided player: larger values rely
    # more on the search, smaller ones on the network's judgement.
    self.n_playout = 400
    self.n_playout_training = 400
    self.n_playout_growth = 0
    self.n_playout_limit = 2000
    self.MCTS_levelup()
logging.info("New best policy!!!!!!!!") best_win_ratio = win_ratio # update the best_policy policy_value_net.save_model('./best_policy.model') if (best_win_ratio == 1.0 and pure_mcts_playout_num < 5000): pure_mcts_playout_num += 1000 best_win_ratio = 0.0 except KeyboardInterrupt: print('\n\rquit') if __name__ == '__main__': if os.path.exists('./current_policy.model'): initmode = './current_policy.model' policy_value_net = PolicyValueNet(board_width, board_height, model_file=initmode) logging.info('use existing model file') win_ratio = 0.6 else: initmode = None policy_value_net = PolicyValueNet(board_width,board_height) win_ratio = 0.1 do_run() # todo # load trained model to continue # save record of auto-play(at least when vs pure mcts ) ->sgf format # simple gui to load record and show -> parse sgf
def __init__(self):
    """Initial training configuration for the 13x13 five-in-a-row model.

    Note: the KL divergence is used to adapt the learning rate.
    """
    # --- run() ---
    self.game_batch_num = -1  # training iterations; negative = unlimited
    self.play_batch_size = 1  # self-play games per iteration
    # Samples per training batch; the policy is updated once data_buffer
    # holds more than this many samples.
    self.batch_size = 4096
    self.check_freq = 500  # play against pure MCTS every check_freq batches
    self.save_freq = 50  # save every save_freq batches

    # --- collect_selfplay_data() ---
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.kl_targ = 0.02

    # --- policy_update() ---
    self.epochs = 20  # training steps attempted before each lr adjustment

    # --- board ---
    self.board_width = 13
    self.board_height = 13
    self.n_in_row = 5  # stones in a row needed to win
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)

    # --- keras ---
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # learning rate is adapted based on KL
    self.temp = 2.0  # temperature; too small makes training less thorough

    file_folder = './n400'
    model_tag = '13_13_5'
    self.current_model = f'{file_folder}/current_model_{model_tag}.h5'
    self.best_model = f'{file_folder}/best_model_{model_tag}.h5'

    # Resume from the current model only if its file already exists.
    init_model = self.current_model
    resume_from = init_model if os.path.exists(init_model) else None
    self.policy_value_net = PolicyValueNet(self.board_width,
                                           self.board_height,
                                           model_file=resume_from)

    self.progress = file_folder + '/progress.csv'
    self.evaluate_path = file_folder + '/evaluate.csv'
    self.history_path = file_folder + '/history.csv'
    self.history = []

    # --- MCTS ---
    self.c_puct = 5  # exploration preference of the search
    # Simulations per move of the network-guided player: larger values rely
    # more on the search, smaller ones on the network's judgement.
    self.n_playout = 400
    # The model is compared against pure MCTS only once the loss is below
    # this value, to save training time.
    self.loss_goal = 4.0
    # Simulations per move of the pure-MCTS opponent; raised as the model
    # gets stronger, in steps of pure_mcts_playout_num_upgrade.
    self.pure_mcts_playout_num = 1000
    self.pure_mcts_playout_num_upgrade = 500
    self.best_win_ratio = 0.0

    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)

    # Once the loss falls below the next gate, the poor data generated by
    # older models is dropped and training continues on new data.
    self.flush_gate = [5.5, 5.0, 4.4, 4.0, 3.6, 3.2, 2.8, 2.6, 2.4, 2.2]
    self.flushTimes = 0
class TrainPipeline:
    """Self-play training loop for a 13x13 five-in-a-row policy-value network.

    Progress, per-update history and evaluation results are written to CSV
    files next to the model so an interrupted run can be resumed.
    """

    def __init__(self):
        """Initial training configuration.

        Note: the KL divergence is used to adapt the learning rate.
        """
        # --- run() ---
        self.game_batch_num = -1  # training iterations; negative = unlimited
        self.play_batch_size = 1  # self-play games per iteration
        # Samples per training batch; the policy is updated once data_buffer
        # holds more than this many samples.
        self.batch_size = 4096
        self.check_freq = 500  # play against pure MCTS every check_freq batches
        self.save_freq = 50  # save every save_freq batches

        # --- collect_selfplay_data() ---
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.kl_targ = 0.02

        # --- policy_update() ---
        self.epochs = 20  # training steps attempted before each lr adjustment

        # --- board ---
        self.board_width = 13
        self.board_height = 13
        self.n_in_row = 5  # stones in a row needed to win
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # --- keras ---
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # learning rate is adapted based on KL
        self.temp = 2.0  # temperature; too small makes training less thorough
        file_folder = './n400'
        model_tag = '13_13_5'
        self.current_model = f'{file_folder}/current_model_{model_tag}.h5'
        self.best_model = f'{file_folder}/best_model_{model_tag}.h5'
        # Resume from the current model only if its file already exists.
        init_model = self.current_model
        self.policy_value_net = PolicyValueNet(
            self.board_width,
            self.board_height,
            model_file=init_model if os.path.exists(init_model) else None)

        self.progress = file_folder + '/progress.csv'
        self.evaluate_path = file_folder + '/evaluate.csv'
        self.history_path = file_folder + '/history.csv'
        self.history = []

        # --- MCTS ---
        self.c_puct = 5  # exploration preference of the search
        # Simulations per move of the network-guided player: larger values
        # rely more on the search, smaller ones on the network's judgement.
        self.n_playout = 400
        # The model is compared against pure MCTS only once the loss is
        # below this value, to save training time.
        self.loss_goal = 4.0
        # Simulations per move of the pure-MCTS opponent; raised as the
        # model gets stronger, in steps of pure_mcts_playout_num_upgrade.
        self.pure_mcts_playout_num = 1000
        self.pure_mcts_playout_num_upgrade = 500
        self.best_win_ratio = 0.0
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
        # Once the loss falls below the next gate, the poor data generated
        # by older models is dropped and training continues on new data.
        self.flush_gate = [5.5, 5.0, 4.4, 4.0, 3.6, 3.2, 2.8, 2.6, 2.4, 2.2]
        self.flushTimes = 0

    def run(self):
        """Run (or resume) the training loop until ``game_batch_num`` or Ctrl-C.

        If both the progress and history files exist, the counters are
        restored from the progress file; otherwise both files are
        (re)created. An iteration in which no policy update happened,
        because the buffer is not yet larger than one batch, is not counted.
        """
        try:
            reset = False
            if os.path.exists(self.progress) and os.path.exists(
                    self.history_path) and not reset:
                # Default in case the progress file contains no data row.
                self.i = 0
                with open(self.progress, 'r', newline='') as f:
                    for row in csv.DictReader(f):
                        self.i = int(row['i'])
                        self.pure_mcts_playout_num = int(
                            row['pure_mcts_playout_num'])
                        self.best_win_ratio = float(row['best_win_ratio'])
                        self.flushTimes = int(row['flushTimes'])
                print(
                    f'continue training: i = {self.i}, pure_mcts_playout_num = {self.pure_mcts_playout_num}, best_win_ratio = {self.best_win_ratio}, flushTimes = {self.flushTimes}'
                )
            else:
                self.i = 0
                self.save_progress()
                with open(self.history_path, 'w', newline='') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow([
                        'i', 'kl', 'lr_multiplier', 'loss', 'entropy',
                        'explained_var_old', 'explained_var_new'
                    ])

            while self.i != self.game_batch_num:
                self.i += 1
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    self.i, self.episode_len))

                if len(self.data_buffer) <= self.batch_size:
                    # Training has not started yet; this round does not count.
                    self.i -= 1
                    continue

                # Enough data: update the policy and record the loss.
                self.loss, entropy = self.policy_update()

                if self.i % self.save_freq == 0:
                    self._save_checkpoint()

                # Evaluate the current model and keep the best parameters.
                if (self.i % self.check_freq == 0
                        and self.loss < self.loss_goal):
                    print("current self-play batch: {}".format(self.i))
                    win_ratio = self.policy_evaluate()
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(self.best_model)
                        if (self.best_win_ratio == 1.0
                                and self.pure_mcts_playout_num < 5000):
                            # Beat the opponent every time: make it stronger
                            # and start counting again.
                            self.pure_mcts_playout_num += (
                                self.pure_mcts_playout_num_upgrade)
                            self.best_win_ratio = 0.0
                    self._save_checkpoint()

                # Drop stale data once the loss passes the next gate.
                if self.flushTimes < len(self.flush_gate):
                    if self.loss < self.flush_gate[self.flushTimes]:
                        print(
                            f'loss {self.loss} < flush gate {self.flush_gate[self.flushTimes]}, clear old data'
                        )
                        self.data_buffer.clear()
                        self.flushTimes += 1
        except KeyboardInterrupt:
            print('\n\rquit')

    def _save_checkpoint(self):
        """Save the current model, flush pending history rows and save progress."""
        self.policy_value_net.save_model(self.current_model)
        with open(self.history_path, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(self.history)
        self.history = []
        self.save_progress()

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and add the augmented data to the buffer.

        ``self.episode_len`` is set to the mean number of recorded steps
        per game.
        """
        self.episode_len = []
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]  # materialise the game record
            self.episode_len.append(len(play_data))
            # augment the data by rotation / mirroring
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)  # appended on the right
        self.episode_len = np.array(self.episode_len).mean()

    def get_equi_data(self, play_data):
        """Augment the data set by rotation and flipping.

        Args:
            play_data: ``[(state, mcts_prob, winner_z), ...]`` where
                ``mcts_prob`` can be reshaped to
                ``(board_height, board_width)``.

        Returns:
            A list with eight entries per input sample: the four rotations,
            each followed by its horizontal mirror.
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def policy_update(self):
        """Train on one sampled mini-batch and return ``(loss, entropy)``.

        Up to ``self.epochs`` train steps are run on the same mini-batch,
        stopping early when the KL divergence between the old and new
        policy exceeds four times ``kl_targ``. Afterwards the learning-rate
        multiplier is adapted from the KL value and a history row is
        recorded.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]

        # Predictions before the update, used as the KL reference.
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break

        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        self.history.append([
            self.i, kl, self.lr_multiplier, loss, entropy, explained_var_old,
            explained_var_new
        ])
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{:.8f},"
               "entropy:{:.5f},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Evaluate the trained policy by playing against a pure MCTS player.

        Note: this is only for monitoring the progress of training.

        Args:
            n_games: Number of evaluation games; the starting player
                alternates between games.

        Returns:
            The win ratio, counting a tie as half a win.
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games

        summary = "num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1])
        print(summary)
        send_msg(summary)

        if not os.path.exists(self.evaluate_path):
            with open(self.evaluate_path, 'w') as f:
                # The header needs its own line; without the newline the
                # first result row was appended onto the header line.
                f.write('i, num_playouts, win, lose, tie\n')
        with open(self.evaluate_path, 'a') as f:
            f.write(
                f'{self.i}, {self.pure_mcts_playout_num}, {win_cnt[1]}, {win_cnt[2]}, {win_cnt[-1]}\n'
            )
        return win_ratio

    def save_progress(self):
        """Overwrite the progress file with the current counters."""
        with open(self.progress, 'w', newline='') as f:
            table = [[
                'i', 'pure_mcts_playout_num', 'best_win_ratio', 'flushTimes'
            ],
                     [
                         self.i, self.pure_mcts_playout_num,
                         self.best_win_ratio, self.flushTimes
                     ]]
            writer = csv.writer(f)
            writer.writerows(table)
class TrainPipeline():
    """Self-play training pipeline for an 8x8 five-in-a-row policy-value net."""

    def __init__(self, init_model='./current_policy.hdf5'):
        """Configure the board, the hyper-parameters and the self-play agent.

        Args:
            init_model: Model file to resume from (defaults to
                ``./current_policy.hdf5``); pass a falsy value to train a
                new network from scratch.
        """
        # --- board parameters ---
        self.board_width = 8
        self.board_height = 8
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # --- training parameters ---
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.kl_targ = 0.02
        self.epochs = 5  # training steps per update
        self.batch_size = 512

        # --- self-play / search ---
        self.temp = 1.0  # temperature parameter
        self.n_playout = 400  # simulations per move
        self.c_puct = 5
        self.play_batch_size = 1

        # --- replay buffer ---
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)

        # --- schedule / evaluation ---
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # Playouts of the pure MCTS used to evaluate the policy.
        self.pure_mcts_playout_num = 2000

        # Continue from an existing network when given, else start fresh.
        net_kwargs = {'model_file': init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **net_kwargs)

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """Enlarge the data set with rotated and mirrored copies.

        Args:
            play_data: Iterable of ``(state, mcts_prob, winner)`` samples
                where ``mcts_prob`` can be reshaped to
                ``(board_height, board_width)``.

        Returns:
            A list with eight entries per sample: each of the four
            rotations followed by its horizontal mirror.
        """
        augmented = []
        for planes, move_probs, outcome in play_data:
            # The probability grid is stored upside down relative to the
            # state planes, hence the flipud before and after transforming.
            prob_grid = np.flipud(
                move_probs.reshape(self.board_height, self.board_width))
            for quarter_turns in (1, 2, 3, 4):
                rot_planes = np.array(
                    [np.rot90(plane, quarter_turns) for plane in planes])
                rot_probs = np.rot90(prob_grid, quarter_turns)
                augmented.append(
                    (rot_planes, np.flipud(rot_probs).flatten(), outcome))

                mirrored_planes = np.array(
                    [np.fliplr(plane) for plane in rot_planes])
                mirrored_probs = np.fliplr(rot_probs)
                augmented.append(
                    (mirrored_planes,
                     np.flipud(mirrored_probs).flatten(),
                     outcome))
        return augmented

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and buffer the augmented data.

        ``self.episode_len`` holds the length of the most recent game.
        """
        for _ in range(n_games):
            _winner, record = self.game.start_self_play(self.mcts_player,
                                                        temp=self.temp)
            record = list(record)[:]
            self.episode_len = len(record)
            # data augmentation
            self.data_buffer.extend(self.get_equi_data(record))

    def policy_update(self):
        """Update the policy-value network on one mini-batch.

        Returns:
            ``(loss, entropy)`` of the last train step that was run.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [sample[0] for sample in mini_batch]
        mcts_probs_batch = [sample[1] for sample in mini_batch]
        winner_batch = [sample[2] for sample in mini_batch]

        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        step_lr = None
        for _ in range(self.epochs):
            step_lr = self.learn_rate*self.lr_multiplier
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch, step_lr)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            log_ratio = np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)
            kl = np.mean(np.sum(old_probs * log_ratio, axis=1))
            if kl > self.kl_targ * 4:
                # stop early when the KL divergence has grown too large
                break

        # adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        report = ("kl:{:.5f},"
                  "lr_multiplier:{:.3f},"
                  "loss:{},"
                  "entropy:{},"
                  "explained_var_old:{:.3f},"
                  "explained_var_new:{:.3f}"
                  ).format(kl,
                           self.lr_multiplier,
                           loss,
                           entropy,
                           explained_var_old,
                           explained_var_new)
        print(report)
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Evaluate the trained policy by playing against pure MCTS.

        Args:
            n_games: Number of games; the starting player alternates.

        Returns:
            The win ratio, counting a tie as half a win.
        """
        trained_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                    c_puct=self.c_puct,
                                    n_playout=self.n_playout)
        baseline_player = MCTS_Pure(c_puct=5,
                                    n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for game_no in range(n_games):
            result = self.game.start_play(trained_player,
                                          baseline_player,
                                          start_player=game_no % 2,
                                          is_shown=0)
            win_cnt[result] += 1
        win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
                self.pure_mcts_playout_num,
                win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """Run ``game_batch_num`` rounds of self-play, training and evaluation.

        Every ``check_freq`` rounds the current model is evaluated and
        saved; a new best win ratio also saves the best model, and a
        perfect score makes the pure-MCTS opponent stronger (up to 5000
        playouts) and resets the best ratio.
        """
        for batch_no in range(1, self.game_batch_num + 1):
            self.collect_selfplay_data(self.play_batch_size)
            print("batch i:{}, episode_len:{}".format(
                    batch_no, self.episode_len))
            if len(self.data_buffer) > self.batch_size:
                loss, entropy = self.policy_update()

            # check the current model and save its parameters
            if batch_no % self.check_freq != 0:
                continue
            print("current self-play batch: {}".format(batch_no))
            win_ratio = self.policy_evaluate()
            self.policy_value_net.save_model('./current_policy.hdf5')
            if win_ratio <= self.best_win_ratio:
                continue
            print("New best policy!!!!!!!!")
            self.best_win_ratio = win_ratio
            # update the best policy
            self.policy_value_net.save_model('./best_policy.hdf5')
            if (self.best_win_ratio == 1.0 and
                    self.pure_mcts_playout_num < 5000):
                self.pure_mcts_playout_num += 1000
                self.best_win_ratio = 0.0
class TrainPipeline:
    """Self-play training loop for a policy-value network on an 8x8 board.

    Each iteration plays self-play games with an MCTS player driven by
    the network, stores the (augmented) positions in a bounded replay
    buffer, trains the network on random mini-batches and periodically
    evaluates it against a pure-MCTS opponent.
    """

    def __init__(self, init_model=None):
        """Create the board, the hyper-parameters, the net and the player.

        init_model: optional model file handed to ``PolicyValueNet`` as
            ``model_file``; when falsy a new network is created instead.
        """
        # board / game geometry
        self.board_width = 8
        self.board_height = 8
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # optimisation settings
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # rescaled after every update from the KL
        self.kl_targ = 0.02
        self.epochs = 5  # train steps per policy update
        self.batch_size = 512  # mini-batch size

        # self-play / search settings
        self.temp = 1.0  # temperature
        self.n_playout = 400  # simulations per move
        self.c_puct = 5
        self.play_batch_size = 1

        # replay buffer
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)

        # schedule and evaluation
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # playouts of the pure-MCTS opponent used for evaluation
        self.pure_mcts_playout_num = 1000

        # resume from a saved model when one is supplied
        net_options = {"model_file": init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **net_options)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """Expand play data with the rotations and mirror images of each item.

        play_data: iterable of ``(state, mcts_prob, winner_z)`` tuples.
        Returns a list with eight entries per input tuple: the four
        quarter-turn rotations, each followed by its left-right mirror.
        """
        augmented = []
        for planes, move_probs, outcome in play_data:
            # Same for every rotation: the probabilities as a
            # vertically flipped (height, width) grid.
            prob_grid = np.flipud(
                move_probs.reshape(self.board_height, self.board_width))
            for quarter_turns in range(1, 5):
                # counter-clockwise rotation
                turned_planes = np.array(
                    [np.rot90(plane, quarter_turns) for plane in planes])
                turned_grid = np.rot90(prob_grid, quarter_turns)
                augmented.append(
                    (turned_planes, np.flipud(turned_grid).flatten(), outcome))

                # left-right mirror of the rotated position
                mirrored_planes = np.array(
                    [np.fliplr(plane) for plane in turned_planes])
                mirrored_grid = np.fliplr(turned_grid)
                augmented.append(
                    (mirrored_planes,
                     np.flipud(mirrored_grid).flatten(),
                     outcome))
        return augmented

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and push them into the buffer.

        ``self.episode_len`` is set to the number of positions of the
        last game played (before augmentation).
        """
        for _ in range(n_games):
            _winner, trajectory = self.game.start_self_play(
                self.mcts_player, temp=self.temp)
            trajectory = list(trajectory)
            self.episode_len = len(trajectory)
            self.data_buffer.extend(self.get_equi_data(trajectory))

    def policy_update(self):
        """Train the net on one random mini-batch; return (loss, entropy).

        Runs up to ``self.epochs`` train steps, stopping early when the
        KL divergence from the pre-update policy exceeds four times
        ``self.kl_targ``, then rescales ``self.lr_multiplier``.
        """
        net = self.policy_value_net
        samples = random.sample(self.data_buffer, self.batch_size)
        state_batch = [sample[0] for sample in samples]
        mcts_probs_batch = [sample[1] for sample in samples]
        winner_batch = [sample[2] for sample in samples]

        old_probs, old_v = net.policy_value(state_batch)
        for _ in range(self.epochs):
            loss, entropy = net.train_step(
                state_batch,
                mcts_probs_batch,
                winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = net.policy_value(state_batch)
            log_ratio = np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)
            kl = np.mean(np.sum(old_probs * log_ratio, axis=1))
            if kl > self.kl_targ * 4:
                # the policy moved too far from the old one: stop early
                break

        # adapt the learning-rate multiplier to the observed KL
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        # explained variance of the value head before / after the update
        targets = np.array(winner_batch)
        target_var = np.var(targets)
        explained_var_old = 1 - np.var(targets - old_v.flatten()) / target_var
        explained_var_new = 1 - np.var(targets - new_v.flatten()) / target_var

        report = ("kl:{:.5f},"
                  "lr_multiplier:{:.3f},"
                  "loss:{},"
                  "entropy:{},"
                  "explained_var_old:{:.3f},"
                  "explained_var_new:{:.3f}")
        print(report.format(kl,
                            self.lr_multiplier,
                            loss,
                            entropy,
                            explained_var_old,
                            explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Play ``n_games`` against a pure-MCTS player; return the win ratio.

        A tie counts as half a win. Only used to monitor training.
        """
        trained_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                    c_puct=self.c_puct,
                                    n_playout=self.n_playout)
        baseline_player = MCTS_Pure(c_puct=5,
                                    n_playout=self.pure_mcts_playout_num)
        tally = defaultdict(int)
        for game_idx in range(n_games):
            # alternate the starting player from game to game
            outcome = self.game.start_play(trained_player,
                                           baseline_player,
                                           start_player=game_idx % 2,
                                           is_shown=0)
            tally[outcome] += 1

        wins, ties = tally[1], tally[-1]
        win_ratio = 1.0 * (wins + 0.5 * ties) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, wins, tally[2], ties))
        return win_ratio

    def run(self):
        """Run the whole training schedule; Ctrl-C stops it quietly."""
        try:
            for batch_no in range(1, self.game_batch_num + 1):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    batch_no, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()

                if batch_no % self.check_freq != 0:
                    continue

                # periodic checkpoint and evaluation
                print("current self-play batch: {}".format(batch_no))
                win_ratio = self.policy_evaluate()
                self.policy_value_net.save_model('./current_policy.model')
                if win_ratio <= self.best_win_ratio:
                    continue

                print("New best policy!!!!!!!!")
                self.best_win_ratio = win_ratio
                self.policy_value_net.save_model('./best_policy.model')
                # after a perfect score, strengthen the opponent and reset
                if (self.best_win_ratio == 1.0 and
                        self.pure_mcts_playout_num < 5000):
                    self.pure_mcts_playout_num += 1000
                    self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
class TrainPipeline():
    """Self-play training pipeline for a policy-value net on ``CSB_Game``.

    Every iteration plays self-play games with an MCTS player driven by
    the network, appends the resulting positions to a bounded replay
    buffer and trains the network on a random mini-batch. The current
    model is saved every 100 iterations.
    """

    def __init__(self, init_model=None):
        """Create the game, the hyper-parameters, the net and the player.

        init_model: optional model file passed to ``PolicyValueNet`` as
            ``model_file``; when falsy a new network is created instead.
        """
        self.board = CSB_Game()
        self.game = Game(self.board)
        # training params
        self.learn_rate = .001
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 50  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 50  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 20  # num of train_steps for each update
        self.kl_targ = 0.02
        # NOTE: check_freq, best_win_ratio and pure_mcts_playout_num are
        # not read by any method of this class; they are kept so that
        # external code relying on these attributes keeps working.
        self.check_freq = 100000000000000000000000
        self.game_batch_num = 200000000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and store them in the buffer.

        ``self.episode_len`` is set to the number of positions of the
        last game played.
        """
        for _ in range(n_games):
            _winner, play_data = self.game.start_self_play(self.mcts_player,
                                                           temp=self.temp)
            play_data = list(play_data)
            self.episode_len = len(play_data)
            # the positions are stored as-is (no symmetry augmentation)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """Train the net on one random mini-batch; return (loss, entropy).

        Runs ``self.epochs`` train steps with the learning rate
        ``self.learn_rate * self.lr_multiplier``, then rescales
        ``self.lr_multiplier`` from the KL divergence between the
        pre-update and post-update policies.

        Raises ValueError (from ``random.sample``) when the buffer holds
        fewer than ``self.batch_size`` items.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for _ in range(self.epochs):
            # Scale the base rate by the adaptive multiplier; without
            # this the KL-based adjustment below would have no effect.
            loss, entropy = self.policy_value_net.train_step(
                state_batch,
                mcts_probs_batch,
                winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10)
                                    - np.log(new_probs + 1e-10)),
                       axis=1))
            # NOTE: early stopping on a large KL (kl > 4 * kl_targ) is
            # disabled in this pipeline; all epochs always run.

        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.01:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 100:
            self.lr_multiplier *= 1.5

        # eps keeps the ratio finite when every target in the batch is equal
        eps = 0.00000000001
        targets = np.array(winner_batch)
        target_var = np.var(targets) + eps
        explained_var_old = 1 - np.var(targets - old_v.flatten()) / target_var
        explained_var_new = 1 - np.var(targets - new_v.flatten()) / target_var
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl,
                                                  self.lr_multiplier,
                                                  loss,
                                                  entropy,
                                                  explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def run(self):
        """Run the training pipeline; Ctrl-C stops it quietly."""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()
                # save the model params every 100 iterations
                if (i + 1) % 100 == 0:
                    self.policy_value_net.save_model('./current_policy.model')
        except KeyboardInterrupt:
            print('\n\rquit')