def update_net(self, shared_queue, net_lock, data_lock, stop_update_process):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    logging.info('update process start')
    # read and write the model files
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height, model_dir)
    current_policy_value_net.save_model(current_model_name)
    i = 0
    best_win_ratio = 0
    pure_mcts_playout_num = 1000
    get_enough_train_data = False
    while stop_update_process.value == 0:
        time.sleep(1)
        if get_enough_train_data:
            i += 1
            logging.info('update process start {} th self train'.format(i))
            self.policy_update(current_policy_value_net, shared_queue,
                               net_lock, data_lock, i)
            logging.info('update process end {} th self train'.format(i))
            # save the latest model file here
            if (i + 1) % self.update_freq == 0:
                logging.info('update process ask net lock')
                with net_lock:
                    logging.info('update process get net lock')
                    current_policy_value_net.save_model(current_model_name)
                logging.info('update process release net lock')
            # play against pure MCTS here, check the win ratio and
            # update the best model file
            if (i + 1) % self.check_freq == 0:
                logging.info("Game {}: AlphaZero VS PURE MCTS".format(i + 1))
                win_ratio = self.policy_evaluate(pure_mcts_playout_num,
                                                 current_policy_value_net)
                if win_ratio >= best_win_ratio:
                    logging.info("update process New best policy!!!!!!!!")
                    best_win_ratio = win_ratio
                    # update the best_policy
                    current_policy_value_net.save_model(best_model_name)
                    if (best_win_ratio == 1.0 and
                            pure_mcts_playout_num < 5000):
                        pure_mcts_playout_num += 1000
                        best_win_ratio = 0.0
        else:
            with data_lock:
                get_enough_train_data = len(shared_queue) >= self.batch_size
    logging.info('update process finished')
def update_net_thread(self, shared_queue, net_lock, data_lock,
                      stop_update_process, update_best_model):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    logging.info('update process start')
    # read and write the model files
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height, model_dir)
    current_policy_value_net.save_model(current_model_name)
    current_policy_value_net.save_model(best_model_name)
    best_win_ratio = 0
    get_enough_train_data = False
    global_update_step = 0
    lr_multiplier = 1.0
    while stop_update_process.value == 0:
        time.sleep(1)
        if get_enough_train_data:
            global_update_step += 1
            logging.info('update process start {} th self train'.format(
                global_update_step))
            lr_multiplier = self.policy_update(current_policy_value_net,
                                               shared_queue, net_lock,
                                               data_lock, global_update_step,
                                               lr_multiplier)
            logging.info('update process end {} th self train'.format(
                global_update_step))
            # save the latest model file here
            logging.info('update process ask net lock')
            with net_lock:
                logging.info('update process get net lock')
                current_policy_value_net.save_model(current_model_name)
            logging.info('update process release net lock')
            if (global_update_step + 1) % self.update_freq == 0:
                update_best_model.value = 1
        else:
            with data_lock:
                get_enough_train_data = len(shared_queue) >= self.batch_size
    logging.info('update process finished')
def update_net(self, shared_queue, net_lock, update_best_model,
               global_update_step, lr_multiplier, stop_update_process,
               update_or_selfplay):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height, model_dir)
    current_policy_value_net.save_model(current_model_name)
    current_policy_value_net.save_model(best_model_name)
    while global_update_step.value <= self.game_batch_num:
        if update_or_selfplay.value == 0:
            if len(shared_queue) >= self.batch_size:
                for _ in range(self.epochs):
                    global_update_step.value += 1
                    logging.info(
                        'update current model process start self train: {}'.format(
                            global_update_step.value))
                    self.policy_update(current_policy_value_net, shared_queue,
                                       net_lock, global_update_step,
                                       lr_multiplier)
                    if global_update_step.value % self.check_freq == 0:
                        update_best_model.value = 1
                # save the latest model file here
                with net_lock:
                    logging.info('update process update current model')
                    current_policy_value_net.save_model(current_model_name)
            update_or_selfplay.value = 1
        else:
            time.sleep(1)
    stop_update_process.value = 1
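# A minimal wiring sketch (not from the original project) of how the shared
# objects assumed by the update_net/update_net_thread workers above could be
# created with multiprocessing; the helper name run_update is hypothetical.
import multiprocessing as mp

def run_update(pipeline):
    manager = mp.Manager()
    shared_queue = manager.list()             # self-play samples shared across processes
    net_lock = manager.Lock()                 # guards reads/writes of the model files
    data_lock = manager.Lock()                # guards access to shared_queue
    stop_update_process = manager.Value('i', 0)
    update_best_model = manager.Value('i', 0)
    p = mp.Process(target=pipeline.update_net_thread,
                   args=(shared_queue, net_lock, data_lock,
                         stop_update_process, update_best_model))
    p.start()
    # self-play workers would append (state, mcts_prob, winner) tuples to
    # shared_queue while this process trains and saves the models
    return p, stop_update_process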
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 800  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        # stores the MCTS self-play data, after augmentation
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        # this should probably be 400 or 800
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000  # 1000 here
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        state is the current board; mcts_prob gives the probability of each point
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            # play_data is zip(states, mcts_probs, winners_z)
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]  # tuples of board states and players
            self.episode_len = len(play_data)  # number of moves/states in the game
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:  # more than 512 states collected?
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model('./current_policy_model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model('./best_policy_model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
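# A minimal usage sketch (assumed entry point, not shown in the snippet above):
# construct the pipeline and run it, optionally resuming from a saved model.
if __name__ == '__main__':
    training_pipeline = TrainPipeline(init_model=None)  # or init_model='path/to/policy.model'
    training_pipeline.run()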
class TrainPipeline():
    def __init__(self, init_model=None, is_shown=0):
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.is_shown = is_shown
        self.game = Game_UI(self.board, is_shown)
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = 1000
        if init_model:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        root = os.getcwd()
        dst_path = os.path.join(root, 'dist')
        if not os.path.exists(dst_path):
            os.makedirs(dst_path)
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                    print("loss :{}, entropy:{}".format(loss, entropy))
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model(
                        os.path.join(dst_path, 'current_policy.model'))
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        self.policy_value_net.save_model(
                            os.path.join(dst_path, 'best_policy.model'))
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
class TrainPipeline():
    def __init__(self, init_model):
        self.init_model = init_model
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if os.path.isdir(init_model):
            self.is_init = True
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.is_init = False
            os.system('mkdir ' + init_model)
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
        if not os.path.isdir(init_model + 'best'):
            os.system('mkdir ' + init_model + 'best')

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp,
                                                          is_shown=0)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self, drop_trained=False):
        """update the policy-value net"""
        # mini_batch = random.sample(self.data_buffer, self.batch_size)
        sample_index = random.sample(range(len(self.data_buffer)),
                                     self.batch_size)
        mini_batch = [self.data_buffer[i] for i in sample_index]
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        # remove samples once they have been trained on
        if drop_trained:
            removed_batch = random.sample(sample_index, self.batch_size // 2)
            # removed_batch = sample_index
            removed_batch.sort()
            delete = 0
            for i in removed_batch:
                del self.data_buffer[i - delete]
                delete += 1
        # end of drop trained
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print('game:', i)
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=1)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        # time.clock() was removed in Python 3.8; use time.perf_counter() instead
        start_time = time.perf_counter()
        try:
            # do policy_evaluate first if using a trained model
            if self.is_init:
                self.best_win_ratio = self.policy_evaluate()
                if (self.best_win_ratio == 1.0 and
                        self.pure_mcts_playout_num < 5000):
                    self.pure_mcts_playout_num += 1000
                    self.best_win_ratio = 0.0
            start_time = time.perf_counter()
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(
                    self.play_batch_size)  # self-play one game
                # print('buffer size:', len(self.data_buffer))
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update(
                        drop_trained=True)  # controls whether trained samples are dropped
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.check_freq == 0:
                    elapse_time = time.perf_counter() - start_time
                    print('current elapse time:', elapse_time, 'sec')
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model(self.init_model +
                                                     'current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            self.init_model + 'best/best_policy.model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            elapse_time = time.perf_counter() - start_time
            print('total time:', elapse_time, 'sec')
            print('\n\rquit')
class TrainPipeline():
    def __init__(self, init_model=None):
        self.board_width = 6
        self.board_height = 6
        self.config = GameConfig()
        self.board = Board(self.config)
        self.game = Game(self.board)
        # training params
        # learning rate 0.002
        self.learn_rate = 2e-3
        # adjust the learning rate automatically: KL divergence measures how close
        # two probability distributions are; within a given range of change, the
        # parameters that minimize the KL divergence are the optimal ones we want
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 1500  # num of simulations for each move
        self.c_puct = 5  # UCTK
        self.buffer_size = 10000
        self.batch_size = 200  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        # self.check_freq = 25
        # self.game_batch_num = 1500
        self.game_batch_num = 5000
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 5000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def run(self):
        """run the training pipeline"""
        try:
            # number of training batches
            for i in range(self.game_batch_num):
                # play_batch_size: batch size
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    print("start update policy ")
                    loss, entropy = self.policy_update()
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    self.policy_value_net.save_model('./current_policy.model')
        except KeyboardInterrupt:
            print('\n\rquit')
class Evaluator(Process):
    def __init__(self, config, weight_queue):
        super(Evaluator, self).__init__()
        self.config = config
        self.queue = weight_queue
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = self.config['pure_mcts_playout_num']

    def run(self):
        self.policy_value_net = PolicyValueNet(
            self.config['board_width'],
            self.config['board_height'],
            model_file=self.config['init_model'])
        while True:
            weight = self.queue.get()
            self.policy_value_net.set_weight(weight)
            win_ratio = self.policy_evaluate()
            self.policy_value_net.save_model(
                self.config['current_policy_name'])
            if win_ratio > self.best_win_ratio:
                print("New best policy!!!!!!!!")
                self.best_win_ratio = win_ratio
                # update the best_policy
                self.policy_value_net.save_model(
                    self.config['best_policy_name'])
                if (self.best_win_ratio == 1.0 and
                        self.pure_mcts_playout_num < 10000):
                    self.pure_mcts_playout_num += 1000
                    self.best_win_ratio = 0.0

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        self.evaluate_game = Game(
            Board(width=self.config['board_width'],
                  height=self.config['board_height'],
                  n_in_row=self.config['n_in_row']))
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.config['c_puct'],
                                         n_playout=self.config['n_playout'])
        pure_mcts_player = MCTS_Pure(
            c_puct=5, n_playout=self.config['pure_mcts_playout_num'])
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.evaluate_game.start_play(current_mcts_player,
                                                   pure_mcts_player,
                                                   start_player=i % 2,
                                                   is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.config['pure_mcts_playout_num'],
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio
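# A minimal launch sketch (assumed, not part of the original snippet) for the
# Evaluator process above: the trainer pushes network weights onto a
# multiprocessing queue and the evaluator consumes them. The config keys mirror
# the ones the class reads; launch_evaluator and get_weight (the assumed
# counterpart of set_weight) are hypothetical.
from multiprocessing import Queue

def launch_evaluator():
    config = {
        'board_width': 15, 'board_height': 15, 'n_in_row': 5,
        'c_puct': 5, 'n_playout': 400, 'pure_mcts_playout_num': 1000,
        'init_model': None,
        'current_policy_name': './current_policy.model',
        'best_policy_name': './best_policy.model',
    }
    weight_queue = Queue()
    evaluator = Evaluator(config, weight_queue)
    evaluator.start()
    # after each policy update the trainer would push the latest weights, e.g.
    # weight_queue.put(policy_value_net.get_weight())
    return evaluator, weight_queue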
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        self.manual = Manual(self.board)
        # training params
        self.learn_rate = 1e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 100  # num of simulations for each move
        self.c_puct = 1
        self.buffer_size = 100000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.episode_len = 0
        self.kl_targ = 0.02
        self.check_freq = 1
        self.game_batch_num = 5
        self.best_win_ratio = 0.55
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        self.lock = threading.Lock()
        if init_model:
            # start training from an initial policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file=init_model,
                                                       graph=self.g1,
                                                       output='/data/data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(
                    self.board_width, self.board_height,
                    model_file=init_model, graph=self.g2,
                    output='/data/output/')
        else:
            # start training from a new policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       graph=self.g1,
                                                       output='./data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(
                    self.board_width, self.board_height,
                    graph=self.g2, output='./output/')
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            # self.lock.acquire()
            # print("game {}".format(i))
            with self.g1.as_default():
                '''mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                            c_puct=self.c_puct,
                                            n_playout=self.n_playout,
                                            is_selfplay=1)
                board = Board(width=self.board_width,
                              height=self.board_height,
                              n_in_row=self.n_in_row)
                game = Game(board)'''
                winner, play_data = self.game.start_self_play(self.mcts_player,
                                                              is_shown=0,
                                                              temp=self.temp)
            # self.lock.release()
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)
            # print("self play end...")

    def collect_manual_data(self, file):
        winner, play_data = self.manual.read_manual_data(file)
        # reading the chess manual failed
        if winner == 0:
            return
        play_data = list(play_data)[:]
        self.episode_len = len(play_data)
        # augment the data
        play_data = self.get_equi_data(play_data)
        self.data_buffer.extend(play_data)

    def collect_test_data(self):
        self.board.init_board()
        states, mcts_probs, current_players = [], [], []
        move = 128
        self.board.do_move(112)
        states.append(self.board.current_state())
        probs = np.zeros(self.board.width * self.board.height)
        probs[[move]] = 1
        mcts_probs.append(probs)
        current_players.append(self.board.current_player)
        winners_z = np.array([1])
        play_data = zip(states, mcts_probs, winners_z)
        play_data = list(play_data)[:]
        self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        with self.g2.as_default():
            for i in range(self.epochs):
                loss, entropy = self.policy_value_net_train.train_step(
                    state_batch, mcts_probs_batch, winner_batch,
                    self.learn_rate * self.lr_multiplier)
        print(("lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},").format(self.lr_multiplier, loss, entropy))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        print("evaluating...")
        current_mcts_player = MCTSPlayer(
            self.policy_value_net_train.policy_value_fn,
            c_puct=self.c_puct,
            n_playout=self.pure_mcts_playout_num)
        best_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          best_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        # save the current_model
        self.policy_value_net_train.save_model(
            '/data/output/current_policy.model')
        if win_ratio > self.best_win_ratio:
            print("New best policy!!!!!!!!")
            # update the best_policy
            self.policy_value_net_train.save_model(
                '/data/output/best_policy.model')
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(
                    self.board_width, self.board_height,
                    model_file='/data/output/best_policy.model',
                    graph=self.g1, output='/data/data/')
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            '''coord = tf.train.Coordinator()
            self_play = [threading.Thread(target=self.collect_selfplay_data,
                                          args=(self.play_batch_size,))
                         for i in range(4)]
            for sp in self_play:
                sp.start()
            coord.join(self_play)
            while len(self.data_buffer) < self.batch_size:
                print(len(self.data_buffer))
                time.sleep(3)
                pass'''
            multiplier = [0.1, 0.1, 0.01, 0.01, 0.01]
            step = 0
            for n in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                # self.collect_test_data()
                self.policy_value_net.n_step += 1
                print("batch i:{}, episode_len:{}".format(
                    self.policy_value_net.n_step, self.episode_len))
                # optimisation
                if len(self.data_buffer) > self.batch_size:
                    for i in range(100):
                        self.policy_update()
                # evaluation
                if self.policy_value_net.n_step % self.check_freq == 0:
                    # self.lr_multiplier = multiplier[step]
                    # step += 1
                    self.mcts_player.mcts._discount = 1 - 0.98 * (
                        1 - self.mcts_player.mcts._discount)
                    print("current self-play batch: {}, discount: {}".format(
                        self.policy_value_net.n_step,
                        self.mcts_player.mcts._discount))
                    # self.lock.acquire()
                    self.policy_evaluate(n_games=15)
                    # self.lock.release()
        except KeyboardInterrupt:
            print('\n\rquit')
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 7
        self.board_height = 7
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 1500  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 150000
        self.batch_size = 2048  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        if os.path.exists("data_buffer.pkl"):
            with open("data_buffer.pkl", "rb") as f:
                self.data_buffer = pickle.load(f)
                print("Load data, size = %d" % len(self.data_buffer))
        self.play_batch_size = 1
        self.epochs = 10  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 1500
        self.save_freq = 500
        self.game_batch_num = 10000
        self.best_win_ratio = 0.0
        self.episode_len = 0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    # def collect_selfplay_data(self, n_games=1):
    #     """collect self-play data for training"""
    #     pool = Pool(processes=8)
    #     multi = []
    #     for i in range(n_games):
    #         multi.append(pool.apply_async(self.game.start_self_play,
    #                                       (self.mcts_player, self.temp)))
    #     # pool.close()
    #     # pool.join()
    #     for data in multi:
    #         data.wait()
    #     pool.close()
    #     pool.join()
    #     for data in multi:
    #         if data.ready():
    #             print("Ready!")
    #             if data.successful():
    #                 print("SUCCESS!")
    #                 winner, play_data = data.get()
    #                 play_data = list(play_data)[:]
    #                 self.episode_len = len(play_data)
    #                 # augment the data
    #                 play_data = self.get_equi_data(play_data)
    #                 self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            print("HAHA")
            print("%s Start Running" % time.strftime("%Y-%m-%d %H:%M:%S",
                                                     time.localtime()))
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.save_freq == 0:
                    self.policy_value_net.save_model(
                        './current_policy_%d_%d.model' % (self.board_width,
                                                          self.board_height))
                    with open("data_buffer.pkl", "wb") as f:
                        pickle.dump(self.data_buffer, f)
                        print("Dump data, size = %d" % len(self.data_buffer))
                if (i + 1) % self.check_freq == 0:
                    print("{} current self-play batch: {}".format(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                        i + 1))
                    win_ratio = self.policy_evaluate()
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            './best_policy_%d_%d.model' % (self.board_width,
                                                           self.board_height))
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 10000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
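# A minimal resume sketch (assumed entry point, not in the original snippet):
# this variant reloads data_buffer.pkl automatically in __init__, so resuming
# only requires pointing init_model at the last saved checkpoint.
if __name__ == '__main__':
    pipeline = TrainPipeline(init_model='./current_policy_7_7.model')
    pipeline.run()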