def update_net(self, shared_queue, net_lock, data_lock, stop_update_process):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    logging.info('update process start')
    # read and write the model files
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height, model_dir)
    current_policy_value_net.save_model(current_model_name)
    i = 0
    best_win_ratio = 0
    pure_mcts_playout_num = 1000
    get_enough_train_data = False
    while stop_update_process.value == 0:
        time.sleep(1)
        if get_enough_train_data:
            i += 1
            logging.info('update process start {} th self train'.format(i))
            self.policy_update(current_policy_value_net, shared_queue,
                               net_lock, data_lock, i)
            logging.info('update process end {} th self train'.format(i))
            # save the latest model file here
            if (i + 1) % self.update_freq == 0:
                logging.info('update process ask net lock')
                with net_lock:
                    logging.info('update process get net lock')
                    current_policy_value_net.save_model(current_model_name)
                logging.info('update process release net lock')
            # play against pure MCTS here, check the win ratio and
            # update the best model file
            if (i + 1) % self.check_freq == 0:
                logging.info("Game {}: AlphaZero VS PURE MCTS".format(i + 1))
                win_ratio = self.policy_evaluate(pure_mcts_playout_num,
                                                 current_policy_value_net)
                if win_ratio >= best_win_ratio:
                    logging.info("update process New best policy!!!!!!!!")
                    best_win_ratio = win_ratio
                    # update the best_policy
                    current_policy_value_net.save_model(best_model_name)
                    if (best_win_ratio == 1.0 and
                            pure_mcts_playout_num < 5000):
                        pure_mcts_playout_num += 1000
                        best_win_ratio = 0.0
        else:
            with data_lock:
                get_enough_train_data = len(shared_queue) >= self.batch_size
    logging.info('update process finished')
def update_net_thread(self, shared_queue, net_lock, data_lock,
                      stop_update_process, update_best_model):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    logging.info('update process start')
    # read and write the model files
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height, model_dir)
    current_policy_value_net.save_model(current_model_name)
    current_policy_value_net.save_model(best_model_name)
    best_win_ratio = 0
    get_enough_train_data = False
    global_update_step = 0
    lr_multiplier = 1.0
    while stop_update_process.value == 0:
        time.sleep(1)
        if get_enough_train_data:
            global_update_step += 1
            logging.info('update process start {} th self train'.format(
                global_update_step))
            lr_multiplier = self.policy_update(current_policy_value_net,
                                               shared_queue, net_lock,
                                               data_lock, global_update_step,
                                               lr_multiplier)
            logging.info('update process end {} th self train'.format(
                global_update_step))
            # save the latest model file here
            logging.info('update process ask net lock')
            with net_lock:
                logging.info('update process get net lock')
                current_policy_value_net.save_model(current_model_name)
            logging.info('update process release net lock')
            if (global_update_step + 1) % self.update_freq == 0:
                update_best_model.value = 1
        else:
            with data_lock:
                get_enough_train_data = len(shared_queue) >= self.batch_size
    logging.info('update process finished')
def update_net(self, shared_queue, net_lock, update_best_model,
               global_update_step, lr_multiplier, stop_update_process,
               update_or_selfplay):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height, model_dir)
    current_policy_value_net.save_model(current_model_name)
    current_policy_value_net.save_model(best_model_name)
    while global_update_step.value <= self.game_batch_num:
        if update_or_selfplay.value == 0:
            if len(shared_queue) >= self.batch_size:
                for _ in range(self.epochs):
                    global_update_step.value += 1
                    logging.info(
                        'update current model process start self train: {}'.format(
                            global_update_step.value))
                    self.policy_update(current_policy_value_net, shared_queue,
                                       net_lock, global_update_step,
                                       lr_multiplier)
                    if global_update_step.value % self.check_freq == 0:
                        update_best_model.value = 1
                # save the latest model file here
                with net_lock:
                    logging.info('update process update current model')
                    current_policy_value_net.save_model(current_model_name)
            update_or_selfplay.value = 1
        else:
            time.sleep(1)
    stop_update_process.value = 1
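# A minimal wiring sketch (not from the original project) of how the shared
# objects assumed by the update_net/update_net_thread workers above could be
# created with multiprocessing; the helper name run_update is hypothetical.
import multiprocessing as mp

def run_update(pipeline):
    manager = mp.Manager()
    shared_queue = manager.list()             # self-play samples shared across processes
    net_lock = manager.Lock()                 # guards reads/writes of the model files
    data_lock = manager.Lock()                # guards access to shared_queue
    stop_update_process = manager.Value('i', 0)
    update_best_model = manager.Value('i', 0)
    p = mp.Process(target=pipeline.update_net_thread,
                   args=(shared_queue, net_lock, data_lock,
                         stop_update_process, update_best_model))
    p.start()
    # self-play workers would append (state, mcts_prob, winner) tuples to
    # shared_queue while this process trains and saves the models
    return p, stop_update_process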
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 800  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        # stores the MCTS self-play data, after augmentation
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        # this should probably be 400 or 800
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000  # 1000 here
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        state is the current board; mcts_prob gives the probability of each point
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            # play_data is zip(states, mcts_probs, winners_z)
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]  # tuples of board states and players
            self.episode_len = len(play_data)  # number of moves/states in the game
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:  # more than 512 states collected?
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model('./current_policy_model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model('./best_policy_model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
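# A minimal usage sketch (assumed entry point, not shown in the snippet above):
# construct the pipeline and run it, optionally resuming from a saved model.
if __name__ == '__main__':
    training_pipeline = TrainPipeline(init_model=None)  # or init_model='path/to/policy.model'
    training_pipeline.run()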
class TrainPipeline():
    def __init__(self, init_model=None, is_shown=0):
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.is_shown = is_shown
        self.game = Game_UI(self.board, is_shown)
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = 1000
        if init_model:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        root = os.getcwd()
        dst_path = os.path.join(root, 'dist')
        if not os.path.exists(dst_path):
            os.makedirs(dst_path)
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                    print("loss :{}, entropy:{}".format(loss, entropy))
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model(
                        os.path.join(dst_path, 'current_policy.model'))
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        self.policy_value_net.save_model(
                            os.path.join(dst_path, 'best_policy.model'))
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
class TrainPipeline():
    def __init__(self, init_model):
        self.init_model = init_model
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if os.path.isdir(init_model):
            self.is_init = True
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.is_init = False
            os.system('mkdir ' + init_model)
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
        if not os.path.isdir(init_model + 'best'):
            os.system('mkdir ' + init_model + 'best')

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp,
                                                          is_shown=0)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self, drop_trained=False):
        """update the policy-value net"""
        # mini_batch = random.sample(self.data_buffer, self.batch_size)
        sample_index = random.sample(range(len(self.data_buffer)),
                                     self.batch_size)
        mini_batch = [self.data_buffer[i] for i in sample_index]
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        # remove samples once they have been trained on
        if drop_trained:
            removed_batch = random.sample(sample_index, self.batch_size // 2)
            # removed_batch = sample_index
            removed_batch.sort()
            delete = 0
            for i in removed_batch:
                del self.data_buffer[i - delete]
                delete += 1
        # end of drop trained
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print('game:', i)
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=1)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        # time.clock() was removed in Python 3.8; use time.perf_counter() instead
        start_time = time.perf_counter()
        try:
            # do policy_evaluate first if using a trained model
            if self.is_init:
                self.best_win_ratio = self.policy_evaluate()
                if (self.best_win_ratio == 1.0 and
                        self.pure_mcts_playout_num < 5000):
                    self.pure_mcts_playout_num += 1000
                    self.best_win_ratio = 0.0
            start_time = time.perf_counter()
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(
                    self.play_batch_size)  # self-play one game
                # print('buffer size:', len(self.data_buffer))
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update(
                        drop_trained=True)  # controls whether trained samples are dropped
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.check_freq == 0:
                    elapse_time = time.perf_counter() - start_time
                    print('current elapse time:', elapse_time, 'sec')
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model(self.init_model +
                                                     'current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            self.init_model + 'best/best_policy.model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            elapse_time = time.perf_counter() - start_time
            print('total time:', elapse_time, 'sec')
            print('\n\rquit')
class TrainPipeline():
    def __init__(self, init_model=None):
        self.board_width = 6
        self.board_height = 6
        self.config = GameConfig()
        self.board = Board(self.config)
        self.game = Game(self.board)
        # training params
        # learning rate 0.002
        self.learn_rate = 2e-3
        # adjust the learning rate automatically: KL divergence measures how close
        # two probability distributions are; within a given range of change, the
        # parameters that minimize the KL divergence are the optimal ones we want
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 1500  # num of simulations for each move
        self.c_puct = 5  # UCTK
        self.buffer_size = 10000
        self.batch_size = 200  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        # self.check_freq = 25
        # self.game_batch_num = 1500
        self.game_batch_num = 5000
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 5000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def run(self):
        """run the training pipeline"""
        try:
            # number of training batches
            for i in range(self.game_batch_num):
                # play_batch_size: batch size
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    print("start update policy ")
                    loss, entropy = self.policy_update()
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    self.policy_value_net.save_model('./current_policy.model')
        except KeyboardInterrupt:
            print('\n\rquit')
class Evaluator(Process):
    def __init__(self, config, weight_queue):
        super(Evaluator, self).__init__()
        self.config = config
        self.queue = weight_queue
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = self.config['pure_mcts_playout_num']

    def run(self):
        self.policy_value_net = PolicyValueNet(
            self.config['board_width'],
            self.config['board_height'],
            model_file=self.config['init_model'])
        while True:
            weight = self.queue.get()
            self.policy_value_net.set_weight(weight)
            win_ratio = self.policy_evaluate()
            self.policy_value_net.save_model(
                self.config['current_policy_name'])
            if win_ratio > self.best_win_ratio:
                print("New best policy!!!!!!!!")
                self.best_win_ratio = win_ratio
                # update the best_policy
                self.policy_value_net.save_model(
                    self.config['best_policy_name'])
                if (self.best_win_ratio == 1.0 and
                        self.pure_mcts_playout_num < 10000):
                    self.pure_mcts_playout_num += 1000
                    self.best_win_ratio = 0.0

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        self.evaluate_game = Game(
            Board(width=self.config['board_width'],
                  height=self.config['board_height'],
                  n_in_row=self.config['n_in_row']))
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.config['c_puct'],
                                         n_playout=self.config['n_playout'])
        pure_mcts_player = MCTS_Pure(
            c_puct=5, n_playout=self.config['pure_mcts_playout_num'])
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.evaluate_game.start_play(current_mcts_player,
                                                   pure_mcts_player,
                                                   start_player=i % 2,
                                                   is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.config['pure_mcts_playout_num'],
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio
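# A minimal launch sketch (assumed, not part of the original snippet) for the
# Evaluator process above: the trainer pushes network weights onto a
# multiprocessing queue and the evaluator consumes them. The config keys mirror
# the ones the class reads; launch_evaluator and get_weight (the assumed
# counterpart of set_weight) are hypothetical.
from multiprocessing import Queue

def launch_evaluator():
    config = {
        'board_width': 15, 'board_height': 15, 'n_in_row': 5,
        'c_puct': 5, 'n_playout': 400, 'pure_mcts_playout_num': 1000,
        'init_model': None,
        'current_policy_name': './current_policy.model',
        'best_policy_name': './best_policy.model',
    }
    weight_queue = Queue()
    evaluator = Evaluator(config, weight_queue)
    evaluator.start()
    # after each policy update the trainer would push the latest weights, e.g.
    # weight_queue.put(policy_value_net.get_weight())
    return evaluator, weight_queue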
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        self.manual = Manual(self.board)
        # training params
        self.learn_rate = 1e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 100  # num of simulations for each move
        self.c_puct = 1
        self.buffer_size = 100000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.episode_len = 0
        self.kl_targ = 0.02
        self.check_freq = 1
        self.game_batch_num = 5
        self.best_win_ratio = 0.55
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        self.lock = threading.Lock()
        if init_model:
            # start training from an initial policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file=init_model,
                                                       graph=self.g1,
                                                       output='/data/data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(
                    self.board_width, self.board_height,
                    model_file=init_model, graph=self.g2,
                    output='/data/output/')
        else:
            # start training from a new policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       graph=self.g1,
                                                       output='./data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(
                    self.board_width, self.board_height,
                    graph=self.g2, output='./output/')
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            # self.lock.acquire()
            # print("game {}".format(i))
            with self.g1.as_default():
                '''mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                            c_puct=self.c_puct,
                                            n_playout=self.n_playout,
                                            is_selfplay=1)
                board = Board(width=self.board_width,
                              height=self.board_height,
                              n_in_row=self.n_in_row)
                game = Game(board)'''
                winner, play_data = self.game.start_self_play(self.mcts_player,
                                                              is_shown=0,
                                                              temp=self.temp)
            # self.lock.release()
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)
            # print("self play end...")

    def collect_manual_data(self, file):
        winner, play_data = self.manual.read_manual_data(file)
        # reading the chess manual failed
        if winner == 0:
            return
        play_data = list(play_data)[:]
        self.episode_len = len(play_data)
        # augment the data
        play_data = self.get_equi_data(play_data)
        self.data_buffer.extend(play_data)

    def collect_test_data(self):
        self.board.init_board()
        states, mcts_probs, current_players = [], [], []
        move = 128
        self.board.do_move(112)
        states.append(self.board.current_state())
        probs = np.zeros(self.board.width * self.board.height)
        probs[[move]] = 1
        mcts_probs.append(probs)
        current_players.append(self.board.current_player)
        winners_z = np.array([1])
        play_data = zip(states, mcts_probs, winners_z)
        play_data = list(play_data)[:]
        self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        with self.g2.as_default():
            for i in range(self.epochs):
                loss, entropy = self.policy_value_net_train.train_step(
                    state_batch, mcts_probs_batch, winner_batch,
                    self.learn_rate * self.lr_multiplier)
        print(("lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},").format(self.lr_multiplier, loss, entropy))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        print("evaluating...")
        current_mcts_player = MCTSPlayer(
            self.policy_value_net_train.policy_value_fn,
            c_puct=self.c_puct,
            n_playout=self.pure_mcts_playout_num)
        best_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          best_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        # save the current_model
        self.policy_value_net_train.save_model(
            '/data/output/current_policy.model')
        if win_ratio > self.best_win_ratio:
            print("New best policy!!!!!!!!")
            # update the best_policy
            self.policy_value_net_train.save_model(
                '/data/output/best_policy.model')
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(
                    self.board_width, self.board_height,
                    model_file='/data/output/best_policy.model',
                    graph=self.g1, output='/data/data/')
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            '''coord = tf.train.Coordinator()
            self_play = [threading.Thread(target=self.collect_selfplay_data,
                                          args=(self.play_batch_size,))
                         for i in range(4)]
            for sp in self_play:
                sp.start()
            coord.join(self_play)
            while len(self.data_buffer) < self.batch_size:
                print(len(self.data_buffer))
                time.sleep(3)
                pass'''
            multiplier = [0.1, 0.1, 0.01, 0.01, 0.01]
            step = 0
            for n in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                # self.collect_test_data()
                self.policy_value_net.n_step += 1
                print("batch i:{}, episode_len:{}".format(
                    self.policy_value_net.n_step, self.episode_len))
                # optimisation
                if len(self.data_buffer) > self.batch_size:
                    for i in range(100):
                        self.policy_update()
                # evaluation
                if self.policy_value_net.n_step % self.check_freq == 0:
                    # self.lr_multiplier = multiplier[step]
                    # step += 1
                    self.mcts_player.mcts._discount = 1 - 0.98 * (
                        1 - self.mcts_player.mcts._discount)
                    print("current self-play batch: {}, discount: {}".format(
                        self.policy_value_net.n_step,
                        self.mcts_player.mcts._discount))
                    # self.lock.acquire()
                    self.policy_evaluate(n_games=15)
                    # self.lock.release()
        except KeyboardInterrupt:
            print('\n\rquit')
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 7
        self.board_height = 7
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 1500  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 150000
        self.batch_size = 2048  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        if os.path.exists("data_buffer.pkl"):
            with open("data_buffer.pkl", "rb") as f:
                self.data_buffer = pickle.load(f)
                print("Load data, size = %d" % len(self.data_buffer))
        self.play_batch_size = 1
        self.epochs = 10  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 1500
        self.save_freq = 500
        self.game_batch_num = 10000
        self.best_win_ratio = 0.0
        self.episode_len = 0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    # def collect_selfplay_data(self, n_games=1):
    #     """collect self-play data for training"""
    #     pool = Pool(processes=8)
    #     multi = []
    #     for i in range(n_games):
    #         multi.append(pool.apply_async(self.game.start_self_play,
    #                                       (self.mcts_player, self.temp)))
    #     # pool.close()
    #     # pool.join()
    #     for data in multi:
    #         data.wait()
    #     pool.close()
    #     pool.join()
    #     for data in multi:
    #         if data.ready():
    #             print("Ready!")
    #             if data.successful():
    #                 print("SUCCESS!")
    #                 winner, play_data = data.get()
    #                 play_data = list(play_data)[:]
    #                 self.episode_len = len(play_data)
    #                 # augment the data
    #                 play_data = self.get_equi_data(play_data)
    #                 self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 - np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 - np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            print("HAHA")
            print("%s Start Running" % time.strftime("%Y-%m-%d %H:%M:%S",
                                                     time.localtime()))
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.save_freq == 0:
                    self.policy_value_net.save_model(
                        './current_policy_%d_%d.model' % (self.board_width,
                                                          self.board_height))
                    with open("data_buffer.pkl", "wb") as f:
                        pickle.dump(self.data_buffer, f)
                        print("Dump data, size = %d" % len(self.data_buffer))
                if (i + 1) % self.check_freq == 0:
                    print("{} current self-play batch: {}".format(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                        i + 1))
                    win_ratio = self.policy_evaluate()
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            './best_policy_%d_%d.model' % (self.board_width,
                                                           self.board_height))
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 10000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
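# A minimal resume sketch (assumed entry point, not in the original snippet):
# this variant reloads data_buffer.pkl automatically in __init__, so resuming
# only requires pointing init_model at the last saved checkpoint.
if __name__ == '__main__':
    pipeline = TrainPipeline(init_model='./current_policy_7_7.model')
    pipeline.run()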