Example #1
    def __init__(self):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = ShogiBoard()
        # training params
        self.learn_rate = 5e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.025
        self.check_freq = 50
        self.game_batch_num = 3000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        # start training from a given policy-value net
        #        policy_param = pickle.load(open('current_policy.model', 'rb'))
        #        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
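This excerpt only shows the constructor; the matching collect_selfplay_data / policy_update / policy_evaluate / run methods appear in Example #9, which shares the same __init__. Assuming that class layout, a minimal entry point would be:

if __name__ == '__main__':
    training_pipeline = TrainPipeline()
    training_pipeline.run()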
Example #2
    def __init__(self, config=None):
        # params of the board and the game
        self.config = config if config else Config()

        # Network wrapper
        self.policy_value_net = PolicyValueNet(self.config.board_width, self.config.board_height,
                                               net_params=self.config.policy_param,
                                               Network=self.config.network)

        # pass policy_value_net's predict method to the player; the neural network guides the MCTS search
        self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                           nplays=self.config.n_playout, is_selfplay=True)
Example #3
def run(config=None):
    if config is None:
        config = load_config(file_name=root_data_file + 'resnet_6_6_4.model',
                             only_load_param=True)
    try:
        board = Board(width=config.board_width,
                      height=config.board_height,
                      n_in_row=config.n_in_row)
        game = Game(board)

        # --------------- human VS AI ----------------
        best_policy = PolicyValueNet(
            config.board_width,
            config.board_height,
            Network=config.network,
            net_params=config.policy_param
        )  # setup which Network to use based on the net_params

        mcts_player = AlphaZeroPlayer(
            best_policy.predict,
            c_puct=config.c_puct,
            nplays=100,
            add_noise=True)  # set larger nplays for better performance

        # uncomment the following line to play with pure MCTS
        # mcts_player2 = RolloutPlayer(nplays=1000, c_puct=config.c_puct)

        # human player, input your move in the format: 2,3
        human = HumanPlayer()

        # set who_first=0 for human first
        game.start_game(human, mcts_player, who_first=1, is_shown=1)

    except KeyboardInterrupt:
        print('\n\rquit')
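The HumanPlayer above reads moves typed as row,col (e.g. 2,3). Below is a hypothetical helper showing how such input can be mapped to a flat move index; the real HumanPlayer class is not shown in this example, and the row * width + col convention is an assumption based on the 3x3 move layout in the augment_data docstring of Example #8:

def parse_human_move(text, board_width):
    # hypothetical helper: turn a "row,col" string into a flat move index
    row, col = map(int, text.split(','))
    return row * board_width + col

print(parse_human_move("2,3", board_width=6))  # -> 15 on a 6x6 board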
Example #4
    def __init__(self, config=None):
        # params of the board and the game
        self.config = config if config else Config()
        if not hasattr(self.config, "use_gpu"):
            # compatible with old version config
            setattr(self.config, "use_gpu", False)
        # Network wrapper
        self.policy_value_net = PolicyValueNet(
            self.config.board_width,
            self.config.board_height,
            net_params=self.config.policy_param,
            Network=self.config.network,
            use_gpu=self.config.use_gpu)

        # pass the reference of policy_value_net's predict function to the player, for the MCTS simulations
        self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict,
                                           c_puct=self.config.c_puct,
                                           nplays=self.config.n_playout,
                                           is_selfplay=True)
Example #5
    def __init__(self, modelPath=None):
        # board and game
        self.boardWidth = 4
        self.boardHeight = 4
        self.game = Game()
        # training params
        self.learningRate = 5e-3
        self.learningRateMultiplier = 1.0  # adjusted adaptively
        try:
            self.learningRateMultiplier = float(Util.getNewestLearningRateMultiplier(type='from_db' if modelPath is None else 'from_self_play'))
        except Exception as e:
            print(str(e))
        self.temperature = 1.0  # temperature; see item 2 of 参考资料.txt
        self.playoutTimes = 500  # number of simulations per move
        self.polynomialUpperConfidenceTreesConstant = 5  # c_puct from the paper; see item 2 of 参考资料.txt
        self.dataDequeSize = 10000
        self.trainBatchSize = 512  # training batch size; originally 512, temporarily 50 for debugging
        self.dataDeque = deque(maxlen=self.dataDequeSize)  # once maxlen is exceeded, elements are dropped from the other end
        self.playBatchSize = 1
        self.epochs = 5  # number of fit steps per update
        self.KLDParam = 0.025
        self.checkFrequency = 1000  # was 100, raised to 1000 because evaluation takes too much time
        self.gameBatchSize = 200000  # at 5000 the win rate against mcts1500 was 0.9, so it was raised again
        self.maxWinRatio = 0.0
        self.pureMctsPlayoutTimes = 1500  # started at 500, now at 1500
        self.pureMctsPlayoutTimesAddend = 500
        self.maxPureMctsPlayoutTimes = 3000
        self.modelPath = modelPath
        self.trainedGameCountInDB = Util.readGameCount(type='train')
        self.lossDataCount = 12184  # number of game records deleted by a hacker; this data cannot be recovered
        if modelPath is not None:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight, logPath=Util.getTrainLogPath(isFromDB=False), modelPath=modelPath)
            self.trainedGameCount = self.trainedGameCountInDB + self.lossDataCount
        else:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight, logPath=Util.getTrainLogPath(isFromDB=True))
            self.trainedGameCount = 0 + self.lossDataCount
        self.zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                     polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                     playoutTimes=self.playoutTimes, isSelfPlay=1)
Example #6
def run(config=None):
    if config is None:
        config = load_config(file_name=root_data_file + 'resnet_6_6_4.model',
                             only_load_param=True)
    try:
        board = Board(width=config.board_width,
                      height=config.board_height,
                      n_in_row=config.n_in_row)

        #--------------------1.set player:alphazero VS human---------------------#
        best_policy = PolicyValueNet(
            config.board_width,
            config.board_height,
            Network=config.network,
            net_params=config.policy_param
        )  # setup which Network to use based on the net_params

        player1 = AlphaZeroPlayer(
            best_policy.predict, c_puct=config.c_puct,
            nplays=1000)  # set larger nplays for better performance

        # uncomment the following line to play with pure MCTS
        #player2 = RolloutPlayer(nplays=1000, c_puct=config.c_puct)
        player2 = HumanPlayer()
        # --------------------2.set order---------------------#
        who_first = 0  # 0 means player1 first, otherwise player2 first

        # --------------------3.start game--------------------#
        game = Game(board, is_visualize=True)
        t = threading.Thread(target=game.start_game,
                             args=(player1, player2, who_first))
        t.start()
        game.show()

    except KeyboardInterrupt:
        print('\n\rquit')
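Example #6 hands the blocking game loop to a worker thread so that game.show() can keep the visualization responsive on the main thread. A minimal standalone sketch of the same pattern, with a stand-in function in place of game.start_game:

import threading
import time

def game_loop():
    # stands in for game.start_game(player1, player2, who_first), which blocks until the game ends
    time.sleep(1)
    print("game finished")

t = threading.Thread(target=game_loop)
t.start()
# the main thread stays free to drive the UI, which is what game.show() does in the example
t.join()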
Example #7
class TrainPipeline:
    def __init__(self, modelPath=None):
        # board and game
        self.boardWidth = 4
        self.boardHeight = 4
        self.game = Game()
        # training params
        self.learningRate = 5e-3
        self.learningRateMultiplier = 1.0  # adjusted adaptively
        try:
            self.learningRateMultiplier = float(Util.getNewestLearningRateMultiplier(type='from_db' if modelPath is None else 'from_self_play'))
        except Exception as e:
            print(str(e))
        self.temperature = 1.0  # temperature; see item 2 of 参考资料.txt
        self.playoutTimes = 500  # number of simulations per move
        self.polynomialUpperConfidenceTreesConstant = 5  # c_puct from the paper; see item 2 of 参考资料.txt
        self.dataDequeSize = 10000
        self.trainBatchSize = 512  # training batch size; originally 512, temporarily 50 for debugging
        self.dataDeque = deque(maxlen=self.dataDequeSize)  # once maxlen is exceeded, elements are dropped from the other end
        self.playBatchSize = 1
        self.epochs = 5  # number of fit steps per update
        self.KLDParam = 0.025
        self.checkFrequency = 1000  # was 100, raised to 1000 because evaluation takes too much time
        self.gameBatchSize = 200000  # at 5000 the win rate against mcts1500 was 0.9, so it was raised again
        self.maxWinRatio = 0.0
        self.pureMctsPlayoutTimes = 1500  # started at 500, now at 1500
        self.pureMctsPlayoutTimesAddend = 500
        self.maxPureMctsPlayoutTimes = 3000
        self.modelPath = modelPath
        self.trainedGameCountInDB = Util.readGameCount(type='train')
        self.lossDataCount = 12184  # number of game records deleted by a hacker; this data cannot be recovered
        if modelPath is not None:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight, logPath=Util.getTrainLogPath(isFromDB=False), modelPath=modelPath)
            self.trainedGameCount = self.trainedGameCountInDB + self.lossDataCount
        else:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight, logPath=Util.getTrainLogPath(isFromDB=True))
            self.trainedGameCount = 0 + self.lossDataCount
        self.zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                     polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                     playoutTimes=self.playoutTimes, isSelfPlay=1)

    def generateEquivalentData(self, stateProbScore):
        """
        生成等价数据,这是为了加快训练速度. 旋转,左右翻转,可得到8组等价数据
        分值不用旋转,它是针对整个盘面的一个标量

        :param stateProbScore: 元组列表[(states, mctsProbabilities, scores), ..., ...]"""
        extendedData = []
        for states, probabilities, scores in stateProbScore:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equivalentState = np.array([np.rot90(state, i) for state in states])
                # the 4 means each point can move in 4 directions; there are 16 points in total; flipped along boardHeight below
                equivalentProbabilities = np.rot90(np.flipud(probabilities.reshape(self.boardHeight, self.boardWidth, 4)), i)
                # flip the probabilities vertically first, because the board states above were flipped vertically; keep them consistent
                extendedData.append((equivalentState, np.flipud(equivalentProbabilities).flatten(), scores))
                # flip horizontally
                equivalentState = np.array([np.fliplr(state) for state in equivalentState])
                equivalentProbabilities = np.fliplr(equivalentProbabilities)
                extendedData.append((equivalentState, np.flipud(equivalentProbabilities).flatten(), scores))
        return extendedData

    def collectOneSelfPlayData(self, times=1):
        """收集训练数据"""
        for i in range(times):
            _, stateProbScore = self.game.doOneSelfPlay(self.zeroPlayer, printMove=False, temperature=self.temperature)
            stateProbScore = list(stateProbScore)[:]
            self.episodeSize = len(stateProbScore)
            # augment the training data with equivalent positions
            stateProbScore = self.generateEquivalentData(stateProbScore)
            self.dataDeque.extend(stateProbScore)

    def updatePolicy(self, type):
        """更新策略网络"""
        batchSample = random.sample(self.dataDeque, self.trainBatchSize)
        batchState = [stateProbScore[0] for stateProbScore in batchSample]
        batchProbability = [data[1] for data in batchSample]
        batchScore = [data[2] for data in batchSample]
        oldLearningRateMultiplier = self.learningRateMultiplier
        oldLearningRate = oldLearningRateMultiplier * self.learningRate
        oldProbability, oldScore = self.policyValueNet.doPolicyValueFunction(batchState)
        for i in range(self.epochs):
            loss, entropy = self.policyValueNet.doOneTrain(batchState, batchProbability, batchScore, oldLearningRate)
            newProbability, newScore = self.policyValueNet.doPolicyValueFunction(batchState)
            Kullback_Leibler_Divergence = np.mean(np.sum(oldProbability * (np.log(oldProbability + 1e-10) - np.log(newProbability + 1e-10)), axis=1))
            if Kullback_Leibler_Divergence > self.KLDParam * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if Kullback_Leibler_Divergence > self.KLDParam * 2 and self.learningRateMultiplier > 0.1:
            self.learningRateMultiplier /= 1.5
        elif Kullback_Leibler_Divergence < self.KLDParam / 2 and self.learningRateMultiplier < 10:
            self.learningRateMultiplier *= 1.5
        # explained variance
        oldVariance = 1 - np.var(np.array(batchScore) - oldScore.flatten()) / np.var(np.array(batchScore))
        newVariance = 1 - np.var(np.array(batchScore) - newScore.flatten()) / np.var(np.array(batchScore))
        # print("Kullback_Leibler_Divergence:{:.5f},learningRateMultiplier:{:.3f},loss:{},entropy:{},oldVariance:{:.3f},newVariance:{:.3f}".format(Kullback_Leibler_Divergence, self.learningRateMultiplier, loss, entropy, oldVariance, newVariance))
        Util.savePolicyUpdate(uuid=uuid.uuid1(), KullbackLeiblerDivergence=Kullback_Leibler_Divergence, oldLearningRateMultiplier=oldLearningRateMultiplier, newLearningRateMultiplier=self.learningRateMultiplier, oldLearningRate=oldLearningRate, newLearningRate=self.learningRateMultiplier * self.learningRate, loss=loss, entropy=entropy, oldVariance=oldVariance, newVariance=newVariance, insertTime=Util.getTimeNowStr(), type=type)
        return loss, entropy

    def doPolicyEvaluate(self, times=10):
        """
        通过与纯MCTS玩家对弈来评估策略网络,这仅用于监控训练的进度
        :param times 对弈次数
        """
        zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                playoutTimes=self.playoutTimes)
        zeroPlayer.setName('AlphaZero_' + str(Util.readGameCount(type='train')))
        zeroPlayer.setNetworkVersion(1)
        purePlayer = PurePlayer(polynomialUpperConfidenceTreesConstant=5, playoutTimes=self.pureMctsPlayoutTimes)
        winTimes = defaultdict(int)
        for i in range(times):
            # startPlayer=i%2 was changed to startPlayer=0 here, i.e. Black always moves first, because Black always moved first during training; the case of playing White with White to move first never occurs, and the first player is one of the input parameters
            if 0 == i % 2:
                winner = self.game.startPlay(zeroPlayer, purePlayer, startPlayer=0, printMove=1, type='evaluation')
            else:
                winner = self.game.startPlay(purePlayer, zeroPlayer, startPlayer=0, printMove=1, type='evaluation')
            if winner == -1:  # tie
                winTimes['tie'] += 1
            elif winner == 0:  # Black wins
                if 0 == i % 2:
                    winTimes['zero'] += 1
                else:
                    winTimes['pure'] += 1
            else:  # White wins
                if 0 == i % 2:
                    winTimes['pure'] += 1
                else:
                    winTimes['zero'] += 1
        winRatio = 1.0 * (winTimes['zero'] + 0.5 * winTimes['tie']) / times
        print("PlayoutTimes:{}, win: {}, lose: {}, tie:{}".format(self.pureMctsPlayoutTimes, winTimes['zero'], winTimes['pure'], winTimes['tie']))
        return winRatio

    @staticmethod
    def toListOfNumpyArray(lst: list):
        for i in range(len(lst)):
            lst[i] = np.array(lst[i])
        return lst

    def trainByDataFromDB(self):
        # read the data in batches of pageSize games at a time; the data set is large and reading it all at once hangs here
        pageSize = 10
        # use floor division; plain / in Python would give a float
        readTimes = self.trainedGameCountInDB // pageSize
        # make sure the remaining games are not skipped
        if self.trainedGameCountInDB % pageSize != 0:
            readTimes += 1
        for pageIndex in range(readTimes):
            gameDatas = Util.readGameFromDB(offset=pageIndex * pageSize, size=pageSize, readAll=False, type='train')
            for i in range(len(gameDatas)):
                gameData = gameDatas[i]
                # print(gameData)
                states = self.toListOfNumpyArray(json.loads(gameData[1]))
                probabilities = self.toListOfNumpyArray(json.loads(gameData[2]))
                scores = np.array(json.loads(gameData[3]))
                stateProbScore = zip(states, probabilities, scores)
                stateProbScore = list(stateProbScore)[:]
                self.episodeSize = len(stateProbScore)
                # augment the training data with equivalent positions
                stateProbScore = self.generateEquivalentData(stateProbScore)
                self.dataDeque.extend(stateProbScore)
                print("Train from DB Batch i:{}, episodeSize:{}".format(i + 1, self.episodeSize))
                if len(self.dataDeque) > self.trainBatchSize:
                    self.updatePolicy(type='from_db')
                self.policyEvaluate(index=i, currentModelSavedPath=Util.getPathToSaveModel(False, True, True), willDoPolicyEvaluate=False)
                self.trainedGameCount += 1

    def run(self):
        """运行训练流水线"""
        try:
            if self.modelPath is None:  # if no model file is given, first train on the data stored in the database
                self.trainByDataFromDB()
            for i in range(self.trainedGameCount, self.gameBatchSize):
                self.collectOneSelfPlayData(self.playBatchSize)
                print("Batch i:{}, episodeSize:{}".format(i + 1, self.episodeSize))
                if len(self.dataDeque) > self.trainBatchSize:
                    self.updatePolicy(type='from_self_play')
                self.policyEvaluate(i)
        except KeyboardInterrupt:
            print('\n\rquit')

    def policyEvaluate(self, index, currentModelSavedPath=Util.getPathToSaveModel(False, True, False),
                       bestModelSavedPath=Util.getPathToSaveModel(False, False, False) + '_' + str(Util.readGameCount(type='train')),
                       willDoPolicyEvaluate=True):
        """保存模型参数到文件,检查当前模型的性能"""
        self.policyValueNet.saveModel(currentModelSavedPath)
        if willDoPolicyEvaluate and (index + 1) % self.checkFrequency == 0:
            print("Self play batch: {}".format(index + 1))
            # there was a bug here: during evaluation start_player alternated between 0 and 1, letting White move first, a situation never produced during training; restricting the first mover to Black is entirely reasonable
            winRatio = self.doPolicyEvaluate()
            if winRatio >= self.maxWinRatio:  # changed > to >=
                print("New best policy with win ratio: {}".format(winRatio))
                self.maxWinRatio = winRatio
                # update the best model
                self.policyValueNet.saveModel(bestModelSavedPath)
                if self.maxWinRatio == 1.0 and self.pureMctsPlayoutTimes < self.maxPureMctsPlayoutTimes:
                    self.pureMctsPlayoutTimes += self.pureMctsPlayoutTimesAddend
                    self.maxWinRatio = 0.0
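The generateEquivalentData docstring claims that rotations plus horizontal flips yield 8 equivalent samples per position. A standalone sketch (a toy 4x4 feature plane, independent of the classes above) confirming that the same rot90/fliplr loop produces the 8 symmetries of the square:

import numpy as np

state = np.arange(16).reshape(4, 4)  # a toy, asymmetric 4x4 feature plane
symmetries = []
for i in [1, 2, 3, 4]:
    rotated = np.rot90(state, i)           # rotate counterclockwise by i * 90 degrees
    symmetries.append(rotated)
    symmetries.append(np.fliplr(rotated))  # plus its horizontal mirror
# 4 rotations and their mirrors give the full dihedral group of the square: 8 distinct boards
assert len({arr.tobytes() for arr in symmetries}) == 8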
Example #8
class TrainPipeline():
    def __init__(self, config=None):
        # params of the board and the game
        self.config = config if config else Config()

        # Network wrapper
        self.policy_value_net = PolicyValueNet(self.config.board_width, self.config.board_height,
                                               net_params=self.config.policy_param,
                                               Network=self.config.network)

        # pass policy_value_net's predict method to the player; the neural network guides the MCTS search
        self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                           nplays=self.config.n_playout, is_selfplay=True)


    def self_play(self, n_games=1):
        """
        collect self-play data for training
        n_game: 自我对弈n_game局后,再更新网络
        """
        self.episode_len = 0
        self.augmented_len = 0
        for i in range(n_games):
            winner, play_data, episode_len = self.config.game.start_self_play_game(self.mcts_player, temp=self.config.temp)
            self.episode_len += episode_len  # episode_len is the number of moves in each game
            # augment the data
            play_data = self.augment_data(play_data)
            self.augmented_len += len(play_data)
            self.config.data_buffer.extend(play_data)

    def optimize(self, iteration):
        """update the policy-value net"""
        mini_batch = random.sample(self.config.data_buffer, self.config.batch_size)
        state_batch, mcts_probs_batch, winner_batch = list(zip(*mini_batch))

        if self.config.is_adjust_lr and iteration % self.config.adjust_lr_freq == 0:
            old_probs, old_v = self.policy_value_net.predict_many(state_batch) # used for adjusting lr

        for i in range(self.config.per_game_opt_times): # number of opt times
            loss_info = self.policy_value_net.fit(state_batch, mcts_probs_batch, winner_batch,
                                                      self.config.learn_rate * self.config.lr_multiplier)
        if self.config.is_adjust_lr and iteration % self.config.adjust_lr_freq == 0:
            # adaptively adjust the learning rate
            self.adjust_learning_rate(old_probs, old_v, state_batch, winner_batch)
            #self.adjust_learning_rate_2(iteration)

        print("combined loss:{0:.5f}, value loss:{1:.5f}, policy loss:{2:.5f}, entropy:{3:.5f}".
              format(loss_info['combined_loss'], loss_info['value_loss'], loss_info['policy_loss'], loss_info['entropy']))

        return loss_info


    def adjust_learning_rate(self, old_probs, old_v, state_batch, winner_batch):
        '''
        reference paper: PPO:Proximal Policy Optimization
        adjust learning rate based on KL
        '''
        new_probs, new_v = self.policy_value_net.predict_many(state_batch)
        kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))  # KL
        if kl > self.config.kl_targ * 2 and self.config.lr_multiplier > 0.1:  # KL increased: the new move probability distribution deviates a lot from the original one, which we don't want; possibly due to a too-large lr
            self.config.lr_multiplier /= 1.5
        elif kl < self.config.kl_targ / 2 and self.config.lr_multiplier < 10:  # KL decreased: learning is very stable and slow
            self.config.lr_multiplier *= 1.5

        explained_var_old = 1 - np.var(np.array(winner_batch) - old_v.flatten()) / np.var(np.array(winner_batch))
        explained_var_new = 1 - np.var(np.array(winner_batch) - new_v.flatten()) / np.var(np.array(winner_batch))

        print("kl:{:.5f},lr:{:.7f},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
                kl, self.config.learn_rate * self.config.lr_multiplier, explained_var_old, explained_var_new))


    def adjust_learning_rate_2(self, iteration):
        '''Decay-based adjustment.'''
        if (iteration+1) % self.config.lr_decay_per_iterations == 0:
            self.config.lr_multiplier /= self.config.lr_decay_speed
        print ("lr:{}".format(self.config.learn_rate * self.config.lr_multiplier))


    def evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                              nplays=self.config.n_playout)

        if self.config.evaluate_opponent == 'Pure':
            # opponent is rolloutplayer
            print("Begin evaluation, Opponent is RolloutMCTSPlayer")
            opponent_mcts_player = RolloutPlayer(c_puct=5, nplays=self.config.pure_mcts_playout_num)
        else:
            # opponent is AlphaZeroPlayer
            print("Begin evaluation, Opponent is AlphaZeroMCTSPlayer")
            opponent_mcts_player = load_current_best_player(self.config.cur_best_alphazero_store_filename)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            print ("evaluate game %d" %i)
            winner = self.config.game.start_game(current_mcts_player, opponent_mcts_player, who_first=i % 2, is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(self.config.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
                                                                  win_cnt[-1]))
        return win_ratio




    def augment_data(self, play_data):
        """
        augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]"""
        extend_data = []
        for state, mcts_porb, winner in play_data:
            '''
            state:
            3*3 board's moves like:
                6 7 8
                3 4 5
                0 1 2
            mcts_porb: flatten
            0,1,2,3,4,5,6,7,8
            winner
            1 or -1
            '''
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])  # i=4 reproduces the original data
                equi_mcts_prob = np.rot90(np.flipud(mcts_porb.reshape(self.config.board_height, self.config.board_width)),
                                          i)  # flip vertically into board shape; each cell's value is the probability of playing at that position
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])  # horizontal flip
                equi_mcts_prob = np.fliplr(equi_mcts_prob)  # keep equi_mcts_prob consistent with equi_state
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data


    def save_model(self, win_ratio, epochs):
        # save model if necessary
        # if opponent is Rollout Player, then win_ratio > best_win_pure_so_far
        # if opponent is the Strongest Rollout Player, then win_ratio must be 1.0
        # else win_ratio >= win_ratio_alphazero
        if (self.config.evaluate_opponent == 'Pure' and win_ratio > self.config.best_win_pure_so_far) or \
                (self.config.evaluate_opponent == 'Pure' and self.config.pure_mcts_playout_num == 5000 and win_ratio == 1.0) or \
                (self.config.evaluate_opponent == 'AlphaZero' and win_ratio >= self.config.win_ratio_alphazero):

            print("New best policy!!!!!!!!")
            # load network parameters
            self.config.policy_param = self.policy_value_net.get_policy_param()  # get model params

            self.config.cur_best_alphazero_store_filename = "tmp/epochs-{0}-opponent-{1}-win-{2:.2f}.pkl".format(epochs,
                                                                                                                 self.config.evaluate_opponent,
                                                                                                                 win_ratio)
            pickle.dump(self.config, open(self.config.cur_best_alphazero_store_filename, 'wb'))
            pickle.dump(self.config, open(self.config.local_model_path + self.config.cur_best_alphazero_store_filename, 'wb'))


        #---------------Adjust Opponent---------------------#
        # First, make the Rollout opponent stronger (increase pure_mcts_playout_num).
        # Second, when the RolloutPlayer is already at its strongest (mcts_num=5000) but still loses
        # self.config.change_opponent_continuous_times times in a row, switch the opponent to the AlphaZero player.

        # if opponent is RolloutPlayer, Then make it Stronger!!
        if self.config.evaluate_opponent =='Pure' and win_ratio > self.config.best_win_pure_so_far:
            if win_ratio == 1.0 and self.config.pure_mcts_playout_num < 5000:
                self.config.pure_mcts_playout_num += 1000  # stronger
                self.config.best_win_pure_so_far = 0.0 # reset win_ratio

        # current model continuously win(or tie) against the strongest pure mcts player(mcts_play_out>=5000)
        if self.config.evaluate_opponent == 'Pure' and self.config.pure_mcts_playout_num >= 5000 \
                and win_ratio == 1.0: # note: add equal
            self.config.continuous_win_pure_times += 1


        # change the opponent
        if self.config.evaluate_opponent == 'Pure' and \
                self.config.continuous_win_pure_times  >= self.config.change_opponent_continuous_times:
            print ('Change Opponent:AlphaZero')
            self.config.evaluate_opponent = 'AlphaZero'


    def check_loss_change(self):
        '''
        Check the loss change every self.config.check_freq steps.
        Record the current minimum of [the mean loss over each self.config.check_freq-step window].
        If that windowed mean loss fails to drop below the current minimum
        self.config.adjust_lr_increase_loss_times times, divide learn_rate by 10.
        '''
        combined_loss_list = [loss['combined_loss'] for loss in self.config.loss_records]
        last_check_freq_mean_loss = np.mean(combined_loss_list[-self.config.check_freq:])
        if self.config.min_mean_loss_every_check_freq is None or \
                last_check_freq_mean_loss < self.config.min_mean_loss_every_check_freq:
            if self.config.min_mean_loss_every_check_freq is not None:
                print('decrease loss by {0:.4f}'.format(self.config.min_mean_loss_every_check_freq-last_check_freq_mean_loss))
            self.config.min_mean_loss_every_check_freq = last_check_freq_mean_loss # update
            self.config.increase_mean_loss_times = 0 # reset to zero
        else:
            print('increase loss by {0:.4f}'.format(last_check_freq_mean_loss-self.config.min_mean_loss_every_check_freq))
            self.config.increase_mean_loss_times += 1

        if self.config.increase_mean_loss_times >= self.config.adjust_lr_increase_loss_times:
            self.config.learn_rate /= 10  # decrease init lr by a factor of 10
            self.config.kl_targ /= 10  # decrease kl_targ so that the lr tends to stay smaller
            #self.config.increase_mean_loss_times = 0 # reset again
            print('decreased init lr by a factor of 10, now init lr is {0:.5f}'.format(self.config.learn_rate))



    def run(self):
        """run the training pipeline"""
        print ("start training from game:{}".format(self.config.start_game_num))
        try:
            for i in range(self.config.start_game_num, self.config.game_batch_num):

                self.self_play(self.config.play_batch_size) # big step 1
                print("iteration i:{}, episode_len:{}, augmented_len:{}, current_buffer_len:{}".format(i + 1,
                                                            self.episode_len, self.augmented_len, len(self.config.data_buffer)))
                # newly added parameter, so check for old config files
                if not hasattr(self.config, "episode_records"):
                    setattr(self.config, "episode_records", [])
                self.config.episode_records.append(self.episode_len)

                if len(self.config.data_buffer) > self.config.batch_size:
                    loss_info = self.optimize(iteration=i+1) # big step 2
                    self.config.loss_records.append(loss_info)

                self.config.start_game_num = i + 1  # update for restart

                # check the performance of the current model,and save the model params
                if (i + 1) % self.config.check_freq == 0:
                    print("current iteration: {}".format(i + 1))
                    win_ratio = self.evaluate() #big step 3
                    self.check_loss_change() # check loss, and adjust init lr if necessary
                    self.save_model(win_ratio, i + 1)



        except KeyboardInterrupt:
            print('\n\rquit')
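adjust_learning_rate keys the step size off the mean KL divergence between the policy outputs before and after the update, as in the PPO paper it cites: shrink the learning rate when the policy moved too far, grow it when it barely moved. A self-contained numpy sketch of that rule, using kl_targ = 0.025 as in the configs above; the two probability arrays are made-up placeholders:

import numpy as np

kl_targ, lr_multiplier = 0.025, 1.0
old_probs = np.array([[0.5, 0.3, 0.2], [0.6, 0.2, 0.2]])      # policy before the update (placeholder)
new_probs = np.array([[0.45, 0.35, 0.2], [0.55, 0.25, 0.2]])  # policy after the update (placeholder)

# mean KL(old || new) over the batch, the same expression used in the pipelines above
kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))

if kl > kl_targ * 2 and lr_multiplier > 0.1:    # the update moved the policy too far: cool down
    lr_multiplier /= 1.5
elif kl < kl_targ / 2 and lr_multiplier < 10:   # the update barely moved the policy: speed up
    lr_multiplier *= 1.5
print("kl:{:.5f}, lr_multiplier:{:.3f}".format(kl, lr_multiplier))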
Example #9
class TrainPipeline():
    def __init__(self):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = ShogiBoard()
        # training params
        self.learn_rate = 5e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.025
        self.check_freq = 50
        self.game_batch_num = 3000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        # start training from a given policy-value net
        #        policy_param = pickle.load(open('current_policy.model', 'rb'))
        #        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, stepCounts, play_data = start_self_play(self.board,
                                                            self.mcts_player,
                                                            temp=self.temp)
            print("train-collect_selfplay_data: winner = %d" % winner)
            self.episode_len = stepCounts
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = 1 - np.var(
            np.array(winner_batch) - old_v.flatten()) / np.var(
                np.array(winner_batch))
        explained_var_new = 1 - np.var(
            np.array(winner_batch) - new_v.flatten()) / np.var(
                np.array(winner_batch))
        print(
            "kl:{:.5f},lr_multiplier:{:.3f},loss:{},entropy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}"
            .format(kl, self.lr_multiplier, loss, entropy, explained_var_old,
                    explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print("train-policy_evaluate: game = %d" % (i))
            winner = start_play(self.board,
                                current_mcts_player,
                                pure_mcts_player,
                                startPlayer=i % 2)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            curTime = datetime.datetime.now()
            writeTrainingLog("train-run: {}".format(curTime))
            for i in range(self.game_batch_num):
                print("train-run: train round %d" % i)
                writeTrainingLog("train-run: train round %d" % i)
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                writeTrainingLog("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                print("train-run: len of data_buffer = {}".format(
                    len(self.data_buffer)))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model,and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    writeTrainingLog("current self-play batch: {}".format(i +
                                                                          1))
                    win_ratio = self.policy_evaluate()
                    net_params = self.policy_value_net.get_policy_param(
                    )  # get model params
                    pickle.dump(net_params,
                                open('current_policy.model',
                                     'wb'))  # save model param to file
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        writeTrainingLog("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        pickle.dump(net_params,
                                    open('best_policy.model',
                                         'wb'))  # update the best_policy
                        if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
            writeTrainingLog('\n\rquit')
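policy_update also reports an explained-variance score for the value head before and after the update: 1 - Var(z - v) / Var(z), which approaches 1 when the value predictions v match the game outcomes z and drops toward 0 (or below) when they explain nothing. A tiny standalone check with toy numbers, not taken from a real run:

import numpy as np

winner_batch = np.array([1.0, -1.0, 1.0, 1.0, -1.0])  # game outcomes z (toy values)
value_preds = np.array([0.8, -0.6, 0.5, 0.9, -0.7])   # value-head outputs v (toy values)

explained_var = 1 - np.var(winner_batch - value_preds) / np.var(winner_batch)
print("explained_var: {:.3f}".format(explained_var))  # ~0.89 here: the value head tracks the outcomes well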