class TrainMode: """ 自博弈类 """ def __init__(self, game, nnet): """ :param game: 棋盘对象 :param nnet: 神经网络对象 """ self.num_white_win = 0 self.num_black_win = 0 self.args = args self.player = WHITE self.game = game self.nnet = nnet self.pnet = self.nnet.__class__(self.game) self.mcts = Mcts(self.game, self.nnet, self.args) self.batch = [] # 每次给NNet喂的数据量,但类型不对(多维列表) self.skipFirstSelfPlay = False # can be overriden in loadTrainExamples() # 调用NNet开始训练 def learn(self): for i in range(1, self.args.num_iter + 1): print('') print('#################################### IterNum: ' + str(i) + ' ####################################') print('下', self.args.num_play_game, '盘棋训练一次NNet') print('MCTS搜索模拟次数:', self.args.num_mcts_search) # 每次都执行 if not self.skipFirstSelfPlay or i > 1: # deque:双向队列 max_len:队列最大长度:self.args.max_len_queue # 每次训练的 [board, WHITE, pi] 数据 iter_train_data = deque([], maxlen=self.args.max_len_queue) # 下“num_play_game”盘棋训练一次NNet for j in range(self.args.num_play_game): # 重置搜索树 print("====================================== 第", j + 1, "盘棋 ======================================") self.mcts = Mcts(self.game, self.nnet, self.args) self.player = WHITE iter_train_data += self.play_one_game() # pboard.save_figure(j + 1) print('TrainMode.py-learn()', '白棋赢:', self.num_white_win, '盘;', '黑棋赢:', self.num_black_win, '盘') self.batch.append(iter_train_data) # 如果 训练数据 大于规定的训练长度,则将最旧的数据删除 if len(self.batch) > self.args.max_batch_size: print("len(max_batch_size) =", len(self.batch), " => remove the oldest batch") self.batch.pop(0) # 保存训练数据 self.save_train_examples(i) # 原batch是多维列表,此处标准化batch standard_batch = [] for e in self.batch: # extend() 在列表末尾一次性追加其他序列中多个元素 standard_batch.extend(e) # 打乱数据,是数据服从独立同分布(排除数据间的相关性) shuffle(standard_batch) print('NN训练的batch:', len(standard_batch), '条数据', ' TrainMode.py-learn()') # 这里保存的是一个temp也就是一直保存着最近一次的网络,这里是为了和最新的网络进行对弈 # 将临时网络保存,付给对抗网络 self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') pmcts = Mcts(self.game, self.pnet, self.args) # 开启训练 self.nnet.train(standard_batch) nmcts = Mcts(self.game, self.nnet, self.args) print('PITTING AGAINST PREVIOUS VERSION') # pwins, nwins, draws = 10, 100, 1 arena = UpdateNet(lambda x, player: np.argmax(pmcts.get_best_action(x, player)), lambda x, player: np.argmax(nmcts.get_best_action(x, player)), self.game) # 对抗、本网络赢的次数 和 平局 pwins, nwins, draws = arena.playGames(self.args.arenaCompare) print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws)) # 如果旧网路和新网路赢得和为0 或 新网络/ 新网络+旧网路 小于 更新阈值(0.55)则不更新,否则更新成新网络参数 if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold: print('REJECTING NEW MODEL') # 如果拒绝了新模型,这老模型就能发挥作用 self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') else: print('ACCEPTING NEW MODEL') # 保存当前模型并更新最新模型 self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='checkpoint_' + str(i) + '.pth.tar') self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar') # 完整下一盘游戏 def play_one_game(self): """ 使用Mcts完整下一盘棋 :return: 4 * [(board, pi, z)] : 返回四个训练数据元组:(棋盘,策略,输赢) """ # 每盘棋的 [board, WHITE, pi] 数据 one_game_train_data = [] board = self.game.get_init_board(self.game.board_size) play_step = 0 while True: play_step += 1 ts = time.time() print('---------------------------') print('第', play_step, '步') print(board) # pboard.print_board(board, play_step+1) self.mcts.episodeStep = play_step # 在Mcts中,始终以白棋视角选择 transformed_board = 
self.game.get_transformed_board(board, self.player) # 进行多次mcts搜索得出来概率(以白棋视角) self.mcts = Mcts(self.game, self.nnet, self.args) next_action, steps_train_data = self.mcts.get_best_action(transformed_board, self.player) one_game_train_data += steps_train_data te = time.time() if self.player == WHITE: print(" 白棋走:", next_action, '搜索:', int(te - ts), 's') else: print(" 黑棋走:", next_action, '搜索:', int(te - ts), 's') board, self.player = self.game.get_next_state(board, self.player, next_action) r = self.game.get_game_ended(board, self.player) if r != 0: # 胜负已分,r始终为 -1 if self.player == WHITE: print('白棋输') self.num_black_win += 1 else: print('黑棋输') self.num_white_win += 1 print("##### 终局 #####") print(board) # pboard.print_board(board, play_step+2) a = [(board, pi, r * ((-1) ** (player != self.player))) for board, player, pi in one_game_train_data] # print(len(a)) # for i in range(len(a) // 4): # print(4 * a[i][0], a[4 * i][2]) return a def save_train_examples(self, iteration): """ 保存训练数据(board, pi, v)为 @params iteration:迭代次数,保存数据为:checkpoint_iteration.pth.tar.examples """ # folder = args.checkpoint :'./temp/' folder = self.args.checkpoint if not os.path.exists(folder): os.makedirs(folder) file_name = os.path.join(folder, 'checkpoint_' + str(iteration) + '.pth.tar' + ".examples") with open(file_name, "wb+") as f: Pickler(f).dump(self.batch) f.closed def load_train_examples(self): """ 加载(board, pi, v)数据模型 """ # load_folder_file[0]:'/models/'; load_folder_file[1]:'best.pth.tar' model_file = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1]) examples_file = model_file + ".examples" if not os.path.isfile(examples_file): print(examples_file) r = input("File with trainExamples not found. Continue? [y|n]") if r != "y": sys.exit() else: print("File with trainExamples found. Read it.") with open(examples_file, "rb") as f: self.batch = Unpickler(f).load() f.closed self.skipFirstSelfPlay = True
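

# ---------------------------------------------------------------------------
# Worked example of the value-target sign flip at the end of play_one_game()
# (an illustrative sketch, not called by the training loop; the player
# encodings and board/pi placeholders below are assumptions).  get_game_ended()
# returns r = -1 from the perspective of self.player, the side to move at game
# end, which has just lost.  Each stored (board, player, pi) example therefore
# gets z = r * (-1) ** (player != final_player): the loser's positions get
# z = -1 and the winner's positions get z = +1.
def _value_target_sign_example():
    white, black = 1, -1                        # placeholder player encodings
    final_player, r = white, -1                 # white to move at the end -> white lost
    history = [('board0', white, 'pi0'), ('board1', black, 'pi1')]
    return [(b, pi, r * ((-1) ** (p != final_player))) for b, p, pi in history]
    # -> [('board0', 'pi0', -1), ('board1', 'pi1', 1)]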
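

# ---------------------------------------------------------------------------
# Worked example of the arena acceptance rule used in learn() (a sketch; the
# numbers are illustrative only).  With updateThreshold = 0.55, a result of
# nwins = 23, pwins = 15, draws = 2 gives 23 / (15 + 23) ≈ 0.605, so the new
# network is accepted; nwins = 20, pwins = 18 gives ≈ 0.526 and is rejected.
# Draws do not enter the ratio.
def _accepts_new_model(pwins, nwins, update_threshold=0.55):
    """Mirror of the acceptance condition in learn(), for illustration."""
    return (pwins + nwins) > 0 and float(nwins) / (pwins + nwins) >= update_threshold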
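

# ---------------------------------------------------------------------------
# Example driver (a minimal sketch, not part of the original training code).
# The hyperparameter names mirror the attributes this class reads from `args`;
# the values, the DotDict helper, and the Game/NNetWrapper constructors are
# illustrative assumptions and must be replaced with the project's own config
# and classes.
if __name__ == '__main__':
    class DotDict(dict):
        """Attribute-style access so e.g. args.num_iter works as used above."""
        __getattr__ = dict.get

    example_args = DotDict({
        'num_iter': 100,            # outer self-play / train / evaluate iterations
        'num_play_game': 25,        # self-play games per iteration
        'num_mcts_search': 200,     # MCTS simulations per move
        'max_len_queue': 200000,    # max examples kept per iteration
        'max_batch_size': 20,       # iterations of example history kept for training
        'updateThreshold': 0.55,    # new net must win at least 55% of decisive arena games
        'arenaCompare': 40,         # games played when pitting new vs. previous net
        'checkpoint': './temp/',
        'load_folder_file': ('/models/', 'best.pth.tar'),
    })

    # game = Game(8)                # hypothetical game constructor
    # nnet = NNetWrapper(game)      # hypothetical network wrapper
    # trainer = TrainMode(game, nnet)
    # trainer.learn()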