def get_board(game):
    board = Board()
    board.init_board(0)
    for move in game.moves:
        board.do_move(move.location)
    return board
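# Hypothetical usage sketch for get_board: any object exposing a `moves`
# list whose items carry a flat-index `location` will do. SimpleNamespace
# stands in for the real stored-game records, which are assumptions here.
from types import SimpleNamespace

stored_game = SimpleNamespace(moves=[SimpleNamespace(location=112),
                                     SimpleNamespace(location=113)])
replayed = get_board(stored_game)  # a fresh Board with both moves applied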
class GameStrategy_MZhang():
    def __init__(self, startplayer=0):
        model_file = 'models/resnet/output318/current_policy.model'
        policy_param = None
        self.height = 15
        self.width = 15
        '''if model_file is not None:
            print('loading...', model_file)
            try:
                policy_param = pickle.load(open(model_file, 'rb'))
            except:
                policy_param = pickle.load(open(model_file, 'rb'),
                                           encoding='bytes')'''
        policy_value_net = PolicyValueNet(self.height, self.width,
                                          model_file=model_file,
                                          output='output/')
        self.mcts_player = MCTSPlayer(policy_value_net.policy_value_fn,
                                      c_puct=1, n_playout=1000)
        self.board = Board(width=self.width, height=self.height, n_in_row=5)
        self.board.init_board(startplayer)
        self.game = Game(self.board)
        p1, p2 = self.board.players
        print('players:', p1, p2)
        self.mcts_player.set_player_ind(p1)

    def play_one_piece(self, user, gameboard):
        print('user:', user, 'gameboard:', gameboard.move_history)
        lastm = gameboard.get_lastmove()
        if lastm[0] != -1:
            # apply the opponent's last move to our internal board
            usr, n, row, col = lastm
            # square board, so height doubles as the row stride
            mv = (self.height - row - 1) * self.height + col
            # if not self.board.states.has_key(mv):
            self.board.do_move(mv)
            print('board:', self.board.states.items())
        move = self.mcts_player.get_action(self.board)
        self.board.do_move(move)
        self.game.graphic(self.board, *self.board.players)
        outmv = (self.height - move // self.height - 1, move % self.width)
        return outmv
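# Sketch of the move encoding assumed by play_one_piece above: the engine
# numbers squares bottom-up (move = row_from_bottom * width + col) while the
# GUI reports rows top-down, hence the (height - row - 1) flip in both
# directions. Pure arithmetic, shown here for a 15x15 board:
height = width = 15
row, col = 3, 7                        # GUI coordinates, row 0 at the top
mv = (height - row - 1) * width + col  # engine's flat index (11 * 15 + 7 = 172)
assert (height - mv // width - 1, mv % width) == (row, col)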
class RL_QG_agent(object):
    def __init__(self):
        self.temp = 1e-3  # the temperature param
        self.n_playout = 200  # num of simulations for each move
        self.c_puct = 5
        self.board_width = 8
        self.board_height = 8
        self.model_path = os.path.join("./models/curr_model_100rollout.pt")
        # self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params=None)
        # self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
        #                               c_puct=self.c_puct, n_playout=self.n_playout)
        self.mcts_player = MCTS_Pure(c_puct=5, n_playout=self.n_playout)
        self.env = gym.make("Reversi8x8-v0")
        self.init_model()
        # self.load_model()

    def init_model(self):
        self.board = Board(env=self.env, width=self.board_width,
                           height=self.board_height)
        self.board.init_board()
        self.game = Game(self.board)
        self.have_step = False

    def place(self, state, enables, player=None):
        # Plane 2 of `state` marks empty squares; compare it against our own
        # bookkeeping to work out what happened since our last move.
        curr_state = bit_to_board(self.board.black, self.board.white)
        curr_state = 1 - (curr_state[0] + curr_state[1])
        reverse_change = np.where((curr_state - state[2]) == -1)
        if self.have_step == False:
            pass
        elif reverse_change[0].shape[0] > 1:
            # more than one square reverted to empty: a new game has started
            self.board.init_board()
            self.have_step = False
            curr_state = bit_to_board(self.board.black, self.board.white)
            curr_state = 1 - (curr_state[0] + curr_state[1])
        change = np.where((curr_state - state[2]) == 1)
        if change[0].shape[0] == 1:
            # exactly one square was filled: that is the opponent's move
            action = change[0][0] * self.board_width + change[1][0]
            self.board.do_move(action)
        else:
            if self.have_step == False:
                pass
            else:
                action = 65  # the pass move in this env's action encoding
                self.board.do_move(action)
        move = self.mcts_player.get_action(self.board)
        self.board.do_move(move)
        self.have_step = True
        return move

    def load_model(self):
        self.policy_value_net.policy_value_net.load_state_dict(
            torch.load(self.model_path))
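# Worked sketch of the plane-diff trick used in place() above: the agent keeps
# its own "empty squares" plane and compares it with the env's empty plane
# (state[2]); exactly one cell dropping from 1 to 0 pinpoints the opponent's
# move. Pure numpy, and all concrete values here are illustrative:
import numpy as np

tracked_empty = np.ones((8, 8))
tracked_empty[3, 4] = 0                  # a square we already know is occupied
env_empty = tracked_empty.copy()
env_empty[2, 4] = 0                      # the opponent just filled (2, 4)
change = np.where((tracked_empty - env_empty) == 1)
action = change[0][0] * 8 + change[1][0]
assert action == 2 * 8 + 4               # flat index 20, as place() would compute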
def player_moved():
    receive_data = request.get_json()
    print(receive_data)
    board = Board(width=9, height=9, n_in_row=5)
    board.init_board(0)
    states_loc = receive_data['states_loc']
    if states_loc is not None:
        board.states_loc = states_loc
        board.states_loc_to_states()

    # receive the position of the stone the player just placed
    player_loc = receive_data['player_moved']
    player_move = board.location_to_move(player_loc)
    board.do_move(player_move)
    board.set_forbidden()  # update the forbidden (renju) points
    print(np.array(board.states_loc))
    print(board.states)

    # check for game end (did the player win?)
    end, winner = board.game_end()
    if end:
        if winner == -1:
            message = "tie"
        else:
            message = winner
        data = {
            'ai_moved': None,
            'forbidden': board.forbidden_locations,
            'message': message
        }
        return jsonify(data)

    # reply with the AI's next move;
    # load the player that matches the requested difficulty
    hard_idx = receive_data['hard_idx']
    hards = [2500, 5000, 7500, 10000, 12500, 15000, 17500, 20000]
    model_file = f'./model/policy_9_{hards[hard_idx]}.model'
    policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')
    best_policy = PolicyValueNetNumpy(9, 9, policy_param)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=400)
    ai_move = mcts_player.get_action(board)
    ai_loc = board.move_to_location(ai_move)
    board.do_move(ai_move)
    board.set_forbidden()  # update the forbidden (renju) points
    print(np.array(board.states_loc))

    # check for game end (did the AI win?)
    message = None
    end, winner = board.game_end()
    if end:
        if winner == -1:
            message = "tie"
        else:
            message = winner
    data = {
        'ai_moved': list(map(int, ai_loc)),
        'states_loc': board.states_loc,
        'forbidden': board.forbidden_locations,
        'message': message
    }
    return jsonify(data)
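# Hedged client-side sketch of the JSON contract handled above. The field
# names come straight from the handler; the route path and port are assumptions:
import requests

payload = {
    'states_loc': None,      # or the 9x9 nested-list board from the previous response
    'player_moved': [4, 4],  # (row, col) the human just played
    'hard_idx': 3,           # index into the difficulty table above (the 10000-game model)
}
resp = requests.post('http://localhost:5000/player_moved', json=payload)
print(resp.json()['ai_moved'], resp.json()['forbidden'], resp.json()['message'])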
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        self.manual = Manual(self.board)
        # training params
        self.learn_rate = 1e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 100  # num of simulations for each move
        self.c_puct = 1
        self.buffer_size = 100000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.episode_len = 0
        self.kl_targ = 0.02
        self.check_freq = 1
        self.game_batch_num = 5
        self.best_win_ratio = 0.55
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        self.lock = threading.Lock()
        if init_model:
            # start training from an initial policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file=init_model,
                                                       graph=self.g1,
                                                       output='/data/data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(self.board_width,
                                                             self.board_height,
                                                             model_file=init_model,
                                                             graph=self.g2,
                                                             output='/data/output/')
        else:
            # start training from a new policy-value net
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       graph=self.g1,
                                                       output='./data/')
            # tf.reset_default_graph()
            self.g2 = tf.Graph()
            with self.g2.as_default():
                self.policy_value_net_train = PolicyValueNet(self.board_width,
                                                             self.board_height,
                                                             graph=self.g2,
                                                             output='./output/')
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(np.flipud(
                    mcts_prob.reshape(self.board_height, self.board_width)), i)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            # self.lock.acquire()
            # print("game {}".format(i))
            with self.g1.as_default():
                '''mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                            c_puct=self.c_puct,
                                            n_playout=self.n_playout,
                                            is_selfplay=1)
                board = Board(width=self.board_width,
                              height=self.board_height,
                              n_in_row=self.n_in_row)
                game = Game(board)'''
                winner, play_data = self.game.start_self_play(self.mcts_player,
                                                              is_shown=0,
                                                              temp=self.temp)
            # self.lock.release()
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)
            # print("self play end...")

    def collect_manual_data(self, file):
        winner, play_data = self.manual.read_manual_data(file)
        # reading the chess manual failed
        if winner == 0:
            return
        play_data = list(play_data)[:]
        self.episode_len = len(play_data)
        # augment the data
        play_data = self.get_equi_data(play_data)
        self.data_buffer.extend(play_data)

    def collect_test_data(self):
        self.board.init_board()
        states, mcts_probs, current_players = [], [], []
        move = 128
        self.board.do_move(112)
        states.append(self.board.current_state())
        probs = np.zeros(self.board.width * self.board.height)
        probs[[move]] = 1
        mcts_probs.append(probs)
        current_players.append(self.board.current_player)
        winners_z = np.array([1])
        play_data = zip(states, mcts_probs, winners_z)
        play_data = list(play_data)[:]
        self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        with self.g2.as_default():
            for i in range(self.epochs):
                loss, entropy = self.policy_value_net_train.train_step(
                    state_batch,
                    mcts_probs_batch,
                    winner_batch,
                    self.learn_rate * self.lr_multiplier)
        print(("lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},").format(self.lr_multiplier, loss, entropy))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the current best player
        Note: this is only for monitoring the progress of training
        """
        print("evaluating...")
        current_mcts_player = MCTSPlayer(self.policy_value_net_train.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.pure_mcts_playout_num)
        best_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          best_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        # save the current model
        self.policy_value_net_train.save_model('/data/output/current_policy.model')
        if win_ratio > self.best_win_ratio:
            print("New best policy!!!!!!!!")
            # update the best policy
            self.policy_value_net_train.save_model('/data/output/best_policy.model')
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file='/data/output/best_policy.model',
                                                       graph=self.g1,
                                                       output='/data/data/')
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            '''coord = tf.train.Coordinator()
            self_play = [threading.Thread(target=self.collect_selfplay_data,
                                          args=(self.play_batch_size,))
                         for i in range(4)]
            for sp in self_play:
                sp.start()
            coord.join(self_play)
            while len(self.data_buffer) < self.batch_size:
                print(len(self.data_buffer))
                time.sleep(3)
                pass'''
            multiplier = [0.1, 0.1, 0.01, 0.01, 0.01]
            step = 0
            for n in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                # self.collect_test_data()
                self.policy_value_net.n_step += 1
                print("batch i:{}, episode_len:{}".format(
                    self.policy_value_net.n_step, self.episode_len))
                # optimisation
                if len(self.data_buffer) > self.batch_size:
                    for i in range(100):
                        self.policy_update()
                # evaluation
                if self.policy_value_net.n_step % self.check_freq == 0:
                    # self.lr_multiplier = multiplier[step]
                    # step += 1
                    self.mcts_player.mcts._discount = 1 - 0.98 * (1 - self.mcts_player.mcts._discount)
                    print("current self-play batch: {}, discount: {}".format(
                        self.policy_value_net.n_step,
                        self.mcts_player.mcts._discount))
                    # self.lock.acquire()
                    self.policy_evaluate(n_games=15)
                    # self.lock.release()
        except KeyboardInterrupt:
            print('\n\rquit')
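# Assumed entry point for the pipeline above (not part of the original
# snippet; mirrors the conventional AlphaZero-Gomoku training script):
if __name__ == '__main__':
    training_pipeline = TrainPipeline(init_model=None)
    training_pipeline.run()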
class GoBang(QWidget):
    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        self.tup = (None, None)
        self.board = Board()  # the board object
        self.board.init_board(1)
        palette1 = QPalette()  # set the board background
        palette1.setBrush(self.backgroundRole(),
                          QtGui.QBrush(QtGui.QPixmap('img/linesofaction.png')))
        self.setPalette(palette1)
        # self.setStyleSheet("board-image:url(img/chessboard.jpg)")  # not sure why this doesn't work
        self.setCursor(Qt.PointingHandCursor)  # turn the cursor into a pointing hand
        # self.sound_piece = QSound("sound/luozi.wav")  # load the stone-placing sound effect
        # self.sound_win = QSound("sound/win.wav")  # load the victory sound effect
        # self.sound_defeated = QSound("sound/defeated.wav")  # load the defeat sound effect
        self.resize(WIDTH, HEIGHT)  # fixed size 540*540
        self.setMinimumSize(QtCore.QSize(WIDTH, HEIGHT))
        self.setMaximumSize(QtCore.QSize(WIDTH, HEIGHT))
        self.setWindowTitle("Lines-Of-Action")  # window title
        self.setWindowIcon(QIcon('img/black.png'))  # window icon
        # self.lb1 = QLabel(' ', self)
        # self.lb1.move(20, 10)
        self.black = QPixmap('img/black.png')
        self.white = QPixmap('img/white.png')
        self.piece_now = BLACK  # black moves first
        self.my_turn = True  # the human player moves first
        self.step = 0  # move count
        self.x, self.y = 1000, 1000
        # self.mouse_point = LaBel(self)  # replace the mouse cursor image with a stone
        # self.mouse_point.setScaledContents(True)
        # self.mouse_point.setPixmap(self.black)  # load the black stone image
        # self.mouse_point.setGeometry(270, 270, PIECE, PIECE)
        self.pieces = [[LaBel(self) for _ in range(8)]
                       for _ in range(8)]  # create piece labels for drawing stones on the board
        # for piece in self.pieces:
        #     piece.setVisible(True)  # make the image visible
        #     piece.setScaledContents(True)  # scale the image to the label size
        for i in range(8):
            for j in range(8):
                self.pieces[i][j].setVisible(True)
                self.pieces[i][j].setScaledContents(True)
        # self.mouse_point.raise_()  # keep the cursor on top
        # True once the AI has moved; used as a lock: while it is False the AI
        # is still thinking, so player clicks in mousePressEvent are ignored.
        self.ai_down = True
        self.setMouseTracking(True)
        self.DrawPieces()
        self.show()

    def DrawPieces(self):
        for i in range(8):
            for j in range(8):
                x, y = self.coordinate_transform_map2pixel(i, j)
                if self.board.map[i][j] == -1:
                    self.pieces[i][j].setPixmap(self.black)
                elif self.board.map[i][j] == 1:
                    self.pieces[i][j].setPixmap(self.white)
                else:
                    self.pieces[i][j].setPixmap(QPixmap(""))
                self.pieces[i][j].setGeometry(x, y, PIECE, PIECE)

    def paintEvent(self, event):  # draw the indicator arrow
        qp = QPainter()
        qp.begin(self)
        self.drawLines(qp)
        qp.end()

    def mouseMoveEvent(self, e):  # the black stone follows the mouse
        # self.lb1.setText(str(e.x()) + ' ' + str(e.y()))
        # self.mouse_point.move(e.x() - 16, e.y() - 16)
        e.accept()

    def mousePressEvent(self, e):  # the player makes a move
        if e.button() == Qt.LeftButton and self.ai_down:
            x, y = e.x(), e.y()  # mouse coordinates
            i, j = self.coordinate_transform_pixel2map(x, y)  # corresponding board coordinates
            if i is not None and j is not None:  # the click lands on the board, excluding the edges
                new_x, new_y = self.coordinate_transform_map2pixel(i, j)
                if self.board.map[i][j] == -1:  # the human player plays black
                    self.board.current_player = 1
                    # self.draw(i, j)
                    print(self.tup)
                    self.tup = (i, j)
                else:
                    (old_i, old_j) = self.tup
                    if old_i is None or old_j is None:
                        return
                    moves = self.board.get_available(1)
                    my_move = str(old_i) + str(old_j) + str(i) + str(j)
                    print("human's legal moves")
                    print(moves)
                    print("current board (human to move)")
                    print(self.board.map)
                    if my_move in moves:
                        self.board.do_move(my_move)
                        self.pieces[old_i][old_j].setPixmap(QPixmap(""))
                        self.pieces[i][j].setPixmap(self.black)
                        self.pieces[i][j].setGeometry(new_x, new_y, PIECE, PIECE)
                        end, winner = self.board.game_end()
                        if end:
                            self.gameover(winner)
                        if self.board.current_player == 2:
                            self.ai_down = False
                            board = self.board
                            self.AI = AI(board)  # create the worker thread, passing in the board
                            self.AI.finishSignal.connect(self.AI_draw)  # emit the move when the thread finishes
                            self.AI.start()
                    else:
                        print("error move")

    def AI_draw(self, i, j, nxt_i, nxt_j):
        print(i, j, nxt_i, nxt_j)
        self.pieces[i][j].setPixmap(QPixmap(""))
        self.pieces[nxt_i][nxt_j].setPixmap(self.white)  # the AI plays white
        x, y = self.coordinate_transform_map2pixel(nxt_i, nxt_j)
        self.pieces[nxt_i][nxt_j].setGeometry(x, y, PIECE, PIECE)
        end, winner = self.board.game_end()
        if end:
            self.gameover(winner)
        self.ai_down = True
        self.update()

    def drawLines(self, qp):  # point at the stone the AI just played
        if self.step != 0:
            pen = QtGui.QPen(QtCore.Qt.black, 2, QtCore.Qt.SolidLine)
            qp.setPen(pen)
            qp.drawLine(self.x - 5, self.y - 5, self.x + 3, self.y + 3)
            qp.drawLine(self.x + 3, self.y, self.x + 3, self.y + 3)
            qp.drawLine(self.x, self.y + 3, self.x + 3, self.y + 3)

    def coordinate_transform_map2pixel(self, i, j):
        # convert logical coordinates in chessMap to drawing coordinates in the UI
        return MARGIN + j * GRID - PIECE / 2, MARGIN + i * GRID - PIECE / 2

    def coordinate_transform_pixel2map(self, x, y):
        # convert drawing coordinates in the UI to logical coordinates in chessMap
        i, j = int(round((y - MARGIN) / GRID)), int(round((x - MARGIN) / GRID))
        # the MARGIN keeps edge clicks from pushing i, j out of range (8x8 board)
        if i < 0 or i >= 8 or j < 0 or j >= 8:
            return None, None
        else:
            return i, j

    def gameover(self, winner):
        if winner == 1:
            # self.sound_win.play()
            reply = QMessageBox.question(self, 'You Win!', 'Continue?',
                                         QMessageBox.Yes | QMessageBox.No,
                                         QMessageBox.No)
        elif winner == 2:
            # self.sound_defeated.play()
            reply = QMessageBox.question(self, 'You Lost!', 'Continue?',
                                         QMessageBox.Yes | QMessageBox.No,
                                         QMessageBox.No)
        else:
            reply = QMessageBox.question(self, 'Tie', 'Continue?',
                                         QMessageBox.Yes | QMessageBox.No,
                                         QMessageBox.No)
        if reply == QMessageBox.Yes:  # reset
            # self.piece_now = BLACK
            # self.mouse_point.setPixmap(self.black)
            # self.step = 0
            # for piece in self.pieces:
            #     piece.clear()
            # self.chessboard.reset()
            self.board.init_board(1)
            self.ai_down = True
            self.board.current_player = 1
            self.DrawPieces()
            self.update()
        else:
            self.close()
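# Assumed PyQt5 application entry point for the widget above (standard
# boilerplate, not present in the original snippet):
if __name__ == '__main__':
    import sys
    from PyQt5.QtWidgets import QApplication

    app = QApplication(sys.argv)
    ex = GoBang()
    sys.exit(app.exec_())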