def feature(board, from_, action):
    """
    Board features from the current player's perspective.
    :param board: the board
    :param from_: location of the piece to move
    :param action: which direction to move
    :return: feature planes for this action (5x5xN)
    """
    player = board[from_]
    to_ = tuple(np.add(from_, rule.actions_move[action]))
    # Board planes: empty squares, own pieces, opponent pieces
    space = (board == 0).astype(np.int8).reshape((5, 5, 1))
    self_ = (board == player).astype(np.int8).reshape((5, 5, 1))
    opponent = (board == -player).astype(np.int8).reshape((5, 5, 1))
    # Action planes: source and destination of the move
    from_location = np.zeros((5, 5, 1))
    from_location[from_] = 1
    to_location = np.zeros((5, 5, 1))
    to_location[to_] = 1
    # Board planes after the move has been played
    board = board.copy()
    result, _ = rule.move(board, from_, to_)
    space2 = (board == 0).astype(np.int8).reshape((5, 5, 1))
    self2 = (board == player).astype(np.int8).reshape((5, 5, 1))
    opponent2 = (board == -player).astype(np.int8).reshape((5, 5, 1))
    # Whether the move wins the game
    is_win = np.ones((5, 5, 1)) if result == rule.WIN else np.zeros((5, 5, 1))
    # Bias plane
    bias = np.ones((5, 5, 1))
    return np.concatenate((space, self_, opponent, from_location, to_location,
                           space2, self2, opponent2, is_win, bias), axis=2)

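# A minimal usage sketch (not part of the original code): it assumes
# rule.valid_actions(board, player) returns (from_, action) pairs, as in the
# simulate() functions below, and stacks feature() planes for every legal
# action into one batch for the network. The helper name is illustrative.
def action_features(board, player):
    # Build an (N, 5, 5, 10) batch, one 10-plane feature stack per legal action.
    batch = [feature(board, from_, action)
             for from_, action in rule.valid_actions(board, player)]
    return np.stack(batch) if batch else np.zeros((0, 5, 5, 10))
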
def move_to(self, stone, to_loc):
    """
    Move the piece to to_loc and handle any captures.
    :param stone: the piece to move
    :param to_loc: target location
    :return: the move result (e.g. rule.ACCQUIRE or rule.WIN)
    """
    old_board = self.board()
    from_ = stone.loc
    result, del_stone_loc = rule.move(self.board(), stone.loc, to_loc)
    if result == rule.ACCQUIRE or result == rule.WIN:
        self.move_to_loc(stone, to_loc)
        # Remove every captured piece from the board
        for loc in del_stone_loc:
            self.del_stone(self.stone(loc))
        logger.info('from %s to %s, result:%s, del:%s', from_, to_loc, result, del_stone_loc)
        action = rule.actions_move.index(tuple(np.subtract(to_loc, from_)))
        logger.debug('action is: %s', action)
        self.record.add(old_board, from_, action, len(del_stone_loc), None,
                        win=(result == rule.WIN))
    return result

def play(self, board):
    logger.info('%s play...', self.name)
    board_self = rule.flip_board(board) if self.stone_val == -1 else board.copy()
    from_, action, vp, p = self.play_process.predict(board_self, self.stone_val)
    to_ = tuple(np.add(from_, rule.actions_move[action]))
    if self.stone_val == -1:
        from_ = rule.flip_location(from_)
        to_ = rule.flip_location(to_)
        # vp = rule.flip_action_probs(vp)
        p = rule.flip_action_probs(p)
    logger.info('from %s to %s', from_, to_)
    rule.move(board, from_, to_)
    opp_q_table = self.predict_opponent(board)
    logger.debug(opp_q_table)
    self.play_func(self.stone_val, from_, to_, p, opp_q_table)

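# The probability table p handed to play_func above uses the same 5x5x4 layout
# that the Q-value variant below builds explicitly. A sketch of that
# construction, assuming the policy head yields per-(from_, action)
# probabilities; the helper name is illustrative, not part of the project.
def probs_to_table(valid, probs):
    # Scatter per-action probabilities into a 5x5x4 table (one slice per
    # move direction), the layout that flip_action_probs operates on.
    table = np.zeros((5, 5, 4))
    for (f, a), pr in zip(valid, probs):
        table[f][a] = pr
    return table
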
def simulate(self, ts, board, player):
    from record import Record
    from value_network import NoActionException
    records = Record()
    while True:
        try:
            bd = board.copy()
            board_str = util.board_str(board)
            valid_action = rule.valid_actions(board, player)
            # Pick an action that has not been predicted for this position yet,
            # pruning already-tried edges from the search-tree root.
            while True:
                (from_, act), q = self.epsilon_greedy(board, player, valid_action, ts)
                if (board_str, from_, act) not in self.predicts or len(ts.root.sub_edge) == 1:
                    break
                ts.root.sub_edge = [e for e in ts.root.sub_edge if e.a != (from_, act)]
                valid_action.remove((from_, act))
            assert board[from_] == player
            ts.move_down(board, player, action=(from_, act))
            if self.episode % 10 == 0:
                logger.info('action:%s,%s', from_, act)
                logger.info('q is %s', q)
            to_ = tuple(np.add(from_, rule.actions_move[act]))
            command, eat = rule.move(board, from_, to_)
            records.add3(bd, from_, act, len(eat), win=command == rule.WIN)
        except NoActionException:
            # One side has no legal move after random board initialization
            return Record(), 0
        except Exception as ex:
            logging.warning('board is:\n%s', board)
            logging.warning('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.warning('valid is:\n%s', valid)
            logging.warning('from_:%s, act:%s', from_, act)
            ts.show_info()
            records.save('records/train/1st_')
            raise ex
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s, epsilon:%s', str(player), records.length(), self.epsilon)
            return records, player
        if records.length() > 10000:
            logging.info('too many moves: %s', records.length())
            return Record(), 0
        player = -player
        board = rule.flip_board(board)

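# epsilon_greedy() is referenced above but not shown. A minimal sketch under
# stated assumptions: root edges expose an action attribute `a` (which does
# appear above) and a value estimate `q` (an assumption), and self.epsilon is
# the exploration rate. This is not the project's actual implementation.
def epsilon_greedy(self, board, player, valid_action, ts):
    if np.random.random() < self.epsilon:
        # Explore: pick a random legal action and report its edge value.
        from_, act = valid_action[np.random.randint(len(valid_action))]
        edge = next(e for e in ts.root.sub_edge if e.a == (from_, act))
        return (from_, act), edge.q
    # Exploit: pick the root edge with the highest value estimate.
    edge = max(ts.root.sub_edge, key=lambda e: e.q)
    return edge.a, edge.q
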
def play(self, board):
    logger.info('%s play...', self.name)
    board_self = rule.flip_board(board) if self.stone_val == -1 else board.copy()
    (from_, action), (valid, q) = self.play_process.predict(board_self, self.stone_val)
    logger.debug('valid is:%s', valid)
    logger.debug('q is:%s', q)
    logger.debug('from:%s, action:%s', from_, action)
    to_ = tuple(np.add(from_, rule.actions_move[action]))
    q_table = np.zeros((5, 5, 4))
    for (f, a), q1 in zip(valid, q):
        q_table[f][a] = q1
    if self.stone_val == -1:
        from_ = rule.flip_location(from_)
        to_ = rule.flip_location(to_)
        q_table = rule.flip_action_probs(q_table)
    logger.info('from %s to %s', from_, to_)
    rule.move(board, from_, to_)
    opp_q_table = self.predict_opponent(board)
    logger.debug(opp_q_table)
    self.play_func(self.stone_val, from_, to_, q_table, opp_q_table)

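# predict_opponent() is called by both play() methods but not shown. One
# plausible sketch, assuming the opponent can be scored through the same
# play_process.predict interface used above; the details are assumptions,
# not the project's actual implementation.
def predict_opponent(self, board):
    opp = -self.stone_val
    board_opp = rule.flip_board(board) if opp == -1 else board.copy()
    (_, _), (valid, q) = self.play_process.predict(board_opp, opp)
    # Collect the opponent's Q-values into the usual 5x5x4 table.
    q_table = np.zeros((5, 5, 4))
    for (f, a), q1 in zip(valid, q):
        q_table[f][a] = q1
    if opp == -1:
        q_table = rule.flip_action_probs(q_table)
    return q_table
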
def simulate(nw0, nw1, init='fixed'):
    board = rule.init_board() if init == 'fixed' else rule.random_init_board()
    player = 1
    records = Record()
    while True:
        nw = nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            from_, action, vp, p = nw.policy(board, player)
            # print('>', from_, action)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            records.add(bd, from_, action, reward, vp, win=command == rule.WIN)
        except NoActionException:
            return Record(), 0
        except Exception as e:
            logging.info('board is:')
            logging.info(board)
            logging.info('player is: %s', player)
            valid = rule.valid_action(board, player)
            logging.info('predict is:')
            print(nw.p)
            logging.info('sum is: %s', nw.p.sum())
            logging.info('valid action is:')
            logging.info(nw.valid)
            logging.info('p * valid is:')
            logging.info(nw.vp)
            logging.info('from:%s, action:%s', from_, action)
            logging.info('prob is: %s', valid[from_][action])
            records.save('records/train/1st_')
            raise e
        # if eat:
        #     print(player, from_, to_, eat, N)
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s', str(player), records.length())
            return records, player
        if records.length() > 10000:
            logging.info('too many moves: %s', records.length())
            return Record(), 0
        player = -player
        board = rule.flip_board(board)

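# A sketch of how this simulate() could be driven from a self-play training
# loop. nw.train(records) mirrors the calls made in the cycle handling of the
# variant below; the driver name and n_games parameter are illustrative.
def train_selfplay(nw0, nw1, n_games=1000):
    for i in range(n_games):
        records, winner = simulate(nw0, nw1, init='fixed')
        if records.length() == 0:
            # No-move start or over-long game: nothing to learn from.
            continue
        nw0.train(records)
        nw1.train(records)
        logging.info('game %s finished, winner: %s', i, winner)
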
def simulate(nw0, nw1, activation, init='fixed'):
    np.random.seed(util.rand_int32())
    player = 1 if np.random.random() > 0.5 else -1
    logger.info('init:%s, player:%s', init, player)
    board = rule.init_board(player) if init == 'fixed' else rule.random_init_board()
    records = Record()
    # full_records = Record()
    boards = set()  # {(board_str, player)}
    nws = [None, nw0, nw1]
    n_steps = 0
    while True:
        nw = nws[player]  # nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            board_str = util.board_str(board)
            if (board_str, player) in boards:
                # A repetition cycle was found: train its positions toward a
                # 0.5 (draw) target, then drop the cycle from the history.
                found = False
                records2 = Record()
                for i in range(len(records.records) - 1, -1, -1):
                    b, f, a, _, _ = records[i]
                    if (b == board).all() and b[f] == player:
                        found = True
                        break
                assert found, (board, player)
                records2.records = records.records[i:]
                records2.draw()
                nw0.train(records2)
                nw1.train(records2)
                # Remove the cycle's positions from the history
                records.records = records.records[:i]
                for b, f, a, _, _ in records2:
                    boards.remove((util.board_str(b), b[f]))
                logger.info('cycle:%s, records:%s, epsilon:%s',
                            len(records2), records.length(), nw.epsilon)
            boards.add((board_str, player))
            from_, action = nw.policy(board, player)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            if activation == 'sigmoid':
                records.add3(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add3(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'linear':
                records.add2(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add2(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'selu':
                records.add4(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add4(bd, from_, action, reward, win=command == rule.WIN)
            else:
                raise ValueError(activation)
            if command == rule.WIN:
                logging.info('%s WIN, stone:%s, step use: %s, epsilon:%s',
                             str(player), (board == player).sum(), records.length(), nw.epsilon)
                return records, player
            if n_steps - records.length() > 500:
                # Too many moves spent cycling: call the game a draw
                logging.info('too many cycling moves: %s', records.length())
                records.clear()
                return records, 0
            player = -player
            if init == 'fixed':
                board = rule.flip_board(board)
            n_steps += 1
        except NoActionException:
            # One side has no legal move after random board initialization
            return Record(), 0
        except Exception as e:
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.info('valid is:\n%s', valid)
            logging.info('predict is:\n%s', nw.q_value)
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('from:%s, action:%s', from_, action)
            records.save('records/train/1st_')
            raise e

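# The repetition handling above can be read as a standalone helper: walk the
# move history backwards to the earliest record whose position (board plus
# side to move) matches the current one; everything from that index onward is
# the cycle, which is trained toward the 0.5 draw target via records2.draw()
# and then dropped. A condensed sketch; the helper name is illustrative.
def find_cycle_start(records, board, player):
    for i in range(len(records.records) - 1, -1, -1):
        b, f, a, _, _ = records.records[i]
        if (b == board).all() and b[f] == player:
            return i
    return -1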