# Human-vs-AI loop. HumanUI is assumed to be defined elsewhere in the repo
# and to wrap an MCTS agent as `manager.ai`; BOARD_SIZE, HISTORY and GAME
# are module-level constants (see the configuration sketch after play()).
import numpy as np

from gomoku_env import GomokuEnv


def main():
    env = GomokuEnv(BOARD_SIZE, HISTORY)
    manager = HumanUI()
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(GAME):
        print('##### Game: {} #####'.format(g + 1))
        state, board = env.reset()
        done = False
        idx = 0
        while not done:
            env.render()
            # start simulations
            action = manager.get_action(state, board, idx)
            state, board, z, done = env.step(action)
            # print(board)
            idx += 1
            if done:
                if z == 1:
                    result['Black'] += 1
                elif z == -1:
                    result['White'] += 1
                else:
                    result['Draw'] += 1
        # render & reset tree
        env.render()
        manager.ai.reset_tree()
        # result
        print('')
        print('=' * 20, " {} Game End ".format(g + 1), '=' * 20)
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        stat = ('Black Win: {} White Win: {} Draw: {} '
                'Winrate: {:0.1f}%'.format(
                    blw, whw, drw,
                    1 / (1 + np.exp(whw / (g + 1)) /
                         np.exp(blw / (g + 1))) * 100))
        print(stat, '\n')
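# Note on the "Winrate" statistic printed by main() above (play() below uses
# the same expression): 1 / (1 + exp(whw/n) / exp(blw/n)) simplifies to a
# logistic function of the win-count difference, sigmoid((blw - whw) / n).
# A minimal check of that equivalence, with toy counts:
import numpy as np

def winrate(blw, whw, n_games):
    # closed form of the statistic: 100 * sigmoid((blw - whw) / n_games)
    return 100 / (1 + np.exp((whw - blw) / n_games))

assert np.isclose(winrate(3, 1, 4),
                  1 / (1 + np.exp(1 / 4) / np.exp(3 / 4)) * 100)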
# Self-play loop: the MCTS agent plays both colors. Uses the same imports
# (numpy as np, GomokuEnv) as main() above, plus the MCTS class below.
def play():
    env = GomokuEnv(BOARD_SIZE, HISTORY)
    mcts = MCTS(BOARD_SIZE, HISTORY, N_SIMUL)
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(GAME):
        print('#' * (BOARD_SIZE - 4),
              ' GAME: {} '.format(g + 1),
              '#' * (BOARD_SIZE - 4))
        # reset state
        state, board = env.reset()
        done = False
        while not done:
            env.render()
            # start simulations
            action = mcts.get_action(state, board)
            state, board, z, done = env.step(action)
            if done:
                if z == 1:
                    result['Black'] += 1
                elif z == -1:
                    result['White'] += 1
                else:
                    result['Draw'] += 1
        # render & reset tree
        env.render()
        mcts.reset_tree()
        # result
        print('')
        print('=' * 20, " {} Game End ".format(g + 1), '=' * 20)
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        stat = ('Black Win: {} White Win: {} Draw: {} '
                'Winrate: {:0.1f}%'.format(
                    blw, whw, drw,
                    1 / (1 + np.exp(whw / (g + 1)) /
                         np.exp(blw / (g + 1))) * 100))
        print(stat, '\n')
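# Both loops reference module-level constants that the source does not show;
# a minimal assumed configuration (placeholder values, not the repo's):
BOARD_SIZE = 9   # side length of the square board
HISTORY = 2      # number of past board planes kept in the state
N_SIMUL = 400    # MCTS simulations per move
GAME = 10        # games to play per run
# A script entry point would call main() or play() at the end of the file,
# after the MCTS class below is defined.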
from random import choice

from gomoku_env import GomokuEnv
from gameloop import Game

try:
    # python2 compatibility: make input() behave like python3's
    input = raw_input
except NameError:
    pass


def player1_turn(state):
    return choice(state.available_turns())


def player1_human_turn(state):
    raw = input("Type your turn as x,y: ")
    x, y = raw.split(",")
    return int(x.strip()), int(y.strip())


def player2_turn(state):
    return choice(state.available_turns())


env = GomokuEnv(board_size=15, win_len=5)
game = Game(env, player1_turn, player2_turn)
game.loop()
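# player1_human_turn is defined above but never wired in; swapping it into
# the Game constructor gives a human-vs-random match. A sketch, assuming
# Game(env, p1, p2) calls the two turn functions alternately:
human_game = Game(env, player1_human_turn, player2_turn)
human_game.loop()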
# UCB1 variant of the MCTS agent. The board-plane indices (CURRENT, OPPONENT,
# COLOR), the color code WHITE and the edge-column indices (N, Q) are assumed
# to be constants defined elsewhere in the repo; they are not shown here.
import sys
import time
from collections import defaultdict, deque

import numpy as np
from numpy import argwhere, log, random, sqrt, zeros

from gomoku_env import GomokuEnv


class MCTS:
    def __init__(self, board_size, n_history, n_simul):
        self.env_simul = GomokuEnv(board_size, n_history, display=False)
        self.board_size = board_size
        self.n_simul = n_simul
        self.tree = None
        self.root = None
        self.state = None
        self.board = None
        self.legal_move = None
        self.no_legal_move = None
        self.ucb = None
        # used for backup
        self.key_memory = None
        self.action_memory = None
        # init
        self._reset()
        self.reset_tree()

    def _reset(self):
        self.key_memory = deque(maxlen=self.board_size**2)
        self.action_memory = deque(maxlen=self.board_size**2)

    def reset_tree(self):
        self.tree = defaultdict(lambda: zeros(
            (self.board_size**2, 2), 'float'))

    def get_action(self, state, board):
        self.root = state.copy()
        self._simulation(state)
        # init root board after simulations
        self.board = board
        board_fill = self.board[CURRENT] + self.board[OPPONENT]
        self.legal_move = argwhere(board_fill == 0).flatten()
        self.no_legal_move = argwhere(board_fill != 0).flatten()
        # root state's key
        root_key = hash(self.root.tostring())
        # argmax Q (c_ucb=0 disables the exploration bonus)
        action = self._selection(root_key, c_ucb=0)
        print('')
        print(self.ucb.reshape(
            self.board_size, self.board_size).round(decimals=4))
        return action

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            n_selection = 0
            n_expansion = 0
            while not done:
                board_fill = self.board[CURRENT] + self.board[OPPONENT]
                self.legal_move = argwhere(board_fill == 0).flatten()
                self.no_legal_move = argwhere(board_fill != 0).flatten()
                key = hash(self.state.tostring())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_ucb=1)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                    n_selection += 1
                elif n_expansion == 0:
                    # expansion
                    action = self._expansion(key)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                    n_expansion += 1
                else:
                    # rollout
                    action = random.choice(self.legal_move)
                self.state, self.board, reward, done = \
                    self.env_simul.step(action)
                if done:
                    # backup & reset memory
                    self._backup(reward, n_selection + n_expansion)
                    self._reset()
            finish = time.time() - start
            # if finish >= self.think_time:
            #     break
        print('\r{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _selection(self, key, c_ucb):
        edges = self.tree[key]
        ucb = self._get_ucb(edges, c_ucb)
        self.ucb = ucb
        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = argwhere(ucb == ucb.max()).flatten()
        else:
            # white's choice
            action = argwhere(ucb == ucb.min()).flatten()
        action = action[random.choice(len(action))]
        return action

    def _expansion(self, key):
        # only select once for rollout
        action = self._selection(key, c_ucb=1)
        return action

    def _get_ucb(self, edges, c_ucb):
        total_N = 0
        ucb = zeros((self.board_size**2), 'float')
        for i in range(self.board_size**2):
            total_N += edges[i][N]
        # black's ucb
        if self.board[COLOR][0] == WHITE:
            for move in self.legal_move:
                if edges[move][N] != 0:
                    ucb[move] = edges[move][Q] + c_ucb * \
                        sqrt(2 * log(total_N) / edges[move][N])
                else:
                    ucb[move] = np.inf
            for move in self.no_legal_move:
                ucb[move] = -np.inf
        # white's ucb
        else:
            for move in self.legal_move:
                if edges[move][N] != 0:
                    ucb[move] = edges[move][Q] - c_ucb * \
                        sqrt(2 * log(total_N) / edges[move][N])
                else:
                    ucb[move] = -np.inf
            for move in self.no_legal_move:
                ucb[move] = np.inf
        return ucb

    def _backup(self, reward, steps):
        # steps is n_selection + n_expansion
        # update edges in my tree
        for i in range(steps):
            edges = self.tree[self.key_memory[i]]
            action = self.action_memory[i]
            edges[action][N] += 1
            edges[action][Q] += (reward - edges[action][Q]) / edges[action][N]
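# The rule in _get_ucb is plain UCB1: Black maximizes
# Q[i] + c * sqrt(2 * ln(total_N) / N[i]), White minimizes the mirrored
# score, and unvisited legal moves are forced to +/-inf so each gets tried
# once before any exploitation. A standalone sketch with toy numbers (not
# from the repo):
import numpy as np

# columns: [visit count N, mean value Q]; c_ucb = 1 as in _selection
edges = np.array([[4., 0.5],    # visited, promising
                  [1., -1.0],   # visited, losing
                  [0., 0.0]])   # unvisited -> +inf for the maximizing side
total_N = edges[:, 0].sum()
ucb = np.full(len(edges), np.inf)      # unvisited edges dominate
visited = edges[:, 0] > 0
ucb[visited] = edges[visited, 1] + np.sqrt(
    2 * np.log(total_N) / edges[visited, 0])
print(ucb)  # the unvisited third edge is selected first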
# PUCB variant of the MCTS agent: selection uses a PUCT-style bonus with a
# uniform prior over legal moves, and the final move at the root is picked
# by visit count. Same imports and external constants as the UCB1 variant.
class MCTS:
    def __init__(self, board_size, n_history, n_simul):
        self.env_simul = GomokuEnv(board_size, n_history, display=False)
        self.board_size = board_size
        self.n_simul = n_simul
        self.tree = None
        self.root = None
        self.state = None
        self.board = None
        # used for backup
        self.key_memory = deque()
        self.action_memory = deque()
        self.reset_tree()

    def reset_tree(self):
        self.tree = defaultdict(lambda: zeros((self.board_size**2, 2)))

    def get_action(self, state, board):
        self.root = state.copy()
        self._simulation(state)
        # init root board after simulations
        self.board = board
        # root state's key
        root_key = hash(self.root.tostring())
        # root decision: c_pucb=0 switches _selection to visit counts
        action = self._selection(root_key, c_pucb=0)
        return action

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            is_expansion = True
            while not done:
                key = hash(self.state.tostring())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_pucb=5)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                else:
                    # expansion (first unknown state), then rollout
                    legal_move, _ = self._get_legal_move(self.board)
                    action = random.choice(legal_move)
                    if is_expansion:
                        self.action_memory.appendleft(action)
                        self.key_memory.appendleft(key)
                        is_expansion = False
                self.state, self.board, reward, done = self.env_simul.step(
                    action)
                if done:
                    # backup & reset memory
                    self._backup(reward)
            finish = time.time() - start
            # if finish >= self.think_time:
            #     break
        print('\r{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _get_legal_move(self, board):
        board_fill = board[CURRENT] + board[OPPONENT]
        legal_move = argwhere(board_fill != 1).flatten()
        return legal_move, board_fill

    def _selection(self, key, c_pucb):
        edges = self.tree[key]
        pucb = self._get_pucb(edges, c_pucb)
        if c_pucb == 0:
            # root decision: pick the most-visited edge
            visit = edges[:, N]
            print('\nvisit count')
            print(visit.reshape(self.board_size, self.board_size).round())
            action = argwhere(visit == visit.max()).flatten()
            action = action[random.choice(len(action))]
            return action
        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = argwhere(pucb == pucb.max()).flatten()
        else:
            # white's choice
            action = argwhere(pucb == pucb.min()).flatten()
        action = action[random.choice(len(action))]
        return action

    def _get_pucb(self, edges, c_pucb):
        legal_move, no_legal_loc = self._get_legal_move(self.board)
        prior = 1 / len(legal_move)
        total_N = edges.sum(0)[N]
        # black's pucb: illegal squares pushed to -9999
        if self.board[COLOR][0] == WHITE:
            no_legal_loc *= -9999
            pucb = edges[:, Q] + \
                c_pucb * prior * sqrt(total_N) / (edges[:, N] + 1) + \
                no_legal_loc
        # white's pucb: illegal squares pushed to +9999
        else:
            no_legal_loc *= 9999
            pucb = edges[:, Q] - \
                c_pucb * prior * sqrt(total_N) / (edges[:, N] + 1) + \
                no_legal_loc
        return pucb

    def _backup(self, reward):
        # update edges in my tree, draining the memories of this simulation
        while self.action_memory:
            key = self.key_memory.popleft()
            action = self.action_memory.popleft()
            edges = self.tree[key]
            edges[action][N] += 1
            edges[action][Q] += (reward - edges[action][Q]) / edges[action][N]
        return 0
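# _get_pucb scores edges with a PUCT-style bonus,
# Q[i] +/- c_pucb * P * sqrt(total_N) / (N[i] + 1), where the prior P is
# uniform over legal moves (there is no policy network). A standalone sketch
# of the maximizing (Black) form, with toy numbers and c_pucb = 5 as in
# _simulation:
import numpy as np

edges = np.array([[10., 0.2],   # [visit count N, mean value Q]
                  [2., 0.6],
                  [0., 0.0]])
c_pucb = 5
prior = 1 / len(edges)          # uniform prior over (here) 3 legal moves
total_N = edges[:, 0].sum()
pucb = edges[:, 1] + c_pucb * prior * np.sqrt(total_N) / (edges[:, 0] + 1)
print(pucb)  # rarely-visited edges receive a large exploration bonus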