Example #1
def __init__(self, board_size, n_history, n_simul):
    self.env_simul = OmokEnv(board_size, n_history, display=False)
    self.board_size = board_size
    self.n_simul = n_simul
    self.tree = None
    self.root = None
    self.state = None
    self.board = None
    # used for backup
    self.key_memory = deque()
    self.action_memory = deque()
    self.reset_tree()
Example #2
def play():
    env = OmokEnv(BOARD_SIZE, HISTORY)
    mcts = MCTS(BOARD_SIZE, HISTORY, N_SIMUL)
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(GAME):
        print('#' * (BOARD_SIZE - 4),
              ' GAME: {} '.format(g + 1),
              '#' * (BOARD_SIZE - 4))
        # reset state
        state, board = env.reset()
        done = False
        while not done:
            env.render()
            # start simulations
            action = mcts.get_action(state, board)
            state, board, z, done = env.step(action)
        if done:
            if z == 1:
                result['Black'] += 1
            elif z == -1:
                result['White'] += 1
            else:
                result['Draw'] += 1
            # render & reset tree
            env.render()
            mcts.reset_tree()
        # result
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        print('')
        print('=' * 20, " {}  Game End  ".format(blw + whw + drw), '=' * 20)
        stats = (
            'Black Win: {}  White Win: {}  Draw: {}  Winrate: {:.2f}%'.format(
                blw, whw, drw, (blw + 0.5 * drw) / (blw + whw + drw) * 100))
        print(stats, '\n')
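
play() above depends on module-level constants (BOARD_SIZE, HISTORY, N_SIMUL, GAME) and an entry point that are not shown in the listing. A minimal, hypothetical driver with purely illustrative values might look like this:

# Hypothetical driver for the play() loop above; the constant values are
# illustrative assumptions, not the project's actual settings.
BOARD_SIZE = 9
HISTORY = 2
N_SIMUL = 800
GAME = 10

if __name__ == '__main__':
    play()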
Example #3
def main():
    env = OmokEnv(BOARD_SIZE, HISTORY)
    manager = HumanUI()
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(GAME):
        print('##########   Game: {}   ##########'.format(g + 1))
        state, board = env.reset()
        done = False
        idx = 0
        while not done:
            env.render()
            # start simulations
            action = manager.get_action(state, board, idx)
            state, board, z, done = env.step(action)
            idx += 1
        if done:
            if z == 1:
                result['Black'] += 1
            elif z == -1:
                result['White'] += 1
            else:
                result['Draw'] += 1
            # render & reset tree
            env.render()
            manager.ai.reset_tree()
        # result
        print('')
        print('=' * 20, " {}  Game End  ".format(g + 1), '=' * 20)
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        stat = (
            'Black Win: {}  White Win: {}  Draw: {}  Winrate: {:0.1f}%'.format(
                blw, whw, drw,
                1 / (1 + np.exp(whw / (g + 1)) / np.exp(blw / (g + 1))) * 100))
        print(stat, '\n')
Example #4
def __init__(self, n_block, channel, board_size, n_history, n_simul, mode):
    self.env_simul = OmokEnv(board_size, n_history, display=False)
    self.model = PVNet(n_block, n_history * 2 + 1, channel, board_size)
    self.board_size = board_size
    self.n_simul = n_simul
    self.mode = mode
    self.alpha = 10 / board_size**2
    self.tree = None
    self.root = None
    self.root_key = None
    self.state = None
    self.board = None
    # used for backup
    self.key_memory = deque()
    self.action_memory = deque()
    self.reset_tree()
Example #5
def play():
    USE_CUDA = torch.cuda.is_available()
    env = OmokEnv(BOARD_SIZE, HISTORY)
    manager = HumanUI()

    if USE_CUDA:
        manager.ai.model.cuda()

    model_path = None

    if model_path:
        print('load model: {}\n'.format(model_path))
        manager.ai.model.load_state_dict(torch.load(model_path))

    result = {'Black': 0, 'White': 0, 'Draw': 0}

    for g in range(N_GAME):
        print('#####  Game: {}  #####'.format(g + 1))
        state, board = env.reset()
        done = False
        idx = 0
        while not done:
            env.render()
            # start simulations
            action = manager.get_action(state, board, idx)
            state, board, z, done = env.step(action)
            idx += 1
        if done:
            if z == 1:
                result['Black'] += 1
            elif z == -1:
                result['White'] += 1
            else:
                result['Draw'] += 1
            # render & reset tree
            env.render()
            manager.ai.reset_tree()
        # result
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        print('')
        print('=' * 20, " {}  Game End  ".format(blw + whw + drw), '=' * 20)
        stats = (
            'Black Win: {}  White Win: {}  Draw: {}  Winrate: {:.2f}%'.format(
                blw, whw, drw, (blw + 0.5 * drw) / (blw + whw + drw) * 100))
        print(stats, '\n')
Example #6
class MCTS:
    def __init__(self, board_size, n_history, n_simul):
        self.env_simul = OmokEnv(board_size, n_history, display=False)
        self.board_size = board_size
        self.n_simul = n_simul
        self.tree = None
        self.root = None
        self.state = None
        self.board = None
        self.legal_move = None
        self.no_legal_move = None
        self.ucb = None

        # used for backup
        self.key_memory = None
        self.action_memory = None

        # init
        self._reset()
        self.reset_tree()

    def _reset(self):
        self.key_memory = deque(maxlen=self.board_size**2)
        self.action_memory = deque(maxlen=self.board_size**2)

    def reset_tree(self):
        self.tree = defaultdict(
            lambda: np.zeros((self.board_size**2, 2), 'float'))

    def get_action(self, state, board):
        self.root = state.copy()
        self._simulation(state)
        # init root board after simulations
        self.board = board
        board_fill = self.board[CURRENT] + self.board[OPPONENT]
        self.legal_move = np.argwhere(board_fill == 0).flatten()
        self.no_legal_move = np.argwhere(board_fill != 0).flatten()
        # root state's key
        root_key = hash(self.root.tostring())
        # argmax Q
        action = self._selection(root_key, c_ucb=0)
        print('')
        print(self.ucb.reshape(
            self.board_size, self.board_size).round(decimals=4))
        return action

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            n_selection = 0
            n_expansion = 0

            while not done:
                board_fill = self.board[CURRENT] + self.board[OPPONENT]
                self.legal_move = np.argwhere(board_fill == 0).flatten()
                self.no_legal_move = np.argwhere(board_fill != 0).flatten()
                key = hash(self.state.tostring())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_ucb=1)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                    n_selection += 1
                elif n_expansion == 0:
                    # expansion
                    action = self._expansion(key)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                    n_expansion += 1
                else:
                    # rollout
                    action = random.choice(self.legal_move)
                self.state, self.board, reward, done = \
                    self.env_simul.step(action)
            if done:
                # backup & reset memory
                self._backup(reward, n_selection + n_expansion)
                self._reset()
                finish = time.time() - start
                # if finish >= self.think_time:
                #     break
        print('\n{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _selection(self, key, c_ucb):
        edges = self.tree[key]
        # get ucb
        ucb = self._ucb(edges, c_ucb)
        self.ucb = ucb
        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = np.argwhere(ucb == ucb.max()).flatten()
        else:
            # white's choice
            action = np.argwhere(ucb == ucb.min()).flatten()
        action = action[random.choice(len(action))]
        return action

    def _expansion(self, key):
        # only select once for rollout
        action = self._selection(key, c_ucb=1)
        return action

    def _ucb(self, edges, c_ucb):
        total_N = 0
        ucb = np.zeros((self.board_size**2), 'float')
        for i in range(self.board_size**2):
            total_N += edges[i][N]
        # black's ucb
        if self.board[COLOR][0] == WHITE:
            for move in self.legal_move:
                if edges[move][N] != 0:
                    ucb[move] = edges[move][Q] + c_ucb * \
                        np.sqrt(2 * np.log(total_N) / edges[move][N])
                else:
                    ucb[move] = np.inf
            for move in self.no_legal_move:
                ucb[move] = -np.inf
        # white's ucb
        else:
            for move in self.legal_move:
                if edges[move][N] != 0:
                    ucb[move] = edges[move][Q] - c_ucb * \
                        np.sqrt(2 * np.log(total_N) / edges[move][N])
                else:
                    ucb[move] = -np.inf
            for move in self.no_legal_move:
                ucb[move] = np.inf
        return ucb

    def _backup(self, reward, steps):
        # steps = n_selection + n_expansion
        # update edges in my tree
        for i in range(steps):
            edges = self.tree[self.key_memory[i]]
            action = self.action_memory[i]
            edges[action][N] += 1
            edges[action][Q] += (reward - edges[action][Q]) / edges[action][N]
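
The MCTS class above also relies on module-level names defined elsewhere in the project: the imports and the plane/edge-column index constants. The following sketch shows hypothetical definitions so the listing can be read on its own; the concrete values are assumptions, not the project's actual ones.

# Hypothetical module-level setup assumed by the MCTS listing above.
import sys
import time
from collections import defaultdict, deque

import numpy as np
from numpy import random  # random.choice(len(...)) implies NumPy's random module

CURRENT, OPPONENT, COLOR = 0, 1, 2  # indices of the stacked board planes (assumed)
BLACK, WHITE = 0, 1                 # stone-color codes stored in the COLOR plane (assumed)
N, Q = 0, 1                         # per-edge columns: visit count, mean value (assumed)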
Example #7
class MCTS:
    def __init__(self, board_size, n_history, n_simul):
        self.env_simul = OmokEnv(board_size, n_history, display=False)
        self.board_size = board_size
        self.n_simul = n_simul
        self.tree = None
        self.root = None
        self.state = None
        self.board = None
        # used for backup
        self.key_memory = deque()
        self.action_memory = deque()
        self.reset_tree()

    def reset_tree(self):
        self.tree = defaultdict(lambda: zeros((self.board_size**2, 2)))

    def get_action(self, state, board):
        self.root = state.copy()
        self._simulation(state)
        # init root board after simulations
        self.board = board
        # root state's key
        root_key = hash(self.root.tostring())
        # argmax visit count
        action = self._selection(root_key, c_pucb=0)
        return action

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            is_expansion = True

            while not done:
                key = hash(self.state.tostring())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_pucb=5)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                else:
                    # expansion
                    legal_move, _ = self._get_legal_move(self.board)
                    action = random.choice(legal_move)
                    if is_expansion:
                        self.action_memory.appendleft(action)
                        self.key_memory.appendleft(key)
                        is_expansion = False

                self.state, self.board, reward, done = self.env_simul.step(
                    action)

            if done:
                # backup & reset memory
                self._backup(reward)
                finish = time.time() - start
                # if finish >= self.think_time:
                #     break
        print('\r{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _get_legal_move(self, board):
        board_fill = board[CURRENT] + board[OPPONENT]
        legal_move = argwhere(board_fill != 1).flatten()
        return legal_move, board_fill

    def _selection(self, key, c_pucb):
        edges = self.tree[key]
        pucb = self._get_pucb(edges, c_pucb)

        if c_pucb == 0:
            visit = edges[:, N]
            print('\nvisit count')
            print(visit.reshape(self.board_size, self.board_size).round())
            action = argwhere(visit == visit.max()).flatten()
            action = action[random.choice(len(action))]
            return action

        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = argwhere(pucb == pucb.max()).flatten()
        else:
            # white's choice
            action = argwhere(pucb == pucb.min()).flatten()
        action = action[random.choice(len(action))]
        return action

    def _get_pucb(self, edges, c_pucb):
        legal_move, no_legal_loc = self._get_legal_move(self.board)
        prior = 1 / len(legal_move)
        total_N = edges.sum(0)[N]
        # black's pucb
        if self.board[COLOR][0] == WHITE:
            no_legal_loc *= -99999999
            pucb = edges[:, Q] + \
                c_pucb * prior * sqrt(total_N) / \
                (edges[:, N] + 1) + no_legal_loc
        # white's pucb
        else:
            no_legal_loc *= 99999999
            pucb = edges[:, Q] - \
                c_pucb * prior * sqrt(total_N) / \
                (edges[:, N] + 1) + no_legal_loc
        return pucb

    def _backup(self, reward):
        # update edges in my tree
        while self.action_memory:
            key = self.key_memory.popleft()
            action = self.action_memory.popleft()
            edges = self.tree[key]
            edges[action][N] += 1
            edges[action][Q] += (reward - edges[action][Q]) / edges[action][N]
        return 0
Example #8
class MCTS:
    def __init__(self, n_block, channel, board_size, n_history, n_simul, mode):
        self.env_simul = OmokEnv(board_size, n_history, display=False)
        self.model = PVNet(n_block, n_history * 2 + 1, channel, board_size)
        self.board_size = board_size
        self.n_simul = n_simul
        self.mode = mode
        self.alpha = 10 / board_size**2
        self.tree = None
        self.root = None
        self.root_key = None
        self.state = None
        self.board = None
        # used for backup
        self.key_memory = deque()
        self.action_memory = deque()
        self.reset_tree()

    def reset_tree(self):
        self.tree = defaultdict(lambda: np.zeros((self.board_size**2, 3)))

    def get_action(self, state, board, tau):
        self.root = state.copy()
        self.root_key = hash(self.root.tostring())
        self._simulation(state)
        # init root board after simulations
        self.board = board
        # argmax visit count
        action, pi = self._selection(self.root_key, c_pucb=0)
        if tau == 1:
            action = np.random.choice(self.board_size**2, p=pi)

        return action, pi

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False

            while not done:
                key = hash(self.state.tostring())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_pucb=5)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                else:
                    # expansion
                    reward, done = self._expansion(key, self.state)
                    break

                self.state, self.board, reward, done = self.env_simul.step(
                    action)

            if done:
                # backup & reset memory
                self._backup(reward)
                finish = time.time() - start
                # if finish >= self.think_time:
                #     break
        print('\r{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _selection(self, key, c_pucb):
        edges = self.tree[key]
        pucb = self._get_pucb(edges, key, c_pucb)

        if c_pucb == 0:
            visit = edges[:, N]
            print('\nvisit count')
            print(visit.reshape(self.board_size, self.board_size).round())
            action = np.argwhere(visit == visit.max()).flatten()
            action = action[np.random.choice(len(action))]
            # pi = np.exp(visit) / np.exp(visit).sum()
            pi = visit / visit.sum()
            print('\npi')
            print(pi.reshape(
                self.board_size, self.board_size).round(decimals=2))
            return action, pi

        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = np.argwhere(pucb == pucb.max()).flatten()
        else:
            # white's choice
            action = np.argwhere(pucb == pucb.min()).flatten()
        action = action[np.random.choice(len(action))]
        return action

    def _expansion(self, key, state):
        edges = self.tree[key]
        state_input = Variable(
            TENSOR([state.reshape(
                HISTORY * 2 + 1, self.board_size, self.board_size)]))
        prior, value = self.model(state_input)
        prior = prior.exp() / prior.exp().sum()
        edges[:, P] = prior.data.cpu().numpy()[0]
        done = True
        return value.data.cpu().numpy()[0], done

    def _backup(self, reward):
        # update edges in my tree
        while self.key_memory:
            key = self.key_memory.popleft()
            edges = self.tree[key]
            if self.action_memory:
                action = self.action_memory.popleft()
                edges[action][N] += 1
                edges[action][Q] += (reward - edges[action][Q]) / \
                    edges[action][N]
        return 0

    def _get_no_legal_loc(self, board):
        board_fill = board[CURRENT] + board[OPPONENT]
        legal_action = np.argwhere(board_fill == 0).flatten()
        return board_fill, legal_action

    def _get_pucb(self, edges, key, c_pucb):
        no_legal_loc, legal_action = self._get_no_legal_loc(self.board)
        prob = edges[:, P]
        if key == self.root_key and self.mode == 'learn':
            noise = np.random.dirichlet(
                self.alpha * np.ones(len(legal_action)))
            for i, action in enumerate(legal_action):
                prob[action] = 0.75 * prob[action] + 0.25 * noise[i]
        total_N = edges.sum(0)[N]
        # black's pucb
        if self.board[COLOR][0] == WHITE:
            no_legal_loc *= -99999999
            pucb = edges[:, Q] + \
                c_pucb * prob * np.sqrt(total_N) / (edges[:, N] + 1) + \
                no_legal_loc
        # white's pucb
        else:
            no_legal_loc *= 99999999
            pucb = edges[:, Q] - \
                c_pucb * prob * np.sqrt(total_N) / (edges[:, N] + 1) + \
                no_legal_loc
        return pucb
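
In addition to the names above, the PUCT version in Example #8 assumes a prior column P in each edge, a HISTORY constant, and the pre-0.4 PyTorch idioms (Variable, a TENSOR alias). A hypothetical sketch of those extra definitions, for illustration only:

# Hypothetical extras assumed by the PUCT listing above; values are assumptions.
import torch
from torch.autograd import Variable  # pre-0.4 PyTorch API, matching the original code

N, Q, P = 0, 1, 2   # per-edge columns: visit count, mean value, prior probability
HISTORY = 2         # number of past board pairs stacked into the state (example value)
USE_CUDA = torch.cuda.is_available()
TENSOR = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor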
Example #9
            loss.backward()
            optimizer.step()
            running_loss += loss.data[0]
            STEPS += 1

            print('{:3} step loss: {:.3f}'.format(
                STEPS, running_loss / (i + 1)))


if __name__ == '__main__':
    np.set_printoptions(suppress=True)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    MEMORY = deque(maxlen=8000)
    ENV = OmokEnv(BOARD_SIZE, HISTORY)
    AGENT = MCTS(N_BLOCK, CHANNEL, BOARD_SIZE, HISTORY, N_SIMUL, 'learn')
    RESULT = {'Black': 0, 'White': 0, 'Draw': 0}
    STEPS = 0
    model_path = '8192_step_model.pickle'

    if model_path:
        print('load model: {}\n'.format(model_path))
        AGENT.model.load_state_dict(torch.load(model_path))
        STEPS = False

    if USE_CUDA:
        AGENT.model.cuda()

    for idx in range(N_ITER):
        self_play(idx)