Example #1
    def act(self, b, c, t, temperature=1):
        board_batch = []
        valid_ply = []
        # Enumerate every legal move and encode the resulting position from
        # the opponent's (1 - c) perspective at turn t + 1.
        for x in range(8):
            for y in range(8):
                if board.is_valid(b, c, x, y):
                    valid_ply.append((x, y))
                    b_ = board.put(b, c, x, y)
                    board_batch.append(board.to_state(b_, 1 - c, t + 1))
        if len(board_batch) == 0:
            # No legal move: signal a pass (same convention as Example #3).
            return -1
        x_batch = chainer.Variable(self.xp.array(np.stack(board_batch),
                                                 'float32'),
                                   volatile=True)
        scores = self.predict(x_batch, train=False)

        # Each score is a 41-class distribution over the opponent's final
        # score (class index - 20); pick the move whose most likely opponent
        # score is smallest.
        min_score = 64
        best_ply = None
        for ply, score in zip(valid_ply,
                              batch_softmax(cuda.to_cpu(scores.data))):
            print(ply, np.argmax(score) - 20)
            print()
            if np.argmax(score) < min_score:
                min_score = np.argmax(score)
                best_ply = ply
        action = best_ply[0] * 8 + best_ply[1]

        return action
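
batch_softmax is referenced but not defined in this example. A minimal sketch of what it presumably does, assuming it is simply a row-wise softmax over the (batch, n_classes) score array:

import numpy as np

def batch_softmax(x):
    # Hypothetical helper: row-wise softmax over a (batch, n_classes) array.
    x = x - x.max(axis=1, keepdims=True)  # subtract row max for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=1, keepdims=True)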
Example #2
    def get_example(self, i, with_aug=True):
        plies = self.data[i]

        # Build the board state
        # See https://github.com/Kiikurage/aai/issues/13 for details of each channel

        # plies = game['plies']
        b = board.init()
        # Pick a random position from the game by replaying the first n plies.
        n = random.randint(0, len(plies) - 1)

        for color, ply in plies[:n]:
            if ply == -1:
                # -1 encodes a pass
                continue

            x = ply // 8
            y = ply % 8
            b = board.put(b, color, x, y)

        color, ply = plies[n]

        if with_aug:
            b, ply = self.data_augmentation(b, ply)

        res = board.to_state(b, color, n)

        return res, np.int32(ply)
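
self.data_augmentation is not shown in this snippet. A plausible sketch, assuming the board is a (channels, 8, 8) NumPy array and ply encodes x * 8 + y (or -1 for a pass), is to apply a random rotation/reflection of the 8x8 grid to both the board and the move:

import random
import numpy as np

def data_augmentation(b, ply):
    # Hypothetical sketch: apply a random symmetry of the 8x8 board.
    k = random.randint(0, 3)        # number of 90-degree rotations
    flip = random.random() < 0.5    # whether to mirror the last axis
    b = np.rot90(b, k, axes=(1, 2))
    if flip:
        b = b[:, :, ::-1]
    if ply >= 0:
        # Transform the move the same way via a one-hot 8x8 map.
        pos = np.zeros((8, 8))
        pos[ply // 8, ply % 8] = 1
        pos = np.rot90(pos, k)
        if flip:
            pos = pos[:, ::-1]
        x, y = np.unravel_index(int(pos.argmax()), pos.shape)
        ply = int(x) * 8 + int(y)
    return b, ply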
Example #3
    def act(self, b, c, t, temperature=1):
        board_batch = []
        valid_ply = []
        for x in range(8):
            for y in range(8):
                if board.is_valid(b, c, x, y):
                    valid_ply.append((x, y))
                    b_ = board.put(b, c, x, y)
                    board_batch.append(board.to_state(b_, 1 - c, t + 1))
        if len(board_batch) == 0:
            return -1
        x_batch = chainer.Variable(self.xp.array(np.stack(board_batch),
                                                 'float32'),
                                   volatile=True)
        scores = self.predict(x_batch, train=False)

        # score is a 41-class distribution over the opponent's final score
        # (class index - 20); classes 0-20 cover non-positive scores, so their
        # sum is the probability that the opponent does not win.
        max_rate = 0
        best_ply = None
        for ply, score in zip(valid_ply,
                              batch_softmax(cuda.to_cpu(scores.data))):
            win_rate = score[:21].sum()
            print(ply, win_rate)
            if win_rate > max_rate:
                max_rate = win_rate
                best_ply = ply
        print('best ply', best_ply)

        action = best_ply[0] * 8 + best_ply[1]

        return action
Example #4
    def get_example(self, i, with_aug=True):
        plies, score = self.data[i]

        # Build the board state
        # See https://github.com/Kiikurage/aai/issues/13 for details of each channel

        # plies = game['plies']
        b = board.init()
        n = random.randint(0, len(plies) - 1)

        for color, ply in plies[:n]:
            if ply == -1:
                continue

            x = ply // 8
            y = ply % 8
            b = board.put(b, color, x, y)

        color, ply = plies[n]

        if with_aug:
            b, ply = self.data_augmentation(b, ply)

        res = board.to_state(b, color, n)
        # Compress the final disc difference into 41 classes: clip to
        # [-40, 40], halve it (rounding away from zero) to [-20, 20], flip the
        # sign so it is from the side-to-move's perspective, then shift to [0, 40].
        score = np.clip(score, -40, 40)
        score = (abs(score) + 1) // 2 if score >= 0 else -(
            (abs(score) + 1) // 2)
        score = score if color == 0 else -score
        return res, np.int32(score + 20)
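
The last few lines compress the raw disc difference into 41 classes (0 to 40, with 20 meaning a draw), which matches the np.argmax(score) - 20 decoding in Example #1 and the score[:21] slice in Example #3. A standalone check of the mapping, assuming color == 0 (Black to move):

import numpy as np

for raw in (-64, -3, 0, 3, 64):
    s = int(np.clip(raw, -40, 40))
    s = (abs(s) + 1) // 2 if s >= 0 else -((abs(s) + 1) // 2)
    print(raw, s + 20)  # -64 -> 0, -3 -> 18, 0 -> 20, 3 -> 22, 64 -> 40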
Example #5
def main():
    pass_cnt = 0
    color = C.Black
    board = B.init()

    strategies = {
        C.Black: (args.black, strategy_dict[args.black]),
        C.White: (args.white, strategy_dict[args.white]),
    }

    # Two consecutive passes end the game.
    while pass_cnt < 2:
        x, y = strategies[color][1](board, color)
        if x == -1:
            pass_cnt += 1
            print('{0}({1}): pass'.format(
                'Black' if color == C.Black else 'White',
                strategies[color][0]))
            print('')

        else:
            pass_cnt = 0
            board = B.put(board, color, x, y)
            print('{0}({1}): ({2}, {3})'.format(
                'Black' if color == C.Black else 'White', strategies[color][0],
                x, y))
            print(B.stringify(board))
            print('')

        color = C.other(color)

    num_black = board[0].sum()
    num_white = board[1].sum()

    print('({0})Black  {1}-{2}  White({3})'.format(strategies[C.Black][0],
                                                   num_black, num_white,
                                                   strategies[C.White][0]))

    if num_black > num_white:
        print('Black win')

    elif num_white > num_black:
        print('White win')

    else:
        print('Draw')
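
strategy_dict and args are assumed to be defined at module level. Each strategy is expected to be a callable (board, color) -> (x, y) that returns (-1, -1) to pass. A hypothetical entry, assuming the board module imported as B also exposes is_valid as in the other examples:

import random

def random_strategy(b, color):
    # Hypothetical strategy_dict entry: pick a uniformly random legal move,
    # or return (-1, -1) to pass when there is none.
    moves = [(x, y) for x in range(8) for y in range(8)
             if B.is_valid(b, color, x, y)]
    if not moves:
        return -1, -1
    return random.choice(moves)

strategy_dict = {'random': random_strategy}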
Example #6
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--init', '-i', help='path to initial player model')
    parser.add_argument('--opponent')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU device ID')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10000,
                        help='number of epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='size of mini-batch')
    parser.add_argument('--adam_eps',
                        type=float,
                        default=1e-2,
                        help='parameter eps in adam')
    parser.add_argument('--adam_alpha',
                        type=float,
                        default=1e-4,
                        help='parameter alpha in adam')
    parser.add_argument('--density',
                        type=int,
                        default=1,
                        help='density of cnn kernel')
    parser.add_argument('--no_bn',
                        dest='use_bn',
                        action='store_false',
                        default=True)
    parser.add_argument('--draw',
                        dest='draw',
                        action='store_true',
                        default=False)
    parser.add_argument('--out', default='')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    batch_size = args.batch_size

    # log directory
    out = datetime.datetime.now().strftime('%m%d')
    if args.out:
        out = out + '_' + args.out
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs_rl", out))
    os.makedirs(os.path.join(out_dir, 'models'), exist_ok=True)

    player_model = RLPolicy(use_bn=args.use_bn)
    # opponent_model = RLPolicy(use_bn=args.use_bn)
    opponent_model = RolloutPolicy()

    # load player model
    serializers.load_hdf5(args.init, player_model)

    # gpu
    if args.gpu >= 0:
        cuda.get_device(args.gpu).use()
        player_model.to_gpu()
        opponent_model.to_gpu()

    # setting
    with open(os.path.join(out_dir, 'setting.txt'), 'w') as f:
        for k, v in args._get_kwargs():
            print('{} = {}'.format(k, v))
            f.write('{} = {}\n'.format(k, v))

    # optimizer
    optimizer = chainer.optimizers.Adam(alpha=args.adam_alpha,
                                        eps=args.adam_eps)
    optimizer.setup(player_model)

    # start training
    start = time.time()

    # opponent model
    if args.opponent is None:
        opponent = args.init
    else:
        opponent = args.opponent
    opponent_models = [opponent]

    win_rate_summary = []
    for epoch in range(args.epoch):

        # load opponent model
        if epoch % 1000 == 0:
            serializers.load_hdf5(np.random.choice(opponent_models),
                                  opponent_model)

        # Trajectory buffers: per-game states, chosen plies, number of plies
        # recorded for each game, and the final results.
        states = np.zeros(shape=(batch_size, 80, 6, 8, 8))
        plies = np.zeros(shape=(batch_size, 80), dtype='int64')
        ply_nums = np.zeros(shape=batch_size, dtype='int64')
        results = np.zeros(shape=batch_size)

        # simulation (self-play)
        for player_color in [Color.Black, Color.White]:

            models = {
                player_color: player_model,
                1 - player_color: opponent_model
            }
            x_batch = [
                board.to_state(board.init(), 0, 0)
                for _ in range(batch_size // 2)
            ]
            turn = 0
            c = Color.Black
            pass_cnts = np.zeros(shape=batch_size // 2)
            while True:
                if min(pass_cnts) >= 2:
                    break

                if c == player_color:
                    scores = models[c].predict(
                        models[c].xp.array(x_batch, 'float32'), False)
                    scores = cuda.to_cpu(scores.data)

                for i in range(batch_size // 2):
                    # Skip boards whose game has already finished (two passes)
                    if pass_cnts[i] == 2:
                        continue

                    b = x_batch[i][:2]
                    valid_mask = x_batch[i][2].ravel()

                    if valid_mask.sum() == 0:
                        plies[(batch_size // 2) * player_color + i, turn] = -1
                        pass_cnts[i] += 1
                        x_batch[i] = board.to_state(b, 1 - c, turn + 1)
                    else:
                        stone_cnt = b[0:2].sum()
                        if c == player_color:
                            if stone_cnt >= 64 - 8:
                                # Play the endgame (last few empty squares) by full search.
                                # print('in zentansaku', stone_cnt)
                                if args.draw:
                                    x, y = traverse.BitBoard(b.astype(
                                        np.bool)).traverse(c, 3)
                                else:
                                    x, y = traverse.BitBoard(b.astype(
                                        np.bool)).traverse(c, 1)
                                ply = x * 8 + y

                            else:
                                pred = softmax(scores[i].astype(np.float64),
                                               mask=valid_mask,
                                               T=1)
                                ply = np.random.choice(64, p=pred)

                                states[(batch_size // 2) * player_color + i,
                                       turn, :, :, :] = x_batch[i]
                                plies[(batch_size // 2) * player_color + i,
                                      turn] = ply
                                ply_nums[(batch_size // 2) * player_color +
                                         i] += 1
                            x = ply // 8
                            y = ply % 8

                        else:
                            stone_cnt = b[0].sum() + b[1].sum()
                            b = b.astype(np.bool)
                            bb = traverse.BitBoard(b)

                            if 64 - stone_cnt > 12:
                                x, y = bb.montecarlo(c, 10000, 1)
                            else:
                                x, y = traverse.BitBoard(b.astype(
                                    np.bool)).traverse(c, 1)

                        if not board.is_valid(b, c, x, y):
                            print(valid_mask)
                            print(scores[i])
                            print(softmax(scores[i], mask=valid_mask, T=1))
                            raise ValueError('invalid ply')

                        x_batch[i] = board.to_state(board.put(b, c, x, y),
                                                    1 - c, turn + 1)
                        pass_cnts[i] = 0

                c = 1 - c
                turn += 1

            # check win/lose
            for i, b in enumerate(x_batch):
                num_black = b[0].sum()
                num_white = b[1].sum()
                if args.draw:
                    # Draw-seeking reward: near-draws are rewarded, moderate
                    # margins are neutral, and large margins are penalised.
                    diff = abs(num_black - num_white)
                    if diff <= 3:
                        res = 1
                    elif diff <= 10:
                        res = 0
                    else:
                        res = -0.5
                    results[(batch_size // 2) * player_color + i] = res
                else:
                    # +1 for a win, 0 for a draw, -0.5 for a loss
                    # (losses are down-weighted by half).
                    res = np.sign(num_black - num_white)
                    res = res if player_color == Color.Black else -res
                    results[(batch_size // 2) * player_color +
                            i] = res if res >= 0 else res / 2

        # train (policy gradient)
        optimizer.update(player_model, states, plies, results, ply_nums)

        if args.draw:
            win_rate = np.mean(np.array(results) > 0)
        else:
            win_rate = np.mean(np.array(results) >= 0)

        progress_report(start, epoch, batch_size, win_rate)
        win_rate_summary.append(win_rate)
        if epoch % 100 == 0:
            serializers.save_hdf5(
                os.path.join(out_dir, "models",
                             "rl_policy_{}.model".format(epoch)), player_model)
            print('\nwin_rate_summary {}'.format(np.mean(win_rate_summary)))
            win_rate_summary = []
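
The masked softmax helper used when sampling moves above is not shown. A minimal sketch, assuming softmax(x, mask, T) is a temperature-scaled softmax over the 64 move logits with invalid squares forced to zero probability:

import numpy as np

def softmax(x, mask=None, T=1):
    # Hypothetical sketch: temperature-scaled softmax with an optional 0/1 mask.
    x = np.asarray(x, dtype=np.float64) / T
    e = np.exp(x - x.max())
    if mask is not None:
        e = e * mask  # zero out invalid moves (caller guarantees mask.sum() > 0)
    return e / e.sum()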