Example #1
    def reinforce(self):
        if len(self.oppo_pool) == 0:
            self.oppo_pool.append(
                StrategyDNN(is_train=False, is_revive=True, is_rl=False))

        s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True)
        s2 = random.choice(self.oppo_pool)

        stat = []
        win1, win2, draw = 0, 0, 0

        n_lose = 0
        iter_n = 100
        i = 0
        while True:
            print('iter:', i)

            for _ in range(1000):
                s1.stand_for = random.choice(
                    [Board.STONE_BLACK, Board.STONE_WHITE])
                s2.stand_for = Board.oppo(s1.stand_for)

                g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
                g.step_to_end()
                win1 += 1 if g.winner == s1.stand_for else 0
                win2 += 1 if g.winner == s2.stand_for else 0
                draw += 1 if g.winner == Board.STONE_EMPTY else 0

#             if win1 > win2:
#                 s1_c = s1.mind_clone()
#                 self.oppo_pool.append(s1_c)
#                 s2 = random.choice(self.oppo_pool)
#                 n_lose = 0
#                 print('stronger, oppos:', len(self.oppo_pool))
#             elif win1 < win2:
#                 n_lose += 1
#
#             if n_lose >= 50:
#                 break

            if i % 1 == 0 or i + 1 == iter_n:
                total = win1 + win2 + draw
                win1_r = win1 / total
                win2_r = win2 / total
                draw_r = draw / total
                print("iter:%d, win: %.3f, loss: %.3f, tie: %.3f" %
                      (i, win1_r, win2_r, draw_r))
                stat.append([win1_r, win2_r, draw_r])

            i += 1

            if i > iter_n:
                break

        stat = np.array(stat)
        print('stat. shape:', stat.shape)
        np.savez('/home/splendor/fusor/stat.npz', stat=np.array(stat))
        self.strategy_1 = self.strategy_2 = s1
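The reinforce() loop above appends per-iteration [win, lose, draw] ratios and stores them with np.savez. A minimal sketch of reading that file back for inspection, assuming only the path and the array layout shown above (the last-iteration lookup is illustrative):

import numpy as np

# Load the statistics written by reinforce(); 'stat' is the key passed to np.savez above.
data = np.load('/home/splendor/fusor/stat.npz')
stat = data['stat']                     # shape (n_iterations, 3): [win, lose, draw] per row
print('last recorded win rate:', stat[-1, 0])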
Example #2
    def match(self):
        s1, s2 = self.strategy_1, self.strategy_2
        print('player1:', s1.__class__.__name__)
        print('player2:', s2.__class__.__name__)

        probs = np.zeros(6)
        games = 100  # 30
        for i in range(games):
            print(i)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_WHITE:
                probs[1] += 1
            else:
                probs[2] += 1

            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[3] += 1
            elif g.winner == Board.STONE_BLACK:
                probs[4] += 1
            else:
                probs[5] += 1

        print('total play:', games)
        print(probs)
Example #3
    def vs_human(self, which_side_human_play):
        strategy = self.which_one(Board.oppo(which_side_human_play))
        if strategy is None or isinstance(strategy, StrategyRand):
            strategy = self.which_one(which_side_human_play)
        if strategy is None:
            print('without opponent')
            return

        old_is_learning, old_stand_for = strategy.is_learning, strategy.stand_for
        strategy.is_learning, strategy.stand_for = False, Board.oppo(which_side_human_play)

        s1 = strategy
        s2 = StrategyHuman()
        s2.stand_for = which_side_human_play

        self.game = Game(Board(), s1, s2, self.msg_queue)
        self.game.step_to_end()

        strategy.is_learning, strategy.stand_for = old_is_learning, old_stand_for
Example #4
    def match(self):
        s1, s2 = self.strategy_1, self.strategy_2
        print('player1:', s1.__class__.__name__)
        print('player2:', s2.__class__.__name__)

        probs = np.zeros(6)
        games = 100  # 30
        for i in range(games):
            print(i)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_WHITE:
                probs[1] += 1
            else:
                probs[2] += 1

            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[3] += 1
            elif g.winner == Board.STONE_BLACK:
                probs[4] += 1
            else:
                probs[5] += 1

        print('total play:', games)
        print(probs)
Example #5
    def learn_from_2_teachers(self):
        s1 = StrategyMinMax()
        s1.stand_for = Board.STONE_BLACK
        self.strategy_1 = s1

        s2 = StrategyMinMax()
        s2.stand_for = Board.STONE_WHITE
        self.strategy_2 = s2

        observer = StrategyMC()

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 10000
        for i in range(episodes):
            g = Game(Board(), s1, s2, observer=observer)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            step_counter += g.step_counter
            explo_counter += g.exploration_counter
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' %
              (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" %
              (diff.total_seconds(), diff.total_seconds() / episodes))

        observer.save('./brain1.npz')
Example #6
    def sim_once(self, s0):
        s = copy.deepcopy(s0)
        node = self._root
        while True:
            legal_states, who, legal_moves = Game.possible_moves(s)
            if len(legal_states) == 0:
                return None, None

            if node.is_leaf():
                return node, s
            else:
                move, node = node.select()
                s = self.make_a_move(s, move, who)
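sim_once() above relies on just two node operations: is_leaf() and a select() that returns a (move, child) pair. A purely illustrative UCB1-style node that satisfies this interface (the real tree node used by the project may be implemented differently):

import math

class UCBNode:
    """Illustrative MCTS node: tracks visit counts and value, and selects children by UCB1."""

    def __init__(self, parent=None):
        self.parent = parent
        self.children = {}          # move -> UCBNode
        self.n_visits = 0
        self.total_value = 0.0

    def is_leaf(self):
        return len(self.children) == 0

    def select(self, c=1.4):
        # Return the (move, child) pair with the highest UCB1 score.
        def ucb(child):
            if child.n_visits == 0:
                return float('inf')
            exploit = child.total_value / child.n_visits
            explore = c * math.sqrt(math.log(self.n_visits + 1) / child.n_visits)
            return exploit + explore
        return max(self.children.items(), key=lambda kv: ucb(kv[1]))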
Example #7
    def learn_from_2_teachers(self):
        s1 = StrategyMinMax()
        s1.stand_for = Board.STONE_BLACK
        self.strategy_1 = s1

        s2 = StrategyMinMax()
        s2.stand_for = Board.STONE_WHITE
        self.strategy_2 = s2

        observer = StrategyMC()

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 10000
        for i in range(episodes):
            g = Game(Board(), s1, s2, observer=observer)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            step_counter += g.step_counter
            explo_counter += g.exploration_counter
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

        observer.save('./brain1.npz')
Example #8
    def _evaluate_rollout(self, state, limit):
        # _, player, legal_moves = Game.possible_moves(state)
        winner = 0

#         old_board = Board()
#         old_board.stones = state
        player = None
        for i in range(limit):
            legal_states, p, legal_moves = Game.possible_moves(state)
            if player is None:
                player = p
            if len(legal_states) == 0:
                break

            probs = self._rollout(state, legal_moves)
            mask = np.full_like(probs, -0.01)
            mask[:, legal_moves] = probs[:, legal_moves]
            probs = mask

            best_move = np.argmax(probs, 1)[0]

            idx = np.where(legal_moves == best_move)[0]
#             if idx.size == 0:
#                 print(i, idx)
#                 print(best_move)
#                 print(probs.shape)
#                 print(legal_moves)
#                 print(probs)
            assert idx.size == 1
            idx = idx[0]
            st1 = legal_states[idx]

            over, winner, last_loc = st1.is_over(state)
            if over:
                break

            state = st1
        else:
            # If no break from the loop, issue a warning.
            print("WARNING: rollout reached move limit")

        if winner == 0:
            return 0
        else:
            return 1 if winner == player else -1
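The masking step in _evaluate_rollout keeps the rollout policy's probabilities only at legal moves and fills every other point with a small negative constant, so np.argmax can never land on an occupied point. A tiny standalone demonstration with made-up numbers:

import numpy as np

probs = np.array([[0.1, 0.4, 0.3, 0.2]])    # fake policy output over 4 board points, shape (1, 4)
legal_moves = np.array([0, 2])               # only points 0 and 2 are still empty

mask = np.full_like(probs, -0.01)            # illegal points get a score argmax will never pick
mask[:, legal_moves] = probs[:, legal_moves]
best_move = np.argmax(mask, 1)[0]
print(best_move)                             # -> 2, the legal move with the highest probability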
Example #9
    def _evaluate_rollout(self, state, limit):
        # _, player, legal_moves = Game.possible_moves(state)
        winner = 0

        #         old_board = Board()
        #         old_board.stones = state
        player = None
        for i in range(limit):
            legal_states, p, legal_moves = Game.possible_moves(state)
            if player is None:
                player = p
            if len(legal_states) == 0:
                break

            probs = self._rollout(state, legal_moves)
            mask = np.full_like(probs, -0.01)
            mask[:, legal_moves] = probs[:, legal_moves]
            probs = mask

            best_move = np.argmax(probs, 1)[0]

            idx = np.where(legal_moves == best_move)[0]
            #             if idx.size == 0:
            #                 print(i, idx)
            #                 print(best_move)
            #                 print(probs.shape)
            #                 print(legal_moves)
            #                 print(probs)
            assert idx.size == 1
            idx = idx[0]
            st1 = legal_states[idx]

            over, winner, last_loc = st1.is_over(state)
            if over:
                break

            state = st1
        else:
            # If no break from the loop, issue a warning.
            print("WARNING: rollout reached move limit")

        if winner == 0:
            return 0
        else:
            return 1 if winner == player else -1
Example #10
    def _playout(self, state, leaf_depth):
        start_time = time.time()
        node = self._root

        print('exploit')
        for i in range(leaf_depth):
            #             print()
            legal_states, _, legal_moves = Game.possible_moves(state)
            #             print(state)
            #             print(legal_moves)
            #             print('depth:', i, 'legal moves:', legal_moves.shape)

            if len(legal_states) == 0:
                break
            if node.is_leaf():
                action_probs = self._policy(state)
                if len(action_probs) == 0:
                    break
#                 print('num of action-prob:', len(action_probs))
                node.expand(action_probs)

#             print('num of children:', len(node._children))
            best_move, node = node.select()
            idx = np.where(legal_moves == best_move)[0]
            if idx.size == 0:
                print('depth:', i, idx)
                print('best move:', best_move)
                #                 print(legal_moves)
                p = node.parent
                for a, s1 in p.children.items():
                    print('  ', a, s1.get_value())

            assert idx.size == 1
            state = legal_states[idx[0]]

#         duration = time.time() - start_time
#         print('time cost:', duration)
        print('rollout...')
        v = self._value(state) if self._lmbda < 1 else 0
        z = self._evaluate_rollout(
            state, self._rollout_limit) if self._lmbda > 0 else 0
        leaf_value = (1 - self._lmbda) * v + self._lmbda * z

        node.update_recursive(leaf_value, self._c_puct)
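The leaf evaluation in _playout blends a value-network estimate v with a rollout outcome z through a single weight lambda (an AlphaGo-style mix: lambda = 0 trusts only the value network, lambda = 1 trusts only rollouts). A standalone sketch of the same formula:

def mix_leaf_value(v, z, lmbda):
    """Blend value-network estimate v and rollout result z, as in _playout above."""
    return (1.0 - lmbda) * v + lmbda * z

# With lmbda = 0.5 both sources are weighed equally; skipping v or z when lmbda is 1 or 0
# in _playout is just an optimization that avoids computing the unused term.
print(mix_leaf_value(v=0.2, z=1.0, lmbda=0.5))   # -> 0.6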
Example #11
    def vs_human(self, which_side_human_play):
        strategy = self.which_one(Board.oppo(which_side_human_play))
        if strategy is None or isinstance(strategy, StrategyRand):
            strategy = self.which_one(which_side_human_play)
        if strategy is None:
            print('without opponent')
            return

        old_is_learning, old_stand_for = strategy.is_learning, strategy.stand_for
        strategy.is_learning, strategy.stand_for = False, Board.oppo(which_side_human_play)

        s1 = strategy
        s2 = StrategyHuman()
        s2.stand_for = which_side_human_play

        self.game = Game(Board(), s1, s2, self.msg_queue)
        self.game.step_to_end()

        strategy.is_learning, strategy.stand_for = old_is_learning, old_stand_for
Example #12
    def _playout(self, state, leaf_depth):
        # start_time = time.time()
        node = self._root

        print('exploit')
        for i in range(leaf_depth):
            legal_states, _, legal_moves = Game.possible_moves(state)
#             print(state)
#             print(legal_moves)
#             print('depth:', i, 'legal moves:', legal_moves.shape)

            if len(legal_states) == 0:
                break
            if node.is_leaf():
                action_probs = self._policy(state)
                if len(action_probs) == 0:
                    break
#                 print('num of action-prob:', len(action_probs))
                node.expand(action_probs)

#             print('num of children:', len(node._children))
            best_move, node = node.select()
            idx = np.where(legal_moves == best_move)[0]
            if idx.size == 0:
                print('depth:', i, idx)
                print('best move:', best_move)
#                 print(legal_moves)
                p = node.parent
                for a, s1 in p.children.items():
                    print('  ', a, s1.get_value())

            assert idx.size == 1
            state = legal_states[idx[0]]

#         duration = time.time() - start_time
#         print('time cost:', duration)
        print('rollout...')
        v = self._value(state) if self._lmbda < 1 else 0
        z = self._evaluate_rollout(state, self._rollout_limit) if self._lmbda > 0 else 0
        leaf_value = (1 - self._lmbda) * v + self._lmbda * z

        node.update_recursive(leaf_value, self._c_puct)
Example #13
    def get_input_values(self, board):
        '''
        Returns:
        -----------
        vector: numpy.1darray
            the input vector
        '''
#         print('boar.stone shape: ' + str(board.stones.shape))
        v = board.stones
#         print('vectorized board shape: ' + str(v.shape))

#         print('b[%d], w[%d]' % (black, white))
        iv = np.zeros(v.shape[0] * 2 + 3)
        iv[0] = 1.
        iv[1:v.shape[0] + 1] = (v == Board.STONE_BLACK).astype(int)
        iv[v.shape[0] + 1:v.shape[0] * 2 + 1] = (v == Board.STONE_WHITE).astype(int)
        who = Game.whose_turn_now(board)
        iv[-2] = 1 if who == Board.STONE_BLACK else 0  # turn to black move
        iv[-1] = 1 if who == Board.STONE_WHITE else 0  # turn to white move
#         print(iv.shape)
#         print(iv)
        return iv
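get_input_values lays the feature vector out as one bias element, a black-stone plane, a white-stone plane, and two who-moves-next flags, i.e. 2*N + 3 entries for a board of N points. A small illustration on a toy 3-point board (the stone constants 1 and 2 are assumptions chosen only to make the equality tests above work):

import numpy as np

STONE_BLACK, STONE_WHITE = 1, 2          # assumed encodings for this illustration
v = np.array([1, 0, 2])                  # toy board: black at 0, empty at 1, white at 2

iv = np.zeros(v.shape[0] * 2 + 3)
iv[0] = 1.                                                               # bias
iv[1:v.shape[0] + 1] = (v == STONE_BLACK).astype(int)                    # black plane
iv[v.shape[0] + 1:v.shape[0] * 2 + 1] = (v == STONE_WHITE).astype(int)   # white plane
iv[-2] = 0                               # 1 if it is black's turn
iv[-1] = 1                               # 1 if it is white's turn
print(iv)                                # [1. 1. 0. 0. 0. 0. 1. 0. 1.]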
Example #14
    def sim(self, board):
        visited_path = []
        state = board
        winner = Board.STONE_EMPTY
        for _ in range(1, self.max_moves + 1):
            moves, player, _ = Game.possible_moves(state)
            state_new, state_new_val = self.get_best(state, moves, player)
            visited_path.append((player, state, state_new, state_new_val))
            over, winner, _ = state_new.is_over(state)
            if over:
                break
            state = state_new

        self.total_sim += 1

        ds = SupervisedDataSet(self.features_num, 2)
        for player, state, new, val in visited_path:
            plays = val[1] * self.total_sim + 1
            wins = val[0] * self.total_sim
            if player == winner:
                wins += 1
            ds.addSample(self.get_input_values(state, new, player), (wins, plays))
        self.trainer.trainOnDataset(ds)
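sim() records each visited position with a (wins, plays) target in a PyBrain SupervisedDataSet and then trains on the whole batch. A minimal, self-contained sketch of that dataset usage (the 2-dimensional input is just a stand-in for the real feature vector):

from pybrain.datasets import SupervisedDataSet

# Inputs are 2-dimensional here; targets are the (wins, plays) pair, as in sim() above.
ds = SupervisedDataSet(2, 2)
ds.addSample((0.0, 1.0), (3, 4))   # e.g. 3 wins out of 4 recorded plays through this state
print(len(ds))                     # -> 1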
Example #15
File: mcts.py Project: FuxiCV/ml-five
    def sim(self, board):
        visited_path = []
        state = board
        winner = Board.STONE_EMPTY
        for _ in range(1, self.max_moves + 1):
            moves, player = Game.possible_moves(state)
            state_new, state_new_val = self.get_best(state, moves, player)
            visited_path.append((player, state, state_new, state_new_val))
            over, winner, _ = state_new.is_over(state)
            if over:
                break
            state = state_new

        self.total_sim += 1

        ds = SupervisedDataSet(self.features_num, 2)
        for player, state, new, val in visited_path:
            plays = val[1] * self.total_sim + 1
            wins = val[0] * self.total_sim
            if player == winner:
                wins += 1
            ds.addSample(self.get_input_values(state, new, player),
                         (wins, plays))
        self.trainer.trainOnDataset(ds)
Example #16
    def measure_perf(self, s1, s2):
        old_epsilon1, old_is_learning1, old_stand_for1 = s1.epsilon, s1.is_learning, s1.stand_for
#         old_epsilon2, old_is_learning2, old_stand_for2 = s2.epsilon, s2.is_learning, s2.stand_for
        old_is_learning2, old_stand_for2 = s2.is_learning, s2.stand_for
        s1.epsilon, s1.is_learning, s1.stand_for = 0, False, Board.STONE_BLACK
#         s2.epsilon, s2.is_learning, s2.stand_for = 0, False, Board.STONE_WHITE
        s2.is_learning, s2.stand_for = False, Board.STONE_WHITE

        s3 = StrategyRand()

        probs = [0, 0, 0, 0, 0, 0]
        games = 3  # 30
        for i in range(games):
            # the learner s1 move first(use black)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[1] += 1

            # the learner s1 move second(use white)
            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[2] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[3] += 1

            # the learner s1 move first vs. random opponent
            s1.stand_for = Board.STONE_BLACK
            s3.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[4] += 1

            # the learner s1 move second vs. random opponent
            s1.stand_for = Board.STONE_WHITE
            s3.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[5] += 1

        probs = [i / games for i in probs]
        print(probs)

        s1.epsilon, s1.is_learning, s1.stand_for = old_epsilon1, old_is_learning1, old_stand_for1
#         s2.epsilon, s2.is_learning, s2.stand_for = old_epsilon2, old_is_learning2, old_stand_for2
        s2.is_learning, s2.stand_for = old_is_learning2, old_stand_for2
        return probs
Example #17
    def reinforce(self, resume=True):
        self.oppo_pool = self.get_mindsets(RL_BRAIN_DIR, FILE_PREFIX)

        part_vars = True
        if resume and len(self.oppo_pool) != 0:
            file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
            part_vars = False
        else:
            file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
            part_vars = True
        s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True, from_file=file, part_vars=part_vars)
        print('I was born from', file)

        if len(self.oppo_pool) != 0:
            file = random.choice(self.oppo_pool)
            file = os.path.join(RL_BRAIN_DIR, file)
            part_vars = False
        else:
            file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
            part_vars = True
        s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=part_vars)
        print('vs.', file)

        stat = []

#         n_lose = 0
        iter_n = 100
        for i in range(iter_n):
            print('iter:', i)
            win1, win2, draw = 0, 0, 0
            step_counter, explo_counter = 0, 0
            episodes = cfg.REINFORCE_PERIOD
            for _ in range(episodes):
                s1.stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
                s2.stand_for = Board.oppo(s1.stand_for)

                g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
                g.step_to_end()
                win1 += 1 if g.winner == s1.stand_for else 0
                win2 += 1 if g.winner == s2.stand_for else 0
                draw += 1 if g.winner == Board.STONE_EMPTY else 0
#                 print('winner: {:d}, stand for: {:d}'.format(g.winner, s1.stand_for))
                s1.win_ratio = win1 / win2 if win2 != 0 else 1.
                step_counter += g.step_counter
                explo_counter += g.exploration_counter

            if s1.win_ratio > 1.1:
                file = FILE_PREFIX + '-' + str(i)
                s1.mind_clone(os.path.join(RL_BRAIN_DIR, FILE_PREFIX), i)
                self.oppo_pool.append(file)
                file = random.choice(self.oppo_pool)
                file = os.path.join(RL_BRAIN_DIR, file)
                s2.close()
                s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=False)
                print('vs.', file)
#                 n_lose = 0
#             elif win1 < win2:
#                 n_lose += 1
#             if n_lose >= 50:
#                 break

            if i % 1 == 0 or i + 1 == iter_n:
                total = win1 + win2 + draw
                win1_r = win1 / total
                win2_r = win2 / total
                draw_r = draw / total
                print("iter:%d, win: %.3f, lose: %.3f, draw: %.3f, t: %.3f" % (i, win1_r, win2_r, draw_r, s1.temperature))
                stat.append([win1_r, win2_r, draw_r])
                print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

            if i % 10 == 0 or i + 1 == iter_n:
                np.savez(STAT_FILE, stat=np.array(stat))

        print('rl done. you can try it.')
        self.strategy_1 = self.strategy_2 = s1
Example #18
class Gui(object):
    STATE_IDLE = 0
    STATE_TRAINING = 1
    STATE_PLAY = 2
    RESULT_MSG = {Board.STONE_BLACK: 'Black Win',
                  Board.STONE_WHITE: 'White Win',
                  Board.STONE_EMPTY: 'Draw'}

    def __init__(self):
        size = Board.BOARD_SIZE

        keymap = [k for k in plt.rcParams.keys() if k.startswith('keymap.')]
        for k in keymap:
            plt.rcParams[k] = ''

        self.fig = plt.figure(figsize=((size + 1) / 2.54, (size + 1) / 2.54), facecolor='#FFE991')
        self.fig.canvas.set_window_title('Training')
        span = 1. / (size + 1)
        self.ax = self.fig.add_axes((span, span, (size - 1) * span, (size - 1) * span),
                                    aspect='equal',
                                    axis_bgcolor='none',
                                    xticks=range(size),
                                    yticks=range(size),
                                    xticklabels=[chr(ord('A') + i) for i in range(size)],
                                    yticklabels=range(1, 1 + size))
        self.ax.grid(color='k', linestyle='-', linewidth=1)
        self.ax.set_title('press T for training')

        self.black_stone = patches.Circle((0, 0), .45,
                                          facecolor='#131814', edgecolor=(.8, .8, .8, 1),
                                          linewidth=2, clip_on=False, zorder=10)
        self.white_stone = copy.copy(self.black_stone)
        self.white_stone.set_facecolor('#FCF5F4')
        self.white_stone.set_edgecolor((.5, .5, .5))

        self.fig.canvas.mpl_connect('key_press_event', self._key_press)
        self.fig.canvas.mpl_connect('close_event', self._handle_close)
#         self.fig.canvas.mpl_connect('button_press_event', self._button_press)

        self.state = Gui.STATE_IDLE
        self.strategy_1 = None
        self.strategy_2 = None
        self.game = None
        self.all_stones = []
        self.oppo_pool = []
        self.msg_queue = queue.Queue(maxsize=100)

        self.timer = self.fig.canvas.new_timer(interval=50)
        self.timer.add_callback(self.on_update)
        self.timer.start()

        plt.show()

    def _handle_close(self, event):
        if self.strategy_1 is not None:
            self.strategy_1.close()
        if self.strategy_2 is not None:
            self.strategy_2.close()

    def _key_press(self, event):
        # print('press', event.key)
        if event.key == '0':
            # clear
            pass
        elif event.key == 'e':
            # edit mode
            pass
        elif event.key == '1':
            self.strategy_1 = StrategyTD(1, 1)
            self.strategy_1.load('./brain1.npz')
            self.strategy_1.stand_for = Board.STONE_BLACK
        elif event.key == '2':
            self.strategy_2 = StrategyTD(1, 1)
            self.strategy_2.load('./brain2.npz')
            self.strategy_2.stand_for = Board.STONE_WHITE
        elif event.key == '3':
            self.strategy_1.save('./brain1.npz')
            self.strategy_2.save('./brain2.npz')
        elif event.key == '4':
            self.strategy_1 = StrategyMC()
            self.strategy_1.load('./brain1.npz')
            self.strategy_1.stand_for = Board.STONE_BLACK
        elif event.key == '5':
            self.strategy_2 = StrategyMC()
            self.strategy_2.load('./brain2.npz')
            self.strategy_2.stand_for = Board.STONE_WHITE
        elif event.key == 't':
            self.state = Gui.STATE_TRAINING
            Game.on_training = True
            s1, s2 = self.init_both_sides()
            self.train1(s1, s2)  # god view
        elif event.key == 'r':
            self.learn_from_2_teachers()
        elif event.key == 'f2':
            self.state = Gui.STATE_PLAY
            Game.on_training = False
            self.vs_human(Board.STONE_BLACK)
        elif event.key == 'f3':
            self.state = Gui.STATE_PLAY
            Game.on_training = False
            self.vs_human(Board.STONE_WHITE)
        elif event.key == 'f1':
            pass
        elif event.key == 'm':
            self.match()
        elif event.key == 'f4':
            self.reinforce()
        elif event.key == 'f5':
            self.join_net_match()
        elif event.key == 'f12':
            plt.pause(600)

    def _button_press(self, event):
        if self.state != Gui.STATE_PLAY:
            return
        if not self.game.wait_human:
            return
        if (event.xdata is None) or (event.ydata is None):
            return
        i, j = map(round, (event.xdata, event.ydata))
#         print('click at(%d, %d)' % (i, j))

    def which_one(self, which_side):
        if self.strategy_1 is not None and self.strategy_1.stand_for == which_side:
            return self.strategy_1
        elif self.strategy_2 is not None and self.strategy_2.stand_for == which_side:
            return self.strategy_2
        return None

    def vs_human(self, which_side_human_play):
        strategy = self.which_one(Board.oppo(which_side_human_play))
        if strategy is None or isinstance(strategy, StrategyRand):
            strategy = self.which_one(which_side_human_play)
        if strategy is None:
            print('without opponent')
            return

        old_is_learning, old_stand_for = strategy.is_learning, strategy.stand_for
        strategy.is_learning, strategy.stand_for = False, Board.oppo(which_side_human_play)

        s1 = strategy
        s2 = StrategyHuman()
        s2.stand_for = which_side_human_play

        self.game = Game(Board(), s1, s2, self.msg_queue)
        self.game.step_to_end()

        strategy.is_learning, strategy.stand_for = old_is_learning, old_stand_for

    def clear_board(self):
        print('\nclear board\n')
        for s in self.all_stones:
            s.remove()
        self.all_stones.clear()

    def show(self, who, loc):
        i, j = divmod(loc, Board.BOARD_SIZE)
        s = None
        if who == Board.STONE_BLACK:
            s = copy.copy(self.black_stone)
        elif who == Board.STONE_WHITE:
            s = copy.copy(self.white_stone)
        s.center = (i, j)
        self.all_stones.append(s)
        self.ax.add_patch(s)

    def measure_perf(self, s1, s2):
        old_epsilon1, old_is_learning1, old_stand_for1 = s1.epsilon, s1.is_learning, s1.stand_for
#         old_epsilon2, old_is_learning2, old_stand_for2 = s2.epsilon, s2.is_learning, s2.stand_for
        old_is_learning2, old_stand_for2 = s2.is_learning, s2.stand_for
        s1.epsilon, s1.is_learning, s1.stand_for = 0, False, Board.STONE_BLACK
#         s2.epsilon, s2.is_learning, s2.stand_for = 0, False, Board.STONE_WHITE
        s2.is_learning, s2.stand_for = False, Board.STONE_WHITE

        s3 = StrategyRand()

        probs = [0, 0, 0, 0, 0, 0]
        games = 3  # 30
        for i in range(games):
            # the learner s1 move first(use black)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[1] += 1

            # the learner s1 move second(use white)
            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[2] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[3] += 1

            # the learner s1 move first vs. random opponent
            s1.stand_for = Board.STONE_BLACK
            s3.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[4] += 1

            # the learner s1 move second vs. random opponent
            s1.stand_for = Board.STONE_WHITE
            s3.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[5] += 1

        probs = [i / games for i in probs]
        print(probs)

        s1.epsilon, s1.is_learning, s1.stand_for = old_epsilon1, old_is_learning1, old_stand_for1
#         s2.epsilon, s2.is_learning, s2.stand_for = old_epsilon2, old_is_learning2, old_stand_for2
        s2.is_learning, s2.stand_for = old_is_learning2, old_stand_for2
        return probs

    def draw_perf(self, perf):
        series = ['black win', 'black draw', 'white win', 'white draw', 'PvR 1st', 'PvR 2nd']
        colors = ['r', 'b', 'g', 'c', 'm', 'y']
        plt.figure()
        axes = plt.gca()
        axes.set_ylim([-0.1, 1.1])
        for i in range(1, len(perf)):
            plt.plot(perf[0], perf[i], label=series[i - 1], color=colors[i - 1])
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
        plt.show()
#         plt.savefig('selfplay_random_{0}loss.png'.format(p1.lossval))

        plt.figure(self.fig.number)

    def init_both_sides(self):
        # feat = Board.BOARD_SIZE_SQ * 2 + 2

        # if self.strategy_1 is None:
        #     s1 = StrategyTD(feat, feat * 2)
        #     s1.stand_for = Board.STONE_BLACK
        #     s1.alpha = 0.3
        #     s1.beta = 0.3
        #     s1.lambdaa = 0.05
        #     s1.epsilon = 0.3
        #     self.strategy_1 = s1
        # else:
        #     s1 = self.strategy_1
        #     s1.epsilon = 0.3

        if self.strategy_1 is None:
            # s1 = StrategyMC()
            # s1 = StrategyANN(feat, feat * 2)
            file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
            s1 = StrategyDNN(from_file=file, part_vars=True)
            # s1 = StrategyMCTS1()
            self.strategy_1 = s1
        else:
            s1 = self.strategy_1

        s1.is_learning = True
        s1.stand_for = Board.STONE_BLACK


#         if self.strategy_2 is None:
#             s2 = StrategyTD(feat, feat * 2)
#             s2.stand_for = Board.STONE_WHITE
#             self.strategy_2 = s2
#         else:
#             s2 = self.strategy_2
#             s2.is_learning = False
        s2 = StrategyRand()

#         s2 = StrategyMinMax()
        s2.stand_for = Board.STONE_WHITE
        self.strategy_2 = s2

        return s1, s2

    def match(self):
        s1, s2 = self.strategy_1, self.strategy_2
        print('player1:', s1.__class__.__name__)
        print('player2:', s2.__class__.__name__)

        probs = np.zeros(6)
        games = 100  # 30
        for i in range(games):
            print(i)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_WHITE:
                probs[1] += 1
            else:
                probs[2] += 1

            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[3] += 1
            elif g.winner == Board.STONE_BLACK:
                probs[4] += 1
            else:
                probs[5] += 1

        print('total play:', games)
        print(probs)

    def train1(self, s1, s2):
        '''Run one round of training episodes.

        Returns:
        ------------
        winner : Strategy
            the winning strategy
        win_prob : float
            the winner's share of the played games
        '''

        max_explore_rate = 0.95

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 1
        # samples = 100
        # interval = episodes // samples
        # perf = [[] for _ in range(7)]
        learner = s1 if s1.is_learning else s2
        # oppo = self.which_one(Board.oppo(learner.stand_for))
        stat_win = []
        # past_me = learner.mind_clone()
        for i in range(episodes):
            # if (i + 1) % interval == 0:
            #     print(np.allclose(s1.hidden_weights, past_me.hidden_weights))
            #     probs = self.measure_perf(learner, oppo)
            #     perf[0].append(i)
            #     for idx, x in enumerate(probs):
            #         perf[idx + 1].append(x)

            learner.epsilon = max_explore_rate * np.exp(-5 * i / episodes)  # * (1 if i < episodes//2 else 0.3) #
            g = Game(Board(), s1, s2)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            stat_win.append(win1 - win2 - draw)
#             rec.append(win1)
            step_counter += g.step_counter
            explo_counter += g.exploration_counter
#             print('steps[%d], explos[%d]' % (g.step_counter, g.exploration_counter))
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

        # with open('stat-result-win.txt', 'w') as f:
        #     f.write(repr(stat_win))
#         print(perf)
#         self.draw_perf(perf)

#         np.set_printoptions(threshold=np.nan, formatter={'float_kind' : lambda x: "%.4f" % x})
#         with open('stat-result-net-train-errors.txt', 'w') as f:
#             f.write(repr(np.array(s1.errors)))

        winner = Board.STONE_BLACK if win1 >= win2 else Board.STONE_WHITE
        return self.which_one(winner), max(win1, win2) / total
        # plt.title('press F3 start')
#         print(len(rec))
#         plt.plot(rec)

    def learn_from_2_teachers(self):
        s1 = StrategyMinMax()
        s1.stand_for = Board.STONE_BLACK
        self.strategy_1 = s1

        s2 = StrategyMinMax()
        s2.stand_for = Board.STONE_WHITE
        self.strategy_2 = s2

        observer = StrategyMC()

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 10000
        for i in range(episodes):
            g = Game(Board(), s1, s2, observer=observer)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            step_counter += g.step_counter
            explo_counter += g.exploration_counter
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

        observer.save('./brain1.npz')

    def from_new_start_point(self, winner, s1, s2):
        '''
        Returns:
        ------------
        s1 : Strategy
            the learner
        s2 : Strategy
            the teacher
        '''
        if s1 == winner:
            s2 = s1.mind_clone()
        if s2 == winner:
            s1 = s2.mind_clone()

        # way 1: s1 follow the winner's stand-for
            s1.stand_for = winner.stand_for
        # way 2: s1 switch to another stand-for of winner
#             s1.stand_for = Board.oppo(winner.stand_for)
        # way 3: s1 random select stand-for
#             s1.stand_for = np.random.choice(np.array([Board.STONE_BLACK, Board.STONE_WHITE]))
        s2.stand_for = Board.oppo(s1.stand_for)

        s1.is_learning = True
        s2.is_learning = False
        return s1, s2

    def train2(self):
        '''train many times
        '''
        s1, s2 = self.init_both_sides()

        win_probs = []
        begin = datetime.datetime.now()
        counter = 0
        while True:
            print('epoch...%d' % counter)

            winner, win_prob = self.train1(s1, s2)
            win_probs.append(win_prob)

            counter += 1
            if counter >= 10:
                break
            s1, s2 = self.from_new_start_point(winner, s1, s2)

        end = datetime.datetime.now()
        diff = end - begin
        print("total time cost[%f] hour" % (diff.total_seconds() / 3600))

        print('win probs: ', win_probs)

        plt.title('press F3 start')


    def reinforce(self, resume=True):
        self.oppo_pool = self.get_mindsets(RL_BRAIN_DIR, FILE_PREFIX)

        part_vars = True
        if resume and len(self.oppo_pool) != 0:
            file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
            part_vars = False
        else:
            file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
            part_vars = True
        s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True, from_file=file, part_vars=part_vars)
        print('I was born from', file)

        if len(self.oppo_pool) != 0:
            file = random.choice(self.oppo_pool)
            file = os.path.join(RL_BRAIN_DIR, file)
            part_vars = False
        else:
            file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
            part_vars = True
        s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=part_vars)
        print('vs.', file)

        stat = []

#         n_lose = 0
        iter_n = 100
        for i in range(iter_n):
            print('iter:', i)
            win1, win2, draw = 0, 0, 0
            step_counter, explo_counter = 0, 0
            episodes = cfg.REINFORCE_PERIOD
            for _ in range(episodes):
                s1.stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
                s2.stand_for = Board.oppo(s1.stand_for)

                g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
                g.step_to_end()
                win1 += 1 if g.winner == s1.stand_for else 0
                win2 += 1 if g.winner == s2.stand_for else 0
                draw += 1 if g.winner == Board.STONE_EMPTY else 0
#                 print('winner: {:d}, stand for: {:d}'.format(g.winner, s1.stand_for))
                s1.win_ratio = win1 / win2 if win2 != 0 else 1.
                step_counter += g.step_counter
                explo_counter += g.exploration_counter

            if s1.win_ratio > 1.1:
                file = FILE_PREFIX + '-' + str(i)
                s1.mind_clone(os.path.join(RL_BRAIN_DIR, FILE_PREFIX), i)
                self.oppo_pool.append(file)
                file = random.choice(self.oppo_pool)
                file = os.path.join(RL_BRAIN_DIR, file)
                s2.close()
                s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=False)
                print('vs.', file)
#                 n_lose = 0
#             elif win1 < win2:
#                 n_lose += 1
#             if n_lose >= 50:
#                 break

            if i % 1 == 0 or i + 1 == iter_n:
                total = win1 + win2 + draw
                win1_r = win1 / total
                win2_r = win2 / total
                draw_r = draw / total
                print("iter:%d, win: %.3f, lose: %.3f, draw: %.3f, t: %.3f" % (i, win1_r, win2_r, draw_r, s1.temperature))
                stat.append([win1_r, win2_r, draw_r])
                print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

            if i % 10 == 0 or i + 1 == iter_n:
                np.savez(STAT_FILE, stat=np.array(stat))

        print('rl done. you can try it.')
        self.strategy_1 = self.strategy_2 = s1

    def get_mindsets(self, folder, prefix):
        mindsets = set()
        pattern = os.path.join(folder, prefix) + '*'
        listing = glob.glob(pattern)
        for f in listing:
            mindsets.add(os.path.splitext(os.path.basename(f))[0])
        return list(mindsets)

    def on_update(self):
        i = 0
        redraw = False
        while True:
            msg = None
            try:
                msg = self.msg_queue.get_nowait()
            except queue.Empty:
                break
            if msg is None:
                break

#             print(msg[0], ' ', msg[1] if len(msg) > 1 else '')
            if msg[0] == 'start':
                self.clear_board()
                redraw = True
            elif msg[0] == 'move':
                self.show(msg[1], msg[2])
                redraw = True
            elif msg[0] == 'end':
                self.ax.set_title(Gui.RESULT_MSG[msg[1]])
                redraw = True

            self.msg_queue.task_done()
            i += 1
            if i >= 5:  # max msg num each time deal with
                break

        if redraw:
            self.fig.canvas.draw()

    def join_net_match(self):
        net_t = Thread(target=net, args=(self.msg_queue,), daemon=True)
        net_t.start()
Example #19
    def measure_perf(self, s1, s2):
        old_epsilon1, old_is_learning1, old_stand_for1 = s1.epsilon, s1.is_learning, s1.stand_for
#         old_epsilon2, old_is_learning2, old_stand_for2 = s2.epsilon, s2.is_learning, s2.stand_for
        old_is_learning2, old_stand_for2 = s2.is_learning, s2.stand_for
        s1.epsilon, s1.is_learning, s1.stand_for = 0, False, Board.STONE_BLACK
#         s2.epsilon, s2.is_learning, s2.stand_for = 0, False, Board.STONE_WHITE
        s2.is_learning, s2.stand_for = False, Board.STONE_WHITE

        s3 = StrategyRand()

        probs = [0, 0, 0, 0, 0, 0]
        games = 3  # 30
        for i in range(games):
            # the learner s1 move first(use black)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[1] += 1

            # the learner s1 move second(use white)
            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[2] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[3] += 1

            # the learner s1 move first vs. random opponent
            s1.stand_for = Board.STONE_BLACK
            s3.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[4] += 1

            # the learner s1 move second vs. random opponent
            s1.stand_for = Board.STONE_WHITE
            s3.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[5] += 1

        probs = [i / games for i in probs]
        print(probs)

        s1.epsilon, s1.is_learning, s1.stand_for = old_epsilon1, old_is_learning1, old_stand_for1
#         s2.epsilon, s2.is_learning, s2.stand_for = old_epsilon2, old_is_learning2, old_stand_for2
        s2.is_learning, s2.stand_for = old_is_learning2, old_stand_for2
        return probs
Example #20
    def _policy_fn(self, board):
        _, _, legal_moves = Game.possible_moves(board)
        state, _ = self.get_input_values(board.stones)
        probs = self.brain.get_move_probs(state)
        probs = probs[0, legal_moves]
        return list(zip(legal_moves, probs))
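_policy_fn pairs every legal move with its network probability; the resulting list of (move, prob) tuples is the shape that node.expand(action_probs) in the _playout examples consumes. A minimal standalone illustration with fabricated numbers:

import numpy as np

probs = np.array([[0.05, 0.7, 0.05, 0.2]])   # fake network output, shape (1, board_points)
legal_moves = np.array([1, 3])

action_probs = list(zip(legal_moves, probs[0, legal_moves]))
print(action_probs)                           # move/probability pairs: 1 -> 0.7, 3 -> 0.2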
Example #21
class Gui(object):
    STATE_IDLE = 0
    STATE_TRAINING = 1
    STATE_PLAY = 2
    RESULT_MSG = {Board.STONE_BLACK: 'Black Win',
                  Board.STONE_WHITE: 'White Win',
                  Board.STONE_EMPTY: 'Draw'}


    def __init__(self):
        size = Board.BOARD_SIZE

        keymap = [k for k in plt.rcParams.keys() if k.startswith('keymap.')]
        for k in keymap:
            plt.rcParams[k] = ''

        self.fig = plt.figure(figsize=((size + 1) / 2.54, (size + 1) / 2.54), facecolor='#FFE991')
        self.fig.canvas.set_window_title('Training')
        span = 1. / (size + 1)
        self.ax = self.fig.add_axes((span, span, (size - 1) * span, (size - 1) * span),
                                    aspect='equal',
                                    axis_bgcolor='none',
                                    xticks=range(size),
                                    yticks=range(size),
                                    xticklabels=[chr(ord('A') + i) for i in range(size)],
                                    yticklabels=range(1, 1 + size)
                                   )
        self.ax.grid(color='k', linestyle='-', linewidth=1)
        self.ax.set_title('press T for training')

        self.black_stone = patches.Circle((0, 0), .45,
                                          facecolor='#131814', edgecolor=(.8, .8, .8, 1),
                                          linewidth=2, clip_on=False, zorder=10)
        self.white_stone = copy.copy(self.black_stone)
        self.white_stone.set_facecolor('#FCF5F4')
        self.white_stone.set_edgecolor((.5, .5, .5))

        self.fig.canvas.mpl_connect('key_press_event', self._key_press)
        self.fig.canvas.mpl_connect('close_event', self._handle_close)
#         self.fig.canvas.mpl_connect('button_press_event', self._button_press)

        self.state = Gui.STATE_IDLE
        self.strategy_1 = None
        self.strategy_2 = None
        self.game = None
        self.all_stones = []
        self.oppo_pool = []
        self.msg_queue = Queue(maxsize=100)

        self.timer = self.fig.canvas.new_timer(interval=50)
        self.timer.add_callback(self.on_update)
        self.timer.start()

        plt.show()


    def _handle_close(self, event):
        if self.strategy_1 is not None:
            self.strategy_1.close()
        if self.strategy_2 is not None:
            self.strategy_2.close()

    def _key_press(self, event):
#         print('press', event.key)
        if event.key == '0':
            # clear
            pass
        elif event.key == 'e':
            # edit mode
            pass
        elif event.key == '1':
            self.strategy_1 = StrategyTD(1, 1)
            self.strategy_1.load('./brain1.npz')
            self.strategy_1.stand_for = Board.STONE_BLACK
        elif event.key == '2':
            self.strategy_2 = StrategyTD(1, 1)
            self.strategy_2.load('./brain2.npz')
            self.strategy_2.stand_for = Board.STONE_WHITE
        elif event.key == '3':
            self.strategy_1.save('./brain1.npz')
            self.strategy_2.save('./brain2.npz')
        elif event.key == '4':
            self.strategy_1 = StrategyMC()
            self.strategy_1.load('./brain1.npz')
            self.strategy_1.stand_for = Board.STONE_BLACK
        elif event.key == '5':
            self.strategy_2 = StrategyMC()
            self.strategy_2.load('./brain2.npz')
            self.strategy_2.stand_for = Board.STONE_WHITE
        elif event.key == 't':
            self.state = Gui.STATE_TRAINING
            Game.on_training = True
            s1, s2 = self.init_both_sides()
            self.train1(s1, s2)  # god view
        elif event.key == 'r':
            self.learn_from_2_teachers()
        elif event.key == 'f2':
            self.state = Gui.STATE_PLAY
            Game.on_training = False
            self.vs_human(Board.STONE_BLACK)
        elif event.key == 'f3':
            self.state = Gui.STATE_PLAY
            Game.on_training = False
            self.vs_human(Board.STONE_WHITE)
        elif event.key == 'f1':
            pass
        elif event.key == 'm':
            self.match()
        elif event.key == 'f4':
            self.reinforce()
        elif event.key == 'f5':
            self.join_net_match()
        elif event.key == 'f12':
            plt.pause(600)


    def _button_press(self, event):
        if self.state != Gui.STATE_PLAY:
            return
        if not self.game.wait_human:
            return
        if (event.xdata is None) or (event.ydata is None):
            return
        i, j = map(round, (event.xdata, event.ydata))
#         print('click at(%d, %d)' % (i, j))


    def which_one(self, which_side):
        if self.strategy_1 is not None and self.strategy_1.stand_for == which_side:
            return self.strategy_1
        elif self.strategy_2 is not None and self.strategy_2.stand_for == which_side:
            return self.strategy_2
        return None


    def vs_human(self, which_side_human_play):
        strategy = self.which_one(Board.oppo(which_side_human_play))
        if strategy is None or isinstance(strategy, StrategyRand):
            strategy = self.which_one(which_side_human_play)
        if strategy is None:
            print('without opponent')
            return

        old_is_learning, old_stand_for = strategy.is_learning, strategy.stand_for
        strategy.is_learning, strategy.stand_for = False, Board.oppo(which_side_human_play)

        s1 = strategy
        s2 = StrategyHuman()
        s2.stand_for = which_side_human_play

        self.game = Game(Board(), s1, s2, self.msg_queue)
        self.game.step_to_end()

        strategy.is_learning, strategy.stand_for = old_is_learning, old_stand_for


    def clear_board(self):
        print('\nclear board\n')
        for s in self.all_stones:
            s.remove()
        self.all_stones.clear()

    def show(self, who, loc):
        i, j = divmod(loc, Board.BOARD_SIZE)
        s = None
        if who == Board.STONE_BLACK:
            s = copy.copy(self.black_stone)
        elif who == Board.STONE_WHITE:
            s = copy.copy(self.white_stone)
        s.center = (i, j)
        self.all_stones.append(s)
        self.ax.add_patch(s)

    def measure_perf(self, s1, s2):
        old_epsilon1, old_is_learning1, old_stand_for1 = s1.epsilon, s1.is_learning, s1.stand_for
#         old_epsilon2, old_is_learning2, old_stand_for2 = s2.epsilon, s2.is_learning, s2.stand_for
        old_is_learning2, old_stand_for2 = s2.is_learning, s2.stand_for
        s1.epsilon, s1.is_learning, s1.stand_for = 0, False, Board.STONE_BLACK
#         s2.epsilon, s2.is_learning, s2.stand_for = 0, False, Board.STONE_WHITE
        s2.is_learning, s2.stand_for = False, Board.STONE_WHITE

        s3 = StrategyRand()

        probs = [0, 0, 0, 0, 0, 0]
        games = 3  # 30
        for i in range(games):
            # the learner s1 move first(use black)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[1] += 1

            # the learner s1 move second(use white)
            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[2] += 1
            elif g.winner == Board.STONE_EMPTY:
                probs[3] += 1

            # the learner s1 move first vs. random opponent
            s1.stand_for = Board.STONE_BLACK
            s3.stand_for = Board.STONE_WHITE
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[4] += 1

            # the learner s1 move second vs. random opponent
            s1.stand_for = Board.STONE_WHITE
            s3.stand_for = Board.STONE_BLACK
            g = Game(Board(), s1, s3)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[5] += 1

        probs = [i / games for i in probs]
        print(probs)

        s1.epsilon, s1.is_learning, s1.stand_for = old_epsilon1, old_is_learning1, old_stand_for1
#         s2.epsilon, s2.is_learning, s2.stand_for = old_epsilon2, old_is_learning2, old_stand_for2
        s2.is_learning, s2.stand_for = old_is_learning2, old_stand_for2
        return probs

    def draw_perf(self, perf):
        series = ['black win', 'black draw', 'white win', 'white draw', 'PvR 1st', 'PvR 2nd']
        colors = ['r', 'b', 'g', 'c', 'm', 'y']
        plt.figure()
        axes = plt.gca()
        axes.set_ylim([-0.1, 1.1])
        for i in range(1, len(perf)):
            plt.plot(perf[0], perf[i], label=series[i - 1], color=colors[i - 1])
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
        plt.show()
#         plt.savefig('selfplay_random_{0}loss.png'.format(p1.lossval))

        plt.figure(self.fig.number)

    def init_both_sides(self):
        feat = Board.BOARD_SIZE_SQ * 2 + 2

#         if self.strategy_1 is None:
#             s1 = StrategyTD(feat, feat * 2)
#             s1.stand_for = Board.STONE_BLACK
#     #         s1.alpha = 0.3
#     #         s1.beta = 0.3
#             s1.lambdaa = 0.05
#             s1.epsilon = 0.3
#             self.strategy_1 = s1
#         else:
#             s1 = self.strategy_1
#             s1.epsilon = 0.3

        if self.strategy_1 is None:
#             s1 = StrategyMC()
#             s1 = StrategyANN(feat, feat * 2)
            s1 = StrategyDNN()
            self.strategy_1 = s1
        else:
            s1 = self.strategy_1


        s1.is_learning = True
        s1.stand_for = Board.STONE_BLACK


#         if self.strategy_2 is None:
#             s2 = StrategyTD(feat, feat * 2)
#             s2.stand_for = Board.STONE_WHITE
#             self.strategy_2 = s2
#         else:
#             s2 = self.strategy_2
#             s2.is_learning = False
        s2 = StrategyRand()

#         s2 = StrategyMinMax()
        s2.stand_for = Board.STONE_WHITE
        self.strategy_2 = s2

        return s1, s2


    def match(self):
        s1, s2 = self.strategy_1, self.strategy_2
        print('player1:', s1.__class__.__name__)
        print('player2:', s2.__class__.__name__)

        probs = np.zeros(6)
        games = 100  # 30
        for i in range(games):
            print(i)
            s1.stand_for = Board.STONE_BLACK
            s2.stand_for = Board.STONE_WHITE
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_BLACK:
                probs[0] += 1
            elif g.winner == Board.STONE_WHITE:
                probs[1] += 1
            else:
                probs[2] += 1

            s1.stand_for = Board.STONE_WHITE
            s2.stand_for = Board.STONE_BLACK
            g = Game(Board.rand_generate_a_position(), s1, s2)
            g.step_to_end()
            if g.winner == Board.STONE_WHITE:
                probs[3] += 1
            elif g.winner == Board.STONE_BLACK:
                probs[4] += 1
            else:
                probs[5] += 1

        print('total play:', games)
        print(probs)


    def train1(self, s1, s2):
        '''Run one training pass of `episodes` games.
        Returns:
        ------------
        winner : Strategy
            the winning strategy
        win_rate : float
            the winner's fraction of the games played
        '''

        max_explore_rate = 0.95

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 1
        samples = 100
        interval = episodes // samples
        perf = [[] for _ in range(7)]
        learner = s1 if s1.is_learning else s2
        oppo = self.which_one(Board.oppo(learner.stand_for))
        stat_win = []
#         past_me = learner.mind_clone()
        for i in range(episodes):
#             if (i + 1) % interval == 0:
# #                 print(np.allclose(s1.hidden_weights, past_me.hidden_weights))
#                 probs = self.measure_perf(learner, oppo)
#                 perf[0].append(i)
#                 for idx, x in enumerate(probs):
#                     perf[idx + 1].append(x)

            learner.epsilon = max_explore_rate * np.exp(-5 * i / episodes)  # * (1 if i < episodes//2 else 0.3) #
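            # Exploration schedule: the epsilon set above starts at max_explore_rate
            # (0.95) and decays exponentially to roughly 0.95 * e^-5 ~= 0.006 by the
            # last episode, shifting the learner from exploration toward exploitation.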
            g = Game(Board(), s1, s2)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            stat_win.append(win1 - win2 - draw)
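            # Running score: cumulative black wins minus cumulative white wins and
            # draws, recorded once per episode.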
#             rec.append(win1)
            step_counter += g.step_counter
            explo_counter += g.exploration_counter
#             print('steps[%d], explos[%d]' % (g.step_counter, g.exploration_counter))
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

        with open('stat-result-win.txt', 'w') as f:
            f.write(repr(stat_win))
#         print(perf)
#         self.draw_perf(perf)

#         np.set_printoptions(threshold=np.nan, formatter={'float_kind' : lambda x: "%.4f" % x})
#         with open('stat-result-net-train-errors.txt', 'w') as f:
#             f.write(repr(np.array(s1.errors)))

        winner = Board.STONE_BLACK if win1 >= win2 else Board.STONE_WHITE
        return self.which_one(winner), max(win1, win2) / total
        # plt.title('press F3 start')
#         print(len(rec))
#         plt.plot(rec)


    def learn_from_2_teachers(self):
        s1 = StrategyMinMax()
        s1.stand_for = Board.STONE_BLACK
        self.strategy_1 = s1

        s2 = StrategyMinMax()
        s2.stand_for = Board.STONE_WHITE
        self.strategy_2 = s2

        observer = StrategyMC()
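        # The Monte-Carlo strategy never plays here; passed to Game as observer, it
        # presumably watches every move of the two MinMax teachers and learns from
        # the finished games (assumption about the observer hook in Game).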

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 10000
        for i in range(episodes):
            g = Game(Board(), s1, s2, observer=observer)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            step_counter += g.step_counter
            explo_counter += g.exploration_counter
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

        observer.save('./brain1.npz')


    def from_new_start_point(self, winner, s1, s2):
        '''Reset both sides from the winner before the next training round.
        Returns:
        ------------
        s1 : Strategy
            the learner
        s2 : Strategy
            the teacher
        '''
        if s1 == winner:
            s2 = s1.mind_clone()
        if s2 == winner:
            s1 = s2.mind_clone()
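        # Whichever side lost has just been overwritten with a clone of the winner's
        # mind, so the next round starts from two copies of the stronger policy.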

        # way 1: s1 follows the winner's stand-for
        s1.stand_for = winner.stand_for
        # way 2: s1 switches to the opposite of the winner's stand-for
#             s1.stand_for = Board.oppo(winner.stand_for)
        # way 3: s1 picks a stand-for at random
#             s1.stand_for = np.random.choice(np.array([Board.STONE_BLACK, Board.STONE_WHITE]))
        s2.stand_for = Board.oppo(s1.stand_for)

        s1.is_learning = True
        s2.is_learning = False
        return s1, s2


    def train2(self):
        '''Train over many rounds, restarting both sides from the winner after each round.'''
        s1, s2 = self.init_both_sides()


        win_probs = []
        begin = datetime.datetime.now()
        counter = 0
        while True:
            print('epoch...%d' % counter)

            winner, win_prob = self.train1(s1, s2)
            win_probs.append(win_prob)

            counter += 1
            if counter >= 10:
                break
            s1, s2 = self.from_new_start_point(winner, s1, s2)

        end = datetime.datetime.now()
        diff = end - begin
        print("total time cost[%f] hour" % (diff.total_seconds() / 3600))

        print('win probs: ', win_probs)

        plt.title('press F3 start')


    def reinforce(self):
        if len(self.oppo_pool) == 0:
            self.oppo_pool.append(StrategyDNN(is_train=False, is_revive=True, is_rl=False))

        s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True)
        s2 = random.choice(self.oppo_pool)
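        # Self-play setup: s1 is the RL learner and also observes its own games,
        # while s2 is drawn from a pool of earlier (frozen) snapshots. The
        # is_train/is_revive/is_rl constructor flags come from this project's
        # StrategyDNN; their exact semantics are assumed here.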

        stat = []
        win1, win2, draw = 0, 0, 0

        n_lose = 0
        iter_n = 10
        i = 0
        while True:
            print('iter:', i)

            for _ in range(1000):
                s1.stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
                s2.stand_for = Board.oppo(s1.stand_for)

                g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
                g.step_to_end()
                win1 += 1 if g.winner == s1.stand_for else 0
                win2 += 1 if g.winner == s2.stand_for else 0
                draw += 1 if g.winner == Board.STONE_EMPTY else 0

#             if win1 > win2:
#                 s1_c = s1.mind_clone()
#                 self.oppo_pool.append(s1_c)
#                 s2 = random.choice(self.oppo_pool)
#                 n_lose = 0
#                 print('stronger, oppos:', len(self.oppo_pool))
#             elif win1 < win2:
#                 n_lose += 1
# 
#             if n_lose >= 50:
#                 break

            if i % 1 == 0 or i + 1 == iter_n:  # i % 1 == 0 is always true, so stats are recorded every iteration
                total = win1 + win2 + draw
                win1_r = win1 / total
                win2_r = win2 / total
                draw_r = draw / total
                print("iter:%d, win: %.3f, loss: %.3f, tie: %.3f" % (i, win1_r, win2_r, draw_r))
                stat.append([win1_r, win2_r, draw_r])

            i += 1

            if i > iter_n:
                break

        stat = np.array(stat)
        print('stat. shape:', stat.shape)
        np.savez('/home/splendor/fusor/stat.npz', stat=np.array(stat))
        self.strategy_1 = self.strategy_2 = s1

    def on_update(self):
        i = 0
        redraw = False
        while True:
            msg = None
            try:
                msg = self.msg_queue.get_nowait()
            except queue.Empty:
                break
            if msg is None:
                break

#             print(msg[0], ' ', msg[1] if len(msg) > 1 else '')
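            # Message protocol (inferred from the handlers below): ('start',) clears
            # the board, ('move', who, loc) draws a stone, and ('end', result) shows
            # the matching entry of Gui.RESULT_MSG in the title.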
            if msg[0] == 'start':
                self.clear_board()
                redraw = True
            elif msg[0] == 'move':
                self.show(msg[1], msg[2])
                redraw = True
            elif msg[0] == 'end':
                self.ax.set_title(Gui.RESULT_MSG[msg[1]])
                redraw = True

            self.msg_queue.task_done()
            i += 1
            if i >= 5:  # cap on messages handled per update
                break

        if redraw:
            self.fig.canvas.draw()

    def join_net_match(self):
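        # Run the network-match client on a daemon thread; it is assumed to feed
        # ('start'/'move'/'end') messages into msg_queue, which on_update drains
        # on the GUI side.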
        net_t = Thread(target=net, args=(self.msg_queue,), daemon=True)
        net_t.start()
Example #22
0
    def train1(self, s1, s2):
        '''Run one training pass of `episodes` games.
        Returns:
        ------------
        winner : Strategy
            the winning strategy
        win_rate : float
            the winner's fraction of the games played
        '''

        max_explore_rate = 0.95

        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        begin = datetime.datetime.now()
        episodes = 1
        # samples = 100
        # interval = episodes // samples
        # perf = [[] for _ in range(7)]
        learner = s1 if s1.is_learning else s2
        # oppo = self.which_one(Board.oppo(learner.stand_for))
        stat_win = []
        # past_me = learner.mind_clone()
        for i in range(episodes):
            # if (i + 1) % interval == 0:
            #     print(np.allclose(s1.hidden_weights, past_me.hidden_weights))
            #     probs = self.measure_perf(learner, oppo)
            #     perf[0].append(i)
            #     for idx, x in enumerate(probs):
            #         perf[idx + 1].append(x)

            learner.epsilon = max_explore_rate * np.exp(-5 * i / episodes)  # * (1 if i < episodes//2 else 0.3) #
            g = Game(Board(), s1, s2)
            g.step_to_end()
            win1 += 1 if g.winner == Board.STONE_BLACK else 0
            win2 += 1 if g.winner == Board.STONE_WHITE else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

            stat_win.append(win1 - win2 - draw)
#             rec.append(win1)
            step_counter += g.step_counter
            explo_counter += g.exploration_counter
#             print('steps[%d], explos[%d]' % (g.step_counter, g.exploration_counter))
            print('training...%d' % i)

        total = win1 + win2 + draw
        print("black win: %f" % (win1 / total))
        print("white win: %f" % (win2 / total))
        print("draw: %f" % (draw / total))

        print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        end = datetime.datetime.now()
        diff = end - begin
        print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

        # with open('stat-result-win.txt', 'w') as f:
        #     f.write(repr(stat_win))
#         print(perf)
#         self.draw_perf(perf)

#         np.set_printoptions(threshold=np.nan, formatter={'float_kind' : lambda x: "%.4f" % x})
#         with open('stat-result-net-train-errors.txt', 'w') as f:
#             f.write(repr(np.array(s1.errors)))

        winner = Board.STONE_BLACK if win1 >= win2 else Board.STONE_WHITE
        return self.which_one(winner), max(win1, win2) / total
Example #24
0
    def _policy_fn(self, board):
        # Map a board to (move, probability) pairs over the legal moves only.
        _, _, legal_moves = Game.possible_moves(board)
        state, _ = self.get_input_values(board.stones)
        probs = self.brain.get_move_probs(state)  # assumed shape: (1, num_moves), indexed by board location
        probs = probs[0, legal_moves]
        return list(zip(legal_moves, probs))