Exemplo n.º 1
0
 def reset(self):
     """
     Start a new game and clear the per-episode bookkeeping.

     Returns
     -------
     tuple
       (state, rewards, done, active_player) as read from the
       environment right after the reset; ``done`` is always False here.
     """
     self.game = Game(verbose=self.verbose)
     # Clear cached states, the action buffer and the rewards, in that order.
     self._reset_all_states()
     self._reset_action_buffer()
     self._reset_rewards()
     self.done = False
     return self.state, self.rewards, self.done, self.game.active_player
Exemplo n.º 2
0
    def __init__(self,
                 board_width=8,
                 board_height=8,
                 n_in_row=5,
                 init_modle=None):
        """
        Set up the board, the game, the network and the training settings.

        :param board_width: board width, forwarded to Board and PolicyValueNet
        :param board_height: board height, forwarded to Board and PolicyValueNet
        :param n_in_row: forwarded to Board as ``n_in_row``
        :param init_modle: optional model file; when truthy it is passed to
            PolicyValueNet as ``model_file``, otherwise a new net is created
        """
        # Board and game server.
        self.board_width, self.board_height = board_width, board_height
        self.n_in_row = n_in_row
        self.board = Board(width=board_width,
                           height=board_height,
                           n_in_row=n_in_row)
        self.game = Game(self.board)

        # Loss / entropy values kept for display; both start at -1.
        self.loss_to_show = self.entropy_to_show = -1

        # Training parameters.
        self.learning_rate = 2e-3
        # Learning-rate scale, adjusted automatically from the KL divergence.
        self.lr_multiplier = 1.0
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        # Number of samples taken for each gradient-descent step.
        self.batch_size = 512
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # number of train steps for each update
        self.kl_targ = 0.02
        # Interval (in batches): the model is stored to disk when the batch
        # counter modulo this value is 0.
        self.check_freq = 50
        self.game_batch_num = 10000  # play at most 10000 games
        self.best_win_ratio = 0.0

        # Playout count of the agent's opponent, a pure MCTS player.
        self.pure_mcts_playout_num = 1000

        # Load previously trained weights only when a model file was given.
        net_kwargs = {
            'board_width': self.board_width,
            'board_height': self.board_height,
        }
        if init_modle:
            net_kwargs['model_file'] = init_modle
        self.policy_value_net = PolicyValueNet(**net_kwargs)

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

        self.logger = self.init_log()
Exemplo n.º 3
0
def play_dumb_game(max_steps=1000, verbose=1):
    """
    This function plays a Tichu game with four "dumb" players.
    Each player iterates over all available combinations and tries to beat opponents.
    New stacks are played with a random combination.

    Parameters
    ----------
    max_steps: int
      Upper bound on the number of loop iterations (one step attempt cycle
      per iteration); the loop stops once it is reached.
    verbose: int
      Passed to Game. When greater than 1, reaching max_steps raises
      instead of stopping silently.

    Raises
    ------
    Exception
      If max_steps is reached and verbose > 1.
    """
    game = Game(verbose=verbose)
    step_cnt = 0
    while True:
        active_player = game.active_player
        leading_player = game.leading_player
        player = game.players[active_player]
        # pass if player has already finished
        if player.has_finished():
            game.step(active_player, Cards([]))
        # make a random move if stack is empty
        elif not game.stack.cards:
            game.step(active_player, player.random_move())
        # try to make a matching move if opponent is leading
        elif (active_player + leading_player) % 2 != 0:
            leading_idx = COMB_TYPES[game.stack.type]
            avail_comb = player.hand.get_available_combinations()
            # try to play, starting with lowest combination
            suc = False
            for comb in avail_comb[leading_idx]:
                suc, _ = game.step(active_player, comb)
                if suc:
                    break
            # Try to bomb if no combination worked. Each fallback is tried
            # in turn: previously this was an if/elif chain, so a rejected
            # four-bomb skipped both the straight bomb and the pass, and
            # the player's turn ended without any accepted move.
            if not suc:
                for bomb_type in ('four_bomb', 'straight_bomb'):
                    bombs = avail_comb[COMB_TYPES[bomb_type]]
                    if bombs:
                        suc, _ = game.step(active_player, bombs[0])
                    if suc:
                        break
            # pass if nothing works
            if not suc:
                game.step(active_player, Cards([]))
        # pass if teammate is leading player
        else:
            game.step(active_player, Cards([]))
        # stop if game is finished (or counter overflow)
        step_cnt += 1
        if game.game_finished or step_cnt >= max_steps:
            if step_cnt >= max_steps and verbose > 1:
                raise Exception(
                    "Max. steps exceeded. Possible infinity loop detected.")
            break
Exemplo n.º 4
0
    def __init__(self, init_model=None):
        """Create the board, game, network and self-play player.

        input params:
            init_model: optional model file; when truthy it is handed to
                PolicyValueNet as ``model_file`` so training resumes from it,
                otherwise a new policy-value net is created.
        """
        # --- board and game ---
        self.board_width = self.board_height = 6  # 6 x 6 board
        self.n_in_row = 4  # win by n in line (vertically, horizontally, diagonally)
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # --- training ---
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # scaled adaptively from the KL divergence
        self.temp = 1.0  # temperature
        self.n_playout = 400  # simulations per move
        # a number in (0, inf) weighting value Q against prior probability P
        # in a node's score
        self.c_puct = 5
        self.buffer_size = 10000  # capacity of the experience replay buffer
        self.batch_size = 512  # mini-batch size
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1  # rollouts per episode
        self.epochs = 5  # train steps per update
        self.kl_targ = 0.02  # KL target
        self.check_freq = 50  # how often to evaluate and save the model
        self.game_batch_num = 1500  # number of training game loops
        self.best_win_ratio = 0.0  # best evaluated win ratio so far
        # simulations used by the pure MCTS opponent that the trained
        # policy is evaluated against
        self.pure_mcts_playout_num = 1000

        # --- network: resume from a file if one was given, else start new ---
        resume_kwargs = {'model_file': init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **resume_kwargs)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Exemplo n.º 5
0
def run(model_file, width=8, height=8, n=5):
    """
    Start an interactive human-vs-AI game.

    :param model_file: model file handed to PolicyValueNet as ``model_file``
    :param width: board width, forwarded to Board and PolicyValueNet
    :param height: board height, forwarded to Board and PolicyValueNet
    :param n: forwarded to Board as ``n_in_row``
    :return: None; Ctrl-C during play prints "quit" instead of a traceback
    """
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # NOTE: an earlier revision also ran pickle.load() on model_file here.
        # The result was never used, the file handles were never closed, and
        # the nested bare ``except:`` clauses swallowed KeyboardInterrupt, so
        # the load was removed. Unpickling is also unsafe on untrusted files;
        # do not reintroduce it without need.
        best_policy = PolicyValueNet(width, height, model_file=model_file)

        # AI opponent; raise n_playout for stronger (but slower) play.
        ai_player = MCTSPlayer(best_policy.policy_value_fn,
                               c_puct=5,
                               n_playout=400)

        # to play against pure MCTS instead (much weaker even with a larger
        # n_playout), use: ai_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = HumanPlayer()

        # set start_player=0 for human first
        game.start_play(human, ai_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Exemplo n.º 6
0
def main():
    """Create a game with ``board_size=3`` and its network wrapper, then train."""
    board_game = Game(board_size=3)
    wrapper = NNWrapper(board_game)

    # To play against a saved network instead of training:
    # wrapper.net = torch.load(os.path.join('./models/', '{}.pt'.format('current')))
    # human_play = HumanPlay(board_game, wrapper)
    # human_play.play()

    Train(board_game, wrapper).train()
Exemplo n.º 7
0
def human_play(n, width, height, ai_type, is_humanMoveFirst=True):
    """
    Play an interactive game between a human and an AI player.

    :param n: forwarded to Board as ``n_in_row``
    :param width: board width, forwarded to Board
    :param height: board height, forwarded to Board
    :param ai_type: kind of AI opponent; only ``"pure_mcts"`` is supported
    :param is_humanMoveFirst: when truthy the human starts (start_player=0),
        otherwise the AI starts (start_player=1)
    :raises ValueError: if ``ai_type`` is not a supported value
    :return: None; Ctrl-C during play prints "quit" instead of a traceback
    """
    # Validate before building anything: an unknown ai_type used to leave
    # ``mcts_player`` unbound and fail later with UnboundLocalError.
    if ai_type != "pure_mcts":
        raise ValueError(
            "unsupported ai_type: {!r} (expected 'pure_mcts')".format(ai_type))

    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # Only the pure MCTS opponent is wired up; a trained policy-value
        # network player would be added here as another ai_type.
        mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # start_player=0 means the human moves first
        start_player = 0 if is_humanMoveFirst else 1
        game.start_play(human, mcts_player, start_player=start_player, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Exemplo n.º 8
0
class GameEnv(object):
    """Environment wrapper around Game with frame skipping and a shaped reward."""

    def __init__(self, level='env/level.csv'):
        """Create the wrapped Game from ``level``; each step repeats 4 frames."""
        self.repeat_frame_skip = 4
        self.game = Game(level)

    def reset(self):
        """Reset the game, remember the agent's coordinate, return the state."""
        self.game.reset()
        initial = self.game.state()
        self.agent_coord = initial['coord']
        return initial

    def step(self, action):
        """
        Apply ``action`` for ``repeat_frame_skip`` consecutive game steps.

        Returns (state, reward, done, info). The reward is -1 per call plus
        the change of the first coordinate since the previous call, +100 when
        the state reports ``goal`` and -100 when it reports ``dead``.
        """
        for _ in range(self.repeat_frame_skip):
            self.game.step(action)

        snapshot = self.game.state()
        dead, goal = snapshot['dead'], snapshot['goal']
        coord = snapshot['coord']

        # Progress is measured against the coordinate seen on the last call.
        progress = coord[0] - self.agent_coord[0]
        reward = -1 + progress + 100 * goal - 100 * dead
        self.agent_coord = coord

        info = {'goal': goal, 'dead': dead, 'distance': coord[0]}
        return snapshot, reward, dead or goal, info

    def render(self, mode='rgb_array'):
        """Render the game and return the pixels with axes 0 and 1 swapped."""
        return np.swapaxes(self.game.render(mode), 0, 1)
Exemplo n.º 9
0
from copy import deepcopy
from env.cards import DynamicCorpus

# Training script entry point.
# NOTE(review): this excerpt ends inside the inner ``while`` loop; the rest of
# the loop body (including how it terminates) is not visible here.
if __name__ == "__main__":

    start_iter = 50000
    init_checkpoint = None
    num_epochs = 2000001
    dim_states = 52
    rl = RL(dim_states, lr_a=0.0001, lr_c=0.0001, init_checkpoint=init_checkpoint)

    # fine-tune: with a non-zero start_iter and no explicit checkpoint, load
    # the model saved under the name 'rl' for that iteration
    if start_iter != 0 and not init_checkpoint:
        rl.load_model('rl', start_iter)

    env = Game()
    # Episode numbering continues from start_iter.
    for episode in range(start_iter, num_epochs):
        env.reset()
        print()

        history_vec = []
        history_pid = []
        while 1:
            pid = env.now_player_id
            # Nobody bid for landlord, or the game is over: record everything
            # that was saved.
            if env.landlord_count == 3 or env.winner >= 0:
                # Visit the three players in turn, starting from the current one.
                for i in range(3):
                    state, f_reward, y_reward, act_ids, dyn_vec, _, label_mask, attn_mask = env.observe(pid)
                    print('玩家', pid, '获得奖励', y_reward)
                    pid = (pid + 1) % 3
                    env.now_player_id = pid
Exemplo n.º 10
0
class TrainPipeline:
    """
    Learn a policy by self-play with a policy-value network.

    Each batch collects self-play games into a replay buffer, updates the
    network on a sampled mini-batch and, every ``check_freq`` batches,
    evaluates the network against a pure MCTS player and saves the model.
    """
    def __init__(self,
                 board_width=8,
                 board_height=8,
                 n_in_row=5,
                 init_modle=None):
        """
        Set up the board, the game, the network and the training settings.

        :param board_width: board width, forwarded to Board and PolicyValueNet
        :param board_height: board height, forwarded to Board and PolicyValueNet
        :param n_in_row: forwarded to Board as ``n_in_row``
        :param init_modle: optional model file; when truthy it is passed to
            PolicyValueNet as ``model_file``, otherwise a new net is created
        """
        # Board and game server.
        self.board_width = board_width
        self.board_height = board_height
        self.n_in_row = n_in_row
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # Last loss / entropy, logged by run(); -1 until the first update.
        self.loss_to_show = -1
        self.entropy_to_show = -1

        # Training parameters.
        self.learning_rate = 2e-3
        self.lr_multiplier = 1.0  # adjusted automatically from the KL divergence
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # samples taken for each gradient-descent step
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50  # evaluate and save every check_freq batches
        self.game_batch_num = 10000  # play at most 10000 batches of games
        self.best_win_ratio = 0.0

        # Playout count of the agent's opponent, a pure MCTS player.
        self.pure_mcts_playout_num = 1000

        # Load previously trained weights only when a model file was given.
        if init_modle:
            self.policy_value_net = PolicyValueNet(
                board_width=self.board_width,
                board_height=self.board_height,
                model_file=init_modle)
        else:
            self.policy_value_net = PolicyValueNet(
                board_width=self.board_width, board_height=self.board_height)

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

        self.logger = self.init_log()

    def init_log(self):
        """
        Create the logger, named after the current local time.

        :return: the object returned by ``init_logger``
        """
        logger_name = time.strftime('%m%d-%H:%M:%S')
        return init_logger(name=logger_name)

    def get_equi_data(self, play_data):
        """
        Augment self-play data with the rotations and mirror images of each
        position (the board is symmetric, so these are equivalent samples).

        :param play_data: iterable of (state, mcts_prob, winner_z); state is
            iterated plane by plane and mcts_prob is reshaped to
            (board_height, board_width)
        :return: list with 8 (state, mcts_prob, winner_z) tuples per input item
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_prob.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """
        Play self-play games and store the augmented data for training.

        :param n_games: number of self-play games to run
        :return: None; ``self.episode_len`` holds the length of the last game
        """
        for _ in range(n_games):
            _winner, play_data = self.game.start_self_play(self.mcts_player,
                                                           temp=self.temp)
            play_data = list(play_data)
            self.episode_len = len(play_data)

            # Extend the data set and push it into the replay deque.
            self.data_buffer.extend(self.get_equi_data(play_data))

    def policy_update(self):
        """
        Train the policy-value net on one mini-batch from the replay buffer.

        Runs up to ``self.epochs`` training steps and stops early once the KL
        divergence between the pre-update policy and the current one exceeds
        ``4 * self.kl_targ``. Afterwards ``self.lr_multiplier`` is divided or
        multiplied by 1.5 depending on the last KL value.

        :return: (loss, entropy) of the last training step
        :raises ValueError: if the buffer holds fewer than ``batch_size`` items
        """
        # Sample from a list copy. random.sample only needs a sequence, so the
        # former bare ``except`` retry (which also caught KeyboardInterrupt)
        # is not needed.
        mini_batch = random.sample(list(self.data_buffer), self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, _old_v = self.policy_value_net.policy_value(state_batch)
        for _ in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch=state_batch,
                mcts_probs=mcts_probs_batch,
                winner_batch=winner_batch,
                learning_rate=self.learning_rate * self.lr_multiplier)
            self.loss_to_show = loss
            self.entropy_to_show = entropy
            new_probs, _new_v = self.policy_value_net.policy_value(state_batch)
            # 1e-10 keeps the logarithms finite for zero probabilities.
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break

        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        return loss, entropy

    def policy_evaluate(self, n_games=50):
        """
        Play the current policy against a pure MCTS player to monitor
        training progress.

        :param n_games: number of evaluation games; the starting player
            alternates between games
        :return: (wins + 0.5 * ties) / n_games, where winner 1 counts as a
            win and winner -1 as a tie
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
            self.logger.info('round:{}\t, winner:{} '.format(i, winner))
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        self.logger.info("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """
        Run the training loop for ``game_batch_num`` batches.

        Ctrl-C stops the loop and logs 'quit'.

        :return: None
        """
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                self.logger.info(
                    ("batch i:{},\t"
                     "episode_len:{},\t"
                     "loss:{:.8f},\t"
                     "entropy:{:.8f},").format(i + 1, self.episode_len,
                                               self.loss_to_show,
                                               self.entropy_to_show))
                # Training starts once the buffer holds more than one batch.
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()
                if (i + 1) % self.check_freq == 0:
                    self.logger.info("current self-play batch: {}".format(i +
                                                                          1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model(
                        'model/current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        self.logger.info(
                            'update new best policy, win_ratio: ' +
                            str(win_ratio))
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            'model/best_policy.model')
                        # A perfect score: strengthen the opponent and
                        # start measuring again.
                        if (self.best_win_ratio == 1.0
                                and self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            self.logger.info('quit')
Exemplo n.º 11
0
    def __init__(self, level='env/level.csv'):
        """Create the wrapped Game from ``level``; each step repeats 4 frames."""
        self.repeat_frame_skip = 4
        self.game = Game(level)
Exemplo n.º 12
0
class TrainPipeline():
    """Training pipeline skeleton: data collection is implemented, while
    policy update, evaluation and the main loop are left as TODOs."""

    def __init__(self, init_model=None):
        """Create the board, game, network and self-play player.

        input params:
            init_model: optional model file; when truthy it is handed to
                PolicyValueNet as ``model_file`` so training resumes from it,
                otherwise a new policy-value net is created.
        """
        # --- board and game ---
        self.board_width = self.board_height = 6  # 6 x 6 board
        self.n_in_row = 4  # win by n in line (vertically, horizontally, diagonally)
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # --- training ---
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # scaled adaptively from the KL divergence
        self.temp = 1.0  # temperature
        self.n_playout = 400  # simulations per move
        # a number in (0, inf) weighting value Q against prior probability P
        # in a node's score
        self.c_puct = 5
        self.buffer_size = 10000  # capacity of the experience replay buffer
        self.batch_size = 512  # mini-batch size
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1  # rollouts per episode
        self.epochs = 5  # train steps per update
        self.kl_targ = 0.02  # KL target
        self.check_freq = 50  # how often to evaluate and save the model
        self.game_batch_num = 1500  # number of training game loops
        self.best_win_ratio = 0.0  # best evaluated win ratio so far
        # simulations used by the pure MCTS opponent that the trained
        # policy is evaluated against
        self.pure_mcts_playout_num = 1000

        # --- network: resume from a file if one was given, else start new ---
        resume_kwargs = {'model_file': init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **resume_kwargs)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """Enlarge the data set with rotated and mirrored copies.

        Description:
            Every sample is rotated by 1 to 4 quarter turns and each rotation
            is also mirrored left-right, giving 8 samples per input sample.

        input params:
            play_data: type:List,  [(state, mcts_prob, winner_z), ..., ...]

        return:
            list of (state, mcts_prob, winner_z) tuples, 8 per input item
        """
        augmented = []
        for planes, move_probs, outcome in play_data:
            # The probability vector is laid out on the board (upside down)
            # once; every rotation starts from this grid.
            prob_grid = np.flipud(
                move_probs.reshape(self.board_height, self.board_width))
            for quarter_turns in range(1, 5):
                # counterclockwise rotation
                turned_planes = np.array(
                    [np.rot90(plane, quarter_turns) for plane in planes])
                turned_probs = np.rot90(prob_grid, quarter_turns)
                augmented.append(
                    (turned_planes, np.flipud(turned_probs).flatten(), outcome))
                # left-right mirror image of the rotated sample
                mirrored_planes = np.array(
                    [np.fliplr(plane) for plane in turned_planes])
                mirrored_probs = np.fliplr(turned_probs)
                augmented.append(
                    (mirrored_planes, np.flipud(mirrored_probs).flatten(),
                     outcome))
        return augmented

    def collect_selfplay_data(self, n_games=1):
        """Run self-play rollouts and store their augmented data.

        input param:
            n_games: number of rollouts

        ``self.episode_len`` is set to the length of the last rollout.
        """
        for _ in range(n_games):
            _winner, rollout = self.game.start_self_play(self.mcts_player,
                                                         temp=self.temp)
            rollout = list(rollout)
            self.episode_len = len(rollout)
            # augment, then append to the replay buffer
            self.data_buffer.extend(self.get_equi_data(rollout))

    def policy_update(self):
        """Update the policy-value net by training (not implemented yet).

        Intended pipeline:
            1. sample a mini-batch from the deque self.data_buffer
            2. compute the action probabilities of the current network
            3. train in a loop on the sampled data:
                    1. call self.policy_value_net.train_step(state_batch,
                                                            mcts_probs_batch,
                                                            winner_batch,
                                                            self.learn_rate*self.lr_multiplier)
                    2. compute the action probabilities of the updated network
                    3. compute the KL divergence between old and new
                       probabilities
                    4. if kl > self.kl_targ * 4, break (early stopping)
            4. adjust the learning rate from the KL divergence:
                    if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
                        self.lr_multiplier /= 1.5 # decrease learning rate
                    elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
                        self.lr_multiplier *= 1.5 # increase learning rate
            5. return the final loss and entropy

        :return:
            (loss, entropy); currently always (None, None)
        """
        # TODO: code here
        return None, None

    def policy_evaluate(self, n_games=10):
        """Evaluate the trained policy (not implemented yet).

        Description:
            The trained policy is meant to play against the pure MCTS player.
            Note: this is only for monitoring the progress of training

        Intended pipeline:
            1. create an MCTSPlayer and an MCTS_Pure player
            2. for each evaluation game:
                    1. play AlphaZero vs pure MCTS
                            winner = self.game.start_play(current_mcts_player,
                                              pure_mcts_player,
                                              start_player=i % 2,  # start from either Player 1 or 2 evenly
                                              is_shown=0)
                    2. record the result
            3. win_ratio = (winning times + 0.5 * tie times) / total times

        return:
            win_ratio; currently always None
        """
        # TODO: code here
        return None

    def run(self):
        """Run the training pipeline (loop body not implemented yet).

        Descriptions:
            Train alpha zero in a loop of self.game_batch_num iterations.

        Intended loop pipeline:
            1. collect self-play data by rollouts
            2. update the policy on sampled training data
            3. evaluate model performance (in a fixed frequency)
            4. save the model (in a fixed frequency)
            5. report / plot the evaluation result
        """
        try:
            # TODO: code here
            pass
        except KeyboardInterrupt:
            print('\n\rquit')
Exemplo n.º 13
0
            # swap observation
            observation = observation_

            step += 1
            # break while loop when end of this episode
            if done:
                break
        scores.append(env.score)

        if episode % 5 == 0:
            print("#" * 80)
            print(episode, ",", int(step / 10), ",score:", env.score, ",e:",
                  RL.epsilon)
            print("avg-score: {}".format(np.mean(list(scores)[-1500:])))

        if episode % 100 == 0:
            print(observation)
            env.show()


if __name__ == "__main__":
    # ``env`` and ``RL`` are module-level names: the training loop above reads
    # them directly (env.score, RL.epsilon).
    env = Game()
    dqn_settings = dict(learning_rate=1e-4,
                        reward_decay=0.95,
                        e_greedy=0.99,
                        start_epsilon=0.5,
                        e_greedy_increment=1e-5)
    RL = DuelingDQN(env.n_actions, env.n_features, **dqn_settings)
    train_2048()
Exemplo n.º 14
0
class Env():
    """
    A wrapper for Tichu Game class to enable Reinforcement Learning.

    Brings a Tichu Game instance in a shape where an (RL-)Agent can:
      1. Observe a state.
      2. Take an action.
      3. Receive a reward.

    The state consists of infos from a Players perspective:
    [Players' hand size, Tichu Flag, Players' hand cards]
    [Opponent 1 hand size, Tichu Flag, Opponent 1 last move]
    [Teammate hand size, Tichu Flag, Teammates last move]
    [Opponent 2 hand size, Tichu Flag, Opponent 2 last move]
    The Cards are one-hot-encoded (OHE), e.g.:
    [1, 0, 0, ... 0, 0] is a OHE representation of 2 of Spades.
    There are alternative possibilities for the state-design which
    may be included in the future.

    The action is also a OHE of Cards, e.g.:
    [1, 0, 0, 0, 1, 0, ... 0] means play a pair of 2s.

    The reward function is designed two ways:

    Rich rewards means that a reward can be received after each step.
    A step is considered a move by all 4 players.
    In a rich reward setting, the reward is equal to the points
    in a Stack if the Stack is won by either the Player or its teammate.
    The same reward, but negative, is given to the opposing team.
    For Example:
    Player 0 wins a Stack containing 20 points.
    The rewards will be [20, -20, 20, -20] until the next step.

    Sparse rewards means that the rewards are only different from 0
    when a game has finished. In this case, the rewards exactly match
    the outcome of a Game.
    For Example:
    Team 0 has achieved 60 points, Team 1 has achieved 40 points.
    Player 0 has successfully called Tichu (+100 points).
    The rewards will be [160, -60, 160, -60].
    (Sparse rewards are not implemented yet.)

    For both reward styles, an invalid move by a Player leads to an
    immediate negative reward.

    Attributes
    ----------
    dispatch_reward: dictionary
      Maps the reward style ('rich'/'sparse') to the method that
      updates the rewards.
    verbose: int
      Verbosity passed to the Game (0 in train mode, 1 otherwise).
    state_size: int
      The size of the state dimension.
    action_size: int
      The size of the action dimension.
    all_cards: list of Card
      A list containing instances of all Cards in a Tichu Deck.
    game: Game
      A Tichu Game instance (None until reset() has been called).
    action_buffer: list of list of int
      A list containing the last actions of all Players.
    state: list
      A list of the states from all Players' perspectives.
    rewards: list of int
      The rewards that an Agent will receive after a step.
    done: bool
      Whether the episode (i.e. Game) is finished.
    illegal_move_penalty: number
      The reward assigned to a Player that attempts an illegal move.
    nstep: int
      An internal step counter used for rich rewards.

    Methods
    -------
    reset():
      Instantiates a new Game and resets state, action, rewards, done.

    step(player_id, action):
      Takes a step in the Game and updates state, action, rewards, done.

    info():
      Returns the size of the state and action dimension.
    """
    def __init__(self,
                 train_mode=True,
                 illegal_move_penalty=ILLEGAL_MOVE_PENALTY):
        """
        Constructs a Tichu Environment for RL.

        Parameters
        ----------
        train_mode: bool
          If false, verbosity of Game will be set to 1.
        illegal_move_penalty: number
          Reward assigned to a Player whose move is rejected by the Game.
        """
        # dispatch table for reward function
        self.dispatch_reward = {
            'rich': self._update_rich_rewards,
            'sparse': self._update_sparse_rewards
        }
        # set verbosity according to mode
        self.verbose = 0 if train_mode else 1
        self.state_size = 232
        self.action_size = 56
        self.all_cards = Deck().all_cards
        # placeholders; the real values are created by reset()
        self.game = None
        self.action_buffer = [[None], [None], [None], [None]]
        self.state = [[None], [None], [None], [None]]
        self.rewards = [None, None, None, None]
        self.done = False
        self.illegal_move_penalty = illegal_move_penalty
        self.nstep = 0  # only relevant for rich rewards

    def reset(self):
        """
        Resets the Environment.

        Creates a new Game and re-initialises state, action buffer,
        rewards and the done flag.

        Returns
        -------
        (state, rewards, done, active_player) of the fresh Game.
        """
        self.game = Game(verbose=self.verbose)
        self._reset_all_states()
        self._reset_action_buffer()
        self._reset_rewards()
        self.done = False
        return self._step_output()

    def step(self, player_id, action):
        """
        Takes a step in the Game.
        Updates state, action, rewards, done and returns them.

        reset() must have been called before, otherwise there is no Game.

        Parameters
        ----------
        player_id: The id (0...3) of the player that makes a move.
        action: The action of the player as OHE Cards representation
          (any sequence with one entry per Card, e.g. list or ndarray).

        Returns
        -------
        (state, rewards, done, active_player) after the move.
        """
        # convert action vector and make game step
        cards = self._vec_to_cards(action)
        suc, points_this_step = self.game.step(player_id, cards)
        if not suc:
            # illegal move: only the offending player's reward changes
            self.rewards[player_id] = self.illegal_move_penalty
        else:
            # legal move
            self._update_action_buffer(player_id, action)
            self._update_all_states()
            if not self.game.stack.cards:
                # reset state and action buffer if stack has been emptied
                # and update rewards according to points in the stack
                self._reset_all_states()
                self._reset_action_buffer()
                self._update_rewards(points_this_step)
            elif cards.type != 'pass' and cards.cards[0].name == 'Dog':
                # reset state, action_buffer and rewards if Dog has been
                # played (required because Dog skips players);
                # the 'pass' check comes first so that cards.cards[0] is
                # never evaluated for a pass move
                self._reset_all_states()
                self._reset_action_buffer()
                self._reset_rewards()
            else:
                # update rewards for pass move or regular game move
                self._update_rewards(points_this_step)
        # check if game is finished
        if self.game.game_finished:
            self.done = True
        return self._step_output()

    def info(self):
        """ Outputs size of state and action dimension. """
        return self.state_size, self.action_size

    def _step_output(self):
        """
        Bundles the values returned by reset() and step().

        The rewards are handed out as a copy: self.rewards is modified
        in place by later steps, so returning the list itself would
        silently change rewards a caller has already stored.
        """
        return (self.state, list(self.rewards), self.done,
                self.game.active_player)

    def _build_states(self, show_last_moves):
        """
        Builds the states from all 4 Players' perspectives.

        State of player i:
        i:     [hand_size, tichu_flag, hand_cards (OHE)]
        i + 1: [hand_size, tichu_flag, played_cards (OHE)]
        i + 2: [hand_size, tichu_flag, played_cards (OHE)]
        i + 3: [hand_size, tichu_flag, played_cards (OHE)]

        Parameter
        ---------
        show_last_moves: bool
          If true, played_cards of the other players is taken from the
          action buffer, otherwise it is an all-zero vector.
        """
        n_cards = len(self.all_cards)
        states = list()
        for this_player in range(4):
            player_state = list()
            for offset in range(4):
                pid = (this_player + offset) % 4
                player = self.game.players[pid]
                hand_size = player.hand_size
                tichu_flag = int(player.tichu_flag)
                if pid == this_player:
                    # a player only sees its own hand
                    player_cards = self._cards_to_vec(player.hand)
                elif show_last_moves:
                    player_cards = self.action_buffer[pid]
                else:
                    player_cards = [0] * n_cards
                player_state.append([hand_size, tichu_flag, player_cards])
            states.append(player_state)
        return states

    def _reset_all_states(self):
        """
        Resets the state to the initial setting.

        The own hand is shown, the played cards of all other players
        are set to all-zero vectors.
        """
        self.state = self._build_states(show_last_moves=False)

    def _update_all_states(self):
        """ Updates states with latest action taken by other players. """
        self.state = self._build_states(show_last_moves=True)

    def _reset_action_buffer(self):
        """ Resets the action buffer to all-zero vectors. """
        n_cards = len(self.all_cards)
        for pid in range(4):
            self.action_buffer[pid] = [0] * n_cards

    def _update_action_buffer(self, player_id, action):
        """
        Updates the action buffer with the latest action of a player.

        The action is stored as a plain list; np.asarray makes this work
        for lists and tuples as well as for ndarrays.
        """
        self.action_buffer[player_id] = np.asarray(action).tolist()

    def _reset_rewards(self):
        """
        Resets the rewards to 0.

        Also points the rich-reward step counter at the active player.
        """
        self.rewards = [0, 0, 0, 0]
        self.nstep = self.game.active_player

    def _update_rewards(self, points_this_step):
        """ Updates the rewards according to reward style. """
        self.dispatch_reward[REWARD_STYLE](points_this_step)

    def _update_rich_rewards(self, points_this_step):
        """
        Updates the rewards according to a rich reward function.

        This implementation of a reward function promises rewards after
        each round (i.e. consecutive steps of all 4 players).
        If a player or its teammate (!) gets points during a round
        (e.g. by winning a stack), it gets a reward in the amount of
        the points in this round.
        The benefit of this reward function is that each step promises a
        reward (i.e. no sparse rewards that may impede learning).
        The danger is that the actual points are assigned at the end of a
        game, which means the last player loses all its points to the
        first finisher.
        This may lead to a non-ideal game strategy, where lots of
        rewards might be collected during the game, but actually the game is
        lost if the player does not finish early.
        Also, cumulative reward is higher for players that finish later.
        However, if the winning team gets more cumulative reward, then this
        reward design will still lead to a good policy.

        Parameter
        ---------
        points_this_step: sequence of 4 numbers
          The points of players 0...3 in this step.
        """
        # reset rewards every new player round
        self.rewards[self.nstep] = 0
        # accumulate rewards (teammate rewards are also taken into account)
        # opponent rewards are considered negative
        rewards_team_0 = points_this_step[0] + points_this_step[2]
        rewards_team_1 = points_this_step[1] + points_this_step[3]
        advantage_team_0 = rewards_team_0 - rewards_team_1
        for pid in range(4):
            # players 0 and 2 form team 0, players 1 and 3 form team 1
            if pid % 2 == 0:
                self.rewards[pid] += advantage_team_0
            else:
                self.rewards[pid] += -advantage_team_0
        # update nstep counter
        self.nstep = (self.nstep + 1) % 4

    def _update_sparse_rewards(self, points_this_step):
        """
        Updates the rewards according to a sparse reward function.

        Sparse rewards means that rewards are only achieved when
        a Game is completed.
        The benefit is that the rewards exactly represent the outcome
        of the Game.
        The danger is that it is hard for an Agent to make sense
        of its actions when the rewards come only at the end
        of an episode.

        The sparse rewards are not yet implemented!

        Raises
        ------
        NotImplementedError: always.
        """
        raise NotImplementedError("TODO")

    def _cards_to_vec(self, cards):
        """
        Turns a Cards instance into a vector representation.

        Returns a list with one int per Card in self.all_cards:
        1 if cards contains that Card, 0 otherwise.
        """
        return [1 if cards.contains(Cards([card])) else 0
                for card in self.all_cards]

    def _vec_to_cards(self, vec):
        """
        Turns a vector representation into a Cards instance.

        Every Card of self.all_cards whose entry in vec is truthy
        is selected.
        """
        return Cards(list(compress(self.all_cards, vec)))