Example No. 1
def play_dumb_game(max_steps=1000, verbose=1):
    """
    This function plays a Tichu game with four "dumb" players.
    Each player iterates over all available combinations and tries to beat the opponents.
    Empty stacks are opened with a random combination.
    """
    game = Game(verbose=verbose)
    step_cnt = 0
    game_active = True
    while game_active:
        active_player = game.active_player
        leading_player = game.leading_player
        # pass if player has already finished
        if game.players[active_player].has_finished():
            suc, _ = game.step(active_player, Cards([]))
        # make a random move if stack is empty
        elif not (game.stack.cards):
            comb = game.players[active_player].random_move()
            suc, _ = game.step(active_player, comb)
        # try to make a matching move if opponent is leading
        elif ((active_player + leading_player) % 2) != 0:
            leading_type = game.stack.type
            leading_idx = COMB_TYPES[leading_type]
            avail_comb = game.players[
                active_player].hand.get_available_combinations()
            # try to play, starting with lowest combination
            suc = False
            if avail_comb[leading_idx]:
                for i in range(len(avail_comb[leading_idx])):
                    suc, _ = game.step(active_player,
                                       avail_comb[leading_idx][i])
                    if suc:
                        break
            # try to bomb if no matching combination could be played
            if not suc and avail_comb[COMB_TYPES['four_bomb']]:
                suc, _ = game.step(active_player,
                                   avail_comb[COMB_TYPES['four_bomb']][0])
            if not suc and avail_comb[COMB_TYPES['straight_bomb']]:
                suc, _ = game.step(active_player,
                                   avail_comb[COMB_TYPES['straight_bomb']][0])
            # pass if nothing else works
            if not suc:
                suc, _ = game.step(active_player, Cards([]))
        # pass if teammate is leading player
        else:
            suc, _ = game.step(active_player, Cards([]))
        # stop if game is finished (or counter overflow)
        step_cnt += 1
        if game.game_finished or step_cnt >= max_steps:
            game_active = False
            if step_cnt >= max_steps and verbose > 1:
                raise Exception(
                    "Max. steps exceeded. Possible infinity loop detected.")
            break
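
A minimal usage sketch for this baseline, assuming play_dumb_game and its dependencies (Game, Cards, COMB_TYPES) are importable from the surrounding Tichu project; the loop below is illustrative and not part of the source:

if __name__ == '__main__':
    # play a handful of games with the dumb baseline; verbose=1 prints game output
    for _ in range(5):
        play_dumb_game(max_steps=1000, verbose=1)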
Example No. 2
import numpy as np


class GameEnv(object):
    def __init__(self, level='env/level.csv'):

        self.game = Game(level)
        self.repeat_frame_skip = 4

    def reset(self):
        self.game.reset()
        state = self.game.state()
        self.agent_coord = state['coord']
        return state

    def step(self, action):

        for _ in range(self.repeat_frame_skip):
            self.game.step(action)

        state = self.game.state()
        dead = state['dead']
        goal = state['goal']
        coord = state['coord']

        reward = -1 + (coord[0] -
                       self.agent_coord[0]) + 100 * goal - 100 * dead
        done = dead or goal
        self.agent_coord = coord
        return state, reward, done, {
            'goal': goal,
            'dead': dead,
            'distance': self.agent_coord[0]
        }

    def render(self, mode='rgb_array'):
        pixels = self.game.render(mode)
        pixels = np.swapaxes(pixels, 0, 1)
        return pixels
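
A rough rollout sketch for this wrapper. The underlying action space of Game is not shown above, so random_action() is a hypothetical stand-in; the shaped reward favors progress along the x axis, adds 100 for reaching the goal and subtracts 100 for dying:

import random

def random_action():
    # hypothetical stand-in: replace with the real action space of Game
    return random.randint(0, 3)

env = GameEnv(level='env/level.csv')
state = env.reset()
done = False
total_reward = 0.0
while not done:
    state, reward, done, info = env.step(random_action())
    total_reward += reward
print('goal:', info['goal'], 'distance:', info['distance'], 'return:', total_reward)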
Example No. 3
            if env.landlord_count == 3 or env.winner >= 0:
                for i in range(3):
                    state, f_reward, y_reward, act_ids, dyn_vec, _, label_mask, attn_mask = env.observe(pid)
                    print('player', pid, 'received reward', y_reward)
                    pid = (pid + 1) % 3
                    env.now_player_id = pid
                break

            state, f_reward, _, act_ids, dyn_vec, id2combo, label_mask, attn_mask = env.observe(pid)

            # landlord bidding phase
            if env.landlord_cards:
                action = rl.y_act(state, act_ids, attn_mask, dyn_vec, test_mode=True)

                # update the memory buffer
                env.step(action, test_mode=True)
                if action > 0:
                    history_vec.append([-1, -1, -1, -1, -1])
                    history_pid.append(pid)
                    e_act_ids, e_dyn_vec, e_label_mask, e_attn_mask = env.observe_entirety()
                    # decide whether the game is worth playing
                    e_action, next_epi = rl.e_act(e_act_ids, e_dyn_vec, e_label_mask, e_attn_mask, test_mode=True)


            # card-playing (fighting the landlord) phase
            else:
                # no playable cards
                tmp_act_ids = [[j for j, lmm in zip(i, lm) if j > 0 and lmm == 0] for i, lm in zip(act_ids, label_mask)]
                tmp_label_mask = [[lmm for j, lmm in zip(i, lm) if j > 0 and lmm == 0] for i, lm in
                                  zip(act_ids, label_mask)]
                if max([len(i) for i in tmp_label_mask] + [0]) == 0:
Example No. 4
                    pid = (pid + 1) % 3
                    env.now_player_id = pid
                if e_action != -1:
                    rl.store_e(e_act_ids, e_dyn_vec, e_label_mask, e_attn_mask, cls)
                step += 1
                break

            state, f_reward, _, act_ids, dyn_vec, id2combo, label_mask, attn_mask = env.observe(pid)

            # landlord bidding phase
            if env.landlord_cards:
                action = rl.y_act(state, act_ids, attn_mask, dyn_vec, is_training=True)

                # update the memory buffer
                y_memory[pid] = (state, act_ids, attn_mask, dyn_vec)
                env.step(action)
                if action > 0:
                    history_vec.append([-1, -1, -1, -1, -1])
                    history_pid.append(pid)

                    e_act_ids, e_dyn_vec, e_label_mask, e_attn_mask = env.observe_entirety()
                    # decide whether the game is worth playing
                    e_action, next_epi = rl.e_act(e_act_ids, e_dyn_vec, e_label_mask, e_attn_mask)
                    if next_epi:
                        break
            # card-playing (fighting the landlord) phase
            else:
                tmp_act_ids = [[j for j, lmm in zip(i, lm) if j > 0 and lmm == 0] for i, lm in zip(act_ids, label_mask)]
                tmp_l_m = [[lmm for j, lmm in zip(i, lm) if j > 0 and lmm == 0] for i, lm in zip(act_ids, label_mask)]

                # no playable cards
Example No. 5
from itertools import compress

import numpy as np

# Game, Cards, Deck, ILLEGAL_MOVE_PENALTY and REWARD_STYLE are assumed to be
# provided by the surrounding Tichu project.


class Env():
    """
    A wrapper for the Tichu Game class to enable Reinforcement Learning.

    Brings a Tichu Game instance into a shape where an (RL) Agent can:
      1. Observe a state.
      2. Take an action.
      3. Receive a reward.

    The state consists of information from a Player's perspective:
    [Player's hand size, Tichu Flag, Player's hand cards]
    [Opponent 1 hand size, Tichu Flag, Opponent 1 last move]
    [Teammate hand size, Tichu Flag, Teammate's last move]
    [Opponent 2 hand size, Tichu Flag, Opponent 2 last move]
    The Cards are one-hot-encoded (OHE), e.g.:
    [1, 0, 0, ... 0, 0] is an OHE representation of the 2 of Spades.
    There are alternative possibilities for the state design which
    may be included in the future.

    The action is also an OHE of Cards, e.g.:
    [1, 0, 0, 0, 1, 0, ... 0] means play a pair of 2s.

    The reward function is designed in two ways:

    Rich rewards mean that a reward can be received after each step.
    A step is considered a move by all 4 players.
    In a rich reward setting, the reward is equal to the points
    in a Stack if the Stack is won by either the Player or its teammate.
    The same reward, but negative, is given to the opposing team.
    For example:
    Player 0 wins a Stack containing 20 points.
    The rewards will be [20, -20, 20, -20] until the next step.

    Sparse rewards mean that the rewards only differ from 0
    when a game has finished. In this case, the rewards exactly match
    the outcome of a Game.
    For example:
    Team 0 has achieved 60 points, Team 1 has achieved 40 points.
    Player 0 has successfully called Tichu (+100 points).
    The rewards will be [160, -60, 160, -60].

    For both reward styles, an invalid move by a Player leads to an
    immediate negative reward.

    Attributes
    ----------
    dispatch_reward: dictionary
      Dispatch table used to select the reward function (rich or sparse).
    train_mode: bool
      Sets the verbosity of the Game.
    state_size: int
      The size of the state dimension.
    action_size: int
      The size of the action dimension.
    all_cards: list of Card
      A list containing instances of all Cards in a Tichu Deck.
    game: Game
      A Tichu Game instance.
    action_buffer: list of int
      A list containing the last actions of all Players.
    state: list of int
      A list of the states from all Players' perspectives.
    rewards: list of int
      The rewards that an Agent will receive after a step.
    done: bool
      Whether the episode (i.e. Game) is finished.
    nstep: int
      An internal step counter used for rich rewards.

    Methods
    -------
    reset():
      Instantiates a new Game and resets state, action, rewards, done.

    step(player_id, action):
      Takes a step in the Game and updates state, action, rewards, done.
    """
    def __init__(self,
                 train_mode=True,
                 illegal_move_penalty=ILLEGAL_MOVE_PENALTY):
        """
        Constructs a Tichu Environment for RL.

        Parameters
        ----------
        train_mode: bool
          If False, verbosity of the Game will be set to 1.
        illegal_move_penalty:
          The reward assigned to a Player that makes an illegal move.
        """
        # dispatch table for reward function
        self.dispatch_reward = {
            'rich': self._update_rich_rewards,
            'sparse': self._update_sparse_rewards
        }
        # set verbosity according to mode
        if train_mode:
            self.verbose = 0
        else:
            self.verbose = 1
        self.state_size = 232
        self.action_size = 56
        self.all_cards = Deck().all_cards
        self.game = None
        self.action_buffer = [[None], [None], [None], [None]]
        self.state = [[None], [None], [None], [None]]
        self.rewards = [None, None, None, None]
        self.done = False
        self.illegal_move_penalty = illegal_move_penalty
        self.nstep = 0  # only relevant for rich rewards

    def reset(self):
        """ Resets the Environment. """
        self.game = Game(verbose=self.verbose)
        self._reset_all_states()
        self._reset_action_buffer()
        self._reset_rewards()
        self.done = False
        state = self.state
        rewards = self.rewards
        done = self.done
        active_player = self.game.active_player
        return state, rewards, done, active_player

    def step(self, player_id, action):
        """
        Takes a step in the Game.
        Updates state, action, rewards, done and returns them.

        Parameters
        ----------
        player_id: The id (0...3) of the player that makes a move.
        action: The action of the player as an OHE Cards representation.
        """
        # convert action vector and make game step
        cards = self._vec_to_cards(action)
        suc, points_this_step = self.game.step(player_id, cards)
        # illegal move
        if not suc:
            self.rewards[player_id] = self.illegal_move_penalty
        # legal move
        else:
            self._update_action_buffer(player_id, action)
            self._update_all_states()
            # reset state and action buffer if stack has been emptied
            # and update rewards according to points in the stack
            if not self.game.stack.cards:
                self._reset_all_states()
                self._reset_action_buffer()
                self._update_rewards(points_this_step)
            # update rewards for pass move
            elif cards.type == 'pass':
                self._update_rewards(points_this_step)
            # reset state, action_buffer and rewards if Dog has been played
            # (required because Dog skips players)
            elif cards.cards[0].name == 'Dog':
                self._reset_all_states()
                self._reset_action_buffer()
                self._reset_rewards()
            # update rewards for regular game move
            else:
                self._update_rewards(points_this_step)
        # check if game is finished
        if self.game.game_finished:
            self.done = True
        # return step variables
        state = self.state
        rewards = self.rewards
        done = self.done
        active_player = self.game.active_player
        return state, rewards, done, active_player

    def info(self):
        """ Outputs size of state and action dimension. """
        return self.state_size, self.action_size

    def _reset_all_states(self):
        """
        Resets the state to the initial setting.

        Initial game state of player i:
        i:     [hand_size, tichu_flag, hand_cards (OHE)]
        i + 1: [hand_size, tichu_flag, played_cards (OHE)]
        i + 2: [hand_size, tichu_flag, played_cards (OHE)]
        i + 3: [hand_size, tichu_flag, played_cards (OHE)]
        """
        self.state = list()
        for i in range(4):
            this_player = i
            player_state = list()
            for j in range(4):
                pid = (this_player + j) % 4
                hand_size = self.game.players[pid].hand_size
                tichu_flag = int(self.game.players[pid].tichu_flag)
                if pid == this_player:
                    player_cards = self._cards_to_vec(
                        self.game.players[pid].hand)
                else:
                    player_cards = np.zeros(len(self.all_cards), int).tolist()
                player_state.append([hand_size, tichu_flag, player_cards])
            self.state.append(player_state)

    def _update_all_states(self):
        """ Updates states with latest action taken by other players. """
        self.state = list()
        for i in range(4):
            this_player = i
            player_state = list()
            for j in range(4):
                pid = (this_player + j) % 4
                hand_size = self.game.players[pid].hand_size
                tichu_flag = int(self.game.players[pid].tichu_flag)
                if pid == this_player:
                    player_cards = self._cards_to_vec(
                        self.game.players[pid].hand)
                else:
                    player_cards = self.action_buffer[pid]
                player_state.append([hand_size, tichu_flag, player_cards])
            self.state.append(player_state)

    def _reset_action_buffer(self):
        """ Resets the action buffer. """
        for i in range(4):
            self.action_buffer[i] = np.zeros(len(self.all_cards), int).tolist()

    def _update_action_buffer(self, player_id, action):
        """ Updates the action buffer. """
        self.action_buffer[player_id] = action.tolist()

    def _reset_rewards(self):
        """ Resets the rewards to 0. """
        self.rewards = [0, 0, 0, 0]
        self.nstep = self.game.active_player

    def _update_rewards(self, points_this_step):
        """ Updates the rewards according to reward style. """
        self.dispatch_reward[REWARD_STYLE](points_this_step)

    def _update_rich_rewards(self, points_this_step):
        """
        Updates the rewards according to a rich reward function.

        This implementation of a reward function provides rewards after
        each round (i.e. consecutive steps of all 4 players).
        If a player or its teammate (!) gets points during a round
        (e.g. by winning a stack), it receives a reward in the amount of
        the points in this round.
        The benefit of this reward function is that each round yields a
        reward (i.e. no sparse rewards that may impede learning).
        The danger is that the actual points are assigned at the end of a
        game, which means the last player loses all its points to the
        first finisher.
        This may lead to a non-ideal game strategy, where lots of
        rewards might be collected during the game, but the game is
        actually lost if the player does not finish early.
        Also, cumulative reward is higher for players that finish later.
        However, if the winning team gets more cumulative reward, then this
        reward design will still lead to a good policy.
        """
        # reset rewards every new player round
        self.rewards[self.nstep] = 0
        # accumulate rewards (teammate rewards are also taken into account)
        # opponent rewards are considered negative
        rewards_team_0 = (points_this_step[0] + points_this_step[2])
        rewards_team_1 = (points_this_step[1] + points_this_step[3])
        self.rewards[0] += (rewards_team_0 - rewards_team_1)
        self.rewards[1] += (rewards_team_1 - rewards_team_0)
        self.rewards[2] += (rewards_team_0 - rewards_team_1)
        self.rewards[3] += (rewards_team_1 - rewards_team_0)
        # update nstep counter
        self.nstep = (self.nstep + 1) % 4

    def _update_sparse_rewards(self, points_this_step):
        """
        Updates the rewards according to a sparse reward function.

        Sparse rewards mean that rewards are only given when
        a Game is completed.
        The benefit is that the rewards exactly represent the outcome
        of the Game.
        The danger is that it is hard for an Agent to make sense
        of its actions when the rewards come only at the end
        of an episode.

        The sparse rewards are not yet implemented!
        """
        raise NotImplementedError("TODO")

    def _cards_to_vec(self, cards):
        """ Turns a Cards instance into a vector representation. """
        vec = np.zeros(len(self.all_cards), int)
        for i in range(len(self.all_cards)):
            crd = Cards([self.all_cards[i]])
            if cards.contains(crd):
                vec[i] = 1
        return vec.tolist()

    def _vec_to_cards(self, vec):
        """ Turns a vector representation into a Cards instance. """
        return Cards(list(compress(self.all_cards, vec)))
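
An interaction sketch under stated assumptions: Env and its Tichu dependencies are importable from the surrounding project, and choose_action() is a hypothetical agent stub that returns a 56-dimensional OHE numpy array. An all-zero vector is decoded by _vec_to_cards into an empty Cards instance, i.e. a pass, so a real policy must replace the stub for the game to progress:

import numpy as np

def choose_action(player_state, action_size=56):
    # placeholder policy: always pass (all-zero OHE action vector);
    # a real agent must pick playable card combinations here
    return np.zeros(action_size, int)

env = Env(train_mode=True)
state, rewards, done, active_player = env.reset()
for _ in range(1000):  # safety cap, mirroring max_steps in Example No. 1
    action = choose_action(state[active_player])
    state, rewards, done, active_player = env.step(active_player, action)
    if done:
        break
print('final rewards:', rewards)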