Example #1
    def do_step(
            # Board state
            self,
            already_played,
            board,
            agents_finished,
            # Possible states before the next move of this agent
            list_next_possible_states=lambda ap, b: ([], []),
            # Other parameters
            always_use_best=False,
            print_luck=False):
        """
            Performs a (partial) step in the game.

            Returns (Player finished, 
                Already played cards, New board, 
                Best decision made randomly)
        """

        # Prepares the step to do
        self.prepare_step()

        # If player has already finished, pass
        if has_finished(self.hand):
            return True, already_played, board, False

        # Possible actions; Pass if no possible play
        possible_actions = possible_next_moves(self.hand, board)
        if len(possible_actions) == 1 and \
                np.all(possible_actions[0] == 0):
            return False, already_played, board, False

        # Decide action to take
        (possible_qvalues, action_index, action_taken,
         random_choice, best_decision_made_randomly) = \
            self.decide_action_to_take(
                already_played, board, always_use_best,
                print_luck, possible_actions)

        # Compute next state
        next_hand = self.hand - action_taken
        next_board = board if np.all(action_taken == 0) else action_taken
        next_already_played = already_played + action_taken

        # Process next state
        self.process_next_board_state(
            already_played, board, list_next_possible_states,
            next_already_played, next_board, next_hand, possible_qvalues,
            action_index, action_taken, random_choice, agents_finished,
            always_use_best)

        # Return next state
        self.hand = next_hand
        return (has_finished(self.hand), next_already_played, next_board,
                best_decision_made_randomly)
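A minimal driver sketch (not from the source project) showing how the tuple returned by do_step might be consumed. The names play_until_done, agents and num_card_values are assumptions for illustration, and round resets after everyone passes are left out.

import numpy as np

def play_until_done(agents, num_card_values):
    # Shared state: cards played so far and the cards currently on the board
    already_played = np.zeros(num_card_values, dtype=np.int8)
    board = np.zeros(num_card_values, dtype=np.int8)
    finished = [False] * len(agents)

    # Round-robin over the agents until every hand is empty
    while not all(finished):
        for i, agent in enumerate(agents):
            if finished[i]:
                continue
            finished[i], already_played, board, _ = agent.do_step(
                already_played, board, agents_finished=sum(finished))
    return already_played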
Example #2
    def test_has_won(self):
        """ Tests whether a hand has won. """

        self.assertTrue(has_finished(np.zeros(NUM_CARD_VALUES)))
        self.assertFalse(has_finished(np.ones(NUM_CARD_VALUES)))
        self.assertTrue(
            np.all(
                has_finished(np.zeros((
                    2, NUM_CARD_VALUES))) == np.array([True, True])))
        self.assertTrue(
            np.all(
                has_finished(
                    np.array([[0, 0, 0, 0, 1],
                              [0, 0, 0, 0, 0],
                              [1, 0, 0, 0, 0]]))
                == np.array([False, True, False])))
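For reference, a minimal has_finished that satisfies the assertions above. This is only a sketch of the expected behaviour (a hand is finished when it holds no cards, and a 2-D input is treated as a batch of hands), not necessarily the project's implementation.

import numpy as np

def has_finished(hand):
    hand = np.asarray(hand)
    if hand.ndim == 1:
        # A single hand: finished when no cards are left
        return not np.any(hand)
    # A batch of hands: one boolean per row
    return ~np.any(hand, axis=1)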
Example #3
    def process_next_board_state(
            # Last board state
            self, already_played, board,
            # Possible states before the next move of this agent
            list_next_possible_states, next_ap, next_b, next_hand,
            # Decided action
            learned_values, action_index, action_taken, random_choice,
            # Other parameters
            agents_finished, always_use_best):
        """ Processes the next board state. """

        # FIXME agent does not perform two consecutive actions
        # Retrieve next state's q-value
        next_qvalues = self.qtable.get_qtable_entry(
            next_ap, next_b, next_hand)
        next_max = np.nanmax(next_qvalues) \
            if np.any(next_qvalues != None) else 0

        # Determine reward
        if has_finished(next_hand):
            reward_earned = self.rewards[agents_finished]
        else:
            reward_earned = 0

        # Only update if a Q-table entry already exists or the update target is non-zero
        if np.any(learned_values != None) or reward_earned != 0 or next_max != 0:
            # Create Q-Table entry if necessary
            if np.all(learned_values == None):
                self.qtable.create_qtable_entry(
                    already_played, board, self.hand)
                learned_values = self.qtable.get_qtable_entry(
                    already_played, board, self.hand)

            # Determine new value
            def update_func(old_qvalues):
                old_qvalue = old_qvalues.iloc[0, action_index]
                new_value = (1 - self.alpha) * old_qvalue + \
                    self.alpha * (reward_earned + self.gamma * next_max)
                old_qvalues.iloc[0, action_index] = new_value
                return old_qvalues

            self.qtable.update_qtable(
                already_played, board, self.hand, update_func)
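The body of update_func is the standard tabular Q-learning update, Q <- (1 - alpha) * Q + alpha * (reward + gamma * next_max). A worked instance with made-up numbers (alpha = 0.5 and gamma = 0.9 are assumptions, not the agent's actual hyperparameters):

alpha, gamma = 0.5, 0.9
old_qvalue, reward_earned, next_max = 0.2, 1.0, 0.6

new_value = (1 - alpha) * old_qvalue + alpha * (reward_earned + gamma * next_max)
print(new_value)  # 0.5 * 0.2 + 0.5 * (1.0 + 0.9 * 0.6) = 0.87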
Example #4
def only_passing_possible(hand, board):
    """
        Faster than checking len(possible_next_moves(...)) == 1.
    """

    # Finished players have to pass
    if has_finished(hand):
        return True

    # No passing allowed when board empty
    if np.all(board == 0):
        return False

    # Else iterate possible actions
    card_type_in_board = np.argmax(board)
    num_cards_in_board = board[card_type_in_board] \
        if card_type_in_board == JOKER \
        else board[card_type_in_board] + board[JOKER]

    for card_type_in_hand in range(NUM_CARD_VALUES - 1, -1, -1):
        # You can play clean
        if card_type_in_hand < card_type_in_board and \
                hand[card_type_in_hand] >= num_cards_in_board:
            return False

        # Or you can play dirty (with Joker(s))
        if card_type_in_hand != JOKER and hand[JOKER] > 0 \
                and card_type_in_hand < card_type_in_board \
                and num_cards_in_board >= 2 and hand[card_type_in_hand] > 0 \
                and hand[card_type_in_hand] + hand[JOKER] >= num_cards_in_board:
            # Use one joker
            if hand[card_type_in_hand] + 1 >= num_cards_in_board:
                return False

            # Use two jokers
            if hand[JOKER] == 2 and num_cards_in_board >= 3:
                return False

    # No possible actions available
    return True
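A quick illustration of the two outcomes, using assumed toy constants NUM_CARD_VALUES = 5 and JOKER = 4 (the project's real values may differ). As in the comparisons above, a lower index means a stronger card.

import numpy as np

NUM_CARD_VALUES, JOKER = 5, 4              # assumed for illustration only

board = np.array([0, 0, 1, 0, 0])          # a single card of value 2 on the board
strong_hand = np.array([0, 2, 0, 0, 1])    # two cards of value 1 and one joker
weak_hand = np.array([0, 0, 0, 1, 0])      # a single card of value 3, no jokers

print(only_passing_possible(strong_hand, board))  # False: a value-1 card beats the board
print(only_passing_possible(weak_hand, board))    # True: value 3 cannot beat value 2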
Example #5
def possible_next_moves(hand, board):
    """
        Returns possible next moves as a list of actions
    """

    # You can always pass if it is not the initial move
    possible_actions = [np.zeros((1, NUM_CARD_VALUES), dtype=np.int8)]

    # If the board is empty, moves depend only on the hand
    if np.all(board == 0):
        for card_type in range(NUM_CARD_VALUES - 1, -1, -1):
            for num_cards in range(hand[card_type], 0, -1):
                if card_type != JOKER:
                    for num_jokers in range(hand[JOKER] + 1):
                        # Form new board out of jokers and cards
                        possible_actions.append(
                            get_cards_array(card_type, num_cards) +
                            get_cards_array(JOKER, num_jokers))
                else:
                    # Form new board out of only jokers
                    possible_actions.append(
                        get_cards_array(card_type, num_cards))

    # Move has to match current board
    else:
        card_type_in_board = np.argmax(board)
        num_cards_in_board = board[card_type_in_board] \
            if card_type_in_board == JOKER \
            else board[card_type_in_board] + board[JOKER]

        if not has_finished(hand):
            for card_type_in_hand in range(NUM_CARD_VALUES - 1, -1, -1):
                # You can play clean
                if card_type_in_hand < card_type_in_board and \
                        hand[card_type_in_hand] >= num_cards_in_board:
                    possible_actions.append(
                        get_cards_array(card_type_in_hand, num_cards_in_board))

                # Or you can play dirty (with Joker(s))
                if card_type_in_hand != JOKER and hand[JOKER] > 0 \
                        and card_type_in_hand < card_type_in_board \
                        and num_cards_in_board >= 2 and hand[card_type_in_hand] > 0 \
                        and hand[card_type_in_hand] + hand[JOKER] >= num_cards_in_board:
                    # Use one joker
                    if hand[card_type_in_hand] + 1 >= num_cards_in_board:
                        possible_actions.append(
                            get_cards_array(card_type_in_hand,
                                            num_cards_in_board - 1) +
                            get_cards_array(JOKER, 1))

                    # Use two jokers
                    if hand[JOKER] == 2 and num_cards_in_board >= 3:
                        possible_actions.append(
                            get_cards_array(card_type_in_hand,
                                            num_cards_in_board - 2) +
                            get_cards_array(JOKER, 2))

    # If board empty, passing not allowed
    if np.all(board == 0):
        possible_actions = possible_actions[1:]

    return np.vstack(possible_actions)
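get_cards_array is used above but not shown; below is a plausible version consistent with its call sites (a row vector holding num_cards cards of a single value), followed by a small call of possible_next_moves under the same toy constants NUM_CARD_VALUES = 5 and JOKER = 4 as before. The project's own helper may differ in shape or dtype details.

import numpy as np

def get_cards_array(card_type, num_cards):
    # One row per action, same layout as a board/hand vector
    cards = np.zeros((1, NUM_CARD_VALUES), dtype=np.int8)
    cards[0, card_type] = num_cards
    return cards

hand = np.array([0, 2, 0, 0, 1])    # two cards of value 1 and one joker
board = np.array([0, 0, 2, 0, 0])   # a pair of value 2 on the board
print(possible_next_moves(hand, board))
# [[0 0 0 0 0]    <- pass
#  [0 2 0 0 0]    <- play the clean pair of 1s
#  [0 1 0 0 1]]   <- play one 1 together with the joker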
Example #6
    def process_next_board_state(
            # Last board state
            self, already_played, board,
            # Possible states before the next move of this agent
            list_next_possible_states, next_ap, next_b, next_hand,
            # Decided action
            possible_qvalues, action_index, action_taken, random_choice,
            # Other parameters
            agents_finished, always_use_best):
        """ Processes the next board state. """

        if not has_finished(next_hand):
            # List next possible states
            next_already_played_list, next_boards = \
                list_next_possible_states(next_ap, next_b)

            # Retrieve next state's max q-value
            next_possible_actions, next_boards, next_already_played_list = \
                possible_next_moves_for_all(
                    next_hand, next_boards, next_already_played_list)

            next_qvalues = self.predict_q_values_from_network(
                self.convert_to_data_batch(
                    next_already_played_list, next_boards,
                    next_hand, next_possible_actions))
            # FIXME compute weighted average based on the probabilities.
            # Use the mean, since the best action for this agent is probably
            # not going to happen
            next_max = np.nanmean(next_qvalues)

        # Determine reward
        if has_finished(next_hand):
            # Reward based on how many other agents are already finished
            reward_earned = self.rewards[agents_finished]
            # Terminal state has q-value zero
            next_max = 0
        elif np.all(np.all(
            (next_already_played_list - next_possible_actions)
                == already_played, axis=1)):
            # Cards that win a round safely gain fixed rewards
            reward_earned = self.reward_win_round
        else:
            # Else, the more cards played the better
            reward_earned = self.reward_per_card_played * \
                np.linalg.norm(action_taken, 1)

        # Determine new q-value
        future_qvalue = reward_earned + self.gamma * next_max

        # Do not train in inference mode
        if not always_use_best:
            # Record step in replay buffer
            self.replay_buffer.add_batch((
                self.convert_to_data_batch(
                    [already_played], [board], self.hand, [action_taken]
                ), future_qvalue))

            # Fit neural net to observed replays
            if self.step_iteration != 0 and self.step_iteration % \
                    self.train_each_n_steps == 0:
                self.fit_values_to_network()
            self.step_iteration += 1

        # Validate q-values in inference mode
        else:
            self.validation_buffer.add_batch((
                self.convert_to_data_batch(
                    [already_played], [board], self.hand, [action_taken]
                ), future_qvalue))
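The target written to the replay/validation buffer is reward plus gamma times the mean of the successor q-values. A worked instance with made-up numbers (gamma = 0.95 and reward_per_card_played = 0.1 are assumptions, not the agent's actual hyperparameters):

import numpy as np

gamma = 0.95
reward_per_card_played = 0.1                   # assumed value
action_taken = np.array([0, 2, 0, 0, 0])       # two cards played
next_qvalues = np.array([0.3, 0.7, np.nan])    # network predictions, NaNs ignored

reward_earned = reward_per_card_played * np.linalg.norm(action_taken, 1)
next_max = np.nanmean(next_qvalues)            # named next_max above, but a mean: 0.5
future_qvalue = reward_earned + gamma * next_max
print(future_qvalue)                           # 0.2 + 0.95 * 0.5 = 0.675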