def do_step(
        # Board state
        self, already_played, board, agents_finished,
        # Possible states before the next move of this agent
        list_next_possible_states=lambda ap, b: ([], []),
        # Other parameters
        always_use_best=False, print_luck=False):
    """
    Performs a (partial) step in the game.

    Returns (Player finished, Already played cards,
             New board, Best decision made randomly)
    """

    # Prepares the step to do
    self.prepare_step()

    # If player has already finished, pass
    if has_finished(self.hand):
        return True, already_played, board, False

    # Possible actions; Pass if no possible play
    possible_actions = possible_next_moves(self.hand, board)
    if len(possible_actions) == 1 and \
            np.all(possible_actions[0] == 0):
        return False, already_played, board, False

    # Decide action to take
    (possible_qvalues, action_index, action_taken, random_choice,
     best_decision_made_randomly) = self.decide_action_to_take(
        already_played, board, always_use_best,
        print_luck, possible_actions)

    # Compute next state
    next_hand = self.hand - action_taken
    next_board = board if np.all(action_taken == 0) else action_taken
    next_already_played = already_played + action_taken

    # Process next state
    self.process_next_board_state(
        already_played, board, list_next_possible_states,
        next_already_played, next_board, next_hand,
        possible_qvalues, action_index, action_taken, random_choice,
        agents_finished, always_use_best)

    # Return next state
    self.hand = next_hand
    return (has_finished(self.hand), next_already_played,
            next_board, best_decision_made_randomly)
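# Usage sketch (hedged): a simplified loop driving do_step for a table of
# agents. The `agents` list and the loop scaffolding are hypothetical; only
# the call signature and return tuple come from the method above. The real
# game loop additionally resets the board once a round is won.
def play_until_done_sketch(agents, already_played, board):
    finished = [False] * len(agents)
    while not all(finished):
        for i, agent in enumerate(agents):
            # Pass in how many agents have finished before this move
            finished[i], already_played, board, _ = agent.do_step(
                already_played, board, sum(finished))
    return already_played, board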
def test_has_won(self):
    """ Tests whether a hand has won. """

    self.assertTrue(has_finished(np.zeros(NUM_CARD_VALUES)))
    self.assertFalse(has_finished(np.ones(NUM_CARD_VALUES)))
    self.assertTrue(np.all(
        has_finished(np.zeros((2, NUM_CARD_VALUES)))
        == np.array([True, True])))
    self.assertTrue(np.all(
        has_finished(np.array([
            [0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0]]))
        == np.array([False, True, False])))
def process_next_board_state(
        # Last board state
        self, already_played, board,
        # Possible states before the next move of this agent
        list_next_possible_states, next_ap, next_b, next_hand,
        # Decided action
        learned_values, action_index, action_taken, random_choice,
        # Other parameters
        agents_finished, always_use_best):
    """ Processes the next board state. """

    # FIXME agent does not perform two consecutive actions

    # Retrieve next state's q-value
    next_qvalues = self.qtable.get_qtable_entry(
        next_ap, next_b, next_hand)
    next_max = np.nanmax(next_qvalues) \
        if np.any(next_qvalues != None) else 0

    # Determine reward
    if has_finished(next_hand):
        reward_earned = self.rewards[agents_finished]
    else:
        reward_earned = 0

    # Only update if either old or new values are not all zero
    if np.any(learned_values != None) or reward_earned != 0 \
            or next_max != 0:
        # Create Q-Table entry if necessary
        if np.all(learned_values == None):
            self.qtable.create_qtable_entry(
                already_played, board, self.hand)
            learned_values = self.qtable.get_qtable_entry(
                already_played, board, self.hand)

        # Determine new value
        def update_func(old_qvalues):
            old_qvalue = old_qvalues.iloc[0, action_index]
            new_value = (1 - self.alpha) * old_qvalue + \
                self.alpha * (reward_earned + self.gamma * next_max)
            old_qvalues.iloc[0, action_index] = new_value
            return old_qvalues

        self.qtable.update_qtable(
            already_played, board, self.hand, update_func)
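# Minimal numeric sketch of the update applied in update_func above. The
# alpha/gamma defaults are made up for illustration; the agent reads the
# real ones from self.alpha and self.gamma.
def q_update_sketch(old_qvalue, reward_earned, next_max,
                    alpha=0.5, gamma=0.95):
    """ Standard one-step Q-learning update, as in update_func. """
    return (1 - alpha) * old_qvalue + \
        alpha * (reward_earned + gamma * next_max)

# E.g. a fresh entry earning the terminal reward 1.0 moves halfway there:
# q_update_sketch(0.0, 1.0, 0.0) == 0.5, then 0.75 on a repeat visit.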
def only_passing_possible(hand, board):
    """ Faster than checking len(possible_next_moves(...)) == 1. """

    # Finished players have to pass
    if has_finished(hand):
        return True

    # No passing allowed when board empty
    if np.all(board == 0):
        return False

    # Else iterate possible actions
    card_type_in_board = np.argmax(board)
    num_cards_in_board = board[card_type_in_board] \
        if card_type_in_board == JOKER \
        else board[card_type_in_board] + board[JOKER]

    for card_type_in_hand in range(NUM_CARD_VALUES - 1, -1, -1):
        # You can play clean
        if card_type_in_hand < card_type_in_board and \
                hand[card_type_in_hand] >= num_cards_in_board:
            return False

        # Or you can play dirty (with Joker(s))
        if card_type_in_hand != JOKER and hand[JOKER] > 0 \
                and card_type_in_hand < card_type_in_board \
                and num_cards_in_board >= 2 and hand[card_type_in_hand] > 0 \
                and hand[card_type_in_hand] + hand[JOKER] \
                >= num_cards_in_board:
            # Use one joker
            if hand[card_type_in_hand] + 1 >= num_cards_in_board:
                return False
            # Use two jokers
            if hand[JOKER] == 2 and num_cards_in_board >= 3:
                return False

    # No possible actions available
    return True
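# Hedged consistency sketch (not part of the original test suite): for
# states where the agent still holds cards, the fast path above should
# agree with the slower check used in do_step. The helper name is made up.
def passing_check_matches(hand, board):
    moves = possible_next_moves(hand, board)
    slow_result = len(moves) == 1 and np.all(moves[0] == 0)
    return only_passing_possible(hand, board) == slow_result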
def possible_next_moves(hand, board):
    """ Returns possible next moves as a list of actions. """

    # You can always pass if it is not the initial move
    possible_actions = [np.zeros((1, NUM_CARD_VALUES), dtype=np.int8)]

    # If board empty, moves do only depend on hand
    if np.all(board == 0):
        for card_type in range(NUM_CARD_VALUES - 1, -1, -1):
            for num_cards in range(hand[card_type], 0, -1):
                if card_type != JOKER:
                    for num_jokers in range(hand[JOKER] + 1):
                        # Form new board out of jokers and cards
                        possible_actions.append(
                            get_cards_array(card_type, num_cards) +
                            get_cards_array(JOKER, num_jokers))
                else:
                    # Form new board out of only jokers
                    possible_actions.append(
                        get_cards_array(card_type, num_cards))
    # Move has to match current board
    else:
        card_type_in_board = np.argmax(board)
        num_cards_in_board = board[card_type_in_board] \
            if card_type_in_board == JOKER \
            else board[card_type_in_board] + board[JOKER]

        if not has_finished(hand):
            for card_type_in_hand in range(NUM_CARD_VALUES - 1, -1, -1):
                # You can play clean
                if card_type_in_hand < card_type_in_board and \
                        hand[card_type_in_hand] >= num_cards_in_board:
                    possible_actions.append(get_cards_array(
                        card_type_in_hand, num_cards_in_board))

                # Or you can play dirty (with Joker(s))
                if card_type_in_hand != JOKER and hand[JOKER] > 0 \
                        and card_type_in_hand < card_type_in_board \
                        and num_cards_in_board >= 2 \
                        and hand[card_type_in_hand] > 0 \
                        and hand[card_type_in_hand] + hand[JOKER] \
                        >= num_cards_in_board:
                    # Use one joker
                    if hand[card_type_in_hand] + 1 >= num_cards_in_board:
                        possible_actions.append(
                            get_cards_array(card_type_in_hand,
                                            num_cards_in_board - 1) +
                            get_cards_array(JOKER, 1))
                    # Use two jokers
                    if hand[JOKER] == 2 and num_cards_in_board >= 3:
                        possible_actions.append(
                            get_cards_array(card_type_in_hand,
                                            num_cards_in_board - 2) +
                            get_cards_array(JOKER, 2))

    # If board empty, passing not allowed
    if np.all(board == 0):
        possible_actions = possible_actions[1:]

    return np.vstack(possible_actions)
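# Usage sketch: the hand contents below are made up; only the encoding
# (one count per card value, jokers at index JOKER) is taken from the
# code above.
def possible_moves_example():
    hand = np.zeros(NUM_CARD_VALUES, dtype=np.int8)
    some_value = 0 if JOKER != 0 else 1   # any non-joker card value
    hand[some_value] = 2                  # two cards of a single value
    board = np.zeros(NUM_CARD_VALUES, dtype=np.int8)  # initial, empty board
    actions = possible_next_moves(hand, board)
    # Expect two rows, playing one or both cards; the all-zero pass row
    # is removed because passing is not allowed on the initial move.
    return actions  # shape: (number of moves, NUM_CARD_VALUES)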
def process_next_board_state(
        # Last board state
        self, already_played, board,
        # Possible states before the next move of this agent
        list_next_possible_states, next_ap, next_b, next_hand,
        # Decided action
        possible_qvalues, action_index, action_taken, random_choice,
        # Other parameters
        agents_finished, always_use_best):
    """ Processes the next board state. """

    if not has_finished(next_hand):
        # List next possible states
        next_already_played_list, next_boards = \
            list_next_possible_states(next_ap, next_b)

        # Retrieve next state's max q-value
        next_possible_actions, next_boards, next_already_played_list = \
            possible_next_moves_for_all(
                next_hand, next_boards, next_already_played_list)
        next_qvalues = self.predict_q_values_from_network(
            self.convert_to_data_batch(
                next_already_played_list, next_boards,
                next_hand, next_possible_actions))

        # FIXME compute weighted average based on the probabilities.
        # Use mean since best action for agent is probably not going to happen
        next_max = np.nanmean(next_qvalues)

    # Determine reward
    if has_finished(next_hand):
        # Reward based on how many other agents are already finished
        reward_earned = self.rewards[agents_finished]
        # Terminal state has q-value zero
        next_max = 0
    elif np.all(np.all(
            (next_already_played_list - next_possible_actions)
            == already_played, axis=1)):
        # Cards that win a round safely gain fixed rewards
        reward_earned = self.reward_win_round
    else:
        # Else, the more cards played the better
        reward_earned = self.reward_per_card_played * \
            np.linalg.norm(action_taken, 1)

    # Determine new q-value
    future_qvalue = reward_earned + self.gamma * next_max

    # Do not train in inference mode
    if not always_use_best:
        # Record step in replay buffer
        self.replay_buffer.add_batch((
            self.convert_to_data_batch(
                [already_played], [board], self.hand, [action_taken]),
            future_qvalue))

        # Fit neural net to observed replays
        if self.step_iteration != 0 and self.step_iteration % \
                self.train_each_n_steps == 0:
            self.fit_values_to_network()
        self.step_iteration += 1
    # Validate q-values in inference mode
    else:
        self.validation_buffer.add_batch((
            self.convert_to_data_batch(
                [already_played], [board], self.hand, [action_taken]),
            future_qvalue))
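# Sketch of the backup used above (illustrative only): unlike the textbook
# max over the agent's own next actions, this variant averages the
# network's q-values over the listed follow-up states, roughly
#     target = reward + gamma * mean(Q(s', a'))
# since the opponents, not the agent, decide which state comes next.
def dqn_target_sketch(reward_earned, next_qvalues, gamma=0.95):
    """ Expected-value backup; gamma here is a made-up default. """
    next_mean = np.nanmean(next_qvalues) if len(next_qvalues) > 0 else 0.0
    return reward_earned + gamma * next_mean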