def deal_hands(self) -> List[Iterable[Card]]:
    deck = new_deck()
    # Repeat random shuffles until the player's cards are good enough.
    while True:
        np.random.shuffle(deck)
        player_hands = [set(deck[i * 8:(i + 1) * 8]) for i in range(4)]
        if self._are_cards_suitable(player_hands[self._game_mode.declaring_player_id],
                                    self._game_mode):
            return player_hands
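# The suitability check called above is not shown here. As an illustration only, such a check
# could require a minimum number of trumps in the declaring player's hand. The method name
# matches the call above, but `is_trump` and the threshold are assumptions, not the project's
# actual logic.
def _are_cards_suitable(self, player_hand: Iterable[Card], game_mode) -> bool:
    # Assumption: game_mode can tell whether a card counts as trump in the declared game.
    n_trumps = sum(1 for card in player_hand if game_mode.is_trump(card))
    return n_trumps >= 5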
def __init__(self, player_id: int):
    super().__init__(player_id)
    self.static_policy = [
        Card(Suit.eichel, Pip.ober),
        Card(Suit.eichel, Pip.unter),
        Card(Suit.gras, Pip.neun),
        Card(Suit.gras, Pip.ober),
        Card(Suit.herz, Pip.neun),
        Card(Suit.eichel, Pip.acht),
        Card(Suit.herz, Pip.acht),
        Card(Suit.herz, Pip.koenig),
        Card(Suit.gras, Pip.sau),
        Card(Suit.gras, Pip.unter),
        Card(Suit.schellen, Pip.ober),
        Card(Suit.herz, Pip.unter),
        Card(Suit.schellen, Pip.unter),
        Card(Suit.eichel, Pip.sau),
        Card(Suit.schellen, Pip.sau),
        Card(Suit.schellen, Pip.koenig),
        Card(Suit.herz, Pip.ober),
        Card(Suit.herz, Pip.sieben),
        Card(Suit.gras, Pip.zehn),
        Card(Suit.eichel, Pip.neun),
        Card(Suit.schellen, Pip.zehn),
        Card(Suit.gras, Pip.sieben),
        Card(Suit.herz, Pip.sau),
        Card(Suit.schellen, Pip.sieben),
        Card(Suit.eichel, Pip.sieben),
        Card(Suit.schellen, Pip.neun),
        Card(Suit.herz, Pip.zehn),
        Card(Suit.eichel, Pip.koenig),
        Card(Suit.schellen, Pip.acht),
        Card(Suit.gras, Pip.koenig),
        Card(Suit.eichel, Pip.zehn),
        Card(Suit.gras, Pip.acht),
    ]
    assert len(set(self.static_policy)) == len(new_deck()), "Need to include all cards!"
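# Hypothetical usage sketch: one way to consume the static order above is to always play the
# highest-priority card that is still in hand. The play_card signature is an assumption for
# illustration and may differ from the agent's real interface.
def play_card(self, cards_in_hand: Iterable[Card]) -> Card:
    for card in self.static_policy:
        if card in cards_in_hand:
            return card
    raise ValueError("Hand is empty - no card left to play!")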
def __init__(self, player_id: int, config: Dict, training: bool):
    """
    Creates a new DQNAgent.

    :param player_id: The unique id of the player (0-3).
    :param config: config dict containing an agent_config node.
    :param training: If True, will train during play. This usually means worse performance (because of exploration).
                     If False, then the agent will always pick the highest-ranking valid action.
    """
    super().__init__(player_id)
    self.logger = get_class_logger(self)
    config = config["agent_config"]["dqn_agent"]
    self.config = config
    self.training = training

    # We encode cards as one-hot vectors of size 32.
    # Providing indices to perform quick lookups.
    self._id2card = new_deck()
    self._card2id = {card: i for i, card in enumerate(self._id2card)}

    # Determine length of state vector.
    state_lens = {
        "cards_in_hand": 32,
        "cards_in_trick": 3 * 32,
        "cards_already_played": 32
    }
    self._state_size = sum(state_lens[x] for x in config["state_contents"])

    # Action space: one action for every card. Naturally, most actions will be invalid
    # because the agent doesn't have the card or is not allowed to play it.
    self._action_size = 32

    # If True, then all unavailable actions are zeroed in the q-vector during learning.
    # I thought this might improve training speed, but it turned out to provide only a
    # slight benefit. Incompatible with (and superseded by) allow_invalid_actions.
    self._zero_q_for_invalid_actions = config["zero_q_for_invalid_actions"]

    # If allowed, then the agent can choose an invalid card and get punished for it, while staying
    # in the same state. If not allowed, invalid actions are automatically skipped when playing.
    # See discussion in experiment_log.md
    self._allow_invalid_actions = config["allow_invalid_actions"]
    self._invalid_action_reward = config["invalid_action_reward"]
    if self._allow_invalid_actions and self._zero_q_for_invalid_actions:
        raise ValueError("allow_invalid_actions and zero_q_for_invalid_actions are mutually exclusive.")

    # Discount and exploration rate
    self._gamma = config["gamma"]
    self._epsilon = config["epsilon"]

    # Experience replay buffer for minibatch learning
    self.experience_buffer = deque(maxlen=config["experience_buffer_len"])

    # Remember the state and action (card) played in the previous trick, so we can judge it once we receive feedback.
    # Also remember which actions were valid at that time.
    self._prev_state = None
    self._prev_action = None
    self._prev_available_actions = None
    self._in_terminal_state = False

    # Create Q network (current state) and Target network (successor state).
    # The networks are synced after every episode (game).
    self.q_network = self._build_model()
    self.target_network = self._build_model()
    self._align_target_model()

    self._batch_size = config["batch_size"]

    # Don't retrain after every single experience. Retraining every time is expensive and doesn't
    # add much information (rewards are received only at the end of the game). If we wait for more
    # experiences to accumulate before retraining, we get more fresh data before doing expensive training.
    # NOTE: This kind of breaks the "sync networks after every game" idea, but nevertheless works
    # very well to speed up training.
    self._retrain_every_n = config["retrain_every"]
    self._experiences_since_last_retrain = 0

    # Memory: here are some things the agent remembers between moves. This is basically feature engineering;
    # it would be more interesting to have the agent learn these with an RNN or so!
    self._mem_cards_already_played = set()

    # For display in the GUI
    self._current_q_vals = None
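# The two model helpers called above are not shown. A minimal sketch, assuming a Keras MLP:
# the layer widths, learning rate, optimizer, and loss below are illustrative assumptions,
# not the project's actual hyperparameters.
def _build_model(self):
    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.optimizers import Adam

    model = Sequential([
        Dense(256, activation="relu", input_shape=(self._state_size,)),
        Dense(256, activation="relu"),
        Dense(self._action_size, activation="linear"),      # one Q-value per card
    ])
    model.compile(loss="mse", optimizer=Adam(learning_rate=1e-3))
    return model

def _align_target_model(self):
    # Copy the Q-network weights into the target network.
    self.target_network.set_weights(self.q_network.get_weights())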
def deal_hands(self) -> List[Iterable[Card]]:
    deck = new_deck()
    np.random.shuffle(deck)
    player_hands = [set(deck[i * 8:(i + 1) * 8]) for i in range(4)]
    return player_hands
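# Usage sketch: the four hands returned above partition the 32-card deck without overlap.
# `dealer` is a placeholder name for any object exposing this deal_hands() method.
hands = dealer.deal_hands()
assert sum(len(hand) for hand in hands) == 32
assert set.union(*hands) == set(new_deck())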
def __enter__(self):
    # Show PyGame window. Assets can only be loaded after this.
    self._screen = pygame.display.set_mode(self.resolution)
    self._card_assets = {card: pygame.image.load(get_card_img_path(card)).convert()
                         for card in new_deck()}
    pygame.display.set_caption("Interactive AlphaSheep")

    # Subscribe to events of the controller
    self.game_state.ev_changed.subscribe(self.on_game_state_changed)

    # If a player agent is the GUIAgent, register a callback that blocks until the user selects a card.
    assert not any(isinstance(p.agent, GUIAgent) for p in self.game_state.players[1:]), \
        "Only Player 0 can have a GUIAgent."
    if isinstance(self.game_state.players[0].agent, GUIAgent):
        def select_card_callback(reset_clicks=False):
            if reset_clicks:
                self._clicked_pos = None
                self._clicked_card = None
            self.wait_and_draw_until(lambda: self._clicked_card is not None)
            return self._clicked_card

        self.game_state.players[0].agent.register_gui_callback(select_card_callback)
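# The GUIAgent side of this handshake is not shown. A minimal sketch of how the registered
# callback could be consumed; the base-class name and the play_card signature are assumptions
# for illustration.
class GUIAgent(PlayerAgent):
    def __init__(self, player_id: int):
        super().__init__(player_id)
        self._select_card_callback = None

    def register_gui_callback(self, callback):
        self._select_card_callback = callback

    def play_card(self, cards_in_hand: Iterable[Card]) -> Card:
        assert self._select_card_callback is not None, "No GUI callback registered!"
        # Blocks until the user clicks a card in the GUI; previous clicks are reset first.
        return self._select_card_callback(reset_clicks=True)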