def _log_strategy(self, policy: Policy, infoset: Optional[LeducInfoset], global_step: int):
    def recurse(new_action):
        # Descend into the infoset reached by taking new_action from the current infoset.
        after_action_infoset = copy.deepcopy(infoset)
        after_action_infoset.add_action(new_action)
        self._log_strategy(policy, after_action_infoset, global_step)

    if infoset is None:
        # Root: enumerate every possible hole card.
        for card in range(3):
            infoset = LeducInfoset(card, bet_sequences=[(), ()], board_card=None)
            self._log_strategy(policy, infoset, global_step)
    elif infoset.player_to_act == -1:
        # Chance node: enumerate every possible board card.
        for board_card in range(3):
            infoset = LeducInfoset(card=infoset.card, bet_sequences=infoset.bet_sequences, board_card=board_card)
            self._log_strategy(policy, infoset, global_step)
    elif infoset.is_terminal:
        return
    else:
        # Log how far the learned strategy deviates from the Nash strategy at this infoset.
        action_probs = policy.action_prob(infoset)
        nash_action_probs = self.nash_policy.action_prob(infoset)
        action_probs -= nash_action_probs

        node_name = "strategy/" + str(infoset)
        node_name = node_name.replace(":", "_")

        for action in PlayerActions.ALL_ACTIONS:
            if action == PlayerActions.FOLD and infoset.can_fold:
                suffix = "/f"
            elif action == PlayerActions.BET_RAISE and infoset.can_raise:
                suffix = "/r"
            elif action == PlayerActions.CHECK_CALL:
                suffix = "/c"
            else:
                continue  # action is not legal at this infoset

            if not self.text_only:
                self.writer.add_scalar(node_name + suffix, action_probs[action], global_step=global_step)
            logger.debug("Epoch %s Strategy %s %s", global_step, node_name + suffix, action_probs[action])
            self.total_error += abs(action_probs[action])
            self.state_cnt += 1
            recurse(action)
def log_qvals(
        writer: SummaryWriter,
        policy: QPolicy,
        infoset: Optional[LeducInfoset],
        global_step: int,
        text_only: bool):
    def recurse(new_action):
        # Descend into the infoset reached by taking new_action from the current infoset.
        after_action_infoset = copy.deepcopy(infoset)
        after_action_infoset.add_action(new_action)
        log_qvals(writer, policy, after_action_infoset, global_step, text_only)

    if infoset is None:
        # Root: enumerate every possible hole card.
        for card in range(3):
            infoset = LeducInfoset(card, bet_sequences=[(), ()], board_card=None)
            log_qvals(writer, policy, infoset, global_step, text_only)
    elif infoset.player_to_act == -1:
        # Chance node: enumerate every possible board card.
        for board_card in range(3):
            infoset = LeducInfoset(card=infoset.card, bet_sequences=infoset.bet_sequences, board_card=board_card)
            log_qvals(writer, policy, infoset, global_step, text_only)
    elif infoset.is_terminal:
        return
    else:
        # Evaluate the local Q-network at this infoset and log one scalar per legal action.
        state = infoset_to_state(infoset)
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        q_vals = policy.qnetwork_local.forward(state).cpu().numpy()[0]

        node_name = "q_vals/" + str(infoset)
        node_name = node_name.replace(":", "_")

        for action in PlayerActions.ALL_ACTIONS:
            if action == PlayerActions.FOLD and infoset.can_fold:
                suffix = "/f"
            elif action == PlayerActions.BET_RAISE and infoset.can_raise:
                suffix = "/r"
            elif action == PlayerActions.CHECK_CALL:
                suffix = "/c"
            else:
                continue  # action is not legal at this infoset

            if not text_only:
                writer.add_scalar(node_name + suffix, q_vals[action], global_step=global_step)
            logger.debug("Epoch %s QValue %s %s", global_step, node_name + suffix, q_vals[action])
            recurse(action)
def test_action_prob_supervised(self):
    self.sut = LeducPoker.NFSP.Agent.NfspAgent(self.mock_q_policy, self.mock_supervised_trainer, nu=0)
    self.sut.leduc_supervised_policy.action_prob = MagicMock(return_value=[1, 0, 0])

    infoset = LeducInfoset(card=1, bet_sequences=[(PlayerActions.BET_RAISE,), ()], board_card=None)

    retval = self.sut.action_prob(infoset)

    self.assertEqual([1, 0, 0], retval)
    self.sut.leduc_supervised_policy.action_prob.assert_called_with(infoset)
def test_notify_reward(self):
    self.sut = LeducPoker.NFSP.Agent.NfspAgent(self.mock_q_policy, self.mock_supervised_trainer, nu=0)
    self.sut.leduc_supervised_policy.action_prob = MagicMock(return_value=[0, 1, 0])

    infoset = LeducInfoset(card=1, bet_sequences=[(PlayerActions.CHECK_CALL,), ()], board_card=None)
    infoset_state = infoset_to_state(infoset)
    self.sut.get_action(infoset)

    self.mock_q_policy.add_sars = MagicMock()

    infoset_next = LeducInfoset(
        card=1, bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE), ()], board_card=None)
    infoset_next_state = infoset_to_state(infoset_next)

    self.sut.notify_reward(next_infoset=infoset_next, reward=123, is_terminal=True)

    # call_args[0] holds the positional args; add_sars is expected to be called with keyword args only
    self.assertEqual(self.mock_q_policy.add_sars.call_args[0], tuple())
    self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["state"].tolist(), infoset_state.tolist())
    self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["action"], PlayerActions.CHECK_CALL)
    self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["reward"], 123)
    self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["next_state"].tolist(), infoset_next_state.tolist())
    self.assertEqual(self.mock_q_policy.add_sars.call_args[1]["is_terminal"], True)
def test_bet_fold_game(self):
    def mock_random_sample(a, b):
        return [1, 0]

    def get_agent0_action(infoset: LeducInfoset):
        return PlayerActions.BET_RAISE

    def get_agent1_action(infoset: LeducInfoset):
        return PlayerActions.FOLD

    # P0 has queen, P1 has jack
    with mock.patch('random.sample', mock_random_sample):
        self.agents[0].get_action = MagicMock(side_effect=get_agent0_action)
        self.agents[1].get_action = MagicMock(side_effect=get_agent1_action)

        LeducPoker.NFSP.Agent.collect_trajectories(self.agents, num_games=1)

    self.agents[0].reset.assert_called_once_with()
    self.agents[1].reset.assert_called_once_with()

    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[0][2],
        {"next_infoset": LeducInfoset(card=1, bet_sequences=[(), ()], board_card=None),
         "reward": 0, "is_terminal": False})
    self.assertEqual(
        self.agents[1].notify_reward.mock_calls[0][2],
        {"next_infoset": LeducInfoset(card=0, bet_sequences=[(PlayerActions.BET_RAISE,), ()], board_card=None),
         "reward": 0, "is_terminal": False})

    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[1][2],
        {"next_infoset": None, "reward": 1, "is_terminal": True})
    self.assertEqual(
        self.agents[1].notify_reward.mock_calls[1][2],
        {"next_infoset": None, "reward": -1, "is_terminal": True})

    self.assertEqual(2, len(self.agents[0].notify_reward.mock_calls))
    self.assertEqual(2, len(self.agents[1].notify_reward.mock_calls))
def make_agent(q_policy_parameters, supervised_trainer_parameters, nu):
    network_units = [64]
    state_size = infoset_to_state(LeducInfoset(card=0, bet_sequences=[(), ()], board_card=None)).shape[0]

    q_network_local = QNetwork(state_size=state_size, action_size=3, hidden_units=network_units).to(device)
    # q_network_target = QNetwork(state_size=state_size, action_size=3, hidden_units=network_units).to(device)
    q_network_target = None
    q_policy = QPolicy(
        nn_local=q_network_local,
        nn_target=q_network_target,
        parameters=q_policy_parameters)

    supervised_network = SupervisedNetwork(state_size=state_size, action_size=3, hidden_units=network_units).to(device)
    supervised_trainer = SupervisedTrainer(
        supervised_trainer_parameters=supervised_trainer_parameters,
        network=supervised_network)

    return NfspAgent(q_policy=q_policy, supervised_trainer=supervised_trainer, nu=nu)
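
# Usage sketch (illustrative, not part of the original listing): wire up two NFSP agents and run a
# batch of self-play games. The q_policy_parameters / supervised_trainer_parameters objects are
# assumed to be constructed elsewhere, and collect_trajectories is assumed importable from
# LeducPoker.NFSP.Agent as in the tests in this listing. nu=0.1 is treated as the NFSP
# anticipatory parameter: the probability of acting from the Q best-response policy instead of
# the supervised average policy.
def run_selfplay_games(q_policy_parameters, supervised_trainer_parameters, num_games=128):
    agents = [make_agent(q_policy_parameters, supervised_trainer_parameters, nu=0.1) for _ in range(2)]
    LeducPoker.NFSP.Agent.collect_trajectories(agents, num_games=num_games)
    return agents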
def test_action_prob_q(self):
    self.sut = LeducPoker.NFSP.Agent.NfspAgent(self.mock_q_policy, self.mock_supervised_trainer, nu=1.1)
    self.sut.use_q_policy = True
    self.sut.leduc_rl_policy.get_action = MagicMock(return_value=1)
    self.sut.supervised_trainer.add_observation = MagicMock()

    infoset = LeducInfoset(card=1, bet_sequences=[(PlayerActions.CHECK_CALL,), ()], board_card=None)
    infoset_state = infoset_to_state(infoset)

    retval = self.sut.action_prob(infoset)

    self.assertListEqual([0, 1, 0], retval.tolist())
    self.assertEqual(infoset_state.tolist(), self.sut.last_state.tolist())
    self.sut.leduc_rl_policy.get_action.assert_called_with(infoset)
    self.assertEqual(self.sut.supervised_trainer.add_observation.call_args[0][0].tolist(), infoset_state.tolist())
    self.assertEqual(self.sut.supervised_trainer.add_observation.call_args[0][1], 1)
def _get_terminal_game_state_value(
        self, my_infoset: LeducPoker.LeducInfoset, opponent_card_probs: np.ndarray) -> np.ndarray:
    retval = np.zeros(2)

    player_cards = [0, 0]
    player_cards[self.player_num] = my_infoset.card
    for opponent_card in LeducPoker.LeducPokerGame.DECK:
        if opponent_card == my_infoset.card or opponent_card == my_infoset.board_card:
            assert opponent_card_probs[opponent_card] == 0
            continue

        player_cards[self.opponent_num] = opponent_card
        infoset_payoffs = my_infoset.get_payoffs(player_cards).astype(float)
        infoset_payoffs -= infoset_payoffs.sum() / 2  # Equivalent to subtracting off the cumulative value of the bets

        payoffs = opponent_card_probs[opponent_card] * infoset_payoffs
        retval += payoffs
    return retval
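
# Worked example of the centering above (illustrative numbers, assuming get_payoffs returns each
# player's gross share of the pot): if both players committed 2 chips and player 0 wins the
# 4-chip pot, the gross payoffs are [4, 0]. Subtracting half the sum from each entry gives
#   np.array([4.0, 0.0]) - 4.0 / 2  ->  array([ 2., -2.])
# i.e. the zero-sum net winnings that the expected-value recursion accumulates.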
def test_check_bet_call_game(self):
    def mock_random_sample(a, b):
        return [1, 2]

    def mock_random_choice(a):
        return 2

    def get_agent0_action(infoset: LeducInfoset):
        return PlayerActions.CHECK_CALL

    def get_agent1_action(infoset: LeducInfoset):
        return PlayerActions.BET_RAISE

    # P0 has queen, P1 has king
    with mock.patch('random.sample', mock_random_sample):
        with mock.patch('random.choice', mock_random_choice):
            self.agents[0].get_action = MagicMock(side_effect=get_agent0_action)
            self.agents[1].get_action = MagicMock(side_effect=get_agent1_action)

            LeducPoker.NFSP.Agent.collect_trajectories(self.agents, num_games=1)

    self.agents[0].reset.assert_called_once_with()
    self.agents[1].reset.assert_called_once_with()

    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[0][2],
        {"next_infoset": LeducInfoset(1, bet_sequences=[(), ()], board_card=None),
         "reward": 0, "is_terminal": False})
    self.assertEqual(
        self.agents[1].notify_reward.mock_calls[0][2],
        {"next_infoset": LeducInfoset(2, bet_sequences=[(PlayerActions.CHECK_CALL,), ()], board_card=None),
         "reward": 0, "is_terminal": False})
    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[1][2],
        {"next_infoset": LeducInfoset(
            1, bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE), ()], board_card=None),
         "reward": 0, "is_terminal": False})

    # 2nd round
    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[2][2],
        {"next_infoset": LeducInfoset(
            1,
            bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE, PlayerActions.CHECK_CALL), ()],
            board_card=2),
         "reward": 0, "is_terminal": False})
    self.assertEqual(
        self.agents[1].notify_reward.mock_calls[1][2],
        {"next_infoset": LeducInfoset(
            2,
            bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE, PlayerActions.CHECK_CALL),
                           (PlayerActions.CHECK_CALL,)],
            board_card=2),
         "reward": 0, "is_terminal": False})
    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[3][2],
        {"next_infoset": LeducInfoset(
            1,
            bet_sequences=[(PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE, PlayerActions.CHECK_CALL),
                           (PlayerActions.CHECK_CALL, PlayerActions.BET_RAISE)],
            board_card=2),
         "reward": 0, "is_terminal": False})

    # Terminals
    self.assertEqual(
        self.agents[1].notify_reward.mock_calls[2][2],
        {"next_infoset": None, "reward": 7, "is_terminal": True})
    self.assertEqual(
        self.agents[0].notify_reward.mock_calls[4][2],
        {"next_infoset": None, "reward": -7, "is_terminal": True})

    self.assertEqual(5, len(self.agents[0].notify_reward.mock_calls))
    self.assertEqual(3, len(self.agents[1].notify_reward.mock_calls))