class Qlearn1AgentNP(BaseAgentNP):
  MODEL_FILES = ['q.modelb']
  logger = EpochLogger(output_dir='qlearn1/logs', output_fname='progress.csv')

  def initialize(self, batch_size, initial_capital, n_players):
    self.BATCH_SIZE = batch_size
    self.INITAL_CAPITAL = initial_capital
    self.N_PLAYERS = n_players

    # 5 community cards x 53 (52 cards + "unknown") + 2 holecards x 52,
    # (1 position in this round + 1 folded + 1 pot investment total
    #  + 1 pot investment this round + 1 which player last raised) x 6 players
    # + round x 5 (1h)
    self.obs_dim = (5) * 53 + (2) * 52 + (1 + 1 + 1 + 1 + 1) * 6 + (1) * 5
    self.act_dim = 6

    self.q = models.MLPQFunction(self.obs_dim, self.act_dim,
                                 trainable=self.TRAINABLE, device=DEVICE)

    self.possible_actions = torch.zeros((self.BATCH_SIZE, self.act_dim, 3), device=DEVICE)
    self.possible_actions[:, 0, constants.FOLD] = 1
    self.possible_actions[:, 1, constants.CALL] = 1
    self.possible_actions[:, 2:, constants.RAISE] = 1
    self.possible_raises = np.array([0, 0, 4, 20, 100, 200])

    if self.TRAINABLE:
      self.reward = torch.zeros(self.BATCH_SIZE)
      self.replaybuffer = replaybuffer.ReplayBuffer(obs_dim=self.obs_dim,
                                                    act_dim=self.act_dim,
                                                    batch_size=self.BATCH_SIZE,
                                                    size=REPLAYBUFFER_SIZE,
                                                    device=DEVICE)
      self.q_optimizer = torch.optim.Adam(self.q.parameters(), lr=Q_LEARNING_RATE)

    self.first_round = True
    self.prev_state = None
    self.prev_action = None

    self.load_model()

  def act(self, player_idx, round, active_games, current_bets, min_raise,
          prev_round_investment, folded, last_raiser, hole_cards, community_cards):
    hole_cards.sort(axis=1)
    community_cards[:, 0:3].sort(axis=1)

    state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                     prev_round_investment, folded, last_raiser,
                                     hole_cards, community_cards)

    actions, amounts, actions_serialized = self.choose_action(
        torch.as_tensor(state, dtype=torch.float32, device=DEVICE), current_bets)

    if self.TRAINABLE:
      if not self.first_round:
        self.replaybuffer.store(obs=self.prev_state, act=self.prev_action,
                                next_obs=state, active_games=active_games)
      self.first_round = False
      self.prev_state = state
      self.prev_action = actions_serialized

    return actions, amounts

  def end_trajectory(self, player_idx, round, current_bets, min_raise,
                     prev_round_investment, folded, last_raiser, hole_cards,
                     community_cards, gains):
    # TODO: bugfix to prevent a crash in case the agent never acted before the game finished
    if self.TRAINABLE and self.prev_state is not None:
      state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                       prev_round_investment, folded, last_raiser,
                                       hole_cards, community_cards)

      scaled_gains = (gains / self.INITAL_CAPITAL - (self.N_PLAYERS / 2 - 1)) * 2 / self.N_PLAYERS

      # DEBUGTOOL
      lost_money = (gains / self.INITAL_CAPITAL)
      lost_money[folded[:, player_idx] == 0] = 0

      self.reward = torch.Tensor(scaled_gains).to(DEVICE)
      self.replaybuffer.store(obs=self.prev_state, act=self.prev_action,
                              next_obs=state, active_games=np.ones(self.BATCH_SIZE))

      self.logger.store(Reward=scaled_gains, LostInFolding=lost_money,
                        LostGeneral=(gains / self.INITAL_CAPITAL))

      self.train()
      self.save_model()
      # FIXME: Remember that replaybuffer is *not* emptied here

  def train(self):
    state = self.replaybuffer.sample_state()
    while state:
      self.update_parameters(state)
      state = self.replaybuffer.sample_state()
    self.log_everything()

  def log_everything(self):
    self.logger.log_tabular('Folds', average_only=True)
    self.logger.log_tabular('Calls', average_only=True)
    for i in range(2, self.act_dim):
      self.logger.log_tabular('Raises ' + str(self.possible_raises[i]), average_only=True)
    self.logger.log_tabular('LostInFolding', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('LostGeneral', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('QVals', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('Reward', average_only=True)
    self.logger.log_tabular('LossQ', average_only=True)
    self.logger.dump_tabular()

  def build_network_input(self, player_idx, round, current_bets, min_raise,
                          prev_round_investment, folded, last_raiser, hole_cards,
                          community_cards):
    # First convert the treys card IDs into indices
    hole_cards_converted = (13 * np.log2(np.right_shift(hole_cards, 12) & 0xF)
                            + (np.right_shift(hole_cards, 8) & 0xF))
    community_cards_converted = (13 * np.log2(np.right_shift(community_cards, 12) & 0xF)
                                 + (np.right_shift(community_cards, 8) & 0xF))
    # Then convert those indices into 1h
    hole_cards_1h = (np.arange(52) == hole_cards_converted[..., None] - 1).astype(int)
    known_community_cards_1h = (np.arange(53) == community_cards_converted[..., None] - 1).astype(int)
    # Fill missing community cards with zero
    missing_community_cards = np.zeros((self.BATCH_SIZE, 5 - community_cards.shape[1], 53))
    # Have a 53rd column in the 1h to indicate missing cards, and fill that with ones where relevant
    missing_community_cards[:, :, -1] = 1
    community_cards_1h = np.concatenate((known_community_cards_1h, missing_community_cards), axis=1)

    player_data = np.zeros((self.BATCH_SIZE, 5, self.N_PLAYERS))
    # Who folded already
    player_data[:, 0, :] = folded
    # Who put how much total into the pot
    player_data[:, 1, :] = (prev_round_investment + current_bets) / self.INITAL_CAPITAL
    # Who put how much this round
    player_data[:, 2, :] = (current_bets) / self.INITAL_CAPITAL
    # Who was the last to raise
    player_data[:, 3, :] = np.eye(self.N_PLAYERS)[last_raiser]
    # Reorder the first four to correspond to player_idx
    player_data = np.concatenate((player_data[:, :, player_idx:], player_data[:, :, :player_idx]), axis=2)
    # Which player are we
    player_data[:, 4, player_idx] = 1

    tail_data = np.zeros((self.BATCH_SIZE, 5))
    tail_data[:, round] = 1

    network_input = np.concatenate((hole_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    community_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    player_data.reshape(self.BATCH_SIZE, -1),
                                    tail_data.reshape(self.BATCH_SIZE, -1)), axis=1)

    assert network_input.shape[1] == self.obs_dim

    return network_input

  def choose_action(self, network_input, current_bets):
    # Score every discrete action with the Q network, then pick the argmax per game
    scores = np.empty((self.BATCH_SIZE, self.act_dim))
    with torch.no_grad():
      for idx in range(self.possible_actions.shape[1]):
        onehot_actions = torch.eye(self.act_dim, device=DEVICE)[
            torch.full((self.BATCH_SIZE,), idx, dtype=torch.long, device=DEVICE)]
        scores[:, idx] = self.q(network_input, onehot_actions).cpu().numpy()

    actions = np.argmax(scores, axis=1)

    if self.TRAINABLE:
      # Epsilon-greedy exploration
      dice = np.random.random(self.BATCH_SIZE)
      rand_actions = np.random.randint(0, self.act_dim, self.BATCH_SIZE)
      actions[dice <= NOISE_LEVEL] = rand_actions[dice <= NOISE_LEVEL]

    actions_array = np.eye(self.act_dim)[actions]
    amounts = self.possible_raises[actions]
    actions[actions > constants.RAISE] = constants.RAISE
    actions[current_bets.sum(axis=1) == 0] = constants.CALL

    self.logger.store(Calls=100 * np.mean(actions == constants.CALL),
                      Folds=100 * np.mean(actions == constants.FOLD))
    for i in range(2, self.act_dim):
      self.logger.store(**{"Raises " + str(self.possible_raises[i]): 100 * np.mean(actions_array[:, i])})

    return actions, amounts, actions_array

  # Set up function for computing Q-losses
  def compute_loss_q(self, data):
    o, a, o2, active = data['obs'], data['act'], data['obs2'], data['active']

    q = self.q(o, a)

    loss_q = (active * (q - self.reward)**2).sum()

    # Useful info for logging
    q_info = dict(QVals=q.cpu().detach().numpy())
    # q_info = {}

    return loss_q, q_info

  def update_parameters(self, data):
    # Run one gradient descent step for the Q function
    self.q_optimizer.zero_grad()
    loss_q, q_info = self.compute_loss_q(data)
    loss_q.backward()
    self.q_optimizer.step()

    # Record things
    self.logger.store(LossQ=loss_q.item(), **q_info)

  def load_model(self):
    if os.path.exists(self.MODEL_PATH):
      self.q.load(self.MODEL_PATH)
      if self.TRAINABLE:
        self.q_optimizer.load_state_dict(torch.load(os.path.join(self.MODEL_PATH, 'q_opt.optb')))

  def save_model(self):
    print('saved', self.MODEL_PATH)
    self.q.save(self.MODEL_PATH)
    if self.TRAINABLE:
      torch.save(self.q_optimizer.state_dict(), os.path.join(self.MODEL_PATH, 'q_opt.optb'))
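
# Illustrative sketch, not used by either agent class: a worked example of the reward
# scaling applied in end_trajectory(). This assumes `gains` is the net chip result of a
# hand, i.e. between -initial_capital (losing the whole stack) and
# (n_players - 1) * initial_capital (winning everyone else's stack); under that
# assumption the formula maps the two extremes to -1 and +1. The helper name is
# hypothetical and exists only for this demonstration.
def _reward_scaling_demo(initial_capital=100, n_players=6):
  gains = np.array([-initial_capital, 0.0, (n_players - 1) * initial_capital])
  scaled = (gains / initial_capital - (n_players / 2 - 1)) * 2 / n_players
  # With 6 players and a starting stack of 100 this yields roughly [-1.0, -0.667, 1.0]
  return scaled
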
class Sac1AgentNP(BaseAgentNP):
  MODEL_FILES = ['policy.modelb', 'q1.modelb', 'q2.modelb']
  logger = EpochLogger(output_dir='sac1/logs', output_fname='progress.csv')

  def initialize(self, batch_size, initial_capital, n_players):
    self.BATCH_SIZE = batch_size
    self.REPLAY_BATCH_SIZE = 1000
    self.INITAL_CAPITAL = initial_capital
    self.N_PLAYERS = n_players

    # 5 community cards x 53 (52 cards + "unknown") + 2 holecards x 52,
    # (1 position in this round + 1 folded + 1 pot investment total
    #  + 1 pot investment this round + 1 which player last raised) x 6 players
    # + round x 5 (1h)
    self.obs_dim = (5) * 53 + (2) * 52 + (1 + 1 + 1 + 1 + 1) * 6 + (1) * 5

    # Action dimensions
    self.act_dim = 4

    self.ac = models.MLPActorCritic(self.obs_dim, self.act_dim, 1,
                                    trainable=self.TRAINABLE, device=DEVICE)

    if self.TRAINABLE:
      self.target_ac = deepcopy(self.ac)
      for parameter in self.target_ac.parameters():
        parameter.requires_grad = False

      self.replaybuffer = replaybuffer.ReplayBuffer(obs_dim=self.obs_dim,
                                                    act_dim=self.act_dim,
                                                    size=REPLAYBUFFER_SIZE * self.BATCH_SIZE,
                                                    device=DEVICE)

      self.pi_optimizer = torch.optim.Adam(self.ac.parameters(), lr=PI_LEARNING_RATE)
      self.q_optimizer = torch.optim.Adam(itertools.chain(self.ac.q1.parameters(),
                                                          self.ac.q2.parameters()),
                                          lr=Q_LEARNING_RATE)

    self.first_round = True
    self.prev_state = None
    self.prev_action = None

    self.load_model()

  def act(self, player_idx, round, active_games, current_bets, min_raise,
          prev_round_investment, folded, last_raiser, hole_cards, community_cards):
    state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                     prev_round_investment, folded, last_raiser,
                                     hole_cards, community_cards)

    hole_cards.sort(axis=1)
    community_cards[:, 0:3].sort(axis=1)

    network_output = self.ac.act(torch.as_tensor(state, dtype=torch.float32),
                                 deterministic=not self.TRAINABLE)

    if self.TRAINABLE:
      if not self.first_round:
        n_rounds = active_games.sum()
        self.replaybuffer.store(obs=self.prev_state[active_games],
                                act=self.prev_action[active_games],
                                rew=np.zeros(n_rounds),
                                next_obs=state[active_games],
                                done=np.zeros(n_rounds),
                                batch_size=n_rounds)
      self.first_round = False
      self.prev_state = state
      self.prev_action = network_output

    actions, amounts = self.interpret_network_output(network_output, current_bets,
                                                     prev_round_investment, player_idx,
                                                     min_raise)

    return actions, amounts

  def end_trajectory(self, player_idx, round, current_bets, min_raise,
                     prev_round_investment, folded, last_raiser, hole_cards,
                     community_cards, gains):
    # TODO: bugfix to prevent a crash in case the agent never acted before the game finished
    if self.TRAINABLE and self.prev_state is not None:
      state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                       prev_round_investment, folded, last_raiser,
                                       hole_cards, community_cards)

      scaled_gains = (gains / self.INITAL_CAPITAL - (self.N_PLAYERS / 2 - 1)) * 2 / self.N_PLAYERS

      # DEBUGTOOL
      lost_money = (gains / self.INITAL_CAPITAL)
      lost_money[folded[:, player_idx] == 0] = 0

      self.replaybuffer.store(obs=self.prev_state,
                              act=self.prev_action,
                              rew=scaled_gains,
                              next_obs=state,
                              done=np.ones(self.BATCH_SIZE),
                              batch_size=self.BATCH_SIZE)

      self.logger.store(Reward=scaled_gains, LostInFolding=lost_money,
                        LostGeneral=(gains / self.INITAL_CAPITAL))

      self.train()
      self.save_model()
      # FIXME: Remember that replaybuffer is *not* emptied here

  def train(self):
    self.replaybuffer.shuffle()
    batch = self.replaybuffer.sample_batch(batch_size=min(self.REPLAY_BATCH_SIZE, self.BATCH_SIZE))
    while batch:
      self.update_parameters(batch)
      batch = self.replaybuffer.sample_batch(batch_size=min(self.REPLAY_BATCH_SIZE, self.BATCH_SIZE))
    self.log_everything()

  def log_everything(self):
    self.logger.log_tabular('QContribPiLoss', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('LossPi', average_only=True)
    self.logger.log_tabular('EntropyBonus', average_only=True)
    self.logger.log_tabular('Raises', average_only=True)
    self.logger.log_tabular('Calls', average_only=True)
    self.logger.log_tabular('Folds', average_only=True)
    self.logger.log_tabular('LostInFolding', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('LostGeneral', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('QVals', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('TargQVals', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('Reward', average_only=True)
    self.logger.log_tabular('LossQ', average_only=True)
    self.logger.dump_tabular()

  def build_network_input(self, player_idx, round, current_bets, min_raise,
                          prev_round_investment, folded, last_raiser, hole_cards,
                          community_cards):
    # First convert the treys card IDs into indices
    hole_cards_converted = (13 * np.log2(np.right_shift(hole_cards, 12) & 0xF)
                            + (np.right_shift(hole_cards, 8) & 0xF))
    community_cards_converted = (13 * np.log2(np.right_shift(community_cards, 12) & 0xF)
                                 + (np.right_shift(community_cards, 8) & 0xF))
    # Then convert those indices into 1h
    hole_cards_1h = (np.arange(52) == hole_cards_converted[..., None] - 1).astype(int)
    known_community_cards_1h = (np.arange(53) == community_cards_converted[..., None] - 1).astype(int)
    # Fill missing community cards with zero
    missing_community_cards = np.zeros((self.BATCH_SIZE, 5 - community_cards.shape[1], 53))
    # Have a 53rd column in the 1h to indicate missing cards, and fill that with ones where relevant
    missing_community_cards[:, :, -1] = 1
    community_cards_1h = np.concatenate((known_community_cards_1h, missing_community_cards), axis=1)

    player_data = np.zeros((self.BATCH_SIZE, 5, self.N_PLAYERS))
    # Who folded already
    player_data[:, 0, :] = folded
    # Who put how much total into the pot
    player_data[:, 1, :] = (prev_round_investment + current_bets) / self.INITAL_CAPITAL
    # Who put how much this round
    player_data[:, 2, :] = (current_bets) / self.INITAL_CAPITAL
    # Who was the last to raise
    player_data[:, 3, :] = np.eye(self.N_PLAYERS)[last_raiser]
    # Reorder the first four to correspond to player_idx
    player_data = np.concatenate((player_data[:, :, player_idx:], player_data[:, :, :player_idx]), axis=2)
    # Which player are we
    player_data[:, 4, player_idx] = 1

    tail_data = np.zeros((self.BATCH_SIZE, 5))
    tail_data[:, round] = 1

    network_input = np.concatenate((hole_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    community_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    player_data.reshape(self.BATCH_SIZE, -1),
                                    tail_data.reshape(self.BATCH_SIZE, -1)), axis=1)

    assert network_input.shape[1] == self.obs_dim

    return network_input

  def interpret_network_output(self, network_output, current_bets, prev_round_investment,
                               player_idx, min_raise):
    chosen_action = np.argmax(network_output[:, :3], axis=1)
    actions = np.array([constants.FOLD, constants.CALL, constants.RAISE])[chosen_action]
    actions[current_bets.sum(axis=1) == 0] = constants.CALL
    current_stake = current_bets[:, player_idx] + prev_round_investment[:, player_idx]
    amounts = np.clip((network_output[:, 1] + 1) * self.INITAL_CAPITAL / 2,
                      min_raise, self.INITAL_CAPITAL - current_stake)

    self.logger.store(
        Raises=100 * np.mean(actions == constants.RAISE),
        Calls=100 * np.mean(actions == constants.CALL),
        Folds=100 * np.mean(actions == constants.FOLD),
    )

    return actions, amounts

  # Set up function for computing SAC Q-losses
  def compute_loss_q(self, data):
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

    q1 = self.ac.q1(o, a)
    q2 = self.ac.q2(o, a)

    # Bellman backup for Q functions
    with torch.no_grad():
      # Target actions come from *current* policy
      a2, logp_a2 = self.ac.pi(o2)

      # Target Q-values
      q1_pi_targ = self.target_ac.q1(o2, a2)
      q2_pi_targ = self.target_ac.q2(o2, a2)
      q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
      self.logger.store(EntropyBonus=(-ALPHA * logp_a2).cpu().detach().numpy())
      backup = r + GAMMA * (1 - d) * (q_pi_targ - ALPHA * logp_a2)

    # MSE loss against Bellman backup
    loss_q1 = ((q1 - backup)**2).mean()
    loss_q2 = ((q2 - backup)**2).mean()
    loss_q = loss_q1 + loss_q2

    # Useful info for logging
    q_info = dict(QVals=((q1 + q2) / 2).cpu().detach().numpy(),
                  TargQVals=((q1_pi_targ + q2_pi_targ) / 2).cpu().detach().numpy())
    # q_info = {}

    return loss_q, q_info

  # Set up function for computing SAC pi loss
  def compute_loss_pi(self, data):
    o = data['obs']
    action, logp_pi = self.ac.pi(o)
    q1_pi = self.ac.q1(o, action)
    q2_pi = self.ac.q2(o, action)
    q_pi = torch.min(q1_pi, q2_pi)

    # Entropy-regularized policy loss
    loss_pi = (ALPHA * logp_pi - q_pi).mean()

    self.logger.store(QContribPiLoss=(torch.abs(q_pi) /
                                      (torch.abs(q_pi) + torch.abs(ALPHA * logp_pi))
                                      ).mean().cpu().detach().numpy())

    # Useful info for logging
    pi_info = dict(LogPi=logp_pi.cpu().detach().numpy())
    # pi_info = {}

    return loss_pi, pi_info

  def update_parameters(self, data):
    # First run one gradient descent step for Q1 and Q2
    self.q_optimizer.zero_grad()
    loss_q, q_info = self.compute_loss_q(data)
    loss_q.backward()
    self.q_optimizer.step()

    # Record things
    self.logger.store(LossQ=loss_q.item(), **q_info)

    # Freeze Q-networks so you don't waste computational effort
    # computing gradients for them during the policy learning step.
    for p in itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()):
      p.requires_grad = False

    # Next run one gradient descent step for pi.
    self.pi_optimizer.zero_grad()
    loss_pi, pi_info = self.compute_loss_pi(data)
    loss_pi.backward()
    self.pi_optimizer.step()

    # Unfreeze Q-networks so you can optimize them at the next step.
    for p in itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()):
      p.requires_grad = True

    # Record things
    self.logger.store(LossPi=loss_pi.item(), **pi_info)

    # Finally, update target networks by polyak averaging.
    with torch.no_grad():
      for p, p_targ in zip(self.ac.parameters(), self.target_ac.parameters()):
        # NB: We use in-place operations "mul_" and "add_" to update target
        # params, as opposed to "mul" and "add", which would make new tensors.
        p_targ.data.mul_(POLYAK)
        p_targ.data.add_((1 - POLYAK) * p.data)

  def load_model(self):
    if os.path.exists(self.MODEL_PATH):
      self.ac.load(self.MODEL_PATH)
      if self.TRAINABLE:
        self.pi_optimizer.load_state_dict(torch.load(os.path.join(self.MODEL_PATH, 'pi_opt.optb')))
        self.q_optimizer.load_state_dict(torch.load(os.path.join(self.MODEL_PATH, 'q_opt.optb')))

  def save_model(self):
    print('saved', self.MODEL_PATH)
    self.ac.save(self.MODEL_PATH)
    if self.TRAINABLE:
      torch.save(self.pi_optimizer.state_dict(), os.path.join(self.MODEL_PATH, 'pi_opt.optb'))
      torch.save(self.q_optimizer.state_dict(), os.path.join(self.MODEL_PATH, 'q_opt.optb'))
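
# Illustrative sketch, not called anywhere in this module: the polyak averaging step that
# Sac1AgentNP.update_parameters() applies to the target network, reproduced on plain
# tensors. Each target parameter is nudged toward the online parameter:
#   p_targ <- POLYAK * p_targ + (1 - POLYAK) * p
# The helper name and the example coefficient are hypothetical; the agent uses the
# module-level POLYAK constant.
def _polyak_update_demo(polyak=0.995):
  p = torch.ones(3)        # stand-in for an online parameter
  p_targ = torch.zeros(3)  # stand-in for the corresponding target parameter
  with torch.no_grad():
    p_targ.mul_(polyak)
    p_targ.add_((1 - polyak) * p)
  # After one step the target has moved (1 - polyak) of the way toward the online value,
  # e.g. tensor([0.0050, 0.0050, 0.0050]) with the default coefficient.
  return p_targ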