def validate_featurizer(self, dataset, against_test_data=False):
    if against_test_data:
        d = self.data['test']
    else:
        d = self.data['val']
    x_hand, x_board, y_hs, y_probas_combi = shuffle(
        d['x_hand'], d['x_board'], d['y_hs'], d['y_probs_combi'])
    for i in range(0, len(dataset), self.batch_size):
        batch_i = i // self.batch_size
        hand = variable(x_hand[i:i + self.batch_size], cuda=self.cuda)
        board = variable(x_board[i:i + self.batch_size], cuda=self.cuda)
        target = variable(y_hs[i:i + self.batch_size],
                          cuda=self.cuda).squeeze()
        self.f.eval()
        if len(hand) != self.batch_size:
            break
        HS_pred = self.f.forward(hand, board)[0].squeeze().float()
        HS = FeaturizerManager.clip(HS_pred)
        self.f.train()
        # floor the odds ratio for numerical stability
        m = variable(np.array([1e-6]), cuda=self.cuda)
        pred = t.log(t.max(HS / (1 - HS), m))
        loss = (target - pred).pow(2).mean()
        loss = float(round(loss.data.cpu().numpy()[0], 3))
        if self.tensorboard is not None:
            if against_test_data:
                self.tensorboard.add_scalar_value('test_loss', loss)
            else:
                self.tensorboard.add_scalar_value('validation_loss', loss)
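
# A minimal, self-contained sketch of the log-odds regression target used above,
# assuming hand strength lies in (0, 1). Flooring the odds ratio at 1e-6 before the
# log keeps the loss finite when the clipped prediction approaches 0; the names
# below (toy_hs, toy_target) are illustrative and not part of the codebase.
import torch

def log_odds(hs, floor=1e-6):
    # log(HS / (1 - HS)), floored for numerical stability
    return torch.log(torch.clamp(hs / (1 - hs), min=floor))

toy_hs = torch.tensor([0.05, 0.50, 0.95])
toy_target = torch.tensor([-2.94, 0.00, 2.94])
toy_loss = (toy_target - log_odds(toy_hs)).pow(2).mean()
print(float(toy_loss))  # close to 0: log-odds of 0.05/0.50/0.95 are about -2.94/0/2.94
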
def compute_loss(self, pred, target, imp_weights):
    '''
    Compute the weighted MSE loss.
    The loss of each sample is scaled by its importance weight to correct
    for the bias introduced by prioritized replay sampling.
    An entropy term is optionally added to encourage diversity and
    exploration; beta is a hyperparameter controlling its strength.
    '''
    td_deltas = pred - target
    loss = t.mean(imp_weights * td_deltas.pow(2))
    if self.use_entropy_loss:
        # @experimental: entropy term, with beta decayed over episodes
        episode_id = self.game_info['#episodes']
        self.beta = np.max(
            [self.beta / np.power(np.max([1, episode_id]), 1 / 4), 0.01])
        beta_var = variable(self.beta, cuda=self.is_cuda)
        sm = Softmax(dim=0)
        probs = sm(pred.unsqueeze(dim=1))
        # floor the probabilities for numerical stability of the log
        m = variable(np.array([1e-6]), cuda=self.is_cuda)
        entropy = -t.sum(probs * t.log(t.max(probs, m)))
        loss = loss - beta_var * entropy
    return loss, td_deltas
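
# A self-contained sketch of the importance-weighted TD loss with the entropy bonus,
# mirroring compute_loss above on toy tensors. beta and the tensors are made up for
# illustration; the softmax-over-predictions entropy matches the @experimental
# variant in the method rather than a standard policy-entropy formulation.
import torch
import torch.nn.functional as F

def weighted_td_loss_sketch(pred, target, imp_weights, beta=0.05):
    td_deltas = pred - target
    loss = torch.mean(imp_weights * td_deltas.pow(2))
    probs = F.softmax(pred, dim=0)
    entropy = -torch.sum(probs * torch.log(probs.clamp(min=1e-6)))
    return loss - beta * entropy, td_deltas

pred = torch.tensor([1.0, 0.5, -0.2])
target = torch.tensor([0.8, 0.7, 0.0])
imp_weights = torch.tensor([1.0, 0.5, 2.0])
loss, deltas = weighted_td_loss_sketch(pred, target, imp_weights)
print(float(loss), deltas.tolist())
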
def bucket_encode_actions(actions, cuda=False):
    """
    Note that this is NOT one-hot encoding: it is bucket encoding (it maps an
    action, represented as a 6x1 array, to an integer bucket).
    It can be used in the loss; you should add +1 (see the keys of
    `Action.BET_BUCKETS` to understand why).
    :param actions: a VARIABLE of size batch_size x 6 (there are 6 action
                    types: check, bet, call, raise, all-in, fold)
    :return: a VARIABLE of size batch_size containing the bucket index of
             each action (-1 for fold, 0 for check, 1-13 for bet sizes,
             14 for all-in)
    """
    values, indices = t.max(actions, -1)
    actions_buckets = variable(np.zeros(values.data.cpu().numpy().shape),
                               cuda=cuda)
    actions_buckets[indices == 0] = 0    # check
    actions_buckets[indices == 4] = 14   # all-in
    actions_buckets[indices == 5] = -1   # fold
    # bet/call/raise amounts are mapped to one of the intermediate buckets
    mask = (indices != 0) * (indices != 5) * (indices != 4)
    for bucket_idx in range(1, 14):
        indicator = lambda x: bucket_idx * (
            (x >= Action.BET_BUCKETS[bucket_idx][0]).float() *
            (x <= Action.BET_BUCKETS[bucket_idx][1]).float())
        actions_buckets[mask] += indicator(values[mask])
    return actions_buckets
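
# A toy illustration of the bucket-encoding idea: bet/raise amounts are mapped to
# integer buckets by interval membership, while check, all-in and fold get fixed
# codes (0, 14, -1). The bucket boundaries below are invented for the example; the
# real ones live in Action.BET_BUCKETS, which is not shown in this listing.
TOY_BET_BUCKETS = {1: (1, 2), 2: (3, 5), 3: (6, 10), 4: (11, 20), 5: (21, 200)}

def toy_bucket(amount):
    for bucket_idx, (low, high) in TOY_BET_BUCKETS.items():
        if low <= amount <= high:
            return bucket_idx
    raise ValueError('amount {} falls outside all toy buckets'.format(amount))

print([toy_bucket(a) for a in (2, 8, 50)])  # -> [1, 3, 5]
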
def create_state_variable(state, cuda=False):
    # TODO: check if dtype should be handled individually
    # do not use this; use variable() in utils instead
    return [variable(e, cuda=cuda) for e in state]
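
# The variable() helper from utils is referenced throughout this listing but not
# shown. A minimal sketch of what such a wrapper plausibly does, assuming it turns
# numpy arrays (or scalars) into float tensors and optionally moves them to the GPU;
# the actual implementation may differ (e.g. the legacy torch.autograd.Variable API).
import numpy as np
import torch

def variable_sketch(x, cuda=False):
    tensor = torch.as_tensor(np.asarray(x), dtype=torch.float32)
    return tensor.cuda() if cuda else tensor

toy_state = [np.zeros((1, 5)), np.ones((1, 3))]
print([variable_sketch(s).shape for s in toy_state])
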
def choose_action(self, player, board, pot, actions, b_round, opponent_stack,
                  opponent_side_pot, blinds, episode_idx, for_play=False):
    # decay epsilon in the same way as in the paper (NFSP, 2016), using the
    # number of episodes as n; the exact schedule was not specified, but was
    # hinted to be slower than a sqrt(n) decay, so we divide by n^(1/4)
    self.eps = np.max(
        [self.eps / np.power(np.max([episode_idx, 1]), 1 / 4), 0.01])
    self.eta = np.max(
        [self.eta / np.power(np.max([episode_idx, 1]), 1 / 4), 0.1])
    if player.is_all_in:
        assert player.stack == 0
        return Action('null'), False
    if self.eta >= np.random.rand():
        # with probability eta, use the epsilon-greedy (best-response) policy
        if self.verbose:
            start = timer()
        action = strategy_RL_aux(player, board, pot, actions, b_round,
                                 opponent_stack, opponent_side_pot, self._Q,
                                 greedy=self.is_greedy, blinds=blinds,
                                 verbose=self.verbose, eps=self.eps,
                                 cuda=self.cuda, for_play=for_play)
        if self.verbose:
            print('forward pass of Q took', timer() - start)
        self.is_Q_used = True
    else:
        # otherwise, use the average policy
        state = build_state(player, board, pot, actions, opponent_stack,
                            blinds[1], as_variable=False)
        state = [variable(s, cuda=self.cuda) for s in state]
        state.append(for_play)
        if self.verbose:
            start = timer()
        action_probs = self._pi.forward(*state).squeeze()
        if self.verbose:
            print('forward pass of pi took', timer() - start)
        possible_actions = authorized_actions_buckets(
            player, actions, b_round, opponent_side_pot)

        # @hack: add some heuristics
        # if check is possible, one should not fold
        if 0 in possible_actions and -1 in possible_actions:
            del possible_actions[possible_actions.index(-1)]
        # @hack: remove high roller actions
        # try:
        #     # anything betting above 20 should be discouraged
        #     # when the initial money is only 100
        #     high_bet_i = possible_actions.index(8)
        #     # but allow all-in
        #     possible_actions = possible_actions[:high_bet_i] + [possible_actions[-1]]
        # except ValueError:
        #     pass

        idx = [
            idx_to_bucket(k) for k, _ in enumerate(action_probs)
            if idx_to_bucket(k) in possible_actions
        ]
        valid_action_probs = t.stack([
            p for k, p in enumerate(action_probs)
            if idx_to_bucket(k) in possible_actions
        ])
        valid_action_probs /= t.sum(valid_action_probs)
        action = bucket_to_action(
            sample_action(idx, valid_action_probs.data.cpu().numpy()),
            actions, b_round, player, opponent_side_pot)
        self.is_Q_used = False
    return action, self.is_Q_used
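
# A standalone sketch of the decay used above for eps and eta: at each call the
# current value is divided by episode_idx**(1/4) and floored (0.01 for eps, 0.1 for
# eta), so the decay compounds across decisions. The starting values are made up.
import numpy as np

def decay_sketch(value, episode_idx, floor):
    return np.max([value / np.power(np.max([episode_idx, 1]), 1 / 4), floor])

eps, eta = 0.1, 0.5
for episode_idx in (1, 2, 3, 10, 100):
    eps = decay_sketch(eps, episode_idx, 0.01)
    eta = decay_sketch(eta, episode_idx, 0.1)
    print(episode_idx, round(eps, 4), round(eta, 4))
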
def strategy_RL_aux(player, board, pot, actions, b_round, opponent_stack,
                    opponent_side_pot, Q, greedy=True, blinds=BLINDS,
                    verbose=False, eps=0., cuda=False, for_play=False):
    """
    Take a decision using Q-values (greedily or by Q-softmax sampling, with
    epsilon-random exploration)
    :param player:
    :param board:
    :param pot:
    :param actions:
    :param b_round:
    :param opponent_stack:
    :param Q: the neural network that takes states as inputs and outputs Q-values
    :param greedy: True for greedy, False for Q-softmax sampling
    :param blinds:
    :param verbose:
    :return:
    """
    # you don't have the right to take certain actions, e.g. betting more
    # than you have, betting 0, or checking a raise
    possible_actions = authorized_actions_buckets(
        player, actions, b_round, opponent_side_pot)

    # @hack: add some heuristics
    # if check is possible, one should not fold
    if 0 in possible_actions and -1 in possible_actions:
        del possible_actions[possible_actions.index(-1)]
    # @hack: remove high roller actions
    # try:
    #     # anything betting above 20 should be discouraged
    #     # when the initial money is only 100
    #     high_bet_i = possible_actions.index(8)
    #     # but allow all-in
    #     possible_actions = possible_actions[:high_bet_i] + [possible_actions[-1]]
    # except ValueError:
    #     pass

    state = build_state(player, board, pot, actions, opponent_stack,
                        blinds[1], as_variable=False)
    state = [variable(s, cuda=cuda) for s in state]
    state.append(for_play)
    # the network has multiple outputs; the first one is the Q-values
    Q_values = Q.forward(*state)[0].squeeze()
    Q_values = Q_values.data.cpu().numpy()

    if greedy:
        # choose the action with the highest Q-value, breaking ties at random
        Q_values_for_possible_actions = np.array([
            Q_value for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ])
        ties = np.flatnonzero(Q_values_for_possible_actions ==
                              Q_values_for_possible_actions.max())
        best_possible_action_bucket = np.random.choice(ties)
        best_possible_action_bucket = [
            idx_to_bucket(k) for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ][best_possible_action_bucket]
        action = bucket_to_action(best_possible_action_bucket, actions,
                                  b_round, player, opponent_side_pot)
    else:
        # sample an action from a softmax over the Q-values of the legal actions
        idx = [
            idx_to_bucket(k) for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ]
        Q_values = [
            Q_value for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ]
        probabilities = softmax(Q_values)
        assert np.abs(np.sum(probabilities) - 1.) < 1e-6, probabilities
        action = bucket_to_action(sample_action(idx, probabilities), actions,
                                  b_round, player, opponent_side_pot)

    # with probability eps, override the chosen action with a random legal one
    is_epsilon = (random.random() <= eps)
    if is_epsilon:
        return get_random_action(possible_actions, actions, b_round, player,
                                 opponent_side_pot)
    else:
        return action
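
# A minimal sketch of the non-greedy branch above: restrict the Q-values to the
# legal action buckets, softmax them, and sample a bucket. The Q-values and the
# legal buckets below are invented for illustration.
import numpy as np

def softmax_sketch(x):
    z = np.asarray(x, dtype=np.float64)
    z = z - z.max()  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

q_by_bucket = {0: 0.2, 1: -0.5, 3: 1.1, 14: 0.4}  # bucket -> Q(s, bucket)
buckets = list(q_by_bucket.keys())
probs = softmax_sketch([q_by_bucket[b] for b in buckets])
assert np.abs(probs.sum() - 1.) < 1e-6
sampled_bucket = np.random.choice(buckets, p=probs)
print(sampled_bucket, probs.round(3))
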
def train_featurizer(self):
    '''
    TODO: train_featurizer11 and train_featurizer1 will be merged!
    '''
    train_dataset, val_dataset, test_dataset = \
        self._preprocess_data(includes_val=True, includes_test=True)
    optimizer = t.optim.Adam(self.f.parameters(), lr=self.lr,
                             weight_decay=self.weight_decay)
    for epoch_i in range(self.num_epochs):
        d = self.data['train']
        x_hand_train, x_board_train, y_hs_train, y_probas_combi_train = \
            shuffle(d['x_hand'], d['x_board'], d['y_hs'], d['y_probs_combi'])
        # check if weights are NaN
        start_t = time.time()
        for p in self.f.parameters():
            if np.isnan(p.data.cpu().numpy()).sum() > 0:
                raise ValueError('nan weights !')
        for i in range(0, len(train_dataset), self.batch_size):
            batch_i = i // self.batch_size
            optimizer.zero_grad()
            hand = variable(x_hand_train[i:i + self.batch_size],
                            cuda=self.cuda)
            board = variable(x_board_train[i:i + self.batch_size],
                             cuda=self.cuda)
            target = variable(y_hs_train[i:i + self.batch_size],
                              cuda=self.cuda).squeeze()
            if len(hand) != self.batch_size:
                break
            # pred
            HS_pred = self.f.forward(hand, board)[0].squeeze().float()
            HS = FeaturizerManager.clip(HS_pred)
            pred = t.log(HS / (1 - HS))
            loss = (target - pred).pow(2).mean()
            raw_loss = float(round(loss.data.cpu().numpy()[0], 2))
            if self.tensorboard is not None:
                self.tensorboard.add_scalar_value('train_loss', raw_loss)
            loss.backward()
            optimizer.step()
        print(time.time() - start_t, 'seconds per epoch')
        # epoch ended
        self.validate_featurizer(val_dataset, against_test_data=False)
        self.save_model(self.model_path, epoch_i, batch_i)
        self.tensorboard.to_zip('data/hand_eval/tensorboard/{}'.format(
            time.time()))
        print('saved model', 'epoch: ', epoch_i, 'batch: ', batch_i)
    # after training is over, we save the model
    self.validate_featurizer(test_dataset, against_test_data=True)
    self.save_model(self.model_path, epoch_i, batch_i, is_best=True)
    self.tensorboard.to_zip('data/hand_eval/tensorboard/{}'.format(
        time.time()))
    print('saved final model', 'epoch: ', epoch_i, 'batch: ', batch_i)
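
# A compact, self-contained sketch of the training objective used by the featurizer:
# regress the log-odds of a clipped hand-strength prediction onto a log-odds target
# with Adam. The tiny model, the random data and the clamp range (assumed to be what
# FeaturizerManager.clip does) are stand-ins; the real hand/board networks and
# dataset are not shown in this listing.
import torch

torch.manual_seed(0)
x = torch.rand(256, 10)                        # stand-in features
hs_true = torch.sigmoid(x.sum(dim=1) - 5)      # synthetic "hand strength" in (0, 1)
target = torch.log(hs_true / (1 - hs_true))    # log-odds target, as in the code above

model = torch.nn.Sequential(torch.nn.Linear(10, 32), torch.nn.ReLU(),
                            torch.nn.Linear(32, 1), torch.nn.Sigmoid())
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

for epoch in range(50):
    optimizer.zero_grad()
    hs_pred = model(x).squeeze().clamp(1e-6, 1 - 1e-6)  # clip away exact 0/1
    pred = torch.log(hs_pred / (1 - hs_pred))
    loss = (target - pred).pow(2).mean()
    loss.backward()
    optimizer.step()
print(float(loss))
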