Example #1
    def validate_featurizer(self, dataset, against_test_data=False):
        if against_test_data:
            d = self.data['test']
        else:
            d = self.data['val']

        # shuffle the held-out split before batching
        x_hand, x_board, y_hs, y_probas_combi = shuffle(
            d['x_hand'], d['x_board'], d['y_hs'], d['y_probs_combi'])

        for i in range(0, len(dataset), self.batch_size):
            batch_i = i // self.batch_size
            hand = variable(x_hand[i:i + self.batch_size], cuda=self.cuda)
            board = variable(x_board[i:i + self.batch_size], cuda=self.cuda)
            target = variable(y_hs[i:i + self.batch_size],
                              cuda=self.cuda).squeeze()

            self.f.eval()
            if len(hand) != self.batch_size:
                break

            HS_pred = self.f.forward(hand, board)[0].squeeze().float()
            HS = FeaturizerManager.clip(HS_pred)

            self.f.train()
            # for numerical stability
            m = variable(np.array([1e-6]), cuda=self.cuda)
            pred = t.log(t.max(HS / (1 - HS), m))
            loss = (target - pred).pow(2).mean()
            loss = float(round(loss.data.cpu().numpy()[0], 3))

            if self.tensorboard is not None:
                if against_test_data:
                    self.tensorboard.add_scalar_value('test_loss', loss)
                else:
                    self.tensorboard.add_scalar_value('validation_loss', loss)
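
The validation loss above is a mean squared error computed in logit space: the predicted hand strength HS is first clipped away from 0 and 1 by FeaturizerManager.clip, then mapped through log(HS / (1 - HS)) before being compared to the target. Below is a minimal standalone sketch of that transform; the clipping bounds and example values are assumptions for illustration, not the ones used by FeaturizerManager.clip.

import torch as t

def logit_with_clip(hs, low=1e-2, high=1 - 1e-2, eps=1e-6):
    # clip the predicted hand strength away from 0 and 1 so the log-odds stay finite
    hs = t.clamp(hs, low, high)
    # numerically stable log-odds, mirroring t.log(t.max(HS / (1 - HS), m)) above
    return t.log(t.clamp(hs / (1 - hs), min=eps))

hs_pred = t.tensor([0.001, 0.25, 0.5, 0.999])  # raw network outputs (assumed values)
target = t.tensor([-2.0, -1.1, 0.0, 2.0])      # targets stored in logit space (assumed values)
loss = (target - logit_with_clip(hs_pred)).pow(2).mean()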
Example #2
    def compute_loss(self, pred, target, imp_weights):
        '''
        Compute a weighted MSE loss.
        The loss of each sample is scaled by its importance weight (imp_weights),
        which corrects for the bias introduced by prioritized replay sampling.

        An entropy term is optionally added to increase diversity and exploration;
        beta is a hyperparameter controlling its weight.
        '''
        td_deltas = pred - target
        loss = t.mean(imp_weights * td_deltas.pow(2))

        if self.use_entropy_loss:
            # @experimental: entropy term
            episode_id = self.game_info['#episodes']
            self.beta = np.max(
                [self.beta / np.power(np.max([1, episode_id]), 1 / 4), 0.01])
            beta_var = variable(self.beta, cuda=self.is_cuda)
            sm = Softmax(dim=0)
            probs = sm(pred.unsqueeze(dim=1))
            m = variable(np.array([1e-6]), cuda=self.is_cuda)
            entropy = -t.sum(probs * t.log(t.max(probs, m)))
            loss = loss - beta_var * entropy
        return loss, td_deltas
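
For reference, the same computation can be written as a standalone function over plain tensors: an importance-weighted MSE on the TD errors, minus a beta-weighted entropy bonus over the softmax of the predictions. The shapes, beta value, and weights below are illustrative assumptions.

import torch as t
from torch.nn import Softmax

def weighted_mse_with_entropy(pred, target, imp_weights, beta=0.01, eps=1e-6):
    # importance-weighted MSE corrects for the bias of prioritized replay sampling
    td_deltas = pred - target
    loss = t.mean(imp_weights * td_deltas.pow(2))
    # entropy of the softmax over predictions, subtracted to encourage exploration
    probs = Softmax(dim=0)(pred)
    entropy = -t.sum(probs * t.log(t.clamp(probs, min=eps)))
    return loss - beta * entropy, td_deltas

pred = t.tensor([1.0, 0.5, -0.2])
target = t.tensor([0.8, 0.7, 0.0])
imp_weights = t.tensor([1.2, 0.9, 1.0])  # e.g. weights coming from a prioritized replay buffer
loss, td_deltas = weighted_mse_with_entropy(pred, target, imp_weights)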
Example #3
def bucket_encode_actions(actions, cuda=False):
    """
    NOTE THAT THIS IS NOT ONE HOT ENCODING
    THIS IS RATHER BUCKET ENCODING (PUT AN ACTION REPRESENTED AS A 6x1 ARRAY TO AN INTEGER BUCKET)
    IT CAN BE USED IN THE LOSS

    YOU SHOULD ADD +1 (see the keys of `Action.BET_BUCKETS` to understand why)

    :param actions: a VARIABLE of size batch_size x 5 (il y a 5 types d'actions: check, bet, call, raise, all-in)
    :return: a VARIABLE of size batch_size x 14 (il y a 14 buckets)
    """
    values, indices = t.max(actions, -1)
    actions_buckets = variable(np.zeros(values.data.cpu().numpy().shape),
                               cuda=cuda)
    actions_buckets[indices == 0] = 0  # check
    actions_buckets[indices == 4] = 14  # all in
    actions_buckets[indices == 5] = -1  # fold
    for bucket_idx in range(1, 14):
        indicator = lambda x: bucket_idx * (x >= Action.BET_BUCKETS[
            bucket_idx][0]).float() * (
                (x <= Action.BET_BUCKETS[bucket_idx][1]).float())
        mask = (indices != 0) * (indices != 5) * (indices != 4)
        actions_buckets[mask] += indicator(values[mask])
    return actions_buckets
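
In short, each action ends up as one integer: 0 for check, 14 for all-in, -1 for fold, and 1 to 13 for bet sizes falling inside the (lower, upper) ranges of Action.BET_BUCKETS. The tiny sketch below illustrates the same range-to-bucket trick with made-up boundaries; the real Action.BET_BUCKETS table is not reproduced here.

# hypothetical (lower, upper) bet ranges for buckets 1..4; the real table has 13 entries
FAKE_BET_BUCKETS = {1: (1, 2), 2: (3, 5), 3: (6, 10), 4: (11, 20)}

def bet_to_bucket(bet_size):
    # sum of indicator * bucket_idx, mirroring the loop in bucket_encode_actions:
    # exactly one range contains the bet, so the sum equals that bucket's index
    return sum(idx * (lo <= bet_size <= hi)
               for idx, (lo, hi) in FAKE_BET_BUCKETS.items())

assert [bet_to_bucket(b) for b in (1, 4, 8, 15)] == [1, 2, 3, 4]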
Example #4
def create_state_variable(state, cuda=False):
    # TODO: check whether dtype should be handled individually
    # do not use this; use variable() from utils instead
    f = lambda x: variable(x, cuda=cuda)
    return [f(e) for e in state]
Example #5
    def choose_action(self,
                      player,
                      board,
                      pot,
                      actions,
                      b_round,
                      opponent_stack,
                      opponent_side_pot,
                      blinds,
                      episode_idx,
                      for_play=False):
        # decay epsilon in the same way as in the NFSP paper (2016),
        # using the number of episodes as n
        # the exact decay schedule was not specified, only hinted to be slower than sqrt,
        # so we use n^(1/4)
        self.eps = np.max(
            [self.eps / np.power(np.max([episode_idx, 1]), 1 / 4), 0.01])
        self.eta = np.max(
            [self.eta / np.power(np.max([episode_idx, 1]), 1 / 4), 0.1])
        if player.is_all_in:
            assert player.stack == 0
            return Action('null'), False

        if self.eta >= np.random.rand():
            # use epsilon-greedy policy
            if self.verbose:
                start = timer()

            action = strategy_RL_aux(player,
                                     board,
                                     pot,
                                     actions,
                                     b_round,
                                     opponent_stack,
                                     opponent_side_pot,
                                     self._Q,
                                     greedy=self.is_greedy,
                                     blinds=blinds,
                                     verbose=self.verbose,
                                     eps=self.eps,
                                     cuda=self.cuda,
                                     for_play=for_play)

            if self.verbose:
                print('forward pass of Q took', timer() - start)

            self.is_Q_used = True
        else:
            # use average policy
            state = build_state(player,
                                board,
                                pot,
                                actions,
                                opponent_stack,
                                blinds[1],
                                as_variable=False)
            state = [variable(s, cuda=self.cuda) for s in state]
            state.append(for_play)
            if self.verbose:
                start = timer()

            action_probs = self._pi.forward(*state).squeeze()

            if self.verbose:
                print('forward pass of pi took', timer() - start)
            possible_actions = authorized_actions_buckets(
                player, actions, b_round, opponent_side_pot)
            # @hack: small heuristic
            # if checking is possible, there is no reason to fold
            if 0 in possible_actions and -1 in possible_actions:
                del possible_actions[possible_actions.index(-1)]
            # @hack: remove high roller actions
            #try:
            #    # anything betting above 20 should be discouraged
            #    # when initial money is only 100
            #    high_bet_i = possible_actions.index(8)
            #    # but allow all-in
            #    possible_actions = possible_actions[:high_bet_i] + [possible_actions[-1]]
            #except ValueError:
            #    pass

            idx = [
                idx_to_bucket(k) for k, _ in enumerate(action_probs)
                if idx_to_bucket(k) in possible_actions
            ]
            valid_action_probs = t.stack([
                p for k, p in enumerate(action_probs)
                if idx_to_bucket(k) in possible_actions
            ])

            valid_action_probs /= t.sum(valid_action_probs)

            action = bucket_to_action(
                sample_action(idx,
                              valid_action_probs.data.cpu().numpy()), actions,
                b_round, player, opponent_side_pot)
            self.is_Q_used = False
        return action, self.is_Q_used
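
The schedules above divide the current eps / eta by n^(1/4), where n is the episode index, and floor them at 0.01 and 0.1 respectively. Because the decayed value is written back into self.eps and self.eta, the decay compounds across calls rather than being a pure function of n. A small standalone sketch of that behaviour (the starting values are assumptions):

import numpy as np

def decayed(value, episode_idx, floor):
    # one decay step, as in choose_action: divide by n^(1/4), never going below the floor
    return np.max([value / np.power(np.max([episode_idx, 1]), 1 / 4), floor])

eps, eta = 0.1, 0.2  # assumed starting values
for episode_idx in range(1, 6):
    eps = decayed(eps, episode_idx, floor=0.01)
    eta = decayed(eta, episode_idx, floor=0.1)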
Example #6
def strategy_RL_aux(player,
                    board,
                    pot,
                    actions,
                    b_round,
                    opponent_stack,
                    opponent_side_pot,
                    Q,
                    greedy=True,
                    blinds=BLINDS,
                    verbose=False,
                    eps=0.,
                    cuda=False,
                    for_play=False):
    """
    Take a decision using the Q-values (in a greedy or random way)
    :param player:
    :param board:
    :param pot:
    :param actions:
    :param b_round:
    :param opponent_stack:
    :param Q: the neural network (a PyTorch module) that takes states as inputs
    :param greedy: True for greedy, False for Q-softmax sampling
    :param blinds:
    :param verbose:
    :return:
    """
    possible_actions = authorized_actions_buckets(
        player, actions, b_round, opponent_side_pot
    )  # some actions are not allowed, e.g. betting more than you have, betting 0, or checking a raise

    # @hack: small heuristic
    # if checking is possible, there is no reason to fold
    if 0 in possible_actions and -1 in possible_actions:
        del possible_actions[possible_actions.index(-1)]

    # @hack: remove high roller actions
    #try:
    #    # anything betting above 20 should be discouraged
    #    # when initial money is only 100
    #    high_bet_i = possible_actions.index(8)
    #    # but allow all-in
    #    possible_actions = possible_actions[:high_bet_i] + [possible_actions[-1]]
    #except ValueError:
    #    pass

    state = build_state(player,
                        board,
                        pot,
                        actions,
                        opponent_stack,
                        blinds[1],
                        as_variable=False)
    state = [variable(s, cuda=cuda) for s in state]
    state.append(for_play)
    # the network has multiple outputs; the first one is the Q-values
    Q_values = Q.forward(*state)[0].squeeze()
    Q_values = Q_values.data.cpu().numpy()

    # choose an action either greedily or by sampling from a softmax over the Q-values
    if greedy:
        Q_values_for_possible_actions = np.array([
            Q_value for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ])
        ties = np.flatnonzero(Q_values_for_possible_actions ==
                              Q_values_for_possible_actions.max())
        best_possible_action_bucket = np.random.choice(ties)
        best_possible_action_bucket = [
            idx_to_bucket(k) for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ][best_possible_action_bucket]
        action = bucket_to_action(best_possible_action_bucket, actions,
                                  b_round, player, opponent_side_pot)
    else:
        idx = [
            idx_to_bucket(k) for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ]
        Q_values = [
            Q_value for k, Q_value in enumerate(Q_values)
            if idx_to_bucket(k) in possible_actions
        ]
        probabilities = softmax(Q_values)
        assert np.abs(np.sum(probabilities) - 1.) < 1e-6, probabilities
        action = bucket_to_action(sample_action(idx, probabilities), actions,
                                  b_round, player, opponent_side_pot)

    is_epsilon = (random.random() <= eps)
    if is_epsilon:
        return get_random_action(possible_actions, actions, b_round, player,
                                 opponent_side_pot)
    else:
        return action
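
The selection logic boils down to: keep only the Q-values of authorized buckets, then either take the arg-max (breaking ties at random) or sample from a softmax over those Q-values, and with probability eps override the result with a random legal action. A minimal sketch of the greedy-vs-softmax part, with made-up bucket indices and Q-values:

import numpy as np

def select_bucket(q_values, legal_buckets, greedy=True, temperature=1.0):
    # q_values: dict bucket -> Q estimate; keep only the legal buckets
    buckets = [b for b in q_values if b in legal_buckets]
    q = np.array([q_values[b] for b in buckets])
    if greedy:
        ties = np.flatnonzero(q == q.max())  # break ties at random
        return buckets[np.random.choice(ties)]
    probs = np.exp(q / temperature)
    probs /= probs.sum()                     # softmax over the legal Q-values
    return int(np.random.choice(buckets, p=probs))

q_values = {0: 0.1, 1: 0.4, 3: 0.4, 14: -0.2}  # hypothetical bucket -> Q-value map
bucket = select_bucket(q_values, legal_buckets={0, 1, 3}, greedy=True)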
Example #7
    def train_featurizer(self):
        '''
        TODO: train_featurizer11 and train_featurizer1 will be merged!
        '''
        train_dataset, val_dataset, test_dataset = \
            self._preprocess_data(includes_val=True, includes_test=True)
        optimizer = t.optim.Adam(self.f.parameters(),
                                 lr=self.lr,
                                 weight_decay=self.weight_decay)

        for epoch_i in range(self.num_epochs):
            d = self.data['train']
            x_hand_train, x_board_train, y_hs_train, y_probas_combi_train = \
                shuffle(d['x_hand'], d['x_board'], d['y_hs'], d['y_probs_combi'])

            start_t = time.time()
            # sanity check: abort if any weight has become NaN
            for p in self.f.parameters():
                if np.isnan(p.data.cpu().numpy()).sum() > 0:
                    raise ValueError('NaN weights!')

            for i in range(0, len(train_dataset), self.batch_size):
                batch_i = i // self.batch_size

                optimizer.zero_grad()

                hand = variable(x_hand_train[i:i + self.batch_size],
                                cuda=self.cuda)
                board = variable(x_board_train[i:i + self.batch_size],
                                 cuda=self.cuda)
                target = variable(y_hs_train[i:i + self.batch_size],
                                  cuda=self.cuda).squeeze()
                if len(hand) != self.batch_size:
                    break
                # forward pass: predicted hand strength
                HS_pred = self.f.forward(hand, board)[0].squeeze().float()

                HS = FeaturizerManager.clip(HS_pred)
                pred = t.log(HS / (1 - HS))
                loss = (target - pred).pow(2).mean()
                raw_loss = float(round(loss.data.cpu().numpy()[0], 2))

                if self.tensorboard is not None:
                    self.tensorboard.add_scalar_value('train_loss', raw_loss)

                loss.backward()
                optimizer.step()
            print(time.time() - start_t, 'seconds per epoch')
            # epoch ended
            self.validate_featurizer(val_dataset, against_test_data=False)
            self.save_model(self.model_path, epoch_i, batch_i)
            self.tensorboard.to_zip('data/hand_eval/tensorboard/{}'.format(
                time.time()))
            print('saved model', 'epoch: ', epoch_i, 'batch: ', batch_i)

        # after training, evaluate on the test data and save the final model
        self.validate_featurizer(test_dataset, against_test_data=True)
        self.save_model(self.model_path, epoch_i, batch_i, is_best=True)
        self.tensorboard.to_zip('data/hand_eval/tensorboard/{}'.format(
            time.time()))
        print('saved final model', 'epoch: ', epoch_i, 'batch: ', batch_i)