class Qlearn1AgentNP(BaseAgentNP):
  MODEL_FILES = ['q.modelb']
  logger = EpochLogger(output_dir='qlearn1/logs', output_fname='progress.csv')

  def initialize(self, batch_size, initial_capital, n_players):
    self.BATCH_SIZE = batch_size
    self.INITAL_CAPITAL = initial_capital
    self.N_PLAYERS = n_players

    # 5 community cards x 53 (52 cards + "unknown") + 2 holecards x 52,
    # (1 position in this round + 1 folded + 1 pot investment total
    #  + 1 pot investment this round + 1 which player last raised) x 6 players
    # + round x 5 (1h)
    self.obs_dim = (5) * 53 + (2) * 52 + (1 + 1 + 1 + 1 + 1) * 6 + (1) * 5
    self.act_dim = 6

    self.q = models.MLPQFunction(self.obs_dim, self.act_dim,
                                 trainable=self.TRAINABLE, device=DEVICE)

    self.possible_actions = torch.zeros((self.BATCH_SIZE, self.act_dim, 3), device=DEVICE)
    self.possible_actions[:, 0, constants.FOLD] = 1
    self.possible_actions[:, 1, constants.CALL] = 1
    self.possible_actions[:, 2:, constants.RAISE] = 1
    self.possible_raises = np.array([0, 0, 4, 20, 100, 200])

    if self.TRAINABLE:
      self.reward = torch.zeros(self.BATCH_SIZE)
      self.replaybuffer = replaybuffer.ReplayBuffer(obs_dim=self.obs_dim,
                                                    act_dim=self.act_dim,
                                                    batch_size=self.BATCH_SIZE,
                                                    size=REPLAYBUFFER_SIZE,
                                                    device=DEVICE)
      self.q_optimizer = torch.optim.Adam(self.q.parameters(), lr=Q_LEARNING_RATE)

    self.first_round = True
    self.prev_state = None
    self.prev_action = None

    self.load_model()

  def act(self, player_idx, round, active_games, current_bets, min_raise,
          prev_round_investment, folded, last_raiser, hole_cards, community_cards):
    hole_cards.sort(axis=1)
    community_cards[:, 0:3].sort(axis=1)

    state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                     prev_round_investment, folded, last_raiser,
                                     hole_cards, community_cards)

    actions, amounts, actions_serialized = self.choose_action(
        torch.as_tensor(state, dtype=torch.float32, device=DEVICE), current_bets)

    if self.TRAINABLE:
      if not self.first_round:
        self.replaybuffer.store(obs=self.prev_state, act=self.prev_action,
                                next_obs=state, active_games=active_games)
      self.first_round = False
      self.prev_state = state
      self.prev_action = actions_serialized

    return actions, amounts

  def end_trajectory(self, player_idx, round, current_bets, min_raise,
                     prev_round_investment, folded, last_raiser, hole_cards,
                     community_cards, gains):
    # TODO: bugfix to prevent a crash in case the agent never acted before the game finished
    if self.TRAINABLE and self.prev_state is not None:
      state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                       prev_round_investment, folded, last_raiser,
                                       hole_cards, community_cards)

      scaled_gains = (gains / self.INITAL_CAPITAL - (self.N_PLAYERS / 2 - 1)) * 2 / self.N_PLAYERS

      # DEBUGTOOL
      lost_money = (gains / self.INITAL_CAPITAL)
      lost_money[folded[:, player_idx] == 0] = 0

      self.reward = torch.Tensor(scaled_gains).to(DEVICE)
      self.replaybuffer.store(obs=self.prev_state, act=self.prev_action,
                              next_obs=state, active_games=np.ones(self.BATCH_SIZE))

      self.logger.store(Reward=scaled_gains, LostInFolding=lost_money,
                        LostGeneral=(gains / self.INITAL_CAPITAL))

      self.train()
      self.save_model()
      # FIXME: Remember that replaybuffer is *not* emptied here

  def train(self):
    state = self.replaybuffer.sample_state()
    while state:
      self.update_parameters(state)
      state = self.replaybuffer.sample_state()
    self.log_everything()

  def log_everything(self):
    self.logger.log_tabular('Folds', average_only=True)
    self.logger.log_tabular('Calls', average_only=True)
    for i in range(2, self.act_dim):
      self.logger.log_tabular('Raises ' + str(self.possible_raises[i]), average_only=True)
    self.logger.log_tabular('LostInFolding', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('LostGeneral', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('QVals', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('Reward', average_only=True)
    self.logger.log_tabular('LossQ', average_only=True)
    self.logger.dump_tabular()

  def build_network_input(self, player_idx, round, current_bets, min_raise,
                          prev_round_investment, folded, last_raiser, hole_cards,
                          community_cards):
    # First convert the treys card IDs into indices
    hole_cards_converted = (13 * np.log2(np.right_shift(hole_cards, 12) & 0xF)
                            + (np.right_shift(hole_cards, 8) & 0xF))
    community_cards_converted = (13 * np.log2(np.right_shift(community_cards, 12) & 0xF)
                                 + (np.right_shift(community_cards, 8) & 0xF))
    # Then convert those indices into 1h
    hole_cards_1h = (np.arange(52) == hole_cards_converted[..., None] - 1).astype(int)
    known_community_cards_1h = (np.arange(53) == community_cards_converted[..., None] - 1).astype(int)
    # Fill missing community cards with zero
    missing_community_cards = np.zeros((self.BATCH_SIZE, 5 - community_cards.shape[1], 53))
    # Have a 53rd column in the 1h to indicate missing cards, and fill that with ones where relevant
    missing_community_cards[:, :, -1] = 1
    community_cards_1h = np.concatenate((known_community_cards_1h, missing_community_cards), axis=1)

    player_data = np.zeros((self.BATCH_SIZE, 5, self.N_PLAYERS))
    # Who folded already
    player_data[:, 0, :] = folded
    # Who put how much total into the pot
    player_data[:, 1, :] = (prev_round_investment + current_bets) / self.INITAL_CAPITAL
    # Who put how much this round
    player_data[:, 2, :] = (current_bets) / self.INITAL_CAPITAL
    # Who was the last to raise
    player_data[:, 3, :] = np.eye(self.N_PLAYERS)[last_raiser]
    # Reorder the first four to correspond to player_idx
    player_data = np.concatenate((player_data[:, :, player_idx:], player_data[:, :, :player_idx]), axis=2)
    # Which player are we
    player_data[:, 4, player_idx] = 1

    tail_data = np.zeros((self.BATCH_SIZE, 5))
    tail_data[:, round] = 1

    network_input = np.concatenate((hole_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    community_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    player_data.reshape(self.BATCH_SIZE, -1),
                                    tail_data.reshape(self.BATCH_SIZE, -1)), axis=1)

    assert network_input.shape[1] == self.obs_dim

    return network_input

  def choose_action(self, network_input, current_bets):
    # Score every discrete action with the Q network, then pick the argmax per game
    scores = np.empty((self.BATCH_SIZE, self.act_dim))
    with torch.no_grad():
      for idx in range(self.possible_actions.shape[1]):
        onehot_actions = torch.eye(self.act_dim, device=DEVICE)[
            torch.full((self.BATCH_SIZE,), idx, dtype=torch.long, device=DEVICE)]
        scores[:, idx] = self.q(network_input, onehot_actions).cpu().numpy()

    actions = np.argmax(scores, axis=1)

    if self.TRAINABLE:
      # Epsilon-greedy exploration
      dice = np.random.random(self.BATCH_SIZE)
      rand_actions = np.random.randint(0, self.act_dim, self.BATCH_SIZE)
      actions[dice <= NOISE_LEVEL] = rand_actions[dice <= NOISE_LEVEL]

    actions_array = np.eye(self.act_dim)[actions]
    amounts = self.possible_raises[actions]
    actions[actions > constants.RAISE] = constants.RAISE
    actions[current_bets.sum(axis=1) == 0] = constants.CALL

    self.logger.store(Calls=100 * np.mean(actions == constants.CALL),
                      Folds=100 * np.mean(actions == constants.FOLD))
    for i in range(2, self.act_dim):
      self.logger.store(**{"Raises " + str(self.possible_raises[i]): 100 * np.mean(actions_array[:, i])})

    return actions, amounts, actions_array

  # Set up function for computing Q-losses
  def compute_loss_q(self, data):
    o, a, o2, active = data['obs'], data['act'], data['obs2'], data['active']

    q = self.q(o, a)

    loss_q = (active * (q - self.reward)**2).sum()

    # Useful info for logging
    q_info = dict(QVals=q.cpu().detach().numpy())
    # q_info = {}

    return loss_q, q_info

  def update_parameters(self, data):
    # Run one gradient descent step for the Q function
    self.q_optimizer.zero_grad()
    loss_q, q_info = self.compute_loss_q(data)
    loss_q.backward()
    self.q_optimizer.step()

    # Record things
    self.logger.store(LossQ=loss_q.item(), **q_info)

  def load_model(self):
    if os.path.exists(self.MODEL_PATH):
      self.q.load(self.MODEL_PATH)
      if self.TRAINABLE:
        self.q_optimizer.load_state_dict(torch.load(os.path.join(self.MODEL_PATH, 'q_opt.optb')))

  def save_model(self):
    print('saved', self.MODEL_PATH)
    self.q.save(self.MODEL_PATH)
    if self.TRAINABLE:
      torch.save(self.q_optimizer.state_dict(), os.path.join(self.MODEL_PATH, 'q_opt.optb'))
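
# Illustrative sketch, not used by either agent class: a worked example of the reward
# scaling applied in end_trajectory(). This assumes `gains` is the net chip result of a
# hand, i.e. between -initial_capital (losing the whole stack) and
# (n_players - 1) * initial_capital (winning everyone else's stack); under that
# assumption the formula maps the two extremes to -1 and +1. The helper name is
# hypothetical and exists only for this demonstration.
def _reward_scaling_demo(initial_capital=100, n_players=6):
  gains = np.array([-initial_capital, 0.0, (n_players - 1) * initial_capital])
  scaled = (gains / initial_capital - (n_players / 2 - 1)) * 2 / n_players
  # With 6 players and a starting stack of 100 this yields roughly [-1.0, -0.667, 1.0]
  return scaled
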
class Sac1AgentNP(BaseAgentNP):
  MODEL_FILES = ['policy.modelb', 'q1.modelb', 'q2.modelb']
  logger = EpochLogger(output_dir='sac1/logs', output_fname='progress.csv')

  def initialize(self, batch_size, initial_capital, n_players):
    self.BATCH_SIZE = batch_size
    self.REPLAY_BATCH_SIZE = 1000
    self.INITAL_CAPITAL = initial_capital
    self.N_PLAYERS = n_players

    # 5 community cards x 53 (52 cards + "unknown") + 2 holecards x 52,
    # (1 position in this round + 1 folded + 1 pot investment total
    #  + 1 pot investment this round + 1 which player last raised) x 6 players
    # + round x 5 (1h)
    self.obs_dim = (5) * 53 + (2) * 52 + (1 + 1 + 1 + 1 + 1) * 6 + (1) * 5

    # Action dimensions
    self.act_dim = 4

    self.ac = models.MLPActorCritic(self.obs_dim, self.act_dim, 1,
                                    trainable=self.TRAINABLE, device=DEVICE)

    if self.TRAINABLE:
      self.target_ac = deepcopy(self.ac)
      for parameter in self.target_ac.parameters():
        parameter.requires_grad = False

      self.replaybuffer = replaybuffer.ReplayBuffer(obs_dim=self.obs_dim,
                                                    act_dim=self.act_dim,
                                                    size=REPLAYBUFFER_SIZE * self.BATCH_SIZE,
                                                    device=DEVICE)

      self.pi_optimizer = torch.optim.Adam(self.ac.parameters(), lr=PI_LEARNING_RATE)
      self.q_optimizer = torch.optim.Adam(itertools.chain(self.ac.q1.parameters(),
                                                          self.ac.q2.parameters()),
                                          lr=Q_LEARNING_RATE)

    self.first_round = True
    self.prev_state = None
    self.prev_action = None

    self.load_model()

  def act(self, player_idx, round, active_games, current_bets, min_raise,
          prev_round_investment, folded, last_raiser, hole_cards, community_cards):
    state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                     prev_round_investment, folded, last_raiser,
                                     hole_cards, community_cards)

    hole_cards.sort(axis=1)
    community_cards[:, 0:3].sort(axis=1)

    network_output = self.ac.act(torch.as_tensor(state, dtype=torch.float32),
                                 deterministic=not self.TRAINABLE)

    if self.TRAINABLE:
      if not self.first_round:
        n_rounds = active_games.sum()
        self.replaybuffer.store(obs=self.prev_state[active_games],
                                act=self.prev_action[active_games],
                                rew=np.zeros(n_rounds),
                                next_obs=state[active_games],
                                done=np.zeros(n_rounds),
                                batch_size=n_rounds)
      self.first_round = False
      self.prev_state = state
      self.prev_action = network_output

    actions, amounts = self.interpret_network_output(network_output, current_bets,
                                                     prev_round_investment, player_idx,
                                                     min_raise)

    return actions, amounts

  def end_trajectory(self, player_idx, round, current_bets, min_raise,
                     prev_round_investment, folded, last_raiser, hole_cards,
                     community_cards, gains):
    # TODO: bugfix to prevent a crash in case the agent never acted before the game finished
    if self.TRAINABLE and self.prev_state is not None:
      state = self.build_network_input(player_idx, round, current_bets, min_raise,
                                       prev_round_investment, folded, last_raiser,
                                       hole_cards, community_cards)

      scaled_gains = (gains / self.INITAL_CAPITAL - (self.N_PLAYERS / 2 - 1)) * 2 / self.N_PLAYERS

      # DEBUGTOOL
      lost_money = (gains / self.INITAL_CAPITAL)
      lost_money[folded[:, player_idx] == 0] = 0

      self.replaybuffer.store(obs=self.prev_state,
                              act=self.prev_action,
                              rew=scaled_gains,
                              next_obs=state,
                              done=np.ones(self.BATCH_SIZE),
                              batch_size=self.BATCH_SIZE)

      self.logger.store(Reward=scaled_gains, LostInFolding=lost_money,
                        LostGeneral=(gains / self.INITAL_CAPITAL))

      self.train()
      self.save_model()
      # FIXME: Remember that replaybuffer is *not* emptied here

  def train(self):
    self.replaybuffer.shuffle()
    batch = self.replaybuffer.sample_batch(batch_size=min(self.REPLAY_BATCH_SIZE, self.BATCH_SIZE))
    while batch:
      self.update_parameters(batch)
      batch = self.replaybuffer.sample_batch(batch_size=min(self.REPLAY_BATCH_SIZE, self.BATCH_SIZE))
    self.log_everything()

  def log_everything(self):
    self.logger.log_tabular('QContribPiLoss', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('LossPi', average_only=True)
    self.logger.log_tabular('EntropyBonus', average_only=True)
    self.logger.log_tabular('Raises', average_only=True)
    self.logger.log_tabular('Calls', average_only=True)
    self.logger.log_tabular('Folds', average_only=True)
    self.logger.log_tabular('LostInFolding', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('LostGeneral', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('QVals', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('TargQVals', with_min_and_max=True, average_only=True)
    self.logger.log_tabular('Reward', average_only=True)
    self.logger.log_tabular('LossQ', average_only=True)
    self.logger.dump_tabular()

  def build_network_input(self, player_idx, round, current_bets, min_raise,
                          prev_round_investment, folded, last_raiser, hole_cards,
                          community_cards):
    # First convert the treys card IDs into indices
    hole_cards_converted = (13 * np.log2(np.right_shift(hole_cards, 12) & 0xF)
                            + (np.right_shift(hole_cards, 8) & 0xF))
    community_cards_converted = (13 * np.log2(np.right_shift(community_cards, 12) & 0xF)
                                 + (np.right_shift(community_cards, 8) & 0xF))
    # Then convert those indices into 1h
    hole_cards_1h = (np.arange(52) == hole_cards_converted[..., None] - 1).astype(int)
    known_community_cards_1h = (np.arange(53) == community_cards_converted[..., None] - 1).astype(int)
    # Fill missing community cards with zero
    missing_community_cards = np.zeros((self.BATCH_SIZE, 5 - community_cards.shape[1], 53))
    # Have a 53rd column in the 1h to indicate missing cards, and fill that with ones where relevant
    missing_community_cards[:, :, -1] = 1
    community_cards_1h = np.concatenate((known_community_cards_1h, missing_community_cards), axis=1)

    player_data = np.zeros((self.BATCH_SIZE, 5, self.N_PLAYERS))
    # Who folded already
    player_data[:, 0, :] = folded
    # Who put how much total into the pot
    player_data[:, 1, :] = (prev_round_investment + current_bets) / self.INITAL_CAPITAL
    # Who put how much this round
    player_data[:, 2, :] = (current_bets) / self.INITAL_CAPITAL
    # Who was the last to raise
    player_data[:, 3, :] = np.eye(self.N_PLAYERS)[last_raiser]
    # Reorder the first four to correspond to player_idx
    player_data = np.concatenate((player_data[:, :, player_idx:], player_data[:, :, :player_idx]), axis=2)
    # Which player are we
    player_data[:, 4, player_idx] = 1

    tail_data = np.zeros((self.BATCH_SIZE, 5))
    tail_data[:, round] = 1

    network_input = np.concatenate((hole_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    community_cards_1h.reshape(self.BATCH_SIZE, -1),
                                    player_data.reshape(self.BATCH_SIZE, -1),
                                    tail_data.reshape(self.BATCH_SIZE, -1)), axis=1)

    assert network_input.shape[1] == self.obs_dim

    return network_input

  def interpret_network_output(self, network_output, current_bets, prev_round_investment,
                               player_idx, min_raise):
    chosen_action = np.argmax(network_output[:, :3], axis=1)
    actions = np.array([constants.FOLD, constants.CALL, constants.RAISE])[chosen_action]
    actions[current_bets.sum(axis=1) == 0] = constants.CALL
    current_stake = current_bets[:, player_idx] + prev_round_investment[:, player_idx]
    amounts = np.clip((network_output[:, 1] + 1) * self.INITAL_CAPITAL / 2,
                      min_raise, self.INITAL_CAPITAL - current_stake)

    self.logger.store(
        Raises=100 * np.mean(actions == constants.RAISE),
        Calls=100 * np.mean(actions == constants.CALL),
        Folds=100 * np.mean(actions == constants.FOLD),
    )

    return actions, amounts

  # Set up function for computing SAC Q-losses
  def compute_loss_q(self, data):
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

    q1 = self.ac.q1(o, a)
    q2 = self.ac.q2(o, a)

    # Bellman backup for Q functions
    with torch.no_grad():
      # Target actions come from *current* policy
      a2, logp_a2 = self.ac.pi(o2)

      # Target Q-values
      q1_pi_targ = self.target_ac.q1(o2, a2)
      q2_pi_targ = self.target_ac.q2(o2, a2)
      q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
      self.logger.store(EntropyBonus=(-ALPHA * logp_a2).cpu().detach().numpy())
      backup = r + GAMMA * (1 - d) * (q_pi_targ - ALPHA * logp_a2)

    # MSE loss against Bellman backup
    loss_q1 = ((q1 - backup)**2).mean()
    loss_q2 = ((q2 - backup)**2).mean()
    loss_q = loss_q1 + loss_q2

    # Useful info for logging
    q_info = dict(QVals=((q1 + q2) / 2).cpu().detach().numpy(),
                  TargQVals=((q1_pi_targ + q2_pi_targ) / 2).cpu().detach().numpy())
    # q_info = {}

    return loss_q, q_info

  # Set up function for computing SAC pi loss
  def compute_loss_pi(self, data):
    o = data['obs']
    action, logp_pi = self.ac.pi(o)
    q1_pi = self.ac.q1(o, action)
    q2_pi = self.ac.q2(o, action)
    q_pi = torch.min(q1_pi, q2_pi)

    # Entropy-regularized policy loss
    loss_pi = (ALPHA * logp_pi - q_pi).mean()

    self.logger.store(QContribPiLoss=(torch.abs(q_pi) /
                                      (torch.abs(q_pi) + torch.abs(ALPHA * logp_pi))
                                      ).mean().cpu().detach().numpy())

    # Useful info for logging
    pi_info = dict(LogPi=logp_pi.cpu().detach().numpy())
    # pi_info = {}

    return loss_pi, pi_info

  def update_parameters(self, data):
    # First run one gradient descent step for Q1 and Q2
    self.q_optimizer.zero_grad()
    loss_q, q_info = self.compute_loss_q(data)
    loss_q.backward()
    self.q_optimizer.step()

    # Record things
    self.logger.store(LossQ=loss_q.item(), **q_info)

    # Freeze Q-networks so you don't waste computational effort
    # computing gradients for them during the policy learning step.
    for p in itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()):
      p.requires_grad = False

    # Next run one gradient descent step for pi.
    self.pi_optimizer.zero_grad()
    loss_pi, pi_info = self.compute_loss_pi(data)
    loss_pi.backward()
    self.pi_optimizer.step()

    # Unfreeze Q-networks so you can optimize them at the next step.
    for p in itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()):
      p.requires_grad = True

    # Record things
    self.logger.store(LossPi=loss_pi.item(), **pi_info)

    # Finally, update target networks by polyak averaging.
    with torch.no_grad():
      for p, p_targ in zip(self.ac.parameters(), self.target_ac.parameters()):
        # NB: We use in-place operations "mul_" and "add_" to update target
        # params, as opposed to "mul" and "add", which would make new tensors.
        p_targ.data.mul_(POLYAK)
        p_targ.data.add_((1 - POLYAK) * p.data)

  def load_model(self):
    if os.path.exists(self.MODEL_PATH):
      self.ac.load(self.MODEL_PATH)
      if self.TRAINABLE:
        self.pi_optimizer.load_state_dict(torch.load(os.path.join(self.MODEL_PATH, 'pi_opt.optb')))
        self.q_optimizer.load_state_dict(torch.load(os.path.join(self.MODEL_PATH, 'q_opt.optb')))

  def save_model(self):
    print('saved', self.MODEL_PATH)
    self.ac.save(self.MODEL_PATH)
    if self.TRAINABLE:
      torch.save(self.pi_optimizer.state_dict(), os.path.join(self.MODEL_PATH, 'pi_opt.optb'))
      torch.save(self.q_optimizer.state_dict(), os.path.join(self.MODEL_PATH, 'q_opt.optb'))
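
# Illustrative sketch, not called anywhere in this module: the polyak averaging step that
# Sac1AgentNP.update_parameters() applies to the target network, reproduced on plain
# tensors. Each target parameter is nudged toward the online parameter:
#   p_targ <- POLYAK * p_targ + (1 - POLYAK) * p
# The helper name and the example coefficient are hypothetical; the agent uses the
# module-level POLYAK constant.
def _polyak_update_demo(polyak=0.995):
  p = torch.ones(3)        # stand-in for an online parameter
  p_targ = torch.zeros(3)  # stand-in for the corresponding target parameter
  with torch.no_grad():
    p_targ.mul_(polyak)
    p_targ.add_((1 - polyak) * p)
  # After one step the target has moved (1 - polyak) of the way toward the online value,
  # e.g. tensor([0.0050, 0.0050, 0.0050]) with the default coefficient.
  return p_targ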