Example #1
    def __init__(self):
        ''' Load pretrained model
        '''
        import tensorflow as tf
        from rlcard.agents import NFSPAgent
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        env = rlcard.make('leduc-holdem')
        with self.graph.as_default():
            self.nfsp_agents = []
            for i in range(env.player_num):
                agent = NFSPAgent(self.sess,
                                  scope='nfsp' + str(i),
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  hidden_layers_sizes=[128, 128],
                                  q_mlp_layers=[128, 128])
                self.nfsp_agents.append(agent)

        check_point_path = os.path.join(ROOT_PATH, 'leduc_holdem_nfsp')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))
Example #2
 def __init__(self):
     super().__init__()
     self.wins = 0
     self.losses = 0
     '''
     Instantiate agent.
     '''
     # Setup RL NFSP agent
     # Set the number of iterations and how frequently we evaluate/save the plot
     evaluate_every = 10000
     evaluate_num = 10000
     episode_num = 100000
     # The initial memory size
     memory_init_size = 1000
     # Train the agent every X steps
     train_every = 64
     # The paths for saving the logs and learning curves
     log_dir = './training/nfsp/'
     # Set a global seed
     set_global_seed(0)
     # Set agent - TODO - determine PPE parameters
     self.agent = NFSPAgent(scope='nfsp',
                            action_num=3,
                            state_shape=54,
                            hidden_layers_sizes=[512, 512],
                            min_buffer_size_to_learn=memory_init_size,
                            q_replay_memory_init_size=memory_init_size,
                            train_every=train_every,
                            q_train_every=train_every,
                            q_mlp_layers=[512, 512],
                            device=torch.device('cpu'))
     # Init a Logger to plot the learning curve
     self.logger = Logger(log_dir)
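 # A purely illustrative sketch (not in the original snippet) of how this class
 # could drive its NFSP agent during play; rlcard-style transitions of the form
 # (state, action, reward, next_state, done) are assumed, as in Example #12.
 def start_episode(self):
     # NFSP samples either its best-response or average policy once per episode
     self.agent.sample_episode_policy()

 def feed_transitions(self, transitions):
     # Feed transitions into the agent's memory so it can train itself
     for ts in transitions:
         self.agent.feed(ts)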
Example #3
 def __init__(self):
     # load pretrained model from tensorflow
     evaluate_every = 100
     evaluate_num = 100
     episode_num = 6000
     memory_init_size = 1000
     train_every = 64
     i = 0
     self.graph = tf.Graph()
     self.sess = tf.Session(graph=self.graph)
     self.env = rlcard.make('gin-rummy')
     with self.graph.as_default():
         self.agent = NFSPAgent(self.sess,
                                scope='nfsp' + str(i),
                                action_num=self.env.action_num,
                                state_shape=[4, 52],
                                hidden_layers_sizes=[128],
                                anticipatory_param=0.5,
                                batch_size=256,
                                rl_learning_rate=0.01,
                                sl_learning_rate=0.005,
                                min_buffer_size_to_learn=memory_init_size,
                                q_replay_memory_size=int(1e5),
                                q_replay_memory_init_size=memory_init_size,
                                train_every=train_every,
                                q_train_every=train_every,
                                q_batch_size=256,
                                q_mlp_layers=[128])
     print("restoring checkpoint...")
     check_point_path = "gin_rummy_nfsp4"
     with self.sess.as_default():
         with self.graph.as_default():
             saver = tf.train.Saver()
             saver.restore(self.sess,
                           tf.train.latest_checkpoint(check_point_path))
     print("checkpoint restored!")
Example #4
    def __init__(self):
        ''' Load pretrained model
        '''
        import tensorflow as tf
        from rlcard.agents import NFSPAgent, RandomAgent
        self.graph = tf.Graph()

        # Mitigation for gpu memory issue
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        self.sess = tf.Session(graph=self.graph, config=config)

        env = rlcard.make('tractor')
        with self.graph.as_default():
            self.nfsp_agents = []
            # for i in range(env.player_num):
            #     agent = NFSPAgent(self.sess,
            #                       scope='nfsp' + str(i),
            #                       action_num=env.action_num,
            #                       state_shape=env.state_shape,
            #                       hidden_layers_sizes=[512,1024,2048,1024,512],
            #                       q_mlp_layers=[512,1024,2048,1024,512])
            #     self.nfsp_agents.append(agent)

            for i in range(1):
                agent = NFSPAgent(self.sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[2048,2048],
                                q_mlp_layers=[2048,2048],
                                # evaluate_with='average_policy')
                                evaluate_with='best_response')

                self.nfsp_agents.append(agent)

        check_point_path = os.path.join(TRACTOR_PATH, 'nfsp_continue_350k_0.99')

        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
Example #5
with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
                          anticipatory_param=0.5,
                          batch_size=256,
                          rl_learning_rate=0.00005,
                          sl_learning_rate=0.00001,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_size=int(1e5),
                          q_replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          q_train_every=train_every,
                          q_batch_size=256,
                          q_mlp_layers=[512, 1024, 2048, 1024, 512])
        agents.append(agent)
    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())
Example #6
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(2):
        nfsp_agent = NFSPAgent(
            sess,
            scope='nfsp' + str(i),
            action_num=env.action_num,
            state_shape=env.state_shape,
            hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
            #hidden_layers_sizes=[512,1024,512],
            #  hidden_layers_sizes=[64],
            anticipatory_param=0.5,
            batch_size=256,
            rl_learning_rate=0.00005,
            sl_learning_rate=0.00001,
            min_buffer_size_to_learn=memory_init_size,
            q_replay_memory_size=int(1e5),
            q_replay_memory_init_size=memory_init_size,
            train_every=train_every,
            q_train_every=train_every,
            q_batch_size=256,
            q_mlp_layers=[512, 1024, 2048, 1024, 512],
            #  q_mlp_layers=[512,1024,512],
            #  q_mlp_layers=[64],
            reservoir_buffer_capacity=int(1e4))
        agents.append(nfsp_agent)

    random_agent = RandomAgent(action_num=eval_env.action_num)
    rule_agent = TractorRuleAgent(action_num=eval_env.action_num)
Example #7
class NFSPPlayer(GinRummyPlayer):
    def __init__(self):
        # load pretrained model from tensorflow
        evaluate_every = 100
        evaluate_num = 100
        episode_num = 6000
        memory_init_size = 1000
        train_every = 64
        i = 0
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        self.env = rlcard.make('gin-rummy')
        with self.graph.as_default():
            self.agent = NFSPAgent(self.sess,
                                   scope='nfsp' + str(i),
                                   action_num=self.env.action_num,
                                   state_shape=[4, 52],
                                   hidden_layers_sizes=[128],
                                   anticipatory_param=0.5,
                                   batch_size=256,
                                   rl_learning_rate=0.01,
                                   sl_learning_rate=0.005,
                                   min_buffer_size_to_learn=memory_init_size,
                                   q_replay_memory_size=int(1e5),
                                   q_replay_memory_init_size=memory_init_size,
                                   train_every=train_every,
                                   q_train_every=train_every,
                                   q_batch_size=256,
                                   q_mlp_layers=[128])
        print("restoring checkpoint...")
        check_point_path = "gin_rummy_nfsp4"
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))
        print("checkpoint restored!")

    def init_game_get_state(self):
        state = {}
        state['hand'] = np.zeros(52, dtype=int)
        for card in self.cards:
            state['hand'][card.getId()] = 1
        state['top_discard'] = np.zeros(52, dtype=int)
        state['dead_cards'] = np.zeros(52, dtype=int)
        state['opponent_known_cards'] = np.zeros(52, dtype=int)
        rep = [state['hand'], state['top_discard'], state['dead_cards'], \
               state['opponent_known_cards']] #, unknown_cards_rep] # changed
        obs = np.array(rep)
        extracted_state = {'obs': obs, 'legal_actions': None}
        return extracted_state

    def get_state_index(self, label):
        inds = {'hand': 0, 'top_discard': 1, 'dead_cards': 2, \
                'opponent_known_cards': 3}
        return inds[label]

    def set_discard(self, card):
        self.state['obs'][self.get_state_index('top_discard')] = np.zeros(
            52, dtype=int)
        self.state['obs'][self.get_state_index('top_discard')][
            card.getId()] = 1

    # Inform player of 0-based player number (0/1), starting player number (0/1), and dealt cards
    # @param playerNum player's 0-based player number (0/1)
    # @param startingPlayerNum starting player number (0/1)
    # @param cards dealt cards
    def startGame(self, playerNum: int, startingPlayerNum: int,
                  cards: List[Card]) -> None:
        self.playerNum = playerNum
        self.startingPlayerNum = startingPlayerNum
        self.cards = list(cards)
        self.opponentKnocked = False
        self.drawDiscardBitstrings = []  # long[], or List[int]
        self.faceUpCard = None
        self.drawnCard = None
        self.state = self.init_game_get_state()
        self.action = None

    # ====================================
    # Action_ids:
    #        0 -> score_player_0_id
    #        1 -> score_player_1_id
    #        2 -> draw_card_id
    #        3 -> pick_up_discard_id
    #        4 -> declare_dead_hand_id
    #        5 -> gin_id
    #        6 to 57 -> discard_id card_id
    #        58 to 109 -> knock_id card_id
    # ====================================
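
    # A small helper sketch (not part of the original code) that decodes a
    # discard/knock action id back to a 0-51 card id, following the table above;
    # purely illustrative of the "+ 6" offset used in getDiscard() below.
    @staticmethod
    def action_to_card_id(action_id: int) -> int:
        if 6 <= action_id <= 57:       # discard_id range
            return action_id - 6
        if 58 <= action_id <= 109:     # knock_id range
            return action_id - 58
        raise ValueError("action id %d does not encode a card" % action_id)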

    # Return whether or not player will draw the given face-up card on the draw pile.
    # @param card face-up card on the draw pile
    # @return whether or not player will draw the given face-up card on the draw pile
    def willDrawFaceUpCard(self, card: Card) -> bool:
        # Ask the agent whether to pick up the face-up card (action 3) or draw face down (action 2).
        self.faceUpCard = card
        # update state
        self.set_discard(card)

        self.state['legal_actions'] = [2, 3]
        action, probs = self.agent.eval_step(self.state)

        return action == 3

        # self.faceUpCard = card
        # newCards = list(self.cards)
        # newCards.append(card)
        # for meld in GinRummyUtil.cardsToAllMelds(newCards):
        #     if card in meld:
        #         return True
        # return False

    # Report that the given player has drawn a given card and, if known, what the card is.
    # If the card is unknown because it is drawn from the face-down draw pile, the drawnCard is null.
    # Note that a player that returns false for willDrawFaceUpCard will learn of their face-down draw from this method.
    # @param playerNum - player drawing a card
    # @param drawnCard - the card drawn or null, depending on whether the card is known to the player or not, respectively.
    def reportDraw(self, playerNum: int, drawnCard: Card) -> None:
        # Ignore other player draws.  Add to cards if playerNum is this player.
        if playerNum == self.playerNum:
            self.cards.append(drawnCard)
            self.drawnCard = drawnCard
            # add to state
            self.state['obs'][self.get_state_index('hand')][
                drawnCard.getId()] = 1
        # if other player card is not null add to state
        elif drawnCard is not None:
            self.state['obs'][self.get_state_index('opponent_known_cards')][
                drawnCard.getId()] = 1

    # Get the player's discarded card.  If you took the top card from the discard pile,
    # you must discard a different card.
    # If this is not a card in the player's possession, the player forfeits the game.
    # @return the player's chosen card for discarding
    def getDiscard(self) -> Card:
        # Ask the agent which card to discard; a face-up card just picked up cannot be discarded.
        self.state['legal_actions'] = []
        for card in self.cards:
            if card == self.drawnCard and self.drawnCard == self.faceUpCard:
                continue
            self.state['legal_actions'].append(card.getId() + 6)
        action, probs = self.agent.eval_step(self.state)
        for card in self.cards:
            if card.getId() == action - 6:
                return card
        # minDeadwood = float('inf')
        # candidateCards = []
        # for card in self.cards:
        #     # Cannot draw and discard face up card.
        #     if card == self.drawnCard and self.drawnCard == self.faceUpCard:
        #         continue
        #     # Disallow repeat of draw and discard.
        #     drawDiscard = [self.drawnCard, card]
        #     if GinRummyUtil.cardsToBitstring(drawDiscard) in self.drawDiscardBitstrings:
        #         continue

        #     remainingCards = list(self.cards)
        #     remainingCards.remove(card)
        #     bestMeldSets = GinRummyUtil.cardsToBestMeldSets(remainingCards)
        #     deadwood = GinRummyUtil.getDeadwoodPoints3(remainingCards) if len(bestMeldSets) == 0 \
        #                else GinRummyUtil.getDeadwoodPoints1(bestMeldSets[0], remainingCards)
        #     if deadwood <= minDeadwood:
        #         if deadwood < minDeadwood:
        #             minDeadwood = deadwood
        #             candidateCards.clear()
        #         candidateCards.append(card)
        # # Prevent future repeat of draw, discard pair.
        # discard = candidateCards[randint(0, len(candidateCards)-1)]
        # drawDiscard = [self.drawnCard, discard]
        # self.drawDiscardBitstrings.append(GinRummyUtil.cardsToBitstring(drawDiscard))
        # return discard

    # Report that the given player has discarded a given card.
    # @param playerNum the discarding player
    # @param discardedCard the card that was discarded
    def reportDiscard(self, playerNum: int, discardedCard: Card) -> None:
        # Ignore other player discards.  Remove from cards if playerNum is this player.
        if playerNum == self.playerNum:
            self.cards.remove(discardedCard)
            # update state
            self.state['obs'][self.get_state_index('hand')][
                discardedCard.getId()] = 0
        else:
            self.state['obs'][self.get_state_index('opponent_known_cards')][
                discardedCard.getId()] = 0
        self.set_discard(discardedCard)

    # At the end of each turn, this method is called and the player that cannot (or will not) end the round will return a null value.
    # However, the first player to "knock" (that is, end the round), and then their opponent, will return an ArrayList of ArrayLists of melded cards.
    # All other cards are counted as "deadwood", unless they can be laid off (added to) the knocking player's melds.
    # When final melds have been reported for the other player, a player should return their final melds for the round.
    # @return null if continuing play and opponent hasn't melded, or an ArrayList of ArrayLists of melded cards.
    def getFinalMelds(self) -> List[List[Card]]:
        # Check if deadwood of maximal meld is low enough to go out.
        # TODO: maybe get action from agent
        bestMeldSets = GinRummyUtil.cardsToBestMeldSets(
            self.cards)  # List[List[List[Card]]]
        if not self.opponentKnocked and (len(bestMeldSets) == 0 or \
         GinRummyUtil.getDeadwoodPoints1(bestMeldSets[0], self.cards) > \
         GinRummyUtil.MAX_DEADWOOD):
            return None
        if len(bestMeldSets) == 0:
            return []
        return bestMeldSets[randint(0, len(bestMeldSets) - 1)]

    # When a player has ended play and formed melds, the melds (and deadwood) are reported to both players.
    # @param playerNum player that has revealed melds
    # @param melds an ArrayList of ArrayLists of melded cards with the last ArrayList (possibly empty) being deadwood.
    def reportFinalMelds(self, playerNum: int,
                         melds: List[List[Card]]) -> None:
        # Melds ignored by simple player, but could affect which melds to make for complex player.
        if playerNum != self.playerNum:
            self.opponentKnocked = True
        # add dead cards to state.
        for l in melds:
            for card in l:
                self.state['obs'][self.get_state_index('dead_cards')][
                    card.getId()] = 1

    # Report current player scores, indexed by 0-based player number.
    # @param scores current player scores, indexed by 0-based player number
    def reportScores(self, scores: List[int]) -> None:
        # Ignored by simple player, but could affect strategy of more complex player.
        return

    # Report layoff actions.
    # @param playerNum player laying off cards
    # @param layoffCard card being laid off
    # @param opponentMeld the opponent meld that card is being added to
    def reportLayoff(self, playerNum: int, layoffCard: Card,
                     opponentMeld: List[Card]) -> None:
        # Ignored by simple player, but could affect strategy of more complex player.
        return

    # Report the final hands of players.
    # @param playerNum player of hand reported
    # @param hand complete hand of given player
    def reportFinalHand(self, playerNum: int, hand: List[Card]) -> None:
        # Ignored by simple player, but could affect strategy of more complex player.
        return
Example #8
set_global_seed(0)

with tf.compat.v1.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[512, 512],
                          anticipatory_param=0.1,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          q_train_every=train_every,
                          q_mlp_layers=[512, 512])
        agents.append(agent)
    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Initialize global variables
    sess.run(tf.compat.v1.global_variables_initializer())

    # Init a Logger to plot the learning curve
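    # A minimal sketch of how this truncated snippet typically continues, following
    # Example #10; Logger, tournament, log_dir, evaluate_every, evaluate_num and
    # episode_num are assumed to be defined/imported as in that example.
    logger = Logger(log_dir)

    for episode in range(episode_num):
        # Sample a policy for the episode, then generate and feed data
        for agent in agents:
            agent.sample_episode_policy()
        trajectories, _ = env.run(is_training=True)
        for i in range(env.player_num):
            for ts in trajectories[i]:
                agents[i].feed(ts)

        # Evaluate the first agent against the random agent
        if episode % evaluate_every == 0:
            logger.log_performance(episode, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()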
Example #9
    def __init__(self,
                 env_name,
                 max_episode_length=0,
                 enable_record=False,
                 record_path="1.mp4"):
        self.env_name = env_name

        self.env_type = None
        print('wtf')

        self.env = rlcard.make('no-limit-holdem',
                               config={
                                   'record_action': True,
                                   'game_player_num': 2,
                                   'seed': 477
                               })
        # self.state, self.pointer = self.game.init_game()

        memory_init_size = 300

        # The paths for saving the logs and learning curves
        self.log_dir = './experiments/nolimit_holdem_nfsp_result/ivvan'

        # Set a global seed

        self.evaluate_every = 512
        self.evaluate_num = 64
        self.episode_num = 20480

        # The initial memory size
        self.memory_init_size = 256

        # Train the agent every X steps
        self.train_every = 256
        self.agents = []

        self.agents.append(
            NFSPAgent(scope='nfsp' + str(0),
                      action_num=self.env.action_num,
                      state_shape=self.env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.015,
                      sl_learning_rate=0.0075,
                      q_epsilon_start=.3,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=20480,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=self.train_every + 44,
                      q_train_every=self.train_every,
                      q_mlp_layers=[512, 512],
                      evaluate_with='average_policy'))

        self.agents.append(
            NFSPAgent(scope='nfsp' + str(1),
                      action_num=self.env.action_num,
                      state_shape=self.env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.015,
                      sl_learning_rate=0.0075,
                      q_epsilon_start=.3,
                      q_replay_memory_size=20480,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=self.train_every + 44,
                      q_train_every=self.train_every,
                      q_mlp_layers=[512, 512],
                      evaluate_with='average_policy'))

        self.env.set_agents(self.agents)
        self.env.reset()
        # Initialize env to be equal to the game
        # print(self.state)
        # self.env = PokerState(self.state['hand'], self.state['public_cards'], 250 - self.state['all_chips'][0], 250 - self.state['all_chips'][1], abs(self.state['all_chips'][0] - self.state['all_chips'][1]), self.state['all_chips'][0] + self.state['all_chips'][1], self.state['all_chips'][0], self.state['all_chips'][1])
        self.action_n = 6

        self.max_episode_length = self.env._max_episode_steps if max_episode_length == 0 else max_episode_length

        self.current_step_count = 0

        self.since_last_reset = 0
Example #10
def main():
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'seed': 0,
                          'env_num': 16,
                          'game_player_num': 4
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 0,
                               'env_num': 16
                           })

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])

        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        #saver.restore(sess, os.path.join(save_dir, 'model'))

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            agent2.sample_episode_policy()
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            for ts in trajectories[2]:
                agent2.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(sess, os.path.join(save_dir, 'model_final'))
Example #11
train_every = 64

# The paths for saving the logs and learning curves
log_dir = './experiments/nfsp_random_result/'

# Set a global seed
set_global_seed(0)

# Set up the agents
# agents = []
# print(env.player_num)
agent = NFSPAgent(scope='nfsp',
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every,
                  q_train_every=train_every,
                  q_mlp_layers=[512, 512])
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

    # First sample a policy for the episode
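    # A minimal sketch of how this loop body typically continues (cf. Examples #10
    # and #12); evaluate_every, evaluate_num and tournament are assumed to be
    # defined/imported as in those examples.
    agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed this agent's transitions into its memory and train it
    for ts in trajectories[0]:
        agent.feed(ts)

    # Evaluate the performance. Play with the random agent.
    if episode % evaluate_every == 0:
        logger.log_performance(episode, tournament(eval_env, evaluate_num)[0])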
Example #12
File: run_rl.py Project: billh0420/rlcard
def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
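train() reads its settings from an argparse-style namespace. Below is a minimal sketch of a caller, inferred only from the attributes the function accesses above; the exact flag names and defaults are assumptions and need not match the real run_rl.py.

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train DQN or NFSP on an RLCard environment')
    parser.add_argument('--env', type=str, default='leduc-holdem')
    parser.add_argument('--algorithm', type=str, default='nfsp', choices=['dqn', 'nfsp'])
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_nfsp_result/')

    train(parser.parse_args())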
Example #13
train_every = 64

# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_nfsp_result/'

# Set a global seed
set_global_seed(0)

# Set agents
agents = []
for i in range(env.player_num):
    agent = NFSPAgent(scope='nfsp' + str(i),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[128, 128],
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[128, 128],
                      device=torch.device('cpu'))
    agents.append(agent)
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents(agents)
eval_env.set_agents([agents[0], random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
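    # A minimal sketch of the loop body (cf. Examples #10 and #12). Here every seat
    # is an NFSP agent, so each one samples a policy and is fed its own trajectory;
    # tournament, evaluate_every and evaluate_num are assumed defined as in #10.
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed each agent the transitions from its own seat
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)

    # Evaluate the first agent against the random agent
    if episode % evaluate_every == 0:
        logger.log_performance(episode, tournament(eval_env, evaluate_num)[0])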
Example #14
# Make environment
env = rlcard.make('leduc-holdem', config={'seed': 0})

# Set a global seed
set_global_seed(0)

# Load pretrained model
graph = tf.Graph()
sess = tf.Session(graph=graph)

with graph.as_default():
    nfsp_agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          q_mlp_layers=[128, 128])
        nfsp_agents.append(agent)

# We have a pretrained model here. Change the path for your model.
check_point_path = os.path.join(rlcard.__path__[0],
                                'models/pretrained/leduc_holdem_nfsp')

with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
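# A minimal sketch of the evaluation that typically follows (cf. Examples #10 and
# #12); RandomAgent and tournament are assumed to be imported as in those examples.
random_agent = RandomAgent(action_num=env.action_num)
env.set_agents([nfsp_agents[0], random_agent])
reward = tournament(env, evaluate_num)[0]
print('Average reward of the pretrained NFSP agent vs. a random agent:', reward)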
Example #15
    # Set up the agents
    agents = []
    for i in range(1):
        nfsp_agent = NFSPAgent(
            sess,
            scope='nfsp' + str(i),
            action_num=env.action_num,
            state_shape=env.state_shape,
            hidden_layers_sizes=[2048, 2048],
            #    anticipatory_param=0.1,
            anticipatory_param=0.9,
            batch_size=256,
            train_every=train_every,
            rl_learning_rate=0.00002,
            sl_learning_rate=0.0002,
            min_buffer_size_to_learn=memory_init_size,
            q_replay_memory_init_size=memory_init_size,
            q_update_target_estimator_every=500,
            q_discount_factor=0.99,
            q_epsilon_start=1,
            q_epsilon_end=0.1,
            q_epsilon_decay_steps=100000,
            q_batch_size=256,
            q_train_every=train_every,
            q_mlp_layers=[2048, 2048],
            reservoir_buffer_capacity=500000,
            q_replay_memory_size=100000,
            #    evaluate_with='average_policy')
            evaluate_with='best_response')
        agents.append(nfsp_agent)

    random_agent = RandomAgent(action_num=eval_env.action_num)
Example #16
File: ivvanPlay.py Project: PhDChe/Poker-1
def run():
    torch.multiprocessing.freeze_support()
    env = rlcard.make('no-limit-holdem',
                      config={
                          'record_action': True,
                          'game_player_num': 2,
                          'env_num': 8,
                          'use_raw': True
                      })
    # eval_env = rlcard.make('no-limit-holdem', config={'seed': 12, 'game_player_num': 2})
    # eval_env2 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})
    #eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})
    # Set the number of iterations and how frequently we evaluate the performance

    evaluate_every = 1024
    evaluate_num = 32
    episode_num = 20480

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256
    agents = []

    agents.append(
        NFSPAgent(scope='nfsp' + str(0),
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  rl_learning_rate=0.015,
                  sl_learning_rate=0.0075,
                  q_epsilon_start=.3,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_size=20480,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every + 44,
                  q_train_every=train_every,
                  q_mlp_layers=[512, 512],
                  evaluate_with='best_response'))

    agents.append(
        NFSPAgent(scope='nfsp' + str(1),
                  action_num=env.action_num,
                  state_shape=env.state_shape,
                  hidden_layers_sizes=[512, 512],
                  anticipatory_param=0.1,
                  rl_learning_rate=0.015,
                  sl_learning_rate=0.0075,
                  q_epsilon_start=.3,
                  q_replay_memory_size=20480,
                  min_buffer_size_to_learn=memory_init_size,
                  q_replay_memory_init_size=memory_init_size,
                  train_every=train_every + 44,
                  q_train_every=train_every,
                  q_mlp_layers=[512, 512],
                  evaluate_with='best_response'))

    # 7, 5 - all in junkies
    check_point_path = os.path.join('models/ivvan/cp/8/model-nfsp1.pth')
    checkpoint = torch.load(check_point_path)
    check_point_path = os.path.join('models/ivvan/cp/8/model-nfsp0.pth')
    checkpoint2 = torch.load(check_point_path)
    # for agent in agents:
    #     agent.load(checkpoint)
    agents[1].load(checkpoint)
    agents[0].load(checkpoint2)
    human = nolimit_holdem_human_agent.HumanAgent(env.action_num)
    env.set_agents([agents[0], agents[1]])

    while (True):
        print(">> Start a new game")

        trajectories, payoffs = env.run(is_training=False)
        if (len(trajectories[0]) == 0):
            # the bot folded immediately
            continue

        # If the human does not take the final action, we need to
        # print other players action
        final_state = trajectories[0][-1][-2]
        # print(final_state, 'waa')
        action_record = final_state['action_record']
        state = final_state['raw_obs']
        _action_list = []
        for i in range(1, len(action_record) + 1):
            if action_record[-i][0] == state['current_player']:
                break
            _action_list.insert(0, action_record[-i])

        for pair in _action_list:
            print('>> Player', pair[0], 'chooses', pair[1])

        # Let's take a look at what the agent's cards are
        print('===============     NFSP Agent    ===============')
        print_card(env.get_perfect_information()['hand_cards'][1])

        print('===============     Result     ===============')
        if payoffs[0] > 0:
            print('You win {} chips!'.format(payoffs[0]))
        elif payoffs[0] == 0:
            print('It is a tie.')
        else:
            print('You lose {} chips!'.format(-payoffs[0]))
        print('')

        input("Press any key to continue...")
Example #17
File: ivvan.py Project: PhDChe/Poker-1
# The initial memory size
memory_init_size = 256

# Train the agent every X steps
train_every = 256
agents = []

agents.append(
    NFSPAgent(scope='nfsp' + str(0),
              action_num=env.action_num,
              state_shape=env.state_shape,
              hidden_layers_sizes=[512, 512],
              anticipatory_param=0.1,
              rl_learning_rate=0.015,
              sl_learning_rate=0.0075,
              q_epsilon_start=.3,
              min_buffer_size_to_learn=memory_init_size,
              q_replay_memory_size=20480,
              q_replay_memory_init_size=memory_init_size,
              train_every=train_every + 44,
              q_train_every=train_every,
              q_mlp_layers=[512, 512],
              evaluate_with='average_policy'))

agents.append(
    NFSPAgent(scope='nfsp' + str(1),
              action_num=env.action_num,
              state_shape=env.state_shape,
              hidden_layers_sizes=[512, 512],
              anticipatory_param=0.1,
              rl_learning_rate=0.015,
Example #18
def main():
    wandb_config = wandb.config
    config = {}
    hyperparams = {}
    for key in wandb_config.keys():
        if key in default_config:
            config[key] = wandb_config[key]
        elif key in default_hyperparams:
            hyperparams[key] = wandb_config[key]

    # Make environment
    env = make("yaniv", config=config)
    eval_env = make("yaniv", config=config)

    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(scope="nfsp" + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          device=torch.device("cuda"),
                          **hyperparams)
        agents.append(agent)
        if load_model is not None:
            state_dict = torch.load(load_model)
            policy_dict = state_dict[load_scope]
            agent.policy_network.load_state_dict(policy_dict)
            q_key = load_scope + "_dqn_q_estimator"
            agent._rl_agent.q_estimator.qnet.load_state_dict(state_dict[q_key])
            target_key = load_scope + "_dqn_target_estimator"
            agent._rl_agent.target_estimator.qnet.load_state_dict(
                state_dict[target_key])

    rule_agent = YanivNoviceRuleAgent(
        single_step=config["single_step_actions"])
    random_agent = RandomAgent(action_num=env.action_num)

    def agent_feed(agent, trajectories):
        for transition in trajectories:
            agent.feed(transition)

    def save_function(agent, model_dir):
        torch.save(agent.get_state_dict(),
                   os.path.join(model_dir, "model_{}.pth".format(i)))

    e = ExperimentRunner(
        env,
        eval_env,
        log_every=100,
        save_every=100,
        base_dir="yaniv_nfsp_pytorch",
        config=config,
        training_agent=agents[0],
        vs_agent=agents[1],
        feed_function=agent_feed,
        save_function=save_function,
    )

    e.run_training(
        episode_num=50000,
        eval_every=200,
        eval_vs=[random_agent, rule_agent],
        eval_num=100,
    )