Example #1
import os

import rlcard
from rlcard.agents import CFRAgent, RandomAgent
from rlcard.utils import set_seed, tournament, Logger, plot_curve

def train(args):
    # Make environments; CFR supports only Leduc Hold'em
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Initialize the CFR agent
    agent = CFRAgent(env, os.path.join(args.log_dir, 'cfr_model'))
    agent.load()  # If a saved model exists, resume from it

    # Evaluate CFR against a random agent
    eval_env.set_agents([agent, RandomAgent(num_actions=env.num_actions)])

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):
            agent.train()
            print('\rIteration {}'.format(episode), end='')
            # Evaluate the performance. Play with Random agents.
            if episode % args.evaluate_every == 0:
                agent.save() # Save model
                logger.log_performance(env.timestep, tournament(eval_env, args.num_eval_games)[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path
    # Plot the learning curve
    plot_curve(csv_path, fig_path, 'cfr')
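
The function expects an `args` namespace; a minimal, hypothetical argparse driver is sketched below. The flag names mirror the attributes `train()` reads, while the defaults are illustrative and not taken from the original script.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser("CFR example in RLCard")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    # Hypothetical default directory; any writable path works
    parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_cfr_result/')
    args = parser.parse_args()
    train(args)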
Example #2
import pprint

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_seed

def run(args):
    # Make environment
    env = rlcard.make(args.env, config={'seed': 42})

    # Seed numpy, torch, random
    set_seed(42)

    # Set agents
    agent = RandomAgent(num_actions=env.num_actions)
    env.set_agents([agent for _ in range(env.num_players)])

    # Generate data from the environment
    trajectories, player_wins = env.run(is_training=False)
    # Print out the trajectories
    print('\nTrajectories:')
    print(trajectories)
    print('\nSample raw observation:')
    pprint.pprint(trajectories[0][0]['raw_obs'])
    print('\nSample raw legal_actions:')
    pprint.pprint(trajectories[0][0]['raw_legal_actions'])
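
Here `trajectories` is indexed first by player and then by time step, alternating state dictionaries and the actions taken, which is why `trajectories[0][0]` is a state dict; `player_wins` holds each player's payoff. A driver for `run()` only needs an environment name; a minimal sketch with a hypothetical `--env` flag:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser("Random example in RLCard")
    parser.add_argument('--env', type=str, default='leduc-holdem')
    args = parser.parse_args()
    run(args)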
Example #3
import rlcard
from rlcard.utils import get_device, set_seed, tournament

def evaluate(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Load models
    agents = []
    for position, model_path in enumerate(args.models):
        agents.append(load_model(model_path, env, position, device))
    env.set_agents(agents)

    # Evaluate
    rewards = tournament(env, args.num_games)
    for position, reward in enumerate(rewards):
        print(position, args.models[position], reward)
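
`load_model` is a helper defined alongside `evaluate()` rather than part of the core rlcard API. The sketch below is close to the helper shipped with rlcard's example scripts; it dispatches on the model path: a torch checkpoint file, a CFR model directory, the literal string 'random', or a name from the rlcard model zoo.

import os
import torch
from rlcard import models
from rlcard.agents import CFRAgent, RandomAgent

def load_model(model_path, env=None, position=None, device=None):
    if os.path.isfile(model_path):  # A torch-saved agent checkpoint
        agent = torch.load(model_path, map_location=device)
        agent.set_device(device)
    elif os.path.isdir(model_path):  # A directory holding a CFR policy
        agent = CFRAgent(env, model_path)
        agent.load()
    elif model_path == 'random':  # A uniform random agent
        agent = RandomAgent(num_actions=env.num_actions)
    else:  # A pretrained model from the rlcard model zoo
        agent = models.load(model_path).agents[position]
    return agent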
Example #4
File: run_rl.py  Project: billh0420/rlcard
import os

import torch
from rlcard.agents.pettingzoo_agents import RandomAgentPettingZoo
from rlcard.utils import get_device, set_seed, Logger, plot_curve
# Assumption: the PettingZoo helpers live in rlcard.utils.pettingzoo_utils;
# the exact module path may differ between rlcard versions.
from rlcard.utils.pettingzoo_utils import (
    run_game_pettingzoo, reorganize_pettingzoo, tournament_pettingzoo)

def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env_func = env_name_to_env_func[args.env]
    env = env_func.env()
    env.seed(args.seed)
    env.reset()

    # Initialize the agent and use random agents as opponents
    learning_agent_name = env.agents[0]
    if args.algorithm == 'dqn':
        from rlcard.agents.pettingzoo_agents import DQNAgentPettingZoo
        agent = DQNAgentPettingZoo(
            num_actions=env.action_space(learning_agent_name).n,
            state_shape=env.observation_space(
                learning_agent_name)["observation"].shape,
            mlp_layers=[64, 64],
            device=device)
    elif args.algorithm == 'nfsp':
        from rlcard.agents.pettingzoo_agents import NFSPAgentPettingZoo
        agent = NFSPAgentPettingZoo(
            num_actions=env.action_space(learning_agent_name).n,
            state_shape=env.observation_space(
                learning_agent_name)["observation"].shape,
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device)

    agents = {learning_agent_name: agent}
    for i in range(1, env.num_agents):
        agents[env.agents[i]] = RandomAgentPettingZoo(
            num_actions=env.action_space(env.agents[i]).n)

    # Start training
    num_timesteps = 0
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories = run_game_pettingzoo(env, agents, is_training=True)
            trajectories = reorganize_pettingzoo(trajectories)
            num_timesteps += sum([len(t) for t in trajectories.values()])

            for ts in trajectories[learning_agent_name]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                average_rewards = tournament_pettingzoo(
                    env, agents, args.num_eval_games)
                logger.log_performance(num_timesteps,
                                       average_rewards[learning_agent_name])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
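
This variant drives rlcard agents through PettingZoo environments and assumes an `env_name_to_env_func` lookup table mapping CLI names to environment modules. A plausible reconstruction is shown below; the module versions and the set of registered games are assumptions and may differ from the original project.

from pettingzoo.classic import leduc_holdem_v4, texas_holdem_v4

# Hypothetical mapping from CLI names to PettingZoo classic games;
# the original project may register more environments or other versions.
env_name_to_env_func = {
    'leduc-holdem': leduc_holdem_v4,
    'limit-holdem': texas_holdem_v4,
}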
Example #5
File: run_rl.py  Project: billh0420/rlcard
import os

import rlcard
import torch
from rlcard.agents import RandomAgent
from rlcard.utils import get_device, set_seed, tournament, reorganize, Logger, plot_curve

def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data into (state, action, reward, next_state, done) transitions
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
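
Because the whole agent object is saved with torch.save, the checkpoint can be restored later and pitted against a random opponent. A minimal sketch, assuming a Leduc Hold'em run whose --log_dir was the hypothetical 'experiments/leduc_holdem_dqn_result/':

import rlcard
import torch
from rlcard.agents import RandomAgent
from rlcard.utils import get_device, tournament

device = get_device()
env = rlcard.make('leduc-holdem', config={'seed': 0})
# Hypothetical checkpoint path; use whatever train() printed
agent = torch.load('experiments/leduc_holdem_dqn_result/model.pth', map_location=device)
agent.set_device(device)
env.set_agents([agent, RandomAgent(num_actions=env.num_actions)])
print(tournament(env, 1000))  # average payoff per position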
Example #6
import os

import rlcard
import torch
from rlcard.agents import DQNAgent
from rlcard.utils import set_seed, reorganize, tournament, Logger

def main():
    # Set a global seed for numpy, torch, and random
    set_seed(0)

    # Check whether a GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make training and evaluation environments
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the number of iterations and how frequently we evaluate performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    agent = DQNAgent(num_actions=env.num_actions,
                     state_shape=env.state_shape[0],
                     mlp_layers=[64, 64, 64, 64],
                     device=device)

    agents = [agent, load_model("model.pth")]

    env.set_agents(agents)

    with Logger('./') as logger:
        for episode in range(iteration_num):

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data into (state, action, reward, next_state, done) transitions
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, the learner always plays the first position and the
            # opponent is a frozen snapshot of an earlier model
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance against the current opponent
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(env, evaluate_num)[0])
            if episode % selfplay_every == 0:
                save_path = os.path.join('./', str(episode) + "model.pth")
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(str(episode) + "model.pth")]
                env.set_agents(agents)

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve ('args' does not exist in main(), so use a literal label)
    # plot_curve(csv_path, fig_path, 'dqn')

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)

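
This self-play loop periodically snapshots the learner and reloads the snapshot as its frozen opponent. `load_model` is again a local helper, not an rlcard API; a hypothetical version consistent with the calls above simply restores a torch-saved agent. Note that the very first `load_model("model.pth")` call assumes such a checkpoint already exists on disk.

import torch

def load_model(model_path, device=None):
    # Restore a previously torch-saved agent; the initial "model.pth"
    # must be created beforehand (e.g. by saving a freshly built agent)
    agent = torch.load(model_path, map_location=device)
    return agent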