def train(args):
    # Make environments; CFR only supports Leduc Hold'em
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Initialize CFR Agent
    agent = CFRAgent(env, os.path.join(args.log_dir, 'cfr_model'))
    agent.load()  # If we have a saved model, load it first

    # Evaluate CFR against a random agent
    eval_env.set_agents([agent, RandomAgent(num_actions=env.num_actions)])

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):
            agent.train()
            print('\rIteration {}'.format(episode), end='')

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                agent.save()  # Save model
                logger.log_performance(env.timestep, tournament(eval_env, args.num_eval_games)[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, 'cfr')
def test_vec_env(self):
    env = rlcard.make('limit-holdem', config={'env_num': 4})
    env.set_agents(
        [RandomAgent(env.action_num) for _ in range(env.player_num)])
    trajectories, payoffs = env.run(is_training=False)
    self.assertEqual(len(payoffs), 4)
    trajectories, payoffs = env.run(is_training=True)
def load_model(model_path, env=None, position=None, device=None):
    if os.path.isfile(model_path):  # A Torch model
        import torch
        agent = torch.load(model_path, map_location=device)
        agent.set_device(device)
    elif os.path.isdir(model_path):  # A CFR model
        from rlcard.agents import CFRAgent
        agent = CFRAgent(env, model_path)
        agent.load()
    elif model_path == 'random':  # A random model
        from rlcard.agents import RandomAgent
        agent = RandomAgent(num_actions=env.num_actions)
    else:  # A model from the model zoo
        from rlcard import models
        agent = models.load(model_path).agents[position]

    return agent
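A minimal usage sketch for load_model, assuming the RLCard v1 API (rlcard.make, num_actions) and reusing the 'leduc-holdem-nfsp' model-zoo entry that appears in a later snippet; the environment name and agent positions here are illustrative only, not part of the original script.

import rlcard

# Hypothetical usage: build an evaluation environment and load two agents into it.
env = rlcard.make('leduc-holdem')
agents = [
    load_model('leduc-holdem-nfsp', env=env, position=0),  # a model-zoo entry
    load_model('random', env=env),                         # the built-in random agent
]
env.set_agents(agents)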
def train_uno():
    # Make environment and enable step back (required by CFR)
    env = rlcard.make('uno', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('uno', config={'seed': 0})

    # Set the iteration numbers and how frequently we evaluate the performance and save the model
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 10000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/uno_cfr_result/'

    # Set a global seed
    set_global_seed(0)

    model_path = 'models/uno_cfr'

    # Initialize CFR Agent
    agent = CFRAgent(env, model_path=model_path)
    agent.load()  # If we have a saved model, load it first

    # Evaluate CFR against a random agent
    random_agent = RandomAgent(action_num=eval_env.action_num)
    eval_env.set_agents([agent, random_agent])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')

        # Evaluate the performance. Play with the random agent.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('CFR')
def run(args):
    # Make environment
    env = rlcard.make(args.env, config={'seed': 42})

    # Seed numpy, torch, random
    set_seed(42)

    # Set agents
    agent = RandomAgent(num_actions=env.num_actions)
    env.set_agents([agent for _ in range(env.num_players)])

    # Generate data from the environment
    trajectories, player_wins = env.run(is_training=False)

    # Print out the trajectories
    print('\nTrajectories:')
    print(trajectories)
    print('\nSample raw observation:')
    pprint.pprint(trajectories[0][0]['raw_obs'])
    print('\nSample raw legal_actions:')
    pprint.pprint(trajectories[0][0]['raw_legal_actions'])
def run(path: str, num: int, position: int, opponent: str):
    # Set a global seed
    set_global_seed(123)

    env = make('thousand-schnapsen', config={
        'seed': 0,
        'force_zero_sum': True
    })

    agents = []
    for _ in range(env.player_num):
        agent = RandomAgent(action_num=env.action_num)
        agents.append(agent)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with graph.as_default():
        agent = DeepCFR(sess,
                        scope=f'deep_cfr{position}',
                        env=env,
                        policy_network_layers=(8 * 24, 4 * 24, 2 * 24, 24),
                        advantage_network_layers=(8 * 24, 4 * 24, 2 * 24, 24))
        if opponent == 'deep_cfr':
            agents[0] = agent
            agents[1] = agent
            agents[2] = agent
        else:
            agents[position] = agent

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(path))

    env.set_agents(agents)
    _, wins = tournament(env, num)
    print(wins)
def main():
    # Make environment
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})
    iterations = 1

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent = RandomAgent(action_num=env.action_num)
    env.set_agents([agent, agent])

    for it in range(iterations):
        # Generate data from the environment
        trajectories, payoffs = env.run(is_training=False)

        # Print out the trajectories
        print('\nIteration {}'.format(it))
        for ts in trajectories[0]:
            print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.format(
                ts[0], ts[1], ts[2], ts[3], ts[4]))
def __init__(self):
    ''' Load random model
    '''
    env = rlcard.make('doudizhu')
    self.agent = RandomAgent(action_num=env.action_num)
    self.player_num = env.player_num
''' A toy example of playing against a random agent on Limit Hold'em
'''

import rlcard
from rlcard.agents import LimitholdemHumanAgent as HumanAgent
from rlcard.agents import RandomAgent
from rlcard.utils.utils import print_card

# Make environment
env = rlcard.make('limit-holdem')
human_agent = HumanAgent(env.num_actions)
agent_0 = RandomAgent(num_actions=env.num_actions)
env.set_agents([human_agent, agent_0])

print(">> Limit Hold'em random agent")

while (True):
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print the other players' actions
    if len(trajectories[0]) != 0:
        final_state = trajectories[0][-1]
        action_record = final_state['action_record']
        state = final_state['raw_obs']
        _action_list = []
        for i in range(1, len(action_record) + 1):
            if action_record[-i][0] == state['current_player']:
                break
def train_uno():
    # Make environment
    env = rlcard.make("uno", config={"seed": 0})
    eval_env = rlcard.make("uno", config={"seed": 0})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 3000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 100

    # The paths for saving the logs and learning curves
    log_dir = "./experiments/uno_results_dqn/"

    # Set a global seed
    set_global_seed(0)

    params = {
        "scope": "DQN-Agent",
        "num_actions": env.action_num,
        "replay_memory_size": memory_init_size,
        "num_states": env.state_shape,
        "discount_factor": 0.99,
        "epsilon_start": 1.0,
        "epsilon_end": 0.1,
        "epsilon_decay_steps": 20000,
        "batch_size": 32,
        "train_every": 1,
        "mlp_layers": [512, 512],
        "lr": 0.0005,
    }
    agent_conf = DQN_conf(**params)
    agent = DQN_agent(agent_conf)
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    logger = Logger(log_dir)

    for episode in range(episode_num):
        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot("DQN UNO")

    # Save model
    save_dir = "models/uno_dqn_pytorch"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    state_dict = agent.get_state_dict()
    print(state_dict.keys())
    torch.save(state_dict, os.path.join(save_dir, "model.pth"))
def getAgent(agent_type, env):
    agent = None
    if agent_type == 'RandomAgent':
        agent = RandomAgent(action_num=env.action_num)
    elif agent_type == 'DDQNAgent':
        agent = DDQNAgent(
            action_num=env.action_num,
            state_shape=env.state_shape,
        )
    elif agent_type == 'A2CLSTMAgent':
        agent = A2CLSTMAgent(
            action_num=env.action_num, state_shape=env.state_shape,
            trainble=False, discount_factor=0.95,
            critic_lstm_layers=[1, 512], critic_mlp_layers=[3, 512],
            critic_activation_func='tanh', critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001, critic_bacth_size=128,
            actor_lstm_layers=[1, 512], actor_mlp_layers=[3, 512],
            actor_activation_func='tanh', actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001, actor_bacth_size=512,
            entropy_coef=0.5, max_grad_norm=1,
        )
    elif agent_type == 'A2CQPGAgent':
        agent = A2CQPGAgent(
            action_num=env.action_num, state_shape=env.state_shape,
            trainble=False, discount_factor=0.95,
            critic_mlp_layers=[4, 512],
            critic_activation_func='tanh', critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001, critic_bacth_size=128,
            actor_mlp_layers=[4, 512],
            actor_activation_func='tanh', actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001, actor_bacth_size=512,
            entropy_coef=1, max_grad_norm=1,
        )
    elif agent_type == 'A2CLSTMQPGAgent':
        agent = A2CLSTMQPGAgent(
            action_num=env.action_num, state_shape=env.state_shape,
            trainable=False, discount_factor=0.95,
            critic_lstm_layers=[1, 512], critic_mlp_layers=[3, 512],
            critic_activation_func='tanh', critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001, critic_bacth_size=128,
            actor_lstm_layers=[1, 512], actor_mlp_layers=[3, 512],
            actor_activation_func='tanh', actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001, actor_bacth_size=512,
            entropy_coef=0.5, max_grad_norm=1,
        )
    elif agent_type == 'A2CAgent':
        agent = A2CAgent(
            action_num=env.action_num, state_shape=env.state_shape,
            discount_factor=0.95,
            critic_mlp_layers=[4, 512],
            critic_activation_func='tanh', critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001, critic_bacth_size=128,
            actor_mlp_layers=[4, 512],
            actor_activation_func='tanh', actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001, actor_bacth_size=512,
            entropy_coef=1, max_grad_norm=1,
        )
    else:
        raise ValueError(str(agent_type) + ' type does not exist')

    return agent
# Save model
save_dir = 'models/uno_dqn'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
saver = tf.train.Saver()

# Set up the agents
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_size=20000,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[512, 512])
random_agent1 = RandomAgent(action_num=eval_env.action_num)
random_agent2 = RandomAgent(action_num=eval_env.action_num)
env.set_agents([agent, random_agent1, random_agent2])
eval_env.set_agents([agent, random_agent1, random_agent2])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    print('Episode: ' + str(episode))

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
def train_uno():
    # Make environment
    env = rlcard.make('uno', config={'seed': 0})
    eval_env = rlcard.make('uno', config={'seed': 0})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 3000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/uno_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_size=20000,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, random_agent])
        eval_env.set_agents([agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/uno_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
def train(args):
    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(env, args.num_eval_games)[0]
                )

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
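A hedged sketch of the command-line wrapper this train(args) function appears to expect; the flag names mirror the attributes read above (env, algorithm, seed, num_episodes, evaluate_every, num_eval_games, log_dir), while the default values are illustrative assumptions rather than the original script's.

if __name__ == '__main__':
    import argparse

    # Assumed argparse wrapper: every flag corresponds to an args.<name> used in train()
    parser = argparse.ArgumentParser('DQN/NFSP example in RLCard')
    parser.add_argument('--env', type=str, default='leduc-holdem')
    parser.add_argument('--algorithm', type=str, default='dqn', choices=['dqn', 'nfsp'])
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_dqn_result/')
    args = parser.parse_args()

    train(args)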
''' Another example of loading a pre-trained NFSP model on Leduc Hold'em
    Here, we directly load the model from model zoo
'''

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed, tournament
from rlcard import models

# Make environment
env = rlcard.make('leduc-holdem', config={'seed': 0})

# Set a global seed
set_global_seed(0)

# Here we directly load NFSP models from /models module
nfsp_agents = models.load('leduc-holdem-nfsp').agents

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
random_agent = RandomAgent(env.action_num)
env.set_agents([nfsp_agents[0], random_agent])
reward = tournament(env, evaluate_num)[0]
print('Average reward against random agent: ', reward)
from rlcard.agents import RandomAgent
import rlcard
import numpy as np

env = rlcard.make('doudizhu')
env.set_agents([
    RandomAgent(env.action_num),
    RandomAgent(env.action_num),
    RandomAgent(env.action_num)
])

# Let the random agents play rounds of Dou Dizhu and print the length of the
# longest landlord trajectory seen so far
a = 0
for i in range(1000):
    trans, _ = env.run(is_training=False)
    if (len(trans[0]) > a):
        print(len(trans[0]))
        a = len(trans[0])
print('')
def main():
    parser = createParser()
    namespace = parser.parse_args(sys.argv[1:])

    # Random seed
    random_seed = namespace.random_seed

    # Names
    env_name = namespace.env_name
    env_num = 1
    test_name = namespace.test_name
    dir_name = str(env_name) + '_a2c_' + str(test_name) + str(random_seed)

    # Set the iteration numbers and how frequently we evaluate/save the plot
    evaluate_every = namespace.evaluate_every
    evaluate_num = namespace.evaluate_num
    episode_num = namespace.episode_num

    # Train the agent every X steps
    train_every = namespace.train_every
    save_every = namespace.save_every

    # Make environment
    env_rand = rlcard.make(env_name, config={'seed': random_seed})
    eval_env = rlcard.make(env_name, config={'seed': random_seed})

    # The paths for saving the logs and learning curves
    log_dir = './experiments/rl/' + dir_name + '_result'

    # The path for saving the model
    save_dir = 'models/rl/' + dir_name + '_result'

    # Set a global seed
    set_global_seed(random_seed)

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent_rand = RandomAgent(action_num=eval_env.action_num)
    agent_test = A2CLSTMQPGAgent(
        action_num=eval_env.action_num, state_shape=eval_env.state_shape,
        discount_factor=0.95,
        critic_lstm_layers=[1, 512], critic_mlp_layers=[3, 512],
        critic_activation_func='tanh', critic_kernel_initializer='glorot_uniform',
        critic_learning_rate=0.001, critic_bacth_size=128,
        actor_lstm_layers=[1, 512], actor_mlp_layers=[3, 512],
        actor_activation_func='tanh', actor_kernel_initializer='glorot_uniform',
        actor_learning_rate=0.0001, actor_bacth_size=512,
        entropy_coef=0.5,
        entropy_decoy=math.pow(0.1 / 0.5, 1.0 / (episode_num // train_every)),
        max_grad_norm=1,
    )

    if namespace.load_model is not None:
        agent_test.load_model(namespace.load_model)

    env_rand.set_agents([agent_test, agent_rand])
    eval_env.set_agents([agent_test, agent_rand])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir + '/' + test_name)

    envs = [env_rand, ]
    env_num = len(envs)

    for episode in range(episode_num // env_num):
        # Generate data from the environments
        for env in envs:
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent_test.feed(ts)

        if episode % (train_every // env_num) == 0:
            agent_test.train()

        if episode % (save_every // env_num) == 0:
            # Save model
            if not os.path.exists(save_dir + '/' + test_name + str(episode * env_num)):
                os.makedirs(save_dir + '/' + test_name + str(episode * env_num))
            agent_test.save_model(save_dir + '/' + test_name + str(episode * env_num))

        # Evaluate the performance. Play with random agents.
        if episode % (evaluate_every // env_num) == 0:
            print('episode: ', episode * env_num)
            logger.log_performance(episode * env_num, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot(dir_name)

    # Save model
    if not os.path.exists(save_dir + '/' + test_name + str(episode_num)):
        os.makedirs(save_dir + '/' + test_name + str(episode_num))
    agent_test.save_model(save_dir + '/' + test_name + str(episode_num))
def main():
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={'seed': 0, 'env_num': 16, 'game_player_num': 4})
    eval_env = rlcard.make('no-limit-holdem',
                           config={'seed': 0, 'env_num': 16})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1
    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        # saver.restore(sess, os.path.join(save_dir, 'model'))

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            agent2.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agents
            for ts in trajectories[0]:
                agent.feed(ts)
            for ts in trajectories[2]:
                agent2.feed(ts)

            # Evaluate the performance: the DQN agent plays against the NFSP agent
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(sess, os.path.join(save_dir, 'model_final'))
import rlcard
from rlcard.agents import RandomAgent
from rlcard.agents import BlackjackHumanAgent as HumanAgent
from rlcard.utils.utils import print_card

# Make environment
num_players = 2
env = rlcard.make(
    'blackjack',
    config={
        'game_num_players': num_players,
    },
)
human_agent = HumanAgent(env.num_actions)
random_agent = RandomAgent(env.num_actions)
env.set_agents([
    human_agent,
    random_agent,
])

print(">> Blackjack human agent")

while (True):
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print the other players' actions
    if len(trajectories[0]) != 0:
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")

    #os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={'game_player_num': 2, 'seed': 477})
    eval_env = rlcard.make('no-limit-holdem',
                           config={'seed': 12, 'game_player_num': 2})
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={'seed': 43, 'game_player_num': 2})
    #eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})

    # Set the iteration numbers and how frequently we evaluate the performance

    # The initial memory size
    memory_init_size = 1000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/no_all_in'

    # Set a global seed
    set_global_seed(477)

    graph = tf.Graph()
    tf.ConfigProto()
    sess = tf.Session(graph=graph)

    evaluate_every = 2048
    evaluate_num = 32
    episode_num = 24576

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256

    agents = []
    with graph.as_default():
        """
        Reference: NFSPAgent.__init__ signature

        def __init__(self, sess, scope, action_num=4, state_shape=None,
                     hidden_layers_sizes=None, reservoir_buffer_capacity=int(1e6),
                     anticipatory_param=0.1, batch_size=256, train_every=1,
                     rl_learning_rate=0.1, sl_learning_rate=0.005,
                     min_buffer_size_to_learn=1000, q_replay_memory_size=30000,
                     q_replay_memory_init_size=1000, q_update_target_estimator_every=1000,
                     q_discount_factor=0.99, q_epsilon_start=0.06, q_epsilon_end=0,
                     q_epsilon_decay_steps=int(1e6), q_batch_size=256, q_train_every=1,
                     q_mlp_layers=None, evaluate_with='average_policy'):
        """
        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num, state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01, sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=80000,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44, q_train_every=train_every,
                      q_mlp_layers=[512, 512]))
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num, state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01, sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      q_replay_memory_size=80000,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44, q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\iivan')
    print('-------------------------------------------------------------------------------------')
    # print(check_point_path)

    # today's project :)
    # https://stackoverflow.com/questions/33758669/running-multiple-tensorflow-sessions-concurrently
    with sess.as_default():
        with graph.as_default():
            # saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)
            env.set_agents(agents)
            eval_env.set_agents([agents[0], random_agent])
            eval_env2.set_agents([random_agent, agents[1]])
            # eval_env3.set_agents([agents[1], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                print(episode, end='\r')

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agents
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log('\n\n\n---------------------------------------------------------------\nTournament ' + str(episode / evaluate_every))
                    # tournament(eval_env2, 6)
                    # exploitability.exploitability(eval_env, agents[0], 500)
                    res = tournament(env, evaluate_num)
                    logger.log_performance(env.timestep, res[0])
                    res2 = tournament(eval_env, evaluate_num // 3)
                    logger.log_performance(env.timestep, res2[0])
                    res3 = tournament(eval_env2, evaluate_num // 3)
                    logger.log_performance(env.timestep, res3[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) + '\n')
                    logger.log('\n\n----------------------------------------------------------------')

                if episode % (evaluate_every) == 0 and not episode == 0:
                    save_dir = 'models/nolimit_holdem_nfsp/no_all_in/cp/' + str(episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            # Final evaluation after training
            logger.log('\n\n\n---------------------------------------------------------------\nTournament ' + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/no_all_in'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")

    #os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={'record_action': False, 'game_player_num': 2})
    eval_env = rlcard.make('no-limit-holdem',
                           config={'seed': 12, 'game_player_num': 2})
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={'seed': 43, 'game_player_num': 2})

    # Set the iteration numbers and how frequently we evaluate the performance

    # The initial memory size
    memory_init_size = 1000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/1v1MCNFSPv3'

    # Set a global seed
    set_global_seed(0)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    evaluate_every = 1000
    evaluate_num = 250
    episode_num = 5000

    # The initial memory size
    memory_init_size = 1500

    # Train the agent every X steps
    train_every = 256

    agents = []
    with graph.as_default():
        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num, state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=.1,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every, q_train_every=train_every,
                      q_mlp_layers=[512, 512]))
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num, state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.075,
                      rl_learning_rate=0.075,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every // 2, q_train_every=train_every // 2,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\1v1MCNFSPv3\\cp\\10')
    print('-------------------------------------------------------------------------------------')
    # print(check_point_path)

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)
            #easy_agent = nfsp_agents[0]
            print(agents)
            # print(nfsp_agents)
            env.set_agents(agents)
            eval_env.set_agents(agents)
            eval_env2.set_agents([agents[0], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                table = []

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agents
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts, table)

                # Evaluate the performance against each other and a random agent
                if episode % evaluate_every == 0:
                    logger.log('\n\n\n---------------------------------------------------------------\nTournament ' + str(episode / evaluate_every))
                    res = tournament(eval_env, evaluate_num)
                    res2 = tournament(eval_env2, evaluate_num // 4)
                    logger.log_performance(env.timestep, res[0])
                    logger.log_performance(env.timestep, res2[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) + '\n')
                    logger.log('\n\n----------------------------------------------------------------')

                if episode % (evaluate_every) == 0 and not episode == 0:
                    save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good' + str(episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            # Final evaluation after training
            logger.log('\n\n\n---------------------------------------------------------------\nTournament ' + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
# The initial memory size
memory_init_size = 100

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/blackjack_dqn_result/'

# Set a global seed
set_global_seed(0)

# Set up the agents
agent = RandomAgent(action_num=env.action_num)
env.set_agents([agent])
eval_env.set_agents([agent])

# Initialize global variables

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
from itertools import combinations, combinations_with_replacement

import rlcard
from rlcard.utils import set_global_seed
from rlcard.agents import RandomAgent
from yaniv_rl.models.yaniv_rule_models import YanivNoviceRuleAgent, YanivIntermediateRuleAgent
from yaniv_rl import utils
from rlcard.envs.registration import register, make

register(
    env_id='yaniv',
    entry_point='yaniv_rl.envs.yaniv:YanivEnv',
)

agents = [
    RandomAgent(488),
    YanivNoviceRuleAgent(),
    YanivIntermediateRuleAgent()
]

# Make environment
env = make('yaniv', config={'seed': 0})
eval_num = 10000

table = [[0 for i in range(3)] for i in range(3)]

for i in range(3):
    # player v player
    env.set_agents([agents[i], agents[i]])
    res = utils.tournament(env, eval_num)
    winrate = res['wins'][0] / eval_num
    table[i][i] = winrate
''' A toy example of playing Whale with random agents
'''

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed

# Make environment
env = rlcard.make('whale', config={'seed': 0, 'num_players': 4})
episode_num = 5

# Set a global seed
set_global_seed(0)

# Set up agents
agent_0 = RandomAgent(action_num=env.action_num)
agent_1 = RandomAgent(action_num=env.action_num)
agent_2 = RandomAgent(action_num=env.action_num)
agent_3 = RandomAgent(action_num=env.action_num)
env.set_agents([agent_0, agent_1, agent_2, agent_3])

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=False)

    # Print out the trajectories
    print('\nEpisode {}'.format(episode))
    for ts in trajectories[0]:
        print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.format(
            ts[0], ts[1], ts[2], ts[3], ts[4]))
from itertools import combinations, combinations_with_replacement

import rlcard
from rlcard.utils import set_global_seed
from rlcard.agents import RandomAgent
from yaniv_rl.models.yaniv_rule_models import YanivNoviceRuleAgent, YanivIntermediateRuleAgent
from yaniv_rl import utils
from rlcard.envs.registration import register, make
import sys

register(
    env_id='yaniv',
    entry_point='yaniv_rl.envs.yaniv:YanivEnv',
)

agents = [
    RandomAgent(488),
    YanivNoviceRuleAgent(),
    YanivIntermediateRuleAgent()
]

# Make environment
eval_num = 10000
env = make('yaniv', config={'seed': 0, 'starting_player': 0})

table = [[0 for i in range(3)] for i in range(3)]

for i in range(3):
    # player v player
    env.set_agents([agents[i], agents[i]])
    res = utils.tournament(env, eval_num)
    print("{} vs {}: ".format(i, i), res)
    winrate = res['wins'][0] / eval_num
    table[i][i] = winrate

for agent_1, agent_2 in combinations(agents, 2):
    a1i = agents.index(agent_1)
    a2i = agents.index(agent_2)
# with tf.Session(config=config) as sess:
with tf.Session() as sess:
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent, random_agent])
    eval_env.set_agents([agent, random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
def main():
    wandb_config = wandb.config
    config = {}
    hyperparams = {}
    for key in wandb_config.keys():
        if key in default_config:
            config[key] = wandb_config[key]
        elif key in default_hyperparams:
            hyperparams[key] = wandb_config[key]

    # Make environment
    env = make("yaniv", config=config)
    eval_env = make("yaniv", config=config)

    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(scope="nfsp" + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          device=torch.device("cuda"),
                          **hyperparams)
        agents.append(agent)

        if load_model is not None:
            state_dict = torch.load(load_model)
            policy_dict = state_dict[load_scope]
            agent.policy_network.load_state_dict(policy_dict)

            q_key = load_scope + "_dqn_q_estimator"
            agent._rl_agent.q_estimator.qnet.load_state_dict(state_dict[q_key])
            target_key = load_scope + "_dqn_target_estimator"
            agent._rl_agent.target_estimator.qnet.load_state_dict(state_dict[target_key])

    rule_agent = YanivNoviceRuleAgent(single_step=config["single_step_actions"])
    random_agent = RandomAgent(action_num=env.action_num)

    def agent_feed(agent, trajectories):
        for transition in trajectories:
            agent.feed(transition)

    def save_function(agent, model_dir):
        torch.save(agent.get_state_dict(),
                   os.path.join(model_dir, "model_{}.pth".format(i)))

    e = ExperimentRunner(
        env,
        eval_env,
        log_every=100,
        save_every=100,
        base_dir="yaniv_nfsp_pytorch",
        config=config,
        training_agent=agents[0],
        vs_agent=agents[1],
        feed_function=agent_feed,
        save_function=save_function,
    )

    e.run_training(
        episode_num=50000,
        eval_every=200,
        eval_vs=[random_agent, rule_agent],
        eval_num=100,
    )