def train_leduc():
    # Make environment; allow_step_back is required for CFR-style tree traversal
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

    # Set the iteration numbers and how frequently we evaluate the performance and save the model
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 10000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_oscfr_result/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the Outcome-Sampling CFR agent
    model_path = 'models/leduc_holdem_oscfr'
    agent = OutcomeSampling_CFR(env, model_path=model_path)
    agent.load()  # If we have a saved model, load it first

    # Evaluate OSCFR against a pre-trained NFSP model
    eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')

        # Evaluate the performance. Play with NFSP agents.
        if episode % evaluate_every == 0:
            agent.save()  # Save model
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('OSCFR')
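# A hedged sketch of the imports the script above assumes (module paths follow
# the RLCard 0.x examples and may differ in other versions; OutcomeSampling_CFR
# is a project-local agent, not part of the RLCard package):
import os

import rlcard
from rlcard import models
from rlcard.utils import set_global_seed, tournament, Logger

# Assumed entry point for the script above:
if __name__ == '__main__':
    train_leduc()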
def main():
    # Make environment
    env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})
    eval_env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})

    # Set the iteration numbers and how frequently we evaluate performance
    evaluate_every = 100
    evaluate_num = 10000
    iteration_num = 100000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/blackjack_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.compat.v1.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[10, 10])
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Initialize global variables
        sess.run(tf.compat.v1.global_variables_initializer())

        # Initialize a Logger to plot the learning curve
        logger = Logger(log_dir)

        for iteration in range(iteration_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if iteration % evaluate_every == 0:
                logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/blackjack_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.compat.v1.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
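# The TF1-style graph code above (tf.compat.v1.Session) only runs under
# TensorFlow 2.x if eager execution is disabled first; a hedged sketch of the
# assumed setup and entry point:
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

if __name__ == '__main__':
    main()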
        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            dqn_agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            # Run the tournament once and log each agent's payoff
            payoffs = tournament(eval_env, evaluate_num)
            logger_mcts.log_performance(env.timestep, payoffs[0])
            logger_dqn.log_performance(env.timestep, payoffs[1])

    # Close files in the logger
    logger_mcts.close_files()
    logger_dqn.close_files()

    # Plot the learning curve
    logger_mcts.plot('MCTS')
    logger_dqn.plot('DQN')

    mcts_agent.action_df.to_csv(os.path.join(log_dir_mcts, 'action.csv'))

    # Save model
    save_dir = os.path.join(log_dir_dqn, 'models/limit_holdem_dqn')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            nfsp_agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            # Run the tournament once and log each agent's payoff
            payoffs = tournament(eval_env, evaluate_num)
            logger_mcts.log_performance(env.timestep, payoffs[0])
            logger_nfsp.log_performance(env.timestep, payoffs[1])

    # Close files in the logger
    logger_mcts.close_files()
    logger_nfsp.close_files()

    # Plot the learning curve
    logger_mcts.plot('MCTS')
    logger_nfsp.plot('NFSP')

    mcts_agent.action_df.to_csv(os.path.join(log_dir_mcts, 'action.csv'))

    # Save model
    save_dir = os.path.join(log_dir_nfsp, 'models/limit_holdem_nfsp')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
            print(state_dict.keys())
            torch.save(state_dict, os.path.join(save_dir, 'model-' + agnt._scope + '.pth'))

            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('NFSP')

    save_dir = 'models/ivvan/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    state_dict = agent.get_state_dict()
    print(state_dict.keys())
    torch.save(state_dict, os.path.join(save_dir, 'model.pth'))

    # # Make environment
    # env = rlcard.make('leduc-holdem', config={'seed': 0})
    # eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

    # # Set the iteration numbers and how frequently we evaluate the performance
    # evaluate_every = 100
    # evaluate_num = 1000
    log_dir = './experiments/blackjack_mcts_result/'

    # Set a global seed
    set_global_seed(0)

    # Set up the agents
    agent = MPMCTSAgent(eval_env)
    rdm_agent = RandomAgent(action_num=eval_env.action_num)
    eval_env.set_agents([agent, agent, agent])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(eval_env.timestep, mcts_tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('MCTS')

    # Save model
    save_dir = 'models/blackjack_mcts'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Note: Saver() raises "No variables to save" unless TF variables exist in
    # the graph; the MCTS agent itself defines none
    saver = tf.train.Saver()
    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN_random')

    # Save model
    save_dir = 'models/dqn_random'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    state_dict = agent.get_state_dict()
    print(state_dict.keys())
    torch.save(state_dict, os.path.join(save_dir, 'model.pth'))
            # agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            payoffs, peasant_wins, landlord_wins = tournament(eval_env, evaluate_num)
            logger.log_performance(episode, payoffs[0])
            # print("DQN: ", peasant_wins, " and ", landlord_wins)
            logger.log_peasants(episode, peasant_wins / evaluate_num)
            logger.log_landlord(episode, landlord_wins / evaluate_num)

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('Random', 'peasant_wins')
    logger.plot('Random', 'reward')
    logger.plot('Random', 'landlord_wins')

    # Save model: find an unused numbered directory
    # (the original loop never updated save_dir inside the while, so it could spin forever)
    nr = 0
    save_dir = 'models/doudizhu_random_' + str(nr)
    while os.path.exists(save_dir):
        nr += 1
        save_dir = 'models/doudizhu_random_' + str(nr)
    os.makedirs(save_dir)
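# The unused-directory search above is a recurring pattern in these scripts; a
# hedged, reusable sketch (the helper name `next_free_dir` is hypothetical, not
# from the source):
import os

def next_free_dir(prefix):
    """Return the first 'prefix<N>' path that does not exist yet, creating it."""
    n = 0
    while os.path.exists(prefix + str(n)):
        n += 1
    path = prefix + str(n)
    os.makedirs(path)
    return path

# Usage, matching the naming scheme above:
# save_dir = next_free_dir('models/doudizhu_random_')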
        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            if episode > 0:
                current_time = time.time()
                episodes_per_sec = episode / (current_time - start_time)
                remaining_mins = (episode_num - episode) / episodes_per_sec / 60
                print(
                    f"Current Rate: {episodes_per_sec:.2f}, Estimated Time Remaining: {remaining_mins:.2f} mins"
                )
            reward = tournament(eval_env, evaluate_num)[0]
            logger.log_performance(env.timestep, reward)
            with open(os.path.join(log_dir, "perf.csv"), "a+") as fd:
                fieldnames = ['timestep', 'reward']
                writer = csv.DictWriter(fd, fieldnames=fieldnames)
                if episode == 0:
                    writer.writeheader()
                writer.writerow({'timestep': env.timestep, 'reward': reward})

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('PPO')

    # Save model
    save_dir = 'models/nolimit_holdem_ppo'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
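# The CSV logging above writes the header only on episode 0, so a restarted run
# that appends to an existing perf.csv never writes one. A hedged alternative
# that keys the header off the file itself (same fields as above; the helper
# name `append_perf_row` is hypothetical):
import csv
import os

def append_perf_row(log_dir, timestep, reward):
    path = os.path.join(log_dir, 'perf.csv')
    # Write the header only when the file is new or empty
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='') as fd:
        writer = csv.DictWriter(fd, fieldnames=['timestep', 'reward'])
        if write_header:
            writer.writeheader()
        writer.writerow({'timestep': timestep, 'reward': reward})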
            loss_logger.log_performance(episode, loss)

            # Checkpoint whenever either win rate reaches a new best
            save_flag = False
            if payoffs1[0] > max_L_WR:
                max_L_WR = payoffs1[0]
                save_flag = True
            if payoffs2[1] > max_P_WR:
                max_P_WR = payoffs2[1]
                save_flag = True
            if save_flag:
                save_dir = best_model_path
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                saver = tf.train.Saver()
                saver.save(sess, os.path.join(save_dir, 'best_model'))

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    loss_logger.plot('DQN loss')
    L_WR_logger.plot('DQN L WR')
    P_WR_logger.plot('DQN P WR')

    # Save model
    # Note: this final save overwrites the best checkpoint with the last weights
    save_dir = best_model_path
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'best_model'))
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")
    # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Make environment
    env = rlcard.make('no-limit-holdem', config={'record_action': False, 'game_player_num': 2})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 12, 'game_player_num': 2})
    eval_env2 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})

    # Set the iteration numbers and how frequently we evaluate the performance
    # The initial memory size
    memory_init_size = 1000  # overwritten below

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/1v1MCNFSPv3'

    # Set a global seed
    set_global_seed(0)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    evaluate_every = 1000
    evaluate_num = 250
    episode_num = 5000

    # The initial memory size
    memory_init_size = 1500

    # Train the agent every X steps
    train_every = 256

    agents = []
    with graph.as_default():
        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=.1,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.075,
                      rl_learning_rate=0.075,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every // 2,
                      q_train_every=train_every // 2,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\1v1MCNFSPv3\\cp\\10')
    print('-------------------------------------------------------------------------------------')
    # print(check_point_path)

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)
            # easy_agent = nfsp_agents[0]
            print(agents)
            # print(nfsp_agents)

            env.set_agents(agents)
            eval_env.set_agents(agents)
            eval_env2.set_agents([agents[0], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()
                table = []

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts, table)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\nTournament '
                        + str(episode / evaluate_every))
                    res = tournament(eval_env, evaluate_num)
                    res2 = tournament(eval_env2, evaluate_num // 4)
                    logger.log_performance(env.timestep, res[0])
                    logger.log_performance(env.timestep, res2[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) + '\n')
                    logger.log('\n\n----------------------------------------------------------------')

                # Periodically checkpoint the model (skip episode 0)
                if episode % evaluate_every == 0 and episode != 0:
                    save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good' + str(episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            # Final tournament after training
            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
def main():
    # Make environment
    env = rlcard.make('limit-holdem', config={'seed': 0, 'env_num': 16})
    eval_env = rlcard.make('limit-holdem', config={'seed': 0, 'env_num': 16})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 1000000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    # Only checkpoint when the evaluation reward beats this running maximum
    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/limit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent])
        eval_env.set_agents([agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Restore a previously saved model before continuing training
        save_dir = 'models/limit_holdem_dqn'
        saver = tf.train.Saver()
        saver.restore(sess, os.path.join(save_dir, 'model'))

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)
            if len(trajectories) > 1:
                for ts in trajectories[1]:
                    agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                # Save only when the reward improves on the best seen so far
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')
    agent1 = limitholdem_rule_models.LimitholdemRuleAgentV1()
    agent2 = MCTS_Agent(action_num=env.action_num,
                        duration=duration,
                        exploration=explore,
                        model_action=model_action,
                        model_hand_rank=model_hand_rank)
    env.set_agents([agent2, agent1])
    eval_env.set_agents([agent2, agent1])

    for i in range(num_tournaments):
        logger.log_performance(i * 10, tournament(eval_env, evaluate_num)[0])

    # for episode in range(episode_num):
    #
    #     # Generate data from the environment
    #     trajectories, _ = env.run(is_training=True)
    #
    #     # print(trajectories)
    #
    #     # Evaluate the performance. Play with random agents.
    #     if episode % evaluate_every == 0:
    #         logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    agent2.action_df.to_csv(os.path.join(log_dir, 'action.csv'))

    # Plot the learning curve
    logger.plot(name)
def train_uno():
    # Make environment
    env = rlcard.make("uno", config={"seed": 0})
    eval_env = rlcard.make("uno", config={"seed": 0})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 3000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 100

    # The paths for saving the logs and learning curves
    log_dir = "./experiments/uno_results_dqn/"

    # Set a global seed
    set_global_seed(0)

    params = {
        "scope": "DQN-Agent",
        "num_actions": env.action_num,
        "replay_memory_size": memory_init_size,
        "num_states": env.state_shape,
        "discount_factor": 0.99,
        "epsilon_start": 1.0,
        "epsilon_end": 0.1,
        "epsilon_decay_steps": 20000,
        "batch_size": 32,
        "train_every": 1,
        "mlp_layers": [512, 512],
        "lr": 0.0005,
    }
    agent_conf = DQN_conf(**params)
    agent = DQN_agent(agent_conf)
    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot("DQN UNO")

    # Save model
    save_dir = "models/uno_dqn_pytorch"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    state_dict = agent.get_state_dict()
    print(state_dict.keys())
    torch.save(state_dict, os.path.join(save_dir, "model.pth"))
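# Reloading the checkpoint saved above (a hedged sketch: torch.load itself is
# standard, but how the loaded dict is pushed back into the custom DQN_agent
# depends on that class's API, which is not shown here):
checkpoint = torch.load(os.path.join("models/uno_dqn_pytorch", "model.pth"))
print(checkpoint.keys())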
    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_cfr_result/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the CFR agent
    agent = CFRAgent(env)
    agent.load()  # If we have a saved model, load it first

    # Evaluate CFR against a pre-trained NFSP model
    eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')

        # Evaluate the performance. Play with NFSP agents.
        if episode % evaluate_every == 0:
            agent.save()  # Save model
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('CFR')
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")
    # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Make environment
    env = rlcard.make('no-limit-holdem', config={'game_player_num': 2, 'seed': 477})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 12, 'game_player_num': 2})
    eval_env2 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})
    # eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})

    # Set the iteration numbers and how frequently we evaluate the performance
    # The initial memory size
    memory_init_size = 1000  # overwritten below

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/no_all_in'

    # Set a global seed
    set_global_seed(477)

    graph = tf.Graph()
    tf.ConfigProto()  # constructed but never passed to the Session
    sess = tf.Session(graph=graph)

    evaluate_every = 2048
    evaluate_num = 32
    episode_num = 24576

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256

    agents = []
    with graph.as_default():
        """
        NFSPAgent's signature, kept here for reference:
        def __init__(self, sess, scope, action_num=4, state_shape=None,
                     hidden_layers_sizes=None, reservoir_buffer_capacity=int(1e6),
                     anticipatory_param=0.1, batch_size=256, train_every=1,
                     rl_learning_rate=0.1, sl_learning_rate=0.005,
                     min_buffer_size_to_learn=1000, q_replay_memory_size=30000,
                     q_replay_memory_init_size=1000, q_update_target_estimator_every=1000,
                     q_discount_factor=0.99, q_epsilon_start=0.06, q_epsilon_end=0,
                     q_epsilon_decay_steps=int(1e6), q_batch_size=256, q_train_every=1,
                     q_mlp_layers=None, evaluate_with='average_policy'):
        """
        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=80000,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      q_replay_memory_size=80000,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\iivan')
    print('-------------------------------------------------------------------------------------')
    # print(check_point_path)

    # Today's project :)
    # https://stackoverflow.com/questions/33758669/running-multiple-tensorflow-sessions-concurrently
    with sess.as_default():
        with graph.as_default():
            # saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)

            env.set_agents(agents)
            eval_env.set_agents([agents[0], random_agent])
            eval_env2.set_agents([random_agent, agents[1]])
            # eval_env3.set_agents([agents[1], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                print(episode, end='\r')
                # print('oh')

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\nTournament '
                        + str(episode / evaluate_every))
                    # tournament(eval_env2, 6)
                    # exploitability.exploitability(eval_env, agents[0], 500)
                    res = tournament(env, evaluate_num)
                    logger.log_performance(env.timestep, res[0])
                    res2 = tournament(eval_env, evaluate_num // 3)
                    logger.log_performance(env.timestep, res2[0])
                    res3 = tournament(eval_env2, evaluate_num // 3)
                    logger.log_performance(env.timestep, res3[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) + '\n')
                    logger.log('\n\n----------------------------------------------------------------')

                # Periodically checkpoint the model (skip episode 0)
                if episode % evaluate_every == 0 and episode != 0:
                    save_dir = 'models/nolimit_holdem_nfsp/no_all_in/cp/' + str(episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            # Final tournament after training
            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/no_all_in'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
    for episode in range(episode_num):

        # First sample a policy for the episode
        agent.sample_episode_policy()

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('NFSP_random')

    # Save model
    save_dir = 'models/nfsp_random'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    state_dict = agent.get_state_dict()
    print(state_dict.keys())
    torch.save(state_dict, os.path.join(save_dir, 'model.pth'))
            ## new with loss:
            payoffs, peasant_wins, landlord_wins, agent_peasant_wins, agent_landlord_wins = tournament(eval_env, evaluate_num)
            logger.log_performance(episode, payoffs[role_counter])
            # print("DQN: ", peasant_wins, " and ", landlord_wins)
            logger.log_peasants(episode, peasant_wins / evaluate_num)
            logger.log_landlord(episode, landlord_wins / evaluate_num)
            logger.log_loss(episode, agent.get_loss())
            logger.log_agent_peasant(episode, agent_peasant_wins)
            logger.log_agent_landlord(episode, agent_landlord_wins)

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN', 'peasant_wins')
    logger.plot('DQN', 'reward')
    logger.plot('DQN', 'landlord_wins')
    logger.plot('DQN', 'loss')
    logger.plot('DQN', 'agent_peasant_wins')
    logger.plot('DQN', 'agent_landlord_wins')

    # algorithm_list = ['peasant_wins', 'reward', 'landlord_wins', 'agent_landlord_wins', 'agent_peasant_wins']
    # plotlist = ['peasant_wins', 'reward', 'landlord_wins', 'agent_landlord_wins', 'agent_peasant_wins']
    algorithm_list = ['reward', 'agent_landlord_wins', 'agent_peasant_wins']
    plotlist = ['reward', 'agent_landlord_wins', 'agent_peasant_wins']
    logger.plot_all(algorithm_list, plotlist)

    # Save the model
    nr = 0
    log_dir = '/Users/zacharydawson/artificial-intelligence/poker/data/simulation_outputs/self'
    logger = Logger(log_dir)

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent1 = MCTS_Agent(action_num=env.action_num)
    agent2 = MCTS_Agent(action_num=env.action_num)
    env.set_agents([agent1, agent2])
    eval_env.set_agents([agent1, agent2])

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)
        # print(trajectories)

        # Evaluate the performance
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('SELF')
        time_start = time.time()
        payoffs2 = general_tournament(eval_env, evaluate_num, False)
        logger.log("episode:{} time:{} peasant winrate:{}".format(
            episode, time.time() - time_start, payoffs2[1]))
        P_WR_logger.log_performance(episode, payoffs2[1])

        # Checkpoint whenever either win rate reaches a new best
        save_flag = False
        if payoffs1[0] > max_L_WR:
            max_L_WR = payoffs1[0]
            save_flag = True
        if payoffs2[1] > max_P_WR:
            max_P_WR = payoffs2[1]
            save_flag = True
        if save_flag:
            print('saving model')
            agent.save_trainable_param_to_file(sess, best_model_path)

        # loss_logger.plot('DRQN loss')
        L_WR_logger.plot('DRQN L WR')
        P_WR_logger.plot('DRQN P WR')
        sys.stdout.flush()

    # Close files in the logger
    logger.close_files()
    L_WR_logger.close_files()
    P_WR_logger.close_files()

    # Plot the learning curve

    # Save model
    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        for agent in agents:
            agent.train()

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DeepCFR')

    # Save model
    save_dir = 'models/nolimit_holdem_deepcfr'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
    log_dir = '/Users/zacharydawson/artificial-intelligence/poker/data/simulation_outputs/random'
    logger = Logger(log_dir)

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent1 = RandomAgent(action_num=env.action_num)
    agent2 = MCTS_Agent(action_num=env.action_num)
    env.set_agents([agent1, agent2])
    eval_env.set_agents([agent1, agent2])

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)
        # print(trajectories)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('RANDOM')
def main():
    parser = createParser()
    namespace = parser.parse_args(sys.argv[1:])

    # Random seed
    random_seed = namespace.random_seed

    # Names
    env_name = namespace.env_name
    env_num = 1
    test_name = namespace.test_name
    dir_name = str(env_name) + '_a2c_' + str(test_name) + str(random_seed)

    # Set the iteration numbers and how frequently we evaluate/save the plot
    evaluate_every = namespace.evaluate_every
    evaluate_num = namespace.evaluate_num
    episode_num = namespace.episode_num

    # Train the agent every X steps
    train_every = namespace.train_every
    save_every = namespace.save_every

    # Make environment
    env_rand = rlcard.make(env_name, config={'seed': random_seed})
    eval_env = rlcard.make(env_name, config={'seed': random_seed})

    # The paths for saving the logs and learning curves
    log_dir = './experiments/rl/' + dir_name + '_result'

    # The path for saving the model
    save_dir = 'models/rl/' + dir_name + '_result'

    # Set a global seed
    set_global_seed(random_seed)

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent_rand = RandomAgent(action_num=eval_env.action_num)
    agent_test = A2CLSTMQPGAgent(
        action_num=eval_env.action_num,
        state_shape=eval_env.state_shape,
        discount_factor=0.95,
        critic_lstm_layers=[1, 512],
        critic_mlp_layers=[3, 512],
        critic_activation_func='tanh',
        critic_kernel_initializer='glorot_uniform',
        critic_learning_rate=0.001,
        critic_bacth_size=128,
        actor_lstm_layers=[1, 512],
        actor_mlp_layers=[3, 512],
        actor_activation_func='tanh',
        actor_kernel_initializer='glorot_uniform',
        actor_learning_rate=0.0001,
        actor_bacth_size=512,
        entropy_coef=0.5,
        entropy_decoy=math.pow(0.1 / 0.5, 1.0 / (episode_num // train_every)),
        max_grad_norm=1,
    )
    if namespace.load_model is not None:
        agent_test.load_model(namespace.load_model)

    env_rand.set_agents([agent_test, agent_rand])
    eval_env.set_agents([agent_test, agent_rand])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir + '/' + test_name)

    envs = [env_rand, ]
    env_num = len(envs)

    for episode in range(episode_num // env_num):

        # Generate data from the environments
        for env in envs:
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent_test.feed(ts)

        if episode % (train_every // env_num) == 0:
            agent_test.train()

        if episode % (save_every // env_num) == 0:
            # Save model
            if not os.path.exists(save_dir + '/' + test_name + str(episode * env_num)):
                os.makedirs(save_dir + '/' + test_name + str(episode * env_num))
            agent_test.save_model(save_dir + '/' + test_name + str(episode * env_num))

        # Evaluate the performance. Play with random agents.
        if episode % (evaluate_every // env_num) == 0:
            print('episode: ', episode * env_num)
            logger.log_performance(episode * env_num, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot(dir_name)

    # Save model
    if not os.path.exists(save_dir + '/' + test_name + str(episode_num)):
        os.makedirs(save_dir + '/' + test_name + str(episode_num))
    agent_test.save_model(save_dir + '/' + test_name + str(episode_num))
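# `createParser` is called above but not shown; a minimal sketch that supplies
# every attribute the script reads from `namespace` (flag names mirror those
# attributes, default values are assumptions):
import argparse

def createParser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--random_seed', type=int, default=0)
    parser.add_argument('--env_name', type=str, default='limit-holdem')
    parser.add_argument('--test_name', type=str, default='test')
    parser.add_argument('--evaluate_every', type=int, default=1000)
    parser.add_argument('--evaluate_num', type=int, default=1000)
    parser.add_argument('--episode_num', type=int, default=100000)
    parser.add_argument('--train_every', type=int, default=64)
    parser.add_argument('--save_every', type=int, default=10000)
    parser.add_argument('--load_model', type=str, default=None)
    return parser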
        # Generate data from the environment
        agent.reset_step_history()
        trajectories, _ = env.run(is_training=True)

        # Feed the episode's transitions into agent memory as one sequence,
        # and train the agent
        trans_history = []
        for ts in trajectories[0]:
            trans_history.append(ts)
        agent.feed(trans_history)
        # print(episode)

        if episode % evaluate_every == 0:
            print('')
            logger.log_performance(env.timestep, general_tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DRQN')

    # Save model
    save_dir = 'models/blackjack_drqn'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN')

    # Save model
    save_dir = 'models/doudizhu_dqn'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
    log_dir = './experiments/cfr_random_result/'

    # Set a global seed
    set_global_seed(0)

    # Set up the agents
    agent = CFRAgent(env=env, model_path='./cfr_random_model')
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            agent.save()  # Save model
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('CFR_random')