def test_log(self):
    log_dir = "./newtest/test_log.txt"
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    logger = Logger(log_dir)
    logger.log("test text")
    logger.log_performance(1, 1)
    logger.log_performance(2, 2)
    logger.log_performance(3, 3)
    logger.close_files()
    logger.plot('aaa')
state = env.reset()
for timestep in range(timesteps):
    action = agent.step(state)
    next_state, reward, done = env.step(action)
    ts = (state, action, reward, next_state, done)
    agent.feed(ts)
    # Move to the next state; start a new episode when the current one ends
    state = env.reset() if done else next_state

    # Evaluate the performance every `evaluate_every` steps
    if timestep % evaluate_every == 0:
        rewards = []
        eval_state = eval_env.reset()
        for _ in range(evaluate_num):
            action, _ = agent.eval_step(eval_state)
            # Step the evaluation environment, not the training one
            eval_state, reward, done = eval_env.step(action)
            if done:
                rewards.append(reward)
                eval_state = eval_env.reset()
        logger.log_performance(env.timestep, np.mean(rewards))

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('DQN')

# Save model
save_dir = 'models/uno_single_dqn'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
saver = tf.train.Saver()
saver.save(sess, os.path.join(save_dir, 'model'))
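# A minimal sketch (not part of the original script) of restoring the
# checkpoint written above. It assumes the same graph -- the DQN agent with
# identical variable scopes -- has already been rebuilt in the new session,
# since tf.train.Saver can only restore variables that exist in the graph.
with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, os.path.join('models/uno_single_dqn', 'model'))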
# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_cfr_result/'

# Set a global seed
set_global_seed(0)

# Initialize CFR Agent
agent = CFRAgent(env)
agent.load()  # If we have saved model, we first load the model

# Evaluate CFR against pre-trained NFSP
eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    agent.train()
    print('\rIteration {}'.format(episode), end='')

    # Evaluate the performance. Play with NFSP agents.
    if episode % evaluate_every == 0:
        agent.save()  # Save model
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('CFR')
scope="dqn", collection=tf.GraphKeys.TRAINABLE_VARIABLES) variables = [var.eval() for var in variables] for task in tasks: INPUT_QUEUE.put((task, False, variables, agent.total_t)) for _ in range(evaluate_num): payoffs = OUTPUT_QUEUE.get() reward += payoffs[0] logger.log('\n########## Evaluation ##########') logger.log('Average reward is {}'.format( float(reward) / evaluate_num)) # Close files in the logger logger.close_files() # Plot the learning curve logger.plot('DQN_multi_process') # Save model save_dir = 'models/leduc_dqn_multi' if not os.path.exists(save_dir): os.makedirs(save_dir) saver = tf.train.Saver() saver.save(sess, os.path.join(save_dir, 'model')) # Close multi-processes for _ in range(PROCESS_NUM): INPUT_QUEUE.put(None) INPUT_QUEUE.join() for p in PROCESSES:
def train_mahjong():
    # Make environment
    env = rlcard.make('mahjong', config={'seed': 0})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the iteration numbers and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 1000
    episode_num = 10000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_nfsp_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:
        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[512, 512],
                              anticipatory_param=0.5,
                              batch_size=256,
                              rl_learning_rate=0.00005,
                              sl_learning_rate=0.00001,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=int(1e5),
                              q_replay_memory_init_size=memory_init_size,
                              train_every=train_every,
                              q_train_every=train_every,
                              q_batch_size=256,
                              q_mlp_layers=[512, 512])
            agents.append(agent)
        random_agent = RandomAgent(action_num=eval_env.action_num)

        env.set_agents(agents)
        eval_env.set_agents([agents[0], random_agent, random_agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in tqdm(range(episode_num)):
            # First sample a policy for the episode
            for agent in agents:
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for i in range(env.player_num):
                for ts in trajectories[i]:
                    agents[i].feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('NFSP')

        # Save model
        save_dir = 'models/mahjong_nfsp'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
    # First sample a policy for the episode
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('NFSP')

# Save model
save_dir = 'models/limit_holdem_nfsp'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
saver = tf.train.Saver()
saver.save(sess, os.path.join(save_dir, 'model'))
log_dir = './experiments/leduc_holdem_br_result/'

# Set a global seed
set_global_seed(0)

# Initialize CFR Agent as the opponent
opponent = CFRAgent(env)
#opponent = RandomAgent(action_num=env.action_num)
#opponent.load()  # If we have saved model, we first load the model
#agent = RandomAgent(action_num=env.action_num)
agent = BRAgent(eval_env, opponent)
#agent = CFRAgent(env)
eval_env.set_agents([agent, opponent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    opponent.train()
    #agent.train()
    print('\rIteration {}'.format(episode), end='')

    # Evaluate the performance. Play against the CFR opponent.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()
logger.plot('BR')
# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('DQN_Keras')

# Save model
save_dir = 'models/blackjack_dqn_keras'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, 'model.keras')
agent.save_model(model_path)
print("model saved")
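# A minimal sketch (not part of the original script) of loading the saved
# model back for inference. It assumes the agent's network is a standard
# tf.keras model, which is what agent.save_model writing a .keras file
# suggests.
from tensorflow import keras

loaded_model = keras.models.load_model(model_path)
loaded_model.summary()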
        state_shape=env.state_shape,
        discount_factor=0.99,
        learning_rate=1e-6,
        device=None)
env.set_agents([agent])
eval_env.set_agents([agent])

logger = Logger(log_dir)

for episode in range(episode_num):
    trajectories, _ = env.run(is_training=True)
    for ts in trajectories[0]:
        agent.feed(ts)
    loss = agent.train()
    # logger.log(f"Loss: {loss}")

    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

logger.close_files()
logger.plot("REINFORCE")

# Save model
save_dir = 'models/blackjack_reinforce'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
torch.save(agent.get_state_dict(), os.path.join(save_dir, 'model.pth'))
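# A minimal sketch (not part of the original script) of reloading the saved
# weights. agent.get_state_dict() is the agent's own accessor, so the loading
# counterpart shown here is an assumption: a fresh agent built with the same
# constructor arguments, offering a matching load method.
checkpoint = torch.load(os.path.join(save_dir, 'model.pth'))
# Hypothetical counterpart to get_state_dict(); the real method name depends
# on the agent class.
agent.load_state_dict(checkpoint)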