random_agent = RandomAgent(action_num=eval_env.action_num)

# Other agents
env.model.create_agents({"mocsar_min": 4})
env_agent_list = [env.model.rule_agents[i] for i in range(1, 4)]
env_agent_list.insert(0, agent)
env.set_agents(env_agent_list)

# Evaluation agent
eval_env.model.create_agents({"mocsar_random": 4})
eval_agent_list = [eval_env.model.rule_agents[i] for i in range(1, 4)]
eval_agent_list.insert(0, agent)
eval_env.set_agents(eval_agent_list)

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

# Log Game info
logger.log('\n########## Game information ##########')
logger.log('\nNumPlayers: {}, NumCards: {}, Episodes: {}'.format(
    env.game.num_players, env.game.num_cards, episode_num))
# logger.log(f'\nTrain Agents:{get_agent_str(env_agent_list)}')
# logger.log(f'\nEval Agents:{get_agent_str(eval_agent_list)}')

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)
with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     state_shape=env.state_shape,
                     mlp_layers=[10, 10])
    env.set_agents([agent])
    eval_env.set_agents([agent])
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep', ylabel='reward',
                    legend='DQN on Blackjack',
                    log_path=log_path, csv_path=csv_path)

    for episode in range(episode_num // evaluate_every):
        # Generate data from the environment
        tasks = assign_task(evaluate_every, PROCESS_NUM)
        for task in tasks:
            INPUT_QUEUE.put((task, True, None, None))
        for _ in range(evaluate_every):
            trajectories = OUTPUT_QUEUE.get()
            # Feed transitions into agent memory, and train
            for ts in trajectories[0]:
                agent.feed(ts)

        # Evaluate the performance
        reward = 0
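# The helpers assign_task, PROCESS_NUM, INPUT_QUEUE, and OUTPUT_QUEUE are
# defined elsewhere in this script. A minimal sketch of that plumbing,
# assuming a standard multiprocessing setup; the chunking logic below is an
# illustration, not the script's actual implementation.
import multiprocessing as mp

PROCESS_NUM = 4            # number of rollout worker processes (assumed)
INPUT_QUEUE = mp.Queue()   # main process -> workers: rollout tasks
OUTPUT_QUEUE = mp.Queue()  # workers -> main process: finished trajectories

def assign_task(total_episodes, process_num):
    # Split total_episodes into one chunk per worker,
    # spreading any remainder across the first workers.
    chunk, remainder = divmod(total_episodes, process_num)
    return [chunk + 1] * remainder + [chunk] * (process_num - remainder)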
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512])
    agents.append(agent)
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents(agents)
eval_env.set_agents([agents[0], random_agent])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    # First sample a policy for the episode
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)
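# For context, the keyword arguments that open the fragment above close an
# NFSPAgent construction. In rlcard's standard NFSP example it looks roughly
# like the sketch below; treat this as a reconstruction under that
# assumption, not this script's verbatim code.
agents = []
for i in range(env.player_num):
    agent = NFSPAgent(sess,
                      scope='nfsp' + str(i),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512])
    agents.append(agent)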
set_global_seed(0)

# Initialize CFR Agent
opponent = CFRAgent(env)
# opponent = RandomAgent(action_num=env.action_num)
# opponent.load()  # If we have a saved model, we first load the model
# agent = RandomAgent(action_num=env.action_num)
agent = BRAgent(eval_env, opponent)
# agent = CFRAgent(env)

# Evaluate CFR against pre-trained NFSP
# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    opponent.train()
    # agent.train()
    print('\rIteration {}'.format(episode), end='')

    # Evaluate the performance. Play with NFSP agents.
    if episode % evaluate_every == 0:
        exploitability(eval_env, opponent)
        # logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

logger.plot('BR')
# Initialize a global step
global_step = tf.Variable(0, name='global_step', trainable=False)

# Set up the agents
agent = DQNAgent(sess,
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[128, 128])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

state = env.reset()

for timestep in range(timesteps):
    action = agent.step(state)
    next_state, reward, done = env.step(action)
    ts = (state, action, reward, next_state, done)
    agent.feed(ts)
    state = next_state

    if timestep % evaluate_every == 0:
        rewards = []
        eval_state = eval_env.reset()
        for _ in range(evaluate_num):
            action, _ = agent.eval_step(eval_state)
            eval_state, reward, done = eval_env.step(action)
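# The evaluation block above is truncated. A plausible completion, assuming
# rlcard's step-based API and Logger: run evaluate_num full episodes (episode
# boundaries handled via done) and log the mean return. A sketch, not the
# original script's code.
for _ in range(evaluate_num):
    eval_state, done = eval_env.reset(), False
    total_reward = 0
    while not done:
        action, _ = agent.eval_step(eval_state)
        eval_state, reward, done = eval_env.step(action)
        total_reward += reward
    rewards.append(total_reward)
logger.log_performance(env.timestep, sum(rewards) / len(rewards))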
def test_log(self):
    log_dir = "./newtest/test_log.txt"
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    logger = Logger(log_dir)
    logger.log("test text")
    logger.log_performance(1, 1)
    logger.log_performance(2, 2)
    logger.log_performance(3, 3)
    logger.close_files()
    logger.plot('aaa')
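    # For reference, rlcard's Logger treats its argument as a directory (which
    # is why the test removes it with shutil.rmtree) and writes log.txt and
    # performance.csv inside it; plot() saves fig.png there as well. File
    # names are assumed from rlcard's Logger implementation.
    assert os.path.isfile(os.path.join(log_dir, 'log.txt'))
    assert os.path.isfile(os.path.join(log_dir, 'performance.csv'))
    assert os.path.isfile(os.path.join(log_dir, 'fig.png'))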
                 scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[512, 512])
random_agent = RandomAgent(action_num=eval_env.action_num)
sess.run(tf.compat.v1.global_variables_initializer())
env.set_agents([agent, random_agent, random_agent, random_agent])
eval_env.set_agents([agent, random_agent, random_agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

# Log Game info
logger.log('\n########## Game information ##########')
logger.log('\nNumPlayers: {}, NumCards: {}, Episodes: {}'.format(
    env.game.num_players, env.game.num_cards, episode_num))

env.game.round.set_print_mode(print_mode=True)

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)
                 device=torch.device('cuda'))

# Other agents
env.model.create_agents({"mocsar_min": 4})
env_agent_list = [env.model.rule_agents[i] for i in range(1, 4)]
env_agent_list.insert(0, agent)
env.set_agents(env_agent_list)

# Evaluation agent
eval_env.model.create_agents({"mocsar_random": 4})
eval_agent_list = [eval_env.model.rule_agents[i] for i in range(1, 4)]
eval_agent_list.insert(0, agent)
eval_env.set_agents(eval_agent_list)

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

# Log Game info
logger.log('\n########## Game information, NFSP, RuleAgents, Pytorch ##########')
logger.log('\nNumPlayers: {}, NumCards: {}, Episodes: {}'.format(
    env.game.num_players, env.game.num_cards, conf.get_int('episode_num')))

for episode in range(conf.get_int('episode_num')):
    # First sample a policy for the episode
    agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
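    # Likely continuation, mirroring the other snippets in this section:
    # feed the NFSP agent's own transitions (seat 0) back into its memory
    # for training. Not verbatim from this script.
    for ts in trajectories[0]:
        agent.feed(ts)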