def test_tournament(self):
    env = rlcard3.make('leduc-holdem')
    env.set_agents([RandomAgent(env.action_num), RandomAgent(env.action_num)])
    payoffs = tournament(env, 1000)
    self.assertEqual(len(payoffs), 2)
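# The test above relies on imports and unittest scaffolding from its enclosing
# test module. A minimal sketch of that scaffolding, assuming rlcard3 mirrors
# the upstream rlcard package layout (these import paths are an assumption):
import unittest

import rlcard3
from rlcard3.agents.random_agent import RandomAgent
from rlcard3.utils.utils import tournament


class TestTournament(unittest.TestCase):
    # test_tournament as defined above goes here
    pass


if __name__ == '__main__':
    unittest.main()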
# Log the run configuration (the message text below is a reconstruction; the
# head of this call was truncated in the source)
logger.log('Players: {}, cards: {}, episodes: {}'.format(
    env.game.num_players, env.game.num_cards, episode_num))
# logger.log(f'\nTrain Agents:{get_agent_str(env_agent_list)}')
# logger.log(f'\nEval Agents:{get_agent_str(eval_agent_list)}')

for episode in range(episode_num):
    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep,
                               tournament(eval_env, evaluate_num)[0],
                               episode=episode)

# Save model
save_dir = 'models/mocsar_dqn_ra_pytorch'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
state_dict = agent.get_state_dict()
logger.log('\n########## Pytorch Save model ##########')
logger.log('\n' + str(state_dict.keys()))
torch.save(state_dict, os.path.join(save_dir, 'model.pth'))

# Close files in the logger
logger.close_files()

# Plot the learning curve
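# The trailing comment above is cut off in the source. In the upstream rlcard
# examples it is followed by a call to Logger.plot; assuming rlcard3 keeps
# that API (an assumption), the curve would be drawn with:
logger.plot('DQN')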
# Load pretrained model
graph = tf.Graph()
sess = tf.Session(graph=graph)

with graph.as_default():
    nfsp_agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          q_mlp_layers=[128, 128])
        nfsp_agents.append(agent)

# We have a pretrained model here. Change the path for your model.
check_point_path = os.path.join(rlcard3.__path__[0], 'models/pretrained/leduc_holdem_nfsp')

with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
random_agent = RandomAgent(env.action_num)
env.set_agents([nfsp_agents[0], random_agent])
reward = tournament(env, evaluate_num)[0]
print('Average reward against random agent: ', reward)
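# The snippet above assumes `env` was created earlier in the script. In the
# corresponding upstream rlcard example that setup looks roughly like the
# sketch below (the seed value and the set_global_seed import path are
# assumptions for rlcard3):
import rlcard3
from rlcard3.utils.utils import set_global_seed

env = rlcard3.make('leduc-holdem')
set_global_seed(0)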
print(f"mocsar_pl_dqn_pytorch_load_model_cfg, Agents:{agents}") # # Here we directly load NFSP models from /models module # rl_agents = models.load(agent_str, # num_players=env.game.get_player_num(), # action_num=env.action_num, # state_shape=env.state_shape).agents # Evaluate the performance. Play with random agents. env.game.set_game_params(num_players=4, num_cards=nr_cards) env.model.create_agents(agents) env.set_agents(env.model.rule_agents) if NR_GAMES % 2 == 0: reward = tournament(env, NR_GAMES)[0] print( f'Average reward for {agent_str} against random agent: {reward}, cards: {nr_cards} ' ) else: stat.reset_game_nr(agents=env.model.rule_agents) print(f"Game for cards:{nr_cards}, agents:{stat.agentstr} ") payoff_total = 0 for i in range(NR_GAMES): state, payoffs, done = env.run_multi_agent(stat=stat) payoff_total += payoffs[0] print(f"-----------\nGame Finished.{i}.game, payoff: {payoffs[0]}") print( f'Average reward for {agent_str} against random agent: {payoff_total / NR_GAMES}, cards: {nr_cards} ' )
for episode in range(conf.get_int('episode_num')):
    # First sample a policy for the episode
    agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for ts in trajectories[0]:
        agent.feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep,
                               tournament(eval_env, evaluate_num)[0],
                               episode=episode)

# Save model
save_dir = 'models/mocsar_nfsp_pytorch_ra'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
state_dict = agent.get_state_dict()
logger.log('\n########## Pytorch Save model ##########')
logger.log('\n' + str(state_dict.keys()))
torch.save(state_dict, os.path.join(save_dir, 'model.pth'))

# Close files in the logger
logger.close_files()

# Plot the learning curve
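# As in the DQN script above, the trailing comment is cut off in the source;
# assuming rlcard3 keeps the upstream Logger.plot API (an assumption), the
# learning curve would be drawn with:
logger.plot('NFSP')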