import os

import torch

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import (
    get_device,
    set_seed,
    tournament,
    reorganize,
    Logger,
    plot_curve,
)


def train(args):
    # Check whether a GPU is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(env, args.num_eval_games)[0],
                )

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
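train() only reads a handful of fields from args. A minimal, hypothetical argparse driver that supplies those fields could look like the sketch below; the flag names mirror the attributes used in train(), but the defaults are assumptions and not part of the original script.

# Hypothetical command-line driver for train(); flag names follow the
# attributes train() reads from `args`, defaults are illustrative only.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser('Train a DQN or NFSP agent in RLCard')
    parser.add_argument('--env', type=str, default='leduc-holdem')
    parser.add_argument('--algorithm', type=str, default='dqn', choices=['dqn', 'nfsp'])
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--log_dir', type=str, default='experiments/dqn_result/')

    args = parser.parse_args()
    train(args)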
def run(self, is_training=False):
    ''' Run X complete games, where X is the number of environments.
        The input/output are similar to Env. The difference is that
        the transitions for each player are stacked over the environments.
    '''
    trajectories = [[[] for _ in range(self.player_num)] for _ in range(self.num)]
    ready_trajectories = [None for _ in range(self.num)]
    active_remotes = [remote for remote in self.remotes]
    mapping = [i for i in range(self.num)]
    active_num = self.num

    # Reset
    states = []
    player_ids = []
    for state, player_id in send_command_to_all(active_remotes, ('reset', None)):
        states.append(state)
        player_ids.append(player_id)
    for i in range(active_num):
        trajectories[i][player_ids[i]].append(states[i])

    # Loop until all the environments are over
    while active_num > 0:
        # Agent plays
        # TODO: Currently we naively feed one obs to the agent. This can be improved via batching.
        commands = []
        actions = []
        for i in range(active_num):
            opt = 'raw_step' if self.agents[player_ids[i]].use_raw else 'step'
            if not is_training:
                action, _ = self.agents[player_ids[i]].eval_step(states[i])
            else:
                action = self.agents[player_ids[i]].step(states[i])
            commands.append((opt, action))
            actions.append(action)

        # Environment steps
        next_states, next_player_ids, dones = [], [], []
        for next_state, next_player_id, done in send_commands_to_all(active_remotes, commands):
            next_states.append(next_state)
            next_player_ids.append(next_player_id)
            dones.append(done)

        # Save action
        for i in range(active_num):
            trajectories[i][player_ids[i]].append(actions[i])

        # Set the state and player
        states = next_states
        player_ids = next_player_ids

        # Save state
        finished = []
        for i in range(active_num):
            if dones[i]:
                # Add a final state for all the players
                for j in range(self.player_num):
                    active_remotes[i].send(('get_state', j))
                    trajectories[i][j].append(active_remotes[i].recv())
                # Save the ready trajectories and mark them as finished
                ready_trajectories[mapping[i]] = trajectories[i]
                finished.append(i)
            else:
                trajectories[i][player_ids[i]].append(states[i])

        # Pop out the finished ones
        trajectories = [trajectories[i] for i in range(active_num) if i not in finished]
        mapping = [mapping[i] for i in range(active_num) if i not in finished]
        active_remotes = [active_remotes[i] for i in range(active_num) if i not in finished]
        states = [states[i] for i in range(active_num) if i not in finished]
        player_ids = [player_ids[i] for i in range(active_num) if i not in finished]

        self.timestep += active_num
        active_num -= len(finished)

    # Payoffs
    payoffs = send_command_to_all(self.remotes, ('get_payoffs', None))
    for i in range(self.num):
        ready_trajectories[i] = reorganize(ready_trajectories[i], payoffs[i])

    # Stack the per-environment trajectories for each player
    trajectories = [[] for _ in range(self.player_num)]
    for trs in ready_trajectories:
        for i in range(self.player_num):
            trajectories[i].extend(trs[i])

    return trajectories, payoffs
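The loop above relies on two communication helpers, send_command_to_all and send_commands_to_all, which are not shown in the excerpt. Below is a minimal sketch of what they might look like, assuming each remote is one end of a multiprocessing.Pipe whose worker process answers every (command, data) message with exactly one reply; this is an assumption for illustration, not the library's confirmed API.

# Hypothetical sketches of the remote-communication helpers used by run().
# Assumes each `remote` is a multiprocessing.Pipe endpoint whose worker
# replies once per received (command, data) message.

def send_command_to_all(remotes, command):
    ''' Send the same command to every remote and collect the replies in order. '''
    for remote in remotes:
        remote.send(command)
    return [remote.recv() for remote in remotes]

def send_commands_to_all(remotes, commands):
    ''' Send a per-remote command and collect the replies in order. '''
    for remote, command in zip(remotes, commands):
        remote.send(command)
    return [remote.recv() for remote in remotes]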
import os

import torch

import rlcard
from rlcard.agents import DQNAgent
from rlcard.utils import set_seed, tournament, reorganize, Logger


def main():
    # Make environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the iteration numbers and how frequently we evaluate performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    agent = DQNAgent(
        num_actions=env.num_actions,
        state_shape=env.state_shape[0],
        mlp_layers=[64, 64, 64, 64],
        device=device,
    )
    agents = [agent, load_model("model.pth")]
    env.set_agents(agents)

    with Logger('./') as logger:
        for episode in range(iteration_num):

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep, tournament(env, evaluate_num)[0])

            # Periodically snapshot the agent and use the snapshot as the opponent (self-play)
            if episode % selfplay_every == 0:
                save_path = os.path.join('./', str(episode) + "model.pth")
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(str(episode) + "model.pth")]
                env.set_agents(agents)

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    # plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)


# The paths for saving the logs and learning curves
log_dir = './experiments/nlh_cfr_result/'

# Set a global seed
set_seed(0)
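The self-play loop above calls a load_model() helper that is not defined in the excerpt. A minimal sketch is given below, assuming the opponent checkpoints are whole agent objects written with torch.save as in this script; the signature is an assumption for illustration.

# Hypothetical sketch of load_model(), assuming checkpoints were created
# with torch.save(agent, path) as done elsewhere in this script.
import torch

def load_model(model_path, device=None):
    ''' Load a previously saved agent checkpoint.
        Note: recent PyTorch versions may require weights_only=False
        to unpickle a full agent object.
    '''
    agent = torch.load(model_path, map_location=device)
    return agent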