def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)

    # In this script arglist.eval gates the training-side logic (updates,
    # checkpointing, TensorBoard logging); rendering happens when it is off.
    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' + arglist.scenario)

    maddpg_wrapper = MADDPG(ACTORS)
    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for _ in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            # accumulate per-agent rewards
            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            # terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2, done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ', ep_ave_max / float(step),
                      ' Reward: ', total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(str(worker.pos) + '/Average_max_q',
                                  ep_ave_max / float(step), episode)
                writer.add_scalar(str(worker.pos) + '/Reward Agent',
                                  total_reward[worker.pos], episode)

    env.close()
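# main() above assumes a parsed arglist plus a few imports it does not show.
# A minimal entry-point sketch follows; the flag names simply mirror the
# arglist attributes used in main(), and the defaults are illustrative
# assumptions, not values from the original source.
import argparse
import time
from time import gmtime, strftime
from tensorboardX import SummaryWriter  # assumption; torch.utils.tensorboard also works

def parse_args():
    parser = argparse.ArgumentParser(description="MADDPG particle-env runner")
    parser.add_argument("--scenario", type=str, default="simple_spread")
    parser.add_argument("--max-episode", type=int, default=10000)
    parser.add_argument("--saved-episode", type=int, default=100)
    parser.add_argument("--eval", action="store_true",
                        help="enable training updates, checkpointing and logging")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())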
def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """Train the agents and monitor their performance.

    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples
      before learning begins

    Returns
    =======
    - episode_scores: list containing the max score received in each episode
    """
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise; this slowly decays toward 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing max scores from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    maddpg = MADDPG(state_size, action_size, num_agents * state_size,
                    num_agents * action_size)

    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()
        # get the current state (for each agent)
        states = env_info.vector_observations
        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))
            # send all actions to the environment
            env_info = env.step(actions)[brain_name]
            # get next state (for each agent)
            next_states = env_info.vector_observations
            # get reward (for each agent)
            rewards = env_info.rewards
            # see if episode finished
            dones = env_info.local_done
            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)
            # roll over states to next time step
            states = next_states
            # learn when time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()
            # update the score (for each agent)
            scores += rewards
            # exit loop if episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)
        if i_episode > warmup_episodes:
            # save final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode, warmup_episodes),
                  end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check if the task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print('\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.format(
                i_episode, mean_score))
            maddpg.save_weights()
            break
        if i_episode == num_episodes:
            print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))

    return episode_scores
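# train() above relies on a few module-level names it does not define
# (LEARN_EVERY, LEARN_BATCH, SAVE_EVERY, deque, np, and the project's MADDPG
# wrapper). A minimal sketch of the assumed imports and hyperparameters;
# the values are illustrative assumptions, not taken from the original source.
from collections import deque
import numpy as np

LEARN_EVERY = 1   # learn every N environment steps
LEARN_BATCH = 3   # gradient updates per learning step
SAVE_EVERY = 500  # checkpoint interval in episodes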
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# load the trained agents and watch them play
agents = MADDPG(state_size, action_size)
agents.load_from_file()

for i in range(1, 20):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    score = np.zeros((2,))  # two agents in Tennis
    agents.reset()
    for t in range(100):
        actions = agents.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        states = next_states
        score += np.array(rewards)
        if any(dones):
            break
    # report the per-agent episode score
    print('Episode {}: scores {}'.format(i, score))

env.close()
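# The snippet above starts mid-script: env, brain_name, env_info and
# action_size must already exist. A minimal sketch of the assumed setup;
# the Tennis build path is an illustrative assumption.
import numpy as np
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=False)[brain_name]
action_size = brain.vector_action_space_size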
def main():
    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Reward Discount Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Rolling Window Length
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # Size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # Examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir, 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    for i_episode in range(number_of_episodes):
        timer.update(i_episode)
        # Reset environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        # Reset agents
        maddpg.reset()

        for episode_t in range(episode_length):
            # Explore with a decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]    # environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards                  # get the rewards
            dones = env_info.local_done                 # see if episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states
            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))
        avg_rewards = np.mean(scores_window)
        # Reduce noise factor
        noise_factor = max(noise_floor, noise_factor * noise_reduction)

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft update of the target networks
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print('\nElapsed time {:.1f} min \t Update Count {} \t Last Episode t {}'.format(
                      (time.time() - start) / 60, maddpg.update_count, episode_t),
                  '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.format(
                      i_episode, avg_rewards, noise_factor),
                  end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))
            pd.Series(scores_history).to_csv(os.path.join(model_dir, "scores.csv"))

            # Plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        # Stop once the rolling average clears the target score
        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print('\nElapsed time {:.1f} min \t Update Count {} \t Last Episode t {}'.format(
                      (time.time() - start) / 60, maddpg.update_count, episode_t),
                  '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.format(
                      i_episode, avg_rewards, noise_factor),
                  end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
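# seeding() used in the CONFIG block above is not defined in this file; a
# minimal sketch of such a helper, assuming it pins the Python, NumPy and
# PyTorch random number generators.
import random
import numpy as np
import torch

def seeding(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)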
def train_maddpg(env, max_episode=1000, max_t=1000, print_every=5,
                 check_history=100, sigma_start=0.2, sigma_end=0.01,
                 sigma_decay=0.995):
    # reset the environment
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size:', action_size)

    # initialize the agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    maddpg = MADDPG(state_size, action_size, random_seed=123)

    scores_deque = deque(maxlen=check_history)
    scores = []

    # learn over multiple episodes
    sigma = sigma_start
    for episode in range(max_episode):
        # prepare for training in the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        maddpg.reset(sigma=sigma)

        # play and learn in the current episode
        for t in range(max_t):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            maddpg.step(t, states, actions, rewards, next_states, dones)
            states = next_states
            # track the max over the two agents as the current score
            reward = np.max(rewards)
            score += reward
            if np.any(dones):
                break

        # decay sigma for exploration noise
        sigma = max(sigma_end, sigma * sigma_decay)

        # record the score
        episode_score = score
        scores_deque.append(episode_score)
        scores.append(episode_score)
        if episode % print_every == 0:
            print('Episode {}\tScore: {:.4f}\tAverage Score: {:.4f}'.format(
                episode, episode_score, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 0.5 and episode >= check_history:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}'.format(
                episode - check_history, np.mean(scores_deque)))
            for agent in maddpg.ddpg_agents:
                torch.save(agent.actor_local.state_dict(),
                           'actor_agent_' + str(agent.id) + '.pth')
                torch.save(agent.critic_local.state_dict(),
                           'critic_agent_' + str(agent.id) + '.pth')
            break

    return scores
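# A hypothetical driver for train_maddpg; the Tennis build path is an
# illustrative assumption, and the plot simply visualizes the scores list
# the function returns.
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment

if __name__ == '__main__':
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    scores = train_maddpg(env, max_episode=3000)
    env.close()

    plt.plot(scores)
    plt.xlabel('Episode #')
    plt.ylabel('Max agent score')
    plt.savefig('train_maddpg_scores.png')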