def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    episode_length = 80
    batchsize = 1000
    # how often (in episodes) to save the policy and a gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(0, number_of_episodes):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices because the
            # input to step requires the first index to correspond to the number of parallel envs
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()

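# Note: every loop in this file calls a seeding() helper that is not defined here.
# A minimal sketch of what such a helper could look like (an assumption, not the
# original implementation): it pins the Python, NumPy and PyTorch RNG seeds so a
# run can be reproduced.
import random
import numpy as np
import torch

def seeding(seed=1):
    """Seed every RNG the training loops touch (hypothetical helper)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
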
def main():
    seeding()
    # number of agents
    number_of_agents = 2
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 3000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999
    tau = 1e-3    # soft update factor
    gamma = 0.99  # reward discount factor
    print_every = 100

    # how many episodes before update
    episode_per_update = 2

    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    result_dir = os.getcwd() + "/result_dir"
    os.makedirs(result_dir, exist_ok=True)

    # do we need to set multi-thread for this env?
    torch.set_num_threads(number_of_agents * 2)

    env = TennisEnv()

    # keep up to 1e5 transitions worth of replay
    buffer = ReplayBuffer(int(1e5))

    num_agents, num_states, num_actions = env.get_shapes()

    # initialize policy and critic
    maddpg = MADDPG(num_agents, num_states, num_actions,
                    discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []
    agent0_reward = []
    agent1_reward = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        states, states_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        while True:
            actions = maddpg.act(torch.tensor(states, dtype=torch.float), noise=noise)
            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_states, next_states_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            buffer.push(states, states_full, actions_for_env, rewards,
                        next_states, next_states_full, dones)

            reward_this_episode += rewards

            states = np.copy(next_states)
            states_full = np.copy(next_states_full)

            # update once the buffer holds at least one batch of samples
            if len(buffer) > batchsize:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])

        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])
        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)

        save_dict_list = []
        if episode % print_every == 0 or avg_rewards > 2.5:
            print('\rEpisode: {}, Average score: {:.5f}, noise: {:.5f}'.format(
                episode, cur_score, noise))

        # saving model
        if avg_rewards > 2.5:
            for i in range(number_of_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, cur_score)))
            print('model saved')
            break

    env.close()

    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(ep_scores) + 1), ep_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    fig.savefig(result_dir + '/score_plot.png')

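# The tau hyperparameter above is the soft-update factor used when the target
# networks are nudged towards the local networks (maddpg.update_targets() in the
# other loops). A sketch of that standard Polyak update, assuming target and
# source are matching torch.nn.Module instances (not the project's exact code):
import torch

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    """target <- tau * source + (1 - tau) * target, parameter by parameter."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)
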
def main():
    ##########
    # CONFIG #
    ##########

    # Target Reward
    tgt_score = 0.5

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Seed
    seed = 7
    seeding(seed)

    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0

    # Episodes
    number_of_episodes = 10000
    episode_length = 2000

    # Buffer
    buffer_size = int(1e6)
    batchsize = 512

    # Agent Update Frequency
    episode_per_update = 1

    # Rewards Discount Factor
    discount_factor = 0.95

    # Soft Update Weight
    tau = 1e-2

    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0

    # Window
    win_len = 100

    # Save Frequency
    save_interval = 200

    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)

    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")

    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)

    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # Number of Agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir, 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    for i_episode in range(number_of_episodes):
        timer.update(i_episode)

        # Reset Environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        # Reset Agent
        maddpg.reset()

        for episode_t in range(episode_length):
            # Explore with decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]     # Environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards                   # get the rewards
            dones = env_info.local_done                  # see if the episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states

            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))   # save most recent score
        avg_rewards = np.mean(scores_window)
        noise_factor = max(noise_floor, noise_factor * noise_reduction)  # Reduce Noise Factor

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft Update
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count, episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'
                .format(i_episode, avg_rewards, noise_factor),
                end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))

            pd.Series(scores_history).to_csv(os.path.join(model_dir, "scores.csv"))

            # plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count, episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'
                .format(i_episode, avg_rewards, noise_factor),
                end="\n")
            break

    env.close()
    logger.close()
    timer.finish()

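# Several of these loops describe the exploration signal as "OU noise" and call
# agent.noise.reset() at the start of each episode. A compact Ornstein-Uhlenbeck
# process of the kind typically paired with DDPG/MADDPG actors is sketched below;
# the class name and parameter values are assumptions, not the project's code.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean.
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the state drifts back towards mu.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
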
def train(env, model_path='model_dir', number_of_episodes=50000, episode_length=500):
    # `brain_name` and `num_agents` are expected to be defined at module level.
    noise = 1.0
    noise_reduction = 1.0
    batchsize = 256

    model_dir = os.getcwd() + "/" + model_path
    model_files = glob.glob(model_dir + "/*.pt")
    for file in model_files:
        os.remove(file)
    os.makedirs(model_dir, exist_ok=True)

    buffer = ReplayBuffer(int(1e5))
    rewards_deque = deque(maxlen=100)
    rewards_total = []

    # initialize policy and critic
    maddpg = MADDPG()

    for episode in range(1, number_of_episodes + 1):
        rewards_this_episode = np.asarray([0.0, 0.0])
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations

        for episode_t in range(episode_length):
            actions = maddpg.act(obs, noise=noise)
            noise *= noise_reduction

            env_info = env.step(actions)[brain_name]
            next_obs = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (obs, actions, rewards, next_obs, dones)
            buffer.push(transition)

            rewards_this_episode += rewards
            obs = next_obs

            if any(dones):
                break

        # once the buffer holds enough samples, run 4 rounds of updates per episode
        if len(buffer) > batchsize * 4:
            for _ in range(4):
                for a_i in range(num_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        rewards_total.append(np.max(rewards_this_episode))
        rewards_deque.append(rewards_total[-1])
        average_score = np.mean(rewards_deque)

        print(episode, rewards_this_episode, rewards_total[-1], average_score)

        # saving model
        save_dict_list = []
        if episode % 1000 == 0:
            for i in range(2):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            torch.save(maddpg.maddpg_agent[0].actor.state_dict(), 'actor0.pt')
            torch.save(maddpg.maddpg_agent[1].actor.state_dict(), 'actor1.pt')
            torch.save(maddpg.maddpg_agent[0].critic.state_dict(), 'critic0.pt')
            torch.save(maddpg.maddpg_agent[1].critic.state_dict(), 'critic1.pt')

    return rewards_total

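# train() relies on a ReplayBuffer with push(transition), sample(batchsize) and
# len() support. A minimal uniform-sampling buffer matching that interface is
# sketched here (an assumption about the helper, not the original class):
import random
from collections import deque

class ReplayBuffer:
    """Fixed-size buffer of transition tuples with uniform random sampling."""

    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # `transition` is whatever tuple the training loop assembled,
        # e.g. (obs, actions, rewards, next_obs, dones).
        self.memory.append(transition)

    def sample(self, batchsize):
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)
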
def main():
    seeding()
    env = UnityEnvironment(file_name="Tennis.x86_64")
    env_name = 'Tennis'

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)

    # size of each action
    action_size = brain.vector_action_space_size

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[-1]

    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    episode_length = 10000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize memory buffer
    buffer = ReplayBuffer(int(500000), batchsize, 0)

    # initialize policy and critic
    maddpg = MADDPG(state_size, action_size, num_agents, seed=12345,
                    discount_factor=0.95, tau=0.02)

    # how often to update the MADDPG model
    episode_per_update = 2

    # training loop
    PRINT_EVERY = 5
    scores_deque = deque(maxlen=100)
    # holds raw scores
    scores = []
    # holds avg scores of the last 100 episodes
    avg_last_100 = []
    threshold = 0.5

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(number_of_episodes):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations               # get the current state (for each agent)

        episode_reward_agent0 = 0
        episode_reward_agent1 = 0

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        for episode_t in range(episode_length):
            actions = maddpg.act(torch.tensor(state, dtype=torch.float), noise=noise)
            noise *= noise_reduction
            actions_array = torch.stack(actions).detach().numpy()

            env_info = env.step(actions_array)[brain_name]

            next_state = env_info.vector_observations
            reward = env_info.rewards
            done = env_info.local_done

            episode_reward_agent0 += reward[0]
            episode_reward_agent1 += reward[1]

            # add data to buffer
            '''
            I can either hstack or concat the two states here, or do it in the
            update function in MADDPG. However, I think it's easier to do it here,
            since in the update function I have batch_size to deal with. Although
            the replay buffer has to hold more data by preprocessing and creating
            two new variables that hold essentially the same info as state and
            next_state, just concatenated.
            '''
            full_state = np.concatenate((state[0], state[1]))
            full_next_state = np.concatenate((next_state[0], next_state[1]))

            buffer.add(state, full_state, actions_array, reward, next_state,
                       full_next_state, done)

            state = next_state

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for i in range(num_agents):
                    samples = buffer.sample()
                    maddpg.update(samples, i)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

            if np.any(done):  # if any of the agents are done, break
                break

        episode_reward = max(episode_reward_agent0, episode_reward_agent1)
        scores.append(episode_reward)
        scores_deque.append(episode_reward)
        avg_last_100.append(np.mean(scores_deque))

        print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(
            episode, avg_last_100[-1], episode_reward), end="")

        if episode % PRINT_EVERY == 0:
            print('\rEpisode {}\tAverage Score: {:.4f}'.format(
                episode, avg_last_100[-1]))

        # saving successful model
        # training ends when the threshold value is reached.
        if avg_last_100[-1] >= threshold:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # plot graphs
            raw_score_plotter(scores)
            plotter(env_name, len(scores), avg_last_100, threshold)
            break

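# raw_score_plotter() and plotter() are project helpers that are not shown in
# this file. Plausible matplotlib sketches with the same signatures are given
# below; what they plot (raw per-episode scores, and the 100-episode moving
# average against the solve threshold) is an assumption, not the original code.
import numpy as np
import matplotlib.pyplot as plt

def raw_score_plotter(scores):
    """Plot the raw per-episode scores (hypothetical helper)."""
    plt.figure()
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.xlabel('Episode #')
    plt.ylabel('Raw Score')
    plt.savefig('raw_scores.png')
    plt.close()

def plotter(env_name, num_episodes, avg_scores, threshold):
    """Plot the running average score against the solve threshold (hypothetical helper)."""
    plt.figure()
    plt.plot(np.arange(1, num_episodes + 1), avg_scores, label='100-episode average')
    plt.axhline(y=threshold, color='r', linestyle='dashed', label='solve threshold')
    plt.title(env_name)
    plt.xlabel('Episode #')
    plt.ylabel('Average Score')
    plt.legend()
    plt.savefig('{}_average_scores.png'.format(env_name))
    plt.close()
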
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)

    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space: [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2+3*2+3*2=14
    (2) location coordinates of the target landmark
    (3*2) the three agents' positions w.r.t. the target landmark
    (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`; each item in the list is
        another list of size two: first is env.observation_space
        [Box(14,), Box(14,), Box(14,)], second is [Box(14,)], which is added to
        facilitate training https://goo.gl/Xtr6sF

        `obs` and `obs_full` are both lists of size `parallel_envs`;
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)],
        `obs_full` has the compounded observation space [Box(14,)]
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()

            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once every `episode_per_update` = 2 * parallel_envs episodes
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for each agent; `a_i` is the agent index
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()

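# The parallel-env loops above convert observations with transpose_list() and
# transpose_to_tensor() before calling maddpg.act(). A common implementation of
# these two utilities is sketched here (assumed; the project's own utilities
# module is not shown): both simply flip the (parallel_envs, agents) nesting,
# and the second wraps each agent's batch of observations in a float tensor.
import torch

def transpose_list(mylist):
    # [[a0, b0], [a1, b1], ...] -> [[a0, a1, ...], [b0, b1, ...]]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))
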
def main():
    seeding()
    # number of training episodes.
    number_of_episodes = 5000
    episode_length = 1000
    batchsize = 2000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe')
    env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe', no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)

    replay_episodes = 1000
    buffer = ReplayBuffer(int(replay_episodes * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    # logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []

    # training loop
    scores_deque = deque(maxlen=100)
    scores = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros(num_agents)
        env_info = env.reset(True)[brain_name]
        state = env_info.vector_observations
        obs = [[state[0], state[1]]]
        obs_full = np.concatenate((state[0], state[1]))

        # for calculating rewards for this particular episode - addition of all time steps
        frames = []
        tmax = 0

        for episode_t in range(episode_length):
            t += 1

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # flatten and clip the actions; the Unity env expects one flat action vector
            # actions_for_env = np.rollaxis(actions_array, 1)
            actions_for_env = np.clip(actions_array.flatten(), -1, 1)

            # step forward one frame
            env_info = env.step(actions_for_env)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            next_obs = [[next_state[0], next_state[1]]]
            next_obs_full = np.concatenate((next_state[0], next_state[1]))

            # add data to buffer
            transition = ([obs], [obs_full], [actions_for_env], [rewards],
                          [next_obs], [next_obs_full], [dones])
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            if any(dones):
                break

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        avg_rewards = np.mean(reward_this_episode, axis=0)
        episode_reward = np.max(avg_rewards)
        scores_deque.append(episode_reward)
        scores.append(episode_reward)

        print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.format(
            episode, np.mean(scores_deque), episode_reward), end="")

        if (episode > 0 and episode % 100 == 0) or episode == number_of_episodes - 1:
            print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.format(
                episode, np.mean(scores_deque), episode_reward))

        if np.mean(scores_deque) >= 0.5:
            print('\nSuccess!')

            # saving model
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
            break

    env.close()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.savefig('tennis_score_history.png')

    return scores

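# maddpg.act() is called throughout with a list of per-agent observation tensors
# plus a noise amplitude. A sketch of how such a method is commonly written is
# shown below; it is an assumption about the MADDPG class (whose definition is
# not part of this file): each agent's actor maps its own observation to an
# action, scaled exploration noise is added, and the result is clipped to the
# action bounds.
import torch

class MADDPGSketch:
    """Skeleton showing only the act() dispatch; maddpg_agent is a list of DDPG agents."""

    def __init__(self, agents):
        self.maddpg_agent = agents

    def act(self, obs_all_agents, noise=0.0):
        actions = []
        for agent, obs in zip(self.maddpg_agent, obs_all_agents):
            action = agent.actor(obs) \
                + noise * torch.tensor(agent.noise.sample(), dtype=torch.float)
            actions.append(torch.clamp(action, -1.0, 1.0))
        return actions
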
def main():
    seeding()
    # number of agents
    number_of_agents = 2
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 5000
    max_t = 1000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999
    tau = 1e-3    # soft update factor
    gamma = 0.99  # reward discount factor

    # how many episodes before update
    episode_per_update = 2

    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # do we need to set multi-thread for this env?
    torch.set_num_threads(number_of_agents * 2)

    env = TennisEnv()

    # keep up to 1e5 transitions worth of replay
    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG(discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []

    # when to save: track whether a model at a given score (key / 10) has already been saved.
    save_on_scores = {
        5: False, 6: False, 9: False, 10: False, 11: False, 12: False,
        13: False, 14: False, 15: False, 16: False, 17: False, 18: False,
        19: False, 20: False
    }

    agent0_reward = []
    agent1_reward = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        obs, obs_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        for episode_t in range(max_t):
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(torch.tensor(obs, dtype=torch.float), noise=noise)
            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            buffer.push(obs, obs_full, actions_for_env, rewards,
                        next_obs, next_obs_full, dones)

            reward_this_episode += rewards

            obs = np.copy(next_obs)
            obs_full = np.copy(next_obs_full)

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode > 0 and episode % episode_per_update == 0:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])

        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])
        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)

        print('\rEpisode:{}, Rwd:{:.3f} vs. {:.3f}, Average Score:{:.4f}, Noise:{:.4f}'.format(
            episode, reward_this_episode[0, 0], reward_this_episode[0, 1],
            cur_score, noise))

        # saving model
        save_dict_list = []
        save_info = False
        score_code = int(cur_score * 10)
        if score_code in save_on_scores.keys():
            if not save_on_scores[score_code]:
                save_on_scores[score_code] = True
                save_info = True

        if save_info:
            for i in range(number_of_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, score_code)))

            np.savez('scores-{}-{}.npz'.format(episode, score_code),
                     agent0_reward=np.array(agent0_reward),
                     agent1_reward=np.array(agent1_reward),
                     avg_max_scores=np.array(ep_scores))

    env.close()

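# The per-agent maddpg.update(samples, a_i) call hides the heart of MADDPG: a
# centralized critic trained on the concatenated observations and actions of all
# agents, discounted by gamma as configured above. A condensed sketch of that
# critic update for a single agent is given below, assuming batched tensors and
# a target critic/actor pair; the attribute and variable names are illustrative,
# not the project's exact code.
import torch
import torch.nn.functional as F

def critic_update_sketch(agent, obs_full, actions_all, rewards, next_obs_full,
                         next_actions_all, dones, gamma=0.99):
    """One gradient step on agent.critic using the centralized TD target."""
    with torch.no_grad():
        # The target Q-value uses everyone's next observations and target-actor actions.
        q_next = agent.target_critic(torch.cat((next_obs_full, next_actions_all), dim=1))
        y = rewards + gamma * q_next * (1.0 - dones)
    q = agent.critic(torch.cat((obs_full, actions_all), dim=1))
    loss = F.mse_loss(q, y)
    agent.critic_optimizer.zero_grad()
    loss.backward()
    agent.critic_optimizer.step()
    return loss.item()
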
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 60000
    episode_length = 35

    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 0.5  # was 2, try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999

    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs

    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()
        # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        #     frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise * noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices because the
            # input to step requires the first index to correspond to the number of parallel envs
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step
            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)
            # ADD FOR WITHOUT PARALLEL ENV
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once every UPDATE_EVERY episodes
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(5):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()

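# The checkpoints written by these loops store one dict per agent with actor and
# critic parameters plus optimizer state. A sketch of how such a file could be
# loaded back into a freshly constructed MADDPG instance (it mirrors the save
# format used throughout this file; the helper name is hypothetical):
import torch

def load_checkpoint(maddpg, path):
    """Restore actor/critic weights and optimizer state for every agent."""
    save_dict_list = torch.load(path, map_location='cpu')
    for agent, save_dict in zip(maddpg.maddpg_agent, save_dict_list):
        agent.actor.load_state_dict(save_dict['actor_params'])
        agent.actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
        agent.critic.load_state_dict(save_dict['critic_params'])
        agent.critic_optimizer.load_state_dict(save_dict['critic_optim_params'])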