# Assumes the usual module-level imports for these scripts: os, time,
# numpy as np, torch, imageio, SummaryWriter (tensorboardX), plus the
# project helpers seeding, envs, ReplayBuffer, MADDPG, transpose_list,
# and transpose_to_tensor.

def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 5000
    t = 0  # global step counter, incremented by parallel_envs each env step

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # this may be a list of all environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models; each element in the list refers to an agent in the simulation:
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains that agent's actor and critic models,
    # e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by the number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)
        # one reward per agent per parallel environment: shape (parallel_envs, 3)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # obs is the observation state space of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it is of dimension 4x3x14.
        # obs_full is the world state irrespective of the agents, and its dimension is 4x14.
        # all_obs = array(number of environments 4, 2 elements)
        #   element 0: a list that contains 3 arrays, the state for each agent; each state is of size 14
        #   element 1: global state from the perspective of the target/green for its environment; contains 14 elements
        all_obs = env.reset()
        # obs: a list with 1 element per environment; each element contains a list of 3 arrays,
        #   each array being the state of one agent in that environment.
        # obs_full: the god's-eye view of each environment.
        #   So it is a list with 1 element per environment; each element contains an array of
        #   14 values which is the global state of that environment.
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward in multiples of the number of environments
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed:
            # transpose_to_tensor(obs) changes the data to each agent's point of view.
            # since we have 4 environments, there are 4 agent 1s, 4 agent 2s, and 4 agent 3s;
            # each agent has a state in each environment, so the states across
            # 4 environments for agent 1 form a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # element 1 is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act has a for loop that takes each element of obs, passes it to
            # the corresponding agent's actor model, and generates an action from each actor.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # there are 4 actions per agent and 3 agents, 12 in total;
            # each action has 2 elements: force in the x and y directions.
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists:
            # flip the first two indices, because the input to step requires the
            # first index to correspond to the number of parallel environments.
            # the shape of actions_for_env is (4 envs, 3 agents, 2 actions)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # next_obs is the observation state space of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it is of dimension 4x3x14.
            # next_obs_full is the world state irrespective of the agents, and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample carries all 3 agents' data,
                # so we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward,
                # next_obs, next_obs_full, done.
                # each element, say samples[0], is a list of 3 elements, one per agent,
                # holding that agent's value; for obs this is a vector with 14 values.
                # so asking for 2 samples, for example, returns 2 samples each containing
                # all 3 agents' states, rewards, etc.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
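# The comments in main() above describe an axis flip: maddpg.act returns
# per-agent actions, stacked to shape (3 agents, 4 envs, 2), while env.step
# wants (4 envs, 3 agents, 2). A minimal, self-contained sketch of that
# reshuffle (shapes taken from the comments above; the data is dummy):
import numpy as np

def _demo_axis_flip():
    actions_array = np.zeros((3, 4, 2))        # (agents, envs, action dims)
    actions_for_env = np.rollaxis(actions_array, 1)
    assert actions_for_env.shape == (4, 3, 2)  # (envs, agents, action dims)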
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, because the
            # input to step requires the first index to be the parallel env index
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
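# `ReplayBuffer` itself is not shown in this listing. Below is a minimal
# sketch consistent with how it is used above (push(transition),
# sample(batchsize), len(buffer)); the project's actual class may store
# and regroup transitions differently:
import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # transition = (obs, obs_full, actions, rewards, next_obs, next_obs_full, dones)
        self.memory.append(transition)

    def sample(self, batchsize):
        # draw random transitions and regroup them field-by-field, so that
        # samples[0] holds all obs, samples[1] all obs_full, and so on
        samples = random.sample(self.memory, batchsize)
        return [list(field) for field in zip(*samples)]

    def __len__(self):
        return len(self.memory)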
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + '/log'
    model_dir = os.getcwd() + '/model_dir'
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))
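# `transpose_list` and `transpose_to_tensor` flip the (env, agent) nesting
# into (agent, env), as the comments in the first main() explain. A minimal
# sketch consistent with that usage (the project's own helpers may differ):
import numpy as np
import torch

def transpose_list(mylist):
    # [[a0, b0], [a1, b1]] -> [[a0, a1], [b0, b1]]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # same flip, but each regrouped item becomes a float tensor
    return [torch.tensor(np.asarray(x), dtype=torch.float) for x in zip(*input_list)]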
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space:      [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2 + 3*2 + 3*2 = 14:
        (2)   location coordinates of the target landmark
        (3*2) the three agents' positions w.r.t. the target landmark
        (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`; each item in the list is
        another list of size two: first is env.observation_space
        [Box(14,), Box(14,), Box(14,)], second is [Box(14,)], which is added
        to facilitate training https://goo.gl/Xtr6sF

        `obs` and `obs_full` are both lists of size `parallel_envs`;
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)],
        `obs_full` has the compounded observation space [Box(14,)].
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()

            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # since `episode` advances by `parallel_envs`, this condition fires
        # once every `episode_per_update`=2*4 episodes
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for each agent; `a_i` is the agent index
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
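# The docstring in the main() above decomposes each agent's Box(14,)
# observation as 2 + 3*2 + 3*2. A small helper illustrating that split;
# the exact field ordering inside the vector is an assumption based on
# that docstring, not confirmed by the environment code shown here:
import numpy as np

def split_observation(obs_14):
    obs_14 = np.asarray(obs_14)
    assert obs_14.shape == (14,)
    landmark_xy = obs_14[0:2]                # target landmark coordinates
    positions = obs_14[2:8].reshape(3, 2)    # agents' positions w.r.t. the landmark
    velocities = obs_14[8:14].reshape(3, 2)  # agents' velocities w.r.t. the landmark
    return landmark_xy, positions, velocities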
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5

    # initialize environment
    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=SEED,
                                 num_agents=num_agents, benchmark=BENCHMARK)

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])

    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_163018\model_dir\episode-59994.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_211315\model_dir\episode-59994.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032621_054252\model_dir\episode-36000.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032821_102717\model_dir\episode-99000.pt'  # test1 6 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032921_160324\model_dir\episode-99000.pt'  # test2 6 agents pretrained
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\033021_203450\model_dir\episode-73002.pt'  # test2 6 agents pretrained
    # trained_checkpoint = "c4-a4-n01-a01-old-two-noskip/model_dir/episode-59999.pt"
    trained_checkpoint = "gat-huge1k/model_dir/episode-99000.pt"

    aux = torch.load(trained_checkpoint, map_location=torch.device('cpu'))
    for i in range(num_agents):
        # load the weights from file
        maddpg.maddpg_agent[i].actor.load_state_dict(aux[i]['actor_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(aux[i]['critic_params'])

    # Reset the environment
    all_obs = env.reset()
    # flip the first two indices
    obs_roll = np.rollaxis(all_obs, 1)
    obs = transpose_list(obs_roll)

    scores = 0
    t = 0

    while True:
        all_obs = env.reset()
        # flip the first two indices
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)
        scores = 0
        t = 0

        for _ in range(25):
            env.render('rgb_array')
            time.sleep(0.1)
            t += 1

            # select an action
            actions = maddpg.act(transpose_to_tensor(obs), noise=0.)
            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array, 1)

            # send all actions to the environment
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # update the score (for each agent)
            scores += np.sum(rewards)
            print('\r\n Rewards at step %i = %.3f' % (t, scores))

            # for displaying learned policies
            # time.sleep(0.1)
            # env.render()

            # roll over states to next time step
            obs = next_obs

            # print("Score: {}".format(scores))
            if np.any(dones):
                print('done')

        print('Next:')

    env.close()
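# `tau` passed to MADDPG above controls the soft target-network update that
# maddpg.update_targets() performs in the training scripts ("soft update the
# target networks towards the actual networks"). A standard sketch of that
# operation for plain PyTorch modules (not the project's exact helper):
import torch

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(tau * s_param + (1.0 - tau) * t_param)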
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 60000
    episode_length = 35
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 0.5  # was 2, try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999

    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs

    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()
        # # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        #     frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise * noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()
            # transpose the list of lists: flip the first two indices, because the
            # input to step requires the first index to be the parallel env index
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step
            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)
            # ADD FOR WITHOUT PARALLEL ENV
            # next_obs, rewards, dones, info = env.step(actions_for_env)
            next_obs, rewards, dones, info = env.step(actions_for_env)
            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every episode_per_update
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])
            # agent0_reward.append(reward_this_episode[i, 0])
            # agent1_reward.append(reward_this_episode[i, 1])
            # agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
            # agent0_reward = []
            # agent1_reward = []
            # agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(5):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
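# The training loops above use two different exploration-noise schedules:
# per-step multiplicative decay (noise *= noise_reduction) and exponential
# decay with a floor (max(initial_noise * decay**(step / 20000), 0.001)).
# A quick, self-contained comparison using the constants from the snippets:
def multiplicative_noise(step, noise0=2.0, reduction=0.9999):
    return noise0 * reduction**step

def floored_exponential_noise(step, noise0=0.1, decay=0.01, floor=0.001):
    return max(noise0 * decay**(step / 20000), floor)

for step in (0, 1000, 10000, 100000):
    print(step, multiplicative_noise(step), floored_exponential_noise(step))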