def sample(self):
    """Randomly sample a batch of experiences from memory."""
    experiences = random.sample(self.memory, k=self.batch_size)

    states = torch.from_numpy(
        np.array(transpose_list(
            [e.state for e in experiences if e is not None]))).float().to(device)
    actions = torch.from_numpy(
        np.array(transpose_list(
            [e.action for e in experiences if e is not None]))).float().to(device)
    rewards = torch.from_numpy(
        np.array(transpose_list(
            [e.reward for e in experiences if e is not None]))).float().to(device)
    next_states = torch.from_numpy(
        np.array(transpose_list(
            [e.next_state for e in experiences if e is not None]))).float().to(device)
    dones = torch.from_numpy(
        np.array(transpose_list(
            [e.done for e in experiences
             if e is not None])).astype(np.uint8)).float().to(device)

    return (states, actions, rewards, next_states, dones)
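# The `sample` above reads `.state`, `.action`, `.reward`, `.next_state`
# and `.done` attributes, so `self.memory` presumably stores named tuples.
# A minimal sketch of the matching storage side (the `Experience` name and
# the `add` signature are assumptions inferred from usage, not shown in
# this section):
from collections import namedtuple

Experience = namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'next_state', 'done'])


def add(self, state, action, reward, next_state, done):
    """Append one experience tuple to memory (assumed to be a deque)."""
    self.memory.append(Experience(state, action, reward, next_state, done))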
def sample(self, batchsize):
    """sample from the buffer"""
    samples = random.sample(self.memory, batchsize)
    # transpose list of lists
    return transpose_list(samples)
def learn(self, experiences, agent_number):
    """update the critic and actor of the agent given by agent_number"""
    # each element of the samples is transposed to flip
    # obs[parallel_agent][agent_number] into obs[agent_number][parallel_agent]
    states, actions, rewards, next_states, dones = experiences
    agent = self.agents[agent_number]

    # ---------------------------- update critic --------------------------- #
    # critic loss = batch mean of (y - Q(s, a))^2, where
    # y = reward at this timestep + discount * Q(s_{t+1}, a_{t+1})
    # from the target network
    agent.critic_optimizer.zero_grad()

    target_actions = self.target_act(next_states)
    target_actions = torch.cat(target_actions, dim=1)

    t = torch.tensor(transpose_list(next_states.cpu().data.numpy()))
    next_states_all = t.view(t.shape[0], -1).to('cpu')
    target_critic_input = torch.cat(
        (next_states_all, target_actions.to('cpu')), dim=1).to(device)

    with torch.no_grad():
        q_next = agent.target_critic(target_critic_input)
    y = rewards[agent_number].view(-1, 1) + \
        GAMMA * q_next * (1 - dones[agent_number].view(-1, 1))

    actions_all = torch.cat(torch.unbind(actions), dim=1)
    t = torch.tensor(transpose_list(states.cpu().data.numpy()))
    states_all = t.view(t.shape[0], -1).to('cpu')
    critic_input = torch.cat(
        (states_all, actions_all.to('cpu')), dim=1).to(device)
    q = agent.critic(critic_input)

    critic_loss = F.mse_loss(q, y.detach())
    critic_loss.backward(retain_graph=True)
    agent.critic_optimizer.step()

    # ---------------------------- update actor ---------------------------- #
    # update the actor network using the policy gradient
    agent.actor_optimizer.zero_grad()

    # make input to the critic;
    # detach the other agents' actions to save computation
    # when taking the derivative
    q_input = [self.agents[i].actor(state) if i == agent_number
               else self.agents[i].actor(state).detach()
               for i, state in enumerate(states)]
    q_input = torch.cat(q_input, dim=1)

    # combine all the actions and observations for input to the critic;
    # many of the obs are redundant, and obs[1] already contains all the
    # useful information
    q_input2 = torch.cat((states_all.to('cpu'), q_input.to('cpu')), dim=1)

    # get the policy gradient
    actor_loss = -agent.critic(q_input2).mean()
    actor_loss.backward(retain_graph=True)
    agent.actor_optimizer.step()
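# `learn` calls `self.target_act`, which is not shown in this section. A
# minimal sketch consistent with how it is used above (one target-actor
# forward pass per agent; the `target_actor` attribute name is an
# assumption mirroring the `target_critic` usage above):
def target_act(self, obs_all_agents):
    """Get a list of target-network actions, one tensor per agent."""
    return [agent.target_actor(obs)
            for agent, obs in zip(self.agents, obs_all_agents)]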
def push(self, transition):
    """push into the buffer"""
    input_to_buffer = transpose_list(transition)
    for item in input_to_buffer:
        self.deque.append(item)
def push(self, transition, error):
    """push into the buffer"""
    input_to_buffer = transpose_list(transition)
    for i, item in enumerate(input_to_buffer):
        p = self._getPriority(error[i])
        self.tree.add(p, item)
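# `_getPriority` is referenced above but not defined in this section. The
# standard proportional-prioritization rule (Schaul et al., "Prioritized
# Experience Replay") would look like this; `self.e` (a small epsilon) and
# `self.a` (the alpha exponent) are assumed hyperparameters:
def _getPriority(self, error):
    """Convert a TD error into a sampling priority: (|error| + e)^a."""
    return (np.abs(error) + self.e) ** self.a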
def sample(self, n):
    """sample from the buffer"""
    experiences = []
    indexes = []
    segment = self.tree.total() / n
    for i in range(n):
        a = segment * i
        b = segment * (i + 1)
        s = random.uniform(a, b)
        (idx, p, data) = self.tree.get(s)
        experiences.append(data)
        indexes.append(idx)
    # transpose list of lists
    return transpose_list(experiences), indexes
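# The prioritized buffer above relies on a SumTree exposing `total`, `add`
# and `get`. A minimal array-backed sketch of such a tree (a common
# reference implementation, not necessarily the one used here):
class SumTree:
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)  # internal nodes + leaves
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0  # index of the next leaf to overwrite

    def total(self):
        return self.tree[0]  # the root holds the sum of all priorities

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self._propagate(idx, p - self.tree[idx])
        self.tree[idx] = p
        self.write = (self.write + 1) % self.capacity

    def get(self, s):
        """Descend from the root to the leaf covering cumulative sum s."""
        idx = 0
        while 2 * idx + 1 < len(self.tree):  # until idx is a leaf
            left = 2 * idx + 1
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = left + 1
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]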
def sample(self, batch_size):
    """sample from the buffer"""
    samples = random.sample(self.deque, batch_size)
    # transpose list of lists
    return transpose_list(samples)
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how often (in episodes) to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise; this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + '/log'
    model_dir = os.getcwd() + '/model_dir'
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progress bar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))
import torch


def transpose_list(mylist):
    return list(map(list, zip(*mylist)))


def transpose_to_tensorAsitis(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, input_list))


env_info = env.step(actions)[brain_name]  # send all actions to the environment
next_states = env_info.vector_observations
print(next_states)
s = transpose_list(next_states)
print(s)
p = np.concatenate((next_states[0], next_states[1]))
print(np.shape(p)[0])
s = transpose_to_tensorAsitis(next_states)
print("tensor", s)

# In[8]:

# main function that sets up environments
# and performs the training loop
#import envs
from buffer import ReplayBuffer
from maddpg import MADDPG
import torch
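# A quick worked example of `transpose_list`: it flips the outer two levels
# of a nested list, e.g. from env-major to agent-major layout.
example = [['e0_a0', 'e0_a1', 'e0_a2'],   # observations from env 0
           ['e1_a0', 'e1_a1', 'e1_a2']]   # observations from env 1
assert transpose_list(example) == [['e0_a0', 'e1_a0'],
                                   ['e0_a1', 'e1_a1'],
                                   ['e0_a2', 'e1_a2']]


# The scripts below also call `transpose_to_tensor`, which is not defined
# in this section; a sketch consistent with its usage (an assumption: it
# transposes like `transpose_list` and converts each row to a float tensor):
def transpose_to_tensor(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))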
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5

    # initialize environment
    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=SEED,
                                 num_agents=num_agents, benchmark=BENCHMARK)

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents, discount_factor=GAMMA, tau=TAU,
                    lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])

    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_163018\model_dir\episode-59994.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_211315\model_dir\episode-59994.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032621_054252\model_dir\episode-36000.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032821_102717\model_dir\episode-99000.pt'  # test1 6 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032921_160324\model_dir\episode-99000.pt'  # test2 6 agents pretrained
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\033021_203450\model_dir\episode-73002.pt'  # test2 6 agents pretrained
    # trained_checkpoint = "c4-a4-n01-a01-old-two-noskip/model_dir/episode-59999.pt"
    trained_checkpoint = "gat-huge1k/model_dir/episode-99000.pt"

    aux = torch.load(trained_checkpoint, map_location=torch.device('cpu'))
    for i in range(num_agents):
        # load the weights from file
        maddpg.maddpg_agent[i].actor.load_state_dict(aux[i]['actor_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(aux[i]['critic_params'])

    # Reset the environment
    all_obs = env.reset()
    # flip the first two indices
    obs_roll = np.rollaxis(all_obs, 1)
    obs = transpose_list(obs_roll)
    scores = 0
    t = 0

    while True:
        all_obs = env.reset()
        # flip the first two indices
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)
        scores = 0
        t = 0

        for _ in range(25):
            env.render('rgb_array')
            time.sleep(0.1)
            t += 1

            # select an action
            actions = maddpg.act(transpose_to_tensor(obs), noise=0.)
            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array, 1)

            # send all actions to the environment
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # update the score (for each agent)
            scores += np.sum(rewards)
            print('\r\n Rewards at step %i = %.3f' % (t, scores))

            # for displaying learned policies
            # time.sleep(0.1)
            # env.render()

            # roll over states to next time step
            obs = next_obs

            # print("Score: {}".format(scores))
            if np.any(dones):
                print('done')
                print('Next:')

    env.close()
def update(self, samples, agent_number, logger):
    """update the critics and actors of all the agents"""
    # `samples`: a list of batchsize, List(5,)
    # `states` & `next_states`: a list of batchsize, Array(2,24)
    # `actions`: a list of batchsize, Array(2,2)
    # `rewards` & `dones`: a list of batchsize, List(2,)
    states, actions, rewards, next_states, dones = zip(*samples)

    # -------------------------- preprocessing -------------------------- #
    # `states` & `next_states`: a list of size 2, Tensor(batchsize,24)
    states = transpose_to_tensor(states)
    next_states = transpose_to_tensor(next_states)

    # `states_full` & `next_states_full`: Tensor(batchsize,48)
    states_full = torch.cat(states, dim=1)
    next_states_full = torch.cat(next_states, dim=1)

    # `actions`: Tensor(batchsize,4)
    actions = transpose_to_tensor(actions)
    actions = torch.cat(actions, dim=1)

    # `dones` & `rewards`: a list of 2, Tensor(batchsize,)
    dones = transpose_to_tensor(transpose_list(zip(*dones)))
    rewards = transpose_to_tensor(transpose_list(zip(*rewards)))

    # -------------------------- update critic -------------------------- #
    agent = self.maddpg_agent[agent_number]
    agent.critic_optimizer.zero_grad()

    # critic loss = batch mean of (y - Q(s,a) from target network)^2
    # y = current reward + discount * Q(st+1,at+1) from target network
    target_actions = self.target_act(next_states)
    target_actions = torch.cat(target_actions, dim=-1)
    target_critic_input = torch.cat((next_states_full, target_actions),
                                    dim=1).to(device)
    with torch.no_grad():
        q_next = agent.target_critic(target_critic_input)
    y = rewards[agent_number].view(-1, 1) + \
        self.discount_factor * q_next * \
        (1 - dones[agent_number].view(-1, 1))

    critic_input = torch.cat((states_full, actions), dim=1).to(device)
    q = agent.critic(critic_input)

    huber_loss = torch.nn.SmoothL1Loss()
    critic_loss = huber_loss(q, y.detach())
    critic_loss.backward()
    #torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 0.5)
    agent.critic_optimizer.step()

    # -------------------------- update actor --------------------------- #
    agent.actor_optimizer.zero_grad()

    # make input to agent;
    # detach the other agents to save computation
    # (saves some time when computing the derivative)
    q_input = [
        self.maddpg_agent[i].actor(state) if i == agent_number
        else self.maddpg_agent[i].actor(state).detach()
        for i, state in enumerate(states)
    ]
    q_input = torch.cat(q_input, dim=1)

    # combine all the actions and observations for input to critic
    q_input2 = torch.cat((states_full, q_input), dim=1)

    # get the policy gradient
    actor_loss = -agent.critic(q_input2).mean()
    actor_loss.backward()
    #torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 0.5)
    agent.actor_optimizer.step()

    # for TensorBoard
    al = actor_loss.cpu().detach().item()
    cl = critic_loss.cpu().detach().item()
    logger.add_scalars('agent%i/losses' % agent_number,
                       {'critic loss': cl, 'actor_loss': al},
                       self.iter)
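# A sanity check of the TD target used above, on dummy shapes: with
# batchsize B, y = r + gamma * Q_target(s', a') * (1 - done), all viewed
# as (B, 1). The numbers here are illustrative only:
B = 4
rewards_a = torch.ones(B)          # rewards[agent_number], shape (B,)
q_next = torch.full((B, 1), 2.0)   # target critic output, shape (B, 1)
dones_a = torch.zeros(B)           # dones[agent_number], shape (B,)
y = rewards_a.view(-1, 1) + 0.95 * q_next * (1 - dones_a.view(-1, 1))
assert y.shape == (B, 1)
assert torch.allclose(y, torch.full((B, 1), 1.0 + 0.95 * 2.0))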
def update(self, samples, agent_number, logger):
    """update the critics and actors of all the agents"""
    # --- Experiences ------------------------------------------------#
    states = torch.from_numpy(
        np.stack(transpose_list(
            [e.state for e in samples if e is not None]))).float().to(self.device)
    actions = torch.from_numpy(
        np.stack(transpose_list(
            [e.action for e in samples if e is not None]))).float().to(self.device)
    # rewards = torch.from_numpy(np.vstack([max(e.reward) for e in samples if e is not None])).float().to(self.device).t()[0]  # use max reward
    rewards = torch.from_numpy(
        np.vstack([e.reward for e in samples
                   if e is not None])).float().to(self.device)
    next_states = torch.from_numpy(
        np.stack(transpose_list(
            [e.next_state for e in samples if e is not None]))).float().to(self.device)
    dones = torch.from_numpy(
        np.vstack([e.done for e in samples
                   if e is not None]).astype(np.uint8)).float().to(self.device)

    # --- Agent ------------------------------------------------------#
    agent = self.maddpg_agent[agent_number]

    # --- Update Critic ----------------------------------------------#
    agent.critic_optimizer.zero_grad()

    states_flat = tensor_flatten(states,
                                 self.state_size * self.num_agents)[0]
    actions_flat = tensor_flatten(actions,
                                  self.action_size * self.num_agents)[0]
    next_states_flat = tensor_flatten(next_states,
                                      self.state_size * self.num_agents)[0]
    target_next_actions_flat = torch.cat(self.target_actor(next_states),
                                         dim=1)

    with torch.no_grad():
        target_next_q = agent.target_critic(next_states_flat,
                                            target_next_actions_flat).t()[0]
    # target_q = rewards + self.discount_factor * target_next_q * (1 - dones[:, agent_number])  # use max reward
    target_q = rewards[:, agent_number] + \
        self.discount_factor * target_next_q * (1 - dones[:, agent_number])

    local_q = agent.critic(states_flat, actions_flat).t()[0]

    critic_loss = F.mse_loss(local_q, target_q)
    critic_loss.backward()
    torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1)
    agent.critic_optimizer.step()

    # --- Update Actor -----------------------------------------------#
    agent.actor_optimizer.zero_grad()

    local_actions_flat = torch.cat(self.actor(states), dim=1)
    actor_loss = -agent.critic(states_flat, local_actions_flat).mean()
    actor_loss.backward()
    torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1)
    agent.actor_optimizer.step()

    # --- Save Loss --------------------------------------------------#
    aloss = actor_loss.cpu().detach().item()
    closs = critic_loss.cpu().detach().item()
    logger.add_scalars('agent%i/losses' % agent_number,
                       {'critic loss': closs, 'actor_loss': aloss},
                       self.update_count)
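# `tensor_flatten` is referenced above but not defined in this section.
# Since `states` is stacked agent-major as (num_agents, batch, state_size),
# a sketch that matches the flat critic input and the `[0]` indexing above
# (the 1-tuple return is an assumption inferred from that indexing):
def tensor_flatten(tensor, flat_size):
    """Collapse agent-major (num_agents, batch, size) input into a
    (batch, num_agents * size) tensor, one row per batch sample."""
    return (tensor.transpose(0, 1).reshape(-1, flat_size),)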
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space:      [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2+3*2+3*2=14
    (2)   location coordinates of the target landmark
    (3*2) the three agents' positions w.r.t. the target landmark
    (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progress bar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`; each item in the list
        is another list of size two: first is env.observation_space:
        [Box(14,), Box(14,), Box(14,)], second is [Box(14,)], which is
        added to facilitate training https://goo.gl/Xtr6sF
        `obs` and `obs_full` are both lists of size `parallel_envs`;
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)]
        `obs_full` has the compounded observation space [Box(14,)]
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()
            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update the networks `parallel_envs`=4 times
        # after every `episode_per_update`=2*4 episodes
        if len(buffer) > batchsize and \
                episode % episode_per_update < parallel_envs:
            # update the local network for all agents;
            # `a_i` refers to the agent number
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir,
                                    'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
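# `maddpg.update_targets()` soft-updates every target network toward its
# online network. The standard Polyak averaging rule, as a sketch (`tau`
# is assumed to be a small constant such as 0.01-0.02):
def soft_update(target, source, tau):
    """target = tau * source + (1 - tau) * target, parameter by parameter."""
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)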
def sample(self, batchsize):
    """sample from the buffer"""
    samples = random.sample(self.deque, batchsize)
    # print("\n samples before tran in buffer size= {} * {} * {} ".format(len(samples), len(samples[0]), len(samples[0][0])))
    # transpose list of lists
    return transpose_list(samples)
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how often (in episodes) to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise; this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progress bar
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs,
                         parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()
            # transpose the list of lists: flip the first two indices,
            # because input to step requires the first index to correspond
            # to the number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and \
                episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir,
                                    'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how often (in episodes) to save policy and gif
    save_interval = 5000
    # global time step counter
    t = 0

    # amplitude of OU noise; this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # a vectorized wrapper around all the parallel environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic.
    # this creates a list of models, each element in the list referring to
    # one agent in the simulation: [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains that agent's actor and critic models,
    # e.g. agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progress bar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by the number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # one reward per agent per parallel environment
        reward_this_episode = np.zeros((parallel_envs, 3))

        # obs is the observation state space of all three agents in the
        # 4 parallel envs; for the Physical Deception environment with
        # three agents it has dimension 4x3x14.
        # obs_full is the world state irrespective of the agents, and its
        # dimension is 4x14.
        # all_obs = array(4 environments, 2 elements)
        #   element 0: a list of 3 arrays containing the state for each
        #              agent, each state of size 14
        #   element 1: the global state (14 elements) from the perspective
        #              of the target/green agent in that environment
        all_obs = env.reset()

        # obs: a list with 1 element per environment; each element is a
        #      list of 3 arrays, one state per agent in that environment.
        # obs_full: the god's-eye view of each environment. It is a list
        #      with 1 element per environment; each element is an array of
        #      14 values, the global state of that environment.
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        # we finish the episode before sampling the buffer for training
        for episode_t in range(episode_length):
            # t jumps forward in multiples of the environment count
            t += parallel_envs

            # explore = only explore for a certain number of episodes.
            # the action input needs to be transposed:
            # transpose_to_tensor(obs) changes the data to each agent's
            # point of view. since we have 4 environments, there are 4 of
            # agent 1, 4 of agent 2, and 4 of agent 3; each agent has a
            # state in each environment, so the states across the 4
            # environments for agent 1 form a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per
            # agent; element 1 is a 4x14 array of that agent's
            # observations across the 4 environments.
            # maddpg.act loops over the elements of obs and passes each to
            # the corresponding agent's actor model to generate an action.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # there are 4 actions per agent and 3 agents, 12 in total;
            # each action has 2 elements, the forces in the x and y
            # directions. actions_array is a tensor of shape
            # (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices,
            # because input to step requires the first index to correspond
            # to the number of parallel agents.
            # actions_for_env has shape (4 envs, 3 agents, 2 actions)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame.
            # obs is the observation state space of all three agents in
            # the 4 parallel envs; for the Physical Deception environment
            # with three agents it has dimension 4x3x14.
            # obs_full is the world state irrespective of the agents, and
            # its dimension is 4x14.
            # to gain more understanding, please see the code in the
            # multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and \
                episode % episode_per_update < parallel_envs:
            # although samples are drawn randomly, each sample contains the
            # data of all 3 agents, so we know which rewards and actions
            # belong to which agent.
            # samples is a list of 7 elements: obs, obs_full, action,
            # reward, next_obs, next_obs_full, done.
            # each element, say samples[0], is a list of 3 elements, one
            # per agent, holding that agent's value; for obs this is a
            # vector of 14 values. asking for 2 samples therefore returns
            # 2 samples that each contain all 3 agents' states and rewards.
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir,
                                    'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
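# `seeding()` is called by every `main` in this section but never shown. A
# minimal sketch of what it presumably does, relying on the module-level
# random/numpy/torch imports used above (the default seed value is an
# assumption):
def seeding(seed=1):
    """Seed the python, numpy and torch RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)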
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 60000
    episode_length = 35
    # how often (in episodes) to save policy and gif
    save_interval = 1000
    t = 0

    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise; this slowly decreases to 0
    noise = 0.5  # was 2; try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999

    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs

    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents, discount_factor=GAMMA, tau=TAU,
                    lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progress bar
    import progressbar as pb
    widget = [
        '\repisode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()
        # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        #     frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise*noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()
            # transpose the list of lists: flip the first two indices,
            # because input to step requires the first index to correspond
            # to the number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step: step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)
            # ADD FOR WITHOUT PARALLEL ENV
            next_obs, rewards, dones, info = env.step(actions_for_env)
            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every episode_per_update
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                # soft update the target networks towards the actual networks
                maddpg.update_targets()

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])
            # agent0_reward.append(reward_this_episode[i, 0])
            # agent1_reward.append(reward_this_episode[i, 1])
            # agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
            # agent0_reward = []
            # agent1_reward = []
            # agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(5):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir,
                                    'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
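# The scripts above scale exploration by an "amplitude of OU noise". A
# minimal Ornstein-Uhlenbeck process sketch (the mu/theta/sigma defaults
# are assumptions; the actual OUNoise class is not shown in this section):
class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def noise(self):
        """x += theta * (mu - x) + sigma * N(0, 1); mean-reverting noise."""
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(len(self.state))
        self.state += dx
        return self.state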