from collections import deque

import numpy as np
import torch


def training(max_episodes=3000, episode_length=1000, random_seed=4):
    # Create the environment and a centralized-critic MADDPG learner.
    env, brain_name, num_agents, action_size, state_size = create_env()
    maddpg = MADDPG(num_agents, state_size, action_size,
                    num_agents * state_size, num_agents * action_size,
                    discount_factor=0.99, tau=0.001, random_seed=random_seed)

    # Per-agent episode rewards plus the max-over-agents score used for "solved" tracking.
    agent_reward = [[] for _ in range(num_agents)]
    agent_reward_deque = [deque(maxlen=100) for _ in range(num_agents)]
    score_full = []
    score_deque = deque(maxlen=100)

    for episode in range(1, max_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations
        obs_full = obs
        episode_scores = np.zeros(num_agents)

        for episode_t in range(episode_length):
            actions = maddpg.act(obs)
            env_info = env.step(actions)[brain_name]
            next_obs = env_info.vector_observations
            next_obs_full = next_obs
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_scores += rewards
            maddpg.step(obs, obs_full, actions, rewards,
                        next_obs, next_obs_full, dones, episode_t)
            obs = next_obs
            obs_full = next_obs_full
            if np.any(dones):
                break

        for i in range(num_agents):
            agent_reward[i].append(episode_scores[i])
            agent_reward_deque[i].append(episode_scores[i])
        score_full.append(max(episode_scores))
        score_deque.append(max(episode_scores))

        if episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                episode, np.mean(score_deque)))

        if np.mean(score_deque) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(episode, np.mean(score_deque)))
            # for i in range(num_agents):
            #     torch.save(agents[i].actor_local.state_dict(), 'checkpoint_actor' + str(i) + '.pth')
            #     torch.save(agents[i].critic_local.state_dict(), 'checkpoint_critic' + str(i) + '.pth')
            torch.save(maddpg.critic.state_dict(),
                       'checkpoint_centralized_critic.pth')
            for i in range(num_agents):
                torch.save(maddpg.actors[i].actor_local.state_dict(),
                           'checkpoint_actor' + str(i) + '.pth')
            break

    env.close()
    return maddpg, agent_reward, score_full, random_seed
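

# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of how training() above might be driven and its max-over-agents
# episode scores plotted; it assumes create_env() and MADDPG are importable from the
# accompanying project modules and that matplotlib is available.
import matplotlib.pyplot as plt

if __name__ == '__main__':
    maddpg, agent_reward, score_full, seed = training(max_episodes=3000)

    # score_full holds the max score over the agents for each episode.
    plt.plot(np.arange(len(score_full)), score_full)
    plt.xlabel('Episode #')
    plt.ylabel('Max agent score')
    plt.show()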
class MADDPG_Runner():

    def __init__(self, env, config):
        """Wrap a Unity environment and a MADDPG agent and run the training loop."""
        super(MADDPG_Runner, self).__init__()
        self.agents = []
        self.env = env

        # get the default brain
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]

        # reset the environment
        env_info = self.env.reset(train_mode=True)[self.brain_name]

        # number of agents
        self.num_agents = len(env_info.agents)
        print('Number of agents:', self.num_agents)

        # size of each action
        self.action_size = self.brain.vector_action_space_size
        print('Size of each action:', self.action_size)

        # examine the state space
        states = env_info.vector_observations
        self.state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], self.state_size))
        print('The state for the first agent looks like: \n{}\n'.format(states[0]))

        self.config(config)

    def config(self, config):
        print("Prepare for new configuration. Make new MADDPG agent.")
        self.n_episodes = config.get("n_episodes", N_EPISODES)
        self.solved_score = config.get("solved_score", SOLVED_SCORE)
        self.conseq_episodes = config.get("conseq_episodes", CONSEC_EPISODES)
        self.seed = config.get("seed", 1)
        self.MADDPG_obj = MADDPG(state_size=self.state_size,
                                 action_size=self.action_size,
                                 num_agents=self.num_agents,
                                 config=config)

    def seeding(self):
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)

    def reset_agents(self):
        for agent in self.agents:
            agent.reset()

    def learning_step(self, states, actions, rewards, next_states, done):
        print("learning step", states, next_states, rewards, done, actions)
        for i, agent in enumerate(self.agents):
            agent.step(states, actions, rewards, next_states, done, i)

    ## Training loop
    def training_loop(self, t_max=1000, stop_when_done=True):
        # initialize scoring
        scores_window = deque(maxlen=CONSEC_EPISODES)
        moving_average = []
        scores_all = []
        best_score = -np.inf
        best_episode = 0
        already_solved = False

        self.seeding()
        scores_deque = deque(maxlen=100)
        scores_list = []
        scores_list_100_avg = []

        for i_episode in range(1, self.n_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]  # reset the environment
            states = env_info.vector_observations  # get the current states (for all agents)
            scores = np.zeros(self.num_agents)     # initialize the score (for each agent in MADDPG)
            num_steps = 0
            actions = []

            for _ in range(t_max):
                actions = self.MADDPG_obj.act(states, i_episode, add_noise=ADD_NOISE)
                env_info = self.env.step(actions)[self.brain_name]  # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent in MADDPG)
                rewards = env_info.rewards                   # get rewards (for each agent in MADDPG)
                dones = env_info.local_done                  # see if episode finished
                scores += rewards                            # update the score (for each agent in MADDPG)
                self.MADDPG_obj.step(i_episode, states, actions, rewards,
                                     next_states, dones)     # train the MADDPG_obj
                states = next_states                         # roll over states to next time step
                num_steps += 1
                if np.any(dones):  # exit loop if episode finished
                    break
            # print('Total score (averaged over agents) this episode: {}'.format(np.mean(score)))

            scores_deque.append(np.max(scores))
            scores_list.append(np.max(scores))
            scores_list_100_avg.append(np.mean(scores_deque))

            # print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {}'.format(i_episode, np.mean(scores_deque), score), end="")
            if i_episode % PRINT_EVERY == 0:
                print('Episode {}\tAverage Score: {:.2f}\tCurrent Score: {}'.format(
                    i_episode, np.mean(scores_deque), np.max(scores)))
                print('Noise Scaling: {}, Memory size: {} and Num Steps: {}'.format(
                    self.MADDPG_obj.maddpg_agents[0].noise_scale,
                    len(self.MADDPG_obj.memory), num_steps))
                # print("last 10", scores_list[-10:])
                # print("last actions", actions)

            if i_episode % 500 == 0:
                self.MADDPG_obj.save_maddpg()
                print('Saved Model: Episode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)))

            if np.mean(scores_deque) > self.solved_score and len(scores_deque) >= 100:
                self.MADDPG_obj.save_maddpg()
                print('Goal reached. Saved Model: Episode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)))
                if stop_when_done:
                    break

        return scores_list, scores_list_100_avg, i_episode
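

# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of how MADDPG_Runner might be instantiated and run. The Tennis
# executable path and the config values below are assumptions; the config keys
# mirror the defaults read in MADDPG_Runner.config().
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Tennis.app")  # adjust to your local build
config = {
    "n_episodes": 5000,
    "solved_score": 0.5,
    "conseq_episodes": 100,
    "seed": 1,
}
runner = MADDPG_Runner(env, config)
scores_list, scores_list_100_avg, last_episode = runner.training_loop(t_max=1000)
env.close()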
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Train agents function ########################################################
from maddpg_agent import MADDPG

agents = MADDPG(num_agents=num_agents, state_size=state_size,
                action_size=action_size, random_seed=2)


def train(n_episodes=100, max_t=1000):
    """Multi-Agent Deep Deterministic Policy Gradient.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores_window = deque(maxlen=100)  # last 100 scores
    scores_output = []
    for i_episode in range(1, n_episodes + 1):
Weights might look like "weightname-1800.data"; delete just the "-1800" part.
Then set testing to True.
"""
save_dir = "saves"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

agent1_ddpg = MADDPG('agent1')
agent1_ddpg_target = MADDPG('agent1_target')

agent2_ddpg = MADDPG('agent2')
agent2_ddpg_target = MADDPG('agent2_target')

# 3rd agent (unused)
# agent3_ddpg = MADDPG('agent3')
# agent3_ddpg_target = MADDPG('agent3_target')

# saver = tf.train.Saver()

agent1_actor_target_init, agent1_actor_target_update = create_init_update(
    'agent1_actor', 'agent1_target_actor')
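

# --- Assumed helper (not shown in the original code) ---
# create_init_update() is called above but not defined in this excerpt. A minimal
# TF1-style sketch of such a helper is given below; the soft-update rate `tau` and
# the variable-scope lookup are assumptions about how the helper might work.
import tensorflow as tf


def create_init_update_sketch(online_scope, target_scope, tau=0.99):
    # Collect the variables of the online and target networks by scope name.
    online_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=online_scope)
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)

    # Hard copy: ops that initialize the target network to the online weights.
    target_init = [tf.assign(t, o) for o, t in zip(online_vars, target_vars)]
    # Soft update: ops that move the target a small step toward the online weights.
    target_update = [tf.assign(t, tau * t + (1.0 - tau) * o)
                     for o, t in zip(online_vars, target_vars)]
    return target_init, target_update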
            # Tail of maddpg_train(): save each agent's actor once the target mean score is reached.
            for i, agent in enumerate(maddpg.agents):
                torch.save(agent.actor_local.state_dict(),
                           'Z:/{:.2f}_actor_{}_checkpoint.pth'.format(mean_score, i))
            break  # or not and just keep on keepin on
    return scores


brain_name, env, env_info, state, state_size, action_size = new_unity_environment(
    train_mode=True)
print(brain_name)
print(env)
print(env_info)
print(state)
print(state_size)
print(action_size)

maddpg = MADDPG(state_size, action_size, 1337)
scores = maddpg_train(maddpg, env, brain_name, state_size, train_mode=True)
env.close()

# plot the scores after training to a 100 episode average score of 30
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
import os
from collections import deque

import numpy as np
import torch


def train():
    # config parameters
    model_dir = '/home/shijiliu/self-learning/reinforcement-learning/deep-reinforcement-learning/p3_collab-compet/draft_2/'
    number_of_episodes = 10000
    episode_length = 80
    batchsize = 128
    t = 0
    action_noise_coef = 10.0
    param_noise_coef = 0.0
    action_noise_reduction = 0.9999
    param_noise_reduction = 0.9999
    episode_per_update = 2

    # create env, get essential env info
    env, brain_name, num_agents, action_size, state_size = create_env()
    buffer = PrioritizedReplayMemory(1000 * episode_length, alpha=0.5, beta_start=0.4)

    # initialize policy and critic
    maddpg = MADDPG(num_agents, state_size, action_size,
                    num_agents * state_size, num_agents * action_size,
                    discount_factor=0.99, tau=0.001)
    agent_reward = [[] for _ in range(num_agents)]
    score_full = []
    score_deque = deque(maxlen=100)

    # training loop
    for episode in range(number_of_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations
        obs_full = obs
        episode_scores = np.zeros(num_agents)

        for episode_t in range(episode_length):
            actions = maddpg.act(obs, action_noise_coef, param_noise_coef)
            action_noise_coef *= action_noise_reduction
            param_noise_coef *= param_noise_reduction

            # process the output action to interact with the environment
            action_np = [a.detach().cpu().numpy() for a in actions]

            # step the environment for 1 step
            env_info = env.step(action_np)[brain_name]
            next_obs = env_info.vector_observations
            next_obs_full = next_obs
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_scores += rewards

            # add data to buffer
            transition = (obs, obs_full, actions, rewards, next_obs, next_obs_full, dones)
            buffer.push(transition)

            obs = next_obs
            obs_full = next_obs_full
            if np.any(dones):
                break

        # update the networks once after every episode_per_update
        if buffer.storage_size() > batchsize and episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples, _, _ = buffer.sample(batchsize)
                # print(len(samples))
                ordered_samples = zip(*samples)
                maddpg.update(ordered_samples, a_i)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(num_agents):
            agent_reward[i].append(episode_scores[i])
        score_full.append(max(episode_scores))
        score_deque.append(max(episode_scores))

        if episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score_deque)))

        if np.mean(score_deque) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                episode, np.mean(score_deque)))

            # save models
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                             'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                             'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                             'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
            break

    env.close()
    return maddpg, agent_reward, score_full
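

# --- Illustrative restore (not part of the original code) ---
# A minimal sketch of reloading the per-agent checkpoint written by train() above.
# It assumes a MADDPG object with the same maddpg_agent layout and the same model_dir;
# the episode number in the file name is whatever episode was saved.
def load_checkpoint_sketch(maddpg, model_dir, episode):
    save_dict_list = torch.load(os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
    for i, save_dict in enumerate(save_dict_list):
        maddpg.maddpg_agent[i].actor.load_state_dict(save_dict['actor_params'])
        maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(save_dict['critic_params'])
        maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(save_dict['critic_optim_params'])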
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# get the number of agents
num_agents = len(env_info.agents)

# get the action size
action_size = brain.vector_action_space_size

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]

agent = MADDPG(state_size=state_size, action_size=action_size,
               num_agents=num_agents, seed=42)
# agent.load_weights()

episodes = 10000   # Number of episodes
max_time = 1000    # Max number of time steps per episode
max_score = 0.6    # Average score to beat over a length-100 window

# Score lists
scores_deque = deque(maxlen=100)
all_scores = []
all_scores_mean = []
all_scores_std = []

# Main training loop
for ep in range(0, episodes):