class CERL_Agent:
    """Main CERL agent: a neuroevolution population plus a portfolio of
    policy-gradient learners that share one replay buffer, with rollouts
    executed by persistent worker processes.

    Parameters:
        args (object): Parameter object with all hyperparameters
            (pop_size, rollout_size, state_dim, action_dim, batch_size, ...)
    """

    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)  # neuroevolution (selection/crossover/mutation) engine

        # MP TOOLS: Manager lists are shared across worker processes
        self.manager = Manager()

        # Genealogy tool tracks lineage ids (wwid) of evolved networks
        self.genealogy = Genealogy()

        # Initialize evolutionary population (shared so rollout workers see updates)
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
            else:
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

        # Container for the best policy ever seen (wwid -1 marks "not from the population")
        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1)

        # Turn off gradients and put in eval mode (population nets only do rollouts)
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        # Init replay BUFFER shared by all learners
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # Initialize portfolio of policy-gradient learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)

        # One rollout (CPU) copy per learner, synced from the learner each generation
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
            else:
                self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        # Initialize shared data bucket (workers push experience tuples here)
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population rollout workers (one process per population member)
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker, args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO)) for id in range(args.pop_size)]
        for worker in self.evo_workers: worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]  # True => worker is idle / ready for a new task

        # Learner rollout workers (noisy exploration rollouts that feed the buffer)
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker, args=(id, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO)) for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket holds the current champion being evaluated
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        # TEST_SIZE test workers (no data collection: data_bucket is None)
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [Process(target=rollout_worker, args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO)) for id in range(TEST_SIZE)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution):
        # allocation[rollout_id] = learner_id that this rollout worker serves
        self.allocation = []
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)

        # Trackers
        # NOTE(review): best_score starts at 0.0, so in environments with
        # all-negative returns the best policy is never updated; the sibling
        # CERL_Agent variant in this module uses -np.inf — confirm intent.
        self.best_score = 0.0
        self.gen_frames = 0    # frames collected since the last gradient phase
        self.total_frames = 0  # frames collected over the whole run
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    def train(self, gen, frame_tracker):
        """Run one generation: launch rollouts, apply policy-gradient updates,
        join rollout results, and perform the neuroevolution step.

        Parameters:
            gen (int): Current generation (epoch) of training
            frame_tracker (object): Tracker updated with the periodic test score

        Returns:
            tuple: (max_fit, champ_len, all_fitness, all_eplens, test_mean,
                test_std, champ_wwid); test_mean/test_std are None on
                generations without a test rollout.
        """
        ################ START ROLLOUTS ##############
        # Start Evolution rollouts (only for idle workers)
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        # Sync each learner's actor to its CPU rollout copy so workers are up to date
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            # FIX: guard the move back to GPU — the unconditional .cuda() call
            # crashed on CPU-only hosts (the sibling CERL_Agent variant guards this too)
            if torch.cuda.is_available():
                learner.algo.actor.cuda()

        # Start Learner rollouts: allocation maps rollout worker -> learner id
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        # Start Test rollouts every 5 generations
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up one update thread per learner; gradient steps scale with
            # the number of frames collected this generation
            threads = [threading.Thread(target=learner.update_parameters, args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size, int(self.gen_frames * self.args.gradperstep))) for learner in self.portfolio]

            # Start threads
            for thread in threads: thread.start()

            # Join threads
            for thread in threads: thread.join()

            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        # Poll evo result pipes until at least asynch_frac of the population reported
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (e.g. 50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        # Blocking recv: every learner rollout must report before we continue
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(self.pop[champ_index].state_dict(), self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score", '%.2f' % max(all_fitness))

        else:  # Run PG in isolation: test the first learner instead of a champion
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step;
        # every 5 gens the learners (rollout_bucket) are migrated into the population
        if not ISOLATE_PG:
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket)
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB (every generation)
        if gen % 1 == 0:
            self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            # ISOLATE_PG: only learner rollouts ran; report the last one's stats
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
class CERL_Agent:
    """CERL agent variant for adversarial self-play ('dis' / 'TD3_tennis'):
    adds a shared blue DQN trainer, an average policy, and simulated-annealing
    search over the learner portfolio.

    NOTE(review): this class redefines ``CERL_Agent`` and shadows the earlier
    definition in this module.

    Parameters:
        args (object): Parameter object with all hyperparameters; for
            'dis'/'TD3_tennis' it must also carry ``blue_trainer``.
    """

    def __init__(self, args):  # need to initialize rollout_workers to have blue agent
        self.args = args
        self.evolver = SSNE(self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()
        self.mutate_algos = [Mutation_Add(self), Mutation_Delete(self), Mutation_Exchange(self)]  # store all the mutate algorithm objects

        # Genealogy tool
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # Simulated-annealing state (used by simulated_annealing())
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
            elif ALGO == 'TD3':
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid, ALGO))  # use ALGO to distinguish different net architecture
            # FIX: was `elif ALGO == 'dis' or 'TD3_tennis':` — the non-empty
            # string 'TD3_tennis' is always truthy, so the branch caught every
            # ALGO value and the invalid-algorithm assert below was unreachable
            elif ALGO in ('dis', 'TD3_tennis'):
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                assert False, "invalid algorithm type"

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)

        if ALGO == 'dis':
            # Average policy distilled from the population (supervised, 10 iterations per update)
            self.average_policy = AverageActor(args.state_dim, args.action_dim, -2, ALGO, self.pop, self.replay_buffer, args.buffer_gpu, args.batch_size, iterations=10)
            self.average_policy.share_memory()

        self.best_policy.share_memory()  # added by macheng, share the best policy across processes (used as internal belief update models for blue)

        # now we assign shared blue_trainer, we should train this agent such that the roll_out workers are also up to date
        # should make sure that self.best_policy (emergent learner) is also shared
        # FIX: same always-truthy `or 'TD3_tennis'` bug as above — previously
        # this assert fired for every ALGO, not just the self-play variants
        if ALGO in ('dis', 'TD3_tennis'):
            assert hasattr(args, "blue_trainer"), "must have blue_agent trainer to intialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        else:
            trainers = [args.blue_trainer, None] if ALGO == 'TD3_tennis' else []

        self.trainers = trainers
        # NOTE(review): args.blue_trainer is read unconditionally here, so even
        # non-self-play ALGOs still require the attribute — confirm intent
        self.blue_dqn = args.blue_trainer

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
        self.complement_portfolio = []  # complement of the portfolio; whatever is not in the portfolio is stored here
        # macheng: total_rollout_bucket is the whole set of rollout models;
        # rollout_bucket dynamically resizes with the portfolio (for SA)
        self.total_rollout_bucket = self.manager.list()
        self.rollout_bucket = self.total_rollout_bucket
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
            else:
                self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1, ALGO))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population rollout workers (type tag 0 = evo)
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker, args=(id, 0, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO, self.trainers)) for id in range(args.pop_size)]
        for worker in self.evo_workers: worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers (type tag 1 = learner)
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker, args=(id, 1, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers)) for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1, ALGO))

        # TEST_SIZE test workers (type tag 2 = test; no data collection)
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [Process(target=rollout_worker, args=(id, 2, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers)) for id in range(TEST_SIZE)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    # trainers contains the blue_dqn to be trained, and the red model used for
    # belief update; red_actor is the actual red agent trained against
    # id is the actual red agent id

    def _update_SA_temperature(self):
        # Exponential cooling schedule, floored at T_min
        self.T = max(self.T * self.decay_rate, self.T_min)

    def _get_accept_rate(self):
        # Metropolis acceptance based on the last two metrics; requires
        # len(self.metrics) >= 2 (guaranteed by the caller's last_portfolio check)
        if RANDOM_WALK:
            return 1.0
        else:
            if self.metrics[-1] > self.metrics[-2]:
                return 1.0
            else:
                return np.exp((self.metrics[-1] - self.metrics[-2]) / self.T)

    def _mutate(self):
        # Retry random mutations until one succeeds
        while True:
            mutate_algo_index = random.choice(range(3))
            if self._try_mutate(mutate_algo_index):
                return

    def _try_mutate(self, algo_index):
        # 0 for add, 1 for delete, 2 for exchange
        return self.mutate_algos[algo_index].try_mutate()

    def simulated_annealing(self, metric):  # take in the current metric
        """One simulated-annealing step over the learner portfolio: maybe
        revert the last mutation, then apply a new one."""
        self.metrics.append(metric)
        if self.last_portfolio:  # has last_portfolio
            accept_rate = self._get_accept_rate()  # based on self.metrics[-2:]
            self._update_SA_temperature()
            if np.random.random() > accept_rate:  # reject: roll back to the previous portfolio
                self.portfolio = self.last_portfolio
                self.complement_portfolio = self.last_complement_portfolio

        self.last_portfolio = copy.copy(self.portfolio)  # shallow copies kept for possible rollback
        self.last_complement_portfolio = copy.copy(self.complement_portfolio)
        self._mutate()  # perturb the portfolio
        # update rollout_bucket size; only the first len(self.portfolio) rollout buckets are visible
        self.update_rollout_bucket()
        # update allocation, to be compatible with the current portfolio
        self.update_allocation()

    def update_rollout_bucket(self):
        # Expose only as many rollout models as there are learners in the portfolio
        self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)]

    def train_blue_dqn(self, trainers, env_name, gen, ALGO='dis', pomdp_adv=False):
        """Train the blue DQN in-process for NUM_EPISODE episodes, sampling red
        opponents from the population (rollout and training done together).

        Returns:
            tuple: (average_reward, average_red_reward, average_actual_blue_reward)
        """
        NUM_EPISODE = 100  # train 100 episodes for the blue to converge to the new best response to red
        EPS_START = max(1.0 * 0.5**(gen - 10), 0.15) if gen >= 10 else 1.0  # initial epsilon
        EPS_END = 0.05
        EPS_DECAY = 0.995

        if ALGO == 'dis':
            # make env with blue and red policy agent inside
            assert trainers is not None
            dis_env = make_self_play_env(seed=np.random.choice(np.array(range(len(self.pop)))), return_policy_agent=False, trainers=trainers)[0]  # trainers: first is the shared DQN agent, second is the best red
            env = EnvironmentWrapper(env_name, ALGO, dis_env, 0)  # the "0" is the index for training blue agent
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            tennis_env = make_tennis_env.TennisEnvFactory(seed=np.random.choice(np.array(range(len(self.pop)))), no_graphics=no_graphics, pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
        else:
            env = EnvironmentWrapper(env_name, ALGO)

        blue_dqn = trainers[0]
        average_reward = 0
        eps = EPS_START

        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0

        for it in range(NUM_EPISODE):
            if not pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                id = np.random.choice(np.array(range(len(self.pop))))
                red_actor = self.pop[id]
                env.set_TD3_actor(red_actor)

            fitness = 0.0  # here fitness is simply the episode reward
            total_frame = 0
            state = env.reset()
            env.randomize_neu_adv()

            if pomdp_adv:
                env.try_set_pomdp_adv()  # set opponent to pomdp adv if opponent is adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(copy.deepcopy(action), use_actual_reward=DRQN)
                # after calling env.step, evaluator initialized later does not work
                # should be something wrong with the internal red model?
                blue_dqn.step(state, action, reward, next_state, done)
                if render_flag and self.args.render:
                    env.render()
                state = next_state
                fitness += reward
                total_frame += 1

                # DONE FLAG IS Received
                if done:
                    # Only count red/blue rewards for episodes where they exist
                    average_red_reward += env.get_red_reward() if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward() if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward() is not None else 0
                    if render_flag:
                        env.env.close()
                    break

            average_reward += fitness
            eps = max(EPS_END, EPS_DECAY * eps)

        if gen >= 10 and gen % 5 == 0:
            blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' + str(gen) + '.pth')

        average_reward /= NUM_EPISODE
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward

    def evaluate_training_fixed_blue(self):  # this evaluates against the training opponent (red pop)
        self.evaluator.pomdp_adv = False
        return self.evaluator.evaluate_fixed_agents(self.trainers[0], self.trainers[1], self.pop)

    def train(self, gen, frame_tracker):
        """Run one generation: launch rollouts, apply policy-gradient updates,
        join rollout results, and perform the neuroevolution step.

        Parameters:
            gen (int): Current generation (epoch) of training
            frame_tracker (object): Tracker updated with the periodic test score

        Returns:
            tuple: (max_fit, champ_len, all_fitness, all_eplens, test_mean,
                test_std, champ_wwid)
        """
        ################ START ROLLOUTS ##############
        # Start Evolution rollouts (only for idle workers)
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync all learner actors to their CPU (rollout) copies
        # (update rollout parameter using the learner parameter, such that rollout worker is up to date)
        for i, learner in enumerate(self.portfolio):  # number of learners
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)  # rollout bucket is now synchronized with learner to perform rollout for learner actors
            if torch.cuda.is_available():
                learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):  # number of rollout_size
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send((learner_id, gen))  # allocation records the learner id each bucket should run
                self.roll_flag[rollout_id] = False

        # Start Test rollouts every 5 generations
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop
        if self.replay_buffer.__len__() > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up one update thread per learner
            threads = [threading.Thread(target=learner.update_parameters, args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size, int(self.gen_frames * self.args.gradperstep))) for learner in self.portfolio]  # macheng: do we want to train all the learners?

            # Start threads
            for thread in threads: thread.start()

            # Join threads
            for thread in threads: thread.join()

            # Now update average_policy
            if ALGO == 'dis':
                self.average_policy.update()  # update the average_policy parameter with supervised learning

            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score:
                self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # ms: best policy is always up to date, so here the best learner is saved
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(self.pop[champ_index].state_dict(), self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score", '%.2f' % max(all_fitness))

        else:  # Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  # seems pop_size and rollout_size must be 10, otherwise this will produce error
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket)  # this method also copies learner into the evolver
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB (every generation)
        if gen % 1 == 0:
            self.update_allocation()

        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid

    def update_allocation(self):
        # Redistribute rollout workers across learners via UCB scores
        self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

    def sim_and_eval_POMDP(self):
        """Alternate training the blue DQN against POMDP adversaries with
        periodic evaluation, indefinitely."""
        # evaluator must be created before train_dqn
        self.evaluator = Evaluator(self, 5, self.trainers, pomdp_adv=True)
        for gen in range(1000000):
            print('gen=', gen)
            # FIX: previously called through the module-level global `agent`
            # instead of `self`, which broke when no such global existed
            blue_score, red_score, actual_blue_score = self.train_blue_dqn(self.trainers, ENV_NAME, gen, ALGO='dis', pomdp_adv=True)
            print('Env', ENV_NAME, 'Gen', gen, ", Training average: Blue agent score: ", blue_score, " Red score: ", red_score, " Actual blue score: ", actual_blue_score)
            blue_score, red_score, actual_blue_score = self.evaluator.evaluate()
            print("Evaluation result: Blue agent score: ", blue_score, " Red score: ", red_score, " Actual blue score: ", actual_blue_score)
class DDQN_Trainer:
    """Trainer for a single DDQN learner: parallel rollout/test workers feed a
    shared replay buffer; gradient updates run in the main process.

    Parameters:
        args (object): Parameter class with all the parameters
        model_constructor (object): Factory providing make_model('DDQN')
        env_constructor (object): Factory for environments; also supplies
            dummy_env.test_size (number of test workers)
    """

    def __init__(self, args, model_constructor, env_constructor):
        self.args = args

        # MP TOOLS
        self.manager = Manager()

        # Algo
        self.algo = DDQN(args, model_constructor, args.gamma)

        # Save best policy
        self.best_policy = model_constructor.make_model('DDQN')

        # Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        # Initialize Rollout Bucket (shared copy of the actor used by workers)
        self.rollout_bucket = self.manager.list()
        self.rollout_bucket.append(model_constructor.make_model('DDQN'))

        ############## MULTIPROCESSING TOOLS ###################
        # Learner rollout workers (tag 'pg': collect experience into data_bucket)
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker, args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0], self.data_bucket, self.rollout_bucket, env_constructor)) for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]  # True => worker idle

        # Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model('DDQN'))

        # Test workers (tag 'test': evaluation only, no data collection)
        self.test_task_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_result_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_workers = [Process(target=rollout_worker, args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0], None, self.test_bucket, env_constructor)) for id in range(env_constructor.dummy_env.test_size)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        # Trackers
        # NOTE(review): best_score is printed as 'max_ever' in train() but is
        # never updated within this class — confirm it is set elsewhere
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.test_trace = []
        self.rollout_fits_trace = []

        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0

    def forward_epoch(self, epoch, tracker):
        """Run one epoch: launch rollouts and tests, do gradient updates, then
        join results.

        Parameters:
            epoch (int): Current epoch of training
            tracker (object): Tracker updated with [test_mean, r1_reward]

        Returns:
            tuple: (test_mean, test_std) — None, None if no test ran this epoch
        """
        ################ START ROLLOUTS ##############
        # Sync the learner's actor to the CPU rollout and test copies
        self.algo.actor.cpu()
        utils.hard_update(self.rollout_bucket[0], self.algo.actor)
        utils.hard_update(self.test_bucket[0], self.algo.actor)
        # FIX: guard the move back to GPU — the unconditional .cuda() crashed
        # on CPU-only hosts, while the sampling path below already guards
        if torch.cuda.is_available():
            self.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id in range(self.args.rollout_size):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(0)
                self.roll_flag[rollout_id] = False

        # Start Test rollouts (every epoch)
        if epoch % 1 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  ###BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling
            # One gradient step per frame collected since the last update
            for _ in range(self.gen_frames):
                s, ns, a, r, done = self.replay_buffer.sample(self.args.batch_size)
                if torch.cuda.is_available():
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()
                r = r * self.args.reward_scaling
                self.algo.update_parameters(s, ns, a, r, done)
            self.gen_frames = 0

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                self.rollout_fits_trace.append(fitness)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                self.roll_flag[i] = True

            # Refresh buffer (housekeeping tasks - pruning to keep under capacity);
            # only needed when rollout workers collected new data
            self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            eplens = []
            r1_reward = []
            num_footsteps = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))
            self.test_trace.append(test_mean)
            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))

            tracker.update([test_mean, self.r1_reward], self.total_frames)
        else:
            test_mean, test_std = None, None

        return test_mean, test_std

    def train(self, frame_limit):
        """Run epochs until total_frames exceeds frame_limit, logging progress.

        Parameters:
            frame_limit (int): Stop once this many environment frames are used
        """
        # Define Tracker class to track scores
        test_tracker = utils.Tracker(self.args.savefolder, ['score_' + self.args.savetag, 'r1_' + self.args.savetag], '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            # NOTE(review): the log below indexes test_trace[-1] and
            # rollout_fits_trace[-1]; it assumes both have filled by the first
            # print (tests run every epoch; rollout_size must be > 0) — confirm
            test_mean, test_std = self.forward_epoch(gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames,
                  'max_ever:', '%.2f' % self.best_score,
                  ' Avg:', '%.2f' % test_tracker.all_tracker[0][1],
                  ' Frames/sec:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Test/RolloutScore', '%.2f' % self.test_trace[-1], '%.2f' % self.rollout_fits_trace[-1],
                  'Ep_len', '%.2f' % self.ep_len,
                  '#Footsteps', '%.2f' % self.num_footsteps,
                  'R1_Reward', '%.2f' % self.r1_reward,
                  'savetag', self.args.savetag)

            if gen % 5 == 0:
                print()
                print('Entropy', utils.pprint(math.exp(self.algo.entropy['mean'])),
                      'Next_Entropy', utils.pprint(math.exp(self.algo.next_entropy['mean'])),
                      'Q_Loss', utils.pprint(self.algo.critic_loss['mean']),
                      'Q', utils.pprint(self.algo.policy_q['mean']),
                      'Next_Q', utils.pprint(self.algo.next_q['mean']))
                print()

            if self.total_frames > frame_limit:
                break
class Agent:
    """Learner object encapsulating a local learner.

    Holds an evolutionary population (via SSNE), a policy-gradient algorithm
    (MATD3/MultiTD3/TD3/SAC depending on args), a shared rollout actor
    template for multiprocessing, and the replay buffer(s).

    Parameters:
        args: Parameter object; fields read here include popn_size, ps,
            algo_name, state_dim, action_dim, hidden_size, actor_lr,
            critic_lr, gamma, tau, savetag, aux_save, actualize, use_gpu,
            init_w, buffer_size, filter_c, and (for 'trunk') config.num_agents
        id (int): Identifier for this learner, forwarded to the PG algorithm
    """

    def __init__(self, args, id):
        self.args = args
        self.id = id

        ### Initalize neuroevolution module ###
        self.evolver = SSNE(self.args)

        ######## Initialize population (held in a multiprocessing Manager list
        ######## so rollout worker processes can read it)
        self.manager = Manager()
        self.popn = self.manager.list()
        for _ in range(args.popn_size):
            if args.ps == 'trunk':
                # One multi-headed actor shared across all agents
                self.popn.append(
                    MultiHeadActor(args.state_dim, args.action_dim,
                                   args.hidden_size, args.config.num_agents))
            else:
                if args.algo_name == 'TD3':
                    self.popn.append(
                        Actor(args.state_dim, args.action_dim,
                              args.hidden_size,
                              policy_type='DeterministicPolicy'))
                else:
                    self.popn.append(
                        Actor(args.state_dim, args.action_dim,
                              args.hidden_size,
                              policy_type='GaussianPolicy'))
            # Population nets are rollout-only: eval mode, no gradients taken
            self.popn[-1].eval()

        #### INITIALIZE PG ALGO #####
        if args.ps == 'trunk':
            if self.args.is_matd3 or args.is_maddpg:
                # MATD3 covers both: is_matd3 selects TD3-style updates,
                # otherwise DDPG-style
                algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
                self.algo = MATD3(id, algo_name, args.state_dim,
                                  args.action_dim, args.hidden_size,
                                  args.actor_lr, args.critic_lr, args.gamma,
                                  args.tau, args.savetag, args.aux_save,
                                  args.actualize, args.use_gpu,
                                  args.config.num_agents, args.init_w)
            else:
                self.algo = MultiTD3(id, args.algo_name, args.state_dim,
                                     args.action_dim, args.hidden_size,
                                     args.actor_lr, args.critic_lr, args.gamma,
                                     args.tau, args.savetag, args.aux_save,
                                     args.actualize, args.use_gpu,
                                     args.config.num_agents, args.init_w)
        else:
            if args.algo_name == 'TD3':
                self.algo = TD3(id, args.algo_name, args.state_dim,
                                args.action_dim, args.hidden_size,
                                args.actor_lr, args.critic_lr, args.gamma,
                                args.tau, args.savetag, args.aux_save,
                                args.actualize, args.use_gpu, args.init_w)
            else:
                self.algo = SAC(id, args.state_dim, args.action_dim,
                                args.hidden_size, args.gamma, args.critic_lr,
                                args.actor_lr, args.tau, args.alpha,
                                args.target_update_interval, args.savetag,
                                args.aux_save, args.actualize, args.use_gpu)

        #### Rollout Actor is a template used for MP #####
        self.rollout_actor = self.manager.list()
        if args.ps == 'trunk':
            self.rollout_actor.append(
                MultiHeadActor(args.state_dim, args.action_dim,
                               args.hidden_size, args.config.num_agents))
        else:
            if args.algo_name == 'TD3':
                self.rollout_actor.append(
                    Actor(args.state_dim, args.action_dim, args.hidden_size,
                          policy_type='DeterministicPolicy'))
            else:
                self.rollout_actor.append(
                    Actor(args.state_dim, args.action_dim, args.hidden_size,
                          policy_type='GaussianPolicy'))

        # Initalize buffer: one per agent for 'trunk' parameter sharing,
        # otherwise a single buffer
        if args.ps == 'trunk':
            self.buffer = [
                Buffer(args.buffer_size, buffer_gpu=False,
                       filter_c=args.filter_c)
                for _ in range(args.config.num_agents)
            ]
        else:
            self.buffer = Buffer(args.buffer_size, buffer_gpu=False,
                                 filter_c=args.filter_c)

        # Agent metrics: one fitness list per population member
        self.fitnesses = [[] for _ in range(args.popn_size)]

        ### Best Policy HOF ####
        self.champ_ind = 0

    def update_parameters(self):
        """Run policy-gradient updates from the replay buffer(s).

        Number of gradient steps scales with gradperstep * pg_frames (frames
        collected since the last update). Returns early during the burn-in
        period (buffer smaller than 10 * batch_size).
        """
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': -1.0,
            'action_high': 1.0
        }

        if self.args.ps == 'trunk':
            for agent_id, buffer in enumerate(self.buffer):
                if self.args.is_matd3 or self.args.is_maddpg:
                    buffer = self.buffer[0]  # Hardcoded Hack for MADDPG
                buffer.referesh()
                if buffer.__len__() < 10 * self.args.batch_size:
                    buffer.pg_frames = 0
                    return  ### BURN_IN_PERIOD
                buffer.tensorify()
                for _ in range(int(self.args.gradperstep * buffer.pg_frames)):
                    s, ns, a, r, done, global_reward = buffer.sample(
                        self.args.batch_size,
                        pr_rew=self.args.priority_rate,
                        pr_global=self.args.priority_rate)
                    r *= self.args.reward_scaling
                    if self.args.use_gpu:
                        s = s.cuda()
                        ns = ns.cuda()
                        a = a.cuda()
                        r = r.cuda()
                        done = done.cuda()
                        global_reward = global_reward.cuda()
                    self.algo.update_parameters(s, ns, a, r, done,
                                                global_reward, agent_id, 1,
                                                **td3args)
                buffer.pg_frames = 0
        else:
            self.buffer.referesh()
            # NOTE(review): unlike the 'trunk' branch above, pg_frames is NOT
            # reset on this early return -- confirm that is intentional
            if self.buffer.__len__() < 10 * self.args.batch_size:
                return  ### BURN_IN_PERIOD
            self.buffer.tensorify()
            for _ in range(int(self.args.gradperstep * self.buffer.pg_frames)):
                s, ns, a, r, done, global_reward = self.buffer.sample(
                    self.args.batch_size,
                    pr_rew=self.args.priority_rate,
                    pr_global=self.args.priority_rate)
                r *= self.args.reward_scaling
                if self.args.use_gpu:
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()
                    global_reward = global_reward.cuda()
                self.algo.update_parameters(s, ns, a, r, done, global_reward,
                                            1, **td3args)
            self.buffer.pg_frames = 0  # Reset new frame counter to 0

    def evolve(self):
        """Run one generation of neuroevolution over the population.

        For the 'multipoint' scheme a sample of states is drawn from the
        buffer to guide crossover/mutation; 'standard' passes no states.
        Fitness metrics are reset afterwards either way.
        """
        ## One gen of evolution ###
        if self.args.popn_size > 1:  # If not no-evo
            if self.args.scheme == 'multipoint':
                # Make sure that the buffer has been refereshed and tensorified
                buffer_pointer = self.buffer[0] if self.args.ps == 'trunk' else self.buffer
                if buffer_pointer.__len__() < 1000:
                    buffer_pointer.tensorify()
                # Occasionally re-tensorify to pick up new frames
                if random.random() < 0.01:
                    buffer_pointer.tensorify()

                # Get sample of states from the buffer (capped at 1000)
                if buffer_pointer.__len__() < 1000:
                    sample_size = buffer_pointer.__len__()
                else:
                    sample_size = 1000

                # Guard: tensorified state tensor may lag the buffer length
                if sample_size == 1000 and len(buffer_pointer.sT) < 1000:
                    buffer_pointer.tensorify()

                states, _, _, _, _, _ = buffer_pointer.sample(sample_size,
                                                              pr_rew=0.0,
                                                              pr_global=0.0)
                states = states.cpu()
            elif self.args.scheme == 'standard':
                states = None
            else:
                sys.exit('Unknown Evo Scheme')

            # Net indices of nets that got evaluated this generation (meant
            # for asynchronous evolution workloads)
            net_inds = [i for i in range(len(self.popn))]  # Hack for a synchronous run

            # Evolve; migrate the PG rollout actor into the population only
            # when learner rollouts are active
            if self.args.rollout_size > 0:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds,
                                                     self.fitnesses,
                                                     [self.rollout_actor[0]],
                                                     states)
            else:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds,
                                                     self.fitnesses, [],
                                                     states)

        # Reset fitness metrics
        self.fitnesses = [[] for _ in range(self.args.popn_size)]

    def update_rollout_actor(self):
        """Copy the PG policy's weights into the shared rollout actor(s).

        The policy is moved to cpu for the copy (rollout workers hold cpu
        tensors) and moved back to gpu afterwards when use_gpu is set.
        """
        for actor in self.rollout_actor:
            self.algo.policy.cpu()
            mod.hard_update(actor, self.algo.policy)
            if self.args.use_gpu:
                self.algo.policy.cuda()
class Evaluator(object):
    """Evaluates a blue-agent policy by training a red adversary against it.

    Spawns rollout worker processes that share a single red actor
    (share_memory), fills a private replay buffer, trains the red learner,
    and then scores the blue agent against the trained red actor.

    Parameters:
        CERL_agent: Trained CERL agent whose args (dims, buffer/batch sizes,
            algo name, action bounds) configure this evaluator
        num_workers (int): Number of parallel rollout workers
        trainers (list): [blue agent, red model]; the red model is replaced
            with None when pomdp_adv is set so it is never used
        pomdp_adv (bool): Use the POMDP adversary instead of the TD3 red actor
    """

    def __init__(self, CERL_agent, num_workers, trainers, pomdp_adv=False):
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  # denote if blue uses drqn
        if self.pomdp_adv:
            self.trainers = [trainers[0], None]  # make sure the red model is never used
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        self.buffer = Buffer(BUFFER_SIZE, self.buffer_gpu)  # initialize own replay buffer
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]

        # This model is shared accross the workers via shared memory
        self.actual_red_worker = Actor(CERL_agent.args.state_dim,
                                       CERL_agent.args.action_dim, -1, 'dis')
        self.actual_red_worker.share_memory()
        self.td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': CERL_agent.args.action_low,
            'action_high': CERL_agent.args.action_high,
            'cerl_args': self.args
        }
        self.renew_learner()  # now we are not using new learner for each iteration

        # Every slot points at the SAME shared actor
        self.rollout_bucket = [self.actual_red_worker for i in range(num_workers)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 3, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.rollout_bucket, 'dummy_name',
                          None, 'dis', self.trainers, False, self.pomdp_adv))
            for id in range(num_workers)
        ]
        for worker in self.workers:
            worker.start()
        self.evo_flag = [True for _ in range(self.num_workers)]

    #def initialize(self, actor_in):  #use the given actor parameter to initialize the red actor
    #    utils.hard_update(self.actual_red_actor, actor_in)

    def renew_learner(self):
        """Create a new learning (red) agent with randomized initial parameters."""
        self.learner = Learner(-1, self.algo, self.state_dim, self.action_dim,
                               actor_lr=5e-5, critic_lr=1e-3, gamma=0.99,
                               tau=5e-3, init_w=True, **self.td3args)
        self.actual_red_actor = self.learner.algo.actor

    def collect_trajectory(self):
        """Run one parallel rollout per worker and return their fitnesses.

        Returns:
            list: One fitness value per worker (entry[1] of each result).
        """
        utils.hard_update(self.actual_red_worker, self.actual_red_actor)  # first snyc the actor

        # launch rollout_workers
        for id, actor in enumerate(self.rollout_bucket):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send((id, 0))  # second argument in send is dummy
                self.evo_flag[id] = False

        # wait for the rollout to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  # update replay buffer
        return all_fitness

    def train_red(self, training_iterations):
        """Alternate between trajectory collection and red parameter updates.

        Parameters:
            training_iterations (int): Number of collect/update cycles after
                the burn-in period (buffer >= 10 * batch_size) completes.
        """
        while self.buffer.__len__() < self.batch_size * 10:  ### BURN IN PERIOD
            self.collect_trajectory()
        for i in range(training_iterations):
            self.collect_trajectory()
            self.buffer.tensorify()  # Tensorify the buffer for fast sampling
            self.learner.update_parameters(self.buffer, self.buffer_gpu,
                                           self.batch_size, 2)  # 2 update steps

    def evaluate(self):
        """Train a red adversary against the blue policy, then score the blue.

        The replay buffer is cleared after training so the next evaluation
        starts fresh.

        Returns:
            tuple: Output of evaluate_fixed_agents (reward averages and the
            belief/true-type trace list).
        """
        self.train_red(TRAIN_ITERATION)
        self.clear_buffer()
        #self.renew_learner()
        return self.evaluate_fixed_agents(self.trainers[0], self.trainers[1],
                                          [self.actual_red_actor])  # calculate the mean and std of the evaluation metric

    def evaluate_fixed_agents(self, blue_dqn, red_model, red_actor_list,
                              num_iterations=25):
        """Evaluate fixed agents using random neutral and red opponents.

        Parameters:
            blue_dqn: Blue agent; must expose act(state, eps=...)
            red_model: Red model used when building the 'dis' environment
            red_actor_list (list): Candidate red actors; one is sampled per
                iteration (ignored when pomdp_adv is set)
            num_iterations (int): Number of evaluation episodes

        Returns:
            tuple: (average_reward, average_red_reward,
            average_actual_blue_reward, belief_and_true_type_list)
        """
        if self.algo == 'dis':
            # make env with blue and red policy agent inside; trainer if not
            # None: first is the shared DQN agent, second is the best red policy
            dis_env = make_self_play_env(seed=0, return_policy_agent=False,
                                         trainers=[blue_dqn, red_model])[0]
            env = EnvironmentWrapper('', self.algo, dis_env, 0)  # the "0" is the index for training blue agent
        elif self.algo == 'TD3_tennis':
            # NOTE(review): self.pop is not an attribute of Evaluator -- this
            # branch would raise AttributeError if ever taken; confirm intent.
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=True, pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0)
        else:
            raise Exception("only work for 'dis' envir?")
        average_reward = 0
        eps = 0
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0
        belief_and_true_type_list = []

        # BUGFIX: the original `assert len(red_actor_list) is not None` was
        # always true (len() returns an int, never None), so an empty list
        # slipped through to random.choice below. Assert non-emptiness.
        assert red_actor_list, "make sure to input a list of possible red"

        for it in range(num_iterations):
            belief_and_true_type = []
            if not self.pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                red_actor = random.choice(red_actor_list)
                env.set_TD3_actor(red_actor)
            fitness = 0.0  # here fitness if simplely reward
            state = env.reset()
            belief_and_true_type.append(env.belief_and_true_type())
            env.randomize_neu_adv()

            if self.pomdp_adv:
                env.try_set_pomdp_adv()  # try to set if opponent to pomdp adv if opponent is adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=self.drqn)
                belief_and_true_type.append(env.belief_and_true_type())
                if render_flag and self.args.render:
                    env.render()
                state = next_state
                fitness += reward

                if done:
                    # Accumulate per-side rewards only when the env reports them
                    average_red_reward += env.get_red_reward() if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward() if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward() is not None else 0
                    if render_flag:
                        env.env.close()
                    break

            belief_and_true_type_list.append(belief_and_true_type)
            average_reward += fitness

        average_reward /= num_iterations
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list

    def clear_buffer(self):
        """Reinitialize the replay buffer (drops all stored transitions)."""
        self.buffer.clear_buffer_data()

    def kill_processes(self):
        """Signal every rollout worker to terminate."""
        for id, actor in enumerate(self.rollout_bucket):
            self.evo_task_pipes[id][0].send(('TERMINATE', 0))  # second argument in send is dummy

    def __del__(self):
        self.kill_processes()