class CERL_Agent:
    """Main CERL class containing all methods for CERL

    Parameters:
        args (object): Parameter class with all the parameters
    """

    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)

        # MP TOOLS
        self.manager = Manager()

        # Genealogy tool
        self.genealogy = Genealogy()

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
            else:
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1)

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
            else:
                self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################

        # Evolutionary population rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False,
                                          self.data_bucket, self.pop, ENV_NAME, None, ALGO))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers: worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, self.task_pipes[id][1], self.result_pipes[id][0], True,
                                      self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO))
                        for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        # Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False,
                                           None, self.test_bucket, ENV_NAME, args.noise_std, ALGO))
                             for id in range(TEST_SIZE)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        # Meta-learning controller (resource distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)
        # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count': 0} for _ in range(len(self.portfolio))]
        # Track node statistics (each node is a learner) to compute UCB scores

        # Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
        """
        ################ START ROLLOUTS ##############

        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        # Sync all learner actors to the cpu (rollout) actors
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.batch_size * 10:  ### BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner
            threads = [threading.Thread(target=learner.update_parameters,
                                        args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size,
                                              int(self.gen_frames * self.args.gradperstep)))
                       for learner in self.portfolio]

            # Start threads
            for thread in threads: thread.start()

            # Join threads
            for thread in threads: thread.join()

            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(self.pop[champ_index].state_dict(),
                               self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score", '%.2f' % max(all_fitness))
        else:  # Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        if not ISOLATE_PG:
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket)
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        if gen % 1 == 0:
            self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
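# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the original file): the `ucb(...)` helper
# used above to reset the allocation is not shown in this section. Assuming
# each learner in the portfolio exposes `value` (mean fitness) and
# `visit_count` attributes, a UCB-style allocation could look roughly like the
# function below; the names and the greedy assignment are assumptions for
# illustration only, not the repository's implementation.
# --------------------------------------------------------------------------- #
import math
import random


def ucb_allocation_sketch(num_rollouts, portfolio, coefficient):
    """Assign every rollout worker to the learner with the highest UCB score."""
    total_visits = sum(max(learner.visit_count, 1) for learner in portfolio)
    scores = [learner.value + coefficient * math.sqrt(math.log(total_visits) / max(learner.visit_count, 1))
              for learner in portfolio]
    # Greedy assignment with random tie-breaking
    best = max(range(len(portfolio)), key=lambda i: (scores[i], random.random()))
    return [best for _ in range(num_rollouts)]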
class SAC_Discrete_Trainer:
    """Main trainer class for discrete-action SAC

    Parameters:
        args (object): Parameter class with all the parameters
    """

    def __init__(self, args, model_constructor, env_constructor):
        self.args = args

        # MP TOOLS
        self.manager = Manager()

        # Algo
        self.algo = SAC_Discrete(args, model_constructor, args.gamma)

        # #Save best policy
        # self.best_policy = model_constructor.make_model('actor')

        # Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        # Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        self.rollout_bucket.append(model_constructor.make_model('Gumbel_FF'))

        ############## MULTIPROCESSING TOOLS ###################

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                      self.data_bucket, self.rollout_bucket, env_constructor))
                        for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model('Gumbel_FF'))

        # Test workers
        self.test_task_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_result_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           None, self.test_bucket, env_constructor))
                             for id in range(env_constructor.dummy_env.test_size)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        # Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.test_trace = []
        self.rollout_fits_trace = []

        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0

    def forward_epoch(self, epoch, tracker):
        """Main training loop to do rollouts and policy gradients

        Parameters:
            epoch (int): Current epoch of training

        Returns:
            test_mean, test_std
        """
        ################ START ROLLOUTS ##############

        # Sync the learner actor to the cpu (rollout) actor
        self.algo.actor.cpu()
        utils.hard_update(self.rollout_bucket[0], self.algo.actor)
        utils.hard_update(self.test_bucket[0], self.algo.actor)
        self.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id in range(self.args.rollout_size):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(0)
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if epoch % 1 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  ### BURN IN PERIOD
            # self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling
            for _ in range(self.gen_frames):
                s, ns, a, r, done = self.replay_buffer.sample(self.args.batch_size)
                if torch.cuda.is_available():
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()

                r = r * self.args.reward_scaling
                self.algo.update_parameters(s, ns, a, r, done)

            self.gen_frames = 0

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                self.rollout_fits_trace.append(fitness)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score: self.best_score = fitness

                self.roll_flag[i] = True

            # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
            self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            eplens = []
            r1_reward = []
            num_footsteps = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)
            self.test_trace.append(test_mean)
            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))

            tracker.update([test_mean, self.r1_reward], self.total_frames)

            if self.r1_reward > self.best_score:
                self.best_score = self.r1_reward
                torch.save(self.test_bucket[0].state_dict(),
                           self.args.aux_folder + 'bestR1_' + self.args.savetag)
                print("Best R1 Policy saved with score", '%.2f' % self.r1_reward)
        else:
            test_mean, test_std = None, None

        if epoch % 20 == 0:
            # Save models
            torch.save(self.algo.actor.state_dict(), self.args.aux_folder + 'actor_' + self.args.savetag)
            torch.save(self.algo.critic.state_dict(), self.args.aux_folder + 'critic_' + self.args.savetag)
            print("Actor and Critic saved")

        return test_mean, test_std

    def train(self, frame_limit):
        # Define Tracker class to track scores
        test_tracker = utils.Tracker(self.args.savefolder,
                                     ['score_' + self.args.savetag, 'r1_' + self.args.savetag],
                                     '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            test_mean, test_std = self.forward_epoch(gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames,
                  'max_ever:', '%.2f' % self.best_score,
                  ' Avg:', '%.2f' % test_tracker.all_tracker[0][1],
                  ' Frames/sec:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Test/RolloutScore', ['%.2f' % i for i in self.test_trace[-1:]],
                  '%.2f' % self.rollout_fits_trace[-1],
                  'Ep_len', '%.2f' % self.ep_len,
                  '#Footsteps', '%.2f' % self.num_footsteps,
                  'R1_Reward', '%.2f' % self.r1_reward,
                  'savetag', self.args.savetag)

            if gen % 5 == 0:
                print()
                print('Entropy', self.algo.entropy['mean'],
                      'Next_Entropy', self.algo.next_entropy['mean'],
                      'Temp', self.algo.temp['mean'],
                      'Policy_Q', self.algo.policy_q['mean'],
                      'Critic_Loss', self.algo.critic_loss['mean'])
                print()

            if self.total_frames > frame_limit:
                break
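# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the original file): the 'Gumbel_FF' model
# built by model_constructor above is not shown in this section. A discrete
# rollout policy of this kind typically samples actions with a
# straight-through Gumbel-Softmax over the actor's logits, roughly as below.
# The tiny actor here is a stand-in assumption, not the repository's model.
# --------------------------------------------------------------------------- #
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyGumbelActorSketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, action_dim))

    def forward(self, state, tau=1.0):
        logits = self.net(state)
        # hard=True returns a one-hot sample while keeping gradients via the soft sample
        one_hot = F.gumbel_softmax(logits, tau=tau, hard=True)
        return one_hot.argmax(dim=-1), one_hot


# Usage sketch: actions, one_hot = TinyGumbelActorSketch(8, 4)(torch.randn(32, 8))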
class CERL_Agent:
    """Main CERL class containing all methods for CERL

    Parameters:
        args (object): Parameter class with all the parameters
    """

    def __init__(self, args):
        # need to initialize rollout_workers to have blue agent
        self.args = args
        self.evolver = SSNE(self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()
        self.mutate_algos = [Mutation_Add(self), Mutation_Delete(self), Mutation_Exchange(self)]  # store all the mutate algorithm objects

        # Genealogy tool
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # if SA_FLAG:
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid))
            elif ALGO == 'TD3':
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid, ALGO))  # use ALGO to distinguish different net architectures
            elif ALGO == 'dis' or ALGO == 'TD3_tennis':
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                assert False, "invalid algorithm type"

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)

        if ALGO == 'dis':
            self.average_policy = AverageActor(args.state_dim, args.action_dim, -2, ALGO,
                                               self.pop, self.replay_buffer, args.buffer_gpu,
                                               args.batch_size, iterations=10)
            self.average_policy.share_memory()

        self.best_policy.share_memory()  # added by macheng, share the best policy across processes (used as internal belief update models for blue)

        # now we assign the shared blue_trainer; we should train this agent such that the rollout workers are also up to date
        # should make sure that self.best_policy (emergent learner) is also shared
        if ALGO == 'dis' or ALGO == 'TD3_tennis':
            assert hasattr(args, "blue_trainer"), "must have blue_agent trainer to initialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        else:
            trainers = [args.blue_trainer, None] if ALGO == 'TD3_tennis' else []

        self.trainers = trainers
        self.blue_dqn = args.blue_trainer

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID)
        self.complement_portfolio = []  # complement of the portfolio; whatever is not in the portfolio is stored here
        self.total_rollout_bucket = self.manager.list()  # macheng: total_rollout_bucket holds the whole set of rollout models; rollout_bucket now dynamically resizes according to the portfolio, for SA
        self.rollout_bucket = self.total_rollout_bucket
        # self.rollout_bucket = self.manager.list()
        # print("rollout_bucket needs to be updated, main.py line 239")
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
            else:
                self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1, ALGO))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################

        # Evolutionary population rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, 0, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False,
                                          self.data_bucket, self.pop, ENV_NAME, None, ALGO, self.trainers))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers: worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 1, self.task_pipes[id][1], self.result_pipes[id][0], True,
                                      self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers))
                        for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1, ALGO))

        # Test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 2, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False,
                                           None, self.test_bucket, ENV_NAME, args.noise_std, ALGO, self.trainers))
                             for id in range(TEST_SIZE)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        # Meta-learning controller (resource distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)
        # self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count': 0} for _ in range(len(self.portfolio))]
        # Track node statistics (each node is a learner) to compute UCB scores

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    # trainers contains the blue_dqn to be trained, and the red model used for belief update;
    # red_actor is the actual red agent trained against, and id is the actual red agent id

    def _update_SA_temperature(self):
        self.T = max(self.T * self.decay_rate, self.T_min)

    def _get_accept_rate(self):
        if RANDOM_WALK:
            return 1.0
        else:
            if self.metrics[-1] > self.metrics[-2]:
                return 1.0
            else:
                return np.exp((self.metrics[-1] - self.metrics[-2]) / self.T)

    def _mutate(self):
        while True:
            mutate_algo_index = random.choice(range(3))
            if self._try_mutate(mutate_algo_index):
                return

    def _try_mutate(self, algo_index):
        # 0 for add, 1 for delete, 2 for exchange
        return self.mutate_algos[algo_index].try_mutate()

    def simulated_annealing(self, metric):  # take in the current metric
        self.metrics.append(metric)
        if self.last_portfolio:  # has last_portfolio
            accept_rate = self._get_accept_rate()  # based on self.metrics[-2:]
            self._update_SA_temperature()
            if np.random.random() > accept_rate:  # reject
                self.portfolio = self.last_portfolio
                self.complement_portfolio = self.last_complement_portfolio

        self.last_portfolio = copy.copy(self.portfolio)  # maintain a shallow copy as the last portfolio
        self.last_complement_portfolio = copy.copy(self.complement_portfolio)

        self._mutate()  # perturb the portfolio

        # update rollout_bucket size; only the first len(self.portfolio) rollout buckets are visible
        self.update_rollout_bucket()
        # update allocation, to be compatible with the current portfolio
        self.update_allocation()

    def update_rollout_bucket(self):
        self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)]

    def train_blue_dqn(self, trainers, env_name, gen, ALGO='dis', pomdp_adv=False):
        # in this method, rollout and training are done together; the opponent is sampled from the population
        NUM_EPISODE = 100  # train 100 episodes for the blue to converge to the new best response to red
        EPS_START = max(1.0 * 0.5 ** (gen - 10), 0.15) if gen >= 10 else 1.0  # initial epsilon
        EPS_END = 0.05
        EPS_DECAY = 0.995

        if ALGO == 'dis':  # make env with blue and red policy agent inside; assert trainers is not None
            dis_env = make_self_play_env(seed=np.random.choice(np.array(range(len(self.pop)))),
                                         return_policy_agent=False,
                                         trainers=trainers)[0]  # trainers, if not None: first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper(env_name, ALGO, dis_env, 0)  # the "0" is the index for training the blue agent
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            tennis_env = make_tennis_env.TennisEnvFactory(seed=np.random.choice(np.array(range(len(self.pop)))),
                                                          no_graphics=no_graphics, pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
        else:
            env = EnvironmentWrapper(env_name, ALGO)

        blue_dqn = trainers[0]
        average_reward = 0
        eps = EPS_START

        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0

        for it in range(NUM_EPISODE):
            if not pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                id = np.random.choice(np.array(range(len(self.pop))))
                red_actor = self.pop[id]
                env.set_TD3_actor(red_actor)

            fitness = 0.0  # here fitness is simply the reward
            total_frame = 0
            state = env.reset()
            env.randomize_neu_adv()

            if pomdp_adv:
                env.try_set_pomdp_adv()  # try to set the opponent to pomdp adv if the opponent is an adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done
                action = blue_dqn.act(state, eps=eps)
                # action = utils.to_numpy(action)
                next_state, reward, done, info = env.step(copy.deepcopy(action), use_actual_reward=DRQN)
                # after calling env.step, an evaluator initialized later does not work
                # should be something wrong with the internal red model?
                blue_dqn.step(state, action, reward, next_state, done)

                if render_flag and self.args.render:
                    env.render()
                # next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                state = next_state
                fitness += reward
                total_frame += 1

                # DONE FLAG IS received
                if done:
                    average_red_reward += env.get_red_reward() if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward() if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward() is not None else 0
                    if render_flag:
                        env.env.close()
                    break

            average_reward += fitness
            eps = max(EPS_END, EPS_DECAY * eps)

        if gen >= 10 and gen % 5 == 0:
            blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' + str(gen) + '.pth')

        average_reward /= NUM_EPISODE
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward

    def evaluate_training_fixed_blue(self):  # this evaluates against the training opponent (red pop)
        self.evaluator.pomdp_adv = False
        return self.evaluator.evaluate_fixed_agents(self.trainers[0], self.trainers[1], self.pop)

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training

        Returns:
            max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
        """
        ################ START ROLLOUTS ##############

        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync all learner actors to the cpu (rollout) actors
        # (update rollout parameters from the learner parameters so that the rollout workers are up to date)
        for i, learner in enumerate(self.portfolio):  # number of learners
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)  # rollout bucket is now synchronized with the learner to perform rollouts for learner actors
            if torch.cuda.is_available():
                learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):  # number of rollout_size
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send((learner_id, gen))  # allocation records which learner each bucket should run, so rollout_id indexes rollout_bucket
                self.roll_flag[rollout_id] = False

        # Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop
        if self.replay_buffer.__len__() > self.args.batch_size * 10:  ### BURN IN PERIOD
            self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner
            threads = [threading.Thread(target=learner.update_parameters,
                                        args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size,
                                              int(self.gen_frames * self.args.gradperstep)))
                       for learner in self.portfolio]  # macheng: do we want to train all the learners?

            # Start threads
            for thread in threads: thread.start()

            # Join threads
            for thread in threads: thread.join()

            # Now update average_policy
            # self.average_policy.cuda()
            if ALGO == 'dis':
                self.average_policy.update()  # update the average_policy parameters with supervised learning

            self.gen_frames = 0

        ######### Visualize Learner Critic Function #################
        # if self.replay_buffer.__len__() % 2500 == 0:
        #     visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50)  # arguments: Learner, env, N_GRID

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # ms: best policy is always up to date, so here the best learner is saved
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(self.pop[champ_index].state_dict(),
                               self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score", '%.2f' % max(all_fitness))
        else:  # Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  # seems pop_size and rollout_size must be 10, otherwise this will produce an error
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness,
                                   self.rollout_bucket)  # this method also copies learners into the evolver
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        if gen % 1 == 0:
            self.update_allocation()

        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid

    def update_allocation(self):
        self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

    def sim_and_eval_POMDP(self):
        self.evaluator = Evaluator(self, 5, self.trainers, pomdp_adv=True)  # evaluator must be created before train_dqn
        for gen in range(1000000):
            print('gen=', gen)
            blue_score, red_score, actual_blue_score = agent.train_blue_dqn(agent.trainers, ENV_NAME, gen,
                                                                            ALGO='dis', pomdp_adv=True)
            print('Env', ENV_NAME, 'Gen', gen,
                  ", Training average: Blue agent score: ", blue_score,
                  " Red score: ", red_score,
                  " Actual blue score: ", actual_blue_score)

            blue_score, red_score, actual_blue_score = self.evaluator.evaluate()
            print("Evaluation result: Blue agent score: ", blue_score,
                  " Red score: ", red_score,
                  " Actual blue score: ", actual_blue_score)
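# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the original file): the simulated-annealing
# portfolio search above keeps a mutated portfolio when the new metric is
# better, otherwise accepts it with probability exp((new - old) / T) while T
# decays toward T_min. The standalone functions below mirror that accept and
# cooling rule under those assumptions.
# --------------------------------------------------------------------------- #
import numpy as np


def sa_accept(new_metric, old_metric, temperature):
    """Return True if the perturbed portfolio should be accepted."""
    if new_metric > old_metric:
        return True
    return np.random.random() < np.exp((new_metric - old_metric) / temperature)


def decay_temperature(temperature, decay_rate=0.975, t_min=0.2):
    """Geometric cooling schedule clipped at t_min, matching the class defaults."""
    return max(temperature * decay_rate, t_min)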
class Agent:
    """Learner object encapsulating a local learner

    Parameters:
        algo_name (str): Algorithm Identifier
        state_dim (int): State size
        action_dim (int): Action size
        actor_lr (float): Actor learning rate
        critic_lr (float): Critic learning rate
        gamma (float): Discount rate
        tau (float): Target network sync rate
        init_w (bool): Use Kaiming normal to initialize?
        **td3args (**kwargs): arguments for the TD3 algo
    """

    def __init__(self, args, id):
        self.args = args
        self.id = id

        ### Initialize neuroevolution module ###
        self.evolver = SSNE(self.args)

        ######## Initialize population ########
        self.manager = Manager()
        self.popn = self.manager.list()
        for _ in range(args.popn_size):
            if args.ps == 'trunk':
                self.popn.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents))
            else:
                if args.algo_name == 'TD3':
                    self.popn.append(Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='DeterministicPolicy'))
                else:
                    self.popn.append(Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='GaussianPolicy'))
            self.popn[-1].eval()

        #### INITIALIZE PG ALGO #####
        if args.ps == 'trunk':
            if self.args.is_matd3 or args.is_maddpg:
                algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
                self.algo = MATD3(id, algo_name, args.state_dim, args.action_dim, args.hidden_size,
                                  args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag,
                                  args.aux_save, args.actualize, args.use_gpu, args.config.num_agents, args.init_w)
            else:
                self.algo = MultiTD3(id, args.algo_name, args.state_dim, args.action_dim, args.hidden_size,
                                     args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag,
                                     args.aux_save, args.actualize, args.use_gpu, args.config.num_agents, args.init_w)
        else:
            if args.algo_name == 'TD3':
                self.algo = TD3(id, args.algo_name, args.state_dim, args.action_dim, args.hidden_size,
                                args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag,
                                args.aux_save, args.actualize, args.use_gpu, args.init_w)
            else:
                self.algo = SAC(id, args.state_dim, args.action_dim, args.hidden_size, args.gamma,
                                args.critic_lr, args.actor_lr, args.tau, args.alpha, args.target_update_interval,
                                args.savetag, args.aux_save, args.actualize, args.use_gpu)

        #### Rollout Actor is a template used for MP #####
        self.rollout_actor = self.manager.list()
        if args.ps == 'trunk':
            self.rollout_actor.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents))
        else:
            if args.algo_name == 'TD3':
                self.rollout_actor.append(Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='DeterministicPolicy'))
            else:
                self.rollout_actor.append(Actor(args.state_dim, args.action_dim, args.hidden_size, policy_type='GaussianPolicy'))

        # Initialize buffer
        if args.ps == 'trunk':
            self.buffer = [Buffer(args.buffer_size, buffer_gpu=False, filter_c=args.filter_c)
                           for _ in range(args.config.num_agents)]
        else:
            self.buffer = Buffer(args.buffer_size, buffer_gpu=False, filter_c=args.filter_c)

        # Agent metrics
        self.fitnesses = [[] for _ in range(args.popn_size)]

        ### Best Policy HOF ####
        self.champ_ind = 0

    def update_parameters(self):
        td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2,
                   'action_low': -1.0, 'action_high': 1.0}

        if self.args.ps == 'trunk':
            for agent_id, buffer in enumerate(self.buffer):
                if self.args.is_matd3 or self.args.is_maddpg:
                    buffer = self.buffer[0]  # Hardcoded hack for MADDPG
                buffer.referesh()
                if buffer.__len__() < 10 * self.args.batch_size:
                    buffer.pg_frames = 0
                    return  ### BURN_IN_PERIOD
                buffer.tensorify()

                for _ in range(int(self.args.gradperstep * buffer.pg_frames)):
                    s, ns, a, r, done, global_reward = buffer.sample(self.args.batch_size,
                                                                     pr_rew=self.args.priority_rate,
                                                                     pr_global=self.args.priority_rate)
                    r *= self.args.reward_scaling
                    if self.args.use_gpu:
                        s = s.cuda()
                        ns = ns.cuda()
                        a = a.cuda()
                        r = r.cuda()
                        done = done.cuda()
                        global_reward = global_reward.cuda()
                    self.algo.update_parameters(s, ns, a, r, done, global_reward, agent_id, 1, **td3args)

                buffer.pg_frames = 0

        else:
            self.buffer.referesh()
            if self.buffer.__len__() < 10 * self.args.batch_size:
                return  ### BURN_IN_PERIOD
            self.buffer.tensorify()

            for _ in range(int(self.args.gradperstep * self.buffer.pg_frames)):
                s, ns, a, r, done, global_reward = self.buffer.sample(self.args.batch_size,
                                                                      pr_rew=self.args.priority_rate,
                                                                      pr_global=self.args.priority_rate)
                r *= self.args.reward_scaling
                if self.args.use_gpu:
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()
                    global_reward = global_reward.cuda()
                self.algo.update_parameters(s, ns, a, r, done, global_reward, 1, **td3args)

            self.buffer.pg_frames = 0  # Reset new frame counter to 0

    def evolve(self):
        ### One gen of evolution ###
        if self.args.popn_size > 1:  # If not no-evo

            if self.args.scheme == 'multipoint':
                # Make sure that the buffer has been refreshed and tensorified
                buffer_pointer = self.buffer[0] if self.args.ps == 'trunk' else self.buffer
                if buffer_pointer.__len__() < 1000:
                    buffer_pointer.tensorify()
                if random.random() < 0.01:
                    buffer_pointer.tensorify()

                # Get a sample of states from the buffer
                if buffer_pointer.__len__() < 1000:
                    sample_size = buffer_pointer.__len__()
                else:
                    sample_size = 1000

                if sample_size == 1000 and len(buffer_pointer.sT) < 1000:
                    buffer_pointer.tensorify()

                states, _, _, _, _, _ = buffer_pointer.sample(sample_size, pr_rew=0.0, pr_global=0.0)
                states = states.cpu()

            elif self.args.scheme == 'standard':
                states = None

            else:
                sys.exit('Unknown Evo Scheme')

            # Net indices of nets that got evaluated this generation (meant for asynchronous evolution workloads)
            net_inds = [i for i in range(len(self.popn))]  # Hack for a synchronous run

            # Evolve
            if self.args.rollout_size > 0:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds, self.fitnesses,
                                                     [self.rollout_actor[0]], states)
            else:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds, self.fitnesses, [], states)

        # Reset fitness metrics
        self.fitnesses = [[] for _ in range(self.args.popn_size)]

    def update_rollout_actor(self):
        for actor in self.rollout_actor:
            self.algo.policy.cpu()
            mod.hard_update(actor, self.algo.policy)
            if self.args.use_gpu:
                self.algo.policy.cuda()
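# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the original file): `mod.hard_update` /
# `utils.hard_update`, used throughout these trainers to sync rollout actors,
# is not shown in this section. A hard update is typically a
# parameter-for-parameter copy from a source network into a target network,
# as in this minimal version (an assumption about the helper, shown for
# clarity).
# --------------------------------------------------------------------------- #
import torch


def hard_update_sketch(target: torch.nn.Module, source: torch.nn.Module):
    """Copy every parameter of `source` into `target` in place."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)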
class EGRL_Trainer:
    """Main EGRL trainer class

    Parameters:
        args (object): Parameter class with all the parameters
    """

    def __init__(self, args, model_constructor, env_constructor, observation_space, action_space,
                 env, state_template, test_envs, platform):
        self.args = args
        model_constructor.state_dim += 2
        self.platform = platform
        self.policy_string = self.compute_policy_type()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if self.args.gpu else torch.device('cpu')

        # Evolution
        dram_action = torch.ones((len(state_template.x), 2)) + 1
        state_template.x = torch.cat([state_template.x, dram_action], axis=1)
        self.evolver = MixedSSNE(self.args, state_template)  # GA(self.args) if args.boltzman else SSNE(self.args)
        self.env_constructor = env_constructor

        self.test_tracker = utils.Tracker(self.args.plot_folder,
                                          ['score_' + self.args.savetag, 'speedup_' + self.args.savetag],
                                          '.csv')  # Tracker class to log progress
        self.time_tracker = utils.Tracker(self.args.plot_folder,
                                          ['timed_score_' + self.args.savetag, 'timed_speedup_' + self.args.savetag],
                                          '.csv')
        self.champ_tracker = utils.Tracker(self.args.plot_folder,
                                           ['champ_score_' + self.args.savetag, 'champ_speedup_' + self.args.savetag],
                                           '.csv')
        self.pg_tracker = utils.Tracker(self.args.plot_folder,
                                        ['pg_noisy_speedup_' + self.args.savetag, 'pg_clean_speedup_' + self.args.savetag],
                                        '.csv')
        self.migration_tracker = utils.Tracker(self.args.plot_folder,
                                               ['selection_rate_' + self.args.savetag, 'elite_rate_' + self.args.savetag],
                                               '.csv')

        # Generalization Trackers
        self.r50_tracker = utils.Tracker(self.args.plot_folder,
                                         ['r50_score_' + self.args.savetag, 'r50_speedup_' + self.args.savetag], '.csv')
        self.r101_tracker = utils.Tracker(self.args.plot_folder,
                                          ['r101_score_' + self.args.savetag, 'r101_speedup_' + self.args.savetag], '.csv')
        self.bert_tracker = utils.Tracker(self.args.plot_folder,
                                          ['bert_score_' + self.args.savetag, 'bert_speedup_' + self.args.savetag], '.csv')
        self.r50_frames_tracker = utils.Tracker(self.args.plot_folder,
                                                ['r50_score_' + self.args.savetag, 'r50_speedup_' + self.args.savetag], '.csv')
        self.r101_frames_tracker = utils.Tracker(self.args.plot_folder,
                                                 ['r101_score_' + self.args.savetag, 'r101_speedup_' + self.args.savetag], '.csv')
        self.bert_frames_tracker = utils.Tracker(self.args.plot_folder,
                                                 ['bert_score_' + self.args.savetag, 'bert_speedup_' + self.args.savetag], '.csv')

        # Genealogy tool
        self.genealogy = Genealogy()

        self.env = env
        self.test_envs = test_envs

        if self.args.use_mp:
            # MP TOOLS
            self.manager = Manager()
            # Initialize Mixed Population
            self.population = self.manager.list()
        else:
            self.population = []

        boltzman_count = int(args.pop_size * args.ratio)
        rest = args.pop_size - boltzman_count
        for _ in range(boltzman_count):
            self.population.append(BoltzmannChromosome(model_constructor.num_nodes, model_constructor.action_dim))
        for _ in range(rest):
            self.population.append(model_constructor.make_model(self.policy_string))
            self.population[-1].eval()

        # Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        # Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size, state_template, action_space,
                                    args.aux_folder + args.savetag)
        self.data_bucket = self.replay_buffer.tuples

        # Initialize portfolio of learners
        self.portfolio = []
        if args.rollout_size > 0:
            self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy,
                                                  args.portfolio_id, model_constructor)

        # Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list() if self.args.use_mp else []
        for _ in range(len(self.portfolio)):
            self.rollout_bucket.append(model_constructor.make_model(self.policy_string))

        if self.args.use_mp:
            ############## MULTIPROCESSING TOOLS ###################

            # Evolutionary population rollout workers
            data_bucket = self.data_bucket if args.rollout_size > 0 else None  # If strictly evo - don't store data
            self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
            self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
            self.evo_workers = [Process(target=rollout_worker,
                                        args=(id, 'evo', self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                              data_bucket, self.population, env_constructor))
                                for id in range(args.pop_size)]
            for worker in self.evo_workers: worker.start()

            # Learner rollout workers
            self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
            self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
            self.workers = [Process(target=rollout_worker,
                                    args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                          data_bucket, self.rollout_bucket, env_constructor))
                            for id in range(args.rollout_size)]
            for worker in self.workers: worker.start()

            self.roll_flag = [True for _ in range(args.rollout_size)]
            self.evo_flag = [True for _ in range(args.pop_size)]

        # Meta-learning controller (resource distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)

        # Trackers
        self.best_score = -float('inf')
        self.gen_frames = 0
        self.total_frames = 0
        self.best_speedup = -float('inf')
        self.champ_type = None

    def checkpoint(self):
        utils.pickle_obj(self.args.ckpt_folder + 'test_tracker', self.test_tracker)
        utils.pickle_obj(self.args.ckpt_folder + 'time_tracker', self.time_tracker)
        utils.pickle_obj(self.args.ckpt_folder + 'champ_tracker', self.champ_tracker)

        for i in range(len(self.population)):
            net = self.population[i]
            if net.model_type == 'BoltzmanChromosome':
                utils.pickle_obj(self.args.ckpt_folder + 'Boltzman/' + str(i), net)
            else:
                torch.save(net.state_dict(), self.args.ckpt_folder + 'Gumbel/' + str(i))
            self.population[i] = net

    def load_checkpoint(self):
        # Try to load trackers
        try:
            self.test_tracker = utils.unpickle_obj(self.args.ckpt_folder + 'test_tracker')
            self.time_tracker = utils.unpickle_obj(self.args.ckpt_folder + 'time_tracker')
            self.champ_tracker = utils.unpickle_obj(self.args.ckpt_folder + 'champ_tracker')
        except:
            None

        gumbel_template = False
        for i in range(len(self.population)):
            if self.population[i].model_type == 'GumbelPolicy':
                gumbel_template = self.population[i]
                break

        boltzman_nets = os.listdir(self.args.ckpt_folder + 'Boltzman/')
        gumbel_nets = os.listdir(self.args.ckpt_folder + 'Gumbel/')
        print('Boltzman seeds', boltzman_nets, 'Gumbel seeds', gumbel_nets)

        gumbel_models = []
        boltzman_models = []
        for fname in boltzman_nets:
            try:
                net = utils.unpickle_obj(self.args.ckpt_folder + 'Boltzman/' + fname)
                boltzman_models.append(net)
            except:
                print('Failed to load', self.args.ckpt_folder + 'Boltzman/' + fname)

        for fname in gumbel_nets:
            try:
                model_template = copy.deepcopy(gumbel_template)
                model_template.load_state_dict(torch.load(self.args.ckpt_folder + 'Gumbel/' + fname))
                model_template.eval()
                gumbel_models.append(model_template)
            except:
                print('Failed to load', self.args.ckpt_folder + 'Gumbel/' + fname)

        for i in range(len(self.population)):
            net = self.population[i]
            if net.model_type == 'GumbelPolicy' and len(gumbel_models) >= 1:
                seed_model = gumbel_models.pop()
                utils.hard_update(net, seed_model)
            elif net.model_type == 'BoltzmanChromosome' and len(boltzman_models) >= 1:
                seed_model = boltzman_models.pop()
                net = seed_model
            self.population[i] = net

        print()
        print('Checkpoint Loading Phase Completed')
        print()

    def forward_generation(self, gen, time_start):

        ################ START ROLLOUTS ##############
        # Start Evolution rollouts
        if self.args.pop_size >= 1 and self.args.use_mp:
            for id, actor in enumerate(self.population):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        # If Policy Gradient
        if self.args.rollout_size > 0:
            # Sync all learner actors to the cpu (rollout) actors
            for i, learner in enumerate(self.portfolio):
                learner.algo.actor.cpu()
                utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
                learner.algo.actor.to(self.device)

            # Start Learner rollouts
            if self.args.use_mp:
                for rollout_id, learner_id in enumerate(self.allocation):
                    if self.roll_flag[rollout_id]:
                        self.task_pipes[rollout_id][0].send(learner_id)
                        self.roll_flag[rollout_id] = False

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start and not self.args.random_baseline:  ### BURN IN PERIOD
            print('INSIDE GRAD DESCENT')
            for learner in self.portfolio:
                learner.update_parameters(self.replay_buffer, self.args.batch_size,
                                          int(self.gen_frames * self.args.gradperstep))
            self.gen_frames = 0
        else:
            print('BURN IN PERIOD')

        gen_best = -float('inf')
        gen_best_speedup = -float("inf")
        gen_champ = None

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if self.args.pop_size >= 1:
            for i in range(self.args.pop_size):
                if self.args.use_mp:
                    entry = self.evo_result_pipes[i][1].recv()
                else:
                    entry = rollout_function(i, 'evo', self.population[i], self.env,
                                             store_data=self.args.rollout_size > 0)

                self.gen_frames += entry[2]
                self.total_frames += entry[2]
                speedup = entry[3][0]
                score = entry[1]

                net = self.population[entry[0]]
                net.fitness_stats['speedup'] = speedup
                net.fitness_stats['score'] = score
                net.fitness_stats['shaped'][:] = entry[5]
                self.population[entry[0]] = net

                self.test_tracker.update([score, speedup], self.total_frames)
                self.time_tracker.update([score, speedup], time.time() - time_start)

                if speedup > self.best_speedup:
                    self.best_speedup = speedup

                if score > gen_best:
                    gen_best = score
                    gen_champ = self.population[i]

                if speedup > gen_best_speedup:
                    gen_best_speedup = speedup

                if score > self.best_score:
                    self.best_score = score
                    champ_index = i
                    self.champ_type = net.model_type
                    try:
                        torch.save(self.population[champ_index].state_dict(),
                                   self.args.models_folder + 'bestChamp_' + self.args.savetag)
                    except:
                        None  # TODO
                    print("Best Evo Champ saved with score", '%.2f' % score)

                if self.args.rollout_size > 0:
                    self.replay_buffer.add(entry[4])

                self.evo_flag[i] = True

            try:
                torch.save(gen_champ.state_dict(),
                           self.args.models_folder + 'genChamp_' + str(gen) + '_speedup_' +
                           str(gen_best_speedup) + '_' + self.args.savetag)
            except:
                None

            ############################# GENERALIZATION EXPERIMENTS ########################
            _, resnet50_score, _, resnet50_speedup, _, _ = rollout_function(0, 'evo', gen_champ,
                                                                            self.test_envs[0], store_data=False)
            _, resnet101_score, _, resnet101_speedup, _, _ = rollout_function(0, 'evo', gen_champ,
                                                                              self.test_envs[1], store_data=False)
            resnet50_speedup = resnet50_speedup[0]
            resnet101_speedup = resnet101_speedup[0]
            self.r50_tracker.update([resnet50_score, resnet50_speedup], gen)
            self.r101_tracker.update([resnet101_score, resnet101_speedup], gen)
            self.r50_frames_tracker.update([resnet50_score, resnet50_speedup], self.total_frames)
            self.r101_frames_tracker.update([resnet101_score, resnet101_speedup], self.total_frames)

            bert_speedup, bert_score = None, None
            if self.platform != 'wpa':
                _, bert_score, _, bert_speedup, _, _ = rollout_function(0, 'evo', gen_champ,
                                                                        self.test_envs[2], store_data=False)
                bert_speedup = bert_speedup[0]
                self.bert_tracker.update([bert_score, bert_speedup], gen)
                self.bert_frames_tracker.update([bert_score, bert_speedup], self.total_frames)
            ############################# GENERALIZATION EXPERIMENTS ########################

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):

                # NOISY PG
                if self.args.use_mp:
                    entry = self.result_pipes[i][1].recv()
                else:
                    entry = rollout_function(i, 'pg', self.rollout_bucket[i], self.env, store_data=True)

                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                speedup = entry[3][0]
                self.portfolio[learner_id].update_stats(fitness, num_frames)
                self.replay_buffer.add(entry[4])

                self.test_tracker.update([fitness, speedup], self.total_frames)
                self.time_tracker.update([fitness, speedup], time.time() - time_start)
                gen_best = max(fitness, gen_best)
                self.best_speedup = max(speedup, self.best_speedup)
                gen_best_speedup = max(speedup, gen_best_speedup)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score:
                    self.best_score = fitness
                    torch.save(self.rollout_bucket[i].state_dict(),
                               self.args.models_folder + 'noisy_bestPG_' + str(speedup) + '_' + self.args.savetag)
                    print("Best Rollout Champ saved with score", '%.2f' % fitness)

                noisy_speedup = speedup

                # Clean PG Measurement
                entry = rollout_function(i, 'evo', self.rollout_bucket[i], self.env, store_data=True)
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                speedup = entry[3][0]
                self.portfolio[learner_id].update_stats(fitness, num_frames)
                self.replay_buffer.add(entry[4])

                self.test_tracker.update([fitness, speedup], self.total_frames)
                self.time_tracker.update([fitness, speedup], time.time() - time_start)
                gen_best = max(fitness, gen_best)
                self.best_speedup = max(speedup, self.best_speedup)
                gen_best_speedup = max(speedup, gen_best_speedup)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score:
                    self.best_score = fitness
                    torch.save(self.rollout_bucket[i].state_dict(),
                               self.args.models_folder + 'clean_bestPG_' + str(speedup) + '_' + self.args.savetag)
                    print("Best Clean Evo Champ saved with score", '%.2f' % fitness)

                self.pg_tracker.update([noisy_speedup, speedup], self.total_frames)
                self.roll_flag[i] = True

        self.champ_tracker.update([gen_best, gen_best_speedup], self.total_frames)

        # NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size >= 1 and not self.args.random_baseline:
            if gen % 1 == 0:
                self.population = self.evolver.epoch(self.population, self.rollout_bucket)
            else:
                self.population = self.evolver.epoch(self.population, [])

            if self.evolver.selection_stats['total'] > 0:
                selection_rate = (1.0 * self.evolver.selection_stats['selected'] +
                                  self.evolver.selection_stats['elite']) / self.evolver.selection_stats['total']
                elite_rate = (1.0 * self.evolver.selection_stats['elite']) / self.evolver.selection_stats['total']
                self.migration_tracker.update([selection_rate, elite_rate], self.total_frames)

        if gen % 1 == 0:
            self.checkpoint()

        return gen_best

    def train(self, frame_limit):

        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            gen_best = self.forward_generation(gen, time_start)

            print()
            print('Gen/Frames', gen, '/', self.total_frames,
                  'Gen_Score', '%.2f' % gen_best,
                  'Best_Score', '%.2f' % self.best_score,
                  ' Speedup', '%.2f' % self.best_speedup,
                  ' Frames/sec:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                  'Buffer', self.replay_buffer.__len__(),
                  'Savetag', self.args.savetag)

            for net in self.population:
                print(net.model_type, net.fitness_stats)
                if net.model_type == 'BoltzmanChromosome':
                    print(net.temperature_stats)
                print()
            print()

            try:
                print('Initial Ratio', self.args.ratio, 'Current Ratio', self.evolver.ratio,
                      'Champion Type', self.champ_type)
            except:
                None

            if gen % 5 == 0:
                print('Learner Fitness', [utils.pprint(learner.value) for learner in self.portfolio])

            if self.total_frames > frame_limit:
                break

        ### Kill all processes
        try:
            for p in self.task_pipes: p[0].send('TERMINATE')
            for p in self.test_task_pipes: p[0].send('TERMINATE')
            for p in self.evo_task_pipes: p[0].send('TERMINATE')
        except:
            None

    def compute_policy_type(self):
        if self.args.algo == 'ddqn':
            return 'DDQN'
        elif self.args.algo == 'sac':
            return 'Gaussian_FF'
        elif self.args.algo == 'td3':
            return 'Deterministic_FF'
        elif self.args.algo == 'sac_discrete':
            return 'GumbelPolicy'
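# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the original file): the BoltzmannChromosome
# individuals mixed into the population above are not defined in this section.
# A Boltzmann (softmax-with-temperature) chromosome typically stores per-node
# action preferences and samples actions from a temperature-scaled softmax,
# roughly as below. The class is an assumption for illustration only, not the
# repository's implementation.
# --------------------------------------------------------------------------- #
import numpy as np


class BoltzmannChromosomeSketch:
    def __init__(self, num_nodes, action_dim, temperature=1.0):
        self.logits = np.zeros((num_nodes, action_dim))
        self.temperature = temperature

    def sample_actions(self):
        scaled = self.logits / self.temperature
        scaled -= scaled.max(axis=1, keepdims=True)  # numerical stability
        probs = np.exp(scaled)
        probs /= probs.sum(axis=1, keepdims=True)
        return np.array([np.random.choice(len(p), p=p) for p in probs])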
class ERL_Trainer:

    def __init__(self, args, model_constructor, env_constructor):

        self.args = args
        self.policy_string = 'CategoricalPolicy' if env_constructor.is_discrete else 'Gaussian_FF'
        self.manager = Manager()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        #Evolution
        self.evolver = SSNE(self.args)

        #Initialize population
        self.population = self.manager.list()
        for _ in range(args.pop_size):
            self.population.append(model_constructor.make_model(self.policy_string))

        #Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        #PG Learner
        if env_constructor.is_discrete:
            from algos.ddqn import DDQN
            self.learner = DDQN(args, model_constructor)
        else:
            from algos.sac import SAC
            self.learner = SAC(args, model_constructor)

        #Replay Buffer
        self.replay_buffer = Buffer(args.buffer_size)

        #Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        for _ in range(args.rollout_size):
            self.rollout_bucket.append(model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################
        #Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, 'evo', self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                          args.rollout_size > 0, self.population, env_constructor))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers: worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                      True, self.rollout_bucket, env_constructor))
                        for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model(self.policy_string))

        #Test workers
        self.test_task_pipes = [Pipe() for _ in range(args.num_test)]
        self.test_result_pipes = [Pipe() for _ in range(args.num_test)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           False, self.test_bucket, env_constructor))
                             for id in range(args.num_test)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        #Trackers
        self.best_score = -float('inf')
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None

    def forward_generation(self, gen, tracker):

        gen_max = -float('inf')

        #Start Evolution rollouts
        if self.args.pop_size > 1:
            for id, actor in enumerate(self.population):
                self.evo_task_pipes[id][0].send(id)

        #Sync all learners actor to cpu (rollout) actor and start their rollout
        self.learner.actor.cpu()
        for rollout_id in range(len(self.rollout_bucket)):
            utils.hard_update(self.rollout_bucket[rollout_id], self.learner.actor)
            self.task_pipes[rollout_id][0].send(0)
        self.learner.actor.to(device=self.device)

        #Start Test rollouts
        if gen % self.args.test_frequency == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  ###BURN IN PERIOD
            for _ in range(int(self.gen_frames * self.args.gradperstep)):
                s, ns, a, r, done = self.replay_buffer.sample(self.args.batch_size)
                self.learner.update_parameters(s, ns, a, r, done)
            self.gen_frames = 0

        ########## JOIN ROLLOUTS FOR EVO POPULATION ############
        all_fitness = []; all_eplens = []
        if self.args.pop_size > 1:
            for i in range(self.args.pop_size):
                _, fitness, frames, trajectory = self.evo_result_pipes[i][1].recv()
                all_fitness.append(fitness); all_eplens.append(frames)
                self.gen_frames += frames; self.total_frames += frames
                self.replay_buffer.add(trajectory)
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)

        ########## JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        rollout_fitness = []; rollout_eplens = []
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                _, fitness, pg_frames, trajectory = self.result_pipes[i][1].recv()
                self.replay_buffer.add(trajectory)
                self.gen_frames += pg_frames; self.total_frames += pg_frames
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)
                rollout_fitness.append(fitness); rollout_eplens.append(pg_frames)

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ FIGURE OUT THE CHAMP POLICY AND SYNC IT TO TEST #############
        if self.args.pop_size > 1:
            champ_index = all_fitness.index(max(all_fitness))
            utils.hard_update(self.test_bucket[0], self.population[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.population[champ_index])
                torch.save(self.population[champ_index].state_dict(),
                           self.args.aux_folder + '_best' + self.args.savetag)
                print("Best policy saved with score", '%.2f' % max(all_fitness))
        else:  #If there is no population, champion is just the actor from policy gradient learner
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                _, fitness, _, _ = pipe[1].recv()
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)
                test_scores.append(fitness)
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores); test_std = np.std(test_scores)
            tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        #NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size > 1:
            self.evolver.epoch(gen, self.population, all_fitness, self.rollout_bucket)

        #Compute the champion's eplen
        champ_len = all_eplens[all_fitness.index(max(all_fitness))] if self.args.pop_size > 1 \
            else rollout_eplens[rollout_fitness.index(max(rollout_fitness))]

        return gen_max, champ_len, all_eplens, test_mean, test_std, rollout_fitness, rollout_eplens

    def train(self, frame_limit):
        # Define Tracker class to track scores
        test_tracker = utils.Tracker(self.args.savefolder, ['score_' + self.args.savetag], '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            max_fitness, champ_len, all_eplens, test_mean, test_std, rollout_fitness, rollout_eplens = \
                self.forward_generation(gen, test_tracker)
            if test_mean: self.args.writer.add_scalar('test_score', test_mean, gen)

            print('Gen/Frames:', gen, '/', self.total_frames,
                  ' Gen_max_score:', '%.2f' % max_fitness,
                  ' Champ_len', '%.2f' % champ_len,
                  ' Test_score u/std', utils.pprint(test_mean), utils.pprint(test_std),
                  ' Rollout_u/std:', utils.pprint(np.mean(np.array(rollout_fitness))),
                  utils.pprint(np.std(np.array(rollout_fitness))),
                  ' Rollout_mean_eplen:',
                  utils.pprint(sum(rollout_eplens) / len(rollout_eplens)) if rollout_eplens else None)

            if gen % 5 == 0:
                print('Best_score_ever:/', '%.2f' % self.best_score,
                      ' FPS:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                      'savetag', self.args.savetag)
                print()

            if self.total_frames > frame_limit:
                break

        ###Kill all processes
        try:
            for p in self.task_pipes: p[0].send('TERMINATE')
            for p in self.test_task_pipes: p[0].send('TERMINATE')
            for p in self.evo_task_pipes: p[0].send('TERMINATE')
        except:
            pass
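# The trainer above talks to its workers purely through Pipe handshakes: it sends an
# identity key on the task pipe and expects a 4-tuple (id, fitness, frames, trajectory)
# back on the result pipe, with the string 'TERMINATE' used as a poison pill at shutdown.
# The real rollout_worker is defined elsewhere in the repo; the sketch below is a
# hypothetical, simplified stand-in that only illustrates that protocol. The env API
# (env_constructor.make_env, reset/step) and the policy call (net.clean_action) are
# assumptions, not the repo's actual interfaces.
def rollout_worker_sketch(id, worker_type, task_pipe, result_pipe, store_transitions,
                          model_bucket, env_constructor):
    env = env_constructor.make_env()                       # hypothetical helper
    while True:
        key = task_pipe.recv()                             # block until the trainer assigns work
        if key == 'TERMINATE':
            break
        net = model_bucket[key]                            # evo workers get a population index, pg/test workers get 0
        fitness, frames, trajectory = 0.0, 0, []
        state, done = env.reset(), False
        while not done:
            action = net.clean_action(state)               # assumed policy interface
            next_state, reward, done, _ = env.step(action)
            if store_transitions:
                # same (s, ns, a, r, done) ordering the trainer samples from the buffer
                trajectory.append((state, next_state, action, reward, float(done)))
            state = next_state
            fitness += reward
            frames += 1
        result_pipe.send((id, fitness, frames, trajectory))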
class CERL_Trainer:
    """Main CERL class containing all methods for CERL

        Parameters:
        args (object): Parameter class with all the parameters
    """

    def __init__(self, args, model_constructor, env_constructor):
        self.args = args
        self.policy_string = self.compute_policy_type()

        #Evolution
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Genealogy tool
        self.genealogy = Genealogy()

        #Initialize population
        self.population = self.manager.list()
        seed = True
        for _ in range(args.pop_size):
            self.population.append(model_constructor.make_model(self.policy_string, seed=seed))
            seed = False

        #SEED
        #self.population[0].load_state_dict(torch.load('Results/Auxiliary/_bestcerl_td3_s2019_roll10_pop10_portfolio10'))

        #Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        #Turn off gradients and put in eval mode
        for actor in self.population:
            actor = actor.cpu()
            actor.eval()

        #Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        #Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy,
                                              args.portfolio_id, model_constructor)

        #Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            self.rollout_bucket.append(model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################
        #Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, 'evo', self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                          self.data_bucket, self.population, env_constructor))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers: worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                      self.data_bucket, self.rollout_bucket, env_constructor))
                        for id in range(args.rollout_size)]
        for worker in self.workers: worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model(self.policy_string))

        #Test workers
        self.test_task_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_result_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           None, self.test_bucket, env_constructor))
                             for id in range(env_constructor.dummy_env.test_size)]
        for worker in self.test_workers: worker.start()
        self.test_flag = False

        #Meta-learning controller (Resource Distribution)
        self.allocation = []  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  #Start uniformly (equal resources)

        #Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.best_r1_score = 0.0
        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0
        self.test_trace = []

    def checkpoint(self):
        utils.pickle_obj(self.args.aux_folder + self.args.algo + '_checkpoint_frames' + str(self.total_frames),
                         self.portfolio)

    def load_checkpoint(self, filename):
        self.portfolio = utils.unpickle_obj(filename)

    def forward_generation(self, gen, tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

            Parameters:
            gen (int): Current epoch of training

            Returns:
            max_fit, champ_len, all_eplens, test_mean, test_std
        """
        ################ START ROLLOUTS ##############
        #Start Evolution rollouts
        if self.args.pop_size > 1:
            for id, actor in enumerate(self.population):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        #Sync all learners' actors to the cpu (rollout) actors
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        #Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        #Start Test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes: pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  ###BURN IN PERIOD

            #Spin up threads for each learner
            threads = [threading.Thread(target=learner.update_parameters,
                                        args=(self.replay_buffer, self.args.batch_size,
                                              int(self.gen_frames * self.args.gradperstep)))
                       for learner in self.portfolio]

            #Start threads
            for thread in threads: thread.start()

            #Join threads
            for thread in threads: thread.join()

            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if self.args.pop_size > 1:
            all_fitness = []; all_net_ids = []; all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1]); all_net_ids.append(entry[0]); all_eplens.append(entry[2])
                        self.gen_frames += entry[2]; self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]; fitness = entry[1]; num_frames = entry[2]
                self.portfolio[learner_id].update_stats(fitness, num_frames)

                self.gen_frames += num_frames; self.total_frames += num_frames
                if fitness > self.best_score: self.best_score = fitness

                self.roll_flag[i] = True

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if self.args.pop_size > 1:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.population[champ_index])
        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []; eplens = []; r1_reward = []; num_footsteps = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores); test_std = np.std(test_scores)
            self.test_trace.append(test_mean)
            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))

            if self.r1_reward > self.best_r1_score:
                self.best_r1_score = self.r1_reward
                utils.hard_update(self.best_policy, self.test_bucket[0])
                torch.save(self.test_bucket[0].state_dict(),
                           self.args.aux_folder + '_bestR1_' + self.args.savetag)
                print("Best R2 policy saved with score", '%.2f' % self.r1_reward)

            if test_mean > self.best_score:
                self.best_score = test_mean
                utils.hard_update(self.best_policy, self.test_bucket[0])
                torch.save(self.test_bucket[0].state_dict(),
                           self.args.aux_folder + '_bestShaped' + self.args.savetag)
                print("Best Shaped policy saved with score", '%.2f' % test_mean)

            tracker.update([test_mean, self.r1_reward], self.total_frames)

        else:
            test_mean, test_std = None, None

        #Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        #NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size > 1:
            if self.args.scheme == 'multipoint':
                sample_size = self.args.batch_size if self.replay_buffer.__len__() >= self.args.batch_size \
                    else self.replay_buffer.__len__()
                states, _, _, _, _ = self.replay_buffer.sample(batch_size=sample_size)
            else:
                states = None
            self.evolver.epoch(self.population, all_net_ids, all_fitness, self.rollout_bucket, states)

        #META LEARNING - RESET ALLOCATION USING UCB
        if self.args.rollout_size > 0:
            self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

        #Metrics
        if self.args.pop_size > 1:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            #champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            all_fitness = [fitness]; max_fit = fitness; all_eplens = [num_frames]

        return max_fit, champ_len, all_eplens, test_mean, test_std

    def train(self, frame_limit):
        # Define Tracker class to track scores
        test_tracker = utils.Tracker(self.args.savefolder,
                                     ['score_' + self.args.savetag, 'r2_' + self.args.savetag],
                                     '.csv')  # Tracker class to log progress
        grad_temp = [str(i) + 'entropy_' + self.args.savetag for i in range(len(self.portfolio))] + \
                    [str(i) + 'policyQ_' + self.args.savetag for i in range(len(self.portfolio))]
        grad_tracker = utils.Tracker(self.args.aux_folder, grad_temp, '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            max_fitness, champ_len, all_eplens, test_mean, test_std = self.forward_generation(gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames,
                  ' Pop_max/max_ever:', '%.2f' % max_fitness, '/', '%.2f' % self.best_score,
                  ' Avg:', '%.2f' % test_tracker.all_tracker[0][1],
                  ' Frames/sec:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Champ_len', '%.2f' % champ_len,
                  ' Test_score u/std', utils.pprint(test_mean), utils.pprint(test_std),
                  'Ep_len', '%.2f' % self.ep_len,
                  '#Footsteps', '%.2f' % self.num_footsteps,
                  'R2_Reward', '%.2f' % self.r1_reward,
                  'savetag', self.args.savetag)

            grad_temp = [algo.algo.entropy['mean'] for algo in self.portfolio] + \
                        [algo.algo.policy_q['mean'] for algo in self.portfolio]
            grad_tracker.update(grad_temp, self.total_frames)

            if gen % 5 == 0:
                print('Learner Fitness', [utils.pprint(learner.value) for learner in self.portfolio],
                      'Sum_stats_resource_allocation', [learner.visit_count for learner in self.portfolio])
                try:
                    print('Entropy', ['%.2f' % algo.algo.entropy['mean'] for algo in self.portfolio],
                          'Next_Entropy', ['%.2f' % algo.algo.next_entropy['mean'] for algo in self.portfolio],
                          'Policy_Q', ['%.2f' % algo.algo.policy_q['mean'] for algo in self.portfolio],
                          'Critic_Loss', ['%.2f' % algo.algo.critic_loss['mean'] for algo in self.portfolio])
                    print()
                except:
                    pass

            if self.total_frames > frame_limit:
                break

        ###Kill all processes
        try:
            for p in self.task_pipes: p[0].send('TERMINATE')
            for p in self.test_task_pipes: p[0].send('TERMINATE')
            for p in self.evo_task_pipes: p[0].send('TERMINATE')
        except:
            pass

    def compute_policy_type(self):
        if self.args.algo == 'ddqn':
            return 'DDQN'
        elif self.args.algo == 'sac':
            return 'Gaussian_FF'
        elif self.args.algo == 'td3':
            return 'Deterministic_FF'
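# CERL's resource allocation is driven by an upper-confidence-bound (UCB) score per
# learner. The real ucb() helper lives elsewhere in the repo; the sketch below is an
# illustrative stand-in built only from the statistics CERL_Trainer already prints
# (learner.value and learner.visit_count) and from args.ucb_coefficient. It scores each
# learner as value + c * sqrt(log(total_visits) / visit_count) and then hands out the
# rollout workers by sampling proportionally to the shifted scores; the repo's actual
# helper may normalize or allocate differently.
import math
import random

def ucb_allocation_sketch(num_rollouts, portfolio, c):
    """Return a list of learner indices, one per rollout worker."""
    total_visits = sum(max(learner.visit_count, 1) for learner in portfolio)
    scores = [learner.value + c * math.sqrt(math.log(total_visits + 1) / max(learner.visit_count, 1))
              for learner in portfolio]
    # Shift scores to be strictly positive so they can serve as sampling weights
    low = min(scores)
    weights = [s - low + 1e-6 for s in scores]
    return random.choices(range(len(portfolio)), weights=weights, k=num_rollouts)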
class Evaluator(object):

    def __init__(self, CERL_agent, num_workers, trainers, pomdp_adv=False):
        #trainers: the first entry is the blue agent and the second is the red model
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  #denotes whether blue uses drqn
        if self.pomdp_adv:
            self.trainers = [trainers[0], None]  #make sure the red model is never used
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        self.buffer = Buffer(BUFFER_SIZE, self.buffer_gpu)  #initialize own replay buffer
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]
        self.actual_red_worker = Actor(CERL_agent.args.state_dim, CERL_agent.args.action_dim, -1, 'dis')  #this model is shared across the workers
        self.actual_red_worker.share_memory()
        self.td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2,
                        'action_low': CERL_agent.args.action_low, 'action_high': CERL_agent.args.action_high,
                        'cerl_args': self.args}
        self.renew_learner()  #currently we do not create a new learner for each iteration
        self.rollout_bucket = [self.actual_red_worker for i in range(num_workers)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 3, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                      False, self.data_bucket, self.rollout_bucket, 'dummy_name', None, 'dis',
                                      self.trainers, False, self.pomdp_adv))
                        for id in range(num_workers)]
        for worker in self.workers: worker.start()
        self.evo_flag = [True for _ in range(self.num_workers)]

    #def initialize(self, actor_in):  #use the given actor parameters to initialize the red actor
    #    utils.hard_update(self.actual_red_actor, actor_in)

    def renew_learner(self):
        #create a new learning agent with randomized initial parameters
        self.learner = Learner(-1, self.algo, self.state_dim, self.action_dim,
                               actor_lr=5e-5, critic_lr=1e-3, gamma=0.99, tau=5e-3,
                               init_w=True, **self.td3args)
        self.actual_red_actor = self.learner.algo.actor

    def collect_trajectory(self):
        utils.hard_update(self.actual_red_worker, self.actual_red_actor)  #first sync the actor

        #launch rollout_workers
        for id, actor in enumerate(self.rollout_bucket):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send((id, 0))  #second argument in send is a dummy
                self.evo_flag[id] = False

        #wait for the rollouts to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  #update replay buffer

        return all_fitness

    def train_red(self, training_iterations):
        #alternate between collect_trajectory and parameter updates
        while self.buffer.__len__() < self.batch_size * 10:  ###BURN IN PERIOD
            self.collect_trajectory()

        for i in range(training_iterations):
            self.collect_trajectory()
            self.buffer.tensorify()  # Tensorify the buffer for fast sampling
            self.learner.update_parameters(self.buffer, self.buffer_gpu, self.batch_size, 2)  #2 update steps

    def evaluate(self):
        #evaluate the quality of the blue agent's policy by training a red agent against it;
        #after evaluation, erase the replay buffer and renew the learner
        self.train_red(TRAIN_ITERATION)
        self.clear_buffer()
        #self.renew_learner()
        return self.evaluate_fixed_agents(self.trainers[0], self.trainers[1],
                                          [self.actual_red_actor])  #calculate the mean and std of the evaluation metric

    def evaluate_fixed_agents(self, blue_dqn, red_model, red_actor_list, num_iterations=25):
        #evaluate the performance of the given agents, using random neutral and red agents
        if self.algo == 'dis':
            # make env with the blue and red policy agents inside; if trainers is not None,
            # the first is the shared DQN agent and the second is the best red policy
            dis_env = make_self_play_env(seed=0, return_policy_agent=False, trainers=[blue_dqn, red_model])[0]
            env = EnvironmentWrapper('', self.algo, dis_env, 0)  # the "0" is the index for training the blue agent
        elif self.algo == 'TD3_tennis':
            # NOTE: self.pop is not defined on Evaluator; this seed selection likely needs fixing
            tennis_env = make_tennis_env.TennisEnvFactory(seed=np.random.choice(np.array(range(len(self.pop)))),
                                                          no_graphics=True, pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0)
        else:
            raise Exception("evaluate_fixed_agents only works for the 'dis' and 'TD3_tennis' environments")

        average_reward = 0
        eps = 0
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0
        belief_and_true_type_list = []
        assert len(red_actor_list) > 0, "make sure to input a non-empty list of possible red actors"

        for it in range(num_iterations):
            belief_and_true_type = []
            if not self.pomdp_adv:  # if pomdp_adv, make sure that the TD3 actor is never used
                red_actor = random.choice(red_actor_list)
                env.set_TD3_actor(red_actor)

            fitness = 0.0  # here fitness is simply the episode reward
            state = env.reset()
            belief_and_true_type.append(env.belief_and_true_type())
            env.randomize_neu_adv()

            if self.pomdp_adv:
                env.try_set_pomdp_adv()  # set the opponent to the pomdp adversary if the opponent is an adversary, else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # loop until done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(copy.deepcopy(action), use_actual_reward=self.drqn)
                belief_and_true_type.append(env.belief_and_true_type())
                if render_flag and self.args.render:
                    env.render()

                state = next_state
                fitness += reward

                if done:
                    average_red_reward += env.get_red_reward() if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward() if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward() is not None else 0
                    if render_flag:
                        env.env.close()
                    break

            belief_and_true_type_list.append(belief_and_true_type)
            average_reward += fitness

        average_reward /= num_iterations
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list

    def clear_buffer(self):
        self.buffer.clear_buffer_data()  #reinitialize replay buffer

    def kill_processes(self):
        for id, actor in enumerate(self.rollout_bucket):
            self.evo_task_pipes[id][0].send(('TERMINATE', 0))  #second argument in send is a dummy

    def __del__(self):
        self.kill_processes()
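# A hedged usage sketch of how an Evaluator is meant to be driven end to end: train a
# fresh red adversary against the frozen blue policy, collect the evaluation metrics,
# and tear the workers down. The cerl_agent / blue_dqn / red_model arguments are
# placeholders for objects built elsewhere in the repo; this is not the repo's actual
# entry point.
def evaluate_blue_policy_sketch(cerl_agent, blue_dqn, red_model, num_workers=4):
    evaluator = Evaluator(cerl_agent, num_workers, trainers=[blue_dqn, red_model], pomdp_adv=False)
    try:
        # returns (avg shaped reward, avg red reward, avg actual blue reward, belief/type trace)
        return evaluator.evaluate()
    finally:
        evaluator.kill_processes()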