class SAC_Discrete_Trainer:
    """Main trainer class for discrete-action SAC

        Parameters:
            args (object): Parameter class with all the parameters
    """

    def __init__(self, args, model_constructor, env_constructor):
        self.args = args

        # MP TOOLS
        self.manager = Manager()

        # Algo
        self.algo = SAC_Discrete(args, model_constructor, args.gamma)

        # #Save best policy
        # self.best_policy = model_constructor.make_model('actor')

        # Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        # Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        self.rollout_bucket.append(model_constructor.make_model('Gumbel_FF'))

        ############## MULTIPROCESSING TOOLS ###################

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                      self.data_bucket, self.rollout_bucket, env_constructor))
                        for id in range(args.rollout_size)]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model('Gumbel_FF'))

        # Test workers (one per test episode of the dummy env)
        self.test_task_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_result_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           None, self.test_bucket, env_constructor))
                             for id in range(env_constructor.dummy_env.test_size)]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.test_trace = []
        self.rollout_fits_trace = []

        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0

    def forward_epoch(self, epoch, tracker):
        """Main training loop: run rollouts and policy-gradient updates

            Parameters:
                epoch (int): Current epoch of training

            Returns:
                test_mean, test_std
        """
        ################ START ROLLOUTS ##############

        # Sync the learner's actor to the CPU (rollout) actors
        self.algo.actor.cpu()
        utils.hard_update(self.rollout_bucket[0], self.algo.actor)
        utils.hard_update(self.test_bucket[0], self.algo.actor)
        self.algo.actor.cuda()

        # Start learner rollouts
        for rollout_id in range(self.args.rollout_size):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(0)
                self.roll_flag[rollout_id] = False

        # Start test rollouts (every epoch)
        if epoch % 1 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  # BURN-IN PERIOD
            # self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling

            for _ in range(self.gen_frames):
                s, ns, a, r, done = self.replay_buffer.sample(self.args.batch_size)
                if torch.cuda.is_available():
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()

                r = r * self.args.reward_scaling
                self.algo.update_parameters(s, ns, a, r, done)

            self.gen_frames = 0

        ########## HARD-JOIN LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                self.rollout_fits_trace.append(fitness)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score:
                    self.best_score = fitness

                self.roll_flag[i] = True

            # Refresh buffer (housekeeping: prune to stay under capacity)
            self.replay_buffer.referesh()

        ######################### END OF PARALLEL ROLLOUTS ################

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            eplens = []
            r1_reward = []
            num_footsteps = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)
            self.test_trace.append(test_mean)

            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))

            tracker.update([test_mean, self.r1_reward], self.total_frames)

            if self.r1_reward > self.best_score:
                self.best_score = self.r1_reward
                torch.save(self.test_bucket[0].state_dict(),
                           self.args.aux_folder + 'bestR1_' + self.args.savetag)
                print("Best R1 policy saved with score", '%.2f' % self.r1_reward)

        else:
            test_mean, test_std = None, None

        if epoch % 20 == 0:
            # Save models periodically
            torch.save(self.algo.actor.state_dict(),
                       self.args.aux_folder + 'actor_' + self.args.savetag)
            torch.save(self.algo.critic.state_dict(),
                       self.args.aux_folder + 'critic_' + self.args.savetag)
            print("Actor and Critic saved")

        return test_mean, test_std

    def train(self, frame_limit):
        # Tracker class to log progress
        test_tracker = utils.Tracker(self.args.savefolder,
                                     ['score_' + self.args.savetag, 'r1_' + self.args.savetag],
                                     '.csv')

        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            test_mean, test_std = self.forward_epoch(gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames,
                  'max_ever:', '%.2f' % self.best_score,
                  ' Avg:', '%.2f' % test_tracker.all_tracker[0][1],
                  ' Frames/sec:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Test/RolloutScore', ['%.2f' % i for i in self.test_trace[-1:]],
                  '%.2f' % self.rollout_fits_trace[-1],
                  'Ep_len', '%.2f' % self.ep_len,
                  '#Footsteps', '%.2f' % self.num_footsteps,
                  'R1_Reward', '%.2f' % self.r1_reward,
                  'savetag', self.args.savetag)

            if gen % 5 == 0:
                print()
                print('Entropy', self.algo.entropy['mean'],
                      'Next_Entropy', self.algo.next_entropy['mean'],
                      'Temp', self.algo.temp['mean'],
                      'Policy_Q', self.algo.policy_q['mean'],
                      'Critic_Loss', self.algo.critic_loss['mean'])
                print()

            if self.total_frames > frame_limit:
                break
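# The trainers in this file sync the learner's actor into the shared rollout/test
# buckets via utils.hard_update. The repo's own implementation is not shown in this
# section; the sketch below is a minimal stand-in illustrating the usual pattern
# (an in-place, gradient-free parameter copy), not necessarily the repo's exact code.
import torch


def hard_update_sketch(target: torch.nn.Module, source: torch.nn.Module):
    """Copy every parameter of `source` into `target` without tracking gradients."""
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(source_param.data)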
class ERL_Trainer:

    def __init__(self, args, model_constructor, env_constructor):

        self.args = args
        self.policy_string = 'CategoricalPolicy' if env_constructor.is_discrete else 'Gaussian_FF'
        self.manager = Manager()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Evolution
        self.evolver = SSNE(self.args)

        # Initialize population
        self.population = self.manager.list()
        for _ in range(args.pop_size):
            self.population.append(model_constructor.make_model(self.policy_string))

        # Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        # PG Learner
        if env_constructor.is_discrete:
            from algos.ddqn import DDQN
            self.learner = DDQN(args, model_constructor)
        else:
            from algos.sac import SAC
            self.learner = SAC(args, model_constructor)

        # Replay Buffer
        self.replay_buffer = Buffer(args.buffer_size)

        # Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        for _ in range(args.rollout_size):
            self.rollout_bucket.append(model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################

        # Evolutionary population rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, 'evo', self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                          args.rollout_size > 0, self.population, env_constructor))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                      True, self.rollout_bucket, env_constructor))
                        for id in range(args.rollout_size)]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model(self.policy_string))

        # Test workers
        self.test_task_pipes = [Pipe() for _ in range(args.num_test)]
        self.test_result_pipes = [Pipe() for _ in range(args.num_test)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           False, self.test_bucket, env_constructor))
                             for id in range(args.num_test)]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Trackers
        self.best_score = -float('inf')
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None

    def forward_generation(self, gen, tracker):

        gen_max = -float('inf')

        # Start evolution rollouts
        if self.args.pop_size > 1:
            for id, actor in enumerate(self.population):
                self.evo_task_pipes[id][0].send(id)

        # Sync the learner's actor to the CPU (rollout) actors and start their rollouts
        self.learner.actor.cpu()
        for rollout_id in range(len(self.rollout_bucket)):
            utils.hard_update(self.rollout_bucket[rollout_id], self.learner.actor)
            self.task_pipes[rollout_id][0].send(0)
        self.learner.actor.to(device=self.device)

        # Start test rollouts
        if gen % self.args.test_frequency == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  # BURN-IN PERIOD
            for _ in range(int(self.gen_frames * self.args.gradperstep)):
                s, ns, a, r, done = self.replay_buffer.sample(self.args.batch_size)
                self.learner.update_parameters(s, ns, a, r, done)
            self.gen_frames = 0

        ########## JOIN ROLLOUTS FOR EVO POPULATION ############
        all_fitness = []
        all_eplens = []
        if self.args.pop_size > 1:
            for i in range(self.args.pop_size):
                _, fitness, frames, trajectory = self.evo_result_pipes[i][1].recv()

                all_fitness.append(fitness)
                all_eplens.append(frames)
                self.gen_frames += frames
                self.total_frames += frames
                self.replay_buffer.add(trajectory)
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)

        ########## JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        rollout_fitness = []
        rollout_eplens = []
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                _, fitness, pg_frames, trajectory = self.result_pipes[i][1].recv()
                self.replay_buffer.add(trajectory)
                self.gen_frames += pg_frames
                self.total_frames += pg_frames
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)
                rollout_fitness.append(fitness)
                rollout_eplens.append(pg_frames)

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ FIGURE OUT THE CHAMP POLICY AND SYNC IT TO TEST #############
        if self.args.pop_size > 1:
            champ_index = all_fitness.index(max(all_fitness))
            utils.hard_update(self.test_bucket[0], self.population[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.population[champ_index])
                torch.save(self.population[champ_index].state_dict(),
                           self.args.aux_folder + '_best' + self.args.savetag)
                print("Best policy saved with score", '%.2f' % max(all_fitness))

        else:  # If there is no population, the champion is just the policy-gradient learner's actor
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  # Collect all results
                _, fitness, _, _ = pipe[1].recv()
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)
                test_scores.append(fitness)
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)
            tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size > 1:
            self.evolver.epoch(gen, self.population, all_fitness, self.rollout_bucket)

        # Compute the champion's episode length
        champ_len = all_eplens[all_fitness.index(max(all_fitness))] if self.args.pop_size > 1 \
            else rollout_eplens[rollout_fitness.index(max(rollout_fitness))]

        return gen_max, champ_len, all_eplens, test_mean, test_std, rollout_fitness, rollout_eplens

    def train(self, frame_limit):
        # Tracker class to log progress
        test_tracker = utils.Tracker(self.args.savefolder, ['score_' + self.args.savetag], '.csv')

        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            max_fitness, champ_len, all_eplens, test_mean, test_std, rollout_fitness, rollout_eplens = \
                self.forward_generation(gen, test_tracker)
            if test_mean:
                self.args.writer.add_scalar('test_score', test_mean, gen)

            print('Gen/Frames:', gen, '/', self.total_frames,
                  ' Gen_max_score:', '%.2f' % max_fitness,
                  ' Champ_len', '%.2f' % champ_len,
                  ' Test_score u/std', utils.pprint(test_mean), utils.pprint(test_std),
                  ' Rollout_u/std:', utils.pprint(np.mean(np.array(rollout_fitness))),
                  utils.pprint(np.std(np.array(rollout_fitness))),
                  ' Rollout_mean_eplen:',
                  utils.pprint(sum(rollout_eplens) / len(rollout_eplens)) if rollout_eplens else None)

            if gen % 5 == 0:
                print('Best_score_ever:', '%.2f' % self.best_score,
                      ' FPS:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                      'savetag', self.args.savetag)
                print()

            if self.total_frames > frame_limit:
                break

        # Kill all processes
        try:
            for p in self.task_pipes:
                p[0].send('TERMINATE')
            for p in self.test_task_pipes:
                p[0].send('TERMINATE')
            for p in self.evo_task_pipes:
                p[0].send('TERMINATE')
        except:
            pass
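# The trainers above talk to their rollout workers over multiprocessing Pipes: the
# trainer sends a policy index on a task pipe, the worker replies on a result pipe
# with a tuple of the form (id, fitness, num_frames, trajectory), and the string
# 'TERMINATE' shuts the worker down. The repo's real rollout_worker also steps the
# environment and records transitions; the toy worker below only mirrors that
# messaging protocol (the rollout values it returns are placeholders, not real data).
from multiprocessing import Pipe, Process


def toy_rollout_worker(worker_id, task_conn, result_conn):
    while True:
        task = task_conn.recv()                # Block until the trainer sends an index
        if task == 'TERMINATE':
            break
        fitness, num_frames, trajectory = 0.0, 0, []   # Placeholder rollout results
        result_conn.send((worker_id, fitness, num_frames, trajectory))


if __name__ == '__main__':
    task_pipe, result_pipe = Pipe(), Pipe()
    worker = Process(target=toy_rollout_worker, args=(0, task_pipe[1], result_pipe[0]))
    worker.start()
    task_pipe[0].send(0)                       # Request one rollout of policy 0
    print(result_pipe[1].recv())               # -> (0, 0.0, 0, [])
    task_pipe[0].send('TERMINATE')
    worker.join()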
class Agent:
    """Learner object encapsulating a local learner

        Parameters:
            args (object): Parameter class; expected fields include:
                algo_name (str): Algorithm identifier
                state_dim (int): State size
                action_dim (int): Action size
                actor_lr (float): Actor learning rate
                critic_lr (float): Critic learning rate
                gamma (float): Discount rate
                tau (float): Target network soft-update rate
                init_w (bool): Use Kaiming-normal initialization?
            id (int): Learner id
    """

    def __init__(self, args, id):
        self.args = args
        self.id = id

        # Initialize the neuroevolution module
        self.evolver = SSNE(self.args)

        # Initialize population
        self.manager = Manager()
        self.popn = self.manager.list()
        for _ in range(args.popn_size):
            if args.ps == 'trunk':
                self.popn.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size,
                                                args.config.num_agents))
            else:
                if args.algo_name == 'TD3':
                    self.popn.append(Actor(args.state_dim, args.action_dim, args.hidden_size,
                                           policy_type='DeterministicPolicy'))
                else:
                    self.popn.append(Actor(args.state_dim, args.action_dim, args.hidden_size,
                                           policy_type='GaussianPolicy'))
            self.popn[-1].eval()

        #### INITIALIZE PG ALGO #####
        if args.ps == 'trunk':
            if self.args.is_matd3 or args.is_maddpg:
                algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
                self.algo = MATD3(id, algo_name, args.state_dim, args.action_dim, args.hidden_size,
                                  args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag,
                                  args.aux_save, args.actualize, args.use_gpu, args.config.num_agents,
                                  args.init_w)
            else:
                self.algo = MultiTD3(id, args.algo_name, args.state_dim, args.action_dim, args.hidden_size,
                                     args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag,
                                     args.aux_save, args.actualize, args.use_gpu, args.config.num_agents,
                                     args.init_w)

        else:
            if args.algo_name == 'TD3':
                self.algo = TD3(id, args.algo_name, args.state_dim, args.action_dim, args.hidden_size,
                                args.actor_lr, args.critic_lr, args.gamma, args.tau, args.savetag,
                                args.aux_save, args.actualize, args.use_gpu, args.init_w)
            else:
                self.algo = SAC(id, args.state_dim, args.action_dim, args.hidden_size, args.gamma,
                                args.critic_lr, args.actor_lr, args.tau, args.alpha,
                                args.target_update_interval, args.savetag, args.aux_save,
                                args.actualize, args.use_gpu)

        #### Rollout Actor is a template used for MP #####
        self.rollout_actor = self.manager.list()
        if args.ps == 'trunk':
            self.rollout_actor.append(MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size,
                                                     args.config.num_agents))
        else:
            if args.algo_name == 'TD3':
                self.rollout_actor.append(Actor(args.state_dim, args.action_dim, args.hidden_size,
                                                policy_type='DeterministicPolicy'))
            else:
                self.rollout_actor.append(Actor(args.state_dim, args.action_dim, args.hidden_size,
                                                policy_type='GaussianPolicy'))

        # Initialize buffer
        if args.ps == 'trunk':
            self.buffer = [Buffer(args.buffer_size, buffer_gpu=False, filter_c=args.filter_c)
                           for _ in range(args.config.num_agents)]
        else:
            self.buffer = Buffer(args.buffer_size, buffer_gpu=False, filter_c=args.filter_c)

        # Agent metrics
        self.fitnesses = [[] for _ in range(args.popn_size)]

        ### Best Policy HOF ####
        self.champ_ind = 0

    def update_parameters(self):
        td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2,
                   'action_low': -1.0, 'action_high': 1.0}

        if self.args.ps == 'trunk':
            for agent_id, buffer in enumerate(self.buffer):
                if self.args.is_matd3 or self.args.is_maddpg:
                    buffer = self.buffer[0]  # Hardcoded hack for MADDPG
                buffer.referesh()
                if buffer.__len__() < 10 * self.args.batch_size:
                    buffer.pg_frames = 0
                    return  # BURN-IN PERIOD
                buffer.tensorify()

                for _ in range(int(self.args.gradperstep * buffer.pg_frames)):
                    s, ns, a, r, done, global_reward = buffer.sample(self.args.batch_size,
                                                                     pr_rew=self.args.priority_rate,
                                                                     pr_global=self.args.priority_rate)
                    r *= self.args.reward_scaling
                    if self.args.use_gpu:
                        s = s.cuda()
                        ns = ns.cuda()
                        a = a.cuda()
                        r = r.cuda()
                        done = done.cuda()
                        global_reward = global_reward.cuda()
                    self.algo.update_parameters(s, ns, a, r, done, global_reward, agent_id, 1, **td3args)

                buffer.pg_frames = 0

        else:
            self.buffer.referesh()
            if self.buffer.__len__() < 10 * self.args.batch_size:
                return  # BURN-IN PERIOD
            self.buffer.tensorify()

            for _ in range(int(self.args.gradperstep * self.buffer.pg_frames)):
                s, ns, a, r, done, global_reward = self.buffer.sample(self.args.batch_size,
                                                                      pr_rew=self.args.priority_rate,
                                                                      pr_global=self.args.priority_rate)
                r *= self.args.reward_scaling
                if self.args.use_gpu:
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()
                    global_reward = global_reward.cuda()
                self.algo.update_parameters(s, ns, a, r, done, global_reward, 1, **td3args)

            self.buffer.pg_frames = 0  # Reset new-frame counter to 0

    def evolve(self):
        ## One generation of evolution ###
        if self.args.popn_size > 1:  # If evolution is enabled

            if self.args.scheme == 'multipoint':
                # Make sure that the buffer has been refreshed and tensorified
                buffer_pointer = self.buffer[0] if self.args.ps == 'trunk' else self.buffer
                if buffer_pointer.__len__() < 1000:
                    buffer_pointer.tensorify()
                if random.random() < 0.01:
                    buffer_pointer.tensorify()

                # Get a sample of states from the buffer
                if buffer_pointer.__len__() < 1000:
                    sample_size = buffer_pointer.__len__()
                else:
                    sample_size = 1000

                if sample_size == 1000 and len(buffer_pointer.sT) < 1000:
                    buffer_pointer.tensorify()

                states, _, _, _, _, _ = buffer_pointer.sample(sample_size, pr_rew=0.0, pr_global=0.0)
                states = states.cpu()

            elif self.args.scheme == 'standard':
                states = None

            else:
                sys.exit('Unknown Evo Scheme')

            # Indices of nets that were evaluated this generation (meant for asynchronous evolution workloads)
            net_inds = [i for i in range(len(self.popn))]  # Hack for a synchronous run

            # Evolve
            if self.args.rollout_size > 0:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds, self.fitnesses,
                                                     [self.rollout_actor[0]], states)
            else:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds, self.fitnesses, [], states)

        # Reset fitness metrics
        self.fitnesses = [[] for _ in range(self.args.popn_size)]

    def update_rollout_actor(self):
        for actor in self.rollout_actor:
            self.algo.policy.cpu()
            mod.hard_update(actor, self.algo.policy)
            if self.args.use_gpu:
                self.algo.policy.cuda()
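# Agent.update_parameters above gates learning on a burn-in threshold
# (10 * batch_size transitions) and then sizes its update budget as
# int(gradperstep * pg_frames): the number of gradient steps scales with the frames
# collected since the last update, after which the frame counter resets. A minimal,
# repo-independent sketch of that bookkeeping; `buffer` and `take_one_step` here are
# illustrative stand-ins, not the repo's objects.

def run_update_budget(buffer, take_one_step, batch_size, gradperstep=1.0):
    """Take int(gradperstep * buffer.pg_frames) gradient steps, then reset the counter."""
    if len(buffer) < 10 * batch_size:      # Burn-in: wait until enough data is collected
        buffer.pg_frames = 0
        return
    for _ in range(int(gradperstep * buffer.pg_frames)):
        take_one_step(buffer.sample(batch_size))   # One optimizer step per sampled batch
    buffer.pg_frames = 0                   # Reset the new-frame counter until the next rollout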
class CERL_Trainer:
    """Main CERL class containing all methods for CERL

        Parameters:
            args (object): Parameter class with all the parameters
    """

    def __init__(self, args, model_constructor, env_constructor):
        self.args = args
        self.policy_string = self.compute_policy_type()

        # Evolution
        self.evolver = SSNE(self.args)

        # MP TOOLS
        self.manager = Manager()

        # Genealogy tool
        self.genealogy = Genealogy()

        # Initialize population
        self.population = self.manager.list()
        seed = True
        for _ in range(args.pop_size):
            self.population.append(model_constructor.make_model(self.policy_string, seed=seed))
            seed = False

        # SEED
        # self.population[0].load_state_dict(torch.load('Results/Auxiliary/_bestcerl_td3_s2019_roll10_pop10_portfolio10'))

        # Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        # Turn off gradients and put in eval mode
        for actor in self.population:
            actor = actor.cpu()
            actor.eval()

        # Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy,
                                              args.portfolio_id, model_constructor)

        # Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            self.rollout_bucket.append(model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################

        # Evolutionary population rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(id, 'evo', self.evo_task_pipes[id][1], self.evo_result_pipes[id][0],
                                          self.data_bucket, self.population, env_constructor))
                            for id in range(args.pop_size)]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [Process(target=rollout_worker,
                                args=(id, 'pg', self.task_pipes[id][1], self.result_pipes[id][0],
                                      self.data_bucket, self.rollout_bucket, env_constructor))
                        for id in range(args.rollout_size)]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model(self.policy_string))

        # Test workers (one per test episode of the dummy env)
        self.test_task_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_result_pipes = [Pipe() for _ in range(env_constructor.dummy_env.test_size)]
        self.test_workers = [Process(target=rollout_worker,
                                     args=(id, 'test', self.test_task_pipes[id][1], self.test_result_pipes[id][0],
                                           None, self.test_bucket, env_constructor))
                             for id in range(env_constructor.dummy_env.test_size)]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (resource distribution)
        self.allocation = []  # Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(i % len(self.portfolio))  # Start uniformly (equal resources)

        # Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.best_r1_score = 0.0
        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0
        self.test_trace = []

    def checkpoint(self):
        utils.pickle_obj(self.args.aux_folder + self.args.algo + '_checkpoint_frames' + str(self.total_frames),
                         self.portfolio)

    def load_checkpoint(self, filename):
        self.portfolio = utils.unpickle_obj(filename)

    def forward_generation(self, gen, tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

            Parameters:
                gen (int): Current generation of training

            Returns:
                max_fit, champ_len, all_eplens, test_mean, test_std
        """
        ################ START ROLLOUTS ##############

        # Start evolution rollouts
        if self.args.pop_size > 1:
            for id, actor in enumerate(self.population):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        # Sync each learner's actor to its CPU (rollout) actor
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        # Start learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        # Start test rollouts
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__() > self.args.learning_start:  # BURN-IN PERIOD

            # Spin up a thread for each learner
            threads = [threading.Thread(target=learner.update_parameters,
                                        args=(self.replay_buffer, self.args.batch_size,
                                              int(self.gen_frames * self.args.gradperstep)))
                       for learner in self.portfolio]

            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        if self.args.pop_size > 1:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join: proceed once a fraction (asynch_frac) of the population has reported
                if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                self.portfolio[learner_id].update_stats(fitness, num_frames)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score:
                    self.best_score = fitness

                self.roll_flag[i] = True

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if self.args.pop_size > 1:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.population[champ_index])
        else:  # Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            eplens = []
            r1_reward = []
            num_footsteps = []
            for pipe in self.test_result_pipes:  # Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = np.std(test_scores)
            self.test_trace.append(test_mean)

            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))

            if self.r1_reward > self.best_r1_score:
                self.best_r1_score = self.r1_reward
                utils.hard_update(self.best_policy, self.test_bucket[0])
                torch.save(self.test_bucket[0].state_dict(),
                           self.args.aux_folder + '_bestR1_' + self.args.savetag)
                print("Best R1 policy saved with score", '%.2f' % self.r1_reward)

            if test_mean > self.best_score:
                self.best_score = test_mean
                utils.hard_update(self.best_policy, self.test_bucket[0])
                torch.save(self.test_bucket[0].state_dict(),
                           self.args.aux_folder + '_bestShaped' + self.args.savetag)
                print("Best shaped policy saved with score", '%.2f' % test_mean)

            tracker.update([test_mean, self.r1_reward], self.total_frames)

        else:
            test_mean, test_std = None, None

        # Refresh buffer (housekeeping: prune to stay under capacity)
        self.replay_buffer.referesh()

        # NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size > 1:
            if self.args.scheme == 'multipoint':
                sample_size = self.args.batch_size if self.replay_buffer.__len__() >= self.args.batch_size \
                    else self.replay_buffer.__len__()
                states, _, _, _, _ = self.replay_buffer.sample(batch_size=sample_size)
            else:
                states = None
            self.evolver.epoch(self.population, all_net_ids, all_fitness, self.rollout_bucket, states)

        # META-LEARNING: RESET ALLOCATION USING UCB
        if self.args.rollout_size > 0:
            self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient)

        # Metrics
        if self.args.pop_size > 1:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            # champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_eplens, test_mean, test_std

    def train(self, frame_limit):
        # Tracker classes to log scores and gradient statistics
        test_tracker = utils.Tracker(self.args.savefolder,
                                     ['score_' + self.args.savetag, 'r1_' + self.args.savetag],
                                     '.csv')
        grad_temp = [str(i) + 'entropy_' + self.args.savetag for i in range(len(self.portfolio))] + \
                    [str(i) + 'policyQ_' + self.args.savetag for i in range(len(self.portfolio))]
        grad_tracker = utils.Tracker(self.args.aux_folder, grad_temp, '.csv')

        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            max_fitness, champ_len, all_eplens, test_mean, test_std = self.forward_generation(gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames,
                  ' Pop_max/max_ever:', '%.2f' % max_fitness, '/', '%.2f' % self.best_score,
                  ' Avg:', '%.2f' % test_tracker.all_tracker[0][1],
                  ' Frames/sec:', '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Champ_len', '%.2f' % champ_len,
                  ' Test_score u/std', utils.pprint(test_mean), utils.pprint(test_std),
                  'Ep_len', '%.2f' % self.ep_len,
                  '#Footsteps', '%.2f' % self.num_footsteps,
                  'R1_Reward', '%.2f' % self.r1_reward,
                  'savetag', self.args.savetag)

            grad_temp = [algo.algo.entropy['mean'] for algo in self.portfolio] + \
                        [algo.algo.policy_q['mean'] for algo in self.portfolio]
            grad_tracker.update(grad_temp, self.total_frames)

            if gen % 5 == 0:
                print('Learner Fitness', [utils.pprint(learner.value) for learner in self.portfolio],
                      'Sum_stats_resource_allocation', [learner.visit_count for learner in self.portfolio])
                try:
                    print('Entropy', ['%.2f' % algo.algo.entropy['mean'] for algo in self.portfolio],
                          'Next_Entropy', ['%.2f' % algo.algo.next_entropy['mean'] for algo in self.portfolio],
                          'Policy_Q', ['%.2f' % algo.algo.policy_q['mean'] for algo in self.portfolio],
                          'Critic_Loss', ['%.2f' % algo.algo.critic_loss['mean'] for algo in self.portfolio])
                    print()
                except:
                    pass

            if self.total_frames > frame_limit:
                break

        # Kill all processes
        try:
            for p in self.task_pipes:
                p[0].send('TERMINATE')
            for p in self.test_task_pipes:
                p[0].send('TERMINATE')
            for p in self.evo_task_pipes:
                p[0].send('TERMINATE')
        except:
            pass

    def compute_policy_type(self):
        if self.args.algo == 'ddqn':
            return 'DDQN'
        elif self.args.algo == 'sac':
            return 'Gaussian_FF'
        elif self.args.algo == 'td3':
            return 'Deterministic_FF'
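# CERL_Trainer reallocates rollout workers across the learner portfolio each generation
# via ucb(num_rollouts, portfolio, ucb_coefficient). The repo's implementation is not
# shown in this section; the sketch below illustrates a standard UCB1-style score built
# from the learner `value` and `visit_count` statistics referenced in train(). It
# greedily assigns every worker to the top-scoring learner, whereas the repo's version
# may spread workers across learners; treat this as an assumption-laden stand-in.
import math


def ucb_allocation_sketch(num_rollouts, portfolio, coefficient):
    """Assign every rollout worker to the learner with the highest UCB score."""
    total_visits = sum(max(learner.visit_count, 1) for learner in portfolio)
    scores = [learner.value +
              coefficient * math.sqrt(math.log(total_visits) / max(learner.visit_count, 1))
              for learner in portfolio]
    best = scores.index(max(scores))       # Exploitation term plus exploration bonus
    return [best for _ in range(num_rollouts)]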