Example #1
0
File: main.py  Project: Anonymous991/CERL
class CERL_Agent:
    """Main CERL agent.

    Couples a neuroevolution population with a portfolio of policy-gradient
    learners that share a single replay buffer. Rollouts run in separate
    worker processes (one set for the evolutionary population, one for the
    learners, one for testing), and compute resources are re-allocated
    across learners each generation via UCB.

    Parameters:
        args (object): Parameter container with all hyperparameters
            (pop_size, rollout_size, state_dim, action_dim, hidden_size,
            batch_size, buffer_gpu, noise_std, gradperstep, asynch_frac,
            ucb_coefficient, aux_folder, ...).
    """
    def __init__(self, args):
        self.args = args
        self.evolver = SSNE(self.args)  # neuroevolution engine (selection/mutation over self.pop)

        #MP TOOLS
        self.manager = Manager()

        #Genealogy tool: assigns and tracks lineage ids (wwid) for evolved nets
        self.genealogy = Genealogy()

        #Initialize population (manager.list so worker processes see updates)
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            else:
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

        # Container for the best policy found so far (wwid -1 marks it)
        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1)

        #Turn off gradients and put in eval mode (population only rolls out)
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        #Init BUFFER (capacity 1M transitions, shared by all learners)
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        #Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        # One CPU-side rollout copy per learner, re-synced every generation
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1))

        # Initialize shared data bucket (workers push transitions here)
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################

        #Evolutionary population rollout workers (no exploration noise)
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO))
            for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]  # True = worker idle/ready

        #Learner rollout workers (with exploration noise; fill replay buffer)
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, self.task_pipes[id][1], self.result_pipes[id][0],
                          True, self.data_bucket, self.rollout_bucket,
                          ENV_NAME, args.noise_std, ALGO))
            for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket: holds the current champion network for evaluation
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        #Test workers (TEST_SIZE parallel evaluation episodes)
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO))
            for id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)
        #self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistsitic (each node is a learner), to compute UCB scores

        #Trackers
        # NOTE(review): a 0.0 floor hides all-negative fitness histories —
        # confirm the environment's returns are non-negative (the variant
        # below initializes this to -np.inf instead).
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    def train(self, gen, frame_tracker):
        """Run one generation: rollouts, gradient updates, and evolution.

        Parameters:
            gen (int): Current generation/epoch of training.
            frame_tracker (object): Tracker that logs test scores vs frames.

        Returns:
            tuple: (max_fit, champ_len, all_fitness, all_eplens, test_mean,
                test_std, champ_wwid); test_mean/test_std are None on
                generations without a test rollout.
        """
        ################ START ROLLOUTS ##############

        #Start Evolution rollouts (skipped when running policy gradient alone)
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        #Sync all learners' actors to the cpu (rollout) copies
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        # Start Learner rollouts (each rollout worker serves one learner id)
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        #Start Test rollouts every 5th generation
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__(
        ) > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify(
            )  # Tensorify the buffer for fast sampling

            #Spin up one thread per learner; each runs gradperstep * gen_frames grad steps
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]

            # Start threads
            for thread in threads:
                thread.start()

            #Join threads
            for thread in threads:
                thread.join()
            self.gen_frames = 0

        ########## SOFT-JOIN ROLLOUTS FOR EVO POPULATION ############
        # Busy-polls result pipes until a fraction asynch_frac of the
        # population has reported back (stragglers are collected next gen).
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()  # (net_id, fitness, frames)
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%): proceed once asynch_frac of the pop is in
                if len(all_fitness
                       ) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()  # blocks until the worker reports
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        #Refresh buffer (housekeeping - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % max(all_fitness))

        else:  #Run PG in isolation: evaluate the first learner's rollout actor
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        #NeuroEvolution's probabilistic selection and recombination step.
        # Every 5th generation the learners' rollout actors migrate into the
        # evolutionary population; otherwise the population evolves alone.
        if not ISOLATE_PG:
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, self.rollout_bucket)
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        #META LEARNING - RESET ALLOCATION USING UCB (every generation)
        if gen % 1 == 0:
            self.allocation = ucb(len(self.allocation), self.portfolio,
                                  self.args.ucb_coefficient)

        #Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            # NOTE(review): fitness/num_frames come from the learner
            # hard-join loop above and are unbound when rollout_size == 0 —
            # confirm ISOLATE_PG always runs with rollout_size > 0.
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
Example #2
0
class SAC_Discrete_Trainer:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (object): Parameter class with all the parameters

	"""
    def __init__(self, args, model_constructor, env_constructor):
        self.args = args

        #MP TOOLS
        self.manager = Manager()

        #Algo
        self.algo = SAC_Discrete(args, model_constructor, args.gamma)

        # #Save best policy
        # self.best_policy = model_constructor.make_model('actor')

        #Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        #Initialize Rollout Bucket
        self.rollout_bucket = self.manager.list()
        self.rollout_bucket.append(model_constructor.make_model('Gumbel_FF'))

        ############## MULTIPROCESSING TOOLS ###################
        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 'pg', self.task_pipes[id][1],
                          self.result_pipes[id][0], self.data_bucket,
                          self.rollout_bucket, env_constructor))
            for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket
        self.test_bucket = self.manager.list()
        self.test_bucket.append(model_constructor.make_model('Gumbel_FF'))

        #5 Test workers
        self.test_task_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_result_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 'test', self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], None,
                          self.test_bucket, env_constructor))
            for id in range(env_constructor.dummy_env.test_size)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Trackers
        self.best_score = 0.0
        self.gen_frames = 0
        self.total_frames = 0
        self.test_score = None
        self.test_std = None
        self.test_trace = []
        self.rollout_fits_trace = []

        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0

    def forward_epoch(self, epoch, tracker):
        """Main training loop to do rollouts, neureoevolution, and policy gradients

			Parameters:
				gen (int): Current epoch of training

			Returns:
				None
		"""

        ################ START ROLLOUTS ##############
        #Sync all learners actor to cpu (rollout) actor
        self.algo.actor.cpu()
        utils.hard_update(self.rollout_bucket[0], self.algo.actor)
        utils.hard_update(self.test_bucket[0], self.algo.actor)
        self.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id in range(self.args.rollout_size):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(0)
                self.roll_flag[rollout_id] = False

        #Start Test rollouts
        if epoch % 1 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__(
        ) > self.args.learning_start:  ###BURN IN PERIOD
            #self.replay_buffer.tensorify()  # Tensorify the buffer for fast sampling
            for _ in range(self.gen_frames):
                s, ns, a, r, done = self.replay_buffer.sample(
                    self.args.batch_size)
                if torch.cuda.is_available():
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()
                    r = r * self.args.reward_scaling
                self.algo.update_parameters(s, ns, a, r, done)
        self.gen_frames = 0

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                self.rollout_fits_trace.append(fitness)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score: self.best_score = fitness

                self.roll_flag[i] = True

            #Referesh buffer (housekeeping tasks - pruning to keep under capacity)
            self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            eplens = []
            r1_reward = []
            num_footsteps = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))
            self.test_trace.append(test_mean)
            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))
            tracker.update([test_mean, self.r1_reward], self.total_frames)

            if self.r1_reward > self.best_score:
                self.best_score = self.r1_reward
                torch.save(
                    self.test_bucket[0].state_dict(),
                    self.args.aux_folder + 'bestR1_' + self.args.savetag)
                print("Best R1 Policy saved with score",
                      '%.2f' % self.r1_reward)

        else:
            test_mean, test_std = None, None

        if epoch % 20 == 0:
            #Save models
            torch.save(self.algo.actor.state_dict(),
                       self.args.aux_folder + 'actor_' + self.args.savetag)
            torch.save(self.algo.critic.state_dict(),
                       self.args.aux_folder + 'critic_' + self.args.savetag)
            print("Actor and Critic saved")

        return test_mean, test_std

    def train(self, frame_limit):
        # Define Tracker class to track scores
        test_tracker = utils.Tracker(
            self.args.savefolder,
            ['score_' + self.args.savetag, 'r1_' + self.args.savetag],
            '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            test_mean, test_std = self.forward_epoch(gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames, 'max_ever:',
                  '%.2f' % self.best_score, ' Avg:',
                  '%.2f' % test_tracker.all_tracker[0][1], ' Frames/sec:',
                  '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Test/RolloutScore',
                  ['%.2f' % i for i in self.test_trace[-1:]],
                  '%.2f' % self.rollout_fits_trace[-1], 'Ep_len',
                  '%.2f' % self.ep_len, '#Footsteps',
                  '%.2f' % self.num_footsteps, 'R1_Reward',
                  '%.2f' % self.r1_reward, 'savetag', self.args.savetag)

            if gen % 5 == 0:
                print()

                print('Entropy', self.algo.entropy['mean'], 'Next_Entropy',
                      self.algo.next_entropy['mean'], 'Temp',
                      self.algo.temp['mean'], 'Poilcy_Q',
                      self.algo.policy_q['mean'], 'Critic_Loss',
                      self.algo.critic_loss['mean'])

                print()

            if self.total_frames > frame_limit:
                break
Example #3
0
class CERL_Agent:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (int): Parameter class with all the parameters

	"""
    def __init__(self,
                 args):  # need to initialize rollout_workers to have blue agent
        """Build population, learner portfolio, replay buffer, SA state and
        all rollout/test worker processes for adversarial (blue-vs-red) CERL.

        Parameters:
            args (object): Parameter container with all hyperparameters;
                for ALGO 'dis'/'TD3_tennis' it must also carry a
                ``blue_trainer`` (shared trainer for the blue agent).
        """
        self.args = args
        self.evolver = SSNE(
            self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()

        self.mutate_algos = [
            Mutation_Add(self),
            Mutation_Delete(self),
            Mutation_Exchange(self)
        ]  #store all the mutate algorithm objects
        # Genealogy tool: assigns lineage ids (wwid) to evolved nets
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # Simulated-annealing state over the learner portfolio
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            elif ALGO == 'TD3':
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
                # use ALGO to distinguish different net architectures
            elif ALGO == 'dis' or ALGO == 'TD3_tennis':
                # BUGFIX: was `ALGO == 'dis' or 'TD3_tennis'`, which is always
                # truthy ('TD3_tennis' is a non-empty string) and made the
                # else branch below unreachable for any ALGO value.
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                assert False, "invalid algorithm type"

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)
            if ALGO == 'dis':
                # Average policy: used as the blue agent's internal belief
                # update model; shared across processes.
                self.average_policy = AverageActor(args.state_dim,
                                                   args.action_dim,
                                                   -2,
                                                   ALGO,
                                                   self.pop,
                                                   self.replay_buffer,
                                                   args.buffer_gpu,
                                                   args.batch_size,
                                                   iterations=10)
                self.average_policy.share_memory()

        self.best_policy.share_memory()

        # added by macheng, share the best policy across processes (used as
        # internal belief update models for blue)

        # now we assign shared blue_trainer; we should train this agent such
        # that the roll_out workers are also up to date; should make sure that
        # self.best_policy (emergent learner) is also shared
        if ALGO == 'dis' or ALGO == 'TD3_tennis':
            # BUGFIX: was `ALGO == 'dis' or 'TD3_tennis'` (always truthy),
            # which forced every algorithm to provide a blue_trainer.
            assert hasattr(
                args, "blue_trainer"
            ), "must have blue_agent trainer to intialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        else:
            trainers = [args.blue_trainer, None
                        ] if ALGO == 'TD3_tennis' else []

        self.trainers = trainers

        # NOTE(review): blue_trainer is read unconditionally here, so even
        # non-'dis'/'TD3_tennis' configs must supply it — confirm intended.
        self.blue_dqn = args.blue_trainer

        # Turn off gradients and put in eval mode (population only rolls out)
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()
        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        self.complement_portfolio = [
        ]  #complement of the portfolio: learners currently excluded by SA live here
        self.total_rollout_bucket = self.manager.list(
        )  #full set of rollout models; rollout_bucket dynamically resizes with the portfolio (for SA)
        self.rollout_bucket = self.total_rollout_bucket
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1, ALGO))
        # Initialize shared data bucket (workers push transitions here)
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population rollout workers (worker type 0)
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, 0, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO,
                          self.trainers)) for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        # Learner rollout workers (worker type 1; fill the replay buffer)
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 1, self.task_pipes[id][1],
                          self.result_pipes[id][0], True, self.data_bucket,
                          self.rollout_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        # Test bucket: holds the current champion for evaluation
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(
                Actor(args.state_dim, args.action_dim, -1, ALGO))

        # Test workers (worker type 2; TEST_SIZE parallel evaluations)
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 2, self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

        # trainers contains the blue_dqn to be trained and the red model used
        # for belief update; red_actor is the actual red agent trained against
        # (id is the actual red agent id)

    def _update_SA_temperature(self):
        self.T = max(self.T * self.decay_rate, self.T_min)

    def _get_accept_rate(self):
        """Return the SA acceptance probability for the latest portfolio move.

        Always 1.0 under RANDOM_WALK or when the newest metric improved on
        the previous one; otherwise the Metropolis factor exp(delta / T).
        """
        if RANDOM_WALK:
            return 1.0
        prev, curr = self.metrics[-2], self.metrics[-1]
        if curr > prev:
            return 1.0
        return np.exp((curr - prev) / self.T)

    def _mutate(self):
        while True:
            mutate_algo_index = random.choice(range(3))
            if self._try_mutate(mutate_algo_index):
                return

    def _try_mutate(self,
                    algo_index):  # 0 for add, 1 for delete, 2 for exchange
        return self.mutate_algos[algo_index].try_mutate()

    def simulated_annealing(self, metric):  #take in the current metric
        """One simulated-annealing step over the learner portfolio.

        Records `metric`, possibly reverts the previous mutation based on the
        acceptance rate, snapshots the (possibly reverted) portfolio as the
        new rollback point, then perturbs it again.

        Parameters:
            metric (float): Current performance metric of the portfolio.
        """
        self.metrics.append(metric)
        if self.last_portfolio:  #has last_portfolio
            accept_rate = self._get_accept_rate()  #based on self.metrics[-2:]
            self._update_SA_temperature()
            if np.random.random() > accept_rate:  #reject: roll back to the previous portfolio
                self.portfolio = self.last_portfolio
                self.complement_portfolio = self.last_complement_portfolio

        self.last_portfolio = copy.copy(
            self.portfolio)  #maintain a shallow copy as the rollback point
        self.last_complement_portfolio = copy.copy(self.complement_portfolio)
        self._mutate()  #perturb the portfolio
        # update rollout_bucket size; only the first len(self.portfolio) rollout_buckets are visible
        self.update_rollout_bucket()
        # update allocation, to be compatible with the current portfolio
        self.update_allocation()

    def update_rollout_bucket(self):
        self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)]

    def train_blue_dqn(
        self,
        trainers,
        env_name,
        gen,
        ALGO='dis',
        pomdp_adv=False
    ):  #in this method, rollout and training are done together, opponent sampled from the population
        """Train the blue DQN against red opponents sampled from self.pop.

        Rollout and training are interleaved: each episode steps the env with
        an epsilon-greedy blue action and feeds transitions to blue_dqn.step.

        Parameters:
            trainers (list): [blue_dqn, belief-update model], as built in __init__.
            env_name (str): Environment name for EnvironmentWrapper.
            gen (int): Current generation; controls the epsilon schedule and
                periodic checkpointing.
            ALGO (str): 'dis', 'TD3_tennis', or anything else for a plain env.
            pomdp_adv (bool): If True, use the POMDP adversary and never a
                TD3 red actor.

        Returns:
            tuple: (average_reward, average_red_reward,
                average_actual_blue_reward) over NUM_EPISODE episodes.
        """
        NUM_EPISODE = 100  #train 100 episodes for the blue to converge to the new best response to red
        # Epsilon schedule: halves each generation after gen 10, floored at 0.15
        EPS_START = max(1.0 * 0.5**(gen - 10),
                        0.15) if gen >= 10 else 1.0  #initial epsilon
        EPS_END = 0.05
        EPS_DECAY = 0.995

        if ALGO == 'dis':  # make env with blue and red policy agent inside,
            assert trainers is not None
            dis_env = make_self_play_env(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                return_policy_agent=False,
                trainers=trainers
            )[0]  # trainer if not None, first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper(
                env_name, ALGO, dis_env,
                0)  # the "0" is the index for training blue agent
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=no_graphics,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
        else:
            env = EnvironmentWrapper(env_name, ALGO)

        blue_dqn = trainers[0]
        average_reward = 0
        eps = EPS_START

        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0

        for it in range(NUM_EPISODE):
            if not pomdp_adv:  #if pomdp_adv, make sure that TD3_actor is never used
                # Sample a red opponent uniformly from the population
                id = np.random.choice(np.array(range(len(self.pop))))
                red_actor = self.pop[id]
                env.set_TD3_actor(red_actor)

            fitness = 0.0
            #here fitness is simply the episode reward
            total_frame = 0
            state = env.reset()
            env.randomize_neu_adv()

            if pomdp_adv:
                env.try_set_pomdp_adv(
                )  #try to set opponent to pomdp adv if opponent is adversary, else do nothing

            # Render ~5% of episodes (only if args.render is also set)
            render_flag = (np.random.random() < 0.05)
            while True:  # unless done

                action = blue_dqn.act(state, eps=eps)
                # action = utils.to_numpy(action)

                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=DRQN
                )  #after calling env.step, evaluator initialized later does not work
                #should be something wrong with the internal red model?
                blue_dqn.step(state, action, reward, next_state, done)

                if render_flag and self.args.render:
                    env.render()
                # next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                state = next_state
                fitness += reward
                total_frame += 1

                # DONE FLAG IS Received
                if done:
                    # Accumulate red/blue side rewards only when reported
                    average_red_reward += env.get_red_reward(
                    ) if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward(
                    ) if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward(
                    ) is not None else 0
                    if render_flag: env.env.close()
                    break

            average_reward += fitness
            eps = max(EPS_END, EPS_DECAY * eps)

        # Checkpoint the blue DQN every 5 generations (after burn-in)
        if gen >= 10 and gen % 5 == 0:
            blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' +
                              str(gen) + '.pth')

        average_reward /= NUM_EPISODE
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward

    def evaluate_training_fixed_blue(self):
        """Evaluate the fixed blue/red trainer pair against the training
        opponents (the evolved red population).

        Forces the evaluator out of POMDP-adversary mode first, then
        delegates scoring to the evaluator.
        """
        evaluator = self.evaluator
        evaluator.pomdp_adv = False
        blue_agent, red_agent = self.trainers[0], self.trainers[1]
        return evaluator.evaluate_fixed_agents(blue_agent, red_agent, self.pop)

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

			Parameters:
				gen (int): Current epoch of training
				frame_tracker (Tracker): Logger updated with mean test score on test generations

			Returns:
				tuple: (max_fit, champ_len, all_fitness, all_eplens, test_mean,
				        test_std, champ_wwid); test_mean/test_std are None on
				        non-test generations
		"""
        ################ START ROLLOUTS ##############

        # Start Evolution rollouts: dispatch (net_id, gen) to each idle evo worker
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync all learners actor to cpu (rollout) actor
        # (update rollout parameter using the learner parameter, such that rollout worker is up to date)
        for i, learner in enumerate(self.portfolio):  #number of learner
            learner.algo.actor.cpu()
            utils.hard_update(
                self.rollout_bucket[i], learner.algo.actor
            )  #rollout bucket is now synchronized with learner to perform rollout for learner actors
            if torch.cuda.is_available(): learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(
                self.allocation):  #number of rollout_size
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(
                    (learner_id, gen)
                )  #allocation record the id of the learner that bucket should run, so rollout_id is the id of rollout_bucket
                self.roll_flag[rollout_id] = False

        # Start Test rollouts (only every 5th generation)
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop; skipped entirely until the buffer holds
        # batch_size * 10 transitions (burn-in)
        if self.replay_buffer.__len__(
        ) > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify(
            )  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner; gradient steps scale with
            # frames gathered this generation (gen_frames * gradperstep)
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]  #macheng: do we want to train all the learners?

            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            # Now update average_policy
            #self.average_policy.cuda()
            if ALGO == 'dis':
                self.average_policy.update(
                )  #update the average_policy parameter with supervised learning

            self.gen_frames = 0

            #########Visualize Learner Critic Function#################
            # if self.replay_buffer.__len__() % 2500 == 0:
            #	visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50)  #arguments: Learner, env, N_GRID

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        # Poll evo result pipes until at least asynch_frac of the population
        # has reported back.
        # NOTE(review): this busy-waits with no sleep between polls — confirm
        # that is intended.
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%)
                if len(all_fitness
                       ) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        # recv() blocks: every learner rollout must finish before we continue
        for i in range(self.args.rollout_size):
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Referesh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # ms:best policy is always up to date
        # so here the best learner is saved
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % max(all_fitness))

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  #seems pop_size and rollout_size must be 10, otherwise this will produce error
            if gen % 5 == 0:
                self.evolver.epoch(
                    gen, self.genealogy, self.pop, all_net_ids, all_fitness,
                    self.rollout_bucket
                )  #this method also copies learner to evoler
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB (every generation)
        if gen % 1 == 0:
            self.update_allocation()
        # Metrics
        # NOTE(review): in the ISOLATE_PG branch, num_frames/fitness are the
        # values from the LAST learner rollout processed above.
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid

    def update_allocation(self):
        """Re-distribute rollout workers across portfolio learners via UCB."""
        num_rollouts = len(self.allocation)
        self.allocation = ucb(num_rollouts, self.portfolio,
                              self.args.ucb_coefficient)

    def sim_and_eval_POMDP(self):
        """Alternately train the blue DQN against POMDP adversaries and evaluate it.

        Builds a POMDP-mode evaluator once, then loops: one blue-DQN training
        generation followed by an evaluation pass, printing both sets of scores.
        """
        # evaluator must be created before train_blue_dqn
        self.evaluator = Evaluator(self, 5, self.trainers, pomdp_adv=True)
        for gen in range(1000000):
            print('gen=', gen)
            # BUG FIX: the original called a module-level `agent` here; use self
            # so the method works on whichever instance it is invoked on.
            blue_score, red_score, actual_blue_score = self.train_blue_dqn(
                self.trainers, ENV_NAME, gen, ALGO='dis', pomdp_adv=True)
            print('Env', ENV_NAME, 'Gen', gen,
                  ", Training average: Blue agent score: ", blue_score,
                  " Red score: ", red_score, " Actual blue score: ",
                  actual_blue_score)
            blue_score, red_score, actual_blue_score = self.evaluator.evaluate(
            )
            print("Evaluation result: Blue agent score: ", blue_score,
                  " Red score: ", red_score, " Actual blue score: ",
                  actual_blue_score)
# ---- Example #4 (scraped sample separator; vote count: 0) ----
class Agent:
    """Learner object encapsulating a local learner: an evolving population,
		a policy-gradient algorithm, a rollout-actor template, and replay buffer(s).

		Parameters:
		args (object): Parameter class with all the hyperparameters
		id (int): Identifier for this learner (passed through to the PG algo)
	"""
    def __init__(self, args, id):
        self.args = args
        self.id = id

        ###Initalize neuroevolution module###
        self.evolver = SSNE(self.args)

        ######## Initialize population (shared via a multiprocessing Manager)
        self.manager = Manager()
        self.popn = self.manager.list()
        for _ in range(args.popn_size):
            # 'trunk' parameter-sharing uses one multi-headed actor per member;
            # otherwise a single-head actor whose policy type depends on the algo
            if args.ps == 'trunk':
                self.popn.append(
                    MultiHeadActor(args.state_dim, args.action_dim,
                                   args.hidden_size, args.config.num_agents))

            else:
                if args.algo_name == 'TD3':
                    self.popn.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='DeterministicPolicy'))
                else:
                    self.popn.append(
                        Actor(args.state_dim,
                              args.action_dim,
                              args.hidden_size,
                              policy_type='GaussianPolicy'))
            # NOTE(review): indexing a Manager list returns a deserialized
            # copy, so this eval() may not affect the stored net — confirm.
            self.popn[-1].eval()

        #### INITIALIZE PG ALGO #####
        if args.ps == 'trunk':

            # Multi-agent variants (MATD3/MADDPG) vs shared-trunk MultiTD3
            if self.args.is_matd3 or args.is_maddpg:
                algo_name = 'TD3' if self.args.is_matd3 else 'DDPG'
                self.algo = MATD3(id, algo_name, args.state_dim,
                                  args.action_dim, args.hidden_size,
                                  args.actor_lr, args.critic_lr, args.gamma,
                                  args.tau, args.savetag, args.aux_save,
                                  args.actualize, args.use_gpu,
                                  args.config.num_agents, args.init_w)

            else:
                self.algo = MultiTD3(id, args.algo_name, args.state_dim,
                                     args.action_dim, args.hidden_size,
                                     args.actor_lr, args.critic_lr, args.gamma,
                                     args.tau, args.savetag, args.aux_save,
                                     args.actualize, args.use_gpu,
                                     args.config.num_agents, args.init_w)

        else:
            if args.algo_name == 'TD3':
                self.algo = TD3(id, args.algo_name, args.state_dim,
                                args.action_dim, args.hidden_size,
                                args.actor_lr, args.critic_lr, args.gamma,
                                args.tau, args.savetag, args.aux_save,
                                args.actualize, args.use_gpu, args.init_w)
            else:
                self.algo = SAC(id, args.state_dim, args.action_dim,
                                args.hidden_size, args.gamma, args.critic_lr,
                                args.actor_lr, args.tau, args.alpha,
                                args.target_update_interval, args.savetag,
                                args.aux_save, args.actualize, args.use_gpu)

        #### Rollout Actor is a template used for MP #####
        self.rollout_actor = self.manager.list()

        if args.ps == 'trunk':
            self.rollout_actor.append(
                MultiHeadActor(args.state_dim, args.action_dim,
                               args.hidden_size, args.config.num_agents))
        else:
            if args.algo_name == 'TD3':
                self.rollout_actor.append(
                    Actor(args.state_dim,
                          args.action_dim,
                          args.hidden_size,
                          policy_type='DeterministicPolicy'))
            else:
                self.rollout_actor.append(
                    Actor(args.state_dim,
                          args.action_dim,
                          args.hidden_size,
                          policy_type='GaussianPolicy'))

        #Initalize buffer: one buffer per agent for 'trunk', else a single one
        if args.ps == 'trunk':
            self.buffer = [
                Buffer(args.buffer_size,
                       buffer_gpu=False,
                       filter_c=args.filter_c)
                for _ in range(args.config.num_agents)
            ]
        else:
            self.buffer = Buffer(args.buffer_size,
                                 buffer_gpu=False,
                                 filter_c=args.filter_c)

        #Agent metrics: per-population-member fitness history
        self.fitnesses = [[] for _ in range(args.popn_size)]

        ###Best Policy HOF####
        self.champ_ind = 0

    def update_parameters(self):
        """Run policy-gradient updates from the replay buffer(s).

        The number of gradient steps scales with frames collected since the
        last update (gradperstep * pg_frames). Returns early (no-op) during
        the burn-in period when a buffer holds fewer than 10 * batch_size
        transitions.
        """
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': -1.0,
            'action_high': 1.0
        }

        if self.args.ps == 'trunk':

            for agent_id, buffer in enumerate(self.buffer):
                if self.args.is_matd3 or self.args.is_maddpg:
                    buffer = self.buffer[0]  #Hardcoded Hack for MADDPG

                buffer.referesh()
                # NOTE(review): this return exits the whole method, skipping
                # remaining agents — confirm that is intended.
                if buffer.__len__() < 10 * self.args.batch_size:
                    buffer.pg_frames = 0
                    return  ###BURN_IN_PERIOD

                buffer.tensorify()

                for _ in range(int(self.args.gradperstep * buffer.pg_frames)):
                    s, ns, a, r, done, global_reward = buffer.sample(
                        self.args.batch_size,
                        pr_rew=self.args.priority_rate,
                        pr_global=self.args.priority_rate)
                    r *= self.args.reward_scaling
                    if self.args.use_gpu:
                        s = s.cuda()
                        ns = ns.cuda()
                        a = a.cuda()
                        r = r.cuda()
                        done = done.cuda()
                        global_reward = global_reward.cuda()
                    self.algo.update_parameters(s, ns, a, r, done,
                                                global_reward, agent_id, 1,
                                                **td3args)
                buffer.pg_frames = 0

        else:
            self.buffer.referesh()
            if self.buffer.__len__() < 10 * self.args.batch_size:
                return  ###BURN_IN_PERIOD
            self.buffer.tensorify()

            for _ in range(int(self.args.gradperstep * self.buffer.pg_frames)):
                s, ns, a, r, done, global_reward = self.buffer.sample(
                    self.args.batch_size,
                    pr_rew=self.args.priority_rate,
                    pr_global=self.args.priority_rate)
                r *= self.args.reward_scaling
                if self.args.use_gpu:
                    s = s.cuda()
                    ns = ns.cuda()
                    a = a.cuda()
                    r = r.cuda()
                    done = done.cuda()
                    global_reward = global_reward.cuda()
                self.algo.update_parameters(s, ns, a, r, done, global_reward,
                                            1, **td3args)

            self.buffer.pg_frames = 0  #Reset new frame counter to 0

    def evolve(self):
        """Run one generation of neuroevolution over the population.

        For the 'multipoint' scheme, a sample of states is drawn from the
        buffer to guide evolution; for 'standard', no states are used.
        Fitness metrics are reset afterwards either way.
        """
        ## One gen of evolution ###
        if self.args.popn_size > 1:  #If not no-evo

            if self.args.scheme == 'multipoint':
                #Make sure that the buffer has been refereshed and tensorified

                buffer_pointer = self.buffer[
                    0] if self.args.ps == 'trunk' else self.buffer

                # Tensorify small buffers always; large ones only occasionally (1%)
                if buffer_pointer.__len__() < 1000: buffer_pointer.tensorify()
                if random.random() < 0.01: buffer_pointer.tensorify()

                #Get sample of states from the buffer (capped at 1000)
                if buffer_pointer.__len__() < 1000:
                    sample_size = buffer_pointer.__len__()
                else:
                    sample_size = 1000

                # Guard: re-tensorify if the tensor view lags the raw buffer
                if sample_size == 1000 and len(buffer_pointer.sT) < 1000:
                    buffer_pointer.tensorify()

                states, _, _, _, _, _ = buffer_pointer.sample(sample_size,
                                                              pr_rew=0.0,
                                                              pr_global=0.0)
                states = states.cpu()

            elif self.args.scheme == 'standard':
                states = None

            else:
                sys.exit('Unknown Evo Scheme')

            #Net indices of nets that got evaluated this generation (meant for asynchronous evolution workloads)
            net_inds = [i for i in range(len(self.popn))
                        ]  #Hack for a synchronous run

            #Evolve
            if self.args.rollout_size > 0:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds,
                                                     self.fitnesses,
                                                     [self.rollout_actor[0]],
                                                     states)
            else:
                self.champ_ind = self.evolver.evolve(self.popn, net_inds,
                                                     self.fitnesses, [],
                                                     states)

        #Reset fitness metrics
        self.fitnesses = [[] for _ in range(self.args.popn_size)]

    def update_rollout_actor(self):
        """Copy the learner's current policy weights into the rollout actor(s).

        The policy is moved to CPU for the copy (rollout workers run on CPU)
        and moved back to GPU afterwards when use_gpu is set.
        """
        for actor in self.rollout_actor:
            self.algo.policy.cpu()
            mod.hard_update(actor, self.algo.policy)
            if self.args.use_gpu: self.algo.policy.cuda()
# ---- Example #5 (scraped sample separator; vote count: 0) ----
class EGRL_Trainer:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (object): Parameter class with all the parameters

	"""
    def __init__(self, args, model_constructor, env_constructor,
                 observation_space, action_space, env, state_template,
                 test_envs, platform):
        """Wire up the full EGRL trainer: evolver, trackers, population,
        replay buffer, learner portfolio, and (optionally) multiprocessing
        rollout workers.

        Parameters:
            args: Parameter object with all hyperparameters
            model_constructor: Factory for policy models
            env_constructor: Factory for rollout environments
            observation_space / action_space: Env spaces (action_space is
                forwarded to the replay buffer)
            env: Training environment
            state_template: Graph-state template; two DRAM-action columns are
                appended to its node features below
            test_envs: Environments used for generalization evaluation
            platform: Hardware/platform descriptor
        """
        self.args = args
        # Two extra features are appended to every node's state (the
        # dram_action columns concatenated below), so grow state_dim to match.
        model_constructor.state_dim += 2
        self.platform = platform

        self.policy_string = self.compute_policy_type()
        self.device = torch.device("cuda" if torch.cuda.is_available(
        ) else "cpu") if self.args.gpu else torch.device('cpu')

        #Evolution: dram_action starts at 2.0 for every node (ones + 1)
        dram_action = torch.ones((len(state_template.x), 2)) + 1
        state_template.x = torch.cat([state_template.x, dram_action], axis=1)
        self.evolver = MixedSSNE(
            self.args, state_template
        )  #GA(self.args) if args.boltzman else SSNE(self.args)
        self.env_constructor = env_constructor

        # Progress trackers (each writes a csv in the plot folder)
        self.test_tracker = utils.Tracker(
            self.args.plot_folder,
            ['score_' + self.args.savetag, 'speedup_' + self.args.savetag],
            '.csv')  # Tracker class to log progress
        self.time_tracker = utils.Tracker(self.args.plot_folder, [
            'timed_score_' + self.args.savetag,
            'timed_speedup_' + self.args.savetag
        ], '.csv')
        self.champ_tracker = utils.Tracker(self.args.plot_folder, [
            'champ_score_' + self.args.savetag,
            'champ_speedup_' + self.args.savetag
        ], '.csv')
        self.pg_tracker = utils.Tracker(self.args.plot_folder, [
            'pg_noisy_speedup_' + self.args.savetag,
            'pg_clean_speedup_' + self.args.savetag
        ], '.csv')
        self.migration_tracker = utils.Tracker(self.args.plot_folder, [
            'selection_rate_' + self.args.savetag,
            'elite_rate_' + self.args.savetag
        ], '.csv')

        #Generalization Trackers
        self.r50_tracker = utils.Tracker(self.args.plot_folder, [
            'r50_score_' + self.args.savetag,
            'r50_speedup_' + self.args.savetag
        ], '.csv')
        self.r101_tracker = utils.Tracker(self.args.plot_folder, [
            'r101_score_' + self.args.savetag,
            'r101_speedup_' + self.args.savetag
        ], '.csv')
        self.bert_tracker = utils.Tracker(self.args.plot_folder, [
            'bert_score_' + self.args.savetag,
            'bert_speedup_' + self.args.savetag
        ], '.csv')

        # NOTE(review): the *_frames trackers reuse the same column names as
        # the trackers above — confirm they are meant to share filenames.
        self.r50_frames_tracker = utils.Tracker(self.args.plot_folder, [
            'r50_score_' + self.args.savetag,
            'r50_speedup_' + self.args.savetag
        ], '.csv')
        self.r101_frames_tracker = utils.Tracker(self.args.plot_folder, [
            'r101_score_' + self.args.savetag,
            'r101_speedup_' + self.args.savetag
        ], '.csv')
        self.bert_frames_tracker = utils.Tracker(self.args.plot_folder, [
            'bert_score_' + self.args.savetag,
            'bert_speedup_' + self.args.savetag
        ], '.csv')

        #Genealogy tool
        self.genealogy = Genealogy()

        self.env = env
        self.test_envs = test_envs

        if self.args.use_mp:
            #MP TOOLS
            self.manager = Manager()
            #Initialize Mixed Population (shared across worker processes)
            self.population = self.manager.list()

        else:
            self.population = []

        # Mixed population: a ratio of Boltzmann chromosomes, rest are
        # gradient-trainable policy models
        boltzman_count = int(args.pop_size * args.ratio)
        rest = args.pop_size - boltzman_count
        for _ in range(boltzman_count):
            self.population.append(
                BoltzmannChromosome(model_constructor.num_nodes,
                                    model_constructor.action_dim))

        for _ in range(rest):
            self.population.append(
                model_constructor.make_model(self.policy_string))
            self.population[-1].eval()

        #Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        #Init BUFFER
        self.replay_buffer = Buffer(args.buffer_size, state_template,
                                    action_space,
                                    args.aux_folder + args.savetag)
        self.data_bucket = self.replay_buffer.tuples

        #Intialize portfolio of learners (only when PG rollouts are enabled)
        self.portfolio = []
        if args.rollout_size > 0:
            self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                                  self.genealogy,
                                                  args.portfolio_id,
                                                  model_constructor)

        #Initialize Rollout Bucket (one rollout model per learner)
        self.rollout_bucket = self.manager.list() if self.args.use_mp else []
        for _ in range(len(self.portfolio)):
            self.rollout_bucket.append(
                model_constructor.make_model(self.policy_string))

        if self.args.use_mp:
            ############## MULTIPROCESSING TOOLS ###################
            #Evolutionary population Rollout workers
            data_bucket = self.data_bucket if args.rollout_size > 0 else None  #If Strictly Evo - don;t store data
            self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
            self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
            self.evo_workers = [
                Process(target=rollout_worker,
                        args=(id, 'evo', self.evo_task_pipes[id][1],
                              self.evo_result_pipes[id][0], data_bucket,
                              self.population, env_constructor))
                for id in range(args.pop_size)
            ]
            for worker in self.evo_workers:
                worker.start()

            #Learner rollout workers
            self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
            self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
            self.workers = [
                Process(target=rollout_worker,
                        args=(id, 'pg', self.task_pipes[id][1],
                              self.result_pipes[id][0], data_bucket,
                              self.rollout_bucket, env_constructor))
                for id in range(args.rollout_size)
            ]
            for worker in self.workers:
                worker.start()

        # Idle flags: True means the corresponding worker can accept a new task
        self.roll_flag = [True for _ in range(args.rollout_size)]
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)

        #Trackers
        self.best_score = -float('inf')
        self.gen_frames = 0
        self.total_frames = 0
        self.best_speedup = -float('inf')
        self.champ_type = None

    def checkpoint(self):
        """Persist progress trackers and every population member to disk.

        Boltzmann chromosomes are pickled whole; Gumbel (torch) nets are
        saved as state_dicts.
        """
        folder = self.args.ckpt_folder

        # Trackers are pickled wholesale.
        utils.pickle_obj(folder + 'test_tracker', self.test_tracker)
        utils.pickle_obj(folder + 'time_tracker', self.time_tracker)
        utils.pickle_obj(folder + 'champ_tracker', self.champ_tracker)

        for idx in range(len(self.population)):
            net = self.population[idx]

            if net.model_type == 'BoltzmanChromosome':
                utils.pickle_obj(folder + 'Boltzman/' + str(idx), net)
            else:
                torch.save(net.state_dict(), folder + 'Gumbel/' + str(idx))

            # Write back so a multiprocessing Manager list re-registers the entry.
            self.population[idx] = net

    def load_checkpoint(self):
        """Best-effort restore of trackers and population from the ckpt folder.

        Trackers that fail to unpickle are left at their current values, and
        any model file that fails to load is skipped with a message; training
        then proceeds with whatever seeds were recovered.
        """
        folder = self.args.ckpt_folder

        # Try to load trackers; ignore failures (e.g. first run, no files yet).
        # BUG FIX: was a bare `except:` with a no-op `None` statement, which
        # also swallowed KeyboardInterrupt/SystemExit.
        try:
            self.test_tracker = utils.unpickle_obj(folder + 'test_tracker')
            self.time_tracker = utils.unpickle_obj(folder + 'time_tracker')
            self.champ_tracker = utils.unpickle_obj(folder + 'champ_tracker')
        except Exception:
            pass

        # Grab any Gumbel net from the population as a state_dict template.
        gumbel_template = False
        for i in range(len(self.population)):
            if self.population[i].model_type == 'GumbelPolicy':
                gumbel_template = self.population[i]
                break

        boltzman_nets = os.listdir(folder + 'Boltzman/')
        gumbel_nets = os.listdir(folder + 'Gumbel/')

        print('Boltzman seeds', boltzman_nets, 'Gumbel seeds', gumbel_nets)

        gumbel_models = []
        boltzman_models = []

        for fname in boltzman_nets:
            try:
                net = utils.unpickle_obj(folder + 'Boltzman/' + fname)
                boltzman_models.append(net)
            except Exception:
                print('Failed to load', folder + 'Boltzman/' + fname)

        for fname in gumbel_nets:
            try:
                model_template = copy.deepcopy(gumbel_template)
                model_template.load_state_dict(
                    torch.load(folder + 'Gumbel/' + fname))
                model_template.eval()
                gumbel_models.append(model_template)
            except Exception:
                print('Failed to load', folder + 'Gumbel/' + fname)

        # Seed the population in place with the recovered models, matching by
        # model type; write-back keeps a Manager list proxy in sync.
        for i in range(len(self.population)):
            net = self.population[i]

            if net.model_type == 'GumbelPolicy' and len(gumbel_models) >= 1:
                seed_model = gumbel_models.pop()
                utils.hard_update(net, seed_model)

            elif net.model_type == 'BoltzmanChromosome' and len(
                    boltzman_models) >= 1:
                seed_model = boltzman_models.pop()
                net = seed_model

            self.population[i] = net

        print()
        print()
        print()
        print()
        print('Checkpoint Loading Phase Completed')
        print()
        print()
        print()
        print()

    def forward_generation(self, gen, time_start):
        """Run one generation of CERL training.

        Dispatches evolution-population and learner rollouts (via pipes when
        multiprocessing is enabled), trains the portfolio of learners on the
        shared replay buffer, evaluates the generation champion on held-out
        generalization environments, and applies the neuroevolution epoch.

        Parameters:
            gen (int): Current generation index.
            time_start (float): Wall-clock timestamp (time.time()) of the
                start of training, used by the time-based trackers.

        Returns:
            float: Best raw score observed among this generation's rollouts.
        """
        ################ START ROLLOUTS ##############

        #Start Evolution rollouts (only send to idle workers)
        if self.args.pop_size >= 1 and self.args.use_mp:
            for id, actor in enumerate(self.population):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        #If Policy Gradient
        if self.args.rollout_size > 0:
            #Sync all learners actor to cpu (rollout) actor
            for i, learner in enumerate(self.portfolio):
                learner.algo.actor.cpu()
                utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
                learner.algo.actor.to(self.device)

            # Start Learner rollouts (allocation maps rollout slot -> learner)
            if self.args.use_mp:
                for rollout_id, learner_id in enumerate(self.allocation):
                    if self.roll_flag[rollout_id]:
                        self.task_pipes[rollout_id][0].send(learner_id)
                        self.roll_flag[rollout_id] = False

            ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
            if len(self.replay_buffer) > self.args.learning_start and not self.args.random_baseline:  ###BURN IN PERIOD

                print('INSIDE GRAD DESCENT')

                for learner in self.portfolio:
                    learner.update_parameters(
                        self.replay_buffer, self.args.batch_size,
                        int(self.gen_frames * self.args.gradperstep))

                self.gen_frames = 0

            else:
                print('BURN IN PERIOD')

        gen_best = -float('inf')
        gen_best_speedup = -float("inf")
        gen_champ = None
        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        if self.args.pop_size >= 1:
            for i in range(self.args.pop_size):

                if self.args.use_mp:
                    entry = self.evo_result_pipes[i][1].recv()
                else:
                    entry = rollout_function(
                        i,
                        'evo',
                        self.population[i],
                        self.env,
                        store_data=self.args.rollout_size > 0)

                # entry layout (inferred from indexing — confirm against
                # rollout_function): (net_id, score, frames, speedup-tuple,
                # trajectory, shaped-fitness)
                self.gen_frames += entry[2]
                self.total_frames += entry[2]
                speedup = entry[3][0]
                score = entry[1]

                net = self.population[entry[0]]
                net.fitness_stats['speedup'] = speedup
                net.fitness_stats['score'] = score
                net.fitness_stats['shaped'][:] = entry[5]
                self.population[entry[0]] = net

                self.test_tracker.update([score, speedup], self.total_frames)
                self.time_tracker.update([score, speedup],
                                         time.time() - time_start)

                if speedup > self.best_speedup:
                    self.best_speedup = speedup

                if score > gen_best:
                    gen_best = score
                    gen_champ = self.population[i]

                if speedup > gen_best_speedup:
                    gen_best_speedup = speedup

                if score > self.best_score:
                    self.best_score = score
                    champ_index = i
                    self.champ_type = net.model_type
                    try:
                        torch.save(
                            self.population[champ_index].state_dict(),
                            self.args.models_folder + 'bestChamp_' +
                            self.args.savetag)
                    except:
                        # Best-effort save; ignore I/O errors
                        pass
                    # TODO
                    print("Best Evo Champ saved with score", '%.2f' % score)

                if self.args.rollout_size > 0:
                    self.replay_buffer.add(entry[4])

                self.evo_flag[i] = True

        try:
            # gen_champ is None when pop_size == 0; the bare except also
            # swallows that AttributeError alongside I/O errors
            torch.save(
                gen_champ.state_dict(),
                self.args.models_folder + 'genChamp_' + str(gen) +
                '_speedup_' + str(gen_best_speedup) + '_' + self.args.savetag)
        except:
            pass

        ############################# GENERALIZATION EXPERIMENTS ########################
        # NOTE(review): these assume gen_champ is not None and test_envs has
        # at least 2 (3 if platform != 'wpa') entries — confirm config.
        _, resnet50_score, _, resnet50_speedup, _, _ = rollout_function(
            0, 'evo', gen_champ, self.test_envs[0], store_data=False)
        _, resnet101_score, _, resnet101_speedup, _, _ = rollout_function(
            0, 'evo', gen_champ, self.test_envs[1], store_data=False)
        resnet50_speedup = resnet50_speedup[0]
        resnet101_speedup = resnet101_speedup[0]
        self.r50_tracker.update([resnet50_score, resnet50_speedup], gen)
        self.r101_tracker.update([resnet101_score, resnet101_speedup], gen)
        self.r50_frames_tracker.update([resnet50_score, resnet50_speedup],
                                       self.total_frames)
        self.r101_frames_tracker.update([resnet101_score, resnet101_speedup],
                                        self.total_frames)
        bert_speedup, bert_score = None, None

        if self.platform != 'wpa':
            _, bert_score, _, bert_speedup, _, _ = rollout_function(
                0, 'evo', gen_champ, self.test_envs[2], store_data=False)
            bert_speedup = bert_speedup[0]
            self.bert_tracker.update([bert_score, bert_speedup], gen)
            self.bert_frames_tracker.update([bert_score, bert_speedup],
                                            self.total_frames)

        ############################# GENERALIZATION EXPERIMENTS ########################

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):

                #NOISY PG (exploration policy)
                if self.args.use_mp:
                    entry = self.result_pipes[i][1].recv()
                else:
                    entry = rollout_function(i,
                                             'pg',
                                             self.rollout_bucket[i],
                                             self.env,
                                             store_data=True)

                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                speedup = entry[3][0]
                self.portfolio[learner_id].update_stats(fitness, num_frames)
                self.replay_buffer.add(entry[4])

                self.test_tracker.update([fitness, speedup], self.total_frames)
                self.time_tracker.update([fitness, speedup],
                                         time.time() - time_start)

                gen_best = max(fitness, gen_best)
                self.best_speedup = max(speedup, self.best_speedup)
                gen_best_speedup = max(speedup, gen_best_speedup)
                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score:
                    self.best_score = fitness
                    torch.save(
                        self.rollout_bucket[i].state_dict(),
                        self.args.models_folder + 'noisy_bestPG_' +
                        str(speedup) + '_' + self.args.savetag)
                    print("Best Rollout Champ saved with score",
                          '%.2f' % fitness)
                noisy_speedup = speedup

                # Clean PG Measurement (deterministic 'evo' mode rollout of
                # the same policy, for noise-free comparison)
                entry = rollout_function(i,
                                         'evo',
                                         self.rollout_bucket[i],
                                         self.env,
                                         store_data=True)
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                speedup = entry[3][0]
                self.portfolio[learner_id].update_stats(fitness, num_frames)
                self.replay_buffer.add(entry[4])

                self.test_tracker.update([fitness, speedup], self.total_frames)
                self.time_tracker.update([fitness, speedup],
                                         time.time() - time_start)

                gen_best = max(fitness, gen_best)
                self.best_speedup = max(speedup, self.best_speedup)
                gen_best_speedup = max(speedup, gen_best_speedup)
                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score:
                    self.best_score = fitness
                    torch.save(
                        self.rollout_bucket[i].state_dict(),
                        self.args.models_folder + 'clean_bestPG_' +
                        str(speedup) + '_' + self.args.savetag)
                    print("Best Clean Evo Champ saved with score",
                          '%.2f' % fitness)

                self.pg_tracker.update([noisy_speedup, speedup],
                                       self.total_frames)
                self.roll_flag[i] = True

        self.champ_tracker.update([gen_best, gen_best_speedup],
                                  self.total_frames)

        #NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size >= 1 and not self.args.random_baseline:

            if gen % 1 == 0:
                self.population = self.evolver.epoch(self.population,
                                                     self.rollout_bucket)
            else:
                self.population = self.evolver.epoch(self.population, [])

            if self.evolver.selection_stats['total'] > 0:
                selection_rate = (
                    1.0 * self.evolver.selection_stats['selected'] +
                    self.evolver.selection_stats['elite']
                ) / self.evolver.selection_stats['total']
                # BUGFIX: was `elite_rate = selection_rate = (...)`, which
                # overwrote selection_rate with the elite-only ratio, so the
                # tracker logged elite_rate twice instead of both metrics.
                elite_rate = (
                    1.0 * self.evolver.selection_stats['elite']
                ) / self.evolver.selection_stats['total']
                self.migration_tracker.update([selection_rate, elite_rate],
                                              self.total_frames)

        if gen % 1 == 0:
            self.checkpoint()

        return gen_best

    def train(self, frame_limit):
        """Run generations until *frame_limit* frames, then stop the workers.

        Parameters:
            frame_limit (int): Total environment frames to train for.
        """

        time_start = time.time()

        for generation in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            gen_best = self.forward_generation(generation, time_start)

            # Per-generation progress line
            print()
            print('Gen/Frames', generation, '/', self.total_frames,
                  'Gen_Score', '%.2f' % gen_best, 'Best_Score',
                  '%.2f' % self.best_score, ' Speedup',
                  '%.2f' % self.best_speedup, ' Frames/sec:',
                  '%.2f' % (self.total_frames / (time.time() - time_start)),
                  'Buffer', len(self.replay_buffer), 'Savetag',
                  self.args.savetag)

            # Per-member fitness report
            for member in self.population:

                print(member.model_type, member.fitness_stats)
                if member.model_type == 'BoltzmanChromosome':
                    print(member.temperature_stats)
                print()
            print()

            try:
                print('Initial Ratio', self.args.ratio, 'Current Ratio',
                      self.evolver.ratio, 'Chamption Type', self.champ_type)
            except:
                pass

            if generation % 5 == 0:
                print('Learner Fitness', [
                    utils.pprint(learner.value) for learner in self.portfolio
                ])

            if self.total_frames > frame_limit:
                break

        ###Kill all processes
        try:
            all_pipes = (self.task_pipes + self.test_task_pipes +
                         self.evo_task_pipes)
            for pipe in all_pipes:
                pipe[0].send('TERMINATE')
        except:
            pass

    def compute_policy_type(self):
        """Map the configured algorithm name to its policy-model string.

        Returns:
            str | None: Policy string for the configured algo, or None if
                the algo name is not recognized.
        """
        algo_to_policy = {
            'ddqn': 'DDQN',
            'sac': 'Gaussian_FF',
            'td3': 'Deterministic_FF',
            'sac_discrete': 'GumbelPolicy',
        }
        return algo_to_policy.get(self.args.algo)
class ERL_Trainer:
    """Evolutionary Reinforcement Learning (ERL) trainer.

    Maintains a neuroevolution population and a single policy-gradient
    learner (DDQN for discrete action spaces, SAC for continuous ones).
    Rollouts run in worker processes coordinated over pipes; experiences
    pool into a shared replay buffer the learner trains on.

    Parameters:
        args (object): Hyperparameter container (pop_size, rollout_size,
            buffer_size, learning_start, gradperstep, batch_size,
            test_frequency, num_test, aux_folder, savetag, ...).
        model_constructor (object): Factory exposing make_model(policy_string).
        env_constructor (object): Environment factory; must expose
            is_discrete.
    """
    def __init__(self, args, model_constructor, env_constructor):

        self.args = args
        self.policy_string = 'CategoricalPolicy' if env_constructor.is_discrete else 'Gaussian_FF'
        self.manager = Manager()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        #Evolution
        self.evolver = SSNE(self.args)

        #Initialize population (manager list so worker processes see updates)
        self.population = self.manager.list()
        for _ in range(args.pop_size):
            self.population.append(
                model_constructor.make_model(self.policy_string))

        #Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        #PG Learner (imported lazily so only the required algo is loaded)
        if env_constructor.is_discrete:
            from algos.ddqn import DDQN
            self.learner = DDQN(args, model_constructor)
        else:
            from algos.sac import SAC
            self.learner = SAC(args, model_constructor)

        #Replay Buffer
        self.replay_buffer = Buffer(args.buffer_size)

        #Initialize Rollout Bucket (CPU copies of the learner's actor)
        self.rollout_bucket = self.manager.list()
        for _ in range(args.rollout_size):
            self.rollout_bucket.append(
                model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################
        #Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, 'evo', self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], args.rollout_size > 0,
                          self.population, env_constructor))
            for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 'pg', self.task_pipes[id][1],
                          self.result_pipes[id][0], True, self.rollout_bucket,
                          env_constructor)) for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket (holds the current champion for the test workers)
        self.test_bucket = self.manager.list()
        self.test_bucket.append(
            model_constructor.make_model(self.policy_string))

        # Test workers
        self.test_task_pipes = [Pipe() for _ in range(args.num_test)]
        self.test_result_pipes = [Pipe() for _ in range(args.num_test)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 'test', self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], False,
                          self.test_bucket, env_constructor))
            for id in range(args.num_test)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Trackers
        self.best_score = -float('inf')
        self.gen_frames = 0    # frames gathered since the last learner update
        self.total_frames = 0  # frames gathered over the whole run
        self.test_score = None
        self.test_std = None

    def forward_generation(self, gen, tracker):
        """Run one generation: dispatch rollouts, train the learner, join
        results, sync the champion to the test bucket, and evolve.

        Parameters:
            gen (int): Current generation index (1-based).
            tracker (utils.Tracker): Logger updated with the mean test score.

        Returns:
            tuple: (gen_max, champ_len, all_eplens, test_mean, test_std,
                rollout_fitness, rollout_eplens); test_mean/test_std are
                None on non-test generations.
        """

        gen_max = -float('inf')

        #Start Evolution rollouts
        if self.args.pop_size > 1:
            for id, actor in enumerate(self.population):
                self.evo_task_pipes[id][0].send(id)

        #Sync all learners actor to cpu (rollout) actor and start their rollout
        self.learner.actor.cpu()
        for rollout_id in range(len(self.rollout_bucket)):
            utils.hard_update(self.rollout_bucket[rollout_id],
                              self.learner.actor)
            self.task_pipes[rollout_id][0].send(0)
        self.learner.actor.to(device=self.device)

        #Start Test rollouts
        if gen % self.args.test_frequency == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if len(self.replay_buffer) > self.args.learning_start:  ###BURN IN PERIOD
            for _ in range(int(self.gen_frames * self.args.gradperstep)):
                s, ns, a, r, done = self.replay_buffer.sample(
                    self.args.batch_size)
                self.learner.update_parameters(s, ns, a, r, done)

            self.gen_frames = 0

        ########## JOIN ROLLOUTS FOR EVO POPULATION ############
        all_fitness = []
        all_eplens = []
        if self.args.pop_size > 1:
            for i in range(self.args.pop_size):
                _, fitness, frames, trajectory = self.evo_result_pipes[i][
                    1].recv()

                all_fitness.append(fitness)
                all_eplens.append(frames)
                self.gen_frames += frames
                self.total_frames += frames
                self.replay_buffer.add(trajectory)
                # BUGFIX: best_score is deliberately NOT updated here.
                # Folding each evo fitness into best_score before the
                # champion check below made `max(all_fitness) > best_score`
                # always false, so the best policy was never saved. The
                # champion check now performs the update instead (final
                # best_score value is unchanged).
                gen_max = max(gen_max, fitness)

        ########## JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        rollout_fitness = []
        rollout_eplens = []
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                _, fitness, pg_frames, trajectory = self.result_pipes[i][
                    1].recv()
                self.replay_buffer.add(trajectory)
                self.gen_frames += pg_frames
                self.total_frames += pg_frames
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)
                rollout_fitness.append(fitness)
                rollout_eplens.append(pg_frames)

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ FIGURE OUT THE CHAMP POLICY AND SYNC IT TO TEST #############
        if self.args.pop_size > 1:
            champ_fitness = max(all_fitness)
            champ_index = all_fitness.index(champ_fitness)
            utils.hard_update(self.test_bucket[0],
                              self.population[champ_index])
            if champ_fitness > self.best_score:
                self.best_score = champ_fitness
                utils.hard_update(self.best_policy,
                                  self.population[champ_index])
                torch.save(self.population[champ_index].state_dict(),
                           self.args.aux_folder + '_best' + self.args.savetag)
                print("Best policy saved with score",
                      '%.2f' % champ_fitness)

        else:  #If there is no population, champion is just the actor from policy gradient learner
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                _, fitness, _, _ = pipe[1].recv()
                self.best_score = max(self.best_score, fitness)
                gen_max = max(gen_max, fitness)
                test_scores.append(fitness)
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))
            tracker.update([test_mean], self.total_frames)

        else:
            test_mean, test_std = None, None

        #NeuroEvolution's probabilistic selection and recombination step
        if self.args.pop_size > 1:
            self.evolver.epoch(gen, self.population, all_fitness,
                               self.rollout_bucket)

        #Compute the champion's eplen
        # NOTE(review): raises ValueError if both pop_size <= 1 and
        # rollout_fitness is empty — confirm config guarantees one of them.
        champ_len = all_eplens[all_fitness.index(
            max(all_fitness))] if self.args.pop_size > 1 else rollout_eplens[
                rollout_fitness.index(max(rollout_fitness))]

        return gen_max, champ_len, all_eplens, test_mean, test_std, rollout_fitness, rollout_eplens

    def train(self, frame_limit):
        """Run generations until *frame_limit* frames, then stop workers.

        Parameters:
            frame_limit (int): Total environment frames to train for.
        """
        # Define Tracker class to track scores
        test_tracker = utils.Tracker(self.args.savefolder,
                                     ['score_' + self.args.savetag],
                                     '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Infinite generations

            # Train one iteration
            max_fitness, champ_len, all_eplens, test_mean, test_std, rollout_fitness, rollout_eplens = self.forward_generation(
                gen, test_tracker)
            # BUGFIX: `if test_mean:` skipped logging a legitimate 0.0 score
            if test_mean is not None:
                self.args.writer.add_scalar('test_score', test_mean, gen)

            print(
                'Gen/Frames:', gen, '/', self.total_frames, ' Gen_max_score:',
                '%.2f' % max_fitness, ' Champ_len', '%.2f' % champ_len,
                ' Test_score u/std', utils.pprint(test_mean),
                utils.pprint(test_std), ' Rollout_u/std:',
                utils.pprint(np.mean(np.array(rollout_fitness))),
                utils.pprint(np.std(np.array(rollout_fitness))),
                ' Rollout_mean_eplen:',
                utils.pprint(sum(rollout_eplens) /
                             len(rollout_eplens)) if rollout_eplens else None)

            if gen % 5 == 0:
                print(
                    'Best_score_ever:'
                    '/', '%.2f' % self.best_score, ' FPS:',
                    '%.2f' % (self.total_frames / (time.time() - time_start)),
                    'savetag', self.args.savetag)
                print()

            if self.total_frames > frame_limit:
                break

        ###Kill all processes (best-effort; pipes may already be closed)
        try:
            for p in self.task_pipes:
                p[0].send('TERMINATE')
            for p in self.test_task_pipes:
                p[0].send('TERMINATE')
            for p in self.evo_task_pipes:
                p[0].send('TERMINATE')
        except:
            pass
# Example #7
# 0
class CERL_Trainer:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (object): Parameter class with all the parameters

	"""
    def __init__(self, args, model_constructor, env_constructor):
        """Build the population, learner portfolio, buffers, and workers.

        Parameters:
            args (object): Parameter class with all the parameters
                (pop_size, rollout_size, buffer_size, portfolio_id, ...).
            model_constructor (object): Factory exposing make_model(...).
            env_constructor (object): Environment factory; must expose
                dummy_env.test_size for sizing the test-worker pool.
        """
        self.args = args
        self.policy_string = self.compute_policy_type()

        #Evolution
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Genealogy tool
        self.genealogy = Genealogy()

        #Initialize population (manager list shared with worker processes);
        #only the first member is constructed with seed=True
        self.population = self.manager.list()
        seed = True
        for _ in range(args.pop_size):
            self.population.append(
                model_constructor.make_model(self.policy_string, seed=seed))
            seed = False

        #SEED
        #self.population[0].load_state_dict(torch.load('Results/Auxiliary/_bestcerl_td3_s2019_roll10_pop10_portfolio10'))

        #Save best policy
        self.best_policy = model_constructor.make_model(self.policy_string)

        #Turn off gradients and put in eval mod
        for actor in self.population:
            actor = actor.cpu()
            actor.eval()

        #Init BUFFER; data_bucket aliases the buffer's tuple store so worker
        #processes can append experience directly
        self.replay_buffer = Buffer(args.buffer_size)
        self.data_bucket = self.replay_buffer.tuples

        #Intialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy,
                                              args.portfolio_id,
                                              model_constructor)

        #Initialize Rollout Bucket (one CPU policy slot per learner)
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            self.rollout_bucket.append(
                model_constructor.make_model(self.policy_string))

        ############## MULTIPROCESSING TOOLS ###################

        #Evolutionary population Rollout workers (one per population member)
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(id, 'evo', self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], self.data_bucket,
                          self.population, env_constructor))
            for id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True for _ in range(args.pop_size)]

        #Learner rollout workers (one per rollout slot)
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 'pg', self.task_pipes[id][1],
                          self.result_pipes[id][0], self.data_bucket,
                          self.rollout_bucket, env_constructor))
            for id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True for _ in range(args.rollout_size)]

        #Test bucket (holds the current champion for the test workers)
        self.test_bucket = self.manager.list()
        self.test_bucket.append(
            model_constructor.make_model(self.policy_string))

        #5 Test workers (actual count comes from the env's test_size)
        self.test_task_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_result_pipes = [
            Pipe() for _ in range(env_constructor.dummy_env.test_size)
        ]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(id, 'test', self.test_task_pipes[id][1],
                          self.test_result_pipes[id][0], None,
                          self.test_bucket, env_constructor))
            for id in range(env_constructor.dummy_env.test_size)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Meta-learning controller (Resource Distribution)
        self.allocation = [
        ]  #Allocation controls the resource allocation across learners
        for i in range(args.rollout_size):
            self.allocation.append(
                i % len(self.portfolio))  #Start uniformly (equal resources)

        #Trackers
        self.best_score = 0.0     # best raw fitness seen so far
        self.gen_frames = 0       # frames since the last gradient update
        self.total_frames = 0     # frames over the whole run
        self.test_score = None
        self.test_std = None
        self.best_r1_score = 0.0  # best unshaped (r1) test reward
        self.ep_len = 0
        self.r1_reward = 0
        self.num_footsteps = 0
        self.test_trace = []      # history of mean test scores

    def checkpoint(self):
        """Pickle the learner portfolio, tagging the file with the current
        total frame count."""
        save_path = (f'{self.args.aux_folder}{self.args.algo}'
                     f'_checkpoint_frames{self.total_frames}')
        utils.pickle_obj(save_path, self.portfolio)

    def load_checkpoint(self, filename):
        """Restore a previously pickled learner portfolio.

        Parameters:
            filename (str): Path to a checkpoint written by checkpoint().
        """
        restored_portfolio = utils.unpickle_obj(filename)
        self.portfolio = restored_portfolio

    def forward_generation(self, gen, tracker):
        """Main training loop to do rollouts, neuroevolution, and policy
        gradients for one generation.

        Parameters:
            gen (int): Current epoch of training.
            tracker (utils.Tracker): Logger updated with test metrics.

        Returns:
            tuple: (max_fit, champ_len, all_eplens, test_mean, test_std);
                test_mean/test_std are None on non-test generations.
        """
        ################ START ROLLOUTS ##############

        #Start Evolution rollouts (only send to idle workers)
        if self.args.pop_size > 1:
            for id, actor in enumerate(self.population):
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send(id)
                    self.evo_flag[id] = False

        #Sync all learners actor to cpu (rollout) actor
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            learner.algo.actor.cuda()

        # Start Learner rollouts (allocation maps rollout slot -> learner id)
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        #Start Test rollouts every 5th generation
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if self.replay_buffer.__len__(
        ) > self.args.learning_start:  ###BURN IN PERIOD

            #Spin up threads for each learner (one gradient pass per learner,
            #with grad steps proportional to frames gathered this generation)
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]

            # Start threads
            for thread in threads:
                thread.start()

            #Join threads
            for thread in threads:
                thread.join()
            self.gen_frames = 0

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        # NOTE(review): busy-wait poll with no sleep — burns a core until
        # enough results arrive; consider a short sleep if CPU use matters.
        # entry layout (inferred from indexing — confirm against
        # rollout_worker): (net_id, fitness, frames, ...)
        if self.args.pop_size > 1:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join (50%): stop waiting once at least asynch_frac of
                # the population has reported back
                if len(all_fitness
                       ) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        if self.args.rollout_size > 0:
            for i in range(self.args.rollout_size):
                entry = self.result_pipes[i][1].recv()
                learner_id = entry[0]
                fitness = entry[1]
                num_frames = entry[2]
                self.portfolio[learner_id].update_stats(fitness, num_frames)

                self.gen_frames += num_frames
                self.total_frames += num_frames
                if fitness > self.best_score: self.best_score = fitness

                self.roll_flag[i] = True

        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if self.args.pop_size > 1:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0],
                              self.population[champ_index])

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            eplens = []
            r1_reward = []
            num_footsteps = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
                eplens.append(entry[3])
                r1_reward.append(entry[4])
                num_footsteps.append(entry[5])

            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))
            self.test_trace.append(test_mean)
            self.num_footsteps = np.mean(np.array(num_footsteps))
            self.ep_len = np.mean(np.array(eplens))
            self.r1_reward = np.mean(np.array(r1_reward))

            # Save champion on a new best unshaped (r1) reward
            if self.r1_reward > self.best_r1_score:
                self.best_r1_score = self.r1_reward
                utils.hard_update(self.best_policy, self.test_bucket[0])
                torch.save(
                    self.test_bucket[0].state_dict(),
                    self.args.aux_folder + '_bestR1_' + self.args.savetag)
                print("Best R2 policy saved with score",
                      '%.2f' % self.r1_reward)

            # Save champion on a new best shaped test score
            if test_mean > self.best_score:
                self.best_score = test_mean
                utils.hard_update(self.best_policy, self.test_bucket[0])
                torch.save(
                    self.test_bucket[0].state_dict(),
                    self.args.aux_folder + '_bestShaped' + self.args.savetag)
                print("Best Shaped policy saved with score",
                      '%.2f' % test_mean)

            tracker.update([test_mean, self.r1_reward], self.total_frames)

        else:
            test_mean, test_std = None, None

        # Referesh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()

        #NeuroEvolution's probabilistic selection and recombination step
        #('multipoint' scheme additionally samples states for the evolver)
        if self.args.pop_size > 1:
            if self.args.scheme == 'multipoint':
                sample_size = self.args.batch_size if self.replay_buffer.__len__(
                ) >= self.args.batch_size else self.replay_buffer.__len__()
                states, _, _, _, _ = self.replay_buffer.sample(
                    batch_size=sample_size)
            else:
                states = None
            self.evolver.epoch(self.population, all_net_ids, all_fitness,
                               self.rollout_bucket, states)

        #META LEARNING - RESET ALLOCATION USING UCB
        if self.args.rollout_size > 0:
            self.allocation = ucb(len(self.allocation), self.portfolio,
                                  self.args.ucb_coefficient)

        #Metrics
        # NOTE(review): the else-branch reuses num_frames/fitness from the
        # learner-rollout loop; if pop_size <= 1 AND rollout_size == 0 these
        # are undefined (NameError) — confirm config guarantees one of them.
        if self.args.pop_size > 1:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            #champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            champ_len = num_frames
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_eplens, test_mean, test_std

    def train(self, frame_limit):
        """Run generations of CERL training until the frame budget is spent.

        Each generation calls forward_generation, logs progress to stdout and
        to csv trackers, and finally asks all rollout workers to terminate.

        Parameters:
            frame_limit (int): training stops once self.total_frames exceeds
                this many environment frames
        """
        # Tracker to log test score and R2 reward progression to csv
        test_tracker = utils.Tracker(
            self.args.savefolder,
            ['score_' + self.args.savetag, 'r2_' + self.args.savetag],
            '.csv')

        # One entropy column and one policy-Q column per portfolio learner
        grad_fields = [
            str(i) + 'entropy_' + self.args.savetag
            for i in range(len(self.portfolio))
        ] + [
            str(i) + 'policyQ_' + self.args.savetag
            for i in range(len(self.portfolio))
        ]
        grad_tracker = utils.Tracker(self.args.aux_folder, grad_fields,
                                     '.csv')  # Tracker class to log progress
        time_start = time.time()

        for gen in range(1, 1000000000):  # Effectively infinite generations

            # Train one generation
            max_fitness, champ_len, all_eplens, test_mean, test_std = self.forward_generation(
                gen, test_tracker)

            print('Gen/Frames', gen, '/', self.total_frames,
                  ' Pop_max/max_ever:', '%.2f' % max_fitness, '/',
                  '%.2f' % self.best_score, ' Avg:',
                  '%.2f' % test_tracker.all_tracker[0][1], ' Frames/sec:',
                  '%.2f' % (self.total_frames / (time.time() - time_start)),
                  ' Champ_len', '%.2f' % champ_len, ' Test_score u/std',
                  utils.pprint(test_mean), utils.pprint(test_std), 'Ep_len',
                  '%.2f' % self.ep_len, '#Footsteps',
                  '%.2f' % self.num_footsteps, 'R2_Reward',
                  '%.2f' % self.r1_reward, 'savetag', self.args.savetag)

            # Current per-learner gradient statistics, in the same column
            # order as grad_fields above
            grad_stats = [
                algo.algo.entropy['mean'] for algo in self.portfolio
            ] + [algo.algo.policy_q['mean'] for algo in self.portfolio]
            grad_tracker.update(grad_stats, self.total_frames)

            if gen % 5 == 0:
                print('Learner Fitness', [
                    utils.pprint(learner.value) for learner in self.portfolio
                ], 'Sum_stats_resource_allocation',
                      [learner.visit_count for learner in self.portfolio])
                # Stats may not exist yet early in training (e.g. before the
                # first learner update); best-effort print only
                try:
                    print('Entropy', [
                        '%.2f' % algo.algo.entropy['mean']
                        for algo in self.portfolio
                    ], 'Next_Entropy', [
                        '%.2f' % algo.algo.next_entropy['mean']
                        for algo in self.portfolio
                    ], 'Policy_Q', [
                        '%.2f' % algo.algo.policy_q['mean']
                        for algo in self.portfolio
                    ], 'Critic_Loss', [
                        '%.2f' % algo.algo.critic_loss['mean']
                        for algo in self.portfolio
                    ])
                    print()
                except Exception:
                    pass

            if self.total_frames > frame_limit:
                break

        ### Kill all worker processes; pipes may already be broken, so this
        ### is best-effort
        try:
            for p in self.task_pipes:
                p[0].send('TERMINATE')
            for p in self.test_task_pipes:
                p[0].send('TERMINATE')
            for p in self.evo_task_pipes:
                p[0].send('TERMINATE')
        except Exception:
            pass

    def compute_policy_type(self):
        """Map self.args.algo to the policy-architecture name used elsewhere.

        Returns:
            str or None: 'DDQN', 'Gaussian_FF', or 'Deterministic_FF' for the
            known algorithms; None for anything else.
        """
        policy_by_algo = {
            'ddqn': 'DDQN',
            'sac': 'Gaussian_FF',
            'td3': 'Deterministic_FF',
        }
        return policy_by_algo.get(self.args.algo)
# Example #8 (snippet separator left over from scraping; original: "예제 #8" / rating "0")
class Evaluator(object):
    """Trains a red (adversarial) agent against a fixed blue agent and
    evaluates the resulting match-up.

    Spawns `num_workers` rollout processes that all share one red actor
    (`actual_red_worker`) via shared memory; trajectories are collected into
    a private replay buffer used to update the red learner.

    Parameters:
        CERL_agent: agent whose args (dims, algo, buffer settings) are reused
        num_workers (int): number of parallel rollout worker processes
        trainers (list): first is the blue agent, second is the red model;
            the red model is replaced by None when pomdp_adv is True
        pomdp_adv (bool): if True, use the POMDP adversary instead of the
            learned red model
    """

    def __init__(self, CERL_agent, num_workers, trainers, pomdp_adv=False):
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  # denotes whether blue uses DRQN
        if self.pomdp_adv:
            # make sure the red model is never used
            self.trainers = [trainers[0], None]
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        # Own replay buffer, independent of the CERL agent's
        self.buffer = Buffer(BUFFER_SIZE, self.buffer_gpu)
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]
        # Single red actor shared across all workers (share_memory below)
        self.actual_red_worker = Actor(CERL_agent.args.state_dim,
                                       CERL_agent.args.action_dim, -1, 'dis')
        self.actual_red_worker.share_memory()
        self.td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': CERL_agent.args.action_low,
            'action_high': CERL_agent.args.action_high,
            'cerl_args': self.args
        }
        # One persistent learner is reused across iterations (a new learner
        # per iteration is no longer created)
        self.renew_learner()
        # Every bucket slot aliases the same shared red actor
        self.rollout_bucket = [
            self.actual_red_worker for _ in range(num_workers)
        ]
        self.workers = [
            Process(target=rollout_worker,
                    args=(worker_id, 3, self.evo_task_pipes[worker_id][1],
                          self.evo_result_pipes[worker_id][0], False,
                          self.data_bucket, self.rollout_bucket, 'dummy_name',
                          None, 'dis', self.trainers, False, self.pomdp_adv))
            for worker_id in range(num_workers)
        ]

        for worker in self.workers:
            worker.start()
        # evo_flag[i] is True when worker i is idle and may receive a task
        self.evo_flag = [True for _ in range(self.num_workers)]

    #def initialize(self, actor_in):  #use the given actor parameter to initialize the red actor
    #    utils.hard_update(self.actual_red_actor, actor_in)

    def renew_learner(self):
        """Create a fresh red learner with randomized initial parameters and
        point `actual_red_actor` at its actor network."""
        self.learner = Learner(-1,
                               self.algo,
                               self.state_dim,
                               self.action_dim,
                               actor_lr=5e-5,
                               critic_lr=1e-3,
                               gamma=0.99,
                               tau=5e-3,
                               init_w=True,
                               **self.td3args)
        self.actual_red_actor = self.learner.algo.actor

    def collect_trajectory(self):
        """Run one rollout on every idle worker and gather their results.

        Returns:
            list: fitness reported by each worker for this round.
        """
        # First sync the shared worker actor with the learner's actor
        utils.hard_update(self.actual_red_worker, self.actual_red_actor)

        # Launch rollout_workers that are currently idle
        for worker_id in range(self.num_workers):
            if self.evo_flag[worker_id]:
                # Second element of the sent tuple is a dummy
                self.evo_task_pipes[worker_id][0].send((worker_id, 0))
                self.evo_flag[worker_id] = False

        # Wait for all rollouts to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  # pull newly collected tuples into the buffer

        return all_fitness

    def train_red(self, training_iterations):
        """Alternate between trajectory collection and red parameter updates.

        Parameters:
            training_iterations (int): number of collect/update cycles after
                the burn-in period
        """
        ### BURN IN PERIOD: fill the buffer before any update
        while len(self.buffer) < self.batch_size * 10:
            self.collect_trajectory()

        for _ in range(training_iterations):
            self.collect_trajectory()
            self.buffer.tensorify()  # Tensorify the buffer for fast sampling
            self.learner.update_parameters(self.buffer, self.buffer_gpu,
                                           self.batch_size, 2)  # 2 update steps

    def evaluate(self):
        """Evaluate the blue agent by training a red adversary against it.

        After training, the replay buffer is cleared (the learner is kept).

        Returns:
            tuple: mean/std-style evaluation metrics from
            evaluate_fixed_agents.
        """
        self.train_red(TRAIN_ITERATION)
        self.clear_buffer()
        #self.renew_learner()
        return self.evaluate_fixed_agents(
            self.trainers[0], self.trainers[1],
            [self.actual_red_actor])

    def evaluate_fixed_agents(self,
                              blue_dqn,
                              red_model,
                              red_actor_list,
                              num_iterations=25):
        """Evaluate fixed agents over several episodes with random neutral and
        red agents.

        Parameters:
            blue_dqn: blue agent providing .act(state, eps)
            red_model: red policy model handed to the environment factory
            red_actor_list (list): candidate red actors; one is sampled per
                episode (unused when pomdp_adv is True)
            num_iterations (int): number of evaluation episodes

        Returns:
            tuple: (average_reward, average_red_reward,
            average_actual_blue_reward, belief_and_true_type_list)
        """
        if self.algo == 'dis':  # make env with blue and red policy agent inside
            # trainers: if not None, first is the shared DQN agent, second is
            # the best red policy
            dis_env = make_self_play_env(
                seed=0,
                return_policy_agent=False,
                trainers=[blue_dqn, red_model])[0]
            # the "0" is the index for training blue agent
            env = EnvironmentWrapper('', self.algo, dis_env, 0)
        elif self.algo == 'TD3_tennis':
            # NOTE(review): Evaluator never defines self.pop, so this branch
            # would raise AttributeError if reached -- confirm intended usage
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=True,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0)
        else:
            raise Exception("only work for 'dis' envir?")
        average_reward = 0
        eps = 0
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0
        belief_and_true_type_list = []
        # Fix: the original check `len(red_actor_list) is not None` was always
        # true; require a non-empty list of candidate red actors instead
        assert red_actor_list, "make sure to input a list of possible red"
        for it in range(num_iterations):
            belief_and_true_type = []
            if not self.pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                red_actor = random.choice(red_actor_list)
                env.set_TD3_actor(red_actor)
            fitness = 0.0  # here fitness is simply the episode reward
            state = env.reset()
            belief_and_true_type.append(env.belief_and_true_type())
            env.randomize_neu_adv()

            if self.pomdp_adv:
                # try to set opponent to pomdp adv if opponent is adversary,
                # else do nothing
                env.try_set_pomdp_adv()

            render_flag = (np.random.random() < 0.05)
            while True:  # unless done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=self.drqn)
                belief_and_true_type.append(env.belief_and_true_type())
                if render_flag and self.args.render:
                    env.render()

                state = next_state
                fitness += reward

                if done:
                    # Red/blue rewards may be unavailable; only average over
                    # episodes where they exist
                    average_red_reward += env.get_red_reward(
                    ) if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward(
                    ) if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward(
                    ) is not None else 0
                    if render_flag: env.env.close()
                    break
            belief_and_true_type_list.append(belief_and_true_type)
            average_reward += fitness
        average_reward /= num_iterations
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list

    def clear_buffer(self):
        """Reinitialize the replay buffer, discarding all stored tuples."""
        self.buffer.clear_buffer_data()

    def kill_processes(self):
        """Ask every rollout worker to terminate."""
        for worker_id in range(len(self.rollout_bucket)):
            # Second element of the sent tuple is a dummy
            self.evo_task_pipes[worker_id][0].send(('TERMINATE', 0))

    def __del__(self):
        self.kill_processes()