class SoftOrderPersistor():
    """
    A LEAF-ier ploy to separate out the files that are persisted by
    softorder_coevolution -- all except for checkpointing.

    We do this so that the session_server can be the one to do the
    persistence and the files can persist on the session_server machine.
    """

    def __init__(self, experiment_dir, fitness_objectives, save_best=True,
                 draw=True, logger=None):
        self.experiment_dir = experiment_dir
        self.save_best = save_best
        self.draw = draw
        self.fitness_objectives = fitness_objectives
        self.candidate_util = CandidateUtil(fitness_objectives)
        self.advanced_stats = {
            'best_candidate': [],
            'avg_fitness': [],
            'time': []
        }
        self.logger = logger

    def persist(self, population, generation):
        """
        Gather statistics and persist what we want to files.
        """
        best_candidate = self.gather_advanced_stats(population)
        self.do_save(generation, best_candidate)
        self.do_draw(generation)

        fitness_persistence = FitnessPersistor(self.experiment_dir,
                                               generation,
                                               self.fitness_objectives)
        fitness_persistence.persist(self.advanced_stats)

    def get_candidate_fitness(self, candidate):
        return self.candidate_util.get_candidate_fitness(candidate)

    def average_fitness(self, population):
        """
        Returns the average raw fitness of the population.
        """
        my_sum = 0.0
        # Start the counter at a tiny non-zero value so the division below
        # cannot blow up when no candidate has a fitness yet.
        counter = 1e-308
        for candidate in population:
            fitness = self.get_candidate_fitness(candidate)
            if fitness is not None:
                my_sum += fitness
                counter += 1
        return my_sum / counter

    def find_best_candidate(self, population):
        if population is None or len(population) == 0:
            return None

        one = population[0]
        best = None
        if isinstance(one, dict):
            # Candidates are dictionaries
            best_fitness = None
            for candidate in population:
                fitness = self.get_candidate_fitness(candidate)
                if best_fitness is None:
                    best_fitness = fitness
                    best = candidate
                elif fitness > best_fitness:
                    best_fitness = fitness
                    best = candidate
        else:
            # Candidates are ChromosomeData
            best = max(population)

        return best

    def gather_advanced_stats(self, population):
        """
        Populates the advanced_stats member dictionary with info about
        the generation just evaluated.
        """
        best_candidate = self.find_best_candidate(population)
        self.advanced_stats['best_candidate'].append(
            copy.deepcopy(best_candidate))
        self.advanced_stats['avg_fitness'].append(
            self.average_fitness(population))
        self.advanced_stats['time'].append(time.time())
        return best_candidate

    def do_save(self, generation, best_candidate):
        # Saves the best candidate from the current generation
        if not self.save_best:
            return

        if best_candidate is not None:
            candidate_id = self.candidate_util.get_candidate_id(best_candidate)
            best_persistence = BestFitnessCandidatePersistence(
                self.experiment_dir, candidate_id, generation,
                logger=self.logger)
            best_persistence.persist(best_candidate)

    def do_draw(self, generation):
        if self.draw:
            if generation >= 2:
                stats = (self.advanced_stats['best_candidate'],
                         self.advanced_stats['avg_fitness'])
                visualize.plot_stats(stats, self.candidate_util,
                                     self.experiment_dir)
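
# A minimal usage sketch, not part of the original module: this is roughly how
# a session-server loop might drive SoftOrderPersistor once per generation.
# The `fitness_objectives` and `populations` arguments are assumed stand-ins;
# in the real system they come from the experiment config and the evaluation
# step, respectively.
def _example_soft_order_persist_loop(experiment_dir, fitness_objectives,
                                     populations):
    persistor = SoftOrderPersistor(experiment_dir, fitness_objectives,
                                   save_best=True, draw=False)
    for generation, population in enumerate(populations):
        # Gathers stats for the generation, saves the best candidate,
        # and rewrites fitness.csv with one row per generation so far.
        persistor.persist(population, generation)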
class ReevaluateBestSessionTask(SessionTask):
    """
    SessionTask that performs a re-evaluation of the best candidates
    from each generation.
    """

    # Tied for Public Enemy #5 for too-many-arguments
    # pylint: disable=too-many-arguments
    def __init__(self, session, master_config, experiment_dir,
                 fitness_objectives, generation, experiment_start_time,
                 experiment_id, completion_service, initial_generation,
                 checkpoint_id=None):
        """
        Constructor.

        :param session: The session with which the task can communicate
                    with the service
        :param master_config: The master config for the task
        :param experiment_dir: The experiment directory for results
        :param fitness_objectives: The FitnessObjectives object
        :param generation: The generation number of the population
        :param experiment_start_time: The experiment start time in seconds
        :param experiment_id: The experiment id
                    XXX Can this be derived from experiment_dir?
        :param completion_service: A handle to the CompletionService object
                    for performing distributed evaluations
        :param initial_generation: Flag saying whether or not this is the
                    first generation
        :param checkpoint_id: The checkpoint id (if any) relevant to the task
        """
        super(ReevaluateBestSessionTask, self).__init__(session,
                                                        master_config,
                                                        experiment_dir,
                                                        fitness_objectives,
                                                        checkpoint_id)
        self.generation = generation
        self.experiment_start_time = experiment_start_time
        self.experiment_id = experiment_id
        self.completion_service = completion_service
        self.initial_generation = initial_generation

        self.candidate_util = CandidateUtil(fitness_objectives)
        self.population_response_util = PopulationResponseUtil()

        # These are fields to be populated by unpack_response()
        experiment_config = self.master_config.get('experiment_config')
        self.persistor = SoftOrderPersistor(
            self.experiment_dir,
            self.fitness_objectives,
            draw=experiment_config.get('visualize'),
            logger=self.logger)
        self.server_stats = {}
        self.seen_checkpoint_ids = []

    def run(self):
        """
        Entry point for the session task execution to take over.
        """
        experiment_config = self.master_config.get('experiment_config')
        assert os.path.exists(
            experiment_config.get('reevaluate_checkpoint_dir'))

        print("Re-evaluating top %s chromosomes found from experiment %s" %
              (experiment_config.get('reevaluate_num'),
               experiment_config.get('reevaluate_checkpoint_dir')))

        candidate_fit_dict = {}

        # Read in the contents of the checkpoint_ids.txt file, which contains
        # references to every checkpoint training has seen.
        # By convention reevaluate_checkpoint_dir is where this file comes
        # from, and self.checkpoint_dir is where new results are written to.
        restoring_checkpoint_persistence = CheckpointPersistence(
            folder=experiment_config.get('reevaluate_checkpoint_dir'),
            logger=self.logger)
        self.seen_checkpoint_ids = restoring_checkpoint_persistence.restore()

        for checkpoint_id in self.seen_checkpoint_ids:
            print("Analyzing chromos in %s" % checkpoint_id)
            population_response = self.session.get_population(
                experiment_config.get('reevaluate_checkpoint_dir'),
                checkpoint_id)
            pop = self.population_response_util.unpack_response(
                population_response, self)

            for candidate in pop:
                id_key = self.candidate_util.get_candidate_id(candidate)

                # Get the persisted Worker Results dictionaries
                results_dict_persistence = ResultsDictPersistence(
                    experiment_config.get('reevaluate_checkpoint_dir'),
                    self.generation,
                    logger=self.logger)
                results_dict = results_dict_persistence.restore()

                candidate_fitness = None
                if any(results_dict):
                    if id_key in results_dict:
                        candidate_results_dict = results_dict[id_key]
                        # This is not quite a candidate, but the get-mechanism
                        # should be the same
                        candidate_fitness = \
                            self.candidate_util.get_candidate_fitness(
                                candidate_results_dict)

                if candidate_fitness is None:
                    candidate_fitness = 0.0

                if id_key not in candidate_fit_dict:
                    candidate_fit_dict[id_key] = {
                        'candidate': candidate,
                        'fit': [candidate_fitness]
                    }
                else:
                    candidate_fit_dict[id_key]['candidate'] = candidate
                    candidate_fit_dict[id_key]['fit'].append(candidate_fitness)

        avg = [(x['candidate'], np.mean(x['fit']))
               for x in list(candidate_fit_dict.values())]
        best = sorted(avg, key=lambda x: x[1],
                      reverse=True)[:experiment_config.get('reevaluate_num')]
        best_candidates = [x[0] for x in best]
        best_candidate_ids = [self.candidate_util.get_candidate_id(x[0])
                              for x in best]
        best_fit = [round(x[1], 4) for x in best]

        if len(best_candidates) == 0:
            print("No chromos found, doing nothing")
            return

        for candidate in best_candidates:
            candidate_id = self.candidate_util.get_candidate_id(candidate)
            best_candidate_persistence = BestFitnessCandidatePersistence(
                self.experiment_dir, candidate_id, logger=self.logger)
            best_candidate_persistence.persist(candidate)

        print("Best chromos:")
        print(list(zip(best_candidate_ids, best_fit)))
        print("Best chromo stats:")
        print("Min: %s Mean: %s Max: %s Std: %s" %
              (round(np.min(best_fit), 4),
               round(np.mean(best_fit), 4),
               round(np.max(best_fit), 4),
               round(np.std(best_fit), 4)))

        # We use generation + 1 for reporting here because we are really
        # composing a population of the best candidates across many different
        # previous generations, and as such it doesn't really correspond to
        # any generation number of the past.
        reevaluate_candidate_task = ReevaluateCandidateSessionTask(
            self.session, self.master_config, self.experiment_dir,
            self.fitness_objectives, self.generation,
            self.experiment_start_time, self.experiment_id,
            self.completion_service, self.initial_generation,
            self.checkpoint_id)
        reevaluate_candidate_task.evaluate_and_analyze_results(
            best_candidates, self.generation + 1)
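
# A small illustration, with made-up numbers, of the selection step above:
# each candidate's fitness values are averaged across every checkpoint in
# which it appeared, and the top `reevaluate_num` candidates by that mean
# are kept. The candidate payloads here are placeholder dicts, not real
# candidates from the service.
def _example_select_top_candidates(reevaluate_num=2):
    candidate_fit_dict = {
        'cand_a': {'candidate': {'id': 'cand_a'}, 'fit': [0.90, 0.94]},
        'cand_b': {'candidate': {'id': 'cand_b'}, 'fit': [0.97]},
        'cand_c': {'candidate': {'id': 'cand_c'}, 'fit': [0.80, 0.82, 0.85]},
    }
    avg = [(entry['candidate'], np.mean(entry['fit']))
           for entry in candidate_fit_dict.values()]
    best = sorted(avg, key=lambda pair: pair[1],
                  reverse=True)[:reevaluate_num]
    # -> [({'id': 'cand_b'}, 0.97), ({'id': 'cand_a'}, 0.92)]
    return best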
class FitnessPersistor(Persistor):
    """
    This implementation of the Persistor interface creates
    the fitness.csv file.
    """

    def __init__(self, experiment_dir, generation, fitness_objectives):
        """
        Constructor.
        """
        self.filer = ExperimentFiler(experiment_dir)
        self.generation = generation
        self.fitness_objectives = fitness_objectives
        self.candidate_util = CandidateUtil(fitness_objectives)
        self.basename = 'fitness.csv'
        self.time_format = '%Y-%m-%d-%H:%M:%S'

    def persist(self, obj):
        """
        Persists the object passed in.

        :param obj: an object to persist.
                In this case we are expecting an advanced stats dictionary
                from the SoftOrderPersistor.
        """
        advanced_stats = obj
        filename = self.filer.experiment_file(self.basename)
        self.write_csv_file(filename, advanced_stats)

    def write_csv_file(self, filename, advanced_stats):
        """
        Writes out the fitness.csv file.

        :param filename: The filename to write to
        :param advanced_stats: The advanced_stats dict gathered by the
                SoftOrderPersistor
        :return: Nothing
        """
        with open(filename, 'w') as csv_file:

            # Prepare dynamic column names
            primary_objective = \
                self.fitness_objectives.get_fitness_objective(0)
            fitness_name = primary_objective.get_metric_name()
            best_fitness_field_name = 'Best ' + fitness_name
            best_fitness_id_field_name = best_fitness_field_name + ' id'
            avg_fitness_field_name = 'Avg ' + fitness_name

            field_names = [
                'Generation',
                'Timestamp',
                best_fitness_id_field_name,
                best_fitness_field_name,
                avg_fitness_field_name
            ]

            csv_writer = csv.DictWriter(csv_file,
                                        fieldnames=field_names,
                                        quoting=csv.QUOTE_MINIMAL,
                                        lineterminator="\n")
            csv_writer.writeheader()

            for gen in range(self.generation + 1):

                # Get timestamp in human-readable format
                timestamp = advanced_stats['time'][gen]
                ts_datetime = datetime.fromtimestamp(timestamp)
                time_string = ts_datetime.strftime(self.time_format)

                # Get best candidate
                # XXX multi-objective
                best_id = None
                best_fitness = None
                candidate = advanced_stats['best_candidate'][gen]
                if candidate is not None:
                    best_id = self.candidate_util.get_candidate_id(candidate)
                    best_fitness = self.candidate_util.get_candidate_fitness(
                        candidate)

                # Get average fitness
                # XXX multi-objective
                avg_fitness = advanced_stats['avg_fitness'][gen]

                row = {
                    'Generation': gen,
                    'Timestamp': time_string,
                    best_fitness_id_field_name: best_id,
                    best_fitness_field_name: best_fitness,
                    avg_fitness_field_name: avg_fitness
                }
                csv_writer.writerow(row)
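
# A hedged sketch of the advanced_stats dictionary FitnessPersistor expects
# (the same shape SoftOrderPersistor builds up) and the call that produces
# fitness.csv. The candidate dicts and timestamps below are invented for
# illustration; real candidates carry whatever fields CandidateUtil reads
# for id and fitness.
def _example_write_fitness_csv(experiment_dir, fitness_objectives):
    advanced_stats = {
        'best_candidate': [{'id': 'cand_a', 'fitness': 0.91},
                           {'id': 'cand_b', 'fitness': 0.95}],
        'avg_fitness': [0.74, 0.81],
        'time': [1700000000.0, 1700000060.0],
    }
    # generation is the index of the last generation recorded, so a value
    # of 1 here writes two rows (generations 0 and 1) plus the header.
    persistor = FitnessPersistor(experiment_dir, generation=1,
                                 fitness_objectives=fitness_objectives)
    persistor.persist(advanced_stats)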
class AnalyzeResultsSessionTask(SessionTask):
    """
    SessionTask that performs the AnalyzeResults task.

    This task doesn't actually use the Session object that talks to
    the server, but instead takes all the results files created by a run
    and does some analysis on them.

    XXX What?
    """

    def __init__(self, session, master_config, experiment_dir,
                 fitness_objectives, checkpoint_id=None):
        """
        Constructor.

        :param session: The session with which the task can communicate
                    with the service
        :param master_config: The master config for the task
        :param experiment_dir: The experiment directory for results
        :param fitness_objectives: The FitnessObjectives object
        :param checkpoint_id: The checkpoint id (if any) relevant to the task
        """
        super(AnalyzeResultsSessionTask, self).__init__(session,
                                                        master_config,
                                                        experiment_dir,
                                                        fitness_objectives,
                                                        checkpoint_id)
        self.candidate_util = CandidateUtil(fitness_objectives)

    def run(self):
        """
        Entry point for the session task execution to take over.
        """
        print("Running AnalyzeResultsSessionTask")

        # Read the results files for each generation.
        # These are written out by write_results_file()
        filer = ExperimentFiler(self.experiment_dir)
        glob_spec = filer.experiment_file("gen_*/results_dict.json")
        results_dicts = glob.glob(glob_spec)
        worker_results_files = sorted(results_dicts)
        if len(worker_results_files) <= 0:
            raise ValueError("No results_dict.json files found in {0}".format(
                self.experiment_dir))

        # No generation number needed, we are only looking to
        # parse path components with it.
        generation_filer = GenerationFiler(self.experiment_dir)

        worker_results_dict = {}
        for worker_results_file in worker_results_files:
            generation = generation_filer.get_generation_from_path(
                worker_results_file)

            # This slurps in results information returned by workers from all
            # candidates of a specific generation
            results_dict_persistence = ResultsDictPersistence(
                self.experiment_dir, generation, logger=self.logger)
            one_worker_results_dict = results_dict_persistence.restore()

            # worker_results_dict here will have one entry per candidate
            # over all generations
            worker_results_dict.update(one_worker_results_dict)

        fitness_objective = self.fitness_objectives.get_fitness_objective(0)
        is_maximize = fitness_objective.is_maximize_fitness()

        # Each item is (candidate id, worker results dict for that candidate);
        # sort by the fitness recorded in the results dict and take the top.
        best_result = sorted(
            list(worker_results_dict.items()),
            key=lambda x: self.candidate_util.get_candidate_fitness(x[1]),
            reverse=is_maximize)[0]
        best_id = best_result[0]

        # Open the file of the best candidate.
        best_candidate_persistence = BestFitnessCandidatePersistence(
            self.experiment_dir, best_id, logger=self.logger)
        best_candidate = best_candidate_persistence.restore()

        best_id = self.candidate_util.get_candidate_id(best_candidate)
        self.draw_best_candidate_results(best_candidate, generation,
                                         suffix='abs')

    def draw_best_candidate_results(self, best_candidate, generation=None,
                                    suffix=''):
        """
        :param best_candidate: A candidate object comprising the best of
                a generation
        :param generation: Default value is None
        :param suffix: Default value is an empty string
        """
        experiment_config = self.master_config.get('experiment_config')
        if not experiment_config.get('visualize'):
            return

        best_id = self.candidate_util.get_candidate_id(best_candidate)
        best_fitness = self.candidate_util.get_candidate_fitness(
            best_candidate)
        fitness = best_fitness if best_fitness is None else \
            round(best_fitness, 4)

        # Determine the output file name basis
        # XXX Use fitness for now.
        #     Later on can address multi-objective goals.
        metric_name = "fitness"
        if generation is not None:
            # Put the file in the gen_NN directory.
            # Call it best_candidate to match the best_candidate.json
            # that gets put there
            base_name = "best_{0}_candidate".format(metric_name)
            filer = GenerationFiler(self.experiment_dir, generation)
            base_path = filer.get_generation_file(base_name)
        else:
            # We do not have a generation that we know about so write out
            # the old-school file name.
            # XXX Not entirely sure when this path would be taken
            base_name = "F{0}_ID-{1}_{2}best_{3}".format(
                fitness, best_id, suffix, metric_name)
            filer = ExperimentFiler(self.experiment_dir)
            base_path = filer.experiment_file(base_name)

        # NetworkVisualizers use build_training_model(), which requires
        # a data_dict of file keys -> file paths to exist. Domains that
        # wish to visualize their networks that use the data_dict will
        # need to deal with a None value for data_dict in the
        # visualization case.
        data_dict = None
        visualizer = NetworkMultiVisualizer(self.master_config, data_dict,
                                            base_path, logger=self.logger)
        visualizer.visualize(best_candidate)
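
# An illustrative driver, under the assumption that master_config and
# fitness_objectives have already been loaded elsewhere. Per the class
# docstring the task only reads files under experiment_dir, so a session
# of None is used here purely for the sketch.
def _example_analyze_results(master_config, experiment_dir,
                             fitness_objectives):
    task = AnalyzeResultsSessionTask(session=None,
                                     master_config=master_config,
                                     experiment_dir=experiment_dir,
                                     fitness_objectives=fitness_objectives)
    # Scans gen_*/results_dict.json, finds the best candidate over all
    # generations, and (when 'visualize' is set) draws its network.
    task.run()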