def _get_all_accuracies(tensorboard_hdfs_logdir, args_dict, number_params):
    """Collect the metric of every parallel execution of the previous generation.

    Each hyperparameter combination writes its result to a separate
    ``.metric`` file under
    ``<logdir>/generation.<generation_id - 1>/<k1=v1&k2=v2&...>/``.

    Args:
        :tensorboard_hdfs_logdir: root HDFS log directory of the experiment
        :args_dict: dict mapping hyperparameter name -> list of values,
            one value per execution
        :number_params: number of parallel executions to collect

    Returns:
        list of metrics as floats, in execution order
    """
    # Important: iteration order of the keys must match the
    # _parse_to_dict function, since it determines the directory name.
    population_dict = diff_evo.get_dict()
    # NOTE(review): reads module-level `generation_id` (set elsewhere).
    # The original `global run_id` declaration was unused and is removed.
    results = []
    for i in range(number_params):
        # Build "k1=v1&k2=v2&..." instead of appending and slicing off
        # the trailing '&' by hand.
        combination = "&".join(
            "{0}={1}".format(k, args_dict[k][i]) for k in population_dict)
        path_to_log = "{0}/generation.{1}/{2}/.metric".format(
            tensorboard_hdfs_logdir, generation_id - 1, combination)
        # BUG FIX: the explicit fi.close() inside the `with` block was a
        # redundant double close; the context manager closes the file.
        with hdfs.open_file(path_to_log, flags="r") as fi:
            metric = fi.read()
        results.append(metric)
    return [float(res) for res in results]
def init_logger(self, trial_log_file):
    """Set up the per-trial log file and open a write descriptor for it.

    :param trial_log_file: HDFS path of the trial's log file
    """
    log_path = trial_log_file
    self.trial_log_file = log_path
    # Make sure the file exists before opening it for writing.
    if not hopshdfs.exists(log_path):
        hopshdfs.dump("", log_path)
    self.trial_fd = hopshdfs.open_file(log_path, flags="w")
def _get_best(root_logdir, direction):
    """Scan every generation folder and return the best individual's logdir
    and its recorded outputs.

    Args:
        :root_logdir: HDFS root directory containing the generation folders
        :direction: optimization direction; compared against Direction.MAX

    Returns:
        (logdir, return_dict): log directory of the best individual and the
        parsed contents of its ``.outputs.json`` file
    """
    # BUG FIX: sys.float_info.min is the smallest *positive* float, so with
    # all-negative metrics `val > max_val` was never true and max_logdir
    # stayed None. Use +/- infinity as neutral elements instead.
    min_val = float("inf")
    min_logdir = None
    max_val = float("-inf")
    max_logdir = None

    generation_folders = hdfs.ls(root_logdir)
    generation_folders.sort()

    for generation in generation_folders:
        for individual in hdfs.ls(generation):
            individual_files = hdfs.ls(individual, recursive=True)
            for ind_file in individual_files:  # renamed: `file` shadowed builtin
                if ind_file.endswith("/.metric"):
                    val = float(hdfs.load(ind_file))
                    if val > max_val:
                        max_val = val
                        max_logdir = ind_file[:-8]  # strip "/.metric"
                    if val < min_val:
                        min_val = val
                        min_logdir = ind_file[:-8]

    # Both branches read .outputs.json identically; only the logdir differs.
    best_logdir = max_logdir if direction.upper() == Direction.MAX else min_logdir
    with hdfs.open_file(best_logdir + '/.outputs.json', flags="r") as fi:
        return_dict = json.loads(fi.read())
    return best_logdir, return_dict
def _run(sc, train_fn, run_id, local_logdir=False, name="no-name", evaluator=False):
    """Launch distributed training (ParameterServerStrategy) on the executors.

    Args:
        :sc: SparkContext
        :train_fn: training function executed on each executor
        :run_id: id of this experiment run (used to build the logdir)
        :local_logdir: whether executors log to their local filesystem
        :name: experiment name shown in the Spark UI job group
        :evaluator: whether to dedicate one task as evaluator

    Returns:
        (logdir, outputs): the experiment's HDFS log directory and the parsed
        contents of ``.outputs.json``, or None when that file does not exist
    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        os.environ['ML_ID'],
        "{} | ParameterServerStrategy - Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)
    server_addr = server.start()

    num_ps = util.num_param_servers()

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, train_fn, local_logdir, server_addr,
                      num_ps, evaluator))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if hdfs.exists(path_to_return):
        # BUG FIX: removed the explicit fi.close() inside the `with` block;
        # the context manager already closes the file (double close before).
        with hdfs.open_file(path_to_return, flags="r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
def initialize_logger(self, exp_dir):
    """Configure the pruner's logger inside the experiment directory.

    :param exp_dir: path of experiment directory
    :rtype exp_dir: str
    """
    log_path = exp_dir + "/pruner.log"
    self.log_file = log_path
    # Create an empty log file first if none exists yet.
    if not hdfs.exists(log_path):
        hdfs.dump("", log_path)
    self.fd = hdfs.open_file(log_path, flags="w")
    self._log("Initialized Pruner Logger")
def __init__(self, log_file, partition_id, task_attempt, print_executor):
    """Create the executor-side logger and open its HDFS log file."""
    # Trial-related state; populated later during execution.
    self.metric = None
    self.trial_id = None
    self.trial_log_file = None
    self.trial_fd = None

    # Synchronization and control flags.
    self.lock = threading.RLock()
    self.stop = False

    # Executor identity and log sinks.
    self.logs = ""
    self.log_file = log_file
    self.partition_id = partition_id
    self.task_attempt = task_attempt
    self.print_executor = print_executor

    # Open executor log file descriptor.
    # This log is for all maggy system related log messages.
    if not hopshdfs.exists(log_file):
        hopshdfs.dump("", log_file)
    self.fd = hopshdfs.open_file(log_file, flags="w")
def _get_best(args_dict, num_combinations, arg_names, arg_count,
              hdfs_appid_dir, optimization_key):
    """Find the best, worst and average metric over all grid-search runs.

    Args:
        :args_dict: dict mapping parameter name -> list of values per run
        :num_combinations: number of parameter combinations that were run
        :arg_names: ordered list of parameter names
        :arg_count: number of parameters per combination
        :hdfs_appid_dir: HDFS directory containing one folder per combination
        :optimization_key: key of the metric inside .outputs.json
            (falls back to 'metric'; auto-detected for single-key dicts)

    Returns:
        (max_val, max_hp, min_val, min_hp, avg, max_return_dict,
         min_return_dict)
    """
    if not optimization_key:
        optimization_key = 'metric'
    max_hp = ''
    max_val = ''
    min_hp = ''
    min_val = ''
    min_return_dict = {}
    max_return_dict = {}
    results = []
    first = True
    for i in range(num_combinations):
        # Build "name1=val1&name2=val2&..." — the combination's folder name —
        # instead of the manual index/while loop with trailing-'&' slicing.
        param_string = "&".join(
            "{0}={1}".format(name, args_dict[name][i])
            for name in arg_names[:arg_count])

        path_to_return = hdfs_appid_dir + '/' + param_string + '/.outputs.json'
        # NOTE(review): assert is stripped under `python -O`; kept to preserve
        # the AssertionError callers may rely on.
        assert hdfs.exists(
            path_to_return), 'Could not find .return file on path: {}'.format(
                path_to_return)

        # BUG FIX: removed redundant fi.close() inside the `with` block;
        # the context manager already closes the file.
        with hdfs.open_file(path_to_return, flags="r") as fi:
            return_dict = json.loads(fi.read())

        # handle case when dict with 1 key is returned
        if optimization_key == 'metric' and len(return_dict.keys()) == 1:
            optimization_key = list(return_dict.keys())[0]

        metric = float(return_dict[optimization_key])

        if first:
            max_hp = param_string
            max_val = metric
            max_return_dict = return_dict
            min_hp = param_string
            min_val = metric
            min_return_dict = return_dict
            first = False

        if metric > max_val:
            max_val = metric
            max_hp = param_string
            max_return_dict = return_dict
        if metric < min_val:
            min_val = metric
            min_hp = param_string
            min_return_dict = return_dict

        results.append(metric)

    avg = sum(results) / float(len(results))

    return max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict
def __init__(self, experiment_type, **kwargs):
    """Set up the experiment driver for either an 'optimization' or an
    'ablation' experiment.

    Validates and stores the experiment configuration from **kwargs,
    instantiates the optimizer / ablator and early-stopping policy, starts
    the RPC server, and opens the HDFS log file.

    :param experiment_type: either "optimization" or "ablation"
    :raises Exception: on any invalid/unknown configuration value
    """
    global driver_secret

    # COMMON EXPERIMENT SETUP
    self._final_store = []
    self._trial_store = {}
    self.num_executors = kwargs.get("num_executors")
    self._message_q = queue.Queue()
    self.name = kwargs.get("name")
    self.experiment_done = False
    self.worker_done = False
    self.hb_interval = kwargs.get("hb_interval")
    self.description = kwargs.get("description")
    self.experiment_type = experiment_type
    self.es_interval = kwargs.get("es_interval")
    self.es_min = kwargs.get("es_min")

    # TYPE-SPECIFIC EXPERIMENT SETUP
    if self.experiment_type == "optimization":
        # set up an optimization experiment
        self.num_trials = kwargs.get("num_trials", 1)

        # Search space: accept a Searchspace instance or None (empty space).
        searchspace = kwargs.get("searchspace")
        if isinstance(searchspace, Searchspace):
            self.searchspace = searchspace
        elif searchspace is None:
            self.searchspace = Searchspace()
        else:
            raise Exception(
                "The experiment's search space should be an instance of maggy.Searchspace, "
                "but it is {0} (of type '{1}').".format(
                    str(searchspace), type(searchspace).__name__))

        # Optimizer: None (only with an empty search space), a known name
        # string, or a custom AbstractOptimizer instance.
        optimizer = kwargs.get("optimizer")
        if optimizer is None:
            if len(self.searchspace.names()) == 0:
                self.optimizer = SingleRun()
            else:
                raise Exception(
                    "Searchspace has to be empty or None to use without optimizer"
                )
        elif isinstance(optimizer, str):
            if optimizer.lower() == "randomsearch":
                self.optimizer = RandomSearch()
            elif optimizer.lower() == "asha":
                self.optimizer = Asha()
            elif optimizer.lower() == "none":
                if len(self.searchspace.names()) == 0:
                    self.optimizer = SingleRun()
                else:
                    raise Exception(
                        "Searchspace has to be empty or None to use without Optimizer."
                    )
            else:
                raise Exception(
                    "Unknown Optimizer. Can't initialize experiment driver."
                )
        elif isinstance(optimizer, AbstractOptimizer):
            self.optimizer = optimizer
            print("Custom Optimizer initialized.")
        else:
            raise Exception(
                "The experiment's optimizer should either be an string indicating the name "
                "of an implemented optimizer (such as 'randomsearch') or an instance of "
                "maggy.optimizer.AbstractOptimizer, "
                "but it is {0} (of type '{1}').".format(
                    str(optimizer), type(optimizer).__name__))

        # Optimization direction: only 'min' or 'max' (case-insensitive).
        direction = kwargs.get("direction", "max")
        if isinstance(direction, str) and direction.lower() in ["min", "max"]:
            self.direction = direction.lower()
        else:
            raise Exception(
                "The experiment's direction should be an string (either 'min' or 'max') "
                "but it is {0} (of type '{1}').".format(
                    str(direction), type(direction).__name__))

        # Early-stopping policy: 'median', 'none', or a custom
        # AbstractEarlyStop instance. Stored as the bound check function.
        es_policy = kwargs.get("es_policy")
        if isinstance(es_policy, str):
            if es_policy.lower() == "median":
                self.earlystop_check = MedianStoppingRule.earlystop_check
            elif es_policy.lower() == "none":
                self.earlystop_check = NoStoppingRule.earlystop_check
            else:
                raise Exception(
                    "The experiment's early stopping policy should either be a string ('median' or 'none') "
                    "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
                    "but it is {0} (of type '{1}').".format(
                        str(es_policy), type(es_policy).__name__))
        elif isinstance(es_policy, AbstractEarlyStop):
            self.earlystop_check = es_policy.earlystop_check
            print("Custom Early Stopping policy initialized.")
        else:
            raise Exception(
                "The experiment's early stopping policy should either be a string ('median' or 'none') "
                "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
                "but it is {0} (of type '{1}').".format(
                    str(es_policy), type(es_policy).__name__))

        # NOTE(review): these two assignments duplicate the common setup
        # above; harmless but redundant.
        self.es_interval = kwargs.get("es_interval")
        self.es_min = kwargs.get("es_min")

        self.result = {
            "best_val": "n.a.",
            "num_trials": 0,
            "early_stopped": 0
        }

    elif self.experiment_type == "ablation":
        # set up an ablation study experiment
        # Ablation never early-stops trials.
        self.earlystop_check = NoStoppingRule.earlystop_check

        ablation_study = kwargs.get("ablation_study")
        if isinstance(ablation_study, AblationStudy):
            self.ablation_study = ablation_study
        else:
            raise Exception(
                "The experiment's ablation study configuration should be an instance of "
                "maggy.ablation.AblationStudy, "
                "but it is {0} (of type '{1}').".format(
                    str(ablation_study), type(ablation_study).__name__))

        # Ablation experiments must not define a search space.
        searchspace = kwargs.get("searchspace")
        if not searchspace:
            self.searchspace = Searchspace()
        else:
            raise Exception(
                "The experiment's search space should be None for ablation experiments, "
                "but it is {0} (of type '{1}').".format(
                    str(searchspace), type(searchspace).__name__))

        # Ablator: 'loco' by name or a custom AbstractAblator instance.
        ablator = kwargs.get("ablator")
        if isinstance(ablator, str):
            if ablator.lower() == "loco":
                self.ablator = LOCO(ablation_study, self._final_store)
                self.num_trials = self.ablator.get_number_of_trials()
                # Never request more executors than there are trials.
                if self.num_executors > self.num_trials:
                    self.num_executors = self.num_trials
            else:
                raise Exception(
                    "The experiment's ablation study policy should either be a string ('loco') "
                    "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
                    "but it is {0} (of type '{1}').".format(
                        str(ablator), type(ablator).__name__))
        elif isinstance(ablator, AbstractAblator):
            self.ablator = ablator
            print("Custom Ablator initialized. \n")
        else:
            raise Exception(
                "The experiment's ablation study policy should either be a string ('loco') "
                "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
                "but it is {0} (of type '{1}').".format(
                    str(ablator), type(ablator).__name__))

        self.result = {
            "best_val": "n.a.",
            "num_trials": 0,
            "early_stopped": "n.a"
        }
    else:
        raise Exception(
            "Unknown experiment type. "
            "experiment_type should be either 'optimization' or 'ablation', "
            "but it is {0}.".format(str(self.experiment_type)))

    # FINALIZE EXPERIMENT SETUP
    self.server = rpc.Server(self.num_executors)
    # Generate the driver secret once per process and reuse it afterwards.
    if not driver_secret:
        driver_secret = self._generate_secret(
            ExperimentDriver.SECRET_BYTES)
    self._secret = driver_secret
    self.job_start = datetime.now()
    self.executor_logs = ""
    self.maggy_log = ""
    self.log_lock = threading.RLock()
    self.log_file = kwargs.get("log_dir") + "/maggy.log"
    self.log_dir = kwargs.get("log_dir")
    self.exception = None

    # Open File desc for HDFS to log
    if not hopshdfs.exists(self.log_file):
        hopshdfs.dump("", self.log_file)
    self.fd = hopshdfs.open_file(self.log_file, flags="w")
def open_file(self, hdfs_path, project=None, flags="r", buff_size=0):
    """Open a file on HDFS and return its file handle.

    Thin delegation to hopshdfs.open_file with identical parameters.
    """
    handle = hopshdfs.open_file(
        hdfs_path,
        project=project,
        flags=flags,
        buff_size=buff_size,
    )
    return handle