def __init__(self, config_dict=None, config_path=None):

    assert config_dict is None or config_path is None

    # If specified, load a config from the given JSON file. A custom extension to the JSON spec
    # permits lines to be commented out using a '#' character, for ease of testing.
    if config_path is not None:
        with open(config_path, 'r') as f:
            lines = f.readlines()
            for i, line in enumerate(lines):
                if '#' in line:
                    lines[i] = line[:line.index('#')]
            config_dict = json.loads(''.join(lines))

    config_dict = config_dict if config_dict is not None else {}

    # Update the default values using the supplied configuration dict
    if not isinstance(config_dict, dict):
        raise ConfigurationError("configuration must be a dictionary of keyword/value pairs")

    # check all the configurations
    for k, v in config_dict.items():
        print("checking for attribute {}..".format(k))
        if not hasattr(self, k):
            raise ConfigurationError(
                "Unexpected configuration keyword provided - {}:{}".format(k, v))
        setattr(self, k, v)

    # ------------ check against schema --------------
    ConfigFormat().validate_json(config_dict)
    # -------------------------------------------------

    # TODO: #nodes does not pass through the model (set by kronos_executor config for now..)
    if self.model:
        if self.model.get('schedule_generation'):
            self.model['schedule_generation']['synthapp_n_nodes'] = 1
    # -------------------------------------------------

    # if input or output folders do not exist, an error is raised
    if not os.path.exists(self.dir_input):
        raise ConfigurationError(
            "input folder {} does not exist!".format(self.dir_input))

    if not os.path.exists(self.dir_output):
        raise ConfigurationError(
            "output folder {} does not exist!".format(self.dir_output))

    # ----------------- logging setup --------------------
    root_logger = logging.getLogger()
    fh = logging.FileHandler(self.kronos_log_file, mode='w')
    fh.setFormatter(logging.Formatter(log_msg_format))
    fh.setLevel(logging.DEBUG if self.verbose else logging.INFO)
    root_logger.addHandler(fh)
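# Illustrative configuration (a sketch, not part of the original source): the loader above
# accepts JSON in which anything after a '#' on a line is stripped before parsing. The keys
# shown are the attributes referenced in __init__ ("dir_input", "dir_output", "kronos_log_file",
# "verbose"); the paths and file names are invented for the example.
#
#   {
#       "dir_input": "/path/to/input",      # must already exist
#       "dir_output": "/path/to/output",    # must already exist
#       "kronos_log_file": "kronos.log",
#       "verbose": true
#   }
#
#   # hypothetical call, assuming this class is the Config used elsewhere in the module:
#   # config = Config(config_path="config.json")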
def ingest_accounting_logs(path, cfg=None):
    """
    Read PBS logs into a dataset
    """
    if not os.path.exists(path):
        raise ConfigurationError(
            "Specified path to ingest accounting profiles does not exist: {}".format(path))

    if not os.path.isfile(path):
        raise ConfigurationError(
            "Specified path for accounting log is not a file")

    jobs = read_accounting_logs(path)

    return PBSDataSet(jobs)
def ingest_epcc_csv_logs(path, cfg=None):
    """
    Read EPCC CSV logs into a dataset
    """
    if not os.path.exists(path):
        raise ConfigurationError(
            "Specified path to ingest CSV profiles does not exist: {}".format(path))

    if not os.path.isfile(path):
        raise ConfigurationError(
            "Specified path for CSV time_schedule is not a file")

    jobs = read_epcc_csv_logs(path)

    return PBSDataSet(jobs)
def step_function(function_config):
    """
    Function that defines a step
    :param function_config:
    :return:
    """

    required_config_fields = [
        'x_step',
    ]

    # check that all the required fields are set
    for req_item in required_config_fields:
        if req_item not in function_config.keys():
            raise ConfigurationError(
                "'step_function' requires the field {}".format(req_item))

    # x of the step (between 0 and 1)
    x_step = function_config['x_step']

    # default number of x points
    n_values = 6
    eps = 1.0e-6

    # then add two very close points at the step location
    x_values = np.sort(
        np.append(np.linspace(0, 1, n_values), [x_step, x_step + eps]))

    y_values = np.array([float(cc) for cc in np.sign(x_values - x_step) > 0])

    return x_values, y_values
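# Illustrative usage (a minimal sketch, not part of the original source): step_function
# returns the (x, y) samples of a unit step placed at 'x_step', with two nearly coincident
# points straddling the step so the discontinuity is preserved.
#
#   x, y = step_function({'x_step': 0.5})
#   # x -> [0.0, 0.2, 0.4, 0.5, 0.500001, 0.6, 0.8, 1.0]
#   # y -> [0.0, 0.0, 0.0, 0.0, 1.0,      1.0, 1.0, 1.0]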
def ingest_allinea_profiles(path,
                            cfg=None,
                            jobs_n_bins=None,
                            list_json_files=None,
                            json_label_map=None):
    """
    Read Allinea profiles into a dataset
    """
    if not os.path.exists(path):
        raise ConfigurationError(
            "Specified path to ingest Allinea profiles does not exist: {}".format(path))

    if not list_json_files:
        if os.path.isdir(path):
            jobs = read_allinea_logs(path, cfg=cfg, jobs_n_bins=jobs_n_bins)
        else:
            jobs = [read_allinea_log(path, cfg=cfg, jobs_n_bins=jobs_n_bins)]
    else:
        jobs = read_allinea_logs(path,
                                 cfg=cfg,
                                 jobs_n_bins=jobs_n_bins,
                                 list_json_files=list_json_files)

    if not jobs:
        raise RuntimeError("No Allinea profile files found")

    return AllineaDataSet(jobs, json_label_map=json_label_map)
def ckeck_config(self):
    """
    Check the configuration
    :return:
    """

    # check the json data against the post-processing config schema
    ExportConfigFormat().validate_json(self._config_dict)

    # check all the configurations
    for k, v in self._config_dict.items():
        if not hasattr(self, k):
            raise ConfigurationError("Unexpected configuration keyword provided - {}:{}".format(k, v))

        # if OK, set this attribute..
        setattr(self, k, v)

    # take the timestamp to be used to archive run folders (if existing)
    out_dir = self._config_dict["output_path"]
    if os.path.exists(out_dir):
        time_stamp_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        time_stamped_outdir = out_dir + "." + time_stamp_now
        print("Dir: {} already exists!\n..moving it to: {}".format(out_dir, time_stamped_outdir))
        os.rename(out_dir, time_stamped_outdir)
def bin_array(t, data, bins_in, mode="sum"):
    """
    Function that returns a binned array (elements that fall within a bin
    can be either summed or averaged)
    :param t:
    :param data:
    :param bins_in: [integer]: bins span the t vector
                    [numpy.ndarray]: bins fully specified
    :param mode: "sum" or "mean"
    :return:
    """

    eps = 1e-8
    t = np.asarray(t)
    data = np.asarray(data)

    if isinstance(bins_in, int):
        bins = np.linspace(min(t) - eps, max(t) + eps, bins_in)
        t_bins = (bins[1:] + bins[:-1]) / 2.0
    elif isinstance(bins_in, np.ndarray):
        bins = bins_in
        bins[0] -= eps
        bins[-1] += eps
        t_bins = (bins[1:] + bins[:-1]) / 2.0
    else:
        raise ConfigurationError("bins must be either an integer or a numpy array!")

    digitized = np.digitize(t, bins)

    # method "sum"
    if mode == "sum":
        bin_values = np.asarray([data[digitized == i].sum() if data[digitized == i].size else 0
                                 for i in range(1, len(bins))])

        # just a check (the binned total should match the original total in either direction)
        if abs(sum(data) - sum(bin_values)) > 1e-10:
            print("different sum! orig: {}, binned: {}".format(sum(data), sum(bin_values)))

    # method "mean"
    elif mode == "mean":
        bin_values = np.asarray([data[digitized == i].mean() if data[digitized == i].size else 0
                                 for i in range(1, len(bins))])

    else:
        raise ConfigurationError("mode must be either sum or mean!")

    return t_bins, bin_values
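# Illustrative usage (a minimal sketch, not part of the original module): rebin a short
# constant series onto a coarser grid, once preserving the total ("sum") and once averaging
# the samples that fall in each bin ("mean"). The helper name is hypothetical.
def _example_bin_array():
    import numpy as np
    t = np.arange(10)                                       # 0..9
    data = np.ones(10)                                      # constant signal
    centres, totals = bin_array(t, data, 4, mode="sum")     # totals add up to 10.0
    centres, means = bin_array(t, data, 4, mode="mean")     # each mean equals 1.0
    return centres, totals, means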
def check_export_config(self, export_config, out_path, **kwargs):

    # create output dir if it does not exist..
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    # check that export type is consistent with the class type
    if export_config["type"] != self.export_type:
        raise ConfigurationError("Export type {} does not match class: {}".format(export_config["type"],
                                                                                   self.__class__.__name__))

    if not self.optional_configs and kwargs:
        raise ConfigurationError("Class: {} does not accept optional config keys!".format(self.__class__.__name__))
    else:
        if not all(k in self.optional_configs for k in kwargs.keys()):
            for k in kwargs.keys():
                if k not in self.optional_configs:
                    print("Class: {} incompatible with config {}".format(self.__class__.__name__, k))
            raise ConfigurationError("Class: {} received unexpected optional config keys".format(self.__class__.__name__))
def check_config(self, config):
    """
    Make sure that all the required params are set in the config
    :return:
    """

    # check that all the required fields are set
    for req_item in self.required_config_fields:
        if req_item not in config.keys():
            err = "{} requires the config field {}".format(self.__class__.__name__, req_item)
            raise ConfigurationError(err)
def __init__(self, workload_set, config):

    assert all(isinstance(wl, Workload) for wl in workload_set.workloads)
    assert isinstance(config, Config)

    # check that there is the "model" entry in the config file..
    if not config.model:
        raise ConfigurationError(
            "'model' entry not set in config file, but required!")

    self.config = config
    self.workload_set = workload_set

    # check that all the required fields are set
    for req_item in self.required_config_fields:
        if req_item not in self.config.model.keys():
            raise ConfigurationError("{} requires the field {}".format(
                self.__class__.__name__, req_item))
def _apply(self, config):

    # Apply each source workload into each destination workload
    n_job_matched = 0
    n_destination_jobs = 0

    for wl_source_tag in config['source_workloads']:

        try:
            wl_source = next(wl for wl in self.workloads if wl.tag == wl_source_tag)
        except StopIteration:
            raise ConfigurationError(
                "Source Workload {} not found".format(wl_source_tag))

        for wl_dest_tag in config['apply_to']:

            try:
                wl_dest = next(wl for wl in self.workloads if wl.tag == wl_dest_tag)
            except StopIteration:
                raise ConfigurationError(
                    "Destination Workload {} not found".format(wl_dest_tag))

            n_destination_jobs += len(wl_dest.jobs)

            n_job_matched += self.apply_lookup_table(
                wl_source,
                wl_dest,
                config['similarity_threshold'],
                config['priority'],
                config['keywords'],
            )

    logger.info("jobs matched/destination jobs = [{}/{}]".format(
        n_job_matched, n_destination_jobs))
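# Illustrative configuration (a sketch, not part of the original source): the keys below are
# exactly the ones _apply reads; the workload tags and keyword names are invented for the
# example.
#
#   lookup_config = {
#       "source_workloads": ["accounting_logs"],
#       "apply_to": ["profiled_jobs"],
#       "similarity_threshold": 0.7,
#       "priority": 2,
#       "keywords": ["label", "job_name"],
#   }
#
#   # hypothetical call from within the owning class:
#   # self._apply(lookup_config)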
def split_by_keywords(workload, split_config_output):
    """
    Auxiliary internal splitting function
    :param workload:
    :param split_config_output:
    :return:
    """

    # Extract configurations for the splitting
    new_wl_name = split_config_output['create_workload']
    split_attr = split_config_output['split_by']
    kw_include = split_config_output['keywords_in']
    kw_exclude = split_config_output['keywords_out']

    sub_wl_jobs = []
    if kw_include and not kw_exclude:
        for j in workload.jobs:
            if getattr(j, split_attr):
                if all(kw in getattr(j, split_attr) for kw in kw_include):
                    sub_wl_jobs.append(j)

    elif not kw_include and kw_exclude:
        for j in workload.jobs:
            if getattr(j, split_attr):
                if not any(kw in getattr(j, split_attr) for kw in kw_exclude):
                    sub_wl_jobs.append(j)

    elif kw_include and kw_exclude:
        sub_wl_jobs = [
            j for j in workload.jobs
            if all(kw in getattr(j, split_attr) for kw in kw_include)
            and not any(kw in getattr(j, split_attr) for kw in kw_exclude)
        ]

    else:
        raise ConfigurationError(
            "either included or excluded "
            "keywords are needed for splitting a workload")

    if not sub_wl_jobs:
        logger.error("Workload splitting has produced an empty workload!")

    return Workload(jobs=sub_wl_jobs, tag=new_wl_name)
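# Illustrative configuration (a sketch, not part of the original source): the keys below are
# the ones split_by_keywords reads; the attribute and keyword values are invented. Jobs whose
# 'label' contains every entry of keywords_in and none of keywords_out end up in the new
# workload.
#
#   split_config = {
#       "create_workload": "postproc_jobs",
#       "split_by": "label",
#       "keywords_in": ["postproc"],
#       "keywords_out": ["test"],
#   }
#
#   # sub_workload = split_by_keywords(full_workload, split_config)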
def custom_function(function_config):
    """
    Function that defines a custom distribution of values
    :param function_config:
    :return:
    """

    required_config_fields = ['x_values', 'y_values']

    # check that all the required fields are set
    for req_item in required_config_fields:
        if req_item not in function_config.keys():
            raise ConfigurationError(
                "'custom_function' requires the field {}".format(req_item))

    # user-supplied x and y values of the distribution
    x_values = np.array(function_config['x_values'])
    y_values = np.array(function_config['y_values'])

    return x_values, y_values
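# Illustrative usage (a minimal sketch, not part of the original source): a custom ramp
# distribution defined point by point.
#
#   x, y = custom_function({'x_values': [0.0, 0.5, 1.0],
#                           'y_values': [0.0, 0.2, 1.0]})
#   # x -> array([0. , 0.5, 1. ]), y -> array([0. , 0.2, 1. ])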
def __init__(self, path, recursive=None, file_pattern=None, label_method=None, pool_readers=None):
    self.path = path
    print("Log reader ({})".format(self.log_type_name))

    self.label_method = label_method if label_method is not None else self.label_method
    self.recursive = recursive if recursive is not None else self.recursive
    self.pool_readers = pool_readers if pool_readers is not None else self.pool_readers

    # Some checks
    if self.label_method not in self.available_label_methods:
        raise ConfigurationError(
            "Configuring LogReader with unavailable label method ({})".format(label_method))

    # Only override the file pattern if it is supplied.
    if file_pattern:
        self.file_pattern = file_pattern
def from_logs_path(cls, ingest_path, ingest_config):
    """
    This method should construct a log reader, read the logs and return an IngestedDataSet.
    If the logs are cached, then those should be read in instead.
    """
    abs_ingest_path = os.path.abspath(os.path.realpath(ingest_path))
    # n.b. b64encode operates on bytes, so encode/decode around it to build the cache file name
    cache_file = "cache.{}".format(base64.b64encode(abs_ingest_path.encode()).decode())

    dataset = None

    # Remove reparse from the dictionary, so it is never used to compare validity of cached files.
    print(ingest_config)
    reparse = ingest_config.pop('reparse', False)
    cache = ingest_config.pop('cache', True)

    if not reparse:
        try:
            with open(cache_file, 'rb') as f:
                print("Using cached data from: {}".format(f.name))
                dataset = pickle.load(f)
        except (IOError, OSError) as e:
            if e.errno == errno.ENOENT:
                print("No cache file found for ingest path")
            else:
                # An actual file read error occurred. Throw back to the user.
                raise

    if dataset:
        if dataset.ingest_config != ingest_config:
            logger.info("Log reader configuration doesn't match cache file")
            logger.info("Reader: {}".format(ingest_config))
            logger.info("Cached: {}".format(dataset.ingest_config))
            logger.info("Please modify configuration, or delete cache file and try again")
            raise ConfigurationError("Log reader configuration doesn't match cache file")

        if os.path.abspath(os.path.realpath(dataset.ingest_path)) != abs_ingest_path:
            raise ConfigurationError("Ingestion path in cache file does not match ingestion path")

    if dataset is None:
        # Finally read the logs, if that is required
        lr = cls.log_reader_class(ingest_path, **ingest_config)
        dataset = cls(lr.read_logs(), ingest_path, ingest_config)

        # Pickle the object for later rapid loading.
        if cache:
            print("Writing cache file: {}".format(cache_file))
            with open(cache_file, "wb") as f:
                pickle.dump(dataset, f)

    return dataset
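# Illustrative usage (a sketch, not part of the original source): the 'reparse' and 'cache'
# keys are consumed here, and everything else is forwarded to the log reader class. The
# 'recursive' keyword matches the LogReader constructor above; the dataset class and path
# names are invented for the example.
#
#   ingest_config = {
#       "reparse": False,    # force re-reading the raw logs when True
#       "cache": True,       # write a pickle cache file for later rapid loading
#       "recursive": True,   # forwarded to the log reader
#   }
#   # dataset = SomeIngestedDataSet.from_logs_path("/path/to/logs", ingest_config)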
def apply_lookup_table(self, look_up_wl, wl_dest, threshold, priority, match_keywords):
    """
    Uses another workload as lookup table to fill missing job information
    :param look_up_wl:
    :param wl_dest:
    :param threshold:
    :param priority:
    :param match_keywords:
    :return:
    """
    logger.info('Applying look up from workload: {} onto workload: {}'.format(
        look_up_wl.tag, wl_dest.tag))

    assert isinstance(look_up_wl, Workload)
    assert isinstance(threshold, float)
    assert isinstance(priority, int)
    assert isinstance(match_keywords, list)

    n_jobs_replaced = 0

    # apply matching logic (if threshold < 1.0 - so an exact match is not sought)
    n_print = 10

    if threshold < 1.0:
        for jj, job in enumerate(wl_dest.jobs):

            pc_scanned = progress_percentage(jj, len(wl_dest.jobs), n_print)
            if pc_scanned > 0:
                print("Scanned {}% of source jobs".format(pc_scanned))

            for lu_job in look_up_wl.jobs:

                # in case of multiple keys, consider the average matching ratio
                current_match = 0
                for kw in match_keywords:
                    if getattr(job, kw) and getattr(lu_job, kw):
                        current_match += SequenceMatcher(lambda x: x in "-_",
                                                         str(getattr(job, kw)),
                                                         str(getattr(lu_job, kw))).ratio()
                current_match /= float(len(match_keywords))

                # ---------------------------------------------------------------
                if current_match >= threshold:
                    n_jobs_replaced += 1
                    for tsk in job.timesignals.keys():

                        # if the time series happens to be empty, apply the cross-over
                        if not job.timesignals[tsk] and lu_job.timesignals[tsk]:
                            logger.warning("job {} has empty time series {}!".format(job.label, tsk))
                            job.timesignals[tsk] = copy.deepcopy(lu_job.timesignals[tsk])

                        # if there is no priority associated to the target job, do the cross-over
                        elif not job.timesignals[tsk].priority and lu_job.timesignals[tsk]:
                            job.timesignals[tsk] = copy.deepcopy(lu_job.timesignals[tsk])

                        # if there is a priority associated to the target job, but it does not
                        # exceed the given priority, do the cross-over
                        elif job.timesignals[tsk].priority <= priority and lu_job.timesignals[tsk]:
                            job.timesignals[tsk] = copy.deepcopy(lu_job.timesignals[tsk])

    # compare directly (much faster..)
    elif threshold == 1:
        for jj, job in enumerate(wl_dest.jobs):

            pc_scanned = progress_percentage(jj, len(wl_dest.jobs), n_print)
            if pc_scanned > 0:
                print("Scanned {}% of source jobs".format(pc_scanned))

            for lu_job in look_up_wl.jobs:
                if all(getattr(job, kw) == getattr(lu_job, kw) for kw in match_keywords):
                    n_jobs_replaced += 1
                    for tsk in job.timesignals.keys():
                        if not job.timesignals[tsk]:
                            job.timesignals[tsk] = lu_job.timesignals[tsk]
                        elif job.timesignals[tsk].priority <= priority and lu_job.timesignals[tsk]:
                            job.timesignals[tsk] = lu_job.timesignals[tsk]
                        else:
                            pass

    else:
        raise ConfigurationError(
            "matching threshold should be in [0,1], provided {} instead".format(threshold))

    return n_jobs_replaced
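# Illustrative check of the fuzzy-matching metric used above (a minimal sketch, not part of
# the original source): SequenceMatcher, with '-' and '_' flagged as junk characters, yields
# a similarity ratio in [0, 1]; apply_lookup_table averages this ratio over the matching
# keywords and compares it against the threshold. The labels below are invented.
#
#   from difflib import SequenceMatcher
#   ratio = SequenceMatcher(lambda x: x in "-_", "exp_fc_0001", "exp_fc_0002").ratio()
#   # the closer the two labels, the closer the ratio is to 1.0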