def test_merge(self):
    """
    Check that several ModelJobs can be merged into one.

    n.b. Only the time signals are merged for now.
    TODO: Merge the non-time-signal data.
    """
    # n.b. the values are deliberately non-zero: zero time signals are ignored.
    read_low = TimeSignal.from_values('kb_read', [0.0], [1.0], priority=8)
    read_high = TimeSignal.from_values('kb_read', [0.0], [1.0], priority=10)
    write_low = TimeSignal.from_values('kb_write', [0.0], [1.0], priority=8)

    # Merging has to yield the union of the time series of both jobs
    target = ModelJob(label="label1", timesignals={'kb_read': read_low})
    donor = ModelJob(label="label1", timesignals={'kb_write': write_low})
    target.merge(donor)

    self.assertEqual(len(target.timesignals), len(signal_types))
    self.assertEqual(target.timesignals['kb_read'], read_low)
    self.assertEqual(target.timesignals['kb_write'], write_low)

    # (every signal that neither job provided is still None)
    merged_names = ('kb_read', 'kb_write')
    for name in signal_types:
        if name not in merged_names:
            self.assertIn(name, target.timesignals)
            self.assertIsNone(target.timesignals[name])

    # When both jobs carry the same signal, the higher priority one is kept
    target = ModelJob(label="label1", timesignals={'kb_read': read_low})
    donor = ModelJob(label="label1", timesignals={'kb_read': read_high})
    target.merge(donor)
    self.assertEqual(target.timesignals['kb_read'], read_high)
def vector_to_ts(self, ts_vector, priority, idx_map=None, n_bins=None):
    """
    Apply a vector of values to the time signals of this job.

    :param ts_vector: flat vector of y-values. Without idx_map it is split
        into len(time_signal_names) equal parts, one per time signal.
    :param priority: priority given to the generated time signals. An
        existing signal is replaced only if its priority is not higher.
    :param idx_map: optional lookup from a position in ts_vector to the
        position in the full-size vector (for partially filled vectors)
    :param n_bins: number of bins per time signal; must be given together
        with idx_map (and only then)
    :return: None
    :raises ConfigurationError: if idx_map and n_bins are not supplied
        together, or if the job has no start time or duration
    """
    # a partial vector needs n_bins to rebuild the full-size one, and vice versa
    if idx_map and not n_bins:
        raise ConfigurationError(
            "the RS results are mapped to N_columns < N_columns_tot => n_bins is needed!"
        )
    if not idx_map and n_bins:
        raise ConfigurationError(
            "n_bins need to be specified only for mapped cases")

    if self.time_start is None:
        raise ConfigurationError(
            "job start-time is needed to accept time-series")
    if self.duration is None:
        raise ConfigurationError(
            "job duration is needed to accept time-series")

    if idx_map:
        # Only some columns are present: scatter them into a zero-filled
        # full-size row, then process that row as the unmapped case.
        full_row = np.zeros(len(time_signal_names) * n_bins)
        for position, value in enumerate(ts_vector):
            full_row[idx_map[position]] = value
        self.vector_to_ts(full_row, priority)
        return

    # Every element is present: one equally sized chunk per time signal
    chunks = np.split(ts_vector, len(time_signal_names))
    for ts_name, y_values in zip(time_signal_names, chunks):
        x_values = np.linspace(0.0, self.duration, len(y_values))
        current = self.timesignals[ts_name]
        if not current or current.priority <= priority:
            self.timesignals[ts_name] = TimeSignal(ts_name).from_values(
                ts_name, x_values, y_values, priority=priority)
def from_random_proto_signals(job_name=None, ts_scales=None, ts_len=10):
    """
    Build a job whose time signals are drawn at random from the signal
    prototypes returned by UserGeneratedJob.proto_signals.

    :param job_name: name handed to the generated job
    :param ts_scales: optional dict (time signal name -> factor) used to
        scale the y-values; when non-empty it is indexed for every name
        in time_signal_names
    :param ts_len: length passed to proto_signals and used for the x-values
    :return: a UserGeneratedJob
    """
    scales = ts_scales or {}
    prototypes = UserGeneratedJob.proto_signals(ts_len=ts_len)

    timesignals = {}
    for ts_name in time_signal_names:
        # pick one of the prototypes at random for this signal
        picked = np.random.randint(len(prototypes))
        yvalues = np.asarray(prototypes[picked])
        if scales:
            yvalues = yvalues * scales[ts_name]

        timesignals[ts_name] = TimeSignal.from_values(
            ts_name, np.arange(ts_len), yvalues, priority=10)

    return UserGeneratedJob(name=job_name,
                            timesignals=timesignals,
                            ts_scales=scales)
def test_is_valid(self):
    """
    There are some things that are required. Check these things here!

    An empty job must be invalid, a job with the complete argument set must
    be valid, and removing any single argument must make it invalid again.
    """
    self.assertFalse(ModelJob().is_valid())

    # If all of the required arguments are supplied, this should result in a valid job
    ts_complete_set = {
        tsk: TimeSignal.from_values(tsk, [0., 0.1], [1., 999.])
        for tsk in time_signal_names
    }

    valid_args = {
        'time_start': 0,
        'duration': 0.2,
        'ncpus': 1,
        'nnodes': 1,
        'timesignals': ts_complete_set
    }

    self.assertTrue(ModelJob(**valid_args).is_valid())

    # If any of the supplied arguments are missing, this should invalidate things.
    # The job is built from the reduced argument set: checking valid_args
    # again would only repeat the assertion above.
    for missing in valid_args:
        invalid_args = {
            key: value for key, value in valid_args.items() if key != missing
        }
        with self.subTest(missing=missing):
            self.assertFalse(ModelJob(**invalid_args).is_valid())
def total_metrics_timesignals(self):
    """
    Build, for each signal type, one time signal gathering the data of all
    the jobs in self.jobs.

    The x-values of every job are shifted by that job's time_start before
    being concatenated. Signal types for which a ValueError is raised while
    concatenating or building the signal are left out of the result.

    :return: dict mapping signal name -> TimeSignal
    """
    totals = {}

    for signal_name in signal_types:
        try:
            contributing = [
                job for job in self.jobs
                if job.timesignals[signal_name] is not None
            ]

            times_vec = np.concatenate([
                job.timesignals[signal_name].xvalues + job.time_start
                for job in contributing
            ])
            data_vec = np.concatenate([
                job.timesignals[signal_name].yvalues for job in contributing
            ])

            totals[signal_name] = TimeSignal.from_values(
                signal_name, times_vec, data_vec,
                base_signal_name=signal_name)

        except ValueError:
            # np.concatenate refuses an empty list, i.e. no job has a time
            # series for this signal: nothing to report for it.
            continue

    return totals
def test_merge_ignores_empty_timesignals(self):
    """
    A time signal of the "other" job that carries no data must be ignored
    when merging.

    :return:
    """
    with_data = TimeSignal.from_values('kb_read', [0.0], [1.0])
    without_data = TimeSignal.from_values('kb_write', [0.0], [0.0])  # n.b. zero data

    target = ModelJob(label="label1", timesignals={'kb_read': with_data})
    donor = ModelJob(label="label1", timesignals={'kb_write': without_data})

    # before merging: only the donor holds a kb_write signal
    self.assertIsNone(target.timesignals['kb_write'])
    self.assertIsNotNone(donor.timesignals['kb_write'])

    target.merge(donor)

    # after merging: the all-zero signal has not been copied over
    self.assertIsNone(target.timesignals['kb_write'])
def test_reject_mislabelled_time_signals(self):
    """
    Constructing a ModelJob must fail when a time signal is stored under a
    key that differs from the name the signal was created with.
    """
    with self.assertRaises(ModellingError):
        # a 'kb_read' signal filed under 'kb_write'
        ModelJob(timesignals={
            'kb_write': TimeSignal.from_values('kb_read', [0.0], [0.0]),
        })
def test_initialisation(self):
    """
    Check the defaults of a ModelJob and how constructor arguments are handled.
    """
    # A bare job exposes every scalar field and every time signal as None
    blank = ModelJob()
    for attr in ('time_start', 'ncpus', 'nnodes', 'duration', 'label'):
        self.assertTrue(hasattr(blank, attr))
        self.assertIsNone(getattr(blank, attr))

    for ts_name in signal_types:
        self.assertIn(ts_name, blank.timesignals)
        self.assertIsNone(blank.timesignals[ts_name])

    # Fields given to the constructor override the defaults
    overrides = {
        'time_start': 123,
        'ncpus': 4,
        'nnodes': 5,
        'duration': 678,
        'label': "a-label",
    }
    job = ModelJob(timesignals={
        'kb_read': TimeSignal.from_values('kb_read', [0.0], [0.0]),
        'kb_write': TimeSignal.from_values('kb_write', [0.0], [0.0]),
    }, **overrides)

    for field, expected in overrides.items():
        self.assertEqual(getattr(job, field), expected)

    supplied = ('kb_read', 'kb_write')
    for ts_name in supplied:
        self.assertIsInstance(job.timesignals[ts_name], TimeSignal)

    # ... while the signals that were not supplied stay None
    for ts_name in signal_types:
        if ts_name not in supplied:
            self.assertIn(ts_name, job.timesignals)
            self.assertIsNone(job.timesignals[ts_name])

    # Unknown fields are refused
    with self.assertRaises(ModellingError):
        ModelJob(invalid=123)
def test_merge_rejects_mislabelled_time_signals(self):
    """
    merge() must check that the time signals are filed under their own
    name, whichever of the two jobs holds the offending signal.

    :return:
    """
    good_signal = TimeSignal.from_values('kb_read', [0.0], [1.0])
    bad_signal = TimeSignal.from_values('kb_read', [0.0], [1.0])  # n.b. mislabelled

    # The mislabelled signal sits in the job being merged in.
    # It is injected after construction, bypassing the constructor.
    receiver = ModelJob(label="label1", timesignals={'kb_read': good_signal})
    incoming = ModelJob(label="label1")
    incoming.timesignals['kb_write'] = bad_signal

    with self.assertRaises(ModellingError):
        receiver.merge(incoming)

    # And the other way around: the receiving job holds the mislabelled signal
    incoming = ModelJob(label="label1", timesignals={'kb_read': good_signal})
    receiver = ModelJob(label="label1")
    receiver.timesignals['kb_write'] = bad_signal

    with self.assertRaises(ModellingError):
        receiver.merge(incoming)
def from_json(js):
    """
    (Re)animate a ModelJob from json extracted from a KProfile
    (kronos.io_formats.profile_format.ProfileFormat)

    :param js: dict. The optional 'time_series' entry maps a signal name to
        a dict with 'times', 'values' and 'priority'. Every other entry
        whose key is an attribute of ModelJob and whose value is not None
        is forwarded to the ModelJob constructor.
    :return: a ModelJob
    """
    timesignals = {}
    for name, series in js.get('time_series', {}).items():
        timesignals[name] = TimeSignal.from_values(
            name,
            xvals=series['times'],
            yvals=series['values'],
            priority=series['priority'],
            base_signal_name=name)

    # keep only the entries ModelJob knows about and that carry a value
    job_fields = {}
    for key, value in js.items():
        if hasattr(ModelJob, key) and value is not None:
            job_fields[key] = value

    return ModelJob(timesignals=timesignals, **job_fields)
def read_allinea_log(filename, jobs_n_bins=None, cfg=None):
    """
    Collect info from Allinea logs

    :param filename: path of the Allinea MAP profile (a JSON file)
    :param jobs_n_bins: not used by this function (kept so that existing
        callers keep working)
    :param cfg: optional dict; its "clock_rate" entry is used to estimate
        the flops. If missing or falsy, 2.5e9 is used instead.
    :return: an IngestedJob filled with the job information and time signals
    :raises ValueError: if the profile timestamp matches none of the
        supported formats
    """

    # The time signal map has a number of options for each element in the profile:
    #
    # 'name':            What is the name of this signal mapped into Kronos-land
    #                    (i.e. mapping onto time_signal.signal_types)
    # 'scale_factor':    Factor applied to the mean value of each sample (default 1.0)
    # 'is_rate':         True if the data is recorded as x-per-second rates, rather than
    #                    accumulatable values. (default False)
    # 'per_task':        Is the value presented per-task, or global. If per-task it needs
    #                    to be multiplied up. (default False)
    # 'is_time_percent': True if the value is a percentage of the sampling interval.
    #                    (default False)

    logger.info(
        "NOTE: FLOPS not available for allinea Dataset: it will be estimated from %CPU and clock rate"
    )

    # use the clock rate from the config if there is one, otherwise fall back
    clock_rate = cfg.get("clock_rate", None) if cfg else None
    if not clock_rate:
        logger.info(
            "WARNING: clock rate not provided! arbitrarily set to 2.5GHz")
        clock_rate = 2.5e9

    # read the data of the json file..
    with open(filename) as json_file:
        json_data = json.load(json_file)

    profile = json_data['profile']
    samples = profile['samples']

    # Detect the proper io_keys (lustre or not) as they have a different name in the MAP logs
    if samples.get("lustre_bytes_read") and samples.get("lustre_bytes_written"):
        io_key_write = "lustre_bytes_written"
        io_key_read = "lustre_bytes_read"
    elif samples.get("bytes_read") and samples.get("bytes_written"):
        io_key_write = "bytes_written"
        io_key_read = "bytes_read"
    else:
        print(
            "The allinea map file does not seem to contain IO traces: i.e. [lustre_]bytes_[written|read]"
        )
        sys.exit(1)

    allinea_time_signal_map = {
        'instr_fp': {
            'name': 'flops',
            'scale_factor': clock_rate,
            'is_time_percent': True
        },
        io_key_read: {
            'name': 'kb_read',
            'is_rate': True,
            'scale_factor': 1. / 1024.
        },
        io_key_write: {
            'name': 'kb_write',
            'is_rate': True,
            'scale_factor': 1. / 1024.
        },
        'mpi_p2p': {
            'name': 'n_pairwise',
            'is_rate': True
        },
        'mpi_p2p_bytes': {
            'name': 'kb_pairwise',
            'is_rate': True,
            'scale_factor': 1. / 1024.
        },
        'mpi_collect': {
            'name': 'n_collective',
            'is_rate': True
        },
        'mpi_collect_bytes': {
            'name': 'kb_collective',
            'is_rate': True,
            'scale_factor': 1. / 1024.
        }
    }

    # A quick sanity check
    for value in allinea_time_signal_map.values():
        assert value['name'] in signal_types

    # fill in the workload structure
    i_job = IngestedJob()

    time_start = profile['timestamp']
    runtime = float(profile['runtime_ms']) / 1000.

    # NOTE(review): strptime yields a naive datetime and .timestamp() reads a
    # naive datetime as local time; the "+00" format suggests the logs may be
    # UTC - confirm which one is intended.
    try_formats = [
        "%a %b %d %H:%M:%S %Y", "%Y-%m-%dT%H:%M:%S+00", "%Y-%m-%dT%H:%M:%S"
    ]
    time_start_epoch = None
    for fmt in try_formats:
        try:
            time_start_epoch = datetime.strptime(time_start, fmt).timestamp()
            break
        except ValueError:
            continue

    if time_start_epoch is None:
        # report the string that could not be parsed (time_start_epoch is
        # always None at this point)
        raise ValueError(f"cannot parse timestamp {time_start!r}")

    # this job might not necessarily been queued
    i_job.time_created = time_start_epoch - 3
    i_job.time_queued = time_start_epoch - 2
    i_job.time_eligible = time_start_epoch - 1
    i_job.time_start = time_start_epoch
    i_job.runtime = runtime

    i_job.time_end = time_start_epoch + runtime
    i_job.time_in_queue = i_job.time_start - i_job.time_queued

    # Threads are not considered for now..
    i_job.nnodes = int(profile["nodes"])
    i_job.ncpus = int(profile['targetProcs'])

    # average memory used is taken from sample average of "node_mem_percent"
    # (values inside the blocks are: min, max, mean, var)
    mem_val = [v[2] for v in samples['node_mem_percent']]
    mem_val_mean = sum(mem_val) / float(len(mem_val)) / 100.
    mem_node_kb = profile["memory_per_node"][2] / 1024.
    i_job.memory_kb = mem_node_kb * mem_val_mean

    i_job.cpu_percent = 0
    i_job.jobname = os.path.basename(filename)
    i_job.user = "******"
    i_job.group = ""
    i_job.queue_type = None

    # times relative to start of log
    # profiler jobs are considered as if they were started at T0
    # TODO: find more sensible solution to that..
    i_job.time_start_0 = 0.0

    # Obtain the timestamps for the (end of) each sampling window, converted into seconds.
    sample_times = np.array(profile['sample_times']) / 1000.
    sample_interval = profile['sample_interval'] / 1000.

    for ts_name_allinea, ts_config in allinea_time_signal_map.items():

        scale_factor = ts_config.get('scale_factor', 1.0)

        # The Allinea time-series data is a sequence of tuples of the form: (min, max, mean, variance)
        # Extract the mean value for each sampling interval.
        y_vals = np.array(
            [v[2] * scale_factor for v in samples[ts_name_allinea]])

        # If the data is recorded as a rate (a per-second value), then adjust it to record
        # absolute data volumes per time interval.
        if ts_config.get('is_rate', False):
            y_vals = np.array([
                v * (sample_times[i] - (sample_times[i - 1] if i > 0 else 0))
                for i, v in enumerate(y_vals)
            ])

        if ts_config.get('per_task', False):
            y_vals *= i_job.ncpus

        if ts_config.get('is_time_percent', False):
            y_vals *= sample_interval / 100.

        # special case: flops are estimated by (cpu_percent*fp_percent*FREQ*Dt/100)
        if ts_name_allinea == 'instr_fp':
            y_vals *= np.array(
                [v[2] / 100. for v in samples['cpu_time_percentage']])

        ts = TimeSignal.from_values(
            ts_config['name'],
            sample_times,
            y_vals,
            priority=allinea_signal_priorities[ts_config['name']])

        i_job.append_time_signal(ts)

    return i_job
def test_workload_data(self):
    """
    Check that a Workload keeps the jobs it is given and that its total
    metrics (sums and global time signals) agree with the job data.
    """
    # If all of the required arguments are supplied, this should result in a valid job
    ts_complete_set = {
        tsk: TimeSignal.from_values(tsk, [0., 0.1], [1., 999.])
        for tsk in time_signal_names
    }
    valid_args = {
        'time_start': 0.1,
        'duration': 0.2,
        'ncpus': 1,
        'nnodes': 1,
        'timesignals': ts_complete_set
    }

    # five jobs built from the same arguments, with staggered start times
    input_jobs = [ModelJob(**valid_args) for _ in range(5)]
    for position, job in enumerate(input_jobs):
        job.time_start += position * 0.1

    for job in input_jobs:
        self.assertTrue(job.is_valid())

    test_workload = Workload(jobs=input_jobs, tag='test_wl')

    # -- the jobs held by the workload are the very objects passed in --
    for position, job in enumerate(test_workload.jobs):
        self.assertTrue(job is input_jobs[position])

    # ------------ verify sums of timesignals -------------------
    for ts_name in signal_types:
        expected_sum = sum(
            sum(job.timesignals[ts_name].yvalues) for job in input_jobs)
        self.assertEqual(expected_sum,
                         test_workload.total_metrics_sum_dict[ts_name])

    # ------------ verify global time signals -------------------
    def random_signals():
        # one random 10-point signal per time signal name
        return {
            tsk: TimeSignal.from_values(tsk, np.random.rand(10),
                                        np.random.rand(10))
            for tsk in time_signal_names
        }

    job1 = ModelJob(**{
        'time_start': 0.1,
        'duration': 0.222,
        'ncpus': 1,
        'nnodes': 1,
        'timesignals': random_signals()
    })
    job2 = ModelJob(**{
        'time_start': 0.1,
        'duration': 0.333,
        'ncpus': 1,
        'nnodes': 1,
        'timesignals': random_signals()
    })

    test_workload = Workload(jobs=[job1, job2], tag='wl_2jobs')

    # every (shifted) x-value and every y-value of each job has to show up
    # in the corresponding total signal of the workload
    for job in (job1, job2):
        for ts in signal_types:
            job_signal = job.timesignals[ts]
            for x_value in job_signal.xvalues:
                self.assertTrue(
                    x_value + job.time_start in
                    test_workload.total_metrics_timesignals[ts].xvalues)
            for y_value in job_signal.yvalues:
                self.assertTrue(
                    y_value in
                    test_workload.total_metrics_timesignals[ts].yvalues)
def model_time_series(self):
    """
    Build the time signals of this job from the totals over self.tasks.

    Each signal holds a single value at time 0.0 (there is no time
    dependence). MPI totals are divided by the largest ntasks found among
    the tasks (when there are tasks at all); byte totals are converted to
    KiB. Priorities are read from ipm_signal_priorities.

    :return: dict mapping signal name -> TimeSignal for n_collective,
        kb_collective, n_pairwise, kb_pairwise, kb_read, kb_write, n_read
        and n_write
    """
    tasks = self.tasks

    # n.b. only using the pairwise send data. Recv should be largely a duplicate, but slightly smaller
    #      as MPI_Sendrecv is only being counted under send for now. If we used both send and recv data
    #      from _all_ tasks we would double count the transfers.
    total_mpi_pairwise_count_send = sum(t.mpi_pairwise_count_send for t in tasks)
    total_mpi_pairwise_bytes_send = sum(t.mpi_pairwise_bytes_send for t in tasks)

    total_mpi_collective_count = sum(t.mpi_collective_count for t in tasks)
    total_mpi_collective_bytes = sum(t.mpi_collective_bytes for t in tasks)

    total_read_count = sum(t.read_count for t in tasks)
    total_write_count = sum(t.write_count for t in tasks)
    total_bytes_read = sum(t.bytes_read for t in tasks)
    total_bytes_written = sum(t.bytes_written for t in tasks)

    # divide the totals of MPI ops by the number of nprocs (if specified..)
    ntasks_per_task = [t.ntasks for t in tasks]
    if ntasks_per_task:
        ntasks = float(max(ntasks_per_task))
        total_mpi_collective_count = int(total_mpi_collective_count / ntasks)
        total_mpi_collective_bytes /= ntasks
        total_mpi_pairwise_count_send = int(total_mpi_pairwise_count_send / ntasks)
        total_mpi_pairwise_bytes_send /= ntasks

    # signal name -> the single value it carries
    totals = {
        'n_collective': total_mpi_collective_count,
        'kb_collective': float(total_mpi_collective_bytes) / 1024.0,
        'n_pairwise': total_mpi_pairwise_count_send,
        'kb_pairwise': float(total_mpi_pairwise_bytes_send) / 1024.0,
        'kb_read': float(total_bytes_read) / 1024.0,
        'kb_write': float(total_bytes_written) / 1024.0,
        'n_read': float(total_read_count),
        'n_write': float(total_write_count),
    }

    return {
        name: TimeSignal.from_values(name, [0.0], [value],
                                     priority=ipm_signal_priorities[name])
        for name, value in totals.items()
    }
def model_time_series(self):
    """
    We want to model the time series here.

    Every file in self.file_details that was read (or written) contributes
    one point to the read (or write) signals: its start time, the amount of
    data in KiB or the operation count, and the length of the access.

    When the job duration is known (time_end and time_start both truthy),
    read times that lie beyond it are clamped to it. Note that this
    modifies the file objects in place.

    TODO: Actually introduce time dependence. For now, it only considers totals!

    :return: dict with the entries among 'kb_read', 'kb_write', 'n_read'
        and 'n_write' for which there is data (possibly empty)
    """
    if self.time_end and self.time_start:
        duration = self.time_end - self.time_start
    else:
        duration = None

    # one (start, kb, count, length) tuple per file that was read / written
    reads = []
    writes = []

    for model_file in self.file_details.values():

        if duration:
            # Files that were never read have no read times: comparing None
            # with a number raises TypeError on Python 3, so skip them.
            if (model_file.read_time_start is not None
                    and model_file.read_time_start > duration):
                model_file.read_time_start = duration - 1
            if (model_file.read_time_end is not None
                    and model_file.read_time_end > duration):
                model_file.read_time_end = duration

        if model_file.read_time_start is not None and (
                model_file.read_count != 0 or model_file.bytes_read != 0):
            reads.append(
                (model_file.read_time_start,
                 model_file.bytes_read / 1024.0,
                 model_file.read_count,
                 model_file.read_time_end - model_file.read_time_start))

        if model_file.write_time_start is not None and (
                model_file.write_count != 0 or model_file.bytes_written != 0):
            writes.append(
                (model_file.write_time_start,
                 model_file.bytes_written / 1024.0,
                 model_file.write_count,
                 model_file.write_time_end - model_file.write_time_start))

    # transpose the per-file tuples into per-quantity sequences
    if reads:
        times_read, read_data, read_counts, read_durations = zip(*reads)
    if writes:
        times_write, write_data, write_counts, write_durations = zip(*writes)

    time_series = {}

    if reads:
        time_series['kb_read'] = TimeSignal.from_values(
            'kb_read',
            times_read,
            read_data,
            durations=read_durations,
            priority=darshan_signal_priorities['kb_read'])
    if writes:
        time_series['kb_write'] = TimeSignal.from_values(
            'kb_write',
            times_write,
            write_data,
            durations=write_durations,
            priority=darshan_signal_priorities['kb_write'])
    if reads:
        time_series['n_read'] = TimeSignal.from_values(
            'n_read',
            times_read,
            read_counts,
            durations=read_durations,
            priority=darshan_signal_priorities['n_read'])
    if writes:
        time_series['n_write'] = TimeSignal.from_values(
            'n_write',
            times_write,
            write_counts,
            durations=write_durations,
            priority=darshan_signal_priorities['n_write'])

    return time_series
def test_generator(self):
    """
    Run the "cluster_and_spawn" workload modeller on a few workloads and
    check the shape of the workload set it produces.
    """
    # If all of the required arguments are supplied, this should result in a valid job
    def job_args(peak):
        # complete argument set; all the signals share the same two points
        return {
            'time_start': 0.1,
            'duration': 0.2,
            'ncpus': 1,
            'nnodes': 1,
            'timesignals': {
                tsk: TimeSignal.from_values(tsk, [0., 0.1], [1., peak])
                for tsk in time_signal_names
            }
        }

    valid_args = job_args(999.)
    valid_args_2 = job_args(444.)

    # five labelled jobs, alternating between the two argument sets
    input_jobs = []
    alternating = [valid_args, valid_args_2, valid_args, valid_args_2, valid_args]
    for number, args in enumerate(alternating, start=1):
        job = ModelJob(**args)
        job.label = "job{}".format(number)
        input_jobs.append(job)

    # diversify the time start..
    for position, job in enumerate(input_jobs):
        job.time_start += position * 0.1

    for job in input_jobs:
        self.assertTrue(job.is_valid())

    config_generator = {
        "type": "cluster_and_spawn",
        "job_clustering": {
            "type": "Kmeans",
            "rseed": 0,
            "apply_to": ["test_wl_0"],
            "ok_if_low_rank": True,
            "max_iter": 100,
            "max_num_clusters": 3,
            "delta_num_clusters": 1,
            "num_timesignal_bins": 1,
            "user_does_not_check": True
        },
        "job_submission_strategy": {
            "type": "match_job_pdf_exact",
            "n_bins_for_pdf": 20,
            "submit_rate_factor": 8,
            "total_submit_interval": 60,
            "random_seed": 0
        }
    }

    # three workloads sharing the same jobs; the config applies to the first only
    workloads = [
        Workload(jobs=input_jobs, tag=tag)
        for tag in ('test_wl_0', 'test_wl_1', 'test_wl_2')
    ]

    # select the modeller matching the configured type and run it
    modeller_class = workload_modelling_types[config_generator["type"]]
    workload_modeller = modeller_class(workloads)
    workload_modeller.apply(config_generator)

    # get the newly created set of (modelled) workloads
    workload_set = workload_modeller.get_workload_set()

    # make sure that we are creating only one workload
    self.assertEqual(len(workload_set.workloads), 1)

    # ---- check that all the jobs are correctly formed.. ----
    modelled_jobs = workload_set.workloads[0].jobs

    # each job has a timesignals attribute..
    for job in modelled_jobs:
        self.assertTrue(hasattr(job, "timesignals"))

    # ..and it holds an entry for every time signal name
    for job in modelled_jobs:
        for ts_name in time_signal_names:
            self.assertTrue(ts_name in job.timesignals.keys())
def concatenate_modeljobs(cat_job_label, job_list):
    """
    Interlaces (or concatenates) a list of jobs into one single job.
    The job time series will be interlaced according to their respective
    timestamps.

    :param cat_job_label: name of concatenated job
    :param job_list: list of jobs to concatenate (must not be empty)
    :return: A ModelJob that starts at the earliest start time of the jobs
        and lasts until the latest end time
    :raises ValueError: if job_list is empty
    """
    if not job_list:
        raise ValueError("cannot concatenate an empty list of jobs")

    # 1) find start-time and end-time
    cat_start_time = min(job.time_start for job in job_list)
    cat_end_time = max(job.time_start + job.duration for job in job_list)

    # 2) find overall duration
    cat_duration = cat_end_time - cat_start_time

    # 3) interlace time-series
    cat_time_series = {}
    for ts_type in time_signal_names:

        cat_xvalues = []
        cat_yvalues = []
        cat_durations = []

        # loop over jobs
        for job in job_list:
            signal = job.timesignals.get(ts_type)
            if signal is None:
                continue

            # add xvalues (in absolute value) and yvalues
            cat_xvalues.extend(np.asarray(signal.xvalues) + job.time_start)
            cat_yvalues.extend(np.asarray(signal.yvalues))

            # add durations only if available, otherwise set them to zero..
            if signal.durations is not None:
                cat_durations.extend(signal.durations)
            else:
                cat_durations.extend(np.zeros(len(signal.xvalues)))

        if cat_xvalues:
            # reset the initial time to zero..
            cat_xvalues = [x - cat_start_time for x in cat_xvalues]

            # sort values as time sequence.. (zip() returns an iterator on
            # Python 3 and has no .sort(), hence sorted())
            cat_vals = sorted(zip(cat_xvalues, cat_yvalues, cat_durations),
                              key=lambda point: point[0])

            xvalues, yvalues, durations = zip(*cat_vals)

            # build the concatenated time signal..
            cat_time_series[ts_type] = TimeSignal(ts_type,
                                                  base_signal_name=ts_type,
                                                  durations=durations,
                                                  xvalues=xvalues,
                                                  yvalues=yvalues)

    print("job {} created".format(cat_job_label))

    # TODO: make a decision on what nnodes and nproc to choose..
    return ModelJob(time_start=cat_start_time,
                    duration=cat_duration,
                    ncpus=2,
                    nnodes=1,
                    timesignals=cat_time_series,
                    label=cat_job_label)
def test_workload_fillin_match(self):
    """
    Test the metrics assignment through job name (label) matching

    :return:
    """
    def all_signals(make_yvalues):
        # one 10-point signal per time signal name, with random x-values
        return {
            tsk: TimeSignal.from_values(tsk, np.random.rand(10),
                                        make_yvalues())
            for tsk in time_signal_names
        }

    # ------ target workload (that will receive the time metrics..)
    # job1 has its own signals and a name that matches nothing
    job1 = ModelJob(job_name="blabla_weird_name",
                    time_start=0.1,
                    duration=0.222,
                    ncpus=1,
                    nnodes=1,
                    timesignals=all_signals(lambda: np.arange(10) * 2))

    # job2 has no signals and the name to be matched
    job2 = ModelJob(job_name="job_match",
                    time_start=0.1,
                    duration=0.333,
                    ncpus=1,
                    nnodes=1,
                    timesignals={})

    target_wl = Workload(jobs=[job1, job2], tag='target_workload')

    # ---------- source workload
    job3 = ModelJob(job_name="job_match",
                    time_start=0.1,
                    duration=0.333,
                    ncpus=1,
                    nnodes=1,
                    timesignals=all_signals(lambda: np.random.rand(10)))

    source_wl = Workload(jobs=[job3], tag='wl_match_source')

    # filler config
    filler_config = {
        "type": "match_by_keyword",
        "priority": 0,
        "keywords": ["job_name"],
        "similarity_threshold": 0.3,
        "source_workloads": ["wl_match_source"],
        "apply_to": ["target_workload"]
    }

    # Apply the matching strategy to the workloads
    filler = StrategyMatchKeyword([target_wl, source_wl])
    filler.apply(filler_config)

    # the second target job must now carry the y-values of the source job
    for ts_name, filled_signal in target_wl.jobs[1].timesignals.items():
        source_yvalues = job3.timesignals[ts_name].yvalues
        self.assertTrue(
            all(ys == yt
                for ys, yt in zip(source_yvalues, filled_signal.yvalues)))
def generate_jobs(self):
    """
    Generate model jobs from the clusters stored in self.wl_clusters.

    One job is created per start time returned by the schedule strategy.
    Each job takes its time signals from a randomly drawn row of the
    cluster matrix and its ncpus/nnodes from the mean over the jobs
    labelled with that cluster.

    :return: tuple (list of generated ModelJobs, vector of the cluster
        index drawn for each job)
    """
    clusters = self.wl_clusters

    logger.info("Generating jobs from cluster: {}, "
                "that has {} jobs".format(
                    clusters['source-workload'],
                    len(clusters['jobs_for_clustering'])))

    start_times_vec_sa, _, _ = self.schedule_strategy.create_schedule()

    # Random vector of cluster indexes
    n_modelled_jobs = len(start_times_vec_sa)
    np.random.seed(self.config["job_submission_strategy"].get(
        'random_seed', 0))
    vec_clust_indexes = np.random.randint(
        clusters['cluster_matrix'].shape[0], size=n_modelled_jobs)

    # group the clustered jobs by the label of their cluster
    jobs_all = clusters['jobs_for_clustering']
    lab_all = np.asarray(clusters['labels'])
    jobs_in_each_cluster = {
        cl: np.asarray(jobs_all)[lab_all == cl]
        for cl in set(lab_all)
    }

    # mean #CPUS and #NODES in each cluster (a job without the value counts as 1)
    mean_cpus = {}
    for cl_id, cl_jobs in jobs_in_each_cluster.items():
        mean_cpus[cl_id] = np.mean([job.ncpus or 1 for job in cl_jobs])

    mean_nodes = {}
    for cl_id, cl_jobs in jobs_in_each_cluster.items():
        mean_nodes[cl_id] = np.mean([job.nnodes or 1 for job in cl_jobs])

    # create one job per drawn cluster index
    generated_model_jobs = []
    for job_idx, cl_idx in enumerate(vec_clust_indexes):

        # the matrix row holds the y-values of all the signals back to back
        row = clusters['cluster_matrix'][cl_idx, :]
        chunks = np.split(row, len(time_signal_names))

        ts_dict = {}
        for position, ts_vv in enumerate(chunks):
            ts_name = time_signal_names[position]
            ts_dict[ts_name] = TimeSignal(ts_name).from_values(
                ts_name, np.arange(len(ts_vv)), ts_vv)

        generated_model_jobs.append(
            ModelJob(time_start=start_times_vec_sa[job_idx],
                     job_name="job-{}-cl-{}".format(job_idx, cl_idx),
                     duration=None,
                     ncpus=mean_cpus[cl_idx],
                     nnodes=mean_nodes[cl_idx],
                     timesignals=ts_dict,
                     label="job-{}-cl-{}".format(job_idx, cl_idx)))

    n_sa = len(generated_model_jobs)
    n_job_ratio = n_sa / float(len(clusters['jobs_for_clustering'])) * 100.
    logger.info(
        "====> Generated {} jobs from cluster (#job ratio = {:.2f}%)".
        format(n_sa, n_job_ratio))

    return generated_model_jobs, vec_clust_indexes
def test_workload_fillin_default(self):
    """
    Test the correct assignment of user-defined time-series

    :return:
    """
    io_metrics = ['kb_read', 'kb_write', 'n_read', 'n_write']

    def random_io_job():
        # a job carrying ONLY the io metrics, with random values
        return ModelJob(**{
            'time_start': 0.1,
            'duration': 0.2,
            'ncpus': 1,
            'nnodes': 1,
            'timesignals': {
                tsk: TimeSignal.from_values(tsk, np.random.rand(10),
                                            np.random.rand(10))
                for tsk in io_metrics
            }
        })

    job1 = random_io_job()
    job2 = random_io_job()
    test_workload = Workload(jobs=[job1, job2], tag='wl_2jobs')

    # ---------------------- fill in config -----------------------
    filling_funct_config = [{
        "type": "step",
        "name": "step-1",
        "x_step": 0.5
    }, {
        "type": "custom",
        "name": "custom-1",
        "x_values": [0, 0.1, 0.15, 0.3333, 0.5, 0.8, 0.9, 1.0],
        "y_values": [0, 0.1, 0.2, 0.3, 0.5, 0.8, 0.9, 1.0]
    }]

    # Values to assign to all the unspecified metrics, together with the
    # user-defined functions they refer to
    default_config = {
        "type": "fill_missing_entries",
        "apply_to": ["wl_2jobs"],
        "priority": 0,
        "metrics": {
            "kb_collective": [100, 101],
            "n_collective": [100, 101],
            "kb_pairwise": {
                "function": "step-1",
                "scaling": 1000.0
            },
            "n_pairwise": {
                "function": "custom-1",
                "scaling": 1000.0
            },
            "flops": [100, 101],
        }
    }
    default_config["user_functions"] = filling_funct_config

    # Apply the user defaults to the workloads
    workloads = [test_workload]
    filler = StrategyUserDefaults(workloads)
    filler.apply(default_config)

    filled_jobs = workloads[0].jobs

    # the IO metrics still lie within the random range used (0,1)
    for job in filled_jobs:
        for metric in ('n_write', 'kb_write', 'n_read', 'kb_read'):
            signal = job.timesignals[metric]
            self.assertTrue(all([0.0 < x < 1.0 for x in signal.xvalues]))
            self.assertTrue(all([0.0 < y < 1.0 for y in signal.yvalues]))

    # the metrics given as a range lie within that range (100, 101)
    for job in filled_jobs:
        for metric in ('flops', 'n_collective', 'kb_collective'):
            self.assertTrue(
                all([100 < y < 101 for y in job.timesignals[metric].yvalues]))

    # the user-defined functions are applied as expected
    custom_y_values = [0, 0.1, 0.2, 0.3, 0.5, 0.8, 0.9, 1.0]
    for job in filled_jobs:

        # custom function: the configured y-values times the scaling
        for found, configured in zip(job.timesignals['n_pairwise'].yvalues,
                                     custom_y_values):
            self.assertTrue(found == configured * 1000.)

        # and the step function: only 0 or the scaling
        self.assertTrue(
            all([(y == 0 or y == 1000.)
                 for y in job.timesignals['kb_pairwise'].yvalues]))
class UserJobTests(unittest.TestCase):
    """
    Tests for the user-generated jobs (UserGeneratedJob)
    """

    xvals = list(range(10))
    yvals = [y**2 for y in range(10)]

    # n.b. the x/y values are spelled out again below: names bound in the
    # class body (xvals, yvals) are not visible inside the comprehension.
    dummy_time_signals = {
        tsname: TimeSignal.from_values(tsname,
                                       xvals=list(range(10)),
                                       yvals=[y**2 for y in range(10)],
                                       priority=10)
        for tsname in time_signal_names
    }

    def test_user_job_init(self):
        """
        A user-generated job can be built directly from a set of
        time-signals or from random proto-signals
        :return:
        """

        # built directly from the dummy time-signals
        direct_job = UserGeneratedJob("dummy_job",
                                      timesignals=self.dummy_time_signals,
                                      ts_scales=None)

        self.assertEqual(direct_job.name, "dummy_job")
        self.assertEqual(direct_job.timesignals, self.dummy_time_signals)

        # built from random proto-signals of the requested length
        proto_job = UserGeneratedJob.from_random_proto_signals(
            "from_proto_signals_job", ts_len=25)

        self.assertEqual(proto_job.name, "from_proto_signals_job")
        first_signal = next(iter(proto_job.timesignals.values()))
        self.assertEqual(len(first_signal.xvalues), 25)

    def test_timesignal_probability(self):
        """
        Applying a time-signal probability of 0 or 1 to a job
        :return:
        """

        job = UserGeneratedJob.from_random_proto_signals(
            "from_proto_signals_job", ts_len=25)

        # the first signal has the requested length..
        reference_signal = next(iter(job.timesignals.values()))
        reference_len = len(reference_signal.xvalues)
        self.assertEqual(reference_len, 25)

        # ..and every signal has that same length
        for signal in job.timesignals.values():
            self.assertEqual(len(signal.xvalues), reference_len)

        # probability 0: every y value is expected to be -1
        job_none_kept = copy.deepcopy(job)
        job_none_kept.apply_ts_probability(0.0)
        for signal in job_none_kept.timesignals.values():
            self.assertTrue(all(y == -1 for y in signal.yvalues))

        # probability 1: no y value is expected to be -1
        job_all_kept = copy.deepcopy(job)
        job_all_kept.apply_ts_probability(1.0)
        for signal in job_all_kept.timesignals.values():
            self.assertTrue(all(y != -1 for y in signal.yvalues))
def test_splitter(self):
    """
    Test that WorkloadSplit spawns a new workload whose jobs all carry a
    label consistent with the configured keywords
    :return:
    """

    def complete_signal_set(last_value):
        # one two-point time-signal for every known time-signal name
        return {
            tsk: TimeSignal.from_values(tsk, [0., 0.1], [1., last_value])
            for tsk in time_signal_names
        }

    # -------------- prepare a couple of dummy jobs ---------------
    signals_excluded = complete_signal_set(999.)
    signals_included = complete_signal_set(444.)

    # a model job that WILL NOT be picked by the algorithm..
    job_excluded = ModelJob(time_start=0.1,
                            duration=0.2,
                            ncpus=1,
                            nnodes=1,
                            timesignals=signals_excluded,
                            job_name="job_name_1")
    job_excluded.label = "label_nottobepicked"

    # a model job that WILL be picked by the algorithm..
    job_included = ModelJob(time_start=0.2,
                            duration=0.4,
                            ncpus=2,
                            nnodes=2,
                            timesignals=signals_included,
                            job_name="job_name_2")
    job_included.label = "label_includeme"

    # dummy workload with 20 jobs, each one a copy of either template job
    # (the seed is fixed so that the random choice is reproducible)
    np.random.seed(0)
    jobs_all = []
    for _ in range(20):
        template = job_excluded if np.random.rand() < 0.5 else job_included
        jobs_all.append(copy.deepcopy(template))

    # create a workload out of all the jobs..
    workload = Workload(jobs=jobs_all, tag="testing_workload")

    # configure the splitter from user config
    config_splitting = {
        "type": "split",
        "keywords_in": ["includeme"],
        "keywords_out": ["excludeme"],
        "split_by": "label",
        "apply_to": ["testing_workload"],
        "create_workload": "spawn_workload"
    }

    workloads = [workload]
    WorkloadSplit(workloads).apply(config_splitting)

    # look for the first workload tagged as requested in the config
    spawned_tag = config_splitting["create_workload"]
    wl_out = next((wl for wl in workloads if wl.tag == spawned_tag), None)

    # make sure that we have created a workload as expected
    self.assertIsNotNone(wl_out)
    self.assertEqual(wl_out.tag, spawned_tag)

    # make sure that all the jobs have a label consistent with the filter
    for j in wl_out.jobs:
        self.assertIn("includeme", j.label)
        self.assertNotIn("excludeme", j.label)