def compute_overall_utilisation():
    '''
    Compute the normalised mean utilisation of every SWF file matching
    "./swf_files/*.swf".

    For each file the workload is loaded, its mean utilisation is divided by
    its `MaxProcs`, and the result is stored under the file path. The
    collected values, their mean and their `describe()` summary are printed,
    and the values are written to "overall_utilisation.csv".

    Files for which the computation raises `AttributeError` are reported on
    stdout and skipped.

    :returns: a pandas Series indexed by file path holding the normalised
        mean utilisation of each successfully processed file.
    '''
    files = glob('./swf_files/*.swf')
    # Explicit dtype: the series holds ratios, and an empty result stays
    # numeric so that mean()/describe() below behave consistently.
    res = pd.Series(dtype=float)
    for f in files:
        print('Loading SWF file: {}'.format(f))
        try:
            wl = workload.Workload.from_csv(f)
            norm_util_mean = load_mean(wl.utilisation) / wl.MaxProcs
            print('Mean Util: {}\n'.format(norm_util_mean))
            # Series.set_value was removed from pandas; plain item
            # assignment enlarges the series with the new label.
            res[f] = norm_util_mean
        except AttributeError as e:
            print("Unable to compute normalize mean: {}".format(e))
        finally:
            # Drop the reference so the workload can be freed before the
            # next file is loaded. Rebinding (instead of `del`) is safe even
            # when from_csv failed and `wl` was never bound.
            wl = None
    print('{}'.format(res))
    print('{}'.format(res.mean()))
    print(res.describe())
    res.to_csv("overall_utilisation.csv")
    return res
def plot_load(load, nb_resources=None, ax=None, normalize=False,
              time_scale=False, load_label="load",
              UnixStartTime=0, TimeZoneString='UTC'):
    '''
    Plots the number of used resources against time.

    :load: object exposing a `load` column and supporting `copy()` and
        `reset_index()` (presumably a DataFrame indexed by `time` --
        confirm against callers).
    :nb_resources: number of available resources. When given and
        `normalize` is False, a horizontal "Maximum resources" line is
        drawn. When `normalize` is True and this is None, the maximum of
        the load is used instead.
    :ax: matplotlib axes to draw on; defaults to `plt.gca()`.
    :normalize: if True normalize by the number of resources
        `nb_resources`
    :time_scale: if True the `time` values are shifted by `UnixStartTime`,
        read as seconds since the epoch and used as a datetime index
        expressed in `TimeZoneString`.
    :load_label: label used for the load curve and in the legend entries.
    :UnixStartTime: offset in seconds added to `time` when `time_scale`
        is True.
    :TimeZoneString: time zone the datetime index is converted to when
        `time_scale` is True.
    '''
    mean = metrics.load_mean(load)
    u = load.copy()

    if time_scale:
        # make the time index a column
        u = u.reset_index()
        # convert timestamp to datetime
        u.index = pd.to_datetime(u['time'] + UnixStartTime, unit='s')
        # tz_localize/tz_convert return a new index; the result must be
        # assigned back, otherwise TimeZoneString has no effect.
        u.index = u.index.tz_localize('UTC').tz_convert(TimeZoneString)

    if normalize and nb_resources is None:
        nb_resources = u.load.max()

    if normalize:
        u.load = u.load / nb_resources
        mean = mean / nb_resources

    # get an axe if not provided
    if ax is None:
        ax = plt.gca()

    # leave room to have better view
    ax.margins(x=0.1, y=0.1)

    # plot load
    u.load.plot(drawstyle="steps-post", ax=ax, label=load_label)

    # plot a line for max available area
    if nb_resources and not normalize:
        ax.plot([u.index[0], u.index[-1]],
                [nb_resources, nb_resources],
                linestyle='-', linewidth=2,
                label="Maximum resources ({})".format(nb_resources))

    # plot a line for mean utilisation
    ax.plot([u.index[0], u.index[-1]],
            [mean, mean],
            linestyle='--', linewidth=1,
            label="Mean {0} ({1:.2f})".format(load_label, mean))

    # mark every instant where the load drops to zero
    sns.rugplot(u.load[u.load == 0].index, ax=ax, color='r')
    # empty scatter: only there to get a legend entry for the rug marks
    ax.scatter([], [], marker="|", linewidth=1, s=200,
               label="Reset event ({} == 0)".format(load_label), color='r')

    # FIXME: Add legend when this bug is fixed
    # https://github.com/mwaskom/seaborn/issues/1071
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
def extract_periods_with_given_utilisation(self,
                                           period_in_hours,
                                           utilisation,
                                           variation=0.01,
                                           nb_max=None):
    '''
    This extract from the workload a period (in hours) with a given mean
    utilisation (between 0 and 1).

    The time range covered by `self.utilisation` is cut into consecutive
    windows of `period_in_hours` hours. For each evaluated window the mean
    utilisation is computed with `load_mean` and divided by
    `self.MaxProcs`; windows whose normalised mean lies within
    `utilisation` +/- `variation` (bounds included) are kept.

    :period_in_hours: length of each window, in hours.
    :utilisation: target normalised mean utilisation.
    :variation: accepted absolute deviation from `utilisation`.
    :nb_max: if set (and non-zero), keep only the first `nb_max` matching
        windows.
    :returns: the result of `self.extract` for the selected windows; when
        no window matches, `self.extract` is called with an empty frame.
    '''
    norm_util = self.utilisation

    # resample the dataframe with the given period
    time_periods = np.arange(min(norm_util.index),
                             max(norm_util.index),
                             60 * 60 * period_in_hours)

    # Pair each window start with the next one. The first window is skipped,
    # as in the historical behaviour.
    # NOTE(review): skipping the first window looks unintentional -- confirm.
    rows = [
        {"begin": begin,
         "end": end,
         "mean_util": load_mean(norm_util, begin=begin, end=end)}
        for begin, end in zip(time_periods[1:-1], time_periods[2:])
    ]
    # Build the frame in one go: DataFrame.append was quadratic in a loop
    # and no longer exists in recent pandas. Fixed columns keep the code
    # below working when no window was evaluated.
    mean_df = pd.DataFrame(rows,
                           columns=["begin", "end", "mean_util"],
                           dtype=float)

    mean_df["norm_mean_util"] = mean_df.mean_util / self.MaxProcs
    lower = utilisation - variation
    upper = utilisation + variation
    periods = mean_df[(mean_df.norm_mean_util >= lower) &
                      (mean_df.norm_mean_util <= upper)]

    # Only take nb_max periods if it is defined
    if nb_max:
        periods = periods[:nb_max]

    notes = ("Period of {} hours with a mean utilisation "
             "of {}".format(period_in_hours, utilisation))
    return self.extract(periods, notes)
def extract_periods_with_given_utilisation(self,
                                           period_in_hours,
                                           utilisation,
                                           variation=0.01,
                                           nb_max=None):
    '''
    This extract from the workload a period (in hours) with a given mean
    utilisation (between 0 and 1).

    The span of `self.utilisation`'s index is divided into back-to-back
    windows of `period_in_hours` hours. Each evaluated window gets its mean
    utilisation from `load_mean`, normalised by `self.MaxProcs`; a window
    is selected when that value is in
    [`utilisation` - `variation`, `utilisation` + `variation`].

    :period_in_hours: window length in hours.
    :utilisation: wanted normalised mean utilisation.
    :variation: tolerated absolute difference to `utilisation`.
    :nb_max: when truthy, only the first `nb_max` selected windows are
        extracted.
    :returns: whatever `self.extract` returns for the selected windows
        (called with an empty frame when nothing matches).
    '''
    norm_util = self.utilisation

    # resample the dataframe with the given period
    window_starts = np.arange(min(norm_util.index),
                              max(norm_util.index),
                              60 * 60 * period_in_hours)

    # Collect plain dicts and create the frame once afterwards:
    # DataFrame.append copied the whole frame at every iteration and has
    # been removed from recent pandas releases.
    records = []
    # The first start is skipped and the last one only serves as an end
    # bound, matching the historical behaviour.
    # NOTE(review): dropping the first window may be unintended -- confirm.
    for begin, end in zip(window_starts[1:-1], window_starts[2:]):
        records.append({
            "begin": begin,
            "end": end,
            "mean_util": load_mean(norm_util, begin=begin, end=end)
        })

    # Explicit columns so that an empty `records` still yields a frame with
    # a `mean_util` column instead of failing on attribute access below.
    mean_df = pd.DataFrame(records,
                           columns=["begin", "end", "mean_util"],
                           dtype=float)
    mean_df["norm_mean_util"] = mean_df.mean_util / self.MaxProcs

    in_range = ((mean_df.norm_mean_util >= (utilisation - variation)) &
                (mean_df.norm_mean_util <= (utilisation + variation)))
    periods = mean_df[in_range]

    # Only take nb_max periods if it is defined
    if nb_max:
        periods = periods[:nb_max]

    notes = ("Period of {} hours with a mean utilisation "
             "of {}".format(period_in_hours, utilisation))
    return self.extract(periods, notes)
def plot_load(load, nb_resources=None, ax=None, normalize=False,
              time_scale=False, load_label="load",
              UnixStartTime=0, TimeZoneString='UTC'):
    '''
    Plots the number of used resources against time.

    Besides the load curve, the plot shows a dashed line at the mean load,
    a rug mark at every instant where the load equals zero and, when
    `nb_resources` is given and `normalize` is False, a solid line at the
    maximum number of resources. The axes get a grid, a legend and
    `load_label` as title.

    :load: object exposing a `load` column and supporting `copy()` and
        `reset_index()` (presumably a DataFrame indexed by `time` --
        confirm against callers).
    :nb_resources: number of available resources; when `normalize` is True
        and this is None, the maximum of the load is used.
    :ax: matplotlib axes to draw on; defaults to `plt.gca()`.
    :normalize: if True normalize by the number of resources
        `nb_resources`
    :time_scale: if True the `time` values are shifted by `UnixStartTime`,
        read as seconds since the epoch and used as a datetime index
        expressed in `TimeZoneString`.
    :load_label: label of the load curve, also used in the legend entries
        and as the axes title.
    :UnixStartTime: offset in seconds added to `time` when `time_scale`
        is True.
    :TimeZoneString: time zone the datetime index is converted to when
        `time_scale` is True.
    '''
    mean = metrics.load_mean(load)
    u = load.copy()

    if time_scale:
        # make the time index a column
        u = u.reset_index()
        # convert timestamp to datetime
        u.index = pd.to_datetime(u['time'] + UnixStartTime, unit='s')
        # tz_localize/tz_convert do not work in place: assign the converted
        # index back, otherwise TimeZoneString is silently ignored.
        u.index = u.index.tz_localize('UTC').tz_convert(TimeZoneString)

    if normalize and nb_resources is None:
        nb_resources = u.load.max()

    if normalize:
        u.load = u.load / nb_resources
        mean = mean / nb_resources

    # get an axe if not provided
    if ax is None:
        ax = plt.gca()

    # leave room to have better view
    ax.margins(x=0.1, y=0.1)

    # plot load
    u.load.plot(drawstyle="steps-post", ax=ax, label=load_label)

    # plot a line for max available area
    if nb_resources and not normalize:
        ax.plot([u.index[0], u.index[-1]],
                [nb_resources, nb_resources],
                linestyle='-', linewidth=2,
                label="Maximum resources ({})".format(nb_resources))

    # plot a line for mean utilisation
    ax.plot([u.index[0], u.index[-1]],
            [mean, mean],
            linestyle='--', linewidth=1,
            label="Mean {0} ({1:.2f})".format(load_label, mean))

    # mark every instant where the load drops to zero
    sns.rugplot(u.load[u.load == 0].index, ax=ax, color='r')
    # empty scatter: only there to get a legend entry for the rug marks
    ax.scatter([], [], marker="|", linewidth=1, s=200,
               label="Reset event ({} == 0)".format(load_label), color='r')

    # FIXME: move the legend outside of the axes
    # (loc='center left', bbox_to_anchor=(1, 0.5)) when this bug is fixed
    # https://github.com/mwaskom/seaborn/issues/1071
    ax.grid(True)
    ax.legend()
    ax.set_title(load_label)
def mean_utilisation(self, begin_time=None, end_time=None):
    '''
    Return `load_mean` of this object's `utilisation`.

    :begin_time: forwarded to `load_mean` as `begin` (None by default).
    :end_time: forwarded to `load_mean` as `end` (None by default).
    :returns: the value computed by `load_mean`.
    '''
    bounds = {"begin": begin_time, "end": end_time}
    return load_mean(self.utilisation, **bounds)
def extract_periods_with_given_utilisation(self,
                                           period_in_hours,
                                           utilisation,
                                           variation=0.01,
                                           nb_max=None,
                                           merge_basic=False,
                                           merge_change_submit_times=False,
                                           randomize_starting_times=False,
                                           random_seed=0,
                                           max_nb_jobs=None):
    '''
    This extract from the workload a period (in hours) with a given mean
    utilisation (between 0 and 1).

    Candidate windows of `period_in_hours` hours are either taken at
    regular steps over the index of `self.utilisation` or, when
    `randomize_starting_times` is True, started at up to 50 index values
    drawn at random (only those leaving room for a full window before the
    end of the index are kept). For each evaluated window the mean
    utilisation is computed with `load_mean` and divided by
    `self.MaxProcs`; windows whose normalised mean lies within
    `utilisation` +/- `variation` (bounds included) are selected.

    :period_in_hours: length of each window, in hours.
    :utilisation: target normalised mean utilisation.
    :variation: accepted absolute deviation from `utilisation`.
    :nb_max: if set (and non-zero), keep only the first `nb_max` selected
        windows.
    :merge_basic: forwarded as-is to `self.extract`.
    :merge_change_submit_times: forwarded as-is to `self.extract`.
    :randomize_starting_times: draw the window starts at random instead of
        using regular steps.
    :random_seed: seed passed to `np.random.seed` before drawing; note
        that this reseeds numpy's global random generator.
    :max_nb_jobs: forwarded as-is to `self.extract`.
    :returns: the result of `self.extract` for the selected windows; when
        no window matches, `self.extract` is called with an empty frame.
    '''
    norm_util = self.utilisation
    period_in_seconds = period_in_hours * (60 * 60)

    # resample the dataframe with the given period
    if randomize_starting_times:
        np.random.seed(random_seed)
        candidates = np.random.choice(norm_util.index, size=50)
        # keep only the starts that leave room for a complete window
        time_periods = np.compress(
            candidates <= max(norm_util.index) - period_in_seconds,
            candidates)
    else:
        time_periods = np.arange(min(norm_util.index),
                                 max(norm_util.index),
                                 period_in_seconds)

    # The first and the last start times are not evaluated, matching the
    # historical behaviour.
    # NOTE(review): with random starts this drops two valid candidates for
    # no visible reason -- confirm whether it is intended.
    rows = []
    for begin in time_periods[1:-1]:
        end = begin + period_in_seconds
        rows.append({
            "begin": begin,
            "end": end,
            "mean_util": load_mean(norm_util, begin=begin, end=end)
        })

    # Build the frame in one go: DataFrame.append was quadratic in a loop
    # and no longer exists in recent pandas. Fixed columns keep the code
    # below working when no window was evaluated.
    mean_df = pd.DataFrame(rows,
                           columns=["begin", "end", "mean_util"],
                           dtype=float)

    mean_df["norm_mean_util"] = mean_df.mean_util / self.MaxProcs
    lower = utilisation - variation
    upper = utilisation + variation
    periods = mean_df[(mean_df.norm_mean_util >= lower) &
                      (mean_df.norm_mean_util <= upper)]

    # Only take nb_max periods if it is defined
    if nb_max:
        periods = periods[:nb_max]

    notes = ("Period of {} hours with a mean utilisation "
             "of {}".format(period_in_hours, utilisation))
    return self.extract(
        periods,
        notes,
        merge_basic=merge_basic,
        merge_change_submit_times=merge_change_submit_times,
        max_nb_jobs=max_nb_jobs)