def df_task_states(self, task, stringify=False):
    """
    DataFrame of the state update events of a single task.

    :param task: The task's name or PID or tuple ``(pid, comm)``
    :type task: int or str or tuple(int, str)

    :param stringify: Include stringified :class:`TaskState` columns
    :type stringify: bool

    :returns: a :class:`pandas.DataFrame` with:

      * A ``cpu`` column (the CPU where the task was on)
      * A ``target_cpu`` column (the CPU where the task has been scheduled).
        Will be ``NaN`` for non-wakeup events
      * A ``curr_state`` column (the current task state, see :class:`~TaskState`)
      * A ``next_state`` column (the next task state)
      * A ``delta`` column (the duration for which the task will remain in
        this state)
    """
    wanted = self.trace.get_task_id(task, update=False)

    # Start from the all-tasks dataframe, keep the rows of the requested task
    # only, then drop the identification columns that became redundant.
    states = df_filter_task_ids(self.df_tasks_states(), [wanted])
    states = states.drop(columns=["pid", "comm"])

    if not stringify:
        return states

    self.stringify_df_task_states(states, ["curr_state", "next_state"], inplace=True)
    return states
def check_task_util(task_id):
    # Closure: ``df``, ``util_means`` and ``util_margin`` come from the
    # enclosing scope.

    # Only keep the data about the task we care about.
    task_df = df_filter_task_ids(df, [task_id])
    mean_util = task_df['util'].mean()

    # Record the mean under the task's comm for the enclosing scope.
    util_means[task_id.comm] = mean_util

    # Util is not supposed to be higher than 512 given what we asked for in
    # get_rtapp_profile()
    upper_bound = 512 + util_margin
    return mean_util < upper_bound
def plot_task_signals(self, task, axis, local_fig, signals=('util', 'load')):
    """
    Plot the task-related load-tracking signals

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :param axis: Axis object the signals are drawn on.

    :param local_fig: Not used by this method.

    :param signals: Signals to plot. Any iterable of names is accepted; the
        default is an immutable tuple so it cannot be altered between calls.
    :type signals: list(str)
    """
    task_id = self.trace.get_task_id(task, update=False)

    start = self.trace.start
    end = self.trace.end

    for signal in signals:
        df = self.df_tasks_signal(signal)
        df = df_filter_task_ids(df, [task_id])
        df = df_refit_index(df, start, end)
        df[signal].plot(ax=axis, drawstyle='steps-post', alpha=0.4)

    # Only overlay the overutilized status when the trace has the events
    # that plot needs.
    plot_overutilized = self.trace.analysis.status.plot_overutilized
    if self.trace.has_events(plot_overutilized.used_events):
        plot_overutilized(axis=axis)

    axis.set_title('Load-tracking signals of task "{}"'.format(task))
    axis.legend()
    axis.grid(True)
    axis.set_xlim(start, end)
def plot_task_required_capacity(self, task: TaskID, axis=None, **kwargs):
    """
    Plot the minimum required capacity of a task

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :param axis: Forwarded to ``self.do_plot``.

    :param kwargs: Extra keyword arguments forwarded to ``self.do_plot``.
    """
    window = self.trace.window
    task_ids = self.trace.get_task_ids(task)

    capacity_df = df_refit_index(
        df_filter_task_ids(self.df_tasks_signal('required_capacity'), task_ids),
        window=window,
    )

    # Build task names (there could be multiple, during the task lifetime)
    joined_ids = ', '.join(map(str, task_ids))
    title = f"Task ({joined_ids})"

    def plotter(axis, local_fig):
        capacity_df["required_capacity"].plot(drawstyle='steps-post', ax=axis)
        axis.legend()
        axis.grid(True)

        # Title, limits and labels are only set on a locally created figure
        if local_fig:
            axis.set_title(title)
            axis.set_ylim(0, 1100)
            axis.set_ylabel('Utilization')
            axis.set_xlabel('Time (s)')

    return self.do_plot(plotter, height=8, axis=axis, **kwargs)
def _task_filtered(self, df, task=None):
    """
    Restrict ``df`` to the rows of ``task``, which must be an rt-app task.

    :param df: Dataframe with ``__pid`` and ``__comm`` columns.

    :param task: Task to keep. A falsy value returns ``df`` untouched.

    :raises ValueError: if the resolved task is not in ``self.rtapp_tasks``.
    """
    if task:
        task_id = self.trace.get_task_id(task)

        if task_id in self.rtapp_tasks:
            return df_filter_task_ids(
                df, [task_id], pid_col='__pid', comm_col='__comm')

        raise ValueError(f"Task [{task_id}] is not an rt-app task: {self.rtapp_tasks}")

    return df
def get_simulated_pelt(self, task, signal_name):
    """
    Simulate a PELT signal for a given task.

    :param task: task to look for in the trace.
    :type task: int or str or tuple(int, str)

    :param signal_name: Name of the PELT signal to simulate.
    :type signal_name: str

    :return: A :class:`pandas.DataFrame` with a ``simulated`` column
        containing the simulated signal and an ``error`` column (trace value
        minus simulated value), along with the column of the signal as found
        in the trace. Rows containing NaN are dropped.
    """
    logger = self.get_logger()
    trace = self.trace
    task = trace.get_task_id(task)
    task_cpus = trace.analysis.tasks.cpus_of_tasks([task])

    # Capacity lower than 1024 will create some time-scaling artifacts that
    # are not currently simulated
    assert all(
        self.plat_info["cpu-capacities"][cpu] == UTIL_SCALE
        for cpu in task_cpus
    )

    activations = trace.analysis.tasks.df_task_activation(task)
    signal_df = df_filter_task_ids(
        trace.analysis.load_tracking.df_tasks_signal(signal_name),
        [task],
    )

    # Ignore the first activation, as its signals are incorrect
    activations = activations.iloc[2:]

    # Make sure the activation df does not start before the dataframe of
    # signal values, otherwise we cannot provide a sensible init value
    activations = activations[signal_df.index[0]:]

    # Get the initial signal value matching the first activation we will
    # care about
    first_activation = activations.index[0]
    init_pos = signal_df.index.get_loc(first_activation, method='ffill')
    init = signal_df[signal_name].iloc[init_pos]

    try:
        # PELT clock in nanoseconds
        clock = signal_df['update_time'] * 1e-9
    except KeyError:
        logger.warning('PELT clock is not available, ftrace timestamp will be used at the expense of accuracy')
        clock = None

    signal_df['simulated'] = simulate_pelt(
        activations['active'],
        index=signal_df.index,
        init=init,
        clock=clock,
    )
    signal_df['error'] = signal_df[signal_name] - signal_df['simulated']
    return signal_df.dropna()
def df_task_signal(self, task, signal):
    """
    Same as :meth:`df_tasks_signal` but for one task only.

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :param signal: See :meth:`df_tasks_signal`.
    """
    wanted = self.trace.get_task_id(task, update=False)
    all_tasks_df = self.df_tasks_signal(signal=signal)
    return df_filter_task_ids(all_tasks_df, [wanted])
def cpus_of_tasks(self, tasks):
    """
    Return the list of CPUs where the ``tasks`` executed.

    :param tasks: Task names or PIDs or ``(pid, comm)`` to look for.
    :type tasks: list(int or str or tuple(int, str))

    :returns: Sorted list of the distinct ``__cpu`` values of the
        ``sched_switch`` events that switched one of the tasks in.
    """
    trace = self.trace
    switch_in = trace.df_event('sched_switch')[['next_pid', 'next_comm', '__cpu']]

    wanted = []
    for task in tasks:
        wanted.append(trace.get_task_id(task, update=False))

    switch_in = df_filter_task_ids(
        switch_in, wanted, pid_col='next_pid', comm_col='next_comm')

    return sorted(switch_in['__cpu'].unique())
def get_first_switch(row):
    # Closure: ``swdf`` (sched_switch dataframe) comes from the enclosing
    # scope. ``row.name`` is a 3-tuple starting with (comm, pid).
    comm, pid, _ = row.name
    phase_start = row['Time']

    switched_in = df_filter_task_ids(
        swdf,
        [TaskID(comm=comm, pid=pid)],
        pid_col='next_pid',
        comm_col='next_comm',
    )
    before_phase = switched_in[switched_in.index < phase_start]

    # Return the timestamp of the last switch-in before the phase started
    if not before_phase.empty:
        return before_phase.index[-1]

    # The task with that comm and PID was never switched-in, which means it
    # was still on the current CPU when it was renamed, so we just report
    # phase-start.
    return phase_start
def plot_task_residency(self, task: TaskID, axis, local_fig):
    """
    Plot on which CPUs the task ran on over time

    :param task: Task to track
    :type task: int or str or tuple(int, str)

    :param axis: Axis object the markers are drawn on.

    :param local_fig: Not used by this method.
    """
    trace = self.trace
    wanted = trace.get_task_id(task, update=False)

    switch_in = df_filter_task_ids(
        trace.df_event("sched_switch"),
        [wanted],
        pid_col='next_pid',
        comm_col='next_comm',
    )

    if "freq-domains" not in trace.plat_info:
        cpu_series = series_refit_index(switch_in['__cpu'], window=trace.window)
        cpu_series.plot(ax=axis, style='+')
    else:
        # If we are aware of frequency domains, use one color per domain
        for domain in trace.plat_info["freq-domains"]:
            cpu_series = switch_in[switch_in["__cpu"].isin(domain)]["__cpu"]

            if cpu_series.empty:
                # Cycle the colours to stay consistent
                self.cycle_colors(axis)
                continue

            cpu_series = series_refit_index(cpu_series, window=trace.window)
            cpu_series.plot(
                ax=axis,
                style='+',
                label="Task running in domain {}".format(domain))

    plot_overutilized = trace.analysis.status.plot_overutilized
    if trace.has_events(plot_overutilized.used_events):
        plot_overutilized(axis=axis)

    # Add an extra CPU lane to make room for the legend
    axis.set_ylim(-0.95, trace.cpus_count - 0.05)

    axis.set_title("CPU residency of task \"{}\"".format(task))
    axis.set_ylabel('CPUs')
    axis.grid(True)
    axis.legend()
def plot_task_placement(self, task, axis, local_fig):
    """
    Plot the CPU placement of the task

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :param axis: Axis object the markers are drawn on.

    :param local_fig: Not used by this method.
    """
    # Get all utilization update events
    capacity_df = self.df_tasks_signal('required_capacity')
    wanted = self.trace.get_task_id(task, update=False)
    capacity_df = df_filter_task_ids(capacity_df, [wanted])

    cpu_capacities = self.trace.plat_info["cpu-capacities"]

    def classify(row):
        # Compare the capacity of the CPU the task was on with what the task
        # required at that point.
        cpu = row["cpu"]
        required = row["required_capacity"]
        available = cpu_capacities[cpu]

        if available < required:
            return "CPU capacity < required capacity"
        if available == required:
            return "CPU capacity == required capacity"
        return "CPU capacity > required capacity"

    capacity_df["placement"] = capacity_df.apply(classify, axis=1)

    # One marker series (and legend entry) per placement category
    for category in capacity_df["placement"].unique():
        in_category = capacity_df[capacity_df.placement == category]
        in_category["cpu"].plot(ax=axis, style="+", label=category)

    plot_overutilized = self.trace.analysis.status.plot_overutilized
    if self.trace.has_events(plot_overutilized.used_events):
        plot_overutilized(axis=axis)

    axis.set_title("Utilization vs placement of task \"{}\"".format(task))
    axis.set_xlim(self.trace.start, self.trace.end)
    axis.grid(True)
    axis.legend()
def plot_task_residency(self, task: TaskID):
    """
    Plot on which CPUs the task ran on over time

    When the platform information lists frequency domains, one set of markers
    is drawn per domain and all of them are overlaid on the same figure.
    Otherwise a single set of markers is drawn.

    :param task: Task to track
    :type task: int or str or tuple(int, str)

    :returns: The residency plot overlaid with the overutilized plot.
    """
    task_id = self.trace.get_task_id(task, update=False)

    sw_df = self.trace.df_event("sched_switch")
    sw_df = df_filter_task_ids(sw_df, [task_id], pid_col='next_pid', comm_col='next_comm')

    def plot_residency():
        if "freq-domains" not in self.trace.plat_info:
            # The result must be returned, otherwise the caller below would
            # call .options() on None.
            return self._plot_markers(
                series_refit_index(sw_df['__cpu'], window=self.trace.window)
            )

        # If we are aware of frequency domains, use one color per domain.
        # Every domain is visited; returning from inside the loop would only
        # ever plot the first one.
        fig = None
        for domain in self.trace.plat_info["freq-domains"]:
            series = sw_df[sw_df["__cpu"].isin(domain)]["__cpu"]
            # Skip domains the task never ran on before refitting, as the
            # matplotlib variant of this plot does.
            if series.empty:
                continue

            series = series_refit_index(series, window=self.trace.window)
            if series.empty:
                continue

            markers = self._plot_markers(
                series,
                label=f"Task running in domain {domain}"
            )
            fig = markers if fig is None else fig * markers

        # Nothing to show for any domain: fall back to the neutral element
        return _hv_neutral() if fig is None else fig

    return (
        plot_residency().options(ylabel='cpu') *
        self._plot_overutilized()
    ).options(
        title=f'CPU residency of task {task}'
    )
def plot_task_required_capacity(self, task: TaskID):
    """
    Plot the minimum required capacity of a task

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :returns: The plot of the ``required_capacity`` signal, restricted to the
        task(s) matching ``task`` and refitted to the trace window.
    """
    window = self.trace.window

    # There could be multiple task IDs during the task lifetime
    task_ids = self.trace.get_task_ids(task)
    df = self.df_tasks_signal('required_capacity')
    df = df_filter_task_ids(df, task_ids)
    df = df_refit_index(df, window=window)

    return plot_signal(
        df['required_capacity'],
        name='required_capacity',
    ).options(
        title=f'Required CPU capacity for task {task}',
        ylabel='Utilization',
    )
def test_activations(self) -> ResultBundle:
    """
    Test signals are properly "aggregated" at enqueue/dequeue time.

    On fast-ramp systems, `util_est_enqueued` is expected to be always
    smaller than `util_est_ewma`.

    On non fast-ramp systems, the `util_est_enqueued` is expected to be
    smaller than `util_est_ewma` in ramp-down phases, or bigger in ramp-up
    phases.

    Those conditions are checked on a single execution of a task which has
    three main behaviours:

      * STABLE: periodic big task running for a relatively long period to
        ensure `util_avg` saturation.
      * DOWN: periodic ramp-down task, to slowly decay `util_avg`
      * UP: periodic ramp-up task, to slowly increase `util_avg`
    """
    failure_reasons = {}
    metrics = {}
    failures = []

    # We have only two task: the main 'rt-app' task and our 'test_task'
    test_task = self.trace.analysis.rta.rtapp_tasks[-1]

    # Get list of task's activations
    states = self.trace.analysis.tasks.df_task_states(test_task)
    is_activation = (
        (states.curr_state == TaskState.TASK_WAKING) &
        (states.next_state == TaskState.TASK_ACTIVE)
    )
    activations = states[is_activation].index

    # Check task signals at each activation
    signals = self.trace.df_events('sched_util_est_task')
    signals = df_filter_task_ids(signals, [test_task])

    # Half-width of the time interval used to correlate relative trace events
    delta = 1e-3

    def find_failure(activation, avg, enq, ewma):
        # Return the reason this activation fails, or None when it passes.

        # UtilEst is not updated when within 1% of previous activation
        if 1.01 * enq < avg:
            return 'enqueued({}) smaller than util_avg({}) @ {}'\
                .format(enq, avg, activation)

        # Running on FastRamp kernels: same rule for STABLE, DOWN and UP
        if self.fast_ramp:
            if enq > ewma:
                return 'enqueued({}) bigger than ewma({}) @ {}'\
                    .format(enq, ewma, activation)
            return None

        # Running on (legacy) non FastRamp kernels:
        phase = self.trace.analysis.rta.task_phase_at(test_task, activation)

        # STABLE: ewma ramping up
        if phase.id == 0 and enq < ewma:
            return 'enqueued({}) smaller than ewma({}) @ {}'\
                .format(enq, ewma, activation)

        # DOWN: ewma ramping down
        if 0 < phase.id < 5 and enq > ewma:
            return 'enqueued({}) bigger than ewma({}) @ {}'\
                .format(enq, ewma, activation)

        # UP: ewma ramping up
        if phase.id > 4 and enq < ewma:
            return 'enqueued({}) smaller than ewma({}) @ {}'\
                .format(enq, ewma, activation)

        return None

    for idx, activation in enumerate(activations):
        # Last sample found within +/- delta of the activation
        nearby = signals[activation - delta:activation + delta]
        avg, enq, ewma = nearby[[
            'util_avg', 'util_est_enqueued', 'util_est_ewma'
        ]].iloc[-1]

        metrics[idx + 1] = ActivationSignals(activation, avg, enq, ewma)

        reason = find_failure(activation, avg, enq, ewma)
        if reason is not None:
            failure_reasons[idx] = reason
            failures.append(activation)

    self._plot_signals(test_task, 'activations', failures)

    bundle = ResultBundle.from_bool(not failure_reasons)
    bundle.add_metric("signals", metrics)
    bundle.add_metric("failure reasons", failure_reasons)
    return bundle
def test_areas(self) -> ResultBundle:
    """
    Test signals are properly "dominated".

    The integral of `util_est_enqueued` is expected to be always greater or
    equal than the integral of `util_avg`, since `util_avg` is subject to
    decays while `util_est_enqueued` is not.

    On fast-ramp systems, the `util_est_ewma` signal is never smaller than
    the `util_est_enqueued`, thus its integral is expected to be bigger.

    On non fast-ramp systems instead, the `util_est_ewma` is expected to be
    smaller than `util_est_enqueued` in ramp-up phases, or bigger in
    ramp-down phases.

    Those conditions are checked on a single execution of a task which has
    three main behaviours:

      * STABLE: periodic big task running for a relatively long period to
        ensure `util_avg` saturation.
      * DOWN: periodic ramp-down task, to slowly decay `util_avg`
      * UP: periodic ramp-up task, to slowly increase `util_avg`

    :returns: A :class:`ResultBundle` that passes only when no phase
        recorded a failure reason.
    """
    failure_reasons = {}
    metrics = {}

    # We have only two task: the main 'rt-app' task and our 'test_task'
    test_task = self.trace.analysis.rta.rtapp_tasks[-1]

    ue_df = self.trace.df_events('sched_util_est_task')
    ue_df = df_filter_task_ids(ue_df, [test_task])
    ua_df = self.trace.analysis.load_tracking.df_tasks_signal('util')
    ua_df = df_filter_task_ids(ua_df, [test_task])

    failures = []
    for phase in self.trace.analysis.rta.task_phase_windows(test_task):
        phase_df = ue_df[phase.start:phase.end]
        area_enqueued = series_integrate(phase_df.util_est_enqueued)
        area_ewma = series_integrate(phase_df.util_est_ewma)

        phase_df = ua_df[phase.start:phase.end]
        area_util = series_integrate(phase_df.util)

        metrics[phase.id] = PhaseStats(phase.start, phase.end,
                                       area_util, area_enqueued, area_ewma)

        phase_name = "phase {}".format(phase.id)
        if area_enqueued < area_util:
            failure_reasons[phase_name] = 'Enqueued smaller then Util Average'
            failures.append(phase.start)
            continue

        # Running on FastRamp kernels:
        if self.fast_ramp:

            # STABLE, DOWN and UP:
            if area_ewma < area_enqueued:
                failure_reasons[phase_name] = 'NO_FAST_RAMP: EWMA smaller then Enqueued'
                failures.append(phase.start)
                continue

        # Running on (legacy) non FastRamp kernels:
        else:

            # STABLE: ewma ramping up
            if phase.id == 0 and area_ewma > area_enqueued:
                failure_reasons[phase_name] = 'FAST_RAMP(STABLE): EWMA bigger then Enqueued'
                failures.append(phase.start)
                continue

            # DOWN: ewma ramping down
            if 0 < phase.id < 5 and area_ewma < area_enqueued:
                failure_reasons[phase_name] = 'FAST_RAMP(DOWN): EWMA smaller then Enqueued'
                failures.append(phase.start)
                continue

            # UP: ewma ramping up
            if phase.id > 4 and area_ewma > area_enqueued:
                failure_reasons[phase_name] = 'FAST_RAMP(UP): EWMA bigger then Enqueued'
                failures.append(phase.start)
                continue

    # The test passes when NO failure was recorded. Passing the dict itself
    # would invert the verdict.
    bundle = ResultBundle.from_bool(not failure_reasons)
    bundle.add_metric("fast ramp", self.fast_ramp)
    bundle.add_metric("phases stats", metrics)
    if not failure_reasons:
        return bundle

    # Plot signals to support debugging analysis
    self._plot_signals(test_task, 'areas', failures)
    bundle.add_metric("failure reasons", failure_reasons)

    return bundle
def test_activations(self) -> ResultBundle:
    """
    Test signals are properly "aggregated" at enqueue/dequeue time.

    On fast-ramp systems, `enqueued` is expected to be always smaller than
    `ewma`.

    On non fast-ramp systems, the `enqueued` is expected to be smaller than
    `ewma` in ramp-down phases, or bigger in ramp-up phases.

    Those conditions are checked on a single execution of a task which has
    three main behaviours:

      * STABLE: periodic big task running for a relatively long period to
        ensure `util` saturation.
      * DOWN: periodic ramp-down task, to slowly decay `util`
      * UP: periodic ramp-up task, to slowly increase `util`
    """
    metrics = {}
    task = self.rtapp_task_ids_map['test'][0]

    # Get list of task's activations
    states = self.trace.ana.tasks.df_task_states(task)
    is_activation = (
        (states.curr_state == TaskState.TASK_WAKING) &
        (states.next_state == TaskState.TASK_ACTIVE)
    )
    activations = states[is_activation].index

    # Check task signals at each activation
    signals = self.trace.df_event('sched_util_est_se')
    signals = df_filter_task_ids(signals, [task])

    def describe(template, util, enq, ewma):
        # Fill the placeholders of an issue template with labelled values
        return template.format(
            util=f'util={util}',
            enq=f'enqueued={enq}',
            ewma=f'ewma={ewma}',
        )

    # Legacy (non FastRamp) rules: (phase id prefix, violation predicate,
    # issue template). Only the first matching prefix is considered.
    legacy_rules = (
        # ewma stable
        ('test/stable', lambda enq, ewma: enq < ewma,
            'stable: {enq} smaller than {ewma}'),
        # ewma ramping down
        ('test/ramp_down', lambda enq, ewma: enq > ewma,
            'ramp down: {enq} bigger than {ewma}'),
        # ewma ramping up
        ('test/ramp_up', lambda enq, ewma: enq < ewma,
            'ramp up: {enq} smaller than {ewma}'),
    )

    for idx, activation in enumerate(activations):
        # Get the value of signals at their first update after the activation
        row = df_window(signals, (activation, None), method='post').iloc[0]

        # It can happen that the first updated after the activation is
        # actually in the next phase, in which case we need to check the
        # util values against the right phase
        activation = row.name

        # If we are outside a phase, ignore the activation
        try:
            phase = self.trace.ana.rta.task_phase_at(
                task, activation, wlgen_profile=self.rtapp_profile)
        except KeyError:
            continue

        util = row['util']
        enq = row['enqueued']
        ewma = row['ewma']

        issue = None

        # UtilEst is not updated when within 1% of previous activation
        if 1.01 * enq < util:
            issue = describe('{enq} smaller than {util}', util, enq, ewma)

        # Running on FastRamp kernels: ewma stable, down and up
        elif self.fast_ramp:
            if enq > ewma:
                issue = describe('{enq} bigger than {ewma}', util, enq, ewma)

        # Running on (legacy) non FastRamp kernels:
        else:
            # Activations in phases not created by the test are not recorded
            if not phase.properties['meta']['from_test']:
                continue

            for prefix, violated, template in legacy_rules:
                if phase.id.startswith(prefix):
                    if violated(enq, ewma):
                        issue = describe(template, util, enq, ewma)
                    break

        metrics[idx] = ActivationSignals(activation, util, enq, ewma, issue)

    failures = [
        (idx, stats)
        for idx, stats in metrics.items()
        if stats.issue
    ]

    bundle = ResultBundle.from_bool(not failures)
    bundle.add_metric("failures", sorted(idx for idx, _ in failures))
    bundle.add_metric("activations", metrics)

    failures_time = [stats.time for _, stats in failures]
    self._plot_signals(task, 'activations', failures_time)
    return bundle
def test_means(self) -> ResultBundle:
    """
    Test signals are properly "dominated".

    The mean of `enqueued` is expected to be always greater or equal than
    the mean of `util`, since `util` is subject to decays while `enqueued`
    is not.

    On fast-ramp systems, the `ewma` signal is never smaller than the
    `enqueued`, thus its mean is expected to be bigger.

    On non fast-ramp systems instead, the `ewma` is expected to be smaller
    than `enqueued` in ramp-up phases, or bigger in ramp-down phases.

    Those conditions are checked on a single execution of a task which has
    three main behaviours:

      * STABLE: periodic big task running for a relatively long period to
        ensure `util` saturation.
      * DOWN: periodic ramp-down task, to slowly decay `util`
      * UP: periodic ramp-up task, to slowly increase `util`
    """
    metrics = {}
    task = self.rtapp_task_ids_map['test'][0]

    ue_df = self.trace.df_event('sched_util_est_se')
    ue_df = df_filter_task_ids(ue_df, [task])
    ua_df = self.trace.ana.load_tracking.df_task_signal(task, 'util')

    def describe(template, util, enq, ewma):
        # Fill the placeholders of an issue template with labelled values
        return template.format(
            util=f'util={util}',
            enq=f'enqueued={enq}',
            ewma=f'ewma={ewma}',
        )

    # Legacy (non FastRamp) rules: (phase id prefix, violation predicate,
    # issue template). Only the first matching prefix is considered.
    legacy_rules = (
        # STABLE: ewma ramping up
        ('test/stable', lambda ewma, enq: ewma > enq,
            'fast ramp, stable: {ewma} bigger than {enq}'),
        # DOWN: ewma ramping down
        ('test/ramp_down', lambda ewma, enq: ewma < enq,
            'fast ramp, down: {ewma} smaller than {enq}'),
        # UP: ewma ramping up
        ('test/ramp_up', lambda ewma, enq: ewma > enq,
            'fast ramp, up: {ewma} bigger than {enq}'),
    )

    phases = self.trace.ana.rta.task_phase_windows(
        task, wlgen_profile=self.rtapp_profile)
    for phase in phases:
        if not phase.properties['meta']['from_test']:
            continue

        # Both dataframes are refitted to the boundaries of the phase
        window = (phase.start, phase.end)

        ue_phase_df = df_refit_index(ue_df, window=window)
        mean_enqueued = series_mean(ue_phase_df['enqueued'])
        mean_ewma = series_mean(ue_phase_df['ewma'])

        ua_phase_df = df_refit_index(ua_df, window=window)
        mean_util = series_mean(ua_phase_df['util'])

        issue = None
        if mean_enqueued < mean_util:
            issue = describe(
                '{enq} smaller than {util}',
                mean_util, mean_enqueued, mean_ewma)

        # Running on FastRamp kernels: STABLE, DOWN and UP
        elif self.fast_ramp:
            if mean_ewma < mean_enqueued:
                issue = describe(
                    'no fast ramp: {ewma} smaller than {enq}',
                    mean_util, mean_enqueued, mean_ewma)

        # Running on (legacy) non FastRamp kernels:
        else:
            for prefix, violated, template in legacy_rules:
                if phase.id.startswith(prefix):
                    if violated(mean_ewma, mean_enqueued):
                        issue = describe(
                            template, mean_util, mean_enqueued, mean_ewma)
                    break

        metrics[phase.id] = PhaseStats(
            phase.start, phase.end, mean_util, mean_enqueued, mean_ewma, issue)

    failures = [
        (phase_id, stat)
        for phase_id, stat in metrics.items()
        if stat.issue
    ]

    # Plot signals to support debugging analysis
    self._plot_signals(task, 'means', sorted(stat.start for _, stat in failures))

    bundle = ResultBundle.from_bool(not failures)
    bundle.add_metric("fast ramp", self.fast_ramp)
    bundle.add_metric("phases", metrics)
    bundle.add_metric("failures", sorted(phase_id for phase_id, _ in failures))
    return bundle
def get_simulated_pelt(self, task, signal_name):
    """
    Simulate a PELT signal for a given task.

    :param task: task to look for in the trace.
    :type task: int or str or tuple(int, str)

    :param signal_name: Name of the PELT signal to simulate.
    :type signal_name: str

    :return: A :class:`pandas.DataFrame` with a ``simulated`` column
        containing the simulated signal and an ``error`` column (trace value
        minus simulated value), along with the column of the signal as found
        in the trace. Rows containing NaN are dropped.

    :raises CannotCreateError: if the PELT clock (``update_time`` column) is
        not available and one of the CPUs of the workload phases has a
        capacity different from ``UTIL_SCALE``.
    """
    logger = self.get_logger()
    trace = self.trace
    task = trace.get_task_id(task)
    # NOTE(review): ``cpus`` is not used below; the call is kept in case it
    # is relied upon to fail early - confirm before removing.
    cpus = trace.analysis.tasks.cpus_of_tasks([task])

    df_activation = trace.analysis.tasks.df_task_activation(
        task,
        # Util only takes into account times where the task is actually
        # executing
        preempted_value=0,
    )
    df = trace.analysis.load_tracking.df_tasks_signal(signal_name)
    df = df_filter_task_ids(df, [task])
    # Shallow copy so that the new columns are added to our own dataframe
    df = df.copy(deep=False)

    # Ignore the first activation, as its signals are incorrect
    df_activation = df_activation.iloc[2:]

    # Make sure the activation df does not start before the dataframe of
    # signal values, otherwise we cannot provide a sensible init value
    df_activation = df_activation[df.index[0]:]

    # Get the initial signal value matching the first activation we will
    # care about
    init_iloc = df.index.get_loc(df_activation.index[0], method='ffill')
    init = df[signal_name].iloc[init_iloc]

    try:
        # PELT clock in nanoseconds
        clock = df['update_time'] * 1e-9
    except KeyError:
        if any(
            self.plat_info['cpu-capacities']['rtapp'][cpu] != UTIL_SCALE
            for phase in self.wlgen_task.phases
            for cpu in phase.cpus
        ):
            raise CannotCreateError(
                'PELT time scaling can only be simulated when the PELT clock is available from the trace'
            )

        logger.warning(
            'PELT clock is not available, ftrace timestamp will be used at the expense of accuracy'
        )
        clock = None

    df['simulated'] = simulate_pelt(df_activation['active'], index=df.index, init=init, clock=clock)

    # Since load is now CPU invariant in recent kernel versions, we don't
    # rescale it back. To match the old behavior, that line is
    # needed:
    #  df['simulated'] /= self.plat_info['cpu-capacities']['rtapp'][cpu] / UTIL_SCALE
    kernel_version = self.plat_info['kernel']['version']
    if (
        signal_name == 'load'
        and kernel_version.parts[:2] < (5, 1)
    ):
        # ``logger`` is already the logger object: it must not be called
        logger.warning(
            f'Load signal is assumed to be CPU invariant, which is true for recent mainline kernels, but may be wrong for {kernel_version}'
        )

    df['error'] = df[signal_name] - df['simulated']
    df = df.dropna()
    return df
def test_noisy_tasks(self, noise_threshold_pct=None, noise_threshold_ms=None):
    """
    Test that no non-rtapp ("noisy") task ran for longer than the specified
    thresholds

    :param noise_threshold_pct: The maximum allowed runtime for noisy tasks
        in percentage of the total rt-app execution time
    :type noise_threshold_pct: float

    :param noise_threshold_ms: The maximum allowed runtime for noisy tasks
        in ms
    :type noise_threshold_ms: float

    :raises ValueError: if both thresholds are ``None``.

    If both are specified, the smallest threshold (in seconds) will be used.
    """
    if noise_threshold_pct is None and noise_threshold_ms is None:
        raise ValueError('Both "{}" and "{}" cannot be None'.format(
            "noise_threshold_pct", "noise_threshold_ms"))

    # No task can run longer than the recorded duration
    threshold_s = self.trace.time_range

    if noise_threshold_pct is not None:
        threshold_s = noise_threshold_pct * self.trace.time_range / 100

    if noise_threshold_ms is not None:
        # Convert milliseconds to seconds before comparing with threshold_s
        threshold_s = min(threshold_s, noise_threshold_ms * 1e-3)

    df = self.trace.analysis.tasks.df_tasks_runtime()

    # We don't want to account the test tasks
    ignored_ids = list(map(self.trace.get_task_id, self.rtapp_tasks))

    def compute_duration_pct(row):
        return row.runtime * 100 / self.trace.time_range

    df["runtime_pct"] = df.apply(compute_duration_pct, axis=1)
    df['pid'] = df.index

    # Figure out which PIDs to exclude from the thresholds
    for key, threshold in self.NOISE_ACCOUNTING_THRESHOLDS.items():
        # Find out which task(s) this threshold is about
        if isinstance(key, str):
            # String keys are regular expressions matched against the comm
            comms = [comm for comm in df.comm.values if re.match(key, comm)]
            task_ids = [self.trace.get_task_id(comm) for comm in comms]
        else:
            # Use update=False to let None fields propagate, as they are
            # used to indicate a "dont care" value
            task_ids = [self.trace.get_task_id(key, update=False)]

        # For those tasks, check the threshold
        ignored_ids.extend(
            task_id
            for task_id in task_ids
            if df_filter_task_ids(df, [task_id]).iloc[0].runtime_pct <= threshold
        )

    self.get_logger().info("Ignored PIDs for noise contribution: {}".format(
        ", ".join(map(str, ignored_ids))
    ))

    # Filter out unwanted tasks (rt-app tasks + thresholds)
    df_noise = df_filter_task_ids(df, ignored_ids, invert=True)

    if df_noise.empty:
        return ResultBundle.from_bool(True)

    # NOTE(review): the first row is reported as the noisiest task, which
    # assumes df_tasks_runtime() sorts by decreasing runtime - confirm.
    pid = df_noise.index[0]
    comm = df_noise.comm.values[0]
    duration_s = df_noise.runtime.values[0]
    duration_pct = duration_s * 100 / self.trace.time_range

    res = ResultBundle.from_bool(duration_s < threshold_s)
    metric = {
        "pid": pid,
        "comm": comm,
        "duration (abs)": TestMetric(duration_s, "s"),
        "duration (rel)": TestMetric(duration_pct, "%")
    }
    res.add_metric("noisiest task", metric)

    return res
def _df_tasks_states(self, tasks=None, return_one_df=False):
    """
    Compute tasks states for all tasks.

    :param tasks: If specified, states of these tasks only will be yielded.
        The :class:`lisa.trace.TaskID` must have a ``pid`` field specified,
        since the task state is per-PID.
    :type tasks: list(lisa.trace.TaskID) or list(int)

    :param return_one_df: If ``True``, a single dataframe is returned with
        new extra columns. If ``False``, a generator is returned that
        yields tuples of ``(TaskID, task_df)``. Each ``task_df`` contains
        the new columns.
    :type return_one_df: bool

    :returns: See ``return_one_df``. The new columns are ``next_state`` and
        ``delta``.
    """
    ######################################################
    # A) Assemble the sched_switch and sched_wakeup events
    ######################################################

    wk_df = self.trace.df_event('sched_wakeup')
    sw_df = self.trace.df_event('sched_switch')

    # sched_wakeup_new events are optional
    try:
        wkn_df = self.trace.df_event('sched_wakeup_new')
    except MissingTraceEventError:
        pass
    else:
        wk_df = pd.concat([wk_df, wkn_df])

    wk_df = wk_df[["pid", "comm", "target_cpu", "__cpu"]].copy(deep=False)
    wk_df["curr_state"] = TaskState.TASK_WAKING

    # Each sched_switch event gives one row for the task switched out and
    # one row for the task switched in
    prev_sw_df = sw_df[["__cpu", "prev_pid", "prev_state", "prev_comm"]].copy()
    next_sw_df = sw_df[["__cpu", "next_pid", "next_comm"]].copy()

    prev_sw_df.rename(
        columns={
            "prev_pid": "pid",
            "prev_state": "curr_state",
            "prev_comm": "comm",
        },
        inplace=True
    )

    next_sw_df["curr_state"] = TaskState.TASK_ACTIVE
    next_sw_df.rename(columns={'next_pid': 'pid', 'next_comm': 'comm'}, inplace=True)

    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the
    # equivalent call.
    all_sw_df = pd.concat([prev_sw_df, next_sw_df], sort=False)

    # Integer values are prefered here, otherwise the whole column
    # is converted to float64
    all_sw_df['target_cpu'] = -1

    df = pd.concat([all_sw_df, wk_df], sort=False)
    df.sort_index(inplace=True)
    df.rename(columns={'__cpu': 'cpu'}, inplace=True)

    # Restrict the set of data we will process to a given set of tasks
    if tasks is not None:
        def resolve_task(task):
            """
            Get a TaskID for each task, and only update existing TaskID if
            they lack a PID field, since that's what we care about in that
            function.
            """
            try:
                do_update = task.pid is None
            except AttributeError:
                do_update = False

            return self.trace.get_task_id(task, update=do_update)

        tasks = list(map(resolve_task, tasks))
        df = df_filter_task_ids(df, tasks)

    # Return a unique dataframe with new columns added
    if return_one_df:
        df.sort_index(inplace=True)
        df.index.name = 'Time'
        df.reset_index(inplace=True)

        # Since sched_switch is split in two df (next and prev), we end up with
        # duplicated indices. Avoid that by incrementing them by the minimum
        # amount possible.
        df = df_update_duplicates(df, col='Time', inplace=True)

        grouped = df.groupby('pid', observed=True, sort=False)
        new_columns = dict(
            next_state=grouped['curr_state'].shift(-1, fill_value=TaskState.TASK_UNKNOWN),
            # GroupBy.transform() will run the function on each group, and
            # concatenate the resulting series to create a new column.
            # Note: We actually need transform() to chain 2 operations on
            # the group, otherwise the first operation returns a final
            # Series, and the 2nd is not applied on groups
            delta=grouped['Time'].transform(lambda time: time.diff().shift(-1)),
        )
        df = df.assign(**new_columns)
        df.set_index('Time', inplace=True)

        return df

    # Return a generator yielding (TaskID, task_df) tuples
    else:
        def make_pid_df(pid_df):
            # Even though the initial dataframe contains duplicated indices due to
            # using both prev_pid and next_pid in sched_switch event, we should
            # never end up with prev_pid == next_pid, so task-specific dataframes
            # are expected to be free from duplicated timestamps.
            # assert not df.index.duplicated().any()

            # Copy the df to add new columns
            pid_df = pid_df.copy(deep=False)

            # For each PID, add the time it spent in each state
            pid_df['delta'] = pid_df.index.to_series().diff().shift(-1)
            pid_df['next_state'] = pid_df['curr_state'].shift(-1, fill_value=TaskState.TASK_UNKNOWN)
            return pid_df

        signals = df_split_signals(df, ['pid'])
        return (
            (TaskID(pid=col['pid'], comm=None), make_pid_df(pid_df))
            for col, pid_df in signals
        )
def test_activations(self) -> ResultBundle:
    """
    Test signals are properly "aggregated" at enqueue/dequeue time.

    On fast-ramp systems, `enqueued` is expected to be always smaller than
    `ewma`.

    On non fast-ramp systems, the `enqueued` is expected to be smaller than
    `ewma` in ramp-down phases, or bigger in ramp-up phases.

    Those conditions are checked on a single execution of a task which has
    three main behaviours:

      * STABLE: periodic big task running for a relatively long period to
        ensure `util` saturation.
      * DOWN: periodic ramp-down task, to slowly decay `util`
      * UP: periodic ramp-up task, to slowly increase `util`
    """
    metrics = {}
    task = self.rtapp_task_ids_map['test'][0]

    # Get list of task's activations
    df = self.trace.analysis.tasks.df_task_states(task)
    activations = df[
        (df.curr_state == TaskState.TASK_WAKING) &
        (df.next_state == TaskState.TASK_ACTIVE)
    ].index

    # Check task signals at each activation
    df = self.trace.df_event('sched_util_est_se')
    df = df_filter_task_ids(df, [task])

    for idx, activation in enumerate(activations):

        # Get the value of signals at their first update after the activation
        row = df_window(df, (activation, None), method='post').iloc[0]
        util = row['util']
        enq = row['enqueued']
        ewma = row['ewma']

        def make_issue(msg):
            return msg.format(
                util='util={}'.format(util),
                enq='enqueued={}'.format(enq),
                ewma='ewma={}'.format(ewma),
            )

        issue = None

        # UtilEst is not updated when within 1% of previous activation
        if 1.01 * enq < util:
            issue = make_issue('{enq} smaller than {util}')

        # Running on FastRamp kernels:
        elif self.fast_ramp:

            # ewma stable, down and up
            if enq > ewma:
                issue = make_issue('{enq} bigger than {ewma}')

        # Running on (legacy) non FastRamp kernels:
        else:

            phase = self.trace.analysis.rta.task_phase_at(task, activation)

            # TODO: remove that once we have named phases to skip the buffer phase
            if phase.id == 0:
                continue

            # The phase is selected first and its condition checked second.
            # Otherwise a stable-phase activation with enq > ewma would fall
            # through to the ramp-down check and be reported as a failure.

            # ewma stable
            if phase.id == 1:
                if enq < ewma:
                    issue = make_issue('stable: {enq} smaller than {ewma}')

            # ewma ramping down
            elif phase.id <= 5:
                if enq > ewma:
                    issue = make_issue('ramp down: {enq} bigger than {ewma}')

            # ewma ramping up
            else:
                if enq < ewma:
                    issue = make_issue('ramp up: {enq} smaller than {ewma}')

        metrics[idx] = ActivationSignals(activation, util, enq, ewma, issue)

    failures = [
        (idx, activation_signals)
        for idx, activation_signals in metrics.items()
        if activation_signals.issue
    ]

    bundle = ResultBundle.from_bool(not failures)
    bundle.add_metric("failures", sorted(idx for idx, activation in failures))
    bundle.add_metric("activations", metrics)

    failures_time = [activation.time for idx, activation in failures]
    self._plot_signals(task, 'activations', failures_time)
    return bundle
wload.run()
ftrace_coll.get_trace(trace_path)
trace = Trace(trace_path, target.plat_info, events=["sched_switch"])

# sched_switch columns:
# __comm __pid __cpu __line prev_comm prev_pid prev_prio prev_state next_comm next_pid next_prio
df = trace.df_events('sched_switch')[['next_pid', 'next_comm', '__cpu']]


def analize_task_migration(task_id, ddf):
    # ``ddf`` holds the first switch-in of the task on each CPU it used;
    # its first two rows describe the first migration.
    first_seen = ddf.index
    cpu_values = ddf['__cpu'].values

    start = first_seen[0]
    # Look up to one second past the migration, without going beyond the
    # end of the sched_switch dataframe
    stop = min(first_seen[1] + 1.0, df.index[-1])
    start_cpu, stop_cpu = cpu_values[0], cpu_values[1]

    # Everything that was switched in on the source CPU during that window
    window_df = df[start:stop]
    on_start_cpu = window_df[window_df['__cpu'] == start_cpu]

    print("Task {} migrated from CPU {} to CPU {}\n".format(
        task_id, start_cpu, stop_cpu))
    print(on_start_cpu.to_string(max_cols=64) + "\n")


for task in tasks:
    task_id = trace.get_task_id(task, update=False)
    _df = df_filter_task_ids(df, [task_id], pid_col='next_pid', comm_col='next_comm')
    # Keep only the first switch-in on each distinct CPU
    ddf = _df.drop_duplicates(subset='__cpu', keep='first', inplace=False)
    print("****************** sched_switch {} ********************\n {} \n".
          format(task, ddf.to_string(max_cols=64)))
    # More than one distinct CPU means the task migrated at least once
    if len(ddf.index) > 1:
        analize_task_migration(task_id, ddf)