def df_cpu_idle_state_residency(self, cpu):
    """
    Compute the total time a given CPU spent in each of its idle states.

    :param cpu: CPU ID
    :type cpu: int

    :returns: a :class:`pandas.DataFrame` with:

      * Idle states as index (index name: ``idle_state``)
      * A ``time`` column (the summed time spent in the idle state)
    """
    # Refit the index onto the trace window before computing deltas, so
    # that the time-based sum of state deltas is accurate.
    windowed_df = df_refit_index(
        self.df_cpu_idle(cpu),
        window=self.trace.window,
    )
    with_delta_df = df_add_delta(windowed_df)

    # Accumulate the "delta" column separately for each idle state
    time_per_state = {}
    for signal, state_df in df_split_signals(with_delta_df, ['state']):
        time_per_state[signal['state']] = state_df['delta'].sum()

    residency_df = pd.DataFrame.from_dict(
        time_per_state,
        orient='index',
        columns=['time'],
    )
    residency_df.index.name = 'idle_state'
    return residency_df
def df_phases(self, task):
    """
    Get phases actual start times and durations

    :param task: the rt-app task to filter for
    :type task: int or str or lisa.trace.TaskID

    :returns: A :class:`pandas.DataFrame` with index representing the
      start time of a phase and these column:

        * ``phase``: the phase number.
        * ``duration``: the measured phase duration.

      An empty :class:`pandas.DataFrame` is returned when no loop record
      is available for the task.
    """
    def get_duration(phase, df):
        # A phase spans from its first to its last loop record
        start = df.index[0]
        end = df.index[-1]
        duration = end - start
        return (start, {'phase': phase, 'duration': duration})

    loops_df = self.df_rtapp_loop(task)
    durations = sorted(
        get_duration(cols['phase'], df)
        for cols, df in df_split_signals(loops_df, ['phase'])
    )

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError: handle the "no phase found" case explicitly.
    if not durations:
        return pd.DataFrame()

    index, columns = zip(*durations)
    return pd.DataFrame(columns, index=index)
def test_df_split_signals(self):
    """
    Check that ``df_split_signals`` yields one sub-dataframe per distinct
    value of the signal column, each holding the matching rows only.
    """
    index = [float(i) for i in range(1, 6)]
    # "bar" alternates 0, 1, 0, 1, 0: three rows for 0 and two rows for 1
    data = [(i, i % 2) for i in range(len(index))]
    df = pd.DataFrame(index=index, data=data, columns=["foo", "bar"])

    expected_len = {0: 3, 1: 2}
    seen = set()
    for ident, subdf in du.df_split_signals(df, ["bar"]):
        bar = ident["bar"]
        seen.add(bar)
        assert len(subdf) == expected_len[bar]

    # Without this check the test would pass vacuously if the splitter
    # yielded nothing at all (or skipped one of the groups).
    assert seen == set(expected_len)
def df_phases(self, task):
    """
    Get phases actual start times and durations

    :param task: the rt-app task to filter for
    :type task: int or str or lisa.trace.TaskID

    :returns: A :class:`pandas.DataFrame` with index representing the
      start time of a phase and these column:

        * ``phase``: the phase number.
        * ``duration``: the measured phase duration.

      An empty :class:`pandas.DataFrame` is returned if no phase is left
      after partial loops have been discarded.
    """
    def strip_partial_loops(phase_df):
        # Trace windowing can cut the trace anywhere, so partial loop
        # records at either end are removed to avoid confusion.

        # Leading "end" event: tail of a loop started before the window
        if not phase_df.empty and phase_df['event'].iloc[0] == 'end':
            phase_df = phase_df.iloc[1:]

        # Trailing "start" event: head of a loop finishing after the window
        if not phase_df.empty and phase_df['event'].iloc[-1] == 'start':
            phase_df = phase_df.iloc[:-1]

        return phase_df

    loops_df = self.df_rtapp_loop(task)

    records = []
    for signal, phase_df in df_split_signals(loops_df, ['phase']):
        phase_df = strip_partial_loops(phase_df)
        if phase_df.empty:
            continue

        first_ts = phase_df.index[0]
        last_ts = phase_df.index[-1]
        records.append((
            first_ts,
            {'phase': signal['phase'], 'duration': last_ts - first_ts},
        ))

    if not records:
        return pd.DataFrame()

    # Order the phases by start time
    records.sort()
    starts, infos = zip(*records)
    return pd.DataFrame(infos, index=starts)
def plot_task_placement(self, task: TaskID):
    """
    Plot the CPU placement of the task

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :returns: an ``hv.Overlay`` made of one ``hv.Scatter`` per placement
      category (CPU capacity lower than, greater than or equal to the
      required capacity), plus the result of ``self._plot_overutilized()``.
    """
    task_id = self.trace.get_task_id(task, update=False)

    # Get all utilization update events
    df = self.df_task_signal(task_id, 'required_capacity').copy()
    cpu_capacities = self.trace.plat_info["cpu-capacities"]['orig']
    df['capacity'] = df['cpu'].map(cpu_capacities)
    nr_cpus = df['cpu'].max() + 1

    def add_placement(df, comp, comp_str):
        # Tag the rows matching the comparison with a human-readable label
        placement = f"CPU capacity {comp_str} required capacity"
        condition = comp(df['capacity'], df['required_capacity'])
        df.loc[condition, 'placement'] = placement

    add_placement(df, operator.lt, '<')
    add_placement(df, operator.gt, '>')
    add_placement(df, operator.eq, '==')

    def plot_placement(cols, placement_df):
        placement = cols['placement']
        # Only use the samples of this placement category. Using the whole
        # dataframe here would draw every sample under every label.
        series = placement_df["cpu"]
        series = series_refit_index(series, window=self.trace.window)
        series.name = 'cpu'
        return hv.Scatter(
            series,
            label=placement,
        ).options(
            marker='+',
            yticks=list(range(nr_cpus)),
        )

    scatters = list(
        itertools.starmap(
            plot_placement,
            df_split_signals(df, ['placement']),
        )
    )
    return hv.Overlay(
        scatters + [self._plot_overutilized()]
    ).options(
        title=f'Utilization vs placement of task {task}',
        ylabel='CPU',
    )
def plot_task_placement(self, task: TaskID, axis, local_fig):
    """
    Plot the CPU placement of the task

    :param task: The name or PID of the task, or a tuple ``(pid, comm)``
    :type task: str or int or tuple

    :param axis: Axis the series are plotted on (passed as ``ax`` to
      :meth:`pandas.Series.plot`).

    :param local_fig: If true, the title, grid and legend of ``axis`` are
      set by this method.
    """
    task_id = self.trace.get_task_id(task, update=False)

    # Get all utilization update events
    df = self.df_task_signal(task_id, 'required_capacity').copy()
    cpu_capacities = self.trace.plat_info["cpu-capacities"]['orig']
    df['capacity'] = df['cpu'].map(cpu_capacities)

    def add_placement(df, comp, comp_str):
        # Tag the rows matching the comparison with a human-readable label
        placement = "CPU capacity {} required capacity".format(comp_str)
        condition = comp(df['capacity'], df['required_capacity'])
        df.loc[condition, 'placement'] = placement

    add_placement(df, operator.lt, '<')
    add_placement(df, operator.gt, '>')
    add_placement(df, operator.eq, '==')

    for cols, placement_df in df_split_signals(df, ['placement']):
        placement = cols['placement']
        # Only use the samples of this placement category. Using the whole
        # dataframe here would draw every sample under every label.
        series = placement_df["cpu"]
        series = series_refit_index(series, window=self.trace.window)
        series.plot(ax=axis, style="+", label=placement)

    # Only overlay the overutilized status if the trace has the events
    # needed to compute it
    plot_overutilized = self.trace.analysis.status.plot_overutilized
    if self.trace.has_events(plot_overutilized.used_events):
        plot_overutilized(axis=axis)

    if local_fig:
        axis.set_title(
            'Utilization vs placement of task "{}"'.format(task))
        axis.grid(True)
        axis.legend()
def get_expected_cpu_util(self):
    """
    Get the per-phase average CPU utilization expected from the duty cycle
    of the tasks found in the trace.

    :returns: A dict of the shape {cpu : {phase_id : expected_util}}

    .. note:: This is more robust than just looking at the duty cycle in the
        task profile, since rtapp might not reproduce accurately the duty
        cycle it was asked.
    """
    cpu_capacities = self.plat_info['cpu-capacities']['rtapp']
    cpu_util = {}
    cpu_freqs = self.plat_info['freqs']

    try:
        freq_df = self.trace.analysis.frequency.df_cpus_frequency()
    except MissingTraceEventError:
        # No frequency events at all: every CPU is assumed to run at its
        # max frequency.
        cpus_rel_freq = None
    else:
        cpus_rel_freq = {
            # Frequency, normalized according to max frequency on that CPU
            cols['cpu']: df['frequency'] / max(cpu_freqs[cols['cpu']])
            for cols, df in df_split_signals(freq_df, ['cpu'])
        }

    for task in self.rtapp_task_ids:
        df = self.trace.analysis.tasks.df_task_activation(task)

        for row in self.trace.analysis.rta.df_phases(task).itertuples():
            phase = row.phase
            duration = row.duration
            start = row.Index
            end = start + duration
            # Ignore the first quarter of the util signal of each phase, since
            # it's impacted by the phase change, and util can be affected
            # (rtapp does some bookkeeping at the beginning of phases)
            # NOTE: this adjustment is currently disabled:
            # start += duration / 4

            # readjust the duration to take into account the modification of start
            duration = end - start
            window = (start, end)
            phase_df = df_window(df, window, clip_window=True)

            for cpu in self.cpus:
                # A CPU may have no entry at all if no frequency row exists
                # for it: treat that like missing data rather than raising
                # KeyError.
                if cpus_rel_freq is None:
                    cpu_rel_freq = None
                else:
                    cpu_rel_freq = cpus_rel_freq.get(cpu)

                if cpu_rel_freq is None:
                    rel_freq_mean = 1
                else:
                    phase_freq_series = df_window(
                        cpu_rel_freq, window=window, clip_window=True)
                    # We might not have frequency data at the beginning of
                    # the trace, or if not frequency transition happened at
                    # all.
                    if phase_freq_series.empty:
                        rel_freq_mean = 1
                    else:
                        # If we lack freq data at the beginning of the
                        # window, assume the frequency was right.
                        if phase_freq_series.index[0] > start:
                            phase_freq_series = pd.concat([
                                pd.Series([1.0], index=[start]),
                                phase_freq_series
                            ])

                        # Extend the frequency to the right so that the mean
                        # takes into account all the data we have
                        freq_window = (phase_freq_series.index[0], end)
                        rel_freq_mean = series_mean(
                            series_refit_index(
                                phase_freq_series, window=freq_window))

                cpu_phase_df = phase_df[phase_df['cpu'] == cpu].dropna()
                if cpu_phase_df.empty:
                    duty_cycle = 0
                    cpu_residency = 0
                else:
                    duty_cycle = series_mean(
                        df_refit_index(
                            cpu_phase_df['duty_cycle'], window=window))
                    cpu_residency = end - max(cpu_phase_df.index[0], start)

                phase_util = UTIL_SCALE * duty_cycle * (
                    cpu_capacities[cpu] / UTIL_SCALE)
                # Pro-rata with the time spent on that CPU, so we get
                # the correct average.
                phase_util *= cpu_residency / duration

                # We might not have run at max freq, e.g. because of
                # thermal capping, so take that into account
                phase_util *= rel_freq_mean

                # Contributions of all tasks are accumulated per CPU and
                # per phase
                cpu_util.setdefault(cpu, {}).setdefault(phase, 0)
                cpu_util[cpu][phase] += phase_util

    return cpu_util
def df_phases(self, task, wlgen_profile=None):
    """
    Get phases actual start times and durations

    :param task: the rt-app task to filter for
    :type task: int or str or lisa.trace.TaskID

    :param wlgen_profile: See :meth:`df_rtapp_loop`
    :type wlgen_profile: dict(str, lisa.wlgen.rta.RTAPhase) or None

    :returns: A :class:`pandas.DataFrame` with index representing the
      start time of a phase and these column:

        * ``phase``: the phase number or its name extracted from
          ``wlgen_profile``.
        * ``duration``: the measured phase duration.
        * ``properties``: the properties mapping of the phase extracted
          from ``wlgen_profile``.

      An empty :class:`pandas.DataFrame` is returned if no phase is left
      after partial loops have been discarded.
    """
    def trim_partial_loops(phase_df):
        # Trace windowing can cut the trace anywhere, so partial loop
        # records at either end are removed to avoid confusion.

        # Leading "end" event: tail of a loop started before the window
        if not phase_df.empty and phase_df['event'].iloc[0] == 'end':
            phase_df = phase_df.iloc[1:]

        # Trailing "start" event: head of a loop finishing after the window
        if not phase_df.empty and phase_df['event'].iloc[-1] == 'start':
            phase_df = phase_df.iloc[:-1]

        return phase_df

    def summarize(phase, phase_df):
        first_ts = phase_df.index[0]
        last_ts = phase_df.index[-1]

        # The properties are the same all along for a given phase, so the
        # first row is representative. Fall back to an empty mapping when
        # there is no such column.
        try:
            props = phase_df.iloc[0]['properties']
        except KeyError:
            props = {}

        summary = {
            'phase': phase,
            'duration': last_ts - first_ts,
            'properties': props,
        }
        return (first_ts, summary)

    loops_df = self.df_rtapp_loop(task, wlgen_profile=wlgen_profile)

    trimmed = [
        (signal['phase'], trim_partial_loops(phase_df))
        for signal, phase_df in df_split_signals(loops_df, ['phase'])
    ]

    # Summaries ordered by phase start time, ignoring emptied phases
    rows = sorted(
        summarize(phase, phase_df)
        for phase, phase_df in trimmed
        if not phase_df.empty
    )

    if not rows:
        return pd.DataFrame()

    starts, summaries = zip(*rows)
    return pd.DataFrame(summaries, index=starts)
def _df_tasks_states(self, tasks=None, return_one_df=False):
    """
    Compute tasks states for all tasks.

    :param tasks: If specified, states of these tasks only will be yielded.
        The :class:`lisa.trace.TaskID` must have a ``pid`` field specified,
        since the task state is per-PID.
    :type tasks: list(lisa.trace.TaskID) or list(int)

    :param return_one_df: If ``True``, a single dataframe is returned with
        new extra columns. If ``False``, a generator is returned that
        yields tuples of ``(TaskID, task_df)``. Each ``task_df`` contains
        the new columns.
    :type return_one_df: bool

    :returns: Depending on ``return_one_df``, either one dataframe or a
        generator of ``(TaskID, task_df)``. In both cases, the extra
        columns are ``delta`` (time until the next row of the same PID)
        and ``next_state`` (``curr_state`` of the next row of the same
        PID, ``TaskState.TASK_UNKNOWN`` for the last row).
    """
    ######################################################
    # A) Assemble the sched_switch and sched_wakeup events
    ######################################################

    wk_df = self.trace.df_event('sched_wakeup')
    sw_df = self.trace.df_event('sched_switch')

    # sched_wakeup_new is optional: use it only if the trace has it
    try:
        wkn_df = self.trace.df_event('sched_wakeup_new')
    except MissingTraceEventError:
        pass
    else:
        wk_df = pd.concat([wk_df, wkn_df])

    wk_df = wk_df[["pid", "comm", "target_cpu", "__cpu"]].copy(deep=False)
    wk_df["curr_state"] = TaskState.TASK_WAKING

    # Each sched_switch event gives one row for the task being switched
    # out (prev_*) and one row for the task being switched in (next_*)
    prev_sw_df = sw_df[["__cpu", "prev_pid", "prev_state", "prev_comm"]].copy()
    next_sw_df = sw_df[["__cpu", "next_pid", "next_comm"]].copy()

    prev_sw_df.rename(
        columns={
            "prev_pid": "pid",
            "prev_state": "curr_state",
            "prev_comm": "comm",
        },
        inplace=True
    )

    next_sw_df["curr_state"] = TaskState.TASK_ACTIVE
    next_sw_df.rename(
        columns={'next_pid': 'pid', 'next_comm': 'comm'},
        inplace=True
    )

    # DataFrame.append() was removed in pandas 2.0, pd.concat() is the
    # equivalent supported API.
    all_sw_df = pd.concat([prev_sw_df, next_sw_df], sort=False)

    # Integer values are prefered here, otherwise the whole column
    # is converted to float64
    all_sw_df['target_cpu'] = -1

    df = pd.concat([all_sw_df, wk_df], sort=False)
    df.sort_index(inplace=True)
    df.rename(columns={'__cpu': 'cpu'}, inplace=True)

    # Restrict the set of data we will process to a given set of tasks
    if tasks is not None:
        def resolve_task(task):
            """
            Get a TaskID for each task, and only update existing TaskID if
            they lack a PID field, since that's what we care about in that
            function.
            """
            try:
                do_update = task.pid is None
            except AttributeError:
                do_update = False

            return self.trace.get_task_id(task, update=do_update)

        tasks = list(map(resolve_task, tasks))
        df = df_filter_task_ids(df, tasks)

    # Return a unique dataframe with new columns added
    if return_one_df:
        df.sort_index(inplace=True)
        df.index.name = 'Time'
        df.reset_index(inplace=True)

        # Since sched_switch is split in two df (next and prev), we end up with
        # duplicated indices. Avoid that by incrementing them by the minimum
        # amount possible.
        df = df_update_duplicates(df, col='Time', inplace=True)

        grouped = df.groupby('pid', observed=True, sort=False)
        new_columns = dict(
            next_state=grouped['curr_state'].shift(
                -1, fill_value=TaskState.TASK_UNKNOWN),
            # GroupBy.transform() will run the function on each group, and
            # concatenate the resulting series to create a new column.
            # Note: We actually need transform() to chain 2 operations on
            # the group, otherwise the first operation returns a final
            # Series, and the 2nd is not applied on groups
            delta=grouped['Time'].transform(
                lambda time: time.diff().shift(-1)),
        )
        df = df.assign(**new_columns)
        df.set_index('Time', inplace=True)

        return df

    # Return a generator yielding (TaskID, task_df) tuples
    else:
        def make_pid_df(pid_df):
            # Even though the initial dataframe contains duplicated indices due to
            # using both prev_pid and next_pid in sched_switch event, we should
            # never end up with prev_pid == next_pid, so task-specific dataframes
            # are expected to be free from duplicated timestamps.
            # assert not df.index.duplicated().any()

            # Copy the df to add new columns
            pid_df = pid_df.copy(deep=False)

            # For each PID, add the time it spent in each state
            pid_df['delta'] = pid_df.index.to_series().diff().shift(-1)
            pid_df['next_state'] = pid_df['curr_state'].shift(
                -1, fill_value=TaskState.TASK_UNKNOWN)
            return pid_df

        signals = df_split_signals(df, ['pid'])
        return (
            (TaskID(pid=col['pid'], comm=None), make_pid_df(pid_df))
            for col, pid_df in signals
        )
def _df_group_apply(self, df, func, melt=False, index_cols=None):
    """
    Apply ``func`` on subsets of the dataframe and return the concatenated
    result.

    :param df: Dataframe in database format (meaningless index, tag and
        value columns).
    :type df: pandas.DataFrame

    :param func: Callable called with 3 parameters:

        * ``ref``: Reference subgroup dataframe for comparison purposes.
        * ``df``: Dataframe of the subgroup, to compare to ``ref``.
        * ``group``: Dictionary ``dict(column_name, value)`` identifying
          the ``df`` subgroup.
    :type func: collections.abc.Callable

    :param melt: If ``True``, extra columns added by the callback in the
        return :class:`pandas.DataFrame` will be melted, i.e. they will be
        turned into row with the column name being copied to the stat
        column.
    :type melt: bool

    :param index_cols: Columns to aggregate on that will be used for
        indexing the sub-dataframes, instead of the default ``agg_cols``.
    :type index_cols: list(str) or None
    """
    ref_group = FrozenDict(self._ref_group)

    # All the columns that are not involved in the group itself except the
    # value will be used as index, so that the reference group and other
    # groups can be joined meaningfully on the index for comparison
    # purposes.
    if index_cols is None:
        index_cols = self._agg_cols
    index_cols = self._restrict_cols(index_cols, df)
    sub_group_cols = self._restrict_cols(self._sub_group_cols, df)

    def process_subgroup(df, group, subgroup):
        subgroup = FrozenDict(subgroup)
        # Subgroups without a counterpart in the reference are skipped
        try:
            ref = subref[subgroup]
        except KeyError:
            return None

        group = {**group, **subgroup}
        to_remove = group.keys()

        # Make sure that the columns/index levels relative to the group are
        # removed, since they are useless because they have a constant value
        def strip_group(data):
            data = data.drop(columns=self._restrict_cols(to_remove, data))
            try:
                drop_level = data.index.droplevel
            except AttributeError:
                pass
            else:
                levels = sorted(set(data.index.names) & set(to_remove))
                data.index = drop_level(levels)
            return data

        df = strip_group(df)
        ref = strip_group(ref)

        result = func(ref, df, group)

        # Only assign-back subgroup columns if they have not been set by the
        # callback directly.
        missing = group.keys() - set(result.columns)
        result = result.assign(**{
            col: val
            for col, val in group.items()
            if col in missing
        })

        # Drop RangeIndex to avoid getting an "index" column that is
        # useless
        result.reset_index(
            drop=isinstance(result.index, pd.RangeIndex),
            inplace=True,
        )
        return result

    # Groups as asked by the user
    comparison_groups = {}
    for group, group_df in df_split_signals(df, ref_group.keys()):
        comparison_groups[FrozenDict(group)] = group_df.set_index(index_cols)

    # We elect a comparison reference and split it in subgroups
    ref = comparison_groups[ref_group]
    subref = {}
    for subgroup, subdf in df_split_signals(ref, sub_group_cols):
        subref[FrozenDict(subgroup)] = subdf

    # For each group, split it further in subgroups
    processed = []
    for group, group_df in comparison_groups.items():
        for subgroup, subdf in df_split_signals(group_df, sub_group_cols):
            processed.append(process_subgroup(subdf, group, subgroup))

    df = pd.concat(
        (item for item in processed if item is not None),
        ignore_index=True,
        copy=False,
    )

    if melt:
        df = self._melt(df)

    return df