def test_benchmark_rank_by_median():
    """Fuzzers on the libxml benchmark are ranked by their median coverage."""
    experiment_df = create_experiment_data()
    libxml_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot = data_utils.get_benchmark_snapshot(libxml_df)

    actual_ranking = data_utils.benchmark_rank_by_median(snapshot)
    # Medians from the example data: afl -> 1100, libfuzzer -> 700.
    expected_ranking = pd.Series(index=['afl', 'libfuzzer'], data=[1100, 700])
    assert actual_ranking.equals(expected_ranking)
def test_fuzzers_with_not_enough_samples():
    """A fuzzer missing a trial is reported as not having enough samples."""
    experiment_df = create_experiment_data()
    # Drop one of the afl/libxml trials (trial id 5).
    remaining_df = experiment_df[experiment_df.trial_id != 5]
    libxml_df = remaining_df[remaining_df.benchmark == 'libxml']
    snapshot = data_utils.get_benchmark_snapshot(libxml_df)

    under_sampled = data_utils.get_fuzzers_with_not_enough_samples(snapshot)
    assert under_sampled == ['afl']
def test_benchmark_rank_by_stat_test_wins():
    """With two trials per fuzzer neither fuzzer wins any stat test."""
    experiment_df = create_experiment_data()
    libxml_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot = data_utils.get_benchmark_snapshot(libxml_df)

    actual_ranking = data_utils.benchmark_rank_by_stat_test_wins(snapshot)
    expected_ranking = pd.Series(index=['libfuzzer', 'afl'], data=[0, 0])

    # Win counts are tied, so compare the two series index-aligned.
    assert actual_ranking.sort_index().equals(expected_ranking.sort_index())
def coverage_growth_plot(self, benchmark_df, axes=None):
    """Draws coverage growth plot on given |axes|.

    The fuzzer labels will be in the order of their mean coverage at the
    snapshot time (typically, the end of experiment).

    Args:
        benchmark_df: DataFrame restricted to a single benchmark; must have
            'benchmark', 'fuzzer', 'time' and 'edges_covered' columns.
        axes: Optional matplotlib Axes to draw on; a new one is created by
            seaborn when None.
    """
    # This plot only makes sense for one benchmark at a time.
    benchmark_names = benchmark_df.benchmark.unique()
    assert len(benchmark_names) == 1, 'Not a single benchmark data!'
    benchmark_snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    # The snapshot contains a single timestamp shared by all trials.
    snapshot_time = benchmark_snapshot_df.time.unique()[0]
    # Legend/hue order: fuzzers ranked by mean coverage at the snapshot.
    fuzzer_order = data_utils.benchmark_rank_by_mean(
        benchmark_snapshot_df).index
    axes = sns.lineplot(
        y='edges_covered',
        x='time',
        hue='fuzzer',
        hue_order=fuzzer_order,
        data=benchmark_df[benchmark_df.time <= snapshot_time],
        # Skip the (slow) bootstrapped confidence interval in quick mode.
        ci=None if self._quick else 95,
        palette=self._fuzzer_colors,
        ax=axes)
    axes.set_title(_formatted_title(benchmark_snapshot_df))
    # Indicate the snapshot time with a big red vertical line.
    axes.axvline(x=snapshot_time, color='r')
    # Move legend outside of the plot.
    axes.legend(bbox_to_anchor=(1.00, 1),
                borderaxespad=0,
                loc='upper left',
                frameon=False)
    axes.set(ylabel='Edge coverage')
    axes.set(xlabel='Time (hour:minute)')
    if self._logscale:
        axes.set_xscale('log')
        # Log-spaced ticks between the first measurement and the snapshot.
        ticks = np.logspace(
            # Start from the time of the first measurement.
            np.log10(experiment_utils.DEFAULT_SNAPSHOT_SECONDS),
            np.log10(snapshot_time + 1),  # Include tick at end time.
            _DEFAULT_TICKS_COUNT)
    else:
        # Evenly spaced ticks over the same interval.
        ticks = np.arange(
            experiment_utils.DEFAULT_SNAPSHOT_SECONDS,
            snapshot_time + 1,  # Include tick at end time.
            snapshot_time / _DEFAULT_TICKS_COUNT)
    axes.set_xticks(ticks)
    # Tick values are seconds; render them as hour:minute labels.
    axes.set_xticklabels([_formatted_hour_min(t) for t in ticks])
    sns.despine(ax=axes, trim=True)
def test_benchmark_summary():
    """The per-fuzzer summary statistics match the example libxml data."""
    expected_summary = pd.DataFrame({
        'fuzzer': ['afl', 'libfuzzer'],
        'time': [9, 9],
        'count': [2, 2],
        'min': [1000, 600],
        'median': [1100, 700],
        'max': [1200, 800]
    }).set_index(['fuzzer', 'time']).astype(float)

    experiment_df = create_experiment_data()
    libxml_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot = data_utils.get_benchmark_snapshot(libxml_df)
    actual_summary = data_utils.benchmark_summary(snapshot)

    # Only compare the statistics the expected frame spells out.
    compared_columns = ['count', 'min', 'median', 'max']
    assert actual_summary[compared_columns].equals(expected_summary)
def test_benchmark_snapshot():
    """Tests that the snapshot data contains only the latest timestamp for
    all trials, in case all trials have the same lengths."""
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    # Select the columns and reset the index in one expression: calling
    # reset_index(inplace=True) on the column-selection result mutates an
    # intermediate copy and triggers pandas' SettingWithCopyWarning.
    timestamps_per_trial = snapshot_df[['trial_id',
                                        'time']].reset_index(drop=True)
    # The latest timestamp is 9 in the example data.
    expected_timestamps_per_trial = pd.DataFrame([{
        'trial_id': trial,
        'time': 9
    } for trial in range(4, 8)])
    assert timestamps_per_trial.equals(expected_timestamps_per_trial)
def crash_plot(self, benchmark_df, axes=None, logscale=False):
    """Draws crash plot.

    Plots the unique-crash count over time for each fuzzer on a single
    benchmark, up to the benchmark snapshot time.

    Args:
        benchmark_df: DataFrame restricted to a single benchmark.
        axes: Optional matplotlib Axes to draw on; created by seaborn
            when None.
        logscale: Force a log-scaled time axis even when the instance-wide
            self._logscale flag is off.
    """
    # This plot only makes sense for one benchmark at a time.
    benchmark_names = benchmark_df.benchmark.unique()
    assert len(benchmark_names) == 1, 'Not a single benchmark data!'
    benchmark_snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    # The snapshot contains a single timestamp shared by all trials.
    snapshot_time = benchmark_snapshot_df.time.unique()[0]
    # NOTE(review): 'snaphot' is the helper's actual (misspelled) name in
    # data_utils; kept as-is to match the API.
    crash_df = data_utils.get_crash_snaphot(benchmark_df)
    axes = sns.lineplot(y='crashes',
                        x='time',
                        hue='fuzzer',
                        data=crash_df[crash_df.time <= snapshot_time],
                        # Skip the (slow) confidence interval in quick mode.
                        ci=None if self._quick else 95,
                        palette=self._fuzzer_colors,
                        ax=axes)
    axes.set_title(_formatted_title(benchmark_snapshot_df))
    # Indicate the snapshot time with a big red vertical line.
    axes.axvline(x=snapshot_time, color='r')
    # Move legend outside of the plot.
    axes.legend(bbox_to_anchor=(1.00, 1),
                borderaxespad=0,
                loc='upper left',
                frameon=False)
    axes.set(ylabel='Unique crashes')
    axes.set(xlabel='Time (hour:minute)')
    if self._logscale or logscale:
        axes.set_xscale('log')
        # Log-spaced ticks between the first measurement and the snapshot.
        ticks = np.logspace(
            # Start from the time of the first measurement.
            np.log10(experiment_utils.DEFAULT_SNAPSHOT_SECONDS),
            np.log10(snapshot_time + 1),  # Include tick at end time.
            _DEFAULT_TICKS_COUNT)
    else:
        # Evenly spaced ticks over the same interval.
        ticks = np.arange(
            experiment_utils.DEFAULT_SNAPSHOT_SECONDS,
            snapshot_time + 1,  # Include tick at end time.
            snapshot_time / _DEFAULT_TICKS_COUNT)
    axes.set_xticks(ticks)
    # Tick values are seconds; render them as hour:minute labels.
    axes.set_xticklabels([_formatted_hour_min(t) for t in ticks])
    sns.despine(ax=axes, trim=True)
def test_benchmark_snapshot_complete(threshold):
    """Tests that the snapshot data contains only the latest timestamp for
    all trials, in case all trials have the same lengths.

    This should happen independently of the used |threshold|.
    """
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df, threshold)
    # Select the columns and reset the index in one expression: calling
    # reset_index(inplace=True) on the column-selection result mutates an
    # intermediate copy and triggers pandas' SettingWithCopyWarning.
    timestamps_per_trial = snapshot_df[['trial_id',
                                        'time']].reset_index(drop=True)
    # The latest timestamp is 9 in the example data.
    libxml_trial_ids = range(4, 8)
    expected_timestamps_per_trial = pd.DataFrame([{
        'trial_id': trial,
        'time': 9
    } for trial in libxml_trial_ids])
    assert timestamps_per_trial.equals(expected_timestamps_per_trial)
def test_benchmark_snapshot_incomplete(threshold, expected_snapshot_time,
                                       expected_trials_left):
    """Tests that the snapshot data created from an incomplete benchmark data
    (with some early terminating trials) contains the right trial snapshots
    with the right timestamp according to the given |threshold|.

    The function under test snapshots the benchmark data at the latest time
    where |threshold| fraction of the trials are still running. This means
    that with lower |threshold| the snapshot will be made later in time, but
    also more trials will be thrown out.
    """
    experiment_df = create_experiment_data(incomplete=True)
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df, threshold)
    # Select the columns and reset the index in one expression: calling
    # reset_index(inplace=True) on the column-selection result mutates an
    # intermediate copy and triggers pandas' SettingWithCopyWarning.
    timestamps_per_trial = snapshot_df[['trial_id',
                                        'time']].reset_index(drop=True)
    trials_left = len(timestamps_per_trial.index)
    assert trials_left == expected_trials_left
    # All trial snapshots should have the same expected timestamp.
    assert (timestamps_per_trial['time'] == expected_snapshot_time).all()
def _benchmark_snapshot_df(self):
    """Returns the snapshot of this instance's benchmark data."""
    snapshot = data_utils.get_benchmark_snapshot(self._benchmark_df)
    return snapshot
def coverage_growth_plot(self,
                         benchmark_df,
                         axes=None,
                         logscale=False,
                         bugs=False):
    """Draws edge (or bug) coverage growth plot on given |axes|.

    The fuzzer labels will be in the order of their mean coverage at the
    snapshot time (typically, the end of experiment).

    Args:
        benchmark_df: DataFrame restricted to a single benchmark.
        axes: Optional matplotlib Axes to draw on; created by seaborn
            when None.
        logscale: Force a log-scaled time axis even when self._logscale
            is off.
        bugs: Plot 'bugs_covered' instead of 'edges_covered'.
    """
    # NOTE(review): 'datafame' is the helper's actual (misspelled) name;
    # kept as-is to match its definition.
    self._common_datafame_checks(benchmark_df)

    # Choose which coverage metric this plot tracks.
    column_of_interest = 'bugs_covered' if bugs else 'edges_covered'

    benchmark_snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    # The snapshot contains a single timestamp shared by all trials.
    snapshot_time = benchmark_snapshot_df.time.unique()[0]
    # Legend/hue order: fuzzers ranked by mean of the chosen metric.
    fuzzer_order = data_utils.benchmark_rank_by_mean(
        benchmark_snapshot_df, key=column_of_interest).index

    axes = sns.lineplot(
        y=column_of_interest,
        x='time',
        hue='fuzzer',
        hue_order=fuzzer_order,
        data=benchmark_df[benchmark_df.time <= snapshot_time],
        # No confidence interval for bug plots or in quick mode.
        ci=None if bugs or self._quick else 95,
        # Aggregate trials with the median rather than the default mean.
        estimator=np.median,
        palette=self._fuzzer_colors,
        style='fuzzer',
        dashes=False,
        markers=self._fuzzer_markers,
        ax=axes)
    axes.set_title(_formatted_title(benchmark_snapshot_df))

    # Indicate the snapshot time with a big red vertical line.
    axes.axvline(x=snapshot_time, color='r')

    # Move legend outside of the plot.
    axes.legend(bbox_to_anchor=(1.00, 1),
                borderaxespad=0,
                loc='upper left',
                frameon=False)
    axes.set(ylabel='Bug coverage' if bugs else 'Code region coverage')
    axes.set(xlabel='Time (hour:minute)')

    if self._logscale or logscale:
        axes.set_xscale('log')
        # Log-spaced ticks between the first measurement and the snapshot.
        ticks = np.logspace(
            # Start from the time of the first measurement.
            np.log10(experiment_utils.DEFAULT_SNAPSHOT_SECONDS),
            np.log10(snapshot_time + 1),  # Include tick at end time.
            _DEFAULT_TICKS_COUNT)
    else:
        # Evenly spaced ticks over the same interval.
        ticks = np.arange(
            experiment_utils.DEFAULT_SNAPSHOT_SECONDS,
            snapshot_time + 1,  # Include tick at end time.
            snapshot_time / _DEFAULT_TICKS_COUNT)

    axes.set_xticks(ticks)
    # Tick values are seconds; render them as hour:minute labels.
    axes.set_xticklabels([_formatted_hour_min(t) for t in ticks])

    sns.despine(ax=axes, trim=True)