def permutation_tests():
  """Runs permutation tests comparing each pair of algorithms."""
  # Evaluate metrics that only have one value per training run.
  for metric in p.metrics_no_timeframes:
    for algo1 in p.algos:
      for algo2 in p.algos:
        data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                p.n_runs_per_experiment)
        stats_runner = stats.StatsRunner(data, metric, None,
                                         p.n_random_samples, p.pvals_dir,
                                         p.metric_values_dir_permuted)
        stats_runner.compare_algorithms(algo1, algo2, timeframe=None)

  # Evaluate metrics computed at different points along each training run.
  for metric in p.metrics_with_timeframes:
    for algo1 in p.algos:
      for algo2 in p.algos:
        for timeframe in p.timeframes:
          data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                  p.n_runs_per_experiment)
          stats_runner = stats.StatsRunner(
              data, metric, p.n_timeframes, p.n_random_samples, p.pvals_dir,
              p.metric_values_dir_permuted)
          stats_runner.compare_algorithms(algo1, algo2, timeframe)
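# Note: `p` above is assumed to be a module-level parameters/config object
# defined elsewhere (not shown here). The sketch below is illustrative only:
# it lists the attributes these driver functions rely on, with placeholder
# values, as a hypothetical example of what `p` could look like.
import types


def _example_params():
  return types.SimpleNamespace(
      algos=['algoA', 'algoB'],                   # algorithms to compare
      tasks=['task1', 'task2'],                   # tasks each algorithm ran on
      n_runs_per_experiment=3,                    # training runs per (algo, task)
      metrics_no_timeframes=['IqrAcrossRuns'],    # one value per training run
      metrics_with_timeframes=['IqrWithinRuns'],  # evaluated once per timeframe
      timeframes=[0, 1, 2],
      n_timeframes=3,
      n_random_samples=1000,                      # permutations / bootstrap draws
      metric_values_dir='/tmp/metric_values',
      metric_values_dir_permuted='/tmp/metric_values_permuted',
      pvals_dir='/tmp/pvals',
  )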
def test_resample_metric_results(self, metric_results, result_dims, algo_ind):
  stats_runner = stats.StatsRunner(data=self.dd,
                                   metric='IqrAcrossRuns',
                                   n_timeframes=3)
  stats_runner.result_dims = result_dims
  metric_results = np.array(metric_results)
  resampled = stats_runner._resample_metric_results(metric_results, algo_ind)

  # The resampled subarrays should be drawn from the original subarrays.
  n_task = metric_results.shape[1]
  for itask in range(n_task):
    algo_task_results = metric_results[algo_ind, itask]
    for run_value in resampled[algo_ind][itask]:
      self.assertIn(run_value, algo_task_results)

  # The resampled array should be unchanged for all other algorithms.
  n_algo = metric_results.shape[0]
  other_algo_inds = list(set(range(n_algo)) - {algo_ind})
  resampled_other_algos = resampled[other_algo_inds]
  original_other_algos = metric_results[other_algo_inds]
  np.testing.assert_array_equal(resampled_other_algos, original_other_algos)

  if metric_results.shape[2] == 1:
    # With only one run per task, resampling cannot change the array.
    np.testing.assert_array_equal(resampled, metric_results)
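# For context, a minimal sketch of the behavior the test above asserts,
# assuming _resample_metric_results draws the run axis with replacement for
# the chosen algorithm and leaves every other algorithm untouched. The
# function name and signature here are illustrative, not the library's own.
import numpy as np


def _resample_runs_with_replacement(metric_results, algo_ind, seed=None):
  """Resamples the run axis of metric_results[algo_ind] with replacement.

  `metric_results` is assumed to have shape (n_algo, n_task, n_run) or
  (n_algo, n_task, n_run, n_eval_points).
  """
  rng = np.random.default_rng(seed)
  resampled = np.copy(metric_results)
  n_task, n_run = metric_results.shape[1], metric_results.shape[2]
  for itask in range(n_task):
    run_inds = rng.integers(0, n_run, size=n_run)  # run indices, with replacement
    resampled[algo_ind, itask] = metric_results[algo_ind, itask][run_inds]
  return resampled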
def make_plots(self, metric):
  """Makes all plots for a given metric.

  Args:
    metric: String name of the metric.
  """
  plot_utils.paper_figure_configs()

  # Create a metric-specific StatsRunner object.
  stats_runner = stats.StatsRunner(self.data_def, metric, self.n_timeframes)

  result_dims = stats_runner.result_dims
  if result_dims == 'ATRP':
    # Within-runs metric with eval points.
    self._make_plots_with_eval_points(metric, stats_runner)
  elif result_dims == 'ATR':
    # Within-runs metric without eval points (one value per run).
    self._make_plots_no_eval_points(metric, stats_runner)
  elif result_dims == 'ATP':
    # Across-runs metric with eval points.
    self._make_plots_with_eval_points(metric, stats_runner)
  else:
    raise ValueError('plotting not implemented for result_dims: %s' %
                     result_dims)
def bootstrap_confidence_intervals():
  """Computes bootstrap confidence intervals for each algorithm."""
  # Metrics that only have one value per training run.
  for metric in p.metrics_no_timeframes:
    for algo in p.algos:
      data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                              p.n_runs_per_experiment)
      stats_runner = stats.StatsRunner(data, metric, None,
                                       p.n_random_samples, p.pvals_dir,
                                       p.metric_values_dir_permuted)
      stats_runner.bootstrap_confidence_interval(algo, timeframe=None)

  # Metrics computed at different points along each training run.
  for metric in p.metrics_with_timeframes:
    for algo in p.algos:
      for timeframe in p.timeframes:
        data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                p.n_runs_per_experiment)
        stats_runner = stats.StatsRunner(data, metric, p.n_timeframes,
                                         p.n_random_samples, p.pvals_dir,
                                         p.metric_values_dir_permuted)
        stats_runner.bootstrap_confidence_interval(algo, timeframe)
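# For context, the generic percentile-bootstrap idea that the call above is
# named after. This is not StatsRunner.bootstrap_confidence_interval itself;
# the function and variable names below are illustrative.
import numpy as np


def percentile_bootstrap_ci(values, n_boot=1000, alpha=0.05, seed=0):
  """Returns a (lower, upper) bootstrap CI for the mean of `values`."""
  rng = np.random.default_rng(seed)
  values = np.asarray(values)
  boot_means = np.empty(n_boot)
  for i in range(n_boot):
    sample = rng.choice(values, size=values.size, replace=True)  # resample with replacement
    boot_means[i] = sample.mean()
  lower, upper = np.percentile(boot_means,
                               [100 * alpha / 2, 100 * (1 - alpha / 2)])
  return lower, upper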
def test_rank_per_task(self, result_dims, bigger_is_better, expected_result):
  results_arrays = {
      'AT': [[3, 1], [2, -2], [4, 7], [0, 9]],
      'ATP': [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]],
  }
  results_array = np.array(results_arrays[result_dims])

  stats_runner = stats.StatsRunner(data=None, metric='IqrAcrossRuns')
  stats_runner.result_dims = result_dims
  stats_runner.bigger_is_better = bigger_is_better

  ranks = stats_runner.rank_per_task(results_array)
  np.testing.assert_array_equal(ranks, expected_result)
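# For reference, the per-task ranking idea the test exercises, written out in
# generic form under the assumption that rank 1 denotes the best algorithm on
# a task; this is an illustrative helper, not the library's rank_per_task.
import numpy as np
from scipy.stats import rankdata


def rank_algorithms_per_task(results_array, bigger_is_better):
  """Ranks algorithms (axis 0) within each task (axis 1) of an 'AT' array."""
  results_array = np.asarray(results_array, dtype=float)
  signed = -results_array if bigger_is_better else results_array
  return rankdata(signed, axis=0)

# Example with the 'AT' array above and bigger_is_better=True:
#   rank_algorithms_per_task([[3, 1], [2, -2], [4, 7], [0, 9]], True)
#   -> [[2., 3.], [3., 4.], [1., 2.], [4., 1.]]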
def test_get_timeframe_points(self, metric, timeframe, expected):
  stats_runner = stats.StatsRunner(data=self.dd,
                                   metric=metric,
                                   n_timeframes=3)
  timeframe_points = stats_runner.get_timeframe_points(timeframe)
  np.testing.assert_array_equal(list(timeframe_points), expected)