def permutation_tests():
  """Evaluates permutation tests."""
  # Evaluate metrics that only have one value per training run.
  for metric in p.metrics_no_timeframes:
    for algo1 in p.algos:
      for algo2 in p.algos:
        data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                p.n_runs_per_experiment)
        stats_runner = stats.StatsRunner(data, metric, None,
                                         p.n_random_samples, p.pvals_dir,
                                         p.metric_values_dir_permuted)
        stats_runner.compare_algorithms(algo1, algo2, timeframe=None)

  # Evaluate metrics computed at different points along each training run.
  for metric in p.metrics_with_timeframes:
    for algo1 in p.algos:
      for algo2 in p.algos:
        for timeframe in p.timeframes:
          data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                  p.n_runs_per_experiment)
          stats_runner = stats.StatsRunner(
              data, metric, p.n_timeframes, p.n_random_samples, p.pvals_dir,
              p.metric_values_dir_permuted)
          stats_runner.compare_algorithms(algo1, algo2, timeframe)
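# Illustrative sketch only -- this is NOT the StatsRunner implementation of
# compare_algorithms(). It shows the basic permutation-test idea the pipeline
# relies on: pool the per-run metric values of two algorithms, repeatedly
# shuffle the pooled values into two groups, and count how often the shuffled
# difference is at least as extreme as the observed one. The function name and
# arguments below are hypothetical.
import numpy as np


def permutation_test_pvalue(metric_values_1, metric_values_2,
                            n_permutations=1000, seed=0):
  """Two-sided permutation test on the difference of means."""
  rng = np.random.default_rng(seed)
  values_1 = np.asarray(metric_values_1, dtype=float)
  values_2 = np.asarray(metric_values_2, dtype=float)
  observed = abs(values_1.mean() - values_2.mean())

  pooled = np.concatenate([values_1, values_2])
  n_1 = len(values_1)
  n_at_least_as_extreme = 0
  for _ in range(n_permutations):
    permuted = rng.permutation(pooled)
    diff = abs(permuted[:n_1].mean() - permuted[n_1:].mean())
    if diff >= observed:
      n_at_least_as_extreme += 1
  return n_at_least_as_extreme / n_permutations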
def setUp(self):
  super(StatsTest, self).setUp()
  results_dir = os.path.join(
      './', 'rl_reliability_metrics/analysis/test_data')
  self.dd = data_def.DataDef(
      results_dir,
      algorithms=['algoA', 'algoB', 'algoC'],
      tasks=['taskX', 'taskY'],
      n_runs_per_experiment=2)
def bootstrap_confidence_intervals():
  """Computes bootstrap confidence intervals for each algorithm."""
  # Evaluate metrics that only have one value per training run.
  for metric in p.metrics_no_timeframes:
    for algo in p.algos:
      data = data_def.DataDef(
          p.metric_values_dir, p.algos, p.tasks, p.n_runs_per_experiment)
      stats_runner = stats.StatsRunner(data, metric, None,
                                       p.n_random_samples, p.pvals_dir,
                                       p.metric_values_dir_permuted)
      stats_runner.bootstrap_confidence_interval(algo, timeframe=None)

  # Evaluate metrics computed at different points along each training run.
  for metric in p.metrics_with_timeframes:
    for algo in p.algos:
      for timeframe in p.timeframes:
        data = data_def.DataDef(
            p.metric_values_dir, p.algos, p.tasks, p.n_runs_per_experiment)
        stats_runner = stats.StatsRunner(data, metric, p.n_timeframes,
                                         p.n_random_samples, p.pvals_dir,
                                         p.metric_values_dir_permuted)
        stats_runner.bootstrap_confidence_interval(algo, timeframe)
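# Illustrative sketch only -- NOT the StatsRunner implementation of
# bootstrap_confidence_interval(). It shows the percentile-bootstrap idea:
# resample the per-run metric values with replacement, recompute the statistic
# on each resample, and take percentiles of the resampled statistics as the
# interval. The function name and arguments below are hypothetical.
import numpy as np


def bootstrap_ci(metric_values, n_resamples=1000, alpha=0.05, seed=0):
  """Percentile-bootstrap confidence interval for the mean metric value."""
  rng = np.random.default_rng(seed)
  values = np.asarray(metric_values, dtype=float)
  resampled_means = []
  for _ in range(n_resamples):
    resample = rng.choice(values, size=len(values), replace=True)
    resampled_means.append(resample.mean())
  lower = np.percentile(resampled_means, 100 * (alpha / 2))
  upper = np.percentile(resampled_means, 100 * (1 - alpha / 2))
  return lower, upper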
def test_load_empty_results(self):
  """Check for exception if loading with incorrect algorithm names."""
  results_dir = os.path.join(
      './', 'rl_reliability_metrics/analysis/test_data')
  algorithms = ['wrong1', 'wrong2', 'wrong3']
  tasks = ['taskX', 'taskY']
  with self.assertRaises(Exception):
    data_def.DataDef(results_dir, algorithms, tasks, n_runs_per_experiment=2)
def make_plots():
  """Makes plots."""
  dd = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                        p.n_runs_per_experiment)
  my_plotter = plotter.Plotter(
      data=dd,
      pvals_dir=p.pvals_dir,
      confidence_intervals_dir=p.confidence_intervals_dir,
      n_timeframes=p.n_timeframes,
      algorithms=p.algos,
      out_dir=p.plots_dir)
  for metric in p.metrics:
    my_plotter.make_plots(metric)
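# Hypothetical usage sketch, assuming `p` is a parameter/config object (e.g.
# populated from flags or gin) that carries the fields referenced above.
# Order matters: make_plots() reads the p-values and confidence intervals
# written by the two earlier steps.
if __name__ == '__main__':
  permutation_tests()
  bootstrap_confidence_intervals()
  make_plots()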
def test_create_datadef(self):
  results_dir = os.path.join(
      './', 'rl_reliability_metrics/analysis/test_data')
  algorithms = ['algoA', 'algoB', 'algoC']
  tasks = ['taskX', 'taskY']
  metrics = [
      'IqrWithinRuns', 'MedianPerfDuringTraining', 'IqrAcrossRuns'
  ]
  dd = data_def.DataDef(results_dir, algorithms, tasks,
                        n_runs_per_experiment=2)
  self.assertEqual(dd.algorithms, algorithms)
  self.assertEqual(dd.tasks, tasks)
  self.assertEqual(  # pylint: disable=g-generic-assert
      len(dd.results), len(dd.algorithms) * len(dd.tasks))
  self.assertCountEqual(dd.metrics, metrics)