def evaluate_metrics_on_bootstrapped_runs():
  """Evaluates metrics on bootstrapped runs, for across-run metrics only."""
  gin_bindings = [
      'eval_metrics.Evaluator.metrics = [@IqrAcrossRuns/singleton(), '
      '@LowerCVaROnAcross/singleton()]'
  ]
  n_bootstraps_per_worker = int(p.n_random_samples / p.n_worker)

  # Parse gin config.
  gin.parse_config_files_and_bindings([p.gin_file], gin_bindings)

  for algo in p.algos:
    for task in p.tasks:
      for i_worker in range(p.n_worker):
        # Get the subdirectories corresponding to each run.
        summary_path = os.path.join(p.data_dir, algo, task)
        run_dirs = eval_metrics.get_run_dirs(summary_path, 'train', p.runs)

        # Evaluate results.
        outfile_prefix = os.path.join(p.metric_values_dir_bootstrapped, algo,
                                      task) + '/'
        evaluator = eval_metrics.Evaluator(metrics=gin.REQUIRED)
        evaluator.write_metric_params(outfile_prefix)
        evaluator.evaluate_with_bootstraps(
            run_dirs=run_dirs,
            outfile_prefix=outfile_prefix,
            n_bootstraps=n_bootstraps_per_worker,
            bootstrap_start_idx=(n_bootstraps_per_worker * i_worker),
            random_seed=i_worker)

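# Illustrative only: a minimal sketch of the `p` parameter namespace that the
# pipeline functions in this file assume. The attribute names are the ones
# actually referenced in the functions; the concrete values and types below
# are hypothetical placeholders, not the library's defaults.
import types

p = types.SimpleNamespace(
    data_dir='/tmp/summaries',            # root of per-algo/task summary dirs
    gin_file='eval_metrics.gin',          # gin config defining the metrics
    algos=['algo_a', 'algo_b'],           # algorithms to evaluate
    tasks=['task_1', 'task_2'],           # tasks/environments
    runs=list(range(3)),                  # run selector for get_run_dirs (assumed format)
    n_random_samples=1000,                # total bootstraps/permutations
    n_worker=10,                          # workers sharing those samples
    metric_values_dir='/tmp/metrics',
    metric_values_dir_bootstrapped='/tmp/metrics_bootstrapped',
    metric_values_dir_permuted='/tmp/metrics_permuted',
)
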
def test_evaluate_with_permutations(self):
  evaluator = eval_metrics.Evaluator([metrics_online.StddevWithinRuns()])
  n_permutations = 3
  permutation_start_idx = 100
  random_seed = 50
  outfile_prefix = os.path.join(FLAGS.test_tmpdir,
                                'robustness_results_permuted_')
  results = evaluator.evaluate_with_permutations(
      self.run_dirs, self.run_dirs, outfile_prefix, n_permutations,
      permutation_start_idx, random_seed)

  # Check length of results.
  self.assertLen(results, n_permutations)

  # Check a single result.
  one_result = list(results.values())[0]['curves1']
  self.assertEqual(list(one_result.keys()), ['StddevWithinRuns'])
  self.assertTrue(np.greater(list(one_result.values()), 0.).all())

  # Check the output files.
  results_files = io_utils.paths_glob('%s*results.json' % outfile_prefix)
  self.assertLen(results_files, 1)

  # If run again with the same seed, the results should be the same.
  results_same = evaluator.evaluate_with_permutations(
      self.run_dirs, self.run_dirs, outfile_prefix, n_permutations,
      permutation_start_idx, random_seed)
  self._assert_results_same(results, results_same)

  # If run again with a different seed, the results should be different.
  results_different = evaluator.evaluate_with_permutations(
      self.run_dirs, self.run_dirs, outfile_prefix, n_permutations,
      permutation_start_idx, random_seed + 1)
  self._assert_results_different(results, results_different)

def evaluate_metrics_on_permuted_runs():
  """Evaluates metrics on permuted runs, for across-run metrics only."""
  gin_bindings = [
      ('eval_metrics.Evaluator.metrics = '
       '[@IqrAcrossRuns/singleton(), @LowerCVaROnAcross/singleton()]')
  ]
  n_permutations_per_worker = int(p.n_random_samples / p.n_worker)

  # Parse gin config.
  gin.parse_config_files_and_bindings([p.gin_file], gin_bindings)

  for algo1 in p.algos:
    for algo2 in p.algos:
      for task in p.tasks:
        for i_worker in range(p.n_worker):
          # Get the subdirectories corresponding to each run.
          summary_path_1 = os.path.join(p.data_dir, algo1, task)
          summary_path_2 = os.path.join(p.data_dir, algo2, task)
          run_dirs_1 = eval_metrics.get_run_dirs(summary_path_1, 'train',
                                                 p.runs)
          run_dirs_2 = eval_metrics.get_run_dirs(summary_path_2, 'train',
                                                 p.runs)

          # Evaluate the metrics.
          outfile_prefix = os.path.join(p.metric_values_dir_permuted,
                                        '%s_%s' % (algo1, algo2), task) + '/'
          evaluator = eval_metrics.Evaluator(metrics=gin.REQUIRED)
          evaluator.write_metric_params(outfile_prefix)
          evaluator.evaluate_with_permutations(
              run_dirs_1=run_dirs_1,
              run_dirs_2=run_dirs_2,
              outfile_prefix=outfile_prefix,
              n_permutations=n_permutations_per_worker,
              permutation_start_idx=(n_permutations_per_worker * i_worker),
              random_seed=i_worker)

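# Sanity check of the worker-sharding arithmetic used by both the bootstrap
# and permutation functions above, worked through with the illustrative
# numbers from the sketch near the top of this file (n_random_samples=1000,
# n_worker=10): worker i handles indices [100 * i, 100 * (i + 1)), so the
# shards tile the full range of sample indices exactly once.
n_per_worker = 1000 // 10
shards = [range(n_per_worker * i, n_per_worker * (i + 1)) for i in range(10)]
assert [idx for shard in shards for idx in shard] == list(range(1000))
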
def test_evaluate(self):
  evaluator = eval_metrics.Evaluator([
      metrics_online.StddevWithinRuns(),
      metrics_online.StddevWithinRuns()
  ])
  results = evaluator.evaluate(self.run_dirs)
  self.assertEqual(list(results.keys()), ['StddevWithinRuns'])
  self.assertTrue(np.greater(list(results.values()), 0.).all())

def test_compute_metrics(self):
  curves = [
      np.array([[-1, 0, 1], [1., 1., 1.]]),
      np.array([[-1, 0, 1, 2], [2., 3., 4., 5.]])
  ]
  evaluator = eval_metrics.Evaluator(
      [metrics_online.StddevAcrossRuns(eval_points=[0, 1], baseline=1)])
  results = evaluator.compute_metrics(curves)
  np.testing.assert_allclose(results['StddevAcrossRuns'],
                             [1.41421356237, 2.12132034356])

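# Where the expected values in test_compute_metrics come from, assuming
# StddevAcrossRuns takes the sample standard deviation (ddof=1) across runs at
# each eval point and divides by the baseline (1 here, so no rescaling). At
# eval point 0 the two curves take values 1. and 3.; at eval point 1 they take
# values 1. and 4.
import numpy as np

assert np.isclose(np.std([1., 3.], ddof=1), 1.41421356237)  # sqrt(2)
assert np.isclose(np.std([1., 4.], ddof=1), 2.12132034356)  # sqrt(4.5)
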
def test_evaluate_using_environment_steps(self):
  gin.bind_parameter('metrics_online.StddevWithinRuns.eval_points', [2001])
  metric_instances = [
      metrics_online.StddevWithinRuns(),
      metrics_online.StddevWithinRuns()
  ]
  evaluator = eval_metrics.Evaluator(
      metric_instances, timepoint_variable='Metrics/EnvironmentSteps')
  results = evaluator.evaluate(self.run_dirs)
  self.assertEqual(list(results.keys()), ['StddevWithinRuns'])
  self.assertTrue(np.greater(list(results.values()), 0.).all())

def evaluate_metrics():
  """Evaluates metrics specified in the gin config."""
  # Parse gin config.
  gin.parse_config_files_and_bindings([p.gin_file], [])

  for algo in p.algos:
    for task in p.tasks:
      # Get the subdirectories corresponding to each run.
      summary_path = os.path.join(p.data_dir, algo, task)
      run_dirs = eval_metrics.get_run_dirs(summary_path, 'train', p.runs)

      # Evaluate metrics.
      outfile_prefix = os.path.join(p.metric_values_dir, algo, task) + '/'
      evaluator = eval_metrics.Evaluator(metrics=gin.REQUIRED)
      evaluator.write_metric_params(outfile_prefix)
      evaluator.evaluate(run_dirs=run_dirs, outfile_prefix=outfile_prefix)

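# One possible content for `p.gin_file` when driving evaluate_metrics(),
# mirroring the binding strings used by the bootstrap/permutation functions
# above. The exact set of metrics is up to the experiment; this is only an
# illustration of the expected binding, not a prescribed config:
#
#   eval_metrics.Evaluator.metrics = [
#       @IqrAcrossRuns/singleton(),
#       @LowerCVaROnAcross/singleton(),
#   ]
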
def test_write_results(self):
  # Generate some results.
  curves = [
      np.array([[-1, 0, 1], [1., 1., 1.]]),
      np.array([[-1, 0, 1, 2], [2., 3., 4., 5.]])
  ]
  metric = metrics_online.StddevAcrossRuns(eval_points=[0, 1], baseline=1)
  evaluator = eval_metrics.Evaluator([metric])
  results = evaluator.compute_metrics(curves)
  outfile_prefix = os.path.join(flags.FLAGS.test_tmpdir, 'results_')
  params_path = evaluator.write_metric_params(outfile_prefix)
  results_path = evaluator.write_results(results, outfile_prefix)

  # Test write_results.
  with open(results_path, 'r') as outfile:
    results_loaded = outfile.readline()
  results_dict = json.loads(results_loaded)
  expected = {'StddevAcrossRuns': [1.41421356237, 2.12132034356]}
  self.assertEqual(results_dict.keys(), expected.keys())
  np.testing.assert_allclose(expected['StddevAcrossRuns'],
                             results_dict['StddevAcrossRuns'])

  # Test write_metric_params.
  with open(params_path, 'r') as outfile:
    params_loaded = outfile.readline()
  expected = json.dumps({
      'StddevAcrossRuns': {
          'eval_points': [0, 1],
          'lowpass_thresh': None,
          'baseline': 1,
          'window_size': None,
      }
  })
  self.assertJsonEqual(expected, params_loaded)

def test_window_empty(self):
  curves = [np.array([[0, 2], [2, 3]])]
  evaluator = eval_metrics.Evaluator([metrics_online.StddevAcrossRuns()])
  self.assertRaises(ValueError, evaluator.compute_metrics, curves)

def test_window_out_of_range(self):
  curves = [np.array([[0, 1], [1, 1]])]
  evaluator = eval_metrics.Evaluator([metrics_online.StddevAcrossRuns()])
  self.assertRaises(ValueError, evaluator.compute_metrics, curves)