def test_clip_average_benchmark_empty():
    """Scoring an empty result dict must not raise; one result then aggregates."""
    # Smoke check: aggregation over no results at all.
    scoring.benchmark_aggregate_score(benchmark, {})

    first_env = benchmark.tasks[0].env_id
    results = defaultdict(list)
    results[first_env].append(_benchmark_result_helper(benchmark, env_id=first_env))

    aggregate = scoring.benchmark_aggregate_score(benchmark, results)
    _assert_benchmark_score(aggregate, score=0.00005, num_envs_solved=0, summed_training_seconds=1.0, start_to_finish_seconds=1.0)
def test_total_reward_benchmark_empty():
    """Total-reward benchmark: empty results are tolerated; one result aggregates."""
    # Smoke check: aggregation over no results at all.
    scoring.benchmark_aggregate_score(reward_benchmark, {})

    first_env = reward_benchmark.tasks[0].env_id
    results = defaultdict(list)
    results[first_env].append(_benchmark_result_helper(reward_benchmark, env_id=first_env))

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.005, num_envs_solved=0, summed_training_seconds=1.0, start_to_finish_seconds=1.0)
def test_reward_per_time_benchmark_empty():
    """Reward-per-time benchmark: empty results are tolerated; one result aggregates."""
    # Smoke check: aggregation over no results at all.
    scoring.benchmark_aggregate_score(reward_per_time_benchmark, {})

    first_env = reward_per_time_benchmark.tasks[0].env_id
    results = defaultdict(list)
    results[first_env].append(
        _benchmark_result_helper(reward_per_time_benchmark, env_id=first_env, episode_lengths=[10]))

    aggregate = scoring.benchmark_aggregate_score(reward_per_time_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.0, num_envs_solved=0, summed_training_seconds=0.0, start_to_finish_seconds=0.0)
def test_total_reward_benchmark_simple():
    """One result per task aggregates to the expected totals."""
    results = defaultdict(list)
    for offset, task in enumerate(reward_benchmark.tasks):
        result = _benchmark_result_helper(reward_benchmark, env_id=task.env_id, timestamps=[offset + 2])
        results[task.env_id].append(result)

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.01, num_envs_solved=0, summed_training_seconds=3.0, start_to_finish_seconds=2.0)
def test_reward_per_time_benchmark_scoring():
    """Reward-per-time benchmark: one result per task aggregates to the expected totals."""
    results = defaultdict(list)
    for offset, task in enumerate(reward_per_time_benchmark.tasks):
        result = _benchmark_result_helper(reward_per_time_benchmark, env_id=task.env_id, timestamps=[offset + 2])
        results[task.env_id].append(result)

    aggregate = scoring.benchmark_aggregate_score(reward_per_time_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.0075, num_envs_solved=0, summed_training_seconds=3.0, summed_task_wall_time=3.0, start_to_finish_seconds=2.0)
def test_clip_average_benchmark_incomplete():
    """A result for only the first task still yields a well-formed aggregate."""
    first_env = benchmark.tasks[0].env_id
    results = defaultdict(list)
    results[first_env].append(
        _benchmark_result_helper(benchmark, env_id=first_env, timestamps=[2]))

    aggregate = scoring.benchmark_aggregate_score(benchmark, results)
    _assert_benchmark_score(aggregate, score=0.00005, num_envs_solved=0, summed_training_seconds=1.0, start_to_finish_seconds=1.0)
def test_clip_average_benchmark_extra():
    """An extra high-reward result on the last task changes the aggregate as expected."""
    results = defaultdict(list)
    for offset, task in enumerate(benchmark.tasks):
        result = _benchmark_result_helper(benchmark, env_id=task.env_id, timestamps=[offset + 2])
        results[task.env_id].append(result)

    # Append one more result, with a high reward, to the last task seen.
    last_env = benchmark.tasks[-1].env_id
    results[last_env].append(
        _benchmark_result_helper(benchmark, env_id=last_env, episode_rewards=[100], timestamps=[2]))

    aggregate = scoring.benchmark_aggregate_score(benchmark, results)
    _assert_benchmark_score(aggregate, score=0.0001, num_envs_solved=0, summed_training_seconds=3.0, summed_task_wall_time=3.0, start_to_finish_seconds=2.0)
def test_total_reward_benchmark_extra():
    """An extra high-reward result on the last task changes the aggregate as expected.

    NOTE: renamed from ``test_benchmark_extra`` — a later, different definition
    of ``test_benchmark_extra`` in this file shadowed this one at module level,
    so pytest never collected or ran it. The new name follows the
    ``test_total_reward_*`` convention used by the other reward_benchmark tests.
    """
    benchmark_results = defaultdict(list)
    for i, task in enumerate(reward_benchmark.tasks):
        env_id = task.env_id
        benchmark_results[env_id].append(
            _benchmark_result_helper(reward_benchmark, env_id=env_id, timestamps=[i + 2]))
    # add one more at the end with a high reward
    benchmark_results[env_id].append(
        _benchmark_result_helper(reward_benchmark, env_id=env_id, episode_rewards=[100], timestamps=[2]))

    scores = scoring.benchmark_aggregate_score(reward_benchmark, benchmark_results)
    _assert_benchmark_score(scores, score=0.01, num_envs_solved=0, summed_training_seconds=3.0, start_to_finish_seconds=2.0)
def test_total_reward_benchmark_eval_handling():
    """All episodes are counted, including evaluation ('e') episodes."""
    results = defaultdict(list)
    for offset, task in enumerate(reward_benchmark.tasks):
        evaluation = reward_benchmark.score_evaluation(
            task.env_id,
            data_sources=[0, 1, 1],
            initial_reset_timestamps=[1, 2],
            episode_lengths=[1, 1, 1],
            episode_rewards=[1, 2, 3],
            episode_types=['e', 't', 'e'],
            timestamps=[offset + 2, offset + 3, offset + 4],
        )
        results[task.env_id].append(evaluation)

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.02, num_envs_solved=0, summed_training_seconds=8.0, summed_task_wall_time=7.0, start_to_finish_seconds=4.0)
def test_total_reward_benchmark_solved():
    """Uniformly high rewards over many episodes mark every env as solved."""
    episode_count = 200
    results = defaultdict(list)
    for _, task in enumerate(reward_benchmark.tasks):
        evaluation = reward_benchmark.score_evaluation(
            task.env_id,
            data_sources=[0] * episode_count,
            initial_reset_timestamps=[1],
            episode_lengths=[1] * episode_count,
            episode_rewards=[1000] * episode_count,
            episode_types=['t'] * episode_count,
            timestamps=list(range(episode_count)),
        )
        results[task.env_id].append(evaluation)

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=1.0, num_envs_solved=len(reward_benchmark.tasks))
def test_benchmark_eval_handling():
    """All episodes are counted, including evaluation ('e') episodes.

    NOTE(review): this body is identical to test_total_reward_benchmark_eval_handling
    elsewhere in this file — consider deduplicating.
    """
    results = defaultdict(list)
    for offset, task in enumerate(reward_benchmark.tasks):
        evaluation = reward_benchmark.score_evaluation(
            task.env_id,
            data_sources=[0, 1, 1],
            initial_reset_timestamps=[1, 2],
            episode_lengths=[1, 1, 1],
            episode_rewards=[1, 2, 3],
            episode_types=['e', 't', 'e'],
            timestamps=[offset + 2, offset + 3, offset + 4],
        )
        results[task.env_id].append(evaluation)

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.02, num_envs_solved=0, summed_training_seconds=8.0, summed_task_wall_time=7.0, start_to_finish_seconds=4.0)
def test_total_reward_benchmark_solved():
    """Uniformly high rewards over many episodes mark every env as solved.

    NOTE(review): this is a byte-identical redefinition of the earlier
    test_total_reward_benchmark_solved in this file; the redefinition shadows
    the first copy at module level — one of the two should be deleted.
    """
    episode_count = 200
    results = defaultdict(list)
    for _, task in enumerate(reward_benchmark.tasks):
        evaluation = reward_benchmark.score_evaluation(
            task.env_id,
            data_sources=[0] * episode_count,
            initial_reset_timestamps=[1],
            episode_lengths=[1] * episode_count,
            episode_rewards=[1000] * episode_count,
            episode_types=['t'] * episode_count,
            timestamps=list(range(episode_count)),
        )
        results[task.env_id].append(evaluation)

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=1.0, num_envs_solved=len(reward_benchmark.tasks))
def test_benchmark_solved():
    """Long, high-reward episodes on every task mark all envs as solved."""
    episode_count = 200
    results = defaultdict(list)
    for _, task in enumerate(benchmark.tasks):
        evaluation = benchmark.score_evaluation(
            task.env_id,
            data_sources=[0],
            initial_reset_timestamps=[1],
            episode_lengths=[1000] * episode_count,
            episode_rewards=[1000] * episode_count,
            episode_types=['t'] * episode_count,
            timestamps=list(range(episode_count)),
        )
        results[task.env_id].append(evaluation)

    aggregate = scoring.benchmark_aggregate_score(benchmark, results)
    context = "scores={}".format(aggregate)
    assert np.all(np.isclose(aggregate['score'], 1.0)), "scores={}".format(aggregate)
    assert aggregate['num_envs_solved'] == len(benchmark.tasks), context
def test_benchmark_incomplete():
    """A result for only the first task still yields a well-formed aggregate.

    Consistency fix: use the shared ``_assert_benchmark_score`` helper (with the
    same expected values this test asserted by hand, and that
    ``test_clip_average_benchmark_incomplete`` passes for the identical
    scenario) instead of duplicating the assertion logic inline.
    """
    benchmark_results = defaultdict(list)
    env_id = benchmark.tasks[0].env_id
    benchmark_results[env_id].append(
        benchmark.score_evaluation(
            env_id,
            data_sources=[0],
            initial_reset_timestamps=[1],
            episode_lengths=[1],
            episode_rewards=[1],
            episode_types=['t'],
            timestamps=[2],
        ))
    scores = scoring.benchmark_aggregate_score(benchmark, benchmark_results)
    _assert_benchmark_score(scores, score=0.00005, num_envs_solved=0, summed_training_seconds=1.0, start_to_finish_seconds=1.0)
def test_benchmark_extra():
    """An extra high-reward result on the last task changes the aggregate as expected."""
    results = defaultdict(list)
    for offset, task in enumerate(benchmark.tasks):
        evaluation = benchmark.score_evaluation(
            task.env_id,
            data_sources=[0],
            initial_reset_timestamps=[1],
            episode_lengths=[1],
            episode_rewards=[1],
            episode_types=['t'],
            timestamps=[offset + 2],
        )
        results[task.env_id].append(evaluation)

    # Append one more result, with a high reward, to the last task seen.
    last_env = benchmark.tasks[-1].env_id
    results[last_env].append(
        benchmark.score_evaluation(
            last_env,
            data_sources=[0],
            initial_reset_timestamps=[1],
            episode_lengths=[1],
            episode_rewards=[100],
            episode_types=['t'],
            timestamps=[2],
        ))

    aggregate = scoring.benchmark_aggregate_score(benchmark, results)
    context = "scores={}".format(aggregate)
    assert np.all(np.isclose(aggregate['score'], 0.0001)), "scores={}".format(aggregate)
    assert np.all(np.isclose(aggregate['summed_training_seconds'], 3.0)), context
    assert np.all(np.isclose(aggregate['start_to_finish_seconds'], 2.0)), context
    assert aggregate['num_envs_solved'] == 0, context
def test_total_reward_benchmark_incomplete():
    """A result for only the first task still yields a well-formed aggregate."""
    first_env = reward_benchmark.tasks[0].env_id
    results = defaultdict(list)
    results[first_env].append(
        _benchmark_result_helper(reward_benchmark, env_id=first_env, timestamps=[2]))

    aggregate = scoring.benchmark_aggregate_score(reward_benchmark, results)
    _assert_benchmark_score(aggregate, score=0.005, num_envs_solved=0, summed_training_seconds=1.0, start_to_finish_seconds=1.0)