def benchmark_halfcheetah_v2(self):
  """Benchmarks MuJoCo HalfCheetah to 3M steps."""
  self.setUp()
  output_dir = self._get_test_output_dir('halfcheetah_v2')
  start_time_sec = time.time()
  # TODO(b/172017027): Use halfcheetah gin config.
  strategy = tf.distribute.get_strategy()
  sac_train_eval.train_eval(
      output_dir,
      strategy,
      initial_collect_steps=10000,
      env_name='HalfCheetah-v2',
      eval_interval=50000,
      num_iterations=3000000)
  wall_time_sec = time.time() - start_time_sec
  event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
  values, _ = utils.extract_event_log_values(event_file,
                                             'Metrics/AverageReturn')
  # Min/Max ranges are very large to only hard fail if very broken. The system
  # monitoring the results owns looking for anomalies.
  metric_1m = self.build_metric(
      'average_return_at_env_step1000000',
      values[1000000],
      min_value=800,
      max_value=16000)
  metric_3m = self.build_metric(
      'average_return_at_env_step3000000',
      values[3000000],
      min_value=12000,
      max_value=16500)
  self.report_benchmark(
      wall_time=wall_time_sec, metrics=[metric_1m, metric_3m], extras={})
def benchmark_pong_v0_at_3M(self):
  """Benchmarks to 3M Env steps.

  This is below the 12.5M train steps (50M frames) run by the paper to
  converge. Running 12.5M at the current throughput would take more than a
  week. 1-2 days is the max duration for a remotely usable test. 3M only
  confirms we have not regressed at 3M and does not guarantee convergence to
  21 at 12.5M.
  """
  self._setup()
  output_dir = self._get_test_output_dir('pongAt3M')
  start_time_sec = time.time()
  dqn_train_eval_atari.train_eval(
      output_dir, eval_interval=10000, num_iterations=750000)
  wall_time_sec = time.time() - start_time_sec
  event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
  values, _ = utils.extract_event_log_values(
      event_file, 'AverageReturn/EnvironmentSteps')
  print('Values:{}'.format(values))
  # Min/Max ranges are very large to only hard fail if very broken. The system
  # monitoring the results owns looking for anomalies.
  metric_3m = self.build_metric(
      'average_return_at_env_step3000000',
      values[3000000],
      min_value=-14,
      max_value=21)
  self.report_benchmark(
      wall_time=wall_time_sec, metrics=[metric_3m], extras={})
def test_extract_value_1m_only_start_at_1k(self):
  """Tests extracting data starting at step 1k."""
  values, walltime = utils.extract_event_log_values(
      os.path.join(TEST_DATA, 'event_log_3m/events.out.tfevents.1599310762'),
      'AverageReturn',
      1000000,
      start_step=10000)
  # Only records from the start step through 1M are examined = 100 records.
  self.assertLen(values, 100)
  # Wall time is less than if counting started at step 0.
  self.assertAlmostEqual(walltime, 366.92087, places=4)
def test_extract_value_1m_only(self):
  """Tests extracting data from the first 1M steps in the event log."""
  values, walltime = utils.extract_event_log_values(
      os.path.join(TEST_DATA, 'event_log_3m/events.out.tfevents.1599310762'),
      'AverageReturn',
      1000000)
  # Only records in the 0-1M step range are examined = 101 records.
  self.assertLen(values, 101)
  self.assertAlmostEqual(walltime, 370.61673, places=4)
  # Verifies event value at 1M.
  self.assertAlmostEqual(values[1000000], 3791.88696, places=4)
def test_extract_value(self):
  """Tests extracting data from all steps in the event log."""
  values, walltime = utils.extract_event_log_values(
      os.path.join(TEST_DATA, 'event_log_3m/events.out.tfevents.1599310762'),
      'AverageReturn')
  # All records in the 0-3M step range are examined = 301 records.
  self.assertLen(values, 301)
  self.assertAlmostEqual(walltime, 1152.09573, places=4)
  # Verifies event value at 3M.
  self.assertAlmostEqual(values[3000000], 5950.31835, places=4)
def get_metric_values(
    root_dir: str,
    experiment_phase: str,
    tag_name: str,
    experiment_dirs: Optional[List[str]] = None,
    return_time: bool = False,
) -> Dict[str, Dict[int, np.generic]]:
    """
    When running an experiment through the experiment harness, tensorboard event files
    are created with metrics recorded during the experiment. Each experiment creates a
    separate directory under the `root_dir`. Metrics gathered during each phase of the
    experiment (e.g. training and policy evaluation) are stored in separate
    subdirectories of the experiment directory:

        `root_dir` / experiment id / `experiment_phase`

    This function collects the values for a named metric gathered in a particular
    phase, from all of the listed experiments.

    :param root_dir: The root directory used by the experiment harness.
    :param experiment_phase: The phase of the experiment in which the trace was
        recorded.
    :param tag_name: The "tag" of the metric (usually defined by the metric object).
    :param experiment_dirs: A list of experiment ids which have been recorded in the
        root directory. The default behaviour is to collect the specified metric from
        all of the experiments in the root directory. This argument can be used to
        specify a subset of these, if desired.
    :param return_time: If set to `True`, a dictionary of wallclock times for
        `tag_name` events is returned instead of metric values.
    :return: A dictionary mapping experiment ids to either metric values or wallclock
        times.
    """
    if not experiment_dirs:
        experiment_dirs_search_pattern = os.path.join(root_dir, "*/")
        experiment_dirs_full_path = tf.io.gfile.glob(experiment_dirs_search_pattern)
        experiment_dirs = [Path(full_path).name for full_path in experiment_dirs_full_path]

    all_values = {}
    for experiment_dir in experiment_dirs:
        summary_dir = os.path.join(root_dir, experiment_dir, experiment_phase)

        if not os.path.isdir(summary_dir):
            warn(f"{summary_dir} does not exist.")
            continue

        event_file = find_event_log(summary_dir)

        if return_time:
            # We use TF-Agents' extract_event_log_values to extract wallclock time.
            metric_values = extract_event_log_values(event_file, tag_name)[1]
        else:
            # We use the internal simplified version for metric values.
            metric_values = _extract_event_log_values(event_file, tag_name)
        all_values[experiment_dir] = metric_values

    return all_values
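# A minimal usage sketch for get_metric_values. The root directory, phase name and
# tag below are illustrative assumptions, not part of the harness API: it assumes
# runs were written under "/tmp/experiments/<experiment id>/eval" and that an
# "AverageReturn" tag was logged during that phase.
returns_by_experiment = get_metric_values(
    root_dir="/tmp/experiments",
    experiment_phase="eval",
    tag_name="AverageReturn",
)
for experiment_id, values in returns_by_experiment.items():
    # `values` maps recorded step -> metric value for that experiment.
    final_step = max(values)
    print(f"{experiment_id}: AverageReturn at step {final_step} = {values[final_step]}")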
def test_serialise_config_operational_config_tensorboard_events_file(
    experiment_setup, dummy_gin_global_config
):
    experiment_harness, _ = experiment_setup
    base_dir = experiment_harness.define_base_experiment_directory()

    experiment_harness.serialise_config(base_dir)

    event_file = find_event_log(base_dir)
    values = extract_event_log_values(event_file, GIN_CONFIG)
    assert "test_arg" in str(values[0][0])
def test_serialise_config_empty_operational_config_tensorboard_events_file(experiment_setup):
    experiment_harness, _ = experiment_setup
    base_dir = experiment_harness.define_base_experiment_directory()

    experiment_harness.serialise_config(base_dir)

    assert not gin.operative_config_str()
    event_file = find_event_log(base_dir)
    values = extract_event_log_values(event_file, GIN_CONFIG)
    assert not values[0][0]
def _gather_data(self) -> Tuple[List[Dict[int, np.generic]], List[float]]:
  """Gather data from all of the logs and add to the data_collector list.

  Returns:
    Tuple of arrays indexed by log file, e.g. data_collector[0] is all of the
    values found in the event log for the given event and walltimes[0] is the
    total time in minutes it took to get to the end_step in that event log.
  """
  data_collector, walltimes = [], []
  for eventlog_dir in self.eventlog_dirs:
    event_file = utils.find_event_log(eventlog_dir)
    logging.info('Processing event file: %s', event_file)
    data, total_time = utils.extract_event_log_values(event_file,
                                                      self.event_tag,
                                                      self.end_step)
    walltimes.append(total_time)
    data_collector.append(data)
  return data_collector, walltimes
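# A hedged sketch of how the output of _gather_data might be combined across logs.
# It assumes each dict in `data_collector` is keyed by step and that all logs share
# the same steps; `collector` stands in for a hypothetical instance of the class
# that defines _gather_data, and numpy is assumed to be imported as np.
data_collector, walltimes = collector._gather_data()
steps = sorted(data_collector[0])
mean_by_step = {
    step: float(np.mean([run[step] for run in data_collector])) for step in steps
}
mean_walltime = float(np.mean(walltimes))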
def run_benchmark(self, training_env, expected_min, expected_max):
  """Run benchmark for a given environment.

  In order to execute ~1M environment steps to match the paper, we run 489
  iterations (num_iterations=489) which results in 1,001,472 environment
  steps. Each iteration results in 320 training steps and 2,048 environment
  steps. Thus 489 * 2,048 = 1,001,472 environment steps and
  489 * 320 = 156,480 training steps.

  Args:
    training_env: Name of environment to test.
    expected_min: The min expected return value.
    expected_max: The max expected return value.
  """
  self.setUp()
  output_dir = self._get_test_output_dir('training_env')
  start_time_sec = time.time()
  bindings = [
      'schulman17.train_eval_lib.train_eval.env_name = "{}"'.format(
          training_env),
      'schulman17.train_eval_lib.train_eval.eval_episodes = 100'
  ]
  gin.parse_config(bindings)
  ppo_clip_train_eval.ppo_clip_train_eval(
      output_dir, eval_interval=10000, num_iterations=489)
  wall_time_sec = time.time() - start_time_sec
  event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
  values, _ = utils.extract_event_log_values(
      event_file, 'Metrics/AverageReturn/EnvironmentSteps')
  metric_1m = self.build_metric(
      'average_return_at_env_step1000000',
      values[1001472],
      min_value=expected_min,
      max_value=expected_max)
  self.report_benchmark(
      wall_time=wall_time_sec, metrics=[metric_1m], extras={})
  self._tearDown()
def benchmark_halfcheetah_medium_v0(self):
  """Benchmarks MuJoCo halfcheetah-medium-v0 to 500k gradient updates."""
  self.setUp()
  output_dir = self._get_test_output_dir('halfcheetah_medium_v0_02_eval')
  dataset_path = self.root_data_dir
  start_time_sec = time.time()
  gin.parse_config_file(
      'tf_agents/examples/cql_sac/kumar20/configs/mujoco_medium.gin')
  cql_sac_train_eval.train_eval(
      dataset_path=dataset_path,
      root_dir=output_dir,
      env_name='halfcheetah-medium-v0',
      num_gradient_updates=500000,  # Number of iterations.
      learner_iterations_per_call=500,
      data_shuffle_buffer_size=10000,
      data_num_shards=50,
      data_parallel_reads=500,
      data_prefetch=1000000,
      eval_interval=10000)
  wall_time_sec = time.time() - start_time_sec
  event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
  values, _ = utils.extract_event_log_values(
      event_file, 'Metrics/AverageReturn', start_step=10000)
  # Min/Max ranges are very large to only hard fail if very broken. The system
  # monitoring the results owns looking for anomalies. These numbers are based
  # on the results that we were getting in MLCompass as of 04-NOV-2021.
  # Results at 500k steps and 1M steps are similar enough to not make it worth
  # running 1M.
  metric_500k = self.build_metric(
      'average_return_at_env_step500000',
      values[500000],
      min_value=4400,
      max_value=5400)
  self.report_benchmark(
      wall_time=wall_time_sec, metrics=[metric_500k], extras={})