def test_malformed_lookup():
    try:
        # The curly quotes are intentional: they make the ID malformed.
        spec("“Breakout-v0”")
    except error.Error as e:
        assert "Malformed environment ID" in f"{e}", f"Unexpected message: {e}"
    else:
        assert False
def test_default_lookups():
    register("test/Test3")
    with pytest.raises(error.DeprecatedEnv):
        spec("test/Test3-v0")

    # Lookup default
    spec("test/Test3")
def test_versioned_lookups():
    register("test/Test2-v5")

    with pytest.raises(error.VersionNotFound):
        spec("test/Test2-v9")

    with pytest.raises(error.DeprecatedEnv):
        spec("test/Test2-v4")

    assert spec("test/Test2-v5")
def test_missing_lookup():
    register(id="Test1-v0", entry_point=None)
    register(id="Test1-v15", entry_point=None)
    register(id="Test1-v9", entry_point=None)
    register(id="Other1-v100", entry_point=None)

    with pytest.raises(error.DeprecatedEnv):
        spec("Test1-v1")

    try:
        spec("Test1-v1000")
    except error.UnregisteredEnv:
        pass
    else:
        assert False

    try:
        spec("Unknown1-v1")
    except error.UnregisteredEnv:
        pass
    else:
        assert False
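# The lookups above encode the registry's resolution rules: requesting an
# older version than the latest registered raises DeprecatedEnv, a version
# newer than any registered raises VersionNotFound, and an unknown name
# raises a plain UnregisteredEnv. In gym's error hierarchy VersionNotFound
# subclasses UnregisteredEnv, which is why the except clauses above catch
# both cases. A minimal sketch of that rule, assuming a hypothetical
# {name: set_of_versions} registry (the helper name is illustrative, not
# the library's actual implementation):
def _resolve_version(registry, name, version):
    if name not in registry:
        raise error.UnregisteredEnv("No registered env with name: {}".format(name))
    versions = registry[name]
    if version in versions:
        return version
    if any(v > version for v in versions):
        # A newer version exists, so the requested one is deprecated.
        raise error.DeprecatedEnv(
            "{}-v{} is deprecated; the latest version is v{}".format(
                name, version, max(versions)))
    raise error.VersionNotFound("{}-v{} not found".format(name, version))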
def test_spec():
    spec = envs.spec("CartPole-v0")
    assert spec.id == "CartPole-v0"
def score_evaluation(self, benchmark, env_id, data_sources,
                     initial_reset_timestamps, episode_lengths,
                     episode_rewards, episode_types, timestamps):
    tasks = benchmark.task_specs(env_id)
    spec = envs.spec(env_id)

    #### 0. Compute timing stats

    if len(initial_reset_timestamps) > 0:
        initial_reset_timestamp = min(initial_reset_timestamps)
    else:
        initial_reset_timestamp = 0

    # How long each episode actually took
    timestamps = np.array(timestamps)
    durations = _compute_episode_durations(initial_reset_timestamps, data_sources, timestamps)

    #### Grab the data corresponding to each of evaluation/training
    lengths = np.array(episode_lengths)
    rewards = np.array(episode_rewards)

    #### Calculate the total elapsed time (in various units)
    #### for each episode

    # How many training timesteps have elapsed by the end of each
    # episode. Not to be confused with Unix timestamps.
    elapsed_timesteps = np.cumsum(lengths)
    # Total number of seconds elapsed by the end of each
    # episode. Note that with n parallel workers each running for
    # m seconds, we want to count the total time as n * m.
    elapsed_seconds = np.cumsum(durations)

    # List of scores for each task
    scores = []
    # List of lists of solved episodes for each task
    solves = []
    # List of lists of episode rewards for each task
    rewards = []
    _timestamps = []
    elapsed_times = []
    for task in tasks:
        # Find the first episode where we're over the allotted
        # training timesteps.
        cutoff_idx = _find_cutoffs_for_task(task, elapsed_timesteps, elapsed_seconds)
        if not np.isfinite(cutoff_idx):
            # All episodes are fair game
            cutoff_idx = len(lengths)

        reward = np.array(episode_rewards)[:cutoff_idx]
        score, solved = self.score_and_solved_func(task, reward, elapsed_seconds[:cutoff_idx])

        scores.append(score)
        solves.append(solved)
        rewards.append(reward)

        if np.any(timestamps[:cutoff_idx]):
            last_timestamp = timestamps[cutoff_idx - 1]
            elapsed_time = elapsed_seconds[cutoff_idx - 1]
        else:
            # If we don't have any valid episodes, then the
            # last valid timestamp is when we started.
            last_timestamp = initial_reset_timestamp
            elapsed_time = 0.0

        # Record the timestamp of the last episode
        _timestamps.append(last_timestamp)
        elapsed_times.append(elapsed_time)

    return {
        'rewards': rewards,
        'scores': scores,
        'solves': solves,
        'timestamps': _timestamps,
        'elapsed_times': elapsed_times,
        'initial_reset_timestamp': initial_reset_timestamp,
    }
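# score_evaluation above delegates to two helpers that are not shown in this
# section. The sketches below are reconstructed from the inline logic of the
# other score_evaluation variants in this file; the real implementations may
# differ in detail.
def _compute_episode_durations(initial_reset_timestamps, data_sources, timestamps):
    # How long each episode actually took, attributed per data source
    # (i.e. worker thread).
    durations = np.zeros(len(timestamps))
    data_sources = np.array(data_sources)
    for source, initial_ts in enumerate(initial_reset_timestamps):
        (source_indexes,) = np.where(data_sources == source)
        if len(source_indexes) == 0:
            continue
        # Within one source, a duration is the difference between adjoining
        # timestamps; the first episode is measured from the initial reset.
        durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
        durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]
    return durations


def _find_cutoffs_for_task(task, elapsed_timesteps, elapsed_seconds):
    # Index of the first episode past the task's timestep or wall-clock
    # budget; np.inf when neither budget is ever exceeded.
    cutoff_idx = np.inf
    if task.max_timesteps:
        (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
        if len(timestep_cutoff) > 0:
            cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
    if task.max_seconds:
        (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
        if len(seconds_cutoff) > 0:
            cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
    return cutoff_idx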
def generate_yaml(
        exp_names,
        output_path,
        # number=None
):
    # The yaml file should reflect complete information of the experiment,
    # so we do not allow `number` as an argument.

    # Get the trial_name-json_path dict.
    assert spec(args.env_name)  # make sure there is no typo in env_name
    trial_json_dict = {}
    if isinstance(exp_names, str):
        exp_names = [exp_names]
    for exp_name in exp_names:
        trial_json_dict.update(get_trial_json_dict(exp_name))

    # Get the trial_name-trial_data dict. This is not ordered.
    trial_data_dict = get_trial_data_dict(trial_json_dict)

    K = 3
    trial_performance_list = []
    for i, (trial_name, data) in enumerate(trial_data_dict.items()):
        avg = data[PERFORMANCE_METRIC].tail(K).mean()
        if np.isnan(avg):
            avg = float("-inf")
            print("Avg: ", avg, np.isnan(avg))
        trial_performance_list.append([trial_name, avg])
    # print("Collected trial_performance_list: ", trial_performance_list)
    sorted_trial_pfm_list = sorted(trial_performance_list, key=lambda pair: pair[1])

    def get_video_name(trial_name, performance):
        # trial_name: PPO_BipedalWalker-v2_38_seed=138
        # result: "PPO seed=139 rew=249.01"
        components = trial_name.split("_")
        try:
            ret = "{0} {3} rew={4:.2f}".format(*components, performance)
        except (IndexError, ValueError):
            # Fall back to a plain join when the trial name does not have
            # the expected four components or performance is not a number.
            strs = components + [performance]
            strs = [str(s) for s in strs]
            ret = ",".join(strs)
        return ret

    # Return: [{"name": NAME, "path": CKPT_PATH, ...}, {...}, ...]
    results = []
    for (trial_name, performance) in sorted_trial_pfm_list:
        json_path = trial_json_dict[trial_name]
        trial_path = os.path.dirname(json_path)
        ckpt = get_latest_checkpoint(trial_path)
        if ckpt is None:
            continue
        run_name = trial_name.split("_")[0]
        env_name = trial_name.split("_")[1]
        cool_name = get_video_name(trial_name, performance)
        results.append({
            "name": cool_name,
            "path": ckpt["path"],
            "performance": float(performance),
            "run_name": run_name,
            "env_name": env_name,
            "iter": ckpt["iter"]
        })
    save_yaml(results, output_path)
    print("Successfully collected YAML file containing {} checkpoints.".format(
        len(results)))
    # if rollout:
    #     pass
    # several_agent_rollout(output_path, num_rollouts, seed)
    return results
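# Minimal usage sketch for the collector above (the experiment name and
# output path are hypothetical; generate_yaml also relies on module-level
# helpers such as get_trial_json_dict, get_latest_checkpoint, and save_yaml,
# and on a parsed `args` carrying env_name):
#
#   results = generate_yaml(
#       exp_names="0915_ppo_bipedal",
#       output_path="data/ppo_checkpoints.yaml",
#   )
#   # -> [{"name": ..., "path": ..., "performance": ..., "run_name": ...,
#   #      "env_name": ..., "iter": ...}, ...], sorted by ascending performance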
def score_evaluation(self, benchmark, env_id, data_sources,
                     initial_reset_timestamps, episode_lengths,
                     episode_rewards, episode_types, timestamps):
    # TODO: refactor code shared with the clip scoring rule above
    tasks = benchmark.task_specs(env_id)
    spec = envs.spec(env_id)

    #### 0. Compute timing stats

    if len(initial_reset_timestamps) > 0:
        initial_reset_timestamp = min(initial_reset_timestamps)
    else:
        initial_reset_timestamp = 0

    # How long each episode actually took
    durations = np.zeros(len(timestamps))
    data_sources = np.array(data_sources)
    timestamps = np.array(timestamps)
    for source, initial_ts in enumerate(initial_reset_timestamps):
        (source_indexes,) = np.where(data_sources == source)
        if len(source_indexes) == 0:
            # Skip sources that contributed no episodes (avoids indexing
            # into an empty array below).
            continue
        # Once we know the indexes corresponding to a particular
        # source (i.e. worker thread), we can just subtract
        # adjoining values
        durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
        durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

    #### Grab the data corresponding to each of evaluation/training
    lengths = np.array(episode_lengths)
    rewards = np.array(episode_rewards)
    durations = np.array(durations)

    #### Calculate the total elapsed time (in various units)
    #### for each episode

    # How many training timesteps have elapsed by the end of each
    # episode. Not to be confused with Unix timestamps.
    elapsed_timesteps = np.cumsum(lengths)
    # Total number of seconds elapsed by the end of each
    # episode. Note that with n parallel workers each running for
    # m seconds, we want to count the total time as n * m.
    elapsed_seconds = np.cumsum(durations)

    scores = []
    solves = []
    rewards = []
    _timestamps = []
    elapsed_times = []
    for task in tasks:
        # Find the first episode where we're over the allotted
        # training timesteps.
        cutoff_idx = np.inf
        if task.max_timesteps:
            # this looks a little funny, but we want the first idx greater
            # than the cutoff
            (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
            if len(timestep_cutoff) > 0:
                cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
        if task.max_seconds:
            (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
            if len(seconds_cutoff) > 0:
                cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
        if not np.isfinite(cutoff_idx):
            # All episodes are fair game
            cutoff_idx = len(lengths)

        reward = np.array(episode_rewards)[:cutoff_idx]

        floor = task.reward_floor
        ceiling = task.reward_ceiling

        solved = reward >= ceiling
        # Sum raw rewards, linearly rescale to between 0 and 1
        score = np.clip((np.mean(reward) - floor) / (ceiling - floor), 0, 1)

        # Take the mean rescaled score
        scores.append(score)
        # Record the list of solved episodes
        solves.append(solved)
        # Record the list of rewards
        rewards.append(reward)

        if np.any(timestamps[:cutoff_idx]):
            last_idx = cutoff_idx - 1
            last_timestamp = timestamps[last_idx]
            elapsed_time = elapsed_seconds[last_idx]
        else:
            # If we don't have any valid episodes, then the
            # last valid timestamp is when we started.
            last_timestamp = initial_reset_timestamp
            elapsed_time = 0.0

        # Record the timestamp of the last episode
        _timestamps.append(last_timestamp)
        elapsed_times.append(elapsed_time)

    return {
        'rewards': rewards,
        'scores': scores,
        'solves': solves,
        'timestamps': _timestamps,
        'elapsed_times': elapsed_times,
        'initial_reset_timestamp': initial_reset_timestamp,
    }
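# Hedged smoke test for the scorer above. _Task and _Benchmark are
# hypothetical stand-ins for gym's benchmark spec objects, and `scorer`
# stands in for an instance of whatever scoring-rule class this method
# lives on (the class itself is not shown in this section):
class _Task:
    max_timesteps = 1000
    max_seconds = None
    reward_floor = 0.0
    reward_ceiling = 100.0

class _Benchmark:
    def task_specs(self, env_id):
        return [_Task()]

# result = scorer.score_evaluation(
#     _Benchmark(), "CartPole-v0",
#     data_sources=[0, 0, 0], initial_reset_timestamps=[1000.0],
#     episode_lengths=[200, 200, 200], episode_rewards=[10.0, 50.0, 100.0],
#     episode_types=['t', 't', 't'], timestamps=[1005.0, 1010.0, 1015.0])
# result['scores'] -> [~0.53]: mean reward 53.3, rescaled by floor/ceiling.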
def score_evaluation(self, benchmark, env_id, episode_lengths,
                     episode_rewards, episode_types, timestamps,
                     initial_reset_timestamp):
    tasks = benchmark.task_groups[env_id]
    spec = envs.spec(env_id)

    (t_idx,) = np.where([t == 't' for t in episode_types])  # training episodes
    (e_idx,) = np.where([t == 'e' for t in episode_types])  # evaluation episodes
    if len(e_idx) == 0:
        # If no episodes marked for evaluation, consider
        # everything both a training and evaluation episode.
        (t_idx,) = np.where([True for t in episode_types])
        (e_idx,) = np.where([True for t in episode_types])

    training_lengths = np.array(episode_lengths)[t_idx]
    training_rewards = np.array(episode_rewards)[t_idx]

    evaluation_lengths = np.array(episode_lengths)[e_idx]
    evaluation_rewards = np.array(episode_rewards)[e_idx]

    # How many training timesteps have elapsed by the end of each
    # episode. Not to be confused with Unix timestamps.
    elapsed_timesteps = np.cumsum(training_lengths)

    scores = []
    solves = []
    rewards = []
    _timestamps = []
    for task in tasks:
        # Find the first episode where we're over the allotted
        # training timesteps.
        (cutoff,) = np.where(elapsed_timesteps > task.timesteps)
        if len(cutoff) > 0:
            cutoff_idx = cutoff[-1]
            orig_cutoff_idx = t_idx[cutoff_idx]  # cutoff index in the original
            (allowed_e_idx,) = np.where(e_idx < orig_cutoff_idx)  # restrict to earlier episodes
        else:
            # All episodes are fair game
            allowed_e_idx = e_idx

        if len(allowed_e_idx) > 0:
            last_timestamp = timestamps[allowed_e_idx[-1]]
        else:
            # If we don't have any evaluation episodes, then the
            # last valid timestamp is when we started.
            last_timestamp = initial_reset_timestamp

        # Grab the last num_episodes evaluation episodes from
        # before the cutoff (at which point we've gathered too
        # much experience).
        #
        # This probably won't work long-term but is fine for now.
        allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
        reward = allowed_episode_rewards[-self.num_episodes:]

        floor = task.reward_floor
        ceiling = task.reward_ceiling
        if len(reward) < self.num_episodes:
            extra = self.num_episodes - len(reward)
            logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
            reward = np.concatenate([reward, [floor] * extra])

        # Grab the indexes where we reached the ceiling
        solved = reward >= ceiling
        # Linearly rescale rewards to between 0 and 1
        clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)
        # Take the mean rescaled score
        score = np.mean(clipped)

        scores.append(score)
        # Record the list of solved episodes
        solves.append(solved)
        # Record the list of rewards
        rewards.append(reward)
        # Record the timestamp of the last episode
        _timestamps.append(last_timestamp)

    return {
        'rewards': rewards,
        'scores': scores,
        'solves': solves,
        'timestamps': _timestamps,
    }
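# Example of the episode_types convention consumed above: 't' marks a
# training episode and 'e' an evaluation episode. For
#   episode_types = ['t', 't', 'e', 't', 'e']
# the selectors give t_idx == [0, 1, 3] and e_idx == [2, 4]; when no 'e'
# entries exist, every episode counts as both training and evaluation.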
def score_evaluation(self, benchmark, env_id, data_sources,
                     initial_reset_timestamps, episode_lengths,
                     episode_rewards, episode_types, timestamps):
    tasks = benchmark.task_specs(env_id)
    spec = envs.spec(env_id)

    #### 0. Compute timing stats

    if len(initial_reset_timestamps) > 0:
        initial_reset_timestamp = min(initial_reset_timestamps)
    else:
        initial_reset_timestamp = 0

    # How long each episode actually took
    durations = np.zeros(len(timestamps))
    data_sources = np.array(data_sources)
    timestamps = np.array(timestamps)
    for source, initial_ts in enumerate(initial_reset_timestamps):
        (source_indexes,) = np.where(data_sources == source)
        if len(source_indexes) == 0:
            continue
        # Once we know the indexes corresponding to a particular
        # source (i.e. worker thread), we can just subtract
        # adjoining values
        durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
        durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

    #### 1. Select out which indexes are for evaluation and which are for training

    (t_idx,) = np.where([t == 't' for t in episode_types])  # training episodes
    (e_idx,) = np.where([t == 'e' for t in episode_types])  # evaluation episodes
    if len(e_idx) == 0:
        # If no episodes marked for evaluation, consider
        # everything both a training and evaluation episode.
        (t_idx,) = np.where([True for t in episode_types])
        (e_idx,) = np.where([True for t in episode_types])

    #### 2. Grab the data corresponding to each of evaluation/training

    training_lengths = np.array(episode_lengths)[t_idx]
    training_rewards = np.array(episode_rewards)[t_idx]
    training_durations = np.array(durations)[t_idx]

    evaluation_lengths = np.array(episode_lengths)[e_idx]
    evaluation_rewards = np.array(episode_rewards)[e_idx]
    evaluation_durations = np.array(durations)[e_idx]

    #### 3. Calculate the total elapsed time (in various units)
    #### for each episode

    # How many training timesteps have elapsed by the end of each
    # episode. Not to be confused with Unix timestamps.
    elapsed_timesteps = np.cumsum(training_lengths)
    # Total number of seconds elapsed by the end of each
    # episode. Note that with n parallel workers each running for
    # m seconds, we want to count the total time as n * m.
    elapsed_seconds = np.cumsum(training_durations)

    scores = []
    solves = []
    rewards = []
    _timestamps = []
    elapsed_times = []
    for task in tasks:
        # Find the first episode where we're over the allotted
        # training timesteps.
        cutoff_idx = np.inf
        if task.max_timesteps:
            # this looks a little funny, but we want the first idx greater
            # than the cutoff
            (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
            if len(timestep_cutoff) > 0:
                cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
        if task.max_seconds:
            (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
            if len(seconds_cutoff) > 0:
                cutoff_idx = min(cutoff_idx, seconds_cutoff[0])

        if np.isfinite(cutoff_idx):
            # Cutoff index in the original episode list (i.e. before
            # filtering to training/evaluation)
            orig_cutoff_idx = t_idx[cutoff_idx]
            # Restrict to earlier episodes
            (allowed_e_idx,) = np.where(e_idx < orig_cutoff_idx)
        else:
            # All episodes are fair game
            allowed_e_idx = e_idx

        # Grab the last num_episodes evaluation episodes from
        # before the cutoff (at which point we've gathered too
        # much experience).
        #
        # This probably won't work long-term but is fine for now.
        allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
        reward = allowed_episode_rewards[-self.num_episodes:]

        floor = task.reward_floor
        ceiling = task.reward_ceiling
        if len(reward) < self.num_episodes:
            extra = self.num_episodes - len(reward)
            logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
            reward = np.concatenate([reward, [floor] * extra])

        # Grab the indexes where we reached the ceiling
        solved = reward >= ceiling
        # Linearly rescale rewards to between 0 and 1
        clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)
        # Take the mean rescaled score
        score = np.mean(clipped)

        scores.append(score)
        # Record the list of solved episodes
        solves.append(solved)
        # Record the list of rewards
        rewards.append(reward)

        if len(allowed_e_idx) > 0:
            if not np.isfinite(cutoff_idx):
                cutoff_idx = len(elapsed_seconds) - 1
            last_t_idx = t_idx[cutoff_idx]
            # timestamps is full length
            last_timestamp = timestamps[last_t_idx]
            # elapsed_seconds contains only training episodes
            elapsed_time = elapsed_seconds[cutoff_idx]
        else:
            # If we don't have any evaluation episodes, then the
            # last valid timestamp is when we started.
            last_timestamp = initial_reset_timestamp
            elapsed_time = 0.0

        # Record the timestamp of the last episode
        _timestamps.append(last_timestamp)
        elapsed_times.append(elapsed_time)

    return {
        'rewards': rewards,
        'scores': scores,
        'solves': solves,
        'timestamps': _timestamps,
        'elapsed_times': elapsed_times,
        'initial_reset_timestamp': initial_reset_timestamp,
    }
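# Worked example of the clip-rescale used above: with floor = 100.0 and
# ceiling = 200.0, rewards [50, 150, 250] clip to [0.0, 0.5, 1.0], so the
# task score is np.mean([0.0, 0.5, 1.0]) == 0.5, and only the 250-reward
# episode counts as solved (reward >= ceiling).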
def generate_progress_yaml(exp_names, output_path, number=None):
    # The meaning of number: if None, extract all checkpoints from all
    # trials; if an integer, extract `number` checkpoints for each trial.
    assert (number is None) or (isinstance(number, int))
    assert spec(args.env_name)  # make sure there is no typo in env_name
    trial_json_dict = {}
    if isinstance(exp_names, str):
        exp_names = [exp_names]
    for exp_name in exp_names:
        trial_json_dict.update(get_trial_json_dict(exp_name))

    # Get the trial_name-trial_data dict. This is not ordered.
    trial_data_dict = get_trial_data_dict(trial_json_dict)

    def get_video_name(trial_name, performance, num_iters):
        # trial_name: PPO_BipedalWalker-v2_38_seed=138
        # result: "PPO seed=139 rew=249.01"
        components = trial_name.split("_")
        assert len(components) == 4
        return "{0} {3} rew={4:.2f} iter={5:}" \
            .format(*components, performance, num_iters)

    # Return: [{"name": NAME, "path": CKPT_PATH, ...}, {...}, ...]
    results = []
    for (trial_name, dataframe) in trial_data_dict.items():
        # We assume every iteration has stored a checkpoint,
        # but sometimes checkpoints are only stored at some interval.
        if number is None or number * 2 > len(dataframe):
            data_list = dataframe
        else:
            interval = int(floor(len(dataframe) / number))
            start_index = len(dataframe) % number - 1
            if start_index < 0:
                # A remainder of 0 gives start_index == -1, which would make
                # the slice below empty; None keeps the full reverse stride.
                start_index = None
            data_list = dataframe[:start_index:-interval][::-1]
            assert (len(data_list) == number) or \
                   (len(dataframe) == len(data_list)), len(data_list)
        for _, series in data_list.iterrows():
            # Variables used here:
            #   trial_name: PPO_xx_seed=199
            #   json_path: xxx/xxx/trial/result.json
            #   trial_path: xxx/xxx/trial
            num_iters = series["training_iteration"]
            json_path = trial_json_dict[trial_name]
            trial_path = os.path.dirname(json_path)
            # TODO: verify that the index of this dataframe matches the
            # iteration number.
            ckpt = get_checkpoint(trial_path, num_iters)
            if ckpt is None:
                continue
            run_name = trial_name.split("_")[0]
            env_name = trial_name.split("_")[1]
            performance = series[PERFORMANCE_METRIC]
            cool_name = get_video_name(trial_name, performance, num_iters)
            results.append({
                "name": cool_name,
                "path": ckpt['path'],
                "performance": float(performance),
                "run_name": run_name,
                "env_name": env_name,
                "iter": num_iters
            })
    save_yaml(results, output_path)
    print("Successfully collected YAML file containing {} checkpoints.".format(
        len(results)))
    return results
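# Worked example of the checkpoint-thinning slice above, on a plain-list
# stand-in for the dataframe (hypothetical data; the real code slices a
# pandas DataFrame the same way, positionally):
from math import floor

demo = list(range(10))                     # ten logged iterations
number = 3
interval = int(floor(len(demo) / number))  # 3
start_index = len(demo) % number - 1       # 0
picked = demo[:start_index:-interval][::-1]
assert picked == [3, 6, 9]                 # evenly spaced, ending at the last entry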