def provincial_eval(trainer, eval_workers):
    """Evaluates the performance of the domray model by playing it against
    Provincial using preset buy menus.

    Args:
        trainer (Trainer): trainer class to evaluate.
        eval_workers (WorkerSet): evaluation workers.

    Returns:
        metrics (dict): evaluation metrics dict
    """
    global eval_metrics
    for i in range(num_episodes_per_scenario):
        ray.get([w.sample.remote() for w in eval_workers.remote_workers()])
        # for worker in eval_workers.remote_workers():
        #     worker.foreach_env.remote(lambda env: env.debug())
    episodes, _ = collect_episodes(
        remote_workers=eval_workers.remote_workers(), timeout_seconds=600)
    metrics = summarize_episodes(episodes)
    eval_metrics.append(metrics)
    return metrics
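A minimal companion sketch of the module-level pieces `provincial_eval` relies on; the import path reflects older RLlib releases and the value of `num_episodes_per_scenario` is illustrative, neither is taken from the snippet above.

# Sketch only: assumed imports and globals for provincial_eval above.
# The import path matches older RLlib layouts; the episode count is illustrative.
import ray
from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes

num_episodes_per_scenario = 10  # illustrative; set to the real scenario count
eval_metrics = []               # provincial_eval appends one metrics dict per call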
def collect_metrics(self, timeout_seconds, min_history=100,
                    selected_evaluators=None):
    """Returns evaluator and optimizer stats.

    Arguments:
        timeout_seconds (int): Max wait time for an evaluator before
            dropping its results. This usually indicates a hung evaluator.
        min_history (int): Min history length to smooth results over.
        selected_evaluators (list): Override the list of remote evaluators
            to collect metrics from.

    Returns:
        res (dict): A training result dict from evaluator metrics with
            `info` replaced with stats from self.
    """
    episodes, num_dropped = collect_episodes(
        self.local_evaluator,
        selected_evaluators or self.remote_evaluators,
        timeout_seconds=timeout_seconds)
    orig_episodes = list(episodes)
    missing = min_history - len(episodes)
    if missing > 0:
        episodes.extend(self.episode_history[-missing:])
        assert len(episodes) <= min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-min_history:]
    res = summarize_episodes(episodes, orig_episodes, num_dropped)
    res.update(info=self.stats())
    return res
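A self-contained toy illustration of the `min_history` smoothing used above and in several variants below; plain integers stand in for episode objects, so this is a sketch of the bookkeeping only, not RLlib code.

# Toy values stand in for RolloutMetrics/episode objects.
min_history = 5
episode_history = [1, 2, 3, 4]   # episodes kept from earlier calls
episodes = [5, 6]                # fresh episodes from this call

orig_episodes = list(episodes)
missing = min_history - len(episodes)
if missing > 0:
    # Pad the fresh batch with the most recent history before summarizing.
    episodes.extend(episode_history[-missing:])
    assert len(episodes) <= min_history
episode_history.extend(orig_episodes)
episode_history = episode_history[-min_history:]  # rolling window for next call

print(episodes)         # [5, 6, 2, 3, 4]  -> what gets summarized
print(episode_history)  # [2, 3, 4, 5, 6]  -> history carried to the next call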
def collect_metrics(self):
    assert self.episodes
    metrics = {
        k: summarize_episodes(v, v, 0)
        for k, v in self.episodes.items()
    }
    return metrics
def __call__(self, _):
    # Collect worker metrics.
    episodes, self.to_be_collected = collect_episodes(
        self.workers.local_worker(),
        self.workers.remote_workers(),
        self.to_be_collected,
        timeout_seconds=self.timeout_seconds)
    orig_episodes = list(episodes)
    missing = self.min_history - len(episodes)
    if missing > 0:
        episodes.extend(self.episode_history[-missing:])
        assert len(episodes) <= self.min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-self.min_history:]
    res = summarize_episodes(episodes, orig_episodes)

    # Add in iterator metrics.
    metrics = LocalIterator.get_metrics()
    timers = {}
    counters = {}
    info = {}
    info.update(metrics.info)
    for k, counter in metrics.counters.items():
        counters[k] = counter
    for k, timer in metrics.timers.items():
        timers["{}_time_ms".format(k)] = round(timer.mean * 1000, 3)
        if timer.has_units_processed():
            timers["{}_throughput".format(k)] = round(
                timer.mean_throughput, 3)
    res.update({
        "num_healthy_workers": len(self.workers.remote_workers()),
        "timesteps_total": metrics.counters[STEPS_SAMPLED_COUNTER],
    })
    res["timers"] = timers
    res["info"] = info
    res["info"].update(counters)

    relevant = [
        "info", "custom_metrics", "sampler_perf", "timesteps_total",
        "policy_reward_mean", "episode_len_mean"
    ]
    d = {k: res[k] for k in relevant}
    d["evaluation"] = res.get("evaluation", {})

    if self.log_to_neptune:
        metrics_to_be_logged = ["info", "evaluation"]

        def log_metric(metrics, base_string=''):
            if isinstance(metrics, dict):
                for k in metrics:
                    log_metric(metrics[k], base_string + '{}_'.format(k))
            else:
                neptune.log_metric(base_string, metrics)

        for k in d:
            if k in metrics_to_be_logged:
                log_metric(d[k], base_string='{}_'.format(k))

    return d
def __call__(self, _: Any) -> Dict:
    # Collect worker metrics.
    episodes, self.to_be_collected = collect_episodes(
        self.workers.local_worker(),
        self.selected_workers or self.workers.remote_workers(),
        self.to_be_collected,
        timeout_seconds=self.timeout_seconds,
    )
    orig_episodes = list(episodes)
    missing = self.min_history - len(episodes)
    if missing > 0:
        episodes = self.episode_history[-missing:] + episodes
        assert len(episodes) <= self.min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-self.min_history :]
    res = summarize_episodes(episodes, orig_episodes, self.keep_custom_metrics)

    # Add in iterator metrics.
    metrics = _get_shared_metrics()
    custom_metrics_from_info = metrics.info.pop("custom_metrics", {})
    timers = {}
    counters = {}
    info = {}
    info.update(metrics.info)
    for k, counter in metrics.counters.items():
        counters[k] = counter
    for k, timer in metrics.timers.items():
        timers["{}_time_ms".format(k)] = round(timer.mean * 1000, 3)
        if timer.has_units_processed():
            timers["{}_throughput".format(k)] = round(timer.mean_throughput, 3)
    res.update(
        {
            "num_healthy_workers": len(self.workers.remote_workers()),
            "timesteps_total": (
                metrics.counters[STEPS_TRAINED_COUNTER]
                if self.by_steps_trained
                else metrics.counters[STEPS_SAMPLED_COUNTER]
            ),
            # tune.Trainable uses timesteps_this_iter for tracking
            # total timesteps.
            "timesteps_this_iter": metrics.counters[
                STEPS_TRAINED_THIS_ITER_COUNTER
            ],
            "agent_timesteps_total": metrics.counters.get(
                AGENT_STEPS_SAMPLED_COUNTER, 0
            ),
        }
    )
    res["timers"] = timers
    res["info"] = info
    res["info"].update(counters)
    res["custom_metrics"] = res.get("custom_metrics", {})
    res["episode_media"] = res.get("episode_media", {})
    res["custom_metrics"].update(custom_metrics_from_info)
    return res
def __call__(self, _: Any) -> Dict:
    # Collect worker metrics.
    episodes, self.to_be_collected = collect_episodes(
        self.workers.local_worker(),
        self.selected_workers or self.workers.remote_workers(),
        self.to_be_collected,
        timeout_seconds=self.timeout_seconds)
    orig_episodes = list(episodes)
    missing = self.min_history - len(episodes)
    if missing > 0:
        episodes = self.episode_history[-missing:] + episodes
        assert len(episodes) <= self.min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-self.min_history:]
    res = summarize_episodes(episodes, orig_episodes)

    # Add in iterator metrics.
    metrics = _get_shared_metrics()
    custom_metrics_from_info = metrics.info.pop("custom_metrics", {})
    timers = {}
    counters = {}
    info = {}
    info.update(metrics.info)
    for k, counter in metrics.counters.items():
        counters[k] = counter
    for k, timer in metrics.timers.items():
        timers["{}_time_ms".format(k)] = round(timer.mean * 1000, 3)
        if timer.has_units_processed():
            timers["{}_throughput".format(k)] = round(
                timer.mean_throughput, 3)
            throughput = timer.mean_throughput
            with Log.timer(log=True, logger=self.logger,
                           info="THROUGHPUT") as logging_metrics:
                logging_metrics.append(throughput)
    res.update({
        "num_healthy_workers": len(self.workers.remote_workers()),
        "timesteps_total": metrics.counters[STEPS_SAMPLED_COUNTER],
        "agent_timesteps_total": metrics.counters.get(
            AGENT_STEPS_SAMPLED_COUNTER, 0),
    })
    res["timers"] = timers
    res["info"] = info
    res["info"].update(counters)
    res["custom_metrics"] = res.get("custom_metrics", {})
    res["episode_media"] = res.get("episode_media", {})
    res["custom_metrics"].update(custom_metrics_from_info)
    return res
def __call__(self, info):
    episodes, self.to_be_collected = collect_episodes(
        self.workers.local_worker(),
        self.workers.remote_workers(),
        self.to_be_collected,
        timeout_seconds=self.timeout_seconds)
    orig_episodes = list(episodes)
    missing = self.min_history - len(episodes)
    if missing > 0:
        episodes.extend(self.episode_history[-missing:])
        assert len(episodes) <= self.min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-self.min_history:]
    res = summarize_episodes(episodes, orig_episodes)
    res.update(info=info)
    return res
def collect_metrics(self):
    dist_episodes = ray.get([
        e.apply.remote(lambda ev: ev.episodes)
        for e in self.remote_evaluators
    ])
    aggregated_episodes = defaultdict(list)
    for episodes in dist_episodes:
        for k, v in episodes.items():
            aggregated_episodes[k].extend(v)
    aggregated_episodes = dict(aggregated_episodes)
    res = {
        k: summarize_episodes(v, v, 0)
        for k, v in aggregated_episodes.items()
    }
    return {"inner_update_metrics": res}
def __call__(self, _):
    # Collect worker metrics.
    episodes, self.to_be_collected = collect_episodes(
        self.workers.local_worker(),
        self.workers.remote_workers(),
        self.to_be_collected,
        timeout_seconds=self.timeout_seconds)
    orig_episodes = list(episodes)
    missing = self.min_history - len(episodes)
    if missing > 0:
        episodes.extend(self.episode_history[-missing:])
        assert len(episodes) <= self.min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-self.min_history:]
    res = summarize_episodes(episodes, orig_episodes)

    # Add in iterator metrics.
    metrics = LocalIterator.get_metrics()
    if metrics.parent_metrics:
        print("TODO: support nested metrics better")
    all_metrics = [metrics] + metrics.parent_metrics
    timers = {}
    counters = {}
    info = {}
    for metrics in all_metrics:
        info.update(metrics.info)
        for k, counter in metrics.counters.items():
            counters[k] = counter
        for k, timer in metrics.timers.items():
            timers["{}_time_ms".format(k)] = round(timer.mean * 1000, 3)
            if timer.has_units_processed():
                timers["{}_throughput".format(k)] = round(
                    timer.mean_throughput, 3)
    res.update({
        "num_healthy_workers": len(self.workers.remote_workers()),
        "timesteps_total": metrics.counters[STEPS_SAMPLED_COUNTER],
    })
    res["timers"] = timers
    res["info"] = info
    res["info"].update(counters)
    return res
def collect_metrics(self, timeout_seconds, min_history=100,
                    selected_workers=None):
    """Returns worker and optimizer stats.

    Arguments:
        timeout_seconds (int): Max wait time for a worker before
            dropping its results. This usually indicates a hung worker.
        min_history (int): Min history length to smooth results over.
        selected_workers (list): Override the list of remote workers
            to collect metrics from.

    Returns:
        res (dict): A training result dict from worker metrics with
            `info` replaced with stats from self.
    """
    return_stats = {}
    episode_storage = {}
    for ws_id, workers in self.workers.items():
        episodes, self.to_be_collected[ws_id] = collect_episodes(
            workers.local_worker(),
            selected_workers or workers.remote_workers(),
            self.to_be_collected[ws_id],
            timeout_seconds=timeout_seconds)
        orig_episodes = list(episodes)
        missing = min_history - len(episodes)
        if missing > 0:
            episodes.extend(self.episode_history[ws_id][-missing:])
            assert len(episodes) <= min_history
        self.episode_history[ws_id].extend(orig_episodes)
        self.episode_history[ws_id] = self.episode_history[ws_id][
            -min_history:]
        episode_storage[ws_id] = episodes
        res = summarize_episodes(episodes, orig_episodes)
        return_stats[ws_id] = res

    return_stats = parse_stats(return_stats, episode_storage)
    return_stats.update(info=self.stats())
    return_stats["info"]["learner_queue"].pop("size_quantiles")
    return return_stats
def custom_eval_function(trainer, eval_workers):
    """Example of a custom evaluation function.

    Arguments:
        trainer (Trainer): trainer class to evaluate.
        eval_workers (WorkerSet): evaluation workers.

    Returns:
        metrics (dict): evaluation metrics dict.
    """

    # We configured 2 eval workers in the training config.
    worker_1, worker_2 = eval_workers.remote_workers()

    # Set different env settings for each worker. Here we use a fixed config,
    # which also could have been computed in each worker by looking at
    # env_config.worker_index (printed in SimpleCorridor class above).
    worker_1.foreach_env.remote(lambda env: env.set_corridor_length(4))
    worker_2.foreach_env.remote(lambda env: env.set_corridor_length(7))

    for i in range(5):
        print("Custom evaluation round", i)
        # Calling .sample() runs exactly one episode per worker due to how the
        # eval workers are configured.
        ray.get([w.sample.remote() for w in eval_workers.remote_workers()])

    # Collect the accumulated episodes on the workers, and then summarize the
    # episode stats into a metrics dict.
    episodes, _ = collect_episodes(
        remote_workers=eval_workers.remote_workers(), timeout_seconds=99999)
    # You can compute metrics from the episodes manually, or use the
    # convenient `summarize_episodes()` utility:
    metrics = summarize_episodes(episodes)
    # Note that the above two statements are the equivalent of:
    # metrics = collect_metrics(eval_workers.local_worker(),
    #                           eval_workers.remote_workers())

    # You can also put custom values in the metrics dict.
    metrics["foo"] = 1

    return metrics
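A hedged sketch of how a custom evaluation function like the one above is typically registered, assuming an older RLlib Trainer config schema where `custom_eval_function`, `evaluation_interval`, `evaluation_num_workers`, and `evaluation_config` are top-level keys, and assuming the `SimpleCorridor` env mentioned in the comments is defined or imported elsewhere in the file.

# Sketch only: registration of custom_eval_function, assuming an older
# RLlib Trainer config schema and that SimpleCorridor is defined/imported.
import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        "PG",
        stop={"training_iteration": 10},
        config={
            "env": SimpleCorridor,  # assumed to be defined above in the file
            "env_config": {"corridor_length": 10},
            # Two dedicated eval workers, because custom_eval_function above
            # unpacks exactly two remote workers.
            "evaluation_num_workers": 2,
            "evaluation_interval": 1,
            "custom_eval_function": custom_eval_function,
            "evaluation_config": {
                "env_config": {"corridor_length": 5},
            },
        },
    )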
def collect_metrics(self, min_history=100):
    """Returns evaluator and optimizer stats.

    Arguments:
        min_history (int): Min history length to smooth results over.

    Returns:
        res (dict): A training result dict from evaluator metrics with
            `info` replaced with stats from self.
    """
    episodes = collect_episodes(self.local_evaluator,
                                self.remote_evaluators)
    orig_episodes = list(episodes)
    missing = min_history - len(episodes)
    if missing > 0:
        episodes.extend(self.episode_history[-missing:])
        assert len(episodes) <= min_history
    self.episode_history.extend(orig_episodes)
    self.episode_history = self.episode_history[-min_history:]
    res = summarize_episodes(episodes)
    res.update(info=self.stats())
    return res