def _score(self, y_pred: List[Performance], y_true: List[Performance]) -> pd.DataFrame:
    df_pred = Performance.to_dataframe(y_pred)
    df_true = Performance.to_dataframe(y_true)
    if self.metrics is not None:
        df_pred = df_pred[self.metrics]
        df_true = df_true[self.metrics]

    # We extract the NumPy arrays so that indexing is easier. Each metric is computed such
    # that it results in an array of shape [D] where D is the number of metrics.
    columns = df_pred.columns
    y_pred_min = df_pred.to_numpy()  # type: ignore
    y_true_min = df_true.to_numpy()  # type: ignore

    # Return all results
    metrics = {
        "nrmse": nrmse(y_pred_min, y_true_min),
        "smape": smape(y_pred_min, y_true_min),
        "mrr": mrr(y_pred_min, y_true_min),
        **{
            f"precision_{k}": precision_k(k, y_pred_min, y_true_min)
            for k in (5, 10, 20)
        },
        "ndcg": ndcg(y_pred_min, y_true_min),
    }
    column_index = pd.MultiIndex.from_tuples(
        [(c, m) for m in sorted(metrics) for c in columns]
    )
    values = np.concatenate([metrics[m] for m in sorted(metrics)])
    return pd.DataFrame(np.reshape(values, (1, -1)), columns=column_index)
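# The sketch below is a minimal, self-contained illustration (NumPy and pandas only) of how
# `_score` lays out its results: each scoring metric yields one value per performance column,
# and the final frame has a single row indexed by (column, metric) pairs. The column names and
# metric values are fabricated for illustration.
import numpy as np
import pandas as pd

example_columns = ["latency", "training_time"]
example_metrics = {"nrmse": np.array([0.1, 0.2]), "smape": np.array([0.3, 0.4])}

example_index = pd.MultiIndex.from_tuples(
    [(c, m) for m in sorted(example_metrics) for c in example_columns]
)
example_values = np.concatenate([example_metrics[m] for m in sorted(example_metrics)])
example_frame = pd.DataFrame(np.reshape(example_values, (1, -1)), columns=example_index)
# The single-row column example_frame[("latency", "nrmse")] holds the nRMSE computed on the
# latency column.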
def _dummy_performance() -> Performance:
    return Performance.from_dict(
        {
            mm: np.nan
            for m in Performance.metrics()
            for mm in [f"{m}_mean", f"{m}_std"]
        }
    )
def run(
    self, model_config: Union[ModelConfig, Dict[str, ModelConfig]]
) -> pd.DataFrame:
    """
    Runs the evaluation, providing a configuration's performances for all datasets.

    Args:
        model_config: A single configuration for which to obtain performances or a mapping
            from dataset names to model configurations.

    Returns:
        The metrics on individual datasets.
    """
    results = []
    for dataset in self.datasets:
        # Construct the config
        if isinstance(model_config, dict):
            config = Config(model_config[dataset.name()], dataset)
        else:
            config = Config(model_config, dataset)

        # Get the performance and append to results
        performance = self.tracker.get_performance(config)
        df = Performance.to_dataframe([performance]).assign(
            test_dataset=dataset.name()
        )
        results.append(df)

    return pd.concat(results).set_index("test_dataset")
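# A hedged, standalone sketch of the collection pattern used in `run` above: one single-row
# frame per dataset, tagged with the dataset name and finally indexed by it. The dataset names
# and the "ncrps" column are made up for illustration and are not read from any tracker.
import pandas as pd

example_results = []
for dataset_name, score in [("m4_hourly", 0.12), ("electricity", 0.34)]:
    example_results.append(
        pd.DataFrame({"ncrps": [score]}).assign(test_dataset=dataset_name)
    )
example_table = pd.concat(example_results).set_index("test_dataset")
# `example_table` has one row per dataset, mirroring the shape of the frame returned by `run`.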
def fit(self, X: List[Performance], _y: Any = None) -> PerformanceEncoder:
    df = Performance.to_dataframe(X)
    # Remember every available metric column, then restrict to the configured subset
    # if one was provided.
    self.all_feature_names_ = df.columns.tolist()
    if self.metrics is None:
        self.feature_names_ = df.columns.tolist()
    else:
        assert all(m in df.columns for m in self.metrics)
        self.feature_names_ = self.metrics
    return self
def inverse_transform(
    self, X: npt.NDArray[np.float32], _y: Any = None
) -> List[Performance]:
    # Columns that were dropped during encoding are restored and filled with NaN.
    df = pd.DataFrame(X, columns=self.feature_names_).assign(
        **{
            col: np.nan
            for col in set(self.all_feature_names_) - set(self.feature_names_)
        }
    )
    return [Performance.from_dict(row.to_dict()) for _, row in df.iterrows()]
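# A pandas-only sketch of the `inverse_transform` idea: rebuild a full-width frame from the
# encoded subset of metrics, filling every column that was dropped during encoding with NaN.
# The column names and values here are illustrative only.
import numpy as np
import pandas as pd

example_all_features = ["ncrps_mean", "ncrps_std", "latency_mean"]
example_kept_features = ["ncrps_mean"]  # metrics kept by the encoder
example_encoded = np.array([[0.12], [0.30]], dtype=np.float32)

example_decoded = pd.DataFrame(example_encoded, columns=example_kept_features).assign(
    **{col: np.nan for col in set(example_all_features) - set(example_kept_features)}
)
# `example_decoded` has all three columns; the two that were not encoded are NaN everywhere.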
def performance(cls, evaluations: List[Evaluation]) -> Performance:
    """
    Aggregates the provided evaluations into a single performance object.

    Args:
        evaluations: The evaluations.

    Returns:
        The performance object. Since this is not part of the evaluation, it has the
        `num_model_parameters` attribute unset (set to zero).
    """
    metrics = [e.summary for e in evaluations]
    kwargs = {
        m: (
            Metric(0, 0)
            if m == "num_model_parameters"
            else Metric(
                np.mean([metric[m] if m in metric else np.nan for metric in metrics]),
                np.std([metric[m] if m in metric else np.nan for metric in metrics]),
            )
        )
        for m in Performance.metrics()
    }
    return Performance(**kwargs)  # type: ignore
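# A minimal NumPy-only sketch of the aggregation performed in `performance`: for every metric
# name, take the mean and standard deviation across evaluation summaries, treating a missing
# key as NaN so that it propagates into the aggregate. The summaries below are fabricated.
import numpy as np

example_summaries = [{"ncrps": 0.10, "mase": 1.2}, {"ncrps": 0.14}]
example_aggregate = {
    name: (
        np.mean([s[name] if name in s else np.nan for s in example_summaries]),
        np.std([s[name] if name in s else np.nan for s in example_summaries]),
    )
    for name in ["ncrps", "mase"]
}
# example_aggregate["ncrps"] is roughly (0.12, 0.02), while example_aggregate["mase"] is
# (nan, nan) because one summary lacks the key.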
def recommend(
    self,
    dataset: DatasetConfig,
    candidates: Optional[List[T]] = None,
    max_count: int = 10,
) -> List[Recommendation[T]]:
    """
    This method takes a dataset and a set of constraints and outputs a set of
    recommendations. The recommendations provide both the configurations of the
    recommended model as well as the expected performance.

    Args:
        dataset: The configuration of the dataset for which to recommend a model.
        candidates: A list of model configurations that are allowed to be recommended.
            If `None`, any model configuration is permitted.
        max_count: The maximum number of models to recommend.

    Returns:
        The recommendations which (approximately) satisfy the provided constraints.
    """
    model_configs = self.generator.generate(candidates)
    configs = [Config(m, dataset) for m in model_configs]
    performances = self._get_performances(configs)

    # We construct a data frame, extracting the performance metrics to minimize. Then, we
    # invert the performance metrics for the metrics to maximize.
    df = Performance.to_dataframe(performances)[self.objectives]

    # Then, we perform a nondominated sort
    argsort = argsort_nondominated(
        df.to_numpy(),  # type: ignore
        dim=df.columns.tolist().index(self.focus)
        if self.focus is not None
        else None,
        max_items=max_count,
    )

    # And get the recommendations
    result = []
    for choice in cast(List[int], argsort):
        config = configs[choice]
        recommendation = Recommendation(config.model, performances[choice])
        result.append(recommendation)
    return result
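# A self-contained sketch of the idea behind the nondominated sort used in `recommend`: with
# all objectives to be minimized, a candidate is on the Pareto front if no other candidate is
# at least as good on every objective and strictly better on one. This naive O(n^2) loop only
# illustrates the concept; it is not the project's `argsort_nondominated` implementation.
import numpy as np

example_objectives = np.array([[0.1, 5.0], [0.2, 3.0], [0.3, 6.0]])  # rows are candidates

def example_pareto_front(values: np.ndarray) -> list:
    front = []
    for i, row in enumerate(values):
        dominated = any(
            np.all(other <= row) and np.any(other < row)
            for j, other in enumerate(values)
            if j != i
        )
        if not dominated:
            front.append(i)
    return front

# Candidate 2 is dominated by candidate 0 (worse on both objectives), so the front is [0, 1].
assert example_pareto_front(example_objectives) == [0, 1]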
def dataframe(self, std: bool = True) -> pd.DataFrame:
    """
    Returns a dataframe which contains the performance metrics as columns and the
    configurations as multi-index.

    Args:
        std: Whether to include the standard deviation of performance metrics in the
            dataframe.
    """
    # Should implement this for ensembles as well
    index_df = Config.to_dataframe(
        cast(List[Config[ModelConfig]], self.configurations)
    )

    # Reorder columns
    column_order = ["dataset"] + [
        c for c in index_df.columns.tolist() if c != "dataset"
    ]
    index = pd.MultiIndex.from_frame(index_df[column_order])

    df = Performance.to_dataframe(self.performances, std=std)
    df.index = index
    return df.sort_index()
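# A short pandas sketch of the indexing pattern in `dataframe`: the configuration columns
# become a MultiIndex with "dataset" moved to the front, and the metric frame is re-indexed by
# it. The configurations and the "ncrps_mean" values are invented for illustration.
import pandas as pd

example_index_df = pd.DataFrame(
    {"model": ["deepar", "tft"], "dataset": ["m4_hourly", "m4_hourly"]}
)
example_order = ["dataset"] + [c for c in example_index_df.columns if c != "dataset"]
example_index = pd.MultiIndex.from_frame(example_index_df[example_order])

example_metrics = pd.DataFrame({"ncrps_mean": [0.12, 0.10]})
example_metrics.index = example_index
example_metrics = example_metrics.sort_index()
# Rows are now addressed by (dataset, model) pairs, e.g. example_metrics.loc[("m4_hourly", "tft")].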
def fit(
    self,
    configs: List[Config[ModelConfig]],
    performances: List[Performance],
) -> None:
    super().fit(configs, performances)

    # We need to sort by dataset to have the same ordering for each model config
    ordering = np.argsort([c.dataset.name() for c in configs])
    performance_df = Performance.to_dataframe(performances)

    # Extract all metrics
    metric_map = defaultdict(list)
    for i in ordering:
        metric_map[configs[i].model].append(
            performance_df.iloc[i][self.objectives].to_numpy(),  # type: ignore
        )

    # Build the properties
    self.metrics = np.stack(list(metric_map.values()), axis=1)
    self.model_indices = {model: i for i, model in enumerate(metric_map)}

    # If we are in the multi-objective setting, we have to apply dataset-level quantile
    # normalization of each objective. Otherwise, we perform standardization.
    if not self.enforce_single_objective and len(self.objectives) > 1:
        transformer = QuantileTransformer(
            n_quantiles=min(1000, self.metrics.shape[0])
        )
    else:
        transformer = StandardScaler()
    self.metrics = np.stack(
        [
            transformer.fit_transform(dataset_metrics)
            for dataset_metrics in self.metrics
        ]
    )
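# A hedged sketch of the normalization choice in `fit`, using scikit-learn directly: in the
# multi-objective case every dataset slice (models x objectives) is quantile-normalized,
# otherwise it is standardized. The random metrics and the cap on `n_quantiles` at the
# per-dataset sample count are assumptions made for this illustration.
import numpy as np
from sklearn.preprocessing import QuantileTransformer, StandardScaler

example_rng = np.random.default_rng(0)
example_metrics = example_rng.random((3, 8, 2))  # [num_datasets, num_models, num_objectives]
multi_objective = example_metrics.shape[-1] > 1

example_transformer = (
    QuantileTransformer(n_quantiles=min(1000, example_metrics.shape[1]))
    if multi_objective
    else StandardScaler()
)
example_normalized = np.stack(
    [example_transformer.fit_transform(d) for d in example_metrics]
)
# Each dataset slice is normalized independently, matching the per-dataset loop above.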
def performances(self) -> list[Performance]:
    """
    Returns the list of performances for all models associated with this job. The
    variances of all metrics will be set to 0.
    """
    return [
        Performance(
            training_time=Metric(p["training"]["duration"], 0),
            latency=Metric(self.static_metrics["latency"], 0),
            num_model_parameters=Metric(
                self.static_metrics["num_model_parameters"], 0
            ),
            num_gradient_updates=Metric(
                p["training"]["num_gradient_updates"], 0
            ),
            **{
                k: Metric(p["testing"][k], 0)
                for k in ["mase", "smape", "nrmse", "nd", "ncrps"]
            },
        )
        for p in self.metrics
    ]
def transform(self, X: List[Performance], _y: Any = None) -> npt.NDArray[np.float32]:
    df = Performance.to_dataframe(X)
    return df[self.feature_names_].to_numpy()
def extract_job_infos(
    training_jobs: List[Job],
    validation_metric: Optional[ValidationMetric],
    group_seeds: bool,
    data_path: Union[str, Path] = DEFAULT_DATA_PATH,
) -> List[JobInfo]:
    """
    Returns a list of the job information objects available for all training jobs provided.
    """
    # We group the jobs by hyperparameters, excluding the seed
    if group_seeds:
        grouped_jobs = defaultdict(list)
        for job in training_jobs:
            hypers = {
                "model": job.model,
                "dataset": job.dataset,
                **job.hyperparameters,
            }
            grouped_jobs[tuple(sorted(hypers.items()))].append(job)
        all_jobs = grouped_jobs.values()
    else:
        all_jobs = [[job] for job in training_jobs]

    # Then, we can instantiate the info objects by iterating over groups of jobs
    runs = []
    for jobs in tqdm(all_jobs):
        ref_job = jobs[0]
        model_name = ref_job.model
        base_hyperparams = {**ref_job.hyperparameters}

        # First, we reconstruct the training fractions
        if issubclass(MODEL_REGISTRY[model_name], TrainConfig):
            training_fractions = [1 / 81, 1 / 27] + [i / 9 for i in range(1, 10)]
        else:
            training_fractions = [0]

        assert all(
            len(job.metrics) == len(training_fractions) for job in jobs
        ), "Job does not provide sufficiently many models."

        # Then, we select the Hyperband training fraction indices to iterate over
        if len(training_fractions) == 1:
            training_fraction_indices = [0]
        else:
            training_fraction_indices = [0, 1, 2, 4, 10]

        # Then, we iterate over all training times, construct the hyperparameters and
        # collect the performance metrics
        for i in training_fraction_indices:
            # Create the config object
            hyperparams = {
                **base_hyperparams,
                "training_fraction": training_fractions[i],
            }
            model_config = get_model_config(model_name, **hyperparams)
            config = Config(
                model_config, get_dataset_config(ref_job.dataset, data_path)
            )

            # Get the indices of the models that should be used to derive the performance
            if validation_metric is None or len(training_fractions) == 1:
                # If the model does not require training, or we don't look at the
                # validation performance, we just choose the current index
                choices = [i] * len(jobs)
            else:
                # Otherwise, we get the minimum value for the metric up to this point in time
                choices = [
                    np.argmin(
                        [
                            p["evaluation"][validation_metric]
                            for p in job.metrics
                        ][: i + 1]
                    ).item()
                    for job in jobs
                ]

            # Get the performances of the chosen models
            performances = [
                job.performances[choice] for choice, job in zip(choices, jobs)
            ]

            # And average the performance
            averaged_performance = Performance(
                **{
                    metric: Metric(
                        np.mean([getattr(p, metric).mean for p in performances]),
                        np.std([getattr(p, metric).mean for p in performances]),
                    )
                    for metric in Performance.metrics()
                }
            )

            # Get validation scores if available
            try:
                val_ncrps = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_ncrps"]
                        for (job, c) in zip(jobs, choices)
                    ]
                )
                val_loss = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_loss"]
                        for (job, c) in zip(jobs, choices)
                    ]
                ).item()
                val_scores = ValidationScores(val_ncrps, val_loss)
            except KeyError:
                val_scores = None

            # Initialize the info object
            runs.append(
                JobInfo(config, averaged_performance, val_scores, jobs, choices)
            )

    return runs
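# A standalone sketch of the seed-grouping step at the top of `extract_job_infos`: jobs that
# share a model, dataset, and hyperparameters but differ only in their seed end up in the same
# bucket. The dictionary-based "jobs" below are fabricated stand-ins for the real job objects.
from collections import defaultdict

example_jobs = [
    {"model": "deepar", "dataset": "m4_hourly", "hyperparameters": {"context_length_multiple": 2}, "seed": 0},
    {"model": "deepar", "dataset": "m4_hourly", "hyperparameters": {"context_length_multiple": 2}, "seed": 1},
    {"model": "deepar", "dataset": "m4_hourly", "hyperparameters": {"context_length_multiple": 4}, "seed": 0},
]

example_groups = defaultdict(list)
for job in example_jobs:
    key = {"model": job["model"], "dataset": job["dataset"], **job["hyperparameters"]}
    example_groups[tuple(sorted(key.items()))].append(job)

# The first two jobs (seeds 0 and 1) collapse into a single group, the third stands alone.
assert len(example_groups) == 2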