Example #1
    def _score(self, y_pred: List[Performance],
               y_true: List[Performance]) -> pd.DataFrame:
        df_pred = Performance.to_dataframe(y_pred)
        df_true = Performance.to_dataframe(y_true)

        if self.metrics is not None:
            df_pred = df_pred[self.metrics]
            df_true = df_true[self.metrics]

        # We extract the NumPy arrays so that indexing is easier. Each metric is computed such that
        # it results in an array of shape [D] where D is the number of metrics.
        columns = df_pred.columns
        y_pred_min = df_pred.to_numpy()  # type: ignore
        y_true_min = df_true.to_numpy()  # type: ignore

        # Return all results
        metrics = {
            "nrmse": nrmse(y_pred_min, y_true_min),
            "smape": smape(y_pred_min, y_true_min),
            "mrr": mrr(y_pred_min, y_true_min),
            **{
                f"precision_{k}": precision_k(k, y_pred_min, y_true_min)
                for k in (5, 10, 20)
            },
            "ndcg": ndcg(y_pred_min, y_true_min),
        }
        column_index = pd.MultiIndex.from_tuples([(c, m)
                                                  for m in sorted(metrics)
                                                  for c in columns])
        values = np.concatenate([metrics[m] for m in sorted(metrics)])
        return pd.DataFrame(np.reshape(values, (1, -1)), columns=column_index)
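The metric helpers used above (nrmse, smape, mrr, precision_k, ndcg) are not shown. Per the comment, each one reduces over the rows and returns one value per column, i.e. an array of shape [D]. A minimal sketch of a column-wise sMAPE that follows this shape contract, purely as an illustration and not the library's actual implementation:

import numpy as np
import numpy.typing as npt


def smape(y_pred: npt.NDArray[np.float32],
          y_true: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
    # Inputs have shape [N, D]; the output has shape [D], one value per column.
    denom = (np.abs(y_pred) + np.abs(y_true)) / 2
    return np.mean(np.abs(y_pred - y_true) / denom, axis=0)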
Example #2
def _dummy_performance() -> Performance:
    return Performance.from_dict(
        {
            mm: np.nan
            for m in Performance.metrics()
            for mm in [f"{m}_mean", f"{m}_std"]
        }
    )
Example #3
    def run(
        self, model_config: Union[ModelConfig, Dict[str, ModelConfig]]
    ) -> pd.DataFrame:
        """
        Runs the evaluation, providing a configuration's performances for all
        datasets.

        Args:
            model_config: A single configuration for which to obtain performances or a mapping from
                dataset names to model configurations.

        Returns:
            The metrics on individual datasets.
        """
        results = []
        for dataset in self.datasets:
            # Construct the config
            if isinstance(model_config, dict):
                config = Config(model_config[dataset.name()], dataset)
            else:
                config = Config(model_config, dataset)

            # Get the performance and append to results
            performance = self.tracker.get_performance(config)
            df = Performance.to_dataframe([performance]).assign(
                test_dataset=dataset.name()
            )
            results.append(df)

        return pd.concat(results).set_index("test_dataset")
Example #4
    def fit(self, X: List[Performance], _y: Any = None) -> PerformanceEncoder:
        df = Performance.to_dataframe(X)
        self.all_feature_names_ = df.columns.tolist()
        if self.metrics is None:
            self.feature_names_ = df.columns.tolist()
        else:
            assert all(m in df.columns for m in self.metrics)
            self.feature_names_ = self.metrics
        return self
Example #5
    def inverse_transform(self,
                          X: npt.NDArray[np.float32],
                          _y: Any = None) -> List[Performance]:
        df = pd.DataFrame(X, columns=self.feature_names_).assign(
            **{
                col: np.nan
                for col in set(self.all_feature_names_) - set(self.feature_names_)
            }
        )
        return [
            Performance.from_dict(row.to_dict()) for _, row in df.iterrows()
        ]
Example #6
    def performance(cls, evaluations: List[Evaluation]) -> Performance:
        """
        Aggregates the provided evaluations into a single performance object.

        Args:
            evaluations: The evaluations.

        Returns:
            The performance object. Since the number of model parameters is not part of the
                evaluations, the `num_model_parameters` attribute is set to a zero-valued metric
                rather than aggregated.
        """
        metrics = [e.summary for e in evaluations]
        kwargs = {
            m: (Metric(0, 0) if m == "num_model_parameters" else Metric(
                np.mean([
                    metric[m] if m in metric else np.nan for metric in metrics
                ]),
                np.std([
                    metric[m] if m in metric else np.nan for metric in metrics
                ]),
            ))
            for m in Performance.metrics()
        }
        return Performance(**kwargs)  # type: ignore
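Because missing metrics are filled with np.nan and aggregated with np.mean and np.std (rather than their NaN-ignoring counterparts), a metric that is absent from even one evaluation aggregates to NaN. A small standalone illustration of this behavior:

import numpy as np

summaries = [{"smape": 0.12}, {"smape": 0.18}, {}]  # the last evaluation lacks "smape"
values = [s["smape"] if "smape" in s else np.nan for s in summaries]
print(np.mean(values))     # nan
print(np.nanmean(values))  # ~0.15, if missing entries should be ignored instead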
Example #7
    def recommend(
        self,
        dataset: DatasetConfig,
        candidates: Optional[List[T]] = None,
        max_count: int = 10,
    ) -> List[Recommendation[T]]:
        """
        This method takes a dataset along with optional constraints (the permitted candidate
        models and the maximum number of recommendations) and outputs a set of recommendations.
        Each recommendation provides both the configuration of the recommended model and its
        expected performance.

        Args:
            dataset: The configuration of the dataset for which to recommend a model.
            candidates: A list of model configurations that are allowed to be recommended. If
                `None`, any model configuration is permitted.
            max_count: The maximum number of models to recommend.

        Returns:
            The recommendations which (approximately) satisfy the provided constraints.
        """
        model_configs = self.generator.generate(candidates)
        configs = [Config(m, dataset) for m in model_configs]
        performances = self._get_performances(configs)

        # We construct a data frame, extracting the performance metrics to minimize.
        # Then, we invert the performance metrics for the metrics to maximize.
        df = Performance.to_dataframe(performances)[self.objectives]

        # Then, we perform a nondominated sort
        argsort = argsort_nondominated(
            df.to_numpy(),  # type: ignore
            dim=df.columns.tolist().index(self.focus)
            if self.focus is not None
            else None,
            max_items=max_count,
        )

        # And get the recommendations
        result = []
        for choice in cast(List[int], argsort):
            config = configs[choice]
            recommendation = Recommendation(config.model, performances[choice])
            result.append(recommendation)

        return result
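argsort_nondominated itself is not shown here. It builds on Pareto dominance over the selected objective columns (treated as metrics to minimize); a self-contained sketch of extracting the indices of one non-dominated front, for illustration only:

from typing import List

import numpy as np


def nondominated_indices(values: np.ndarray) -> List[int]:
    # values has shape [N, D]; row i is dominated if another row is at least as
    # good in every objective (lower is better) and strictly better in one.
    front = []
    for i in range(values.shape[0]):
        dominated = np.any(
            np.all(values <= values[i], axis=1) & np.any(values < values[i], axis=1)
        )
        if not dominated:
            front.append(i)
    return front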
Example #8
    def dataframe(self, std: bool = True) -> pd.DataFrame:
        """
        Returns a dataframe which contains the performance metrics as columns and the
        configurations as multi-index.

        Args:
            std: Whether to include the standard deviation of performance metrics in the dataframe.
        """
        # Should implement this for ensembles as well
        index_df = Config.to_dataframe(
            cast(List[Config[ModelConfig]], self.configurations))
        # Reorder columns
        column_order = ["dataset"] + [
            c for c in index_df.columns.tolist() if c != "dataset"
        ]
        index = pd.MultiIndex.from_frame(index_df[column_order])
        df = Performance.to_dataframe(self.performances, std=std)
        df.index = index
        return df.sort_index()
Example #9
    def fit(
        self,
        configs: List[Config[ModelConfig]],
        performances: List[Performance],
    ) -> None:
        super().fit(configs, performances)

        # We need to sort by dataset to have the same ordering for each model config
        ordering = np.argsort([c.dataset.name() for c in configs])
        performance_df = Performance.to_dataframe(performances)

        # Extract all metrics
        metric_map = defaultdict(list)
        for i in ordering:
            metric_map[configs[i].model].append(
                performance_df.iloc[i][self.objectives].to_numpy(),  # type: ignore
            )

        # Build the properties
        self.metrics = np.stack(list(metric_map.values()), axis=1)
        self.model_indices = {model: i for i, model in enumerate(metric_map)}

        # If we are in the multi-objective setting, we have to apply dataset-level quantile
        # normalization of each objective. Otherwise, we perform standardization.
        if not self.enforce_single_objective and len(self.objectives) > 1:
            transformer = QuantileTransformer(
                n_quantiles=min(1000, self.metrics.shape[0])
            )
            self.metrics = np.stack(
                [
                    transformer.fit_transform(dataset_metrics)
                    for dataset_metrics in self.metrics
                ]
            )
        else:
            transformer = StandardScaler()
            self.metrics = np.stack(
                [
                    transformer.fit_transform(dataset_metrics)
                    for dataset_metrics in self.metrics
                ]
            )
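Both branches normalize every dataset slice of shape [num_models, num_objectives] independently; only the choice of transformer differs. A standalone illustration with scikit-learn, assuming the same [num_datasets, num_models, num_objectives] layout as above:

import numpy as np
from sklearn.preprocessing import QuantileTransformer, StandardScaler

metrics = np.random.rand(3, 8, 2)  # [num_datasets, num_models, num_objectives]

# Multi-objective: rank-based quantile normalization per dataset
quantile_normalized = np.stack(
    [QuantileTransformer(n_quantiles=8).fit_transform(d) for d in metrics]
)

# Single objective: plain standardization per dataset
standardized = np.stack([StandardScaler().fit_transform(d) for d in metrics])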
Example #10
    def performances(self) -> list[Performance]:
        """
        Returns the list of performances for all models associated with this
        job.

        The standard deviations of all metrics will be set to 0.
        """
        return [
            Performance(
                training_time=Metric(p["training"]["duration"], 0),
                latency=Metric(self.static_metrics["latency"], 0),
                num_model_parameters=Metric(
                    self.static_metrics["num_model_parameters"], 0),
                num_gradient_updates=Metric(
                    p["training"]["num_gradient_updates"], 0),
                **{
                    k: Metric(p["testing"][k], 0)
                    for k in ["mase", "smape", "nrmse", "nd", "ncrps"]
                },
            ) for p in self.metrics
        ]
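The layout of self.metrics and self.static_metrics is not shown in this snippet; judging from the accesses above, each per-model entry carries a training section and a testing section, alongside job-level static metrics. A hypothetical entry with made-up values, only to illustrate the expected shape:

static_metrics = {"latency": 0.35, "num_model_parameters": 120000}
metrics = [
    {
        "training": {"duration": 812.0, "num_gradient_updates": 5000},
        "testing": {"mase": 1.21, "smape": 0.14, "nrmse": 0.92, "nd": 0.08, "ncrps": 0.07},
    },
]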
Example #11
    def transform(self,
                  X: List[Performance],
                  _y: Any = None) -> npt.NDArray[np.float32]:
        df = Performance.to_dataframe(X)
        return df[self.feature_names_].to_numpy()
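Examples #4, #5, and #11 appear to be the fit, inverse_transform, and transform methods of the same scikit-learn-style PerformanceEncoder. Assuming that, and assuming the constructor accepts the metrics attribute used above (both assumptions, not confirmed by these snippets), a round trip would look roughly like this:

# Hypothetical usage; `performances` is a List[Performance] obtained elsewhere.
encoder = PerformanceEncoder(metrics=["mase", "smape"])
features = encoder.fit(performances).transform(performances)  # shape [N, 2]
restored = encoder.inverse_transform(features)  # unselected metrics come back as NaN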
Example #12
def extract_job_infos(
    training_jobs: List[Job],
    validation_metric: Optional[ValidationMetric],
    group_seeds: bool,
    data_path: Union[str, Path] = DEFAULT_DATA_PATH,
) -> List[JobInfo]:
    """
    Returns a list of the job information objects available for all training
    jobs provided.
    """
    # We group the jobs by hyperparameters, excluding the seed
    if group_seeds:
        grouped_jobs = defaultdict(list)
        for job in training_jobs:
            hypers = {
                "model": job.model,
                "dataset": job.dataset,
                **job.hyperparameters,
            }
            grouped_jobs[tuple(sorted(hypers.items()))].append(job)
        all_jobs = grouped_jobs.values()
    else:
        all_jobs = [[job] for job in training_jobs]

    # Then, we can instantiate the info objects by iterating over groups of jobs
    runs = []
    for jobs in tqdm(all_jobs):
        ref_job = jobs[0]
        model_name = ref_job.model
        base_hyperparams = {**ref_job.hyperparameters}

        # First, we reconstruct the training times
        if issubclass(MODEL_REGISTRY[model_name], TrainConfig):
            training_fractions = [1 / 81, 1 / 27] + [
                i / 9 for i in range(1, 10)
            ]
        else:
            training_fractions = [0]

        assert all(
            len(job.metrics) == len(training_fractions) for job in jobs
        ), "Job does not provide sufficiently many models."

        # Then, we iterate over the Hyperband training times
        if len(training_fractions) == 1:
            training_fraction_indices = [0]
        else:
            training_fraction_indices = [0, 1, 2, 4, 10]

        # Then, we iterate over all training times, construct the hyperparameters and collect
        # the performance metrics
        for i in training_fraction_indices:
            # Create the config object
            hyperparams = {
                **base_hyperparams,
                "training_fraction": training_fractions[i],
            }
            model_config = get_model_config(model_name, **hyperparams)
            config = Config(
                model_config, get_dataset_config(ref_job.dataset, data_path)
            )

            # Get the indices of the models that should be used to derive the performance
            if validation_metric is None or len(training_fractions) == 1:
                # If the model does not require training, or we don't look at the validation
                # performance, we just choose the current index
                choices = [i] * len(jobs)
            else:
                # Otherwise, we get the minimum value for the metric up to this point in time
                choices = [
                    np.argmin(
                        [
                            p["evaluation"][validation_metric]
                            for p in job.metrics
                        ][: i + 1]
                    ).item()
                    for job in jobs
                ]

            # Get the performances of the chosen models
            performances = [
                job.performances[choice] for choice, job in zip(choices, jobs)
            ]

            # And average the performance
            averaged_performance = Performance(
                **{
                    metric: Metric(
                        np.mean(
                            [getattr(p, metric).mean for p in performances]
                        ),
                        np.std(
                            [getattr(p, metric).mean for p in performances]
                        ),
                    )
                    for metric in Performance.metrics()
                }
            )

            # Get validation scores if available
            try:
                val_ncrps = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_ncrps"]
                        for (job, c) in zip(jobs, choices)
                    ]
                )
                val_loss = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_loss"]
                        for (job, c) in zip(jobs, choices)
                    ]
                ).item()
                val_scores = ValidationScores(val_ncrps, val_loss)
            except KeyError:
                val_scores = None

            # Initialize the info object
            runs.append(
                JobInfo(
                    config, averaged_performance, val_scores, jobs, choices
                )
            )

    return runs
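The Hyperband-style schedule used above can be made concrete: there are eleven training fractions, and the selected indices [0, 1, 2, 4, 10] pick out the roughly geometric subsequence 1/81, 1/27, 1/9, 1/3, 1. A short standalone check:

training_fractions = [1 / 81, 1 / 27] + [i / 9 for i in range(1, 10)]
assert len(training_fractions) == 11
print([round(training_fractions[i], 4) for i in (0, 1, 2, 4, 10)])
# [0.0123, 0.037, 0.1111, 0.3333, 1.0]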