def download(dataset: Optional[str], path: str):
    """
    Downloads and preprocesses either a single dataset or all datasets in the registry.

    Args:
        dataset: The registry key of the dataset to download. If ``None``, all datasets in the
            registry are downloaded.
        path: The directory which is passed to the dataset classes as their base path.
    """
    base = Path(path)

    # Either the requested dataset is processed, or -- when everything is downloaded -- an M3
    # dataset goes first. By preloading it, the M3 data is not downloaded in parallel below.
    first_name = "m3_monthly" if dataset is None else dataset
    first_dataset = DATASET_REGISTRY[first_name](base)
    first_dataset.generate()
    first_dataset.prepare()
    if dataset is not None:
        return

    # Afterwards, all datasets of the registry are processed in parallel.
    fetch = partial(_download_dataset, base=base)
    names = list(DATASET_REGISTRY.keys())
    worker_count = min(
        num_fitting_processes(cpus_per_process=1, memory_per_process=8),
        len(DATASET_REGISTRY),
    )
    run_parallel(fetch, names, num_processes=worker_count)
def _download_public_evaluations(
    include_forecasts: bool, evaluations_path: Path
) -> None:
    """
    Downloads the publicly available evaluations from the ``odp-tsbench`` S3 bucket.

    The ``metrics.tar.gz`` archive is always downloaded and extracted into the evaluations
    directory. If requested, the bucket is additionally listed and the objects returned by
    ``_extract_object_names`` are downloaded in parallel.

    Args:
        include_forecasts: Whether to also download the objects listed in the bucket.
        evaluations_path: The directory into which the metrics are extracted and which is passed
            as destination for the downloaded objects.
    """
    public_bucket = "odp-tsbench"

    # `os.cpu_count()` returns `None` if the number of CPUs cannot be determined. A `cast` does
    # not change the runtime value, so we fall back to a single CPU instead of failing on
    # `2 * None`.
    num_connections = 2 * (os.cpu_count() or 1)

    session = default_session()
    client = session.client(
        "s3",
        config=botocore.client.Config(  # type: ignore
            signature_version=botocore.UNSIGNED,
            max_pool_connections=num_connections,
        ),
    )

    # First, download the metrics
    print("Downloading metrics...")
    with tempfile.TemporaryDirectory() as tmp:
        file = Path(tmp) / "metrics.tar.gz"
        client.download_file(public_bucket, "metrics.tar.gz", str(file))
        with tarfile.open(file, mode="r:gz") as tar:
            # NOTE(review): `extractall` relies on the member paths stored in the archive.
            # Confirm that the bucket contents are trusted or pass an extraction filter on
            # Python versions that support it.
            tar.extractall(evaluations_path)

    # Then, optionally download the forecasts
    if not include_forecasts:
        return

    print("Downloading forecasts...")

    # First, get all files. The listing is paginated: as long as the response is truncated, we
    # continue after the last object name collected so far.
    with tqdm(desc="List objects") as progress:
        response = client.list_objects(Bucket=public_bucket)
        objects = _extract_object_names(response)
        progress.update()
        while response["IsTruncated"]:
            response = client.list_objects(
                Bucket=public_bucket, Marker=objects[-1]
            )
            objects.extend(_extract_object_names(response))
            progress.update()

    # Then, download all of the objects
    run_parallel(
        partial(
            _download_object,
            bucket=public_bucket,
            client=client,
            destination=evaluations_path,
        ),
        objects,
        num_connections,
    )
def run(self) -> pd.DataFrame:
    """
    Evaluates the surrogate via LOOCV over the datasets available from the tracker. Metrics are
    reported per held-out (test) dataset.

    Returns:
        A data frame with one result per fold where the metrics are the columns. The frame is
            indexed by the ``test_dataset`` column, i.e. the dataset which was left out.
    """
    sequential = isinstance(self.surrogate, AutoGluonSurrogate)
    folds = list(loocv_split(self.tracker))

    if sequential:
        # AutoGluon surrogates are evaluated fold by fold within the current process.
        fold_metrics = []
        for fold in tqdm(folds):
            fold_metrics.append(self._run_on_dataset(fold))
    else:
        # All other surrogates are evaluated in parallel, using at most one process per fold.
        available_processes = num_fitting_processes(
            cpus_per_process=self.surrogate.required_cpus,
            memory_per_process=self.surrogate.required_memory,
        )
        fold_metrics = run_parallel(
            self._run_on_dataset,
            data=folds,
            num_processes=min(available_processes, len(folds)),
        )

    combined = pd.concat(fold_metrics)
    return combined.set_index("test_dataset")
def run(self) -> List[Dict[str, ModelConfig]]:
    """
    Evaluates the recommender on all LOOCV folds and returns the models that were selected for
    each dataset. The config evaluator can be used to construct a data frame of performances
    from the configurations.

    Returns:
        The recommended models. The outer list is indexed by the rank of the recommendation,
            i.e. the first item contains the first recommendation for every key, the second
            item all second recommendations, etc.
    """
    folds = list(loocv_split(self.tracker))
    worker_count = min(
        len(folds),
        num_fitting_processes(
            cpus_per_process=self.recommender.required_cpus,
            memory_per_process=self.recommender.required_memory,
        ),
    )
    fold_results = run_parallel(
        self._run_on_dataset, data=folds, num_processes=worker_count
    )

    # Merge the per-fold mappings into a single one; later folds overwrite duplicate keys.
    merged = {}
    for fold_result in fold_results:
        merged.update(fold_result.items())

    # Transpose the mapping: one dictionary per recommendation rank.
    ranked = []
    for rank in range(self.num_recommendations):
        ranked.append({name: picks[rank] for name, picks in merged.items()})
    return ranked
def run(self) -> Tuple[pd.DataFrame, Dict[str, List[ModelConfig]]]:
    """
    Evaluates on the data provided via the tracker. The data is partitioned by dataset and
    "grouped LOOCV" is run to compute performance metrics on the datasets.

    Returns:
        The metrics on the individual datasets as data frame, indexed by the ``test_dataset``
            column.
        The model choices for each dataset.
    """
    fold_results = run_parallel(
        self._run_on_dataset,
        data=list(loocv_split(self.tracker)),
        num_processes=num_fitting_processes(),
    )

    # Every fold result carries the performance frame first and the member mapping second.
    frames = []
    for fold_result in fold_results:
        frames.append(fold_result[0])

    member_mapping = {}
    for fold_result in fold_results:
        member_mapping.update(fold_result[1].items())

    df = pd.concat(frames).set_index("test_dataset")
    return df, member_mapping
def simulate(
    data_path: str,
    evaluations_path: str,
    output_path: str,
    max_ensemble_size: int,
    default_samples: int,
    hyperensemble_samples: int,
    random_samples: int,
    sample_datasets: bool,
    seed: int,
):
    """
    Simulates the performance of various ensembles. The ensembles are built from configurations
    (i.e. model types and hyperparameters) for which offline evaluations are available.

    Args:
        data_path: The directory passed to the model tracker as its data path.
        evaluations_path: The directory from which the model tracker loads the evaluations.
        output_path: The file to which the pickled results are written.
        max_ensemble_size: The maximum number of members of any sampled ensemble.
        default_samples: The number of ensembles to sample from all combinations of default
            configurations. ``-1`` uses all combinations, ``0`` disables this kind of ensemble.
        hyperensemble_samples: The number of ensembles to sample from all combinations of
            configurations sharing the class of a default configuration. ``-1`` uses all
            combinations, ``0`` disables this kind of ensemble.
        random_samples: The number of ensembles built from randomly drawn configurations.
        sample_datasets: Whether every ensemble is evaluated on a single randomly chosen dataset
            rather than on all datasets.
        seed: The seed for the ``random`` module.
    """
    assert any(
        [default_samples != 0, hyperensemble_samples != 0, random_samples != 0]
    ), "No samples are specified."

    random.seed(seed)

    # Load the experiments
    print("Loading experiments...")
    tracker = ModelTracker.from_directory(
        Path(evaluations_path), data_path=Path(data_path)
    )

    # Sample configurations
    print("Sampling configurations...")
    # `random.sample` only accepts sequences (passing a set raises a `TypeError` since Python
    # 3.11). We therefore materialize the configurations once; this is a plain copy if the
    # tracker already returns a list.
    unique_configurations = list(tracker.unique_model_configs())

    # A configuration counts as default if it is not trainable or if it equals the instance
    # obtained by calling its class without arguments.
    default_configurations = [
        cast(ModelConfig, c)
        for c in unique_configurations
        if not isinstance(c, TrainConfig) or c == c.__class__()
    ]

    choices: List[Tuple[ModelConfig, ...]] = []

    # If desired, we combine all base configurations into ensembles of sizes between 2 and the
    # provided maximum. Then, we potentially sample from this collection. For 13 default
    # configurations and a maximum ensemble size of 10, this results in 8,086 ensembles.
    if default_samples != 0:
        available_ensembles = [
            combination
            for i in range(2, max_ensemble_size + 1)
            for combination in combinations(default_configurations, i)
        ]
        if default_samples == -1:
            choices.extend(available_ensembles)
        else:
            choices.extend(random.sample(available_ensembles, default_samples))

    # Hyper-ensembles combine configurations of the same class as a default configuration.
    # Trainable configurations are only considered if their training fraction is 1.
    if hyperensemble_samples != 0:
        available_ensembles = []
        for config in default_configurations:
            all_configs = [
                c
                for c in unique_configurations
                if isinstance(c, config.__class__)
                and (not isinstance(c, TrainConfig) or c.training_fraction == 1)
            ]
            # A single configuration cannot form an ensemble of at least two members
            if len(all_configs) == 1:
                continue
            hyper_ensembles = [
                combination
                for i in range(2, max_ensemble_size + 1)
                for combination in combinations(all_configs, i)
            ]
            available_ensembles.extend(hyper_ensembles)

        if hyperensemble_samples == -1:
            choices.extend(available_ensembles)
        else:
            choices.extend(
                random.sample(available_ensembles, hyperensemble_samples)
            )

    # Then, we add some randomly sampled ensembles of model configurations
    for _ in range(random_samples):
        ensemble_size = random.randrange(2, max_ensemble_size + 1)
        configs = random.sample(unique_configurations, ensemble_size)
        choices.append(tuple(configs))

    # Then, we either evaluate each chosen configuration on all datasets or on a randomly sampled
    # one.
    datasets = list(
        {c.dataset for c in tracker.get_evaluations().configurations}
    )
    if sample_datasets:
        evaluations = [
            (model_config, random.choice(datasets)) for model_config in choices
        ]
    else:
        evaluations = list(product(choices, datasets))

    # Eventually, we can evaluate the ensembles that we have sampled
    print("Evaluating ensembles...")
    evaluator = EnsembleAnalyzer(tracker)
    results = run_parallel(
        partial(_evaluate_ensemble, evaluator=evaluator),
        evaluations,
        num_fitting_processes(cpus_per_process=1, memory_per_process=8),
    )

    # Afterwards, we can store all configurations along with their results. For now, we are just
    # storing them as pickled objects.
    with Path(output_path).open("wb+") as f:
        pickle.dump(
            [
                {
                    "configurations": list(evaluation[0]),
                    "dataset": evaluation[1],
                    "performance": result,
                }
                for evaluation, result in zip(evaluations, results)
            ],
            f,
        )