def process_point_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with point benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """
    data = bUtil.process_common_data(dataset, tag, 'point', job)

    # One database row per point metric; each row shares the common prefix columns.
    for metric in ('rmse', 'smape', 'u', 'time'):
        measure = deepcopy(data)
        measure.extend([metric, job[metric]])
        bUtil.insert_benchmark(measure, conn)
def process_point_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with point benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """
    data = bUtil.process_common_data(dataset, tag, 'point', job)

    # Insert one row per metric instead of four copy-pasted stanzas.
    for metric in ('rmse', 'smape', 'u', 'time'):
        row = deepcopy(data)
        row.extend([metric, job[metric]])
        bUtil.insert_benchmark(row, conn)
def process_probabilistic_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with probabilistic benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """
    data = bUtil.process_common_data(dataset, tag, 'density', job)

    # (stored_name, job_key) pairs — CRPS is stored lowercase but keyed uppercase in the job dict.
    for name, key in (('crps', 'CRPS'), ('time', 'time'), ('brier', 'brier')):
        row = deepcopy(data)
        row.extend([name, job[key]])
        bUtil.insert_benchmark(row, conn)
def process_probabilistic_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with probabilistic benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """
    data = bUtil.process_common_data(dataset, tag, 'density', job)

    # (stored_name, job_key) pairs — note the CRPS metric is keyed 'CRPS' in the job dict.
    for name, key in (('crps', 'CRPS'), ('time', 'time'), ('brier', 'brier')):
        measure = deepcopy(data)
        measure.extend([name, job[key]])
        bUtil.insert_benchmark(measure, conn)
def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
    """
    Sliding window benchmarks for FTS forecasters.

    For each data window, a train and test datasets will be splitted. For each train split, number of
    partitions and partitioning method will be created a partitioner model. And for each partitioner,
    order, steps ahead and FTS method a forecasting model will be trained.

    Then all trained models are benchmarked on the test data and the metrics are stored on a sqlite3
    database (identified by the 'file' parameter) for posterior analysis.

    All these process can be distributed on a dispy cluster, setting the attribute 'distributed' to true
    and informing the list of dispy nodes on 'nodes' parameter.

    The number of experiments is determined by 'windowsize' and 'inc' parameters.

    :param data: test data
    :param windowsize: size of sliding window
    :param train: percentual of sliding window data used to train the models
    :param kwargs: dict, optional arguments

    :keyword benchmark_methods: a list with Non FTS models to benchmark. The default is None.
    :keyword benchmark_methods_parameters: a list with Non FTS models parameters. The default is None.
    :keyword benchmark_models: A boolean value indicating if external FTS methods will be used on benchmark. The default is False.
    :keyword build_methods: A boolean value indicating if the default FTS methods will be used on benchmark. The default is True.
    :keyword dataset: the dataset name to identify the current set of benchmarks results on database.
    :keyword distributed: A boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. The default is False.
    :keyword file: file path to save the results. The default is benchmarks.db.
    :keyword inc: a float on interval [0,1] indicating the percentage of the windowsize to move the window.
    :keyword methods: a list with FTS class names. The default depends on the forecasting type and contains the list of all FTS methods.
    :keyword models: a list with prebuilt FTS objects. The default is None.
    :keyword nodes: a list with the dispy cluster nodes addresses. The default is [127.0.0.1].
    :keyword orders: a list with orders of the models (for high order models). The default is [1,2,3].
    :keyword partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
    :keyword partitioners_models: a list with prebuilt Universe of Discourse partitioners objects. The default is None.
    :keyword partitioners_methods: a list with Universe of Discourse partitioners class names. The default is [partitioners.Grid.GridPartitioner].
    :keyword progress: If true a progress bar will be displayed during the benchmarks. The default is False.
    :keyword start: in the multi step forecasting, the index of the data where to start forecasting. The default is 0.
    :keyword steps_ahead: a list with the forecasting horizons, i. e., the number of steps ahead to forecast. The default is 1.
    :keyword tag: a name to identify the current set of benchmarks results on database.
    :keyword type: the forecasting type, one of these values: point(default), interval or distribution.
    :keyword transformations: a list with data transformations do apply. The default is [None].
    """
    tag = __pop('tag', None, kwargs)
    dataset = __pop('dataset', None, kwargs)
    distributed = __pop('distributed', False, kwargs)

    transformations = kwargs.get('transformations', [None])
    progress = kwargs.get('progress', None)
    type = kwargs.get("type", 'point')

    orders = __pop("orders", [1, 2, 3], kwargs)

    partitioners_models = __pop("partitioners_models", None, kwargs)
    partitioners_methods = __pop("partitioners_methods", [Grid.GridPartitioner], kwargs)
    partitions = __pop("partitions", [10], kwargs)

    steps_ahead = __pop('steps_ahead', [1], kwargs)

    methods = __pop('methods', None, kwargs)

    models = __pop('models', None, kwargs)

    pool = [] if models is None else models

    if methods is None:
        if type == 'point':
            methods = get_point_methods()
        elif type == 'interval':
            methods = get_interval_methods()
        elif type == 'distribution':
            methods = get_probabilistic_methods()

    build_methods = __pop("build_methods", True, kwargs)

    if build_methods:
        for method in methods:
            mfts = method()

            if mfts.is_high_order:
                for order in orders:
                    if order >= mfts.min_order:
                        mfts = method()
                        mfts.order = order
                        pool.append(mfts)
            else:
                mfts.order = 1
                pool.append(mfts)

    benchmark_models = __pop("benchmark_models", False, kwargs)

    # Always defined so the progress-bar estimate below cannot raise NameError
    # when benchmark_models is left at its default (False).
    benchmark_pool = []

    if benchmark_models != False:

        benchmark_methods = __pop("benchmark_methods", None, kwargs)
        benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs)

        if isinstance(benchmark_models, list):
            benchmark_pool = benchmark_models

        if benchmark_models is None and benchmark_methods is None:
            if type == 'point' or type == 'partition':
                benchmark_methods = get_benchmark_point_methods()
            elif type == 'interval':
                benchmark_methods = get_benchmark_interval_methods()
            elif type == 'distribution':
                benchmark_methods = get_benchmark_probabilistic_methods()

        if benchmark_methods is not None:
            for transformation in transformations:
                for count, model in enumerate(benchmark_methods, start=0):
                    par = benchmark_methods_parameters[count]
                    mfts = model(**par)
                    mfts.append_transformation(transformation)
                    benchmark_pool.append(mfts)

    if type == 'point':
        experiment_method = run_point
        synthesis_method = process_point_jobs
    elif type == 'interval':
        experiment_method = run_interval
        synthesis_method = process_interval_jobs
    elif type == 'distribution':
        experiment_method = run_probabilistic
        synthesis_method = process_probabilistic_jobs
    else:
        raise ValueError("Type parameter has an unknown value!")

    if distributed:
        import dispy, dispy.httpd

        nodes = kwargs.get("nodes", ['127.0.0.1'])
        cluster, http_server = cUtil.start_dispy_cluster(experiment_method, nodes)

    jobs = []

    inc = __pop("inc", 0.1, kwargs)

    if progress:
        from tqdm import tqdm
        _tdata = len(data) / (windowsize * inc)
        # With the default partitioners_models=None the partitioner pool is built per window
        # from partitions x methods x transformations — estimate accordingly instead of
        # calling len(None).
        if partitioners_models is not None:
            _npartitioners = len(partitioners_models)
        else:
            _npartitioners = len(partitions) * len(partitioners_methods) * len(transformations)
        _tasks = _npartitioners * len(orders) * len(steps_ahead)
        _tbcmk = len(benchmark_pool) * len(steps_ahead)
        progressbar = tqdm(total=_tdata * _tasks + _tdata * _tbcmk, desc="Benchmarks:")

    file = kwargs.get('file', "benchmarks.db")

    conn = bUtil.open_benchmark_db(file)

    for ct, train, test in cUtil.sliding_window(data, windowsize, train, inc=inc, **kwargs):

        if benchmark_models != False:
            for model in benchmark_pool:
                for step in steps_ahead:

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progress:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model), None, train, test, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception as ex:
                            print('EXCEPTION! ', model.shortname, model.order)
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model), None, train, test, **kwargs)
                        jobs.append(job)

        partitioners_pool = []

        if partitioners_models is None:

            for transformation in transformations:

                for partition in partitions:

                    for partitioner in partitioners_methods:

                        data_train_fs = partitioner(data=train, npart=partition,
                                                    transformation=transformation)

                        partitioners_pool.append(data_train_fs)
        else:
            partitioners_pool = partitioners_models

        for step in steps_ahead:

            for partitioner in partitioners_pool:

                for _id, model in enumerate(pool, start=0):

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progress:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model), deepcopy(partitioner),
                                                    train, test, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception as ex:
                            print('EXCEPTION! ', model.shortname, model.order, partitioner.name,
                                  partitioner.partitions, str(partitioner.transformation))
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model), deepcopy(partitioner),
                                             train, test, **kwargs)
                        # associate the pool index (was the builtin `id`, a bug) to identify jobs
                        job.id = _id
                        jobs.append(job)

    if progress:
        progressbar.close()

    if distributed:

        for job in jobs:
            if progress:
                progressbar.update(1)
            job()
            if job is not None and job.status == dispy.DispyJob.Finished:
                tmp = job.result
                synthesis_method(dataset, tag, tmp, conn)
            else:
                print("status", job.status)
                print("result", job.result)
                print("stdout", job.stdout)
                print("stderr", job.exception)

        cluster.wait()  # wait for all jobs to finish

        cUtil.stop_dispy_cluster(cluster, http_server)

    conn.close()
def process_interval_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with interval benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """
    data = bUtil.process_common_data(dataset, tag, 'interval', job)

    # One database row per interval metric; each row shares the common prefix columns.
    metrics = ('sharpness', 'resolution', 'coverage', 'time',
               'Q05', 'Q25', 'Q75', 'Q95', 'winkler05', 'winkler25')
    for metric in metrics:
        row = deepcopy(data)
        row.extend([metric, job[metric]])
        bUtil.insert_benchmark(row, conn)
def process_interval_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with interval benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """
    data = bUtil.process_common_data(dataset, tag, 'interval', job)

    # Insert one row per metric instead of ten copy-pasted stanzas.
    for metric in ('sharpness', 'resolution', 'coverage', 'time',
                   'Q05', 'Q25', 'Q75', 'Q95', 'winkler05', 'winkler25'):
        measure = deepcopy(data)
        measure.extend([metric, job[metric]])
        bUtil.insert_benchmark(measure, conn)
def ahead_sliding_window(data, windowsize, steps, resolution, train=0.8, inc=0.1, models=None,
                         partitioners=[Grid.GridPartitioner], partitions=[10], max_order=3,
                         transformation=None, indexer=None, dump=False,
                         benchmark_models=None, benchmark_models_parameters=None,
                         save=False, file=None, synthetic=False, nodes=None):
    """
    Distributed sliding window benchmarks for FTS probabilistic forecasters

    :param data:
    :param windowsize: size of sliding window
    :param steps:
    :param resolution:
    :param train: percentual of sliding window data used to train the models
    :param inc: percentual of window is used do increment
    :param models: FTS point forecasters
    :param partitioners: Universe of Discourse partitioner
    :param partitions: the max number of partitions on the Universe of Discourse
    :param max_order: the max order of the models (for high order models)
    :param transformation: data transformation
    :param indexer: seasonal indexer
    :param dump:
    :param benchmark_models: Non FTS models to benchmark
    :param benchmark_models_parameters: Non FTS models parameters
    :param save: save results
    :param file: file path to save the results
    :param synthetic: if true only the average and standard deviation of the results
    :param nodes: list of cluster nodes to distribute tasks
    :return: DataFrame with the results
    """
    alphas = [0.05, 0.25]

    if benchmark_models is None and models is None:
        benchmark_models = [arima.ARIMA, arima.ARIMA, arima.ARIMA, arima.ARIMA, arima.ARIMA]

    if benchmark_models_parameters is None:
        benchmark_models_parameters = [(1, 0, 0), (1, 0, 1), (2, 0, 0), (2, 0, 1), (2, 0, 2)]

    cluster = dispy.JobCluster(benchmarks.run_ahead, nodes=nodes)  # , depends=dependencies)
    http_server = dispy.httpd.DispyHTTPServer(cluster)

    _process_start = time.time()

    print("Process Start: {0: %H:%M:%S}".format(datetime.datetime.now()))

    pool = []
    jobs = []
    objs = {}
    crps_interval = {}
    crps_distr = {}
    times1 = {}
    times2 = {}

    if models is None:
        models = benchmarks.get_probabilistic_methods()

    for model in models:
        mfts = model("")

        if mfts.is_high_order:
            for order in np.arange(1, max_order + 1):
                if order >= mfts.min_order:
                    mfts = model("")
                    mfts.order = order
                    pool.append(mfts)
        else:
            pool.append(mfts)

    if benchmark_models is not None:
        for count, model in enumerate(benchmark_models, start=0):
            for a in alphas:
                par = benchmark_models_parameters[count]
                mfts = model(str(par if par is not None else ""), alpha=a, dist=True)
                mfts.order = par
                pool.append(mfts)

    experiments = 0
    for ct, train, test in Util.sliding_window(data, windowsize, train, inc=inc):
        experiments += 1

        benchmarks_only = {}

        if dump:
            print('\nWindow: {0}\n'.format(ct))

        for partition in partitions:

            for partitioner in partitioners:

                data_train_fs = partitioner(train, partition, transformation=transformation)

                # `_id` avoids shadowing the builtin `id`
                for _id, m in enumerate(pool, start=0):
                    if m.benchmark_only and m.shortname in benchmarks_only:
                        continue
                    else:
                        benchmarks_only[m.shortname] = m
                    job = cluster.submit(m, data_train_fs, train, test, steps, resolution,
                                         ct, transformation, indexer)
                    job.id = _id  # associate an ID to identify jobs (if needed later)
                    jobs.append(job)

    for job in jobs:
        tmp = job()
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            if tmp['key'] not in objs:
                objs[tmp['key']] = tmp['obj']
                crps_interval[tmp['key']] = []
                crps_distr[tmp['key']] = []
                times1[tmp['key']] = []
                times2[tmp['key']] = []

            # Bug fix: these are plain lists, which have no `append_rhs` method.
            crps_interval[tmp['key']].append(tmp['CRPS_Interval'])
            crps_distr[tmp['key']].append(tmp['CRPS_Distribution'])
            times1[tmp['key']].append(tmp['TIME_Interval'])
            times2[tmp['key']].append(tmp['TIME_Distribution'])
        else:
            print(job.exception)
            print(job.stdout)

    _process_end = time.time()

    print("Process End: {0: %H:%M:%S}".format(datetime.datetime.now()))
    print("Process Duration: {0}".format(_process_end - _process_start))

    cluster.wait()  # wait for all jobs to finish
    cluster.print_status()

    http_server.shutdown()  # this waits until browser gets all updates
    cluster.close()

    return bUtil.save_dataframe_ahead(experiments, file, objs, crps_interval, crps_distr,
                                      times1, times2, save, synthetic)
def point_sliding_window(data, windowsize, train=0.8, inc=0.1, models=None,
                         partitioners=[Grid.GridPartitioner], partitions=[10], max_order=3,
                         transformation=None, indexer=None, dump=False,
                         benchmark_models=None, benchmark_models_parameters=None,
                         save=False, file=None, sintetic=False, nodes=None, depends=None):
    """
    Distributed sliding window benchmarks for FTS point forecasters

    :param data:
    :param windowsize: size of sliding window
    :param train: percentual of sliding window data used to train the models
    :param inc: percentual of window is used do increment
    :param models: FTS point forecasters
    :param partitioners: Universe of Discourse partitioner
    :param partitions: the max number of partitions on the Universe of Discourse
    :param max_order: the max order of the models (for high order models)
    :param transformation: data transformation
    :param indexer: seasonal indexer
    :param dump:
    :param benchmark_models: Non FTS models to benchmark
    :param benchmark_models_parameters: Non FTS models parameters
    :param save: save results
    :param file: file path to save the results
    :param sintetic: if true only the average and standard deviation of the results
    :param nodes: list of cluster nodes to distribute tasks
    :param depends: list of module dependencies
    :return: DataFrame with the results
    """
    cluster = dispy.JobCluster(benchmarks.run_point, nodes=nodes)  # , depends=dependencies)
    http_server = dispy.httpd.DispyHTTPServer(cluster)

    _process_start = time.time()

    print("Process Start: {0: %H:%M:%S}".format(datetime.datetime.now()))

    jobs = []
    objs = {}
    rmse = {}
    smape = {}
    u = {}
    times = {}

    pool = build_model_pool_point(models, max_order, benchmark_models, benchmark_models_parameters)

    experiments = 0
    for ct, train, test in Util.sliding_window(data, windowsize, train, inc):
        experiments += 1

        benchmarks_only = {}

        if dump:
            print('\nWindow: {0}\n'.format(ct))

        for partition in partitions:

            for partitioner in partitioners:

                data_train_fs = partitioner(train, partition, transformation=transformation)

                for _id, m in enumerate(pool, start=0):
                    if m.benchmark_only and m.shortname in benchmarks_only:
                        continue
                    else:
                        benchmarks_only[m.shortname] = m
                    job = cluster.submit(m, data_train_fs, train, test, ct, transformation)
                    job.id = _id  # associate an ID to identify jobs (if needed later)
                    jobs.append(job)

    for job in jobs:
        tmp = job()
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            if tmp['key'] not in objs:
                objs[tmp['key']] = tmp['obj']
                rmse[tmp['key']] = []
                smape[tmp['key']] = []
                u[tmp['key']] = []
                times[tmp['key']] = []

            # Bug fix: these are plain lists, which have no `append_rhs` method.
            rmse[tmp['key']].append(tmp['rmse'])
            smape[tmp['key']].append(tmp['smape'])
            u[tmp['key']].append(tmp['u'])
            times[tmp['key']].append(tmp['time'])

            print(tmp['key'], tmp['window'])
        else:
            print(job.exception)
            print(job.stdout)

    _process_end = time.time()

    print("Process End: {0: %H:%M:%S}".format(datetime.datetime.now()))
    print("Process Duration: {0}".format(_process_end - _process_start))

    cluster.wait()  # wait for all jobs to finish
    cluster.print_status()

    http_server.shutdown()  # this waits until browser gets all updates
    cluster.close()

    return bUtil.save_dataframe_point(experiments, file, objs, rmse, save, sintetic,
                                      smape, times, u)