def execute(datasetname, dataset, **kwargs):
    """
    Run the distributed evolutionary hyperparameter search on a dispy cluster.

    Each experiment submits one evolutionary run (cluster_method) to the
    cluster; results are collected by process_jobs and persisted to the
    hyperparam.db sqlite database.

    :param datasetname: dataset name used to identify results on the database
    :param dataset: the data used to train/test the candidate models
    :param kwargs: optional arguments
    :keyword nodes: dispy cluster node addresses. Default: ['127.0.0.1']
    :keyword ngen: number of generations. Default: 70
    :keyword npop: population size. Default: 20
    :keyword pcruz: crossover probability. Default: .8
    :keyword pmut: mutation probability. Default: .2
    :keyword option: evolutionary method option. Default: 1
    :keyword experiments: number of independent runs. Default: 30
    """
    # Evolutionary-search parameters, all overridable via kwargs.
    ngen = kwargs.get('ngen', 70)
    npop = kwargs.get('npop', 20)
    pcruz = kwargs.get('pcruz', .8)
    pmut = kwargs.get('pmut', .2)
    option = kwargs.get('option', 1)
    num_experiments = kwargs.get('experiments', 30)
    nodes = kwargs.get('nodes', ['127.0.0.1'])

    cluster, http_server = Util.start_dispy_cluster(cluster_method, nodes=nodes)
    conn = hUtil.open_hyperparam_db('hyperparam.db')

    # Fan out one job per experiment, then gather every result.
    jobs = []
    for experiment in range(num_experiments):
        print("Experiment {}".format(experiment))
        jobs.append(cluster.submit(dataset, ngen, npop, pcruz, pmut, option))

    process_jobs(jobs, datasetname, conn)

    Util.stop_dispy_cluster(cluster, http_server)
def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
    """
    Sliding window benchmarks for FTS forecasters.

    For each data window a train and a test dataset are split. For each train
    split, a partitioner model is created for every combination of number of
    partitions and partitioning method; and for each partitioner, order, steps
    ahead and FTS method a forecasting model is trained. All trained models
    are then benchmarked on the test data and the metrics are stored on a
    sqlite3 database (identified by the 'file' parameter) for posterior
    analysis.

    The whole process can be distributed on a dispy cluster by setting the
    attribute 'distributed' to True and informing the list of dispy nodes on
    the 'nodes' parameter. The number of experiments is determined by the
    'windowsize' and 'inc' parameters.

    :param data: test data
    :param windowsize: size of sliding window
    :param train: percentual of sliding window data used to train the models
    :param kwargs: dict, optional arguments

    :keyword benchmark_methods: a list with Non-FTS models to benchmark. Default: None
    :keyword benchmark_methods_parameters: a list with Non-FTS model parameters. Default: None
    :keyword benchmark_models: boolean, use external FTS methods on benchmark. Default: False
    :keyword build_methods: boolean, use the default FTS methods on benchmark. Default: True
    :keyword dataset: the dataset name to identify the current set of benchmark results on database
    :keyword distributed: boolean, distribute the forecasting procedure in a dispy cluster. Default: False
    :keyword file: file path to save the results. Default: benchmarks.db
    :keyword inc: float on [0,1], percentage of windowsize to move the window. Default: 0.1
    :keyword methods: a list with FTS class names. Default depends on the forecasting type
    :keyword models: a list with prebuilt FTS objects. Default: None
    :keyword nodes: a list with dispy cluster node addresses. Default: ['127.0.0.1']
    :keyword orders: a list with model orders (for high order models). Default: [1,2,3]
    :keyword partitions: a list with numbers of partitions on the Universe of Discourse. Default: [10]
    :keyword partitioners_models: a list with prebuilt partitioner objects. Default: None
    :keyword partitioners_methods: a list with partitioner class names. Default: [partitioners.Grid.GridPartitioner]
    :keyword progress: if True a progress bar is displayed during the benchmarks. Default: False
    :keyword start: in multi step forecasting, index of the data where to start forecasting. Default: 0
    :keyword steps_ahead: a list with the forecasting horizons. Default: [1]
    :keyword tag: a name to identify the current set of benchmark results on database
    :keyword type: the forecasting type: point (default), interval or distribution
    :keyword transformations: a list with data transformations to apply. Default: [None]
    """
    tag = __pop('tag', None, kwargs)
    dataset = __pop('dataset', None, kwargs)

    distributed = __pop('distributed', False, kwargs)

    transformations = kwargs.get('transformations', [None])
    progress = kwargs.get('progress', None)
    # Local renamed from 'type' to avoid shadowing the builtin.
    forecast_type = kwargs.get("type", 'point')

    orders = __pop("orders", [1, 2, 3], kwargs)

    partitioners_models = __pop("partitioners_models", None, kwargs)
    partitioners_methods = __pop("partitioners_methods", [Grid.GridPartitioner], kwargs)
    partitions = __pop("partitions", [10], kwargs)

    steps_ahead = __pop('steps_ahead', [1], kwargs)

    methods = __pop('methods', None, kwargs)

    models = __pop('models', None, kwargs)

    pool = [] if models is None else models

    if methods is None:
        if forecast_type == 'point':
            methods = get_point_methods()
        elif forecast_type == 'interval':
            methods = get_interval_methods()
        elif forecast_type == 'distribution':
            methods = get_probabilistic_methods()

    build_methods = __pop("build_methods", True, kwargs)

    if build_methods:
        for method in methods:
            mfts = method()

            if mfts.is_high_order:
                # One fresh model instance per admissible order.
                for order in orders:
                    if order >= mfts.min_order:
                        mfts = method()
                        mfts.order = order
                        pool.append(mfts)
            else:
                mfts.order = 1
                pool.append(mfts)

    benchmark_models = __pop("benchmark_models", False, kwargs)

    # BUGFIX: benchmark_pool must always exist, because the progress-bar size
    # computation below reads len(benchmark_pool) even when benchmark_models
    # is False (previously this raised NameError).
    benchmark_pool = []

    if benchmark_models != False:
        benchmark_methods = __pop("benchmark_methods", None, kwargs)
        benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs)

        # Prebuilt benchmark models may be passed directly as a list.
        if isinstance(benchmark_models, list):
            benchmark_pool = benchmark_models

        if benchmark_models is None and benchmark_methods is None:
            if forecast_type == 'point' or forecast_type == 'partition':
                benchmark_methods = get_benchmark_point_methods()
            elif forecast_type == 'interval':
                benchmark_methods = get_benchmark_interval_methods()
            elif forecast_type == 'distribution':
                benchmark_methods = get_benchmark_probabilistic_methods()

        if benchmark_methods is not None:
            for transformation in transformations:
                for count, model in enumerate(benchmark_methods, start=0):
                    par = benchmark_methods_parameters[count]
                    mfts = model(**par)
                    mfts.append_transformation(transformation)
                    benchmark_pool.append(mfts)

    # Select the experiment/synthesis pair for the requested forecasting type.
    if forecast_type == 'point':
        experiment_method = run_point
        synthesis_method = process_point_jobs
    elif forecast_type == 'interval':
        experiment_method = run_interval
        synthesis_method = process_interval_jobs
    elif forecast_type == 'distribution':
        experiment_method = run_probabilistic
        synthesis_method = process_probabilistic_jobs
    else:
        raise ValueError("Type parameter has an unknown value!")

    if distributed:
        import dispy, dispy.httpd

        nodes = kwargs.get("nodes", ['127.0.0.1'])
        cluster, http_server = cUtil.start_dispy_cluster(experiment_method, nodes)

    jobs = []

    inc = __pop("inc", 0.1, kwargs)

    if progress:
        from tqdm import tqdm
        _tdata = len(data) / (windowsize * inc)
        # BUGFIX: partitioners_models defaults to None, so len() would raise a
        # TypeError; estimate the per-window partitioner pool size instead.
        if partitioners_models is not None:
            _tpart = len(partitioners_models)
        else:
            _tpart = len(partitions) * len(partitioners_methods)
        _tasks = (_tpart * len(orders) * len(partitions) * len(transformations) * len(steps_ahead))
        _tbcmk = len(benchmark_pool) * len(steps_ahead)
        progressbar = tqdm(total=_tdata * _tasks + _tdata * _tbcmk, desc="Benchmarks:")

    file = kwargs.get('file', "benchmarks.db")

    conn = bUtil.open_benchmark_db(file)

    for ct, train, test in cUtil.sliding_window(data, windowsize, train, inc=inc, **kwargs):
        if benchmark_models != False:
            for model in benchmark_pool:
                for step in steps_ahead:

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progress:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model), None, train, test, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception:
                            print('EXCEPTION! ', model.shortname, model.order)
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model), None, train, test, **kwargs)
                        jobs.append(job)

        partitioners_pool = []

        if partitioners_models is None:
            # Build one partitioner per (transformation, partition, method).
            for transformation in transformations:
                for partition in partitions:
                    for partitioner in partitioners_methods:
                        data_train_fs = partitioner(data=train, npart=partition,
                                                    transformation=transformation)
                        partitioners_pool.append(data_train_fs)
        else:
            partitioners_pool = partitioners_models

        for step in steps_ahead:
            for partitioner in partitioners_pool:
                for _id, model in enumerate(pool, start=0):

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progress:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model), deepcopy(partitioner),
                                                    train, test, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception:
                            print('EXCEPTION! ', model.shortname, model.order,
                                  partitioner.name, partitioner.partitions,
                                  str(partitioner.transformation))
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model), deepcopy(partitioner),
                                             train, test, **kwargs)
                        # BUGFIX: was `job.id = id` (the builtin), which made
                        # every job id the same function object; the intended
                        # value is the enumerate index.
                        job.id = _id
                        jobs.append(job)

    if distributed:
        for job in jobs:
            if progress:
                progressbar.update(1)
            job()
            # BUGFIX: check `job is not None` before reading job.status.
            if job is not None and job.status == dispy.DispyJob.Finished:
                tmp = job.result
                synthesis_method(dataset, tag, tmp, conn)
            else:
                print("status", job.status)
                print("result", job.result)
                print("stdout", job.stdout)
                print("stderr", job.exception)

        cluster.wait()  # wait for all jobs to finish

        cUtil.stop_dispy_cluster(cluster, http_server)

    # BUGFIX: close the progress bar only after the distributed collection
    # loop, which still updates it (previously it was closed beforehand).
    if progress:
        progressbar.close()

    conn.close()
def execute(hyperparams, datasetname, train, test, **kwargs):
    """
    Run a distributed grid search over FTS hyperparameters on a dispy cluster.

    The cartesian product of all hyperparameter values is enumerated; for
    each combination every admissible lag configuration (up to the model
    order) generates one candidate individual. Individuals are submitted to
    the cluster in batches and the results are persisted by process_jobs to
    the hyperparam.db sqlite database.

    :param hyperparams: dict mapping hyperparameter name -> list of values;
           expected keys include 'partitions', 'partitioner', 'mf', 'alpha',
           'order' and optionally 'lags'
    :param datasetname: dataset name used to identify results on the database
    :param train: training data
    :param test: test data
    :param kwargs: optional arguments
    :keyword nodes: dispy cluster node addresses. Default: ['127.0.0.1']
    """
    nodes = kwargs.get('nodes', ['127.0.0.1'])

    individuals = []

    if 'lags' in hyperparams:
        lags = hyperparams.pop('lags')
    else:
        lags = [k for k in np.arange(50)]

    keys_sorted = [k for k in sorted(hyperparams.keys())]

    # Map each hyperparameter name to its position inside a product() tuple.
    index = {}
    for k in np.arange(len(keys_sorted)):
        index[keys_sorted[k]] = k

    print("Evaluation order: \n {}".format(index))

    hp_values = [
        [v for v in hyperparams[hp]]
        for hp in keys_sorted
    ]

    print("Evaluation values: \n {}".format(hp_values))

    cluster, http_server = Util.start_dispy_cluster(cluster_method, nodes=nodes)
    conn = hUtil.open_hyperparam_db('hyperparam.db')

    def _dispatch(batch):
        # Submit one batch of individuals to the cluster and gather results.
        jobs = []
        for ind in batch:
            print("Testing individual {}".format(ind))
            jobs.append(cluster.submit(ind, train, test))
        process_jobs(jobs, datasetname, conn)

    for instance in product(*hp_values):
        partitions = instance[index['partitions']]
        partitioner = instance[index['partitioner']]
        mf = instance[index['mf']]
        alpha_cut = instance[index['alpha']]
        order = instance[index['order']]
        count = 0
        for lag1 in lags:  # lag1 is the first lag
            _lags = [lag1]
            count += 1
            if order > 1:
                for lag2 in lags:  # subsequent lags are cumulative offsets
                    _lags2 = [lag1, lag1 + lag2]
                    count += 1
                    if order > 2:
                        for lag3 in lags:
                            count += 1
                            _lags3 = [lag1, lag1 + lag2, lag1 + lag2 + lag3]
                            individuals.append(dict_individual(mf, partitioner, partitions,
                                                               order, _lags3, alpha_cut))
                    else:
                        individuals.append(dict_individual(mf, partitioner, partitions,
                                                           order, _lags2, alpha_cut))
            else:
                individuals.append(dict_individual(mf, partitioner, partitions,
                                                   order, _lags, alpha_cut))

            if count > 50:
                _dispatch(individuals)
                count = 0
                individuals = []

    # BUGFIX: flush the last partial batch; previously any individuals left
    # over after the final `count > 50` dispatch were silently discarded.
    if individuals:
        _dispatch(individuals)

    Util.stop_dispy_cluster(cluster, http_server)