Exemplo n.º 1
0
def execute(dataset, **kwargs):
    """
    Run a batch of GeneticAlgorithm experiments and store each one in the hyperparameter database.

    Every keyword argument is also forwarded unchanged to GeneticAlgorithm.

    :param dataset: the data forwarded to GeneticAlgorithm
    :keyword file: path of the hyperparameter database, default value: 'hyperparam.db'
    :keyword experiments: number of experiments to run, default value: 30
    :keyword distributed: False for local execution or 'dispy' to start a dispy cluster, default value: False
    :keyword nodes: list of dispy node addresses, only read when distributed='dispy', default value: ['127.0.0.1']
    :return: the value returned by process_experiment for the last experiment,
             or an empty list when no experiment was run
    """
    db_path = kwargs.get('file', 'hyperparam.db')
    conn = hUtil.open_hyperparam_db(db_path)

    experiments = kwargs.get('experiments', 30)
    use_dispy = kwargs.get('distributed', False) == 'dispy'

    if use_dispy:
        # Not referenced below; kept so a missing dispy install still fails at this point.
        import dispy
        from pyFTS.distributed import dispy as dUtil
        nodes = kwargs.get('nodes', ['127.0.0.1'])
        cluster, http_server = dUtil.start_dispy_cluster(evaluate, nodes=nodes)
        # GeneticAlgorithm receives the running cluster through its keyword arguments.
        kwargs['cluster'] = cluster

    ret = []
    try:
        for i in np.arange(experiments):
            print("Experiment {}".format(i))

            start = time.time()
            individual, statistics = GeneticAlgorithm(dataset, **kwargs)
            end = time.time()
            individual['time'] = end - start
            experiment = {'individual': individual, 'statistics': statistics}

            ret = process_experiment(experiment, '', conn)
    finally:
        # Stop the cluster even when an experiment raises, so it is never left running.
        if use_dispy:
            dUtil.stop_dispy_cluster(cluster, http_server)

    return ret
Exemplo n.º 2
0
def execute(datasetname, dataset, **kwargs):
    """
    Batch execution of Distributed Evolutionary Hyperparameter Optimization (DEHO) for monovariate methods

    Runs GeneticAlgorithm `experiments` times, times each run and stores every result in the
    hyperparameter database through process_experiment. All keyword arguments are forwarded
    unchanged to GeneticAlgorithm; the entries marked (GA) below are not read by this function
    and are listed as described by the original documentation.

    :param datasetname: name of the dataset, passed to process_experiment together with each result
    :param dataset: The time series to optimize the FTS
    :keyword database_file: path of the hyperparameter database, default value: 'hyperparam.db'
    :keyword experiments: An integer value with the number of experiments to run, default value: 30
    :keyword distributed: A value indicating if the execution will be local and sequential (distributed=False),
             or parallel and distributed (distributed='dispy' or distributed='spark'). Only 'dispy' is
             handled here (a dispy cluster is started and stopped); other values are just forwarded.
    :keyword nodes: list of dispy node addresses, only read when distributed='dispy', default value: ['127.0.0.1']
    :keyword cluster: overwritten with the started dispy cluster when distributed='dispy'; otherwise forwarded
             as given (the original documentation describes it as the master node when distributed='spark')
    :keyword fts_method: The FTS method to optimize, default value: hofts.WeightedHighOrderFTS. The last
             component of its module name is stored with each experiment.
    :keyword ngen: (GA) An integer value with the maximum number of generations, default value: 30
    :keyword mgen: (GA) An integer value with the maximum number of generations without improvement to stop, default value 7
    :keyword npop: (GA) An integer value with the population size, default value: 20
    :keyword pcross: (GA) A float value between 0 and 1 with the probability of crossover, default: .5
    :keyword psel: (GA) A float value between 0 and 1 with the probability of selection, default: .5
    :keyword pmut: (GA) A float value between 0 and 1 with the probability of mutation, default: .3
    :keyword parameters: (GA) dict with model specific arguments for fts_method
    :keyword elitism: (GA) A boolean value indicating if the best individual must always survive to next population
    :keyword initial_operator: (GA) a function that receives npop and returns a random population with size npop
    :keyword random_individual: (GA) creates a random genotype
    :keyword evalutation_operator: (GA) a function that receives a dataset and an individual and returns its fitness
             (NOTE(review): spelling kept from the original docs; possibly `evaluation_operator` - confirm
             against GeneticAlgorithm)
    :keyword selection_operator: (GA) a function that receives the whole population and returns a selected individual
    :keyword crossover_operator: (GA) a function that receives the whole population and returns a descendant individual
    :keyword mutation_operator: (GA) a function that receives one individual and returns a changed individual
    :keyword window_size: (GA) An integer value with the length of the scrolling window for train/test on dataset
    :keyword train_rate: (GA) A float value between 0 and 1 with the train/test split ([0,1])
    :keyword increment_rate: (GA) A float value between 0 and 1 with the increment of the scrolling window,
             relative to the window_size ([0,1])
    :keyword collect_statistics: (GA) A boolean value indicating to collect statistics for each generation
    :return: the value returned by process_experiment for the last experiment,
             or an empty list when no experiment was run
    """

    db_path = kwargs.get('database_file', 'hyperparam.db')
    conn = hUtil.open_hyperparam_db(db_path)

    experiments = kwargs.get('experiments', 30)
    use_dispy = kwargs.get('distributed', False) == 'dispy'

    fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
    # Last component of the defining module's dotted name identifies the method in the database.
    shortname = str(fts_method.__module__).split('.')[-1]

    if use_dispy:
        nodes = kwargs.get('nodes', ['127.0.0.1'])
        cluster, http_server = dUtil.start_dispy_cluster(evaluate, nodes=nodes)
        # GeneticAlgorithm receives the running cluster through its keyword arguments.
        kwargs['cluster'] = cluster

    ret = []
    try:
        for i in np.arange(experiments):
            print("Experiment {}".format(i))

            start = time.time()
            individual, statistics = GeneticAlgorithm(dataset, **kwargs)
            end = time.time()
            individual['time'] = end - start
            experiment = {'individual': individual, 'statistics': statistics}

            ret = process_experiment(shortname, experiment, datasetname, conn)
    finally:
        # Stop the cluster even when an experiment raises, so it is never left running.
        if use_dispy:
            dUtil.stop_dispy_cluster(cluster, http_server)

    return ret
Exemplo n.º 3
0
def execute(hyperparams, datasetname, dataset, **kwargs):
    """
    Exhaustive search over a hyperparameter grid, evaluated on a dispy cluster.

    Every combination of the values in `hyperparams` is expanded with every combination of lags
    allowed by its order, turned into an individual by dict_individual and submitted to the
    cluster in batches. Each batch is handed to process_jobs together with the database
    connection opened on 'hyperparam.db'.

    :param hyperparams: dict mapping a hyperparameter name to an iterable of candidate values.
           The keys 'partitions', 'partitioner', 'mf', 'alpha' and 'order' are required (a missing
           one raises KeyError). The optional key 'lags' holds the candidate lag increments and
           defaults to 0..49. The dict is not modified.
    :param datasetname: name of the dataset, passed to process_jobs
    :param dataset: the data submitted to the cluster with each individual
    :keyword nodes: list of dispy node addresses, default value: ['127.0.0.1']
    :return: None

    All keyword arguments are also forwarded to every cluster.submit call.
    """

    nodes = kwargs.get('nodes', ['127.0.0.1'])

    # Read 'lags' without popping it, so the caller's dict is left untouched.
    if 'lags' in hyperparams:
        lags = hyperparams['lags']
    else:
        lags = list(np.arange(50))

    keys_sorted = sorted(key for key in hyperparams if key != 'lags')

    # Position of each hyperparameter inside the tuples produced by product().
    index = {key: position for position, key in enumerate(keys_sorted)}

    print("Evaluation order: \n {}".format(index))

    hp_values = [list(hyperparams[hp]) for hp in keys_sorted]

    print("Evaluation values: \n {}".format(hp_values))

    cluster, http_server = dUtil.start_dispy_cluster(cluster_method,
                                                     nodes=nodes)
    try:
        conn = hUtil.open_hyperparam_db('hyperparam.db')

        def submit_batch(batch):
            # Submit every queued individual, then let process_jobs handle the batch.
            jobs = []
            for ind in batch:
                print("Testing individual {}".format(ind))
                jobs.append(cluster.submit(ind, dataset, **kwargs))
            process_jobs(jobs, datasetname, conn)

        individuals = []

        for instance in product(*hp_values):
            partitions = instance[index['partitions']]
            partitioner = instance[index['partitioner']]
            mf = instance[index['mf']]
            alpha_cut = instance[index['alpha']]
            order = instance[index['order']]
            count = 0
            for lag1 in lags:  # first lag
                count += 1
                if order > 2:
                    # Orders above 2 all get a three-element lag list.
                    for lag2 in lags:  # increment from the first to the second lag
                        count += 1
                        for lag3 in lags:  # increment from the second to the third lag
                            count += 1
                            individuals.append(
                                dict_individual(mf, partitioner, partitions, order,
                                                [lag1, lag1 + lag2, lag1 + lag2 + lag3],
                                                alpha_cut))
                elif order > 1:
                    for lag2 in lags:  # increment from the first to the second lag
                        count += 1
                        individuals.append(
                            dict_individual(mf, partitioner, partitions, order,
                                            [lag1, lag1 + lag2], alpha_cut))
                else:
                    individuals.append(
                        dict_individual(mf, partitioner, partitions, order,
                                        [lag1], alpha_cut))

                # Flush once more than 10 loop steps have accumulated since the last flush.
                if count > 10:
                    submit_batch(individuals)
                    count = 0
                    individuals = []

        # Individuals queued after the last flush would otherwise never be evaluated.
        if individuals:
            submit_batch(individuals)
    finally:
        # Stop the cluster even when a batch raises, so it is never left running.
        dUtil.stop_dispy_cluster(cluster, http_server)
Exemplo n.º 4
0
def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
    """
    Sliding window benchmarks for FTS forecasters.

    For each data window, train and test datasets are split. For each train split, number of
    partitions and partitioning method, a partitioner model is created. And for each partitioner,
    order, steps ahead and FTS method, a forecasting model is trained.

    Then all trained models are benchmarked on the test data and the metrics are stored on a sqlite3 database
    (identified by the 'file' parameter) for posterior analysis.

    The whole process can be distributed on a dispy cluster, setting the attribute 'distributed' to true and
    informing the list of dispy nodes on the 'nodes' parameter.

    The number of experiments is determined by the 'windowsize' and 'inc' parameters.

    :param data: the data over which the window slides
    :param windowsize: size of sliding window
    :param train: fraction of the sliding window data used to train the models
    :param kwargs: dict, optional arguments

    :keyword benchmark_methods:  a list with Non FTS models to benchmark. Only read when benchmark_models is not False. The default is None.
    :keyword benchmark_methods_parameters:  a list with one parameter dict per entry of benchmark_methods. Only read when benchmark_models is not False. The default is None, meaning each method is built without arguments.
    :keyword benchmark_models: False disables the non FTS benchmarks. A list is used as a set of prebuilt benchmark models. None (with no benchmark_methods given) loads the default benchmark methods for the forecasting type. The default is False.
    :keyword build_methods: A boolean value indicating if the FTS methods will be instantiated and added to the benchmark. The default is True.
    :keyword dataset: the dataset name to identify the current set of benchmarks results on database.
    :keyword distributed: A boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. The default is False.
    :keyword file: file path to save the results. The default is benchmarks.db.
    :keyword inc: a float on interval [0,1] indicating the percentage of the windowsize to move the window. The default is 0.1.
    :keyword methods: a list with FTS class names. The default depends on the forecasting type and contains the list of all FTS methods.
    :keyword models: a list with prebuilt FTS objects. The default is None.
    :keyword nodes: a list with the dispy cluster nodes addresses. The default is [127.0.0.1].
    :keyword orders: a list with orders of the models (for high order models). The default is [1,2,3].
    :keyword partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
    :keyword partitioners_models: a list with prebuilt Universe of Discourse partitioners objects. The default is None.
    :keyword partitioners_methods: a list with Universe of Discourse partitioners class names. The default is [partitioners.Grid.GridPartitioner].
    :keyword progress: If true a progress bar will be displayed during the benchmarks. The default is None (no progress bar).
    :keyword start: in the multi step forecasting, the index of the data where to start forecasting. The default is 0.
    :keyword steps_ahead: a list with the forecasting horizons, i. e., the number of steps ahead to forecast. The default is [1].
    :keyword tag: a name to identify the current set of benchmarks results on database.
    :keyword type: the forecasting type, one of these values: point, interval or distribution. The default is point.
    :keyword transformations: a list with data transformations to apply. The default is [None].

    :raises ValueError: if 'type' is not one of point, interval or distribution
    """

    tag = __pop('tag', None, kwargs)
    dataset = __pop('dataset', None, kwargs)

    distributed = __pop('distributed', False, kwargs)

    # Read with get (not popped): these stay in kwargs and are forwarded further down.
    transformations = kwargs.get('transformations', [None])
    progress = kwargs.get('progress', None)
    forecast_type = kwargs.get("type", 'point')

    # Validate the type before anything depends on it.
    if forecast_type == 'point':
        experiment_method = run_point
        synthesis_method = process_point_jobs
    elif forecast_type == 'interval':
        experiment_method = run_interval
        synthesis_method = process_interval_jobs
    elif forecast_type == 'distribution':
        experiment_method = run_probabilistic
        synthesis_method = process_probabilistic_jobs
    else:
        raise ValueError("Type parameter has a unkown value!")

    orders = __pop("orders", [1, 2, 3], kwargs)

    partitioners_models = __pop("partitioners_models", None, kwargs)
    partitioners_methods = __pop("partitioners_methods", [Grid.GridPartitioner], kwargs)
    partitions = __pop("partitions", [10], kwargs)

    steps_ahead = __pop('steps_ahead', [1], kwargs)

    methods = __pop('methods', None, kwargs)

    models = __pop('models', None, kwargs)

    # Copy so the caller's list of prebuilt models is not extended.
    pool = [] if models is None else list(models)

    if methods is None:
        if forecast_type == 'point':
            methods = get_point_methods()
        elif forecast_type == 'interval':
            methods = get_interval_methods()
        else:
            methods = get_probabilistic_methods()

    build_methods = __pop("build_methods", True, kwargs)

    if build_methods:
        for method in methods:
            mfts = method()

            if mfts.is_high_order:
                # One fresh instance per admissible order.
                for order in orders:
                    if order >= mfts.min_order:
                        mfts = method()
                        mfts.order = order
                        pool.append(mfts)
            else:
                mfts.order = 1
                pool.append(mfts)

    benchmark_models = __pop("benchmark_models", False, kwargs)

    # Always defined, so the progress total and the main loop work with benchmarks disabled.
    benchmark_pool = []

    # Equality (not identity) with False is kept on purpose: it matches the original switch.
    if benchmark_models != False:

        benchmark_methods = __pop("benchmark_methods", None, kwargs)
        benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs)

        if isinstance(benchmark_models, list):
            # Copy so the caller's list is not extended.
            benchmark_pool = list(benchmark_models)

        if benchmark_models is None and benchmark_methods is None:
            if forecast_type == 'point':
                benchmark_methods = get_benchmark_point_methods()
            elif forecast_type == 'interval':
                benchmark_methods = get_benchmark_interval_methods()
            else:
                benchmark_methods = get_benchmark_probabilistic_methods()

        if benchmark_methods is not None:
            for transformation in transformations:
                for count, model in enumerate(benchmark_methods):
                    # Without explicit parameters the method is built with no arguments.
                    if benchmark_methods_parameters is not None:
                        par = benchmark_methods_parameters[count]
                    else:
                        par = {}
                    mfts = model(**par)
                    mfts.append_transformation(transformation)
                    benchmark_pool.append(mfts)

    inc = __pop("inc", 0.1, kwargs)

    progressbar = None
    cluster = None
    http_server = None
    conn = None
    jobs = []

    try:
        if distributed:
            import pyFTS.distributed.dispy as dispy

            nodes = kwargs.get("nodes", ['127.0.0.1'])
            cluster, http_server = dispy.start_dispy_cluster(experiment_method, nodes)

        if progress:
            from tqdm import tqdm
            # Approximate number of windows.
            _tdata = len(data) / (windowsize * inc)
            if partitioners_models is None:
                _npart = len(partitioners_methods) * len(partitions) * len(transformations)
            else:
                _npart = len(partitioners_models)
            _tasks = _npart * len(pool) * len(steps_ahead)
            _tbcmk = len(benchmark_pool) * len(steps_ahead)
            progressbar = tqdm(total=_tdata * _tasks + _tdata * _tbcmk, desc="Benchmarks:")

        file = kwargs.get('file', "benchmarks.db")

        conn = bUtil.open_benchmark_db(file)

        for _, train_data, test_data in cUtil.sliding_window(data, windowsize, train, inc=inc, **kwargs):
            # Non FTS benchmark models: no partitioner is involved.
            for model in benchmark_pool:
                for step in steps_ahead:

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progressbar is not None:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model), None, train_data, test_data, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception:
                            # Best effort: report the failing model and keep benchmarking the others.
                            print('EXCEPTION! ', model.shortname, model.order)
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model), None, train_data, test_data, **kwargs)
                        jobs.append(job)

            if partitioners_models is None:
                # Partitioners are rebuilt on the training data of every window.
                partitioners_pool = [
                    partitioner(data=train_data, npart=partition, transformation=transformation)
                    for transformation in transformations
                    for partition in partitions
                    for partitioner in partitioners_methods
                ]
            else:
                partitioners_pool = partitioners_models

            for step in steps_ahead:

                for partitioner in partitioners_pool:

                    for _id, model in enumerate(pool):

                        kwargs['steps_ahead'] = step

                        if not distributed:
                            if progressbar is not None:
                                progressbar.update(1)
                            try:
                                job = experiment_method(deepcopy(model), deepcopy(partitioner),
                                                        train_data, test_data, **kwargs)
                                synthesis_method(dataset, tag, job, conn)
                            except Exception:
                                # Best effort: report the failing model and keep benchmarking the others.
                                print('EXCEPTION! ', model.shortname, model.order, partitioner.name,
                                      partitioner.partitions, str(partitioner.transformation))
                                traceback.print_exc()
                        else:
                            job = cluster.submit(deepcopy(model), deepcopy(partitioner),
                                                 train_data, test_data, **kwargs)
                            # NOTE(review): submit presumably returns None when the job cannot be
                            # created - confirm against the dispy documentation.
                            if job is not None:
                                job.id = _id  # associate an ID to identify jobs (if needed later)
                            jobs.append(job)

        if distributed:

            for job in jobs:
                if progressbar is not None:
                    progressbar.update(1)
                if job is None:
                    print("status", None)
                    continue
                job()  # wait for this job
                if job.status == dispy.dispy.DispyJob.Finished:
                    synthesis_method(dataset, tag, job.result, conn)
                else:
                    print("status", job.status)
                    print("result", job.result)
                    print("stdout", job.stdout)
                    print("stderr", job.exception)

            cluster.wait()  # wait for all jobs to finish
    finally:
        # Release everything that was acquired, also when a benchmark step raised.
        if progressbar is not None:
            progressbar.close()
        try:
            if cluster is not None:
                dispy.stop_dispy_cluster(cluster, http_server)
        finally:
            if conn is not None:
                conn.close()