Example #1 (score: 0)
File: benchmarks.py — Project: bekti7/pmi
def process_point_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with point benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """

    data = bUtil.process_common_data(dataset, tag, 'point', job)

    # Each point-forecast measure becomes its own database row: copy the
    # common columns, append the measure name and its value, then insert.
    # (Replaces four copy-pasted copy/extend/insert blocks; also avoids a
    # local named `time` shadowing the stdlib module.)
    for measure in ("rmse", "smape", "u", "time"):
        row = deepcopy(data)
        row.extend([measure, job[measure]])
        bUtil.insert_benchmark(row, conn)
Example #2 (score: 0)
def process_point_jobs(dataset, tag, job, conn):
    """Persist the point-forecast metrics of a benchmark job.

    Builds one database row per measure (rmse, smape, u, time) by extending
    the shared columns from ``bUtil.process_common_data`` and inserting each
    row through ``bUtil.insert_benchmark``.
    """

    common = bUtil.process_common_data(dataset, tag, 'point', job)

    for metric in ("rmse", "smape", "u", "time"):
        record = deepcopy(common)
        record.extend([metric, job[metric]])
        bUtil.insert_benchmark(record, conn)
Example #3 (score: 0)
def process_probabilistic_jobs(dataset, tag, job, conn):
    """Persist the probabilistic-forecast metrics of a benchmark job.

    Stores one database row per measure (crps, time, brier); note the job
    dict keeps the CRPS value under the upper-case key ``"CRPS"``.
    """

    common = bUtil.process_common_data(dataset, tag, 'density', job)

    for label, key in (("crps", "CRPS"), ("time", "time"), ("brier", "brier")):
        record = deepcopy(common)
        record.extend([label, job[key]])
        bUtil.insert_benchmark(record, conn)
Example #4 (score: 0)
File: benchmarks.py — Project: bekti7/pmi
def process_probabilistic_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with probabilistic benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """

    data = bUtil.process_common_data(dataset, tag, 'density', job)

    # One row per measure; the job dict stores CRPS under an upper-case key,
    # so carry (row label, job key) pairs. Replaces three copy-pasted
    # copy/extend/insert blocks and avoids shadowing the stdlib `time` name.
    for label, key in (("crps", "CRPS"), ("time", "time"), ("brier", "brier")):
        row = deepcopy(data)
        row.extend([label, job[key]])
        bUtil.insert_benchmark(row, conn)
Example #5 (score: 0)
def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
    """
    Sliding window benchmarks for FTS forecasters.

    For each data window, a train and a test dataset will be split. For each train split, number of
    partitions and partitioning method will be created a partitioner model. And for each partitioner, order,
    steps ahead and FTS method a forecasting model will be trained.

    Then all trained models are benchmarked on the test data and the metrics are stored on a sqlite3 database
    (identified by the 'file' parameter) for posterior analysis.

    All these processes can be distributed on a dispy cluster, setting the attribute 'distributed' to true and
    informing the list of dispy nodes on 'nodes' parameter.

    The number of experiments is determined by 'windowsize' and 'inc' parameters.

    :param data: test data
    :param windowsize: size of sliding window
    :param train: percentual of sliding window data used to train the models
    :param kwargs: dict, optional arguments

    :keyword
        benchmark_methods:  a list with Non FTS models to benchmark. The default is None.
        benchmark_methods_parameters:  a list with Non FTS models parameters. The default is None.
        benchmark_models: A boolean value indicating if external FTS methods will be used on benchmark. The default is False.
        build_methods: A boolean value indicating if the default FTS methods will be used on benchmark. The default is True.
        dataset: the dataset name to identify the current set of benchmarks results on database.
        distributed: A boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. . The default is False
        file: file path to save the results. The default is benchmarks.db.
        inc: a float on interval [0,1] indicating the percentage of the windowsize to move the window
        methods: a list with FTS class names. The default depends on the forecasting type and contains the list of all FTS methods.
        models: a list with prebuilt FTS objects. The default is None.
        nodes: a list with the dispy cluster nodes addresses. The default is [127.0.0.1].
        orders: a list with orders of the models (for high order models). The default is [1,2,3].
        partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
        partitioners_models: a list with prebuilt Universe of Discourse partitioners objects. The default is None.
        partitioners_methods: a list with Universe of Discourse partitioners class names. The default is [partitioners.Grid.GridPartitioner].
        progress: If true a progress bar will be displayed during the benchmarks. The default is False.
        start: in the multi step forecasting, the index of the data where to start forecasting. The default is 0.
        steps_ahead: a list with  the forecasting horizons, i. e., the number of steps ahead to forecast. The default is 1.
        tag: a name to identify the current set of benchmarks results on database.
        type: the forecasting type, one of these values: point(default), interval or distribution. The default is point.
        transformations: a list with data transformations do apply . The default is [None].
    """

    tag = __pop('tag', None, kwargs)
    dataset = __pop('dataset', None, kwargs)

    distributed = __pop('distributed', False, kwargs)

    transformations = kwargs.get('transformations', [None])
    progress = kwargs.get('progress', None)
    # Read the forecasting type; renamed locally so the builtin `type` is not
    # shadowed (the kwargs key is unchanged for callers).
    forecast_type = kwargs.get("type", 'point')

    orders = __pop("orders", [1, 2, 3], kwargs)

    partitioners_models = __pop("partitioners_models", None, kwargs)
    partitioners_methods = __pop("partitioners_methods",
                                 [Grid.GridPartitioner], kwargs)
    partitions = __pop("partitions", [10], kwargs)

    steps_ahead = __pop('steps_ahead', [1], kwargs)

    methods = __pop('methods', None, kwargs)

    models = __pop('models', None, kwargs)

    pool = [] if models is None else models

    # Pick the default method list for the requested forecasting type.
    if methods is None:
        if forecast_type == 'point':
            methods = get_point_methods()
        elif forecast_type == 'interval':
            methods = get_interval_methods()
        elif forecast_type == 'distribution':
            methods = get_probabilistic_methods()

    build_methods = __pop("build_methods", True, kwargs)

    if build_methods:
        for method in methods:
            mfts = method()

            if mfts.is_high_order:
                # One model instance per admissible order.
                for order in orders:
                    if order >= mfts.min_order:
                        mfts = method()
                        mfts.order = order
                        pool.append(mfts)
            else:
                mfts.order = 1
                pool.append(mfts)

    benchmark_models = __pop("benchmark_models", False, kwargs)

    if benchmark_models != False:

        benchmark_methods = __pop("benchmark_methods", None, kwargs)
        benchmark_methods_parameters = __pop("benchmark_methods_parameters",
                                             None, kwargs)

        benchmark_pool = [] if ( benchmark_models is None or not isinstance(benchmark_models, list)) \
            else benchmark_models

        if benchmark_models is None and benchmark_methods is None:
            if forecast_type == 'point' or forecast_type == 'partition':
                benchmark_methods = get_benchmark_point_methods()
            elif forecast_type == 'interval':
                benchmark_methods = get_benchmark_interval_methods()
            elif forecast_type == 'distribution':
                benchmark_methods = get_benchmark_probabilistic_methods()

        if benchmark_methods is not None:
            for transformation in transformations:
                for count, model in enumerate(benchmark_methods, start=0):
                    par = benchmark_methods_parameters[count]
                    mfts = model(**par)
                    mfts.append_transformation(transformation)
                    benchmark_pool.append(mfts)

    # Select the experiment runner and the result-persisting function.
    if forecast_type == 'point':
        experiment_method = run_point
        synthesis_method = process_point_jobs
    elif forecast_type == 'interval':
        experiment_method = run_interval
        synthesis_method = process_interval_jobs
    elif forecast_type == 'distribution':
        experiment_method = run_probabilistic
        synthesis_method = process_probabilistic_jobs
    else:
        raise ValueError("Type parameter has an unknown value!")

    if distributed:
        import dispy, dispy.httpd

        nodes = kwargs.get("nodes", ['127.0.0.1'])
        cluster, http_server = cUtil.start_dispy_cluster(
            experiment_method, nodes)

    jobs = []

    inc = __pop("inc", 0.1, kwargs)

    if progress:
        from tqdm import tqdm
        _tdata = len(data) / (windowsize * inc)
        # Approximate task count. BUG FIX: the original crashed here with the
        # default arguments — len(partitioners_models) when it is None, and
        # len(benchmark_pool) when benchmark models are disabled (the name is
        # only bound inside the `benchmark_models != False` branch).
        if partitioners_models is None:
            _nparts = len(partitions) * len(partitioners_methods) * len(transformations)
        else:
            _nparts = len(partitioners_models)
        _tasks = _nparts * len(pool) * len(steps_ahead)
        _tbcmk = len(benchmark_pool) * len(steps_ahead) if benchmark_models != False else 0
        progressbar = tqdm(total=_tdata * _tasks + _tdata * _tbcmk,
                           desc="Benchmarks:")

    file = kwargs.get('file', "benchmarks.db")

    conn = bUtil.open_benchmark_db(file)

    for ct, train, test in cUtil.sliding_window(data,
                                                windowsize,
                                                train,
                                                inc=inc,
                                                **kwargs):
        if benchmark_models != False:
            for model in benchmark_pool:
                for step in steps_ahead:

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progress:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model), None,
                                                    train, test, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception as ex:
                            print('EXCEPTION! ', model.shortname, model.order)
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model), None, train,
                                             test, **kwargs)
                        jobs.append(job)

        partitioners_pool = []

        if partitioners_models is None:

            # Build one partitioner per (transformation, partition count, method).
            for transformation in transformations:

                for partition in partitions:

                    for partitioner in partitioners_methods:

                        data_train_fs = partitioner(
                            data=train,
                            npart=partition,
                            transformation=transformation)

                        partitioners_pool.append(data_train_fs)
        else:
            partitioners_pool = partitioners_models

        for step in steps_ahead:

            for partitioner in partitioners_pool:

                for _id, model in enumerate(pool, start=0):

                    kwargs['steps_ahead'] = step

                    if not distributed:
                        if progress:
                            progressbar.update(1)
                        try:
                            job = experiment_method(deepcopy(model),
                                                    deepcopy(partitioner),
                                                    train, test, **kwargs)
                            synthesis_method(dataset, tag, job, conn)
                        except Exception as ex:
                            print('EXCEPTION! ', model.shortname, model.order,
                                  partitioner.name, partitioner.partitions,
                                  str(partitioner.transformation))
                            traceback.print_exc()
                    else:
                        job = cluster.submit(deepcopy(model),
                                             deepcopy(partitioner), train,
                                             test, **kwargs)
                        # BUG FIX: the original assigned the builtin `id`
                        # function instead of the enumerate counter `_id`.
                        job.id = _id  # associate an ID to identify jobs (if needed later)
                        jobs.append(job)

    if progress:
        progressbar.close()

    if distributed:

        for job in jobs:
            if progress:
                progressbar.update(1)
            job()  # blocks until this dispy job finishes
            # NOTE: the original also tested `job is not None` after already
            # reading job attributes — that check was dead and was removed.
            if job.status == dispy.DispyJob.Finished:
                tmp = job.result
                synthesis_method(dataset, tag, tmp, conn)
            else:
                print("status", job.status)
                print("result", job.result)
                print("stdout", job.stdout)
                print("stderr", job.exception)

        cluster.wait()  # wait for all jobs to finish

        cUtil.stop_dispy_cluster(cluster, http_server)

    conn.close()
Example #6 (score: 0)
def process_interval_jobs(dataset, tag, job, conn):
    """Persist the interval-forecast metrics of a benchmark job.

    Writes one database row per measure (sharpness, resolution, coverage,
    time, the Q05/Q25/Q75/Q95 quantile scores and the winkler05/winkler25
    scores), each row built from the shared columns returned by
    ``bUtil.process_common_data``.
    """

    common = bUtil.process_common_data(dataset, tag, 'interval', job)

    measures = ("sharpness", "resolution", "coverage", "time",
                "Q05", "Q25", "Q75", "Q95",
                "winkler05", "winkler25")

    for metric in measures:
        record = deepcopy(common)
        record.extend([metric, job[metric]])
        bUtil.insert_benchmark(record, conn)
Example #7 (score: 0)
File: benchmarks.py — Project: bekti7/pmi
def process_interval_jobs(dataset, tag, job, conn):
    """
    Extract information from a dictionary with interval benchmark results and save it on a database

    :param dataset: the benchmark dataset name
    :param tag: alias for the benchmark group being executed
    :param job: a dictionary with the benchmark results
    :param conn: a connection to a Sqlite database
    :return:
    """

    data = bUtil.process_common_data(dataset, tag, 'interval', job)

    # Each interval measure becomes one database row. Replaces ten
    # copy-pasted copy/extend/insert blocks (and avoids a local named
    # `time` shadowing the stdlib module).
    for measure in ("sharpness", "resolution", "coverage", "time",
                    "Q05", "Q25", "Q75", "Q95",
                    "winkler05", "winkler25"):
        row = deepcopy(data)
        row.extend([measure, job[measure]])
        bUtil.insert_benchmark(row, conn)
Example #8 (score: 0)
def ahead_sliding_window(data,
                         windowsize,
                         steps,
                         resolution,
                         train=0.8,
                         inc=0.1,
                         models=None,
                         partitioners=[Grid.GridPartitioner],
                         partitions=[10],
                         max_order=3,
                         transformation=None,
                         indexer=None,
                         dump=False,
                         benchmark_models=None,
                         benchmark_models_parameters=None,
                         save=False,
                         file=None,
                         synthetic=False,
                         nodes=None):
    """
    Distributed sliding window benchmarks for FTS probabilistic forecasters

    :param data: time series data
    :param windowsize: size of sliding window
    :param train: percentual of sliding window data used to train the models
    :param steps: forecasting horizon
    :param resolution: resolution of the probabilistic forecasts
    :param models: FTS point forecasters
    :param partitioners: Universe of Discourse partitioner
    :param partitions: the max number of partitions on the Universe of Discourse
    :param max_order: the max order of the models (for high order models)
    :param transformation: data transformation
    :param indexer: seasonal indexer
    :param dump: if true print the current window number
    :param save: save results
    :param file: file path to save the results
    :param synthetic: if true only the average and standard deviation of the results
    :param nodes: list of cluster nodes to distribute tasks
    :return: DataFrame with the results
    """

    # Confidence levels used to instantiate the benchmark (ARIMA) models.
    alphas = [0.05, 0.25]

    if benchmark_models is None and models is None:
        benchmark_models = [
            arima.ARIMA, arima.ARIMA, arima.ARIMA, arima.ARIMA, arima.ARIMA
        ]

    if benchmark_models_parameters is None:
        benchmark_models_parameters = [(1, 0, 0), (1, 0, 1), (2, 0, 0),
                                       (2, 0, 1), (2, 0, 2)]

    cluster = dispy.JobCluster(benchmarks.run_ahead,
                               nodes=nodes)  # , depends=dependencies)

    http_server = dispy.httpd.DispyHTTPServer(cluster)

    _process_start = time.time()

    print("Process Start: {0: %H:%M:%S}".format(datetime.datetime.now()))

    pool = []
    jobs = []
    objs = {}
    crps_interval = {}
    crps_distr = {}
    times1 = {}
    times2 = {}

    if models is None:
        models = benchmarks.get_probabilistic_methods()

    # Instantiate one model per admissible order for high-order methods.
    for model in models:
        mfts = model("")

        if mfts.is_high_order:
            for order in np.arange(1, max_order + 1):
                if order >= mfts.min_order:
                    mfts = model("")
                    mfts.order = order
                    pool.append(mfts)
        else:
            pool.append(mfts)

    if benchmark_models is not None:
        for count, model in enumerate(benchmark_models, start=0):
            for a in alphas:
                par = benchmark_models_parameters[count]
                mfts = model(str(par if par is not None else ""),
                             alpha=a,
                             dist=True)
                mfts.order = par
                pool.append(mfts)

    experiments = 0
    for ct, train, test in Util.sliding_window(data,
                                               windowsize,
                                               train,
                                               inc=inc):
        experiments += 1

        # Benchmark-only models are submitted at most once per window.
        benchmarks_only = {}

        if dump: print('\nWindow: {0}\n'.format(ct))

        for partition in partitions:

            for partitioner in partitioners:

                data_train_fs = partitioner(train,
                                            partition,
                                            transformation=transformation)

                # BUG FIX: the loop variable was named `id`, shadowing the
                # builtin; renamed to `_id`.
                for _id, m in enumerate(pool, start=0):
                    if m.benchmark_only and m.shortname in benchmarks_only:
                        continue
                    else:
                        benchmarks_only[m.shortname] = m
                    job = cluster.submit(m, data_train_fs, train, test, steps,
                                         resolution, ct, transformation,
                                         indexer)
                    job.id = _id  # associate an ID to identify jobs (if needed later)
                    jobs.append(job)

    for job in jobs:
        tmp = job()  # blocks until this dispy job finishes
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            if tmp['key'] not in objs:
                objs[tmp['key']] = tmp['obj']
                crps_interval[tmp['key']] = []
                crps_distr[tmp['key']] = []
                times1[tmp['key']] = []
                times2[tmp['key']] = []
            # BUG FIX: these accumulators are plain lists, which have no
            # `append_rhs` method — the original raised AttributeError on
            # every finished job.
            crps_interval[tmp['key']].append(tmp['CRPS_Interval'])
            crps_distr[tmp['key']].append(tmp['CRPS_Distribution'])
            times1[tmp['key']].append(tmp['TIME_Interval'])
            times2[tmp['key']].append(tmp['TIME_Distribution'])

        else:
            print(job.exception)
            print(job.stdout)

    _process_end = time.time()

    print("Process End: {0: %H:%M:%S}".format(datetime.datetime.now()))

    print("Process Duration: {0}".format(_process_end - _process_start))

    cluster.wait()  # wait for all jobs to finish

    cluster.print_status()

    http_server.shutdown()  # this waits until browser gets all updates
    cluster.close()

    return bUtil.save_dataframe_ahead(experiments, file, objs, crps_interval,
                                      crps_distr, times1, times2, save,
                                      synthetic)
Example #9 (score: 0)
def point_sliding_window(data,
                         windowsize,
                         train=0.8,
                         inc=0.1,
                         models=None,
                         partitioners=[Grid.GridPartitioner],
                         partitions=[10],
                         max_order=3,
                         transformation=None,
                         indexer=None,
                         dump=False,
                         benchmark_models=None,
                         benchmark_models_parameters=None,
                         save=False,
                         file=None,
                         sintetic=False,
                         nodes=None,
                         depends=None):
    """
    Distributed sliding window benchmarks for FTS point forecasters

    :param data: time series data
    :param windowsize: size of sliding window
    :param train: percentual of sliding window data used to train the models
    :param inc: percentual of the window used to increment the sliding step
    :param models: FTS point forecasters
    :param partitioners: Universe of Discourse partitioner
    :param partitions: the max number of partitions on the Universe of Discourse
    :param max_order: the max order of the models (for high order models)
    :param transformation: data transformation
    :param indexer: seasonal indexer
    :param dump: if true print the current window number
    :param benchmark_models: Non FTS models to benchmark
    :param benchmark_models_parameters: Non FTS models parameters
    :param save: save results
    :param file: file path to save the results
    :param sintetic: if true only the average and standard deviation of the results
    :param nodes: list of cluster nodes to distribute tasks
    :param depends: list of module dependencies
    :return: DataFrame with the results
    """

    cluster = dispy.JobCluster(benchmarks.run_point,
                               nodes=nodes)  #, depends=dependencies)

    http_server = dispy.httpd.DispyHTTPServer(cluster)

    _process_start = time.time()

    print("Process Start: {0: %H:%M:%S}".format(datetime.datetime.now()))

    jobs = []
    objs = {}
    rmse = {}
    smape = {}
    u = {}
    times = {}

    pool = build_model_pool_point(models, max_order, benchmark_models,
                                  benchmark_models_parameters)

    experiments = 0
    for ct, train, test in Util.sliding_window(data, windowsize, train, inc):
        experiments += 1

        # Benchmark-only models are submitted at most once per window.
        benchmarks_only = {}

        if dump: print('\nWindow: {0}\n'.format(ct))

        for partition in partitions:

            for partitioner in partitioners:

                data_train_fs = partitioner(train,
                                            partition,
                                            transformation=transformation)

                for _id, m in enumerate(pool, start=0):
                    if m.benchmark_only and m.shortname in benchmarks_only:
                        continue
                    else:
                        benchmarks_only[m.shortname] = m
                    job = cluster.submit(m, data_train_fs, train, test, ct,
                                         transformation)
                    job.id = _id  # associate an ID to identify jobs (if needed later)
                    jobs.append(job)

    for job in jobs:
        tmp = job()  # blocks until this dispy job finishes
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            if tmp['key'] not in objs:
                objs[tmp['key']] = tmp['obj']
                rmse[tmp['key']] = []
                smape[tmp['key']] = []
                u[tmp['key']] = []
                times[tmp['key']] = []
            # BUG FIX: these accumulators are plain lists, which have no
            # `append_rhs` method — the original raised AttributeError on
            # every finished job.
            rmse[tmp['key']].append(tmp['rmse'])
            smape[tmp['key']].append(tmp['smape'])
            u[tmp['key']].append(tmp['u'])
            times[tmp['key']].append(tmp['time'])
            print(tmp['key'], tmp['window'])
        else:
            print(job.exception)
            print(job.stdout)

    _process_end = time.time()

    print("Process End: {0: %H:%M:%S}".format(datetime.datetime.now()))

    print("Process Duration: {0}".format(_process_end - _process_start))

    cluster.wait()  # wait for all jobs to finish

    cluster.print_status()

    http_server.shutdown()  # this waits until browser gets all updates
    cluster.close()

    return bUtil.save_dataframe_point(experiments, file, objs, rmse, save,
                                      sintetic, smape, times, u)