def _getPipeline():
    select_stage = [("split", Stage('read', process=read))]
    join_stage = [("group_label",
                   Stage('join', init=join_init, process=join,
                         done=join_done))]

    def combine_row(state, k, v, func):
        if k not in state:
            state[k] = 0
        state[k] = state[k] + func(v)

    node_combine_stage = [
        ("group_node_label",
         Stage('node_combine',
               init=partial(combine_init, init=lambda: {}),
               process=partial(combine,
                               func=partial(combine_row, func=lambda v: 1)),
               done=combine_done))
    ]

    combine_all_stage = [
        ("group_label",
         Stage('combine_all',
               init=partial(combine_init, init=lambda: {}),
               process=partial(combine,
                               func=partial(combine_row, func=lambda v: v)),
               done=combine_done))
    ]

    return select_stage + join_stage + node_combine_stage + combine_all_stage
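# A standalone illustration of the combine_row accumulator defined above
# (copied out of _getPipeline so it can run on its own). The `func` argument
# decides whether each value is counted or summed, mirroring the node_combine
# and combine_all stages; this is a minimal sketch, not part of the pipeline.
def _combine_row_demo():
    def combine_row(state, k, v, func):
        if k not in state:
            state[k] = 0
        state[k] = state[k] + func(v)

    counts, sums = {}, {}
    for k, v in [("a", 3), ("a", 5), ("b", 2)]:
        combine_row(counts, k, v, func=lambda v: 1)  # count rows per key
        combine_row(sums, k, v, func=lambda v: v)    # sum values per key
    return counts, sums  # ({'a': 2, 'b': 1}, {'a': 8, 'b': 2})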
Example #2
def measure(test_data,
            predictions,
            measure="ca",
            save_results=True,
            show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=test_data.params["input_chain"],
                           init=simple_init,
                           process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_process = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           init=simple_init,
                           input_chain=[task_input_stream, chain_reader],
                           process=map_predictions)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_process,
                           sort=True,
                           combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    measure, acc = [
        (measure, acc) for measure, acc in result_iterator(job.wait(show=show))
    ][0]
    return measure, acc
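# A hypothetical call of the measure() function above. `test_data` stands for
# a discomll dataset object with id_index set and `predictions` for the result
# urls of a finished predict job; both are placeholders, not defined here.
def _measure_usage_sketch(test_data, predictions):
    score_name, score = measure(test_data, predictions, measure="ca", show=True)
    return score_name, score  # the chosen measure ("ca" or "mse") and its value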
class CountWords(Job):
    def __init__(self):
        from disco.worker.pipeline.worker import Worker
        super(CountWords, self).__init__(worker=Worker())

    pipeline = [("split",
                 Stage("map",
                       process=map,
                       input_chain=[task_input_stream, chain_reader])),
                ("group_label",
                 Stage("reduce",
                       process=reduce,
                       combine=True,
                       output_chain=(task_output_stream, plain_output_stream),
                       sort=True))]
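# A hedged sketch of running the CountWords job above; `input_urls` is a
# placeholder for the ddfs tags or raw urls of the text to be counted.
def _count_words_sketch(input_urls):
    from disco.core import result_iterator
    job = CountWords()
    job.run(input=input_urls)
    return [(word, count) for word, count in result_iterator(job.wait())]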
Example #4
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]

    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
Example #5
class SimplePipe(TestPipe):
    def map(interface, state, label, inp):
        out = interface.output(0)
        for e in inp:
            out.add(int(e), (bytes_to_str(e)).strip())

    def reduce(interface, state, label, inp):
        for k, v in sorted(inp):
            state.append((k, v))

    pipeline = [("split", Stage("map", process=map)),
                ("group_all",
                 Stage("reduce",
                       init=reduce_init,
                       process=reduce,
                       done=reduce_done))]
Example #6
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception(
            "Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"linsvm_fitmodel": fitmodel_url}  # return results url
Example #7
class WordCount(Job):
    def __init__(self):
        from disco.worker.pipeline.worker import Worker
        super(WordCount, self).__init__(worker=Worker())

    pipeline = [("split",
                 Stage(
                     "map",
                     process=map,
                     input_chain=[task_input_stream, chain_reader],
                     output_chain=[
                         partial(redis_inter_stream_out,
                                 redis_server=redis_server)
                     ],
                 )),
                ("group_label",
                 Stage("reduce",
                       process=reduce,
                       input_chain=[task_input_stream,
                                    redis_inter_stream_in]))]
def fit(dataset, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    fitmodel_url = job.wait(show=show)
    return {"linreg_fitmodel": fitmodel_url}  # return results url
Example #9
class SimplerPipe(SimplePipe):
    def reduce_init(interface, params):
        return []

    def reduce(interface, state, label, inp):
        for rec in sorted(inp):
            state.append((int(rec), (bytes_to_str(rec).strip())))

    pipeline = [("group_all",
                 Stage("reduce",
                       init=reduce_init,
                       process=reduce,
                       done=reduce_done))]
Example #10
def estimate(master, input, center, k, iterations):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    from kclustering_pipeline import Estimate
    job = Estimate()
    job.pipeline = [("split",
                 Stage("k_cluster_init_map", input_chain =
                     [task_input_stream, reader], init = map_init,
                       process = random_init_map)),
                ('group_label',
                 Stage("k_cluster_init_reduce", process = estimate_reduce, init = simple_init))]
    job.params = center
    job.params['seed'] = 0
    job.params['k'] = k


    job.run(input = input)
    centers = [(i,c) for i,c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = Estimate()
        job.params = center
        job.params['k'] = k
        job.params['centers'] = centers

        job.pipeline = [('split',
                         Stage("kcluster_map_iter_%s" % (j, ),
                               input_chain=[task_input_stream, reader],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kcluster_reduce_iter_%s" % (j, ),
                               process=estimate_reduce,
                               init=simple_init))]
        job.run(input=input)
        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
Example #11
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions
    and one reduce function, which aggregates intermediate results and returns
    a model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           sort=True,
                           combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    # define name of a job and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
Example #12
def _fit_predict(fit_data, samples, tau, save_results, show):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fit_data.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           sort=True,
                           combine=True))]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
Example #13
def predict(master, input, center, centers):
    """
    Predict the closest clusters for the datapoints in input.
    """
    from kclustering_pipeline import Estimate
    job = Estimate()
    job.pipeline = [("split",
                 Stage("k_cluster_predict", input_chain =
                     [task_input_stream, reader], init = simple_init,
                       process = predict_map))]
    job.params = center
    job.params['centers'] = centers
    job.run(input = input)

    return job.wait()
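# A hedged sketch chaining estimate() and the predict() above: the centers
# learned by estimate() are handed to predict(). `master`, `input_urls` and
# `center` are placeholders for the Disco master, the input urls and the
# cluster-center definition both functions expect.
def _kcluster_sketch(master, input_urls, center, k=5, iterations=10):
    centers = estimate(master, input_urls, center, k, iterations)
    return predict(master, input_urls, center, centers)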
Example #14
def predict(dataset, fitmodel_url, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = dataset.params
    job.params["thetas"] = [
        v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])
    ][0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
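# A hedged sketch chaining the linear-regression fit() from the earlier
# example with the predict() directly above; `dataset` is a placeholder for a
# discomll dataset object.
def _linreg_sketch(dataset):
    fitmodel_url = fit(dataset)            # {"linreg_fitmodel": <results url>}
    return predict(dataset, fitmodel_url)  # urls of predictions on ddfs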
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job parallelizes execution of mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["thetas"] = [
        v for k, v in result_iterator(fitmodel_url["logreg_fitmodel"])
        if k == "thetas"
    ][0]  # thetas are loaded from ddfs

    job.run(name="logreg_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Example #16
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
def predict(dataset,
            fitmodel_url,
            voting=False,
            save_results=True,
            show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=simple_init,
               process=map_predict_voting if voting else map_predict_dist))
    ]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Example #18
def fit(dataset,
        trees_per_chunk=1,
        bootstrap=True,
        max_tree_nodes=50,
        min_samples_leaf=10,
        min_samples_split=5,
        class_majority=1,
        separate_max=True,
        measure="info_gain",
        accuracy=1,
        random_state=None,
        save_results=True,
        show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = (int(max_tree_nodes)
                          if max_tree_nodes is not None else max_tree_nodes)
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
        if trees_per_chunk > 1 and not bootstrap:
            raise Exception(
                "Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap."
            )
        if (trees_per_chunk <= 0 or min_samples_leaf <= 0
                or class_majority <= 0 or min_samples_split <= 0
                or accuracy < 0 or not isinstance(bootstrap, bool)):
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=map_init,
               process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all',
         Stage("reduce", init=simple_init, process=reduce_fit, combine=True))
    ]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
Example #19
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # fit model is loaded from ddfs
        fit_model = dict(
            (k, v)
            for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count(
            "d") > 0:  # if there are discrete features in the model
        # calculate logarithms here to optimize the predict phase, instead of
        # recalculating them in every mapper
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [
                fit_model.pop((y, ) + iv, 0) for y in fit_model["y_labels"]
            ]
            fit_model[iv] = np.nan_to_num(
                np.log(
                    np.true_divide(
                        np.array(dist) + m * fit_model["prior"],
                        np.sum(dist) + m))) - fit_model["prior_log"]
        del (fit_model["iv"])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
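# A hedged sketch chaining the Naive Bayes fit() from the earlier example
# with the predict() above; `dataset` is a placeholder for a discomll dataset
# object.
def _naivebayes_sketch(dataset):
    fitmodel_url = fit(dataset)                 # {"naivebayes_fitmodel": <results url>}
    return predict(dataset, fitmodel_url, m=1)  # urls of predictions on ddfs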
Example #20
def getPipeline(count, type):
    intermediates = [(type, Stage("inter_%d" % i, process=intermediate))
                     for i in range(count)]
    pipeline = ([("split", Stage("map", process=map))] + intermediates +
                [("group_label",
                  Stage("reduce",
                        init=reduce_init,
                        process=reduce,
                        done=reduce_done))])
    return pipeline
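# A minimal sketch of the list-building pattern in getPipeline() above, with
# plain strings standing in for the Stage objects so it runs without Disco.
def _pipeline_shape_sketch(count, grouping):
    intermediates = [(grouping, "inter_%d" % i) for i in range(count)]
    return [("split", "map")] + intermediates + [("group_label", "reduce")]

# _pipeline_shape_sketch(2, "group_label") ->
# [('split', 'map'), ('group_label', 'inter_0'),
#  ('group_label', 'inter_1'), ('group_label', 'reduce')]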
class SortJob(TestPipe):
    scheduler = {'max_cores': 7}
    pipeline = [("split", Stage("Map", process=Map)),
                ("group_label",
                 Stage("Reduce", process=Reduce, combine=True, sort=True))]
Example #22
def fit_predict(training_data,
                fitting_data,
                tau=1,
                samples_per_job=0,
                save_results=True,
                show=False):
    """
    training_data - training samples
    fitting_data - dataset to be fitted to the training data.
    tau - controls how quickly the weight of a training sample falls off with
        the distance of its x(i) from the query point x.
    samples_per_job - number of samples processed in a single MapReduce job.
        If 0, the algorithm calculates the number of samples per job itself.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be >= 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # calculate tau once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # if there are at most 100 attributes
                samples_per_job = 100  # at most 100 samples per job
            else:
                # more than 100 attributes
                samples_per_job = len(x) * -25 / 900. + 53  # linear function

        samples[test_id] = x
        if counter == samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there are samples left in the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
Example #23
def fit(dataset,
        n_clusters=5,
        max_iterations=10,
        random_state=None,
        save_results=True,
        show=False):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("kmeans_init_map",
                           input_chain=dataset.params["input_chain"],
                           init=map_init,
                           process=random_init_map)),
                    ('group_label',
                     Stage("kmeans_init_reduce",
                           process=estimate_reduce,
                           init=simple_init,
                           combine=True))]
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params['seed'] = random_state
    job.params['k'] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params.items() + mean_point_center.items())
        job.params['k'] = n_clusters
        job.params['centers'] = centers

        job.pipeline = [('split',
                         Stage("kmeans_map_iter_%s" % (j + 1, ),
                               input_chain=dataset.params["input_chain"],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kmeans_reduce_iter_%s" % (j + 1, ),
                               process=estimate_reduce,
                               init=simple_init,
                               combine=True))]

        job.run(input=dataset.params["data_tag"],
                name='kmeans_iter_%d' % (j + 1, ))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url
Example #24
class SortJob(TestPipe):
    pipeline = [("split", Stage("Map", process=Map)),
                ("group_label",
                 Stage("Reduce", process=Reduce, combine=True, sort=True))]
Example #25
class RawJob(TestPipe):
    pipeline = [("split", Stage("map", process=map))]
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # initialize thetas to 0 and add intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    J = [0]  # J cost function values for every iteration
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        # job parallelizes mappers and joins them with one reducer
        job.pipeline = [("split",
                         Stage("map",
                               input_chain=dataset.params["input_chain"],
                               init=simple_init,
                               process=map_fit)),
                        ('group_all',
                         Stage("reduce",
                               init=simple_init,
                               process=reduce_fit,
                               combine=True))]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration set new thetas
        job.run(name="logreg_fit_iter_%d" % (i + 1),
                input=dataset.params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for k, v in result_iterator(fitmodel_url):
            if k == "J":  #
                J.append(v)  # save value of J cost function
            else:
                thetas = v  # save new thetas
        if np.abs(J[-2] - J[-1]) < alpha:  # check for convergence
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url