Python Job.wait 예제들, disco.core.Job.wait Python 예제들

예제 #1

0

파일 보기

파일: accuracy.py 프로젝트: romanorac/discomll

def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    """
    Run two disco jobs that score predictions against a test dataset.

    The first job maps the test data with map_test_data. Its result is
    concatenated with predictions and used as input of a second map/reduce
    job whose reducer is reduce_ca for "ca" and reduce_mse for "mse".

    Parameters
    ----------
    test_data - dataset object; its params supply "id_index", "input_chain" and "data_tag"
    predictions - list concatenated with the first job's result (presumably prediction urls - confirm with callers)
    measure - name of the measure, "ca" or "mse"
    save_results - forwarded to the disco Worker of both jobs
    show - forwarded to Job.wait of both jobs

    Returns
    -------
    The first (measure, value) pair produced by the second job, or
    ("No predictions", None) when predictions is an empty list.

    Raises
    ------
    Exception if measure is unknown or test_data has id_index == -1.
    """
    # Validate the arguments before importing disco so bad input fails fast.
    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    # First job: parse the test data (the job is created once, not twice).
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init, process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    # Second job: combine parsed test data with the predictions and reduce them to one score.
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                                    process=map_predictions)),
                    ('group_all', Stage("reduce", init=simple_init, process=reduce_proces, sort=True, combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # Distinct loop names keep the comprehension from rebinding the `measure` argument.
    scores = [(name, value) for name, value in result_iterator(job.wait(show=show))]
    return scores[0]

예제 #2

0

파일 보기

파일: job.py 프로젝트: dcrosta/mongo-disco

class DiscoJob():
    """
    Wrapper that configures and runs a disco Job reading its input through
    mongodb_input_stream.

    The effective configuration is config_util.config updated with the
    user-supplied config; every entry is also copied onto a Params object
    that is handed to the job.
    """

    def __init__(self,config,map,reduce):
        """
        config - mapping of user settings; 'print_to_stdout' is set on it when
                 neither 'output_uri' nor 'print_to_stdout' is given
        map    - map function passed to Job.run
        reduce - reduce function passed to Job.run
        """
        import config_util

        # NOTE(review): this aliases the module-level dict, so the updates
        # below are visible to every other user of config_util.config.
        self.config = config_util.config
        #if the user doesn't specify output, print to stdout
        if not config.get('output_uri') and not config.get('print_to_stdout'):
            config['print_to_stdout'] = True

        for item in config:
            self.config[item] = config[item]

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params()
        for key in self.config:
            self.params.__dict__[key] = self.config[key]

    def run(self):
        """
        Start the job. With 'print_to_stdout' set, wait for it and print every
        (key, value) result; otherwise write through mongodb_output_stream and
        wait only when the 'job_wait' setting is truthy.
        """
        # .get() so a configuration without the key takes the output branch
        # instead of raising KeyError.
        to_stdout = self.config.get('print_to_stdout')

        # Arguments shared by both modes; built once instead of duplicated.
        run_args = dict(input = do_split(self.config),
                        map = self.map,
                        reduce = self.reduce,
                        params = self.params,
                        map_input_stream = mongodb_input_stream,
                        required_modules= ['mongodb_io',
                                           'mongodb_input',
                                           'config_util',
                                           'mongo_util',
                                           'mongodb_output'])
        if not to_stdout:
            run_args['reduce_output_stream'] = mongodb_output_stream

        self.job.run(**run_args)

        if to_stdout:
            for key, value in result_iterator(self.job.wait(show=True)):
                # Same "key value" output as the print statement, but valid
                # on both Python 2 and Python 3.
                print("%s %s" % (key, value))
        elif self.config.get("job_wait",False):
            self.job.wait(show=True)

예제 #3

0

파일 보기

파일: accuracy.py 프로젝트: sb123456789sb/discomll

def measure(test_data,
            predictions,
            measure="ca",
            save_results=True,
            show=False):
    """
    Run two disco jobs that score predictions against a test dataset.

    The first job maps the test data with map_test_data. Its result is
    concatenated with predictions and used as input of a second map/reduce
    job whose reducer is reduce_ca for "ca" and reduce_mse for "mse".

    Parameters
    ----------
    test_data - dataset object; its params supply "id_index", "input_chain" and "data_tag"
    predictions - list concatenated with the first job's result (presumably prediction urls - confirm with callers)
    measure - name of the measure, "ca" or "mse"
    save_results - forwarded to the disco Worker of both jobs
    show - forwarded to Job.wait of both jobs

    Returns
    -------
    The first (measure, value) pair produced by the second job, or
    ("No predictions", None) when predictions is an empty list.

    Raises
    ------
    Exception if measure is unknown or test_data has id_index == -1.
    """
    # Validate the arguments before importing disco so bad input fails fast.
    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    # First job: parse the test data (the job is created once, not twice).
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=test_data.params["input_chain"],
                           init=simple_init,
                           process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    # Second job: combine parsed test data with the predictions and reduce
    # them to one score.
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           init=simple_init,
                           input_chain=[task_input_stream, chain_reader],
                           process=map_predictions)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_proces,
                           sort=True,
                           combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # Distinct loop names keep the comprehension from rebinding the
    # `measure` argument.
    scores = [
        (name, value) for name, value in result_iterator(job.wait(show=show))
    ]
    return scores[0]

예제 #4

0

파일 보기

파일: distributed_random_forest.py 프로젝트: romanorac/discomll

def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    """
    Start a map-only disco job that predicts with a fitted forest model.

    Parameters
    ----------
    dataset - dataset object; its params supply "input_chain" and "data_tag"
    fitmodel_url - mapping that must contain the key "drf_fitmodel"
    voting - selects map_predict_voting instead of map_predict_dist as the map function
    save_results - forwarded to the disco Worker
    show - forwarded to Job.wait

    Returns
    -------
    Result of job.wait, or an empty list when the loaded forest is empty.

    Raises
    ------
    Exception if fitmodel_url has no "drf_fitmodel" entry.
    """
    # Reject a wrong model before importing disco or touching the dataset.
    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    import os
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # Directory <discomll package>/ensemble/core/ (trailing separator kept so
    # file names can simply be appended).
    path = os.path.join(os.path.dirname(discomll.__file__), "ensemble", "core", "")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    # NOTE: job.params is the same object as dataset.params, so the model
    # entries copied below are also stored on the dataset.
    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)

예제 #5

0

파일 보기

파일: resolve.py 프로젝트: CrazyWisdom/auth

def auth(clazz, province, input, output, date):
    """
    Run a disco map job over the files of one day and write the mapped lines
    to a .ctl file.

    Files in the directory `input` are selected when their name matches
    `date` (YYYYMMDD) or the same date written as YYYY-MM-DD. The map
    function is looked up per province in cw_map_funs (clazz == 'c+w') or
    fixed_map_funs (any other clazz), falling back to cw_map / fixed_map.

    Parameters:
        clazz    - record class; also used in the output file name
        province - key used to select a province-specific map function
        input    - directory containing the source files
        output   - directory in which "<clazz>-<date>.ctl" is written
        date     - day to process, formatted as YYYYMMDD

    Returns None. When no file matches, a message is printed and no job runs.
    """
    dir_list = os.listdir(input)
    dashed_date = datetime.strptime(date, "%Y%m%d").strftime('%Y-%m-%d')

    # `date` and `dashed_date` are used as regular expressions, as before.
    urls = ["file:///" + input + "/" + name for name in dir_list
            if re.search(date, name) or re.search(dashed_date, name)]
    if not urls:
        print('resolve.py: Can not find any auth files.')
        return

    # dict.get replaces has_key(), which no longer exists in Python 3.
    if clazz == 'c+w':
        mapfun = cw_map_funs.get(province, cw_map)
    else:
        mapfun = fixed_map_funs.get(province, fixed_map)

    job = Job().run(input=urls, map=mapfun)
    # `with` closes the file even if the job or the iteration raises.
    with open(output + "/" + clazz + "-" + date + ".ctl", "w") as ctl:
        sqldr_header(ctl)
        for _user, line in result_iterator(job.wait(show=True)):
            ctl.write("%s\n" % (line,))
예제 #6

0

파일 보기

파일: distributed_weighted_forest_rand.py 프로젝트: romanorac/discomll

def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """
    Start a map-only disco job that predicts with a fitted weighted forest.

    Parameters
    ----------
    dataset - dataset object; its params supply "input_chain" and "data_tag"
    fitmodel_url - mapping that must contain the key "dwfr_fitmodel"
    coeff - number (or numeric string) that must not be negative; stored in job.params["coeff"]
    save_results - forwarded to the disco Worker
    show - forwarded to Job.wait

    Returns
    -------
    Result of job.wait, or an empty list when the loaded forest is empty.

    Raises
    ------
    Exception if the model key is missing, or coeff is negative or not numerical.
    """
    # Validate everything before importing disco or building the job.
    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")
    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    import os
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # Directory <discomll package>/ensemble/core/ (trailing separator kept so
    # file names can simply be appended).
    path = os.path.join(os.path.dirname(discomll.__file__), "ensemble", "core", "")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    # NOTE: job.params is the same object as dataset.params, so the entries
    # written below are also stored on the dataset.
    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)

예제 #7

0

파일 보기

파일: linear_svm.py 프로젝트: romanorac/discomll

def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Launch a map-only job that applies a fitted model to the given dataset.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase; must hold the key "linsvm_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait (urls with predictions on ddfs)
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    model_key = "linsvm_fitmodel"
    if model_key not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    predict_job = Job(worker=Worker(save_results=save_results))

    # a single map stage, run once per input split
    map_stage = Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict)
    predict_job.pipeline = [("split", map_stage)]

    predict_job.params = dataset.params
    # the first value stored in the fit model becomes the fit parameters
    model_values = [value for _, value in result_iterator(fitmodel_url[model_key])]
    predict_job.params["fit_params"] = model_values[0]

    predict_job.run(name="linsvm_predict", input=dataset.params["data_tag"])
    return predict_job.wait(show=show)

예제 #8

0

파일 보기

파일: naivebayes.py 프로젝트: romanorac/discomll

def fit(dataset, save_results=True, show=False):
    """
    Build a Naive Bayes model with one map stage followed by a single reduce
    stage that aggregates the intermediate results.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"naivebayes_fitmodel": result of job.wait}
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # the worker decides whether results are saved to ddfs
    nb_job = Job(worker=Worker(save_results=save_results))

    # mappers run per split; their sorted output is joined by one reducer
    map_stage = Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    nb_job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    # the dataset parameters double as job parameters
    nb_job.params = dataset.params
    nb_job.run(name="naivebayes_fit", input=dataset.params["data_tag"])

    return {"naivebayes_fitmodel": nb_job.wait(show=show)}

예제 #9

0

파일 보기

파일: locally_weighted_linear_regression.py 프로젝트: romanorac/discomll

def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    Read the samples to be fitted with one disco job, then fit them in batches
    by calling _fit_predict once per batch, and merge all batch results under
    a single ddfs tag.

    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will calculate number of samples per job.
    save_results - forwarded to the disco Worker
    show - forwarded to Job.wait

    Returns a one-element list holding "tag://" followed by the name of the read job.
    Raises Exception when tau is not a positive number or fitting_data has id_index == -1.
    """
    # Validate the arguments before importing disco so bad input fails fast.
    try:
        tau = float(tau)
        if tau <= 0:
            # The message now matches the check: zero is rejected as well.
            raise Exception("Parameter tau should be > 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))
    ]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:
                # Linear function of the attribute count. It is truncated to an
                # int and kept >= 1; the former float value could never equal
                # the integer counter, so batches were never flushed.
                samples_per_job = max(1, int(len(x) * -25 / 900.0 + 53))

        samples[test_id] = x
        # Flush on the batch's own size so every batch, including the first,
        # holds at most samples_per_job samples.
        if len(samples) >= samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]

예제 #10

0

파일 보기

파일: naivebayes.py 프로젝트: romanorac/discomll

def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase; must hold the key "naivebayes_fitmodel"
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs, or an empty list when the model holds fewer than two class labels

    Raises
    ------
    Exception if m is not numerical or the fit model key is missing
    """
    # Validate the arguments before importing disco/numpy so bad input fails fast.
    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    # fit model is loaded from ddfs
    fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
    if len(fit_model["y_labels"]) < 2:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("There is only one class in training data.")
        return []

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # logarithms are calculated once here instead of by every mapper
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            # per-label entries keyed (label,) + iv are removed and replaced by one vector keyed iv
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            smoothed = np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m)
            fit_model[iv] = np.nan_to_num(np.log(smoothed)) - fit_model["prior_log"]
        del fit_model["iv"]

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results

예제 #11

0

파일 보기

파일: logistic_regression.py 프로젝트: romanorac/discomll

def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Iteratively compute theta parameters, running one map/reduce job per iteration.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"logreg_fitmodel": result of the last job.wait}
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # one theta per feature index plus the intercept term, all starting at 0
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    cost_history = [0]  # value of the J cost function after every iteration
    for iteration in range(1, max_iterations + 1):
        job = Job(worker=Worker(save_results=save_results))

        # mappers run per split and are joined by one reducer
        map_stage = Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)
        reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
        job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

        job.params = dataset.params  # the dataset parameters double as job parameters
        job.params["thetas"] = thetas  # thetas of the previous iteration
        job.run(name="logreg_fit_iter_%d" % iteration, input=dataset.params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for key, value in result_iterator(fitmodel_url):
            if key == "J":
                cost_history.append(value)  # remember the cost of this iteration
            else:
                thetas = value  # any other key carries the new thetas

        # stop as soon as the cost changes by less than alpha
        converged = np.abs(cost_history[-2] - cost_history[-1]) < alpha
        if converged:
            if show:
                print("Converged at iteration %d" % iteration)
            break

    return {"logreg_fitmodel": fitmodel_url}

예제 #12

0

파일 보기

파일: naive_bayes.py 프로젝트: nicolasramy/disco

def predict(input, loglikelihoods, ys, splitter=" ", map_reader=chain_reader):
    """
    Run the "naive_bayes_predict" job with predict_map as its map function
    and return the result of job.wait().

    `ys` is turned into a dict that maps every element to 1 before it is
    passed on, together with `loglikelihoods` and `splitter`, as job Params.
    """
    label_flags = {label: 1 for label in ys}
    predict_job = Job(name="naive_bayes_predict")
    job_params = Params(loglikelihoods=loglikelihoods, ys=label_flags, splitter=splitter)
    predict_job.run(input=input, map_reader=map_reader, map=predict_map, params=job_params, clean=False)
    return predict_job.wait()

예제 #13

0

파일 보기

파일: map_hashtag.py 프로젝트: fangjin/Hate

def main():
    """
    Run a map/reduce job over the file named by the parsed `news_file`
    argument and write every (word, count) result to the file
    "output_result" as one tab-separated line per result.
    """
    args = parse_args()
    job = Job().run(
        input=args.news_file,
        map_reader=disco.worker.classic.func.chain_reader,
        map=read_twitter,
        reduce=reduce)
    with open("output_result", 'w') as out:
        for word, count in result_iterator(job.wait(show=False)):
            # Terminate every record; without the newline all results ran
            # together on a single line and could not be separated again.
            out.write(word + "\t" + str(count) + "\n")
예제 #14

0

파일 보기

파일: naive_bayes.py 프로젝트: sajal/disco

def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    """
    Run the 'naive_bayes_predict' job with predict_map as its map function
    and return the result of job.wait().

    `ys` is turned into a dict that maps every element to 1 before it is
    passed on, together with `loglikelihoods` and `splitter`, as job Params.
    """
    label_flags = {label: 1 for label in ys}
    predict_job = Job(name='naive_bayes_predict')
    job_params = Params(loglikelihoods=loglikelihoods,
                        ys=label_flags,
                        splitter=splitter)
    predict_job.run(input=input,
                    map_reader=map_reader,
                    map=predict_map,
                    params=job_params,
                    clean=False)
    return predict_job.wait()

예제 #15

0

파일 보기

파일: forest_distributed_decision_trees.py 프로젝트: romanorac/discomll

def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5,
        class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None, save_results=True,
        show=False):
    """
    Start a map/reduce disco job that fits a forest of decision trees.

    Parameters
    ----------
    dataset - dataset object; its params supply "input_chain" and "data_tag"
    trees_per_chunk - positive integer; must be 1 when bootstrap is disabled
    bootstrap - bool; selects map_fit_bootstrap (True) or map_fit (False) as the map function
    max_tree_nodes - integer or None
    min_samples_leaf - positive integer
    min_samples_split - positive integer
    class_majority - positive number
    separate_max - stored unchanged in the job parameters
    measure - "info_gain" or "mdl"
    accuracy - non-negative integer
    random_state - stored in the job parameters under the key 'seed'
    save_results - forwarded to the disco Worker
    show - forwarded to Job.wait

    Returns
    -------
    Dictionary {"fddt_fitmodel": result of job.wait}

    Raises
    ------
    Exception if a parameter is not numerical, is out of range, or measure is unknown.
    """
    # Validate the arguments before importing disco so bad input fails fast.
    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes is not None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
        if trees_per_chunk > 1 and bootstrap == False:
            raise Exception("Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap.")
        # Every condition is OR-ed. The former "min_samples_split <= 0 and
        # accuracy < 0" bound tighter than the surrounding "or"s, so either
        # value alone slipped through the check.
        if (trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0
                or accuracy < 0 or not isinstance(bootstrap, bool)):
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    import os
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    # Directory <discomll package>/ensemble/core/ (trailing separator kept so
    # file names can simply be appended).
    path = os.path.join(os.path.dirname(discomll.__file__), "ensemble", "core", "")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init,
                        process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    # NOTE: job.params is the same object as dataset.params, so the settings
    # written below are also stored on the dataset.
    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url

예제 #16

0

파일 보기

파일: linear_regression.py 프로젝트: romanorac/discomll

def fit(dataset, save_results=True, show=False):
    """
    Run the "linreg_fit" map/reduce job on the dataset.

    dataset supplies "input_chain" and "data_tag" through its params, which
    also become the job parameters. save_results is forwarded to the disco
    Worker and show to Job.wait.

    Returns the dictionary {"linreg_fitmodel": result of job.wait}.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    fit_job = Job(worker=Worker(save_results=save_results))

    # mappers run per split and are joined by one reducer
    map_stage = Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    fit_job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    fit_job.params = dataset.params
    fit_job.run(name="linreg_fit", input=dataset.params["data_tag"])

    return {"linreg_fitmodel": fit_job.wait(show=show)}

예제 #17

0

파일 보기

파일: linear_regression.py 프로젝트: romanorac/discomll

def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Run the map-only "linreg_predict" job on the dataset.

    fitmodel_url must contain the key "linreg_fitmodel"; the first value read
    from that entry is stored as job parameter "thetas". save_results is
    forwarded to the disco Worker and show to Job.wait.

    Returns the result of job.wait.
    Raises Exception when the model key is missing.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    model_key = "linreg_fitmodel"
    if model_key not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    predict_job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict)
    predict_job.pipeline = [("split", map_stage)]

    predict_job.params = dataset.params
    model_values = [value for _, value in result_iterator(fitmodel_url[model_key])]
    predict_job.params["thetas"] = model_values[0]

    predict_job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return predict_job.wait(show=show)

예제 #18

0

파일 보기

파일: distribution.py 프로젝트: romanorac/discomll

def measure(input, save_results=True, show=False):
    """
    Run the "Distribution" map/reduce job on a dataset.

    input is a dataset object whose params supply "input_chain" and
    "data_tag" and are used as the job parameters. save_results is forwarded
    to the disco Worker and show to Job.wait.

    Returns the result of job.wait.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    dataset_params = input.params

    # the worker decides whether results are saved to ddfs
    dist_job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map", input_chain=dataset_params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    dist_job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    dist_job.params = dataset_params
    dist_job.run(name="Distribution", input=dataset_params["data_tag"])

    return dist_job.wait(show=show)

예제 #19

0

파일 보기

def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Start a map/reduce job that calculates the model parameters.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier; must be a number greater than 0
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"linsvm_fitmodel": result of job.wait}
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception(
            "Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    svm_job = Job(worker=Worker(save_results=save_results))

    # mappers run per split and are joined by one reducer
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         combine=True)
    svm_job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    svm_job.params = dataset.params
    svm_job.params["nu"] = nu
    svm_job.run(name="linearsvm_fit", input=dataset.params["data_tag"])

    return {"linsvm_fitmodel": svm_job.wait(show=show)}

예제 #20

0

파일 보기

파일: locally_weighted_linear_regression.py 프로젝트: romanorac/discomll

def _fit_predict(fit_data, samples, tau, save_results, show):
    """
    Run the "lwlr_fit_predict" map/reduce job over fit_data.

    tau and samples are stored in the job parameters (which are the params
    of fit_data) under the keys "tau" and "samples". save_results is
    forwarded to the disco Worker and show to Job.wait.

    Returns the result of job.wait.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    batch_job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map", input_chain=fit_data.params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    batch_job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    batch_job.params = fit_data.params
    for key, value in (("tau", tau), ("samples", samples)):
        batch_job.params[key] = value

    batch_job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return batch_job.wait(show=show)

예제 #21

0

파일 보기

파일: parallel_processing.py 프로젝트: stefanv/cesium

def process_prediction_data_featurization_with_disco(input_list,params,partitions=4):
    '''
    Run a disco job with pred_map / pred_featurize_reduce and return an
    iterator over its results.

    Called from within featurize_prediction_data_in_parallel.
    Arguments:
        input_list: path to file listing filename,unused_string for each individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce function.
        partitions: Number of nodes/partitions in system.
    Returns:
        disco.core.result_iterator over the finished job's results.
    '''
    from disco.core import Job, result_iterator

    started = Job().run(input=input_list, map=pred_map, partitions=partitions,
                        reduce=pred_featurize_reduce, params=params)
    # wait (with progress output) before handing back the iterator
    return result_iterator(started.wait(show=True))

예제 #22

0

파일 보기

def process_featurization_with_disco(input_list, params, partitions=4):
    '''
    Run a disco job with the module-level `map` and featurize_reduce and
    return an iterator over its results.

    Called from within featurize_in_parallel.
    Arguments:
        input_list: path to file listing filename,class_name for each individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce function.
        partitions: Number of nodes/partitions in system.
    Returns:
        disco.core.result_iterator over the finished job's results.
    '''
    from disco.core import Job, result_iterator

    started = Job().run(input=input_list, map=map, partitions=partitions,
                        reduce=featurize_reduce, params=params)
    # wait (with progress output) before handing back the iterator
    return result_iterator(started.wait(show=True))

예제 #23

0

파일 보기

파일: mr_categorizer.py 프로젝트: trein/criteo-challenge

def main():
    """
    Run the mapper/reducer job over TRAIN_IN and number the distinct values
    of every category.

    Results whose counter holds more than MAX_CATEGORICAL_OPTIONS entries are
    skipped. For the others, each unseen value receives the next free index
    of its category. The value->index mapping is written to
    CATEGORY_MAPPING_OUT and the per-category index counters to
    CATEGORY_STATUS_OUT, both serialized with dumps.
    """
    finished = Job().run(input=[TRAIN_IN], map=mapper, reduce=reducer, sort=True)
    category_options = defaultdict(dict)
    category_values = defaultdict(int)

    for cat_id, counter in result_iterator(finished.wait(show=True)):
        if len(counter) <= MAX_CATEGORICAL_OPTIONS:
            for cat_value in counter:
                if cat_value in category_options[cat_id]:
                    continue
                # next free index for this category
                category_options[cat_id][cat_value] = category_values[cat_id]
                category_values[cat_id] += 1

    # save possible categorical data, then the per-category counters
    for out_path, payload in ((CATEGORY_MAPPING_OUT, category_options),
                              (CATEGORY_STATUS_OUT, category_values)):
        with open(out_path, 'w') as f:
            f.write(dumps(payload))

예제 #24

0

파일 보기

파일: build_model.py 프로젝트: gitter-badger/mltsp

def fit_model_disco(data_dict, featureset_key, model_type):
    """
    Run a reduce-only disco job named 'with_modules' and return the first
    element of the last (object, string) pair it produces, or None when the
    job yields no results.

    The three arguments are passed to the job as params under the keys
    "data_dict", "featureset_key" and "model_type".
    """
    from disco.core import Job, result_iterator

    params = dict(data_dict=data_dict,
                  featureset_key=featureset_key,
                  model_type=model_type)
    input_list = ["placeholder"]
    # directory two levels above this file, shipped as the "mltsp" module
    package_root = os.path.dirname(os.path.dirname(__file__))
    job = Job('with_modules').run(
        input=input_list,
        reduce=reduce,
        params=params,
        required_modules=[("mltsp", package_root), "sklearn"])

    rf_fit = None
    # the loop target keeps the first element of the last result
    for rf_fit, _ in result_iterator(job.wait(show=True)):
        pass
    return rf_fit

예제 #25

0

파일 보기

파일: linear_regression.py 프로젝트: sb123456789sb/discomll

def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Run the map-only "linreg_predict" job on the dataset.

    fitmodel_url must contain the key "linreg_fitmodel"; the first value read
    from that entry is stored as job parameter "thetas". save_results is
    forwarded to the disco Worker and show to Job.wait.

    Returns the result of job.wait.
    Raises Exception when the model key is missing.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    model_key = "linreg_fitmodel"
    if model_key not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    predict_job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    predict_job.pipeline = [("split", map_stage)]

    predict_job.params = dataset.params
    model_values = [
        value for _, value in result_iterator(fitmodel_url[model_key])
    ]
    predict_job.params["thetas"] = model_values[0]

    predict_job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return predict_job.wait(show=show)

예제 #26

0

파일 보기

파일: logistic_regression.py 프로젝트: sb123456789sb/discomll

def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Start a job that makes predictions for the input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # a single map stage: the mappers run in parallel
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    # job parameters are the dataset's own parameter mapping
    job.params = dataset.params
    # load thetas from ddfs: keep only the values stored under "thetas"
    stored_thetas = [
        value
        for key, value in result_iterator(fitmodel_url["logreg_fitmodel"])
        if key == "thetas"
    ]
    job.params["thetas"] = stored_thetas[0]

    job.run(name="logreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)

예제 #27

0

파일 보기

파일: linear_regression.py 프로젝트: sb123456789sb/discomll

def fit(dataset, save_results=True, show=False):
    """Start a map/reduce job that fits a linear regression model.

    The job runs a "map" stage followed by one combining "reduce" stage
    and uses the dataset's parameters as job parameters.

    Returns a dict with the key ``"linreg_fitmodel"`` holding the result
    of ``job.wait``.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    # hand back the location of the fitted model
    return {"linreg_fitmodel": job.wait(show=show)}

예제 #28

0

파일 보기

파일: linear_svm.py 프로젝트: romanorac/discomll

def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Start a job that calculates the model parameters.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier, must be greater than 0
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception("Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
    except ValueError:
        raise Exception("Parameter should be numerical.")
    if nu <= 0:
        raise Exception("Parameter nu should be greater than 0")

    job = Job(worker=Worker(save_results=save_results))

    # parallel mappers whose output is joined by a single reducer
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])

    # return results url
    return {"linsvm_fitmodel": job.wait(show=show)}

예제 #29

0

파일 보기

파일: parallel_processing.py 프로젝트: gitter-badger/mltsp

def process_featurization_with_disco(input_list, params, partitions=4):
    """Featurize time-series data in parallel as a Disco job.

    Called from within the `featurize_in_parallel` function.

    Parameters
    ----------
    input_list : str
        Path to file listing the file name and class name
        (comma-separated) for each individual time series data file,
        one per line.
    params : dict
        Dictionary of parameters to be passed to each map & reduce
        function.
    partitions : int, optional
        Number of nodes/partitions in system. Defaults to 4.

    Returns
    -------
    iterator
        disco.core.result_iterator(), an iterator of two-element
        tuples, each containing the file name of the original time
        series data file, and a dictionary of the associated features
        generated.

    """
    from disco.core import Job, result_iterator

    job = Job('with_modules')
    running_job = job.run(
        input=input_list,
        map_reader=custom_reader,
        map=map,
        partitions=partitions,
        reduce=featurize_reduce,
        params=params,
        required_modules=[
            ("mltsp", os.path.dirname(os.path.dirname(__file__))),
        ])

    # Block until the job finishes, then expose its output lazily.
    return result_iterator(running_job.wait(show=True))

예제 #30

0

파일 보기

def fit(dataset, save_results=True, show=False):
    """
    Build a Naive Bayes model. Multiple map functions are executed and one
    reduce function aggregates the intermediate results into a model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # the worker decides whether results are saved to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # parallel mappers; intermediate pairs are sorted and joined by one reducer
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         sort=True,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    # the dataset's parameters double as job parameters
    job.params = dataset.params
    # job name and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])

    # return results url
    return {"naivebayes_fitmodel": job.wait(show=show)}

예제 #31

0

파일 보기

def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """Start a map-only job that predicts with a distributed weighted forest.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dict that must contain the key "dwf_fitmodel"
    coeff - numerical value, must not be negative
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    The result of ``job.wait``, or an empty list when the loaded model
    contains no trees.

    Raises
    ------
    Exception when the model key is missing or ``coeff`` is invalid.
    """
    # Validate the cheap arguments first so a bad call fails before any
    # Disco import or job construction takes place.
    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")
    if coeff < 0:
        raise Exception("Parameter coeff should be greater than 0.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll
    # directory of the ensemble core modules shipped with the job
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params
    job.params["coeff"] = coeff
    # copy every stored model entry into the job parameters
    for k, v in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # single-argument call form works on both Python 2 and Python 3
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)

예제 #32

0

파일 보기

def _fit_predict(fit_data, samples, tau, save_results, show):
    """Run one map/reduce job named "lwlr_fit_predict" for a batch of samples.

    ``tau`` and ``samples`` are added to the parameters of ``fit_data``,
    which are used as the job parameters.  Returns the result of
    ``job.wait``.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=fit_data.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         sort=True,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    outcome = job.wait(show=show)
    return outcome

예제 #33

0

파일 보기

파일: distributed_random_forest.py 프로젝트: sb123456789sb/discomll

def predict(dataset,
            fitmodel_url,
            voting=False,
            save_results=True,
            show=False):
    """Start a map-only job that predicts with a distributed random forest.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dict that must contain the key "drf_fitmodel"
    voting - selects ``map_predict_voting`` instead of ``map_predict_dist``
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    The result of ``job.wait``, or an empty list when the loaded model
    contains no trees.

    Raises
    ------
    Exception when the model key is missing.
    """
    # Reject an unusable model before importing Disco or building a job.
    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # directory of the ensemble core modules shipped with the job
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=simple_init,
               process=map_predict_voting if voting else map_predict_dist))
    ]

    job.params = dataset.params
    # copy every stored model entry into the job parameters
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # single-argument call form works on both Python 2 and Python 3
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)

예제 #34

0

파일 보기

파일: test_output.py 프로젝트: babycaseny/mongo-disco

from disco.core import Job, result_iterator

def map(line, params):
    """Yield ``(word, 1)`` for each whitespace-separated word in ``line``."""
    tokens = line.split()
    for token in tokens:
        yield token, 1

def reduce(iter, params):
    """Yield ``(word, total)`` with the counts of each word summed up."""
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total

'''
def mongodb_output(stream,partition,url,params):
    return mongoDisco_output.MongoDBoutput(stream,params)
'''

if __name__ == '__main__':

    # Word-count job with custom input and output streams.
    # NOTE(review): mongodb_input_stream and mongodb_output_stream are not
    # imported in the visible part of this file - confirm they are in scope.
    job = Job().run(input=["r"],
                    map=map,
                    reduce=reduce,
                    map_input_stream=mongodb_input_stream,
                    reduce_output_stream=mongodb_output_stream)

    # Block until the job has finished, showing job events.
    job.wait(show=True)

예제 #35

0

파일 보기

def fit_predict(training_data,
                fitting_data,
                tau=1,
                samples_per_job=0,
                save_results=True,
                show=False):
    """
    Fit the samples of fitting_data against training_data in batches.

    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will calculate number of samples per job.

    Returns a one-element list with the tag url that groups the results of
    all batch jobs.

    Raises Exception when tau is not a number greater than 0 or when
    fitting_data has no id_index set.
    """
    # Validate before importing Disco so that a bad call fails fast.
    try:
        tau = float(tau)
    except ValueError:
        raise Exception("Parameter tau should be numerical.")
    if tau <= 0:
        raise Exception("Parameter tau should be greater than 0.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # at most 100 attributes
                samples_per_job = 100  # 100 samples is the maximum per job
            else:
                # Linear function of the attribute count.  Truncate it to a
                # whole number and keep it at 1 or more: a fractional or
                # negative batch size could never be matched by the sample
                # count, which silently put every sample into one job.
                samples_per_job = max(1, int(len(x) * -25 / 900. + 53))

        samples[test_id] = x
        # Flush as soon as the batch is full.  The previous counter-based
        # check let the first batch grow one sample past the limit.
        if len(samples) >= samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if len(samples) > 0:  # some samples are left in the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]

예제 #36

0

파일 보기

                           we know we do not know.\
                           But there are also unknown unknowns.\
                           There are things \
                           we do not know we don't know",
                    map=map,
                    reduce=reduce)


    # Both report files are opened in one ``with`` statement so they are
    # closed even when reading the job results or writing fails part-way.
    with open('SortNumerical.txt', 'w') as sort_in_numerical_order, \
            open('SortAlphabetical.txt', 'w') as sort_in_alpbabetically_order:

        # First pass: write the pairs in the order the job returns them and
        # remember them for the second report.
        wordCount = []
        for word, count in result_iterator(job.wait(show=True)):
            sort_in_alpbabetically_order.write('%s \t %d\n' %
                                               (str(word), int(count)))
            wordCount.append((word, count))

        # Second pass: same pairs, highest count first.
        sortedWordCount = sorted(wordCount,
                                 key=lambda pair: pair[1],
                                 reverse=True)

        for word, count in sortedWordCount:
            sort_in_numerical_order.write('%s \t %d\n'
                                          % (str(word), int(count)))

예제 #37

0

파일 보기

파일: dummy_test.py 프로젝트: bearrito/disco_playground

from disco.core import Job, result_iterator

def map(line, params):
    """Emit a ``(word, 1)`` pair per whitespace-separated word of ``line``."""
    words = line.split()
    for current in words:
        yield current, 1

def reduce(iter, params):
    """Sum the counts of every word and yield ``(word, total)`` pairs."""
    from disco.util import kvgroup
    grouped = kvgroup(sorted(iter))
    for word, occurrences in grouped:
        yield word, sum(occurrences)

if __name__ == '__main__':
    # Run the word count and print every (word, count) pair.
    job = Job().run(input=["erl://erl_inputs:test/dummy"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        # Explicit formatting prints "word count" on Python 2 and Python 3;
        # the former print statement is a syntax error on Python 3.
        print("%s %s" % (word, count))

예제 #38

0

파일 보기

파일: job.py 프로젝트: sajal/MongoDisco

class DiscoJob():
    """Configure and run a Disco job that uses the MongoDB input stream.

    The job is described by a config dict (see ``DEFAULT_CONFIG`` for the
    recognised keys and their defaults) plus a map and a reduce function.
    """

    DEFAULT_CONFIG = {
        "job_output_key": "_id",
        "job_output_value": "value",
        "input_uri": "mongodb://localhost/test.in",
        "output_uri": "mongodb://localhost/test.out",
        "print_to_stdout": False,
        "job_wait": True,
        "split_size": 8,
        "split_key": {
            "_id": 1
        },
        "create_input_splits": True,
        "use_shards": False,
        "use_chunks": True,
        "slave_ok": False,
        "limit": 0,
        "skip": 0,
        "input_key": None,
        "sort": None,
        "timeout": False,
        "fields": None,
        "query": {}
    }

    def __init__(self, config, map, reduce):
        """Merge ``config`` over the defaults and remember map/reduce."""
        import copy

        # Deep copy: a shallow copy would share the nested "split_key" and
        # "query" dicts with the class-level defaults, so changing them on
        # one instance would leak into every other instance.
        self.config = copy.deepcopy(DiscoJob.DEFAULT_CONFIG)
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)

    def run(self):
        """Start the job.

        With ``print_to_stdout`` set, the results are read back and printed
        as "key value" lines.  Otherwise the reduce output goes through
        ``mongodb_output_stream`` and the call waits for the job only when
        ``job_wait`` is set.
        """
        to_stdout = self.config['print_to_stdout']

        # Arguments shared by both modes.
        run_kwargs = dict(
            input=do_split(self.config),
            map=self.map,
            reduce=self.reduce,
            params=self.params,
            map_input_stream=mongodb_input_stream,
            required_modules=[
                'mongodisco.mongodb_io',
                'mongodisco.mongodb_input',
                'mongodisco.mongo_util',
                'mongodisco.mongodb_output'
            ])
        if not to_stdout:
            run_kwargs['reduce_output_stream'] = mongodb_output_stream

        self.job.run(**run_kwargs)

        if to_stdout:
            for key, value in result_iterator(self.job.wait(show=True)):
                # explicit formatting works on both Python 2 and Python 3
                print("%s %s" % (key, value))
        elif self.config.get("job_wait", False):
            self.job.wait(show=True)

예제 #39

0

파일 보기

파일: vc_obs_best_week_day_for_billing_msisdn.py 프로젝트: miranetworks/ryanm_hackday_2013

def reduce(iter, params):
    """Yield ``(key, value)`` with the most frequent value seen for each key.

    A key is only emitted when its most frequent value occurs more than
    once.
    """
    from disco.util import kvgroup
    for key, counts in kvgroup(sorted(iter)):
        observed = list(counts)
        best_day, best_count = '', 0
        for candidate in set(observed):
            occurrences = observed.count(candidate)
            # strictly greater: the first candidate reaching a count wins
            if occurrences > best_count:
                best_count = occurrences
                best_day = candidate

        if best_count > 1:
            yield key, best_day

if __name__ == '__main__':
    job = Job().run(input=["data:vcobssplit"], map=map, reduce=reduce)

    # An optional first command line argument replaces the default path.
    output_filename = sys.argv[1] if len(sys.argv) > 1 else "output.csv"

    # One CSV row per (key, date) result pair.
    with open(output_filename, 'w') as fp:
        writer = csv.writer(fp)
        for key, date in result_iterator(job.wait(show=True)):
            writer.writerow([key, date])

예제 #40

0

파일 보기

파일: count_tweet_words_1.py 프로젝트: imxiaohui/pycon2013_applied_parallel_computing

    tweeter, tweet = count_tweet_words.get_username_tweet(line)
    # return each word in the tweet (to count frequency of each term)
    for word in tweet.split():
        yield word, 1


def reduce(iter, params):
    """Yield each word together with the sum of its counts."""
    from disco.util import kvgroup
    sorted_pairs = sorted(iter)
    for term, tallies in kvgroup(sorted_pairs):
        yield term, sum(tallies)


if __name__ == '__main__':
    input_filename = "./tweet_data/tweets_357.json"
    #input_filename = "./tweet_data/tweets_859157.json"
    #input_filename = "/media/3TBStorage/tweets_all.json"

    # we need a fully qualified file name for the server
    fully_qualified_path = os.path.realpath(input_filename)
    input = [fully_qualified_path]

    # import this module so pickle knows what to send to workers
    import count_tweet_words

    job = Job().run(input=input, map=map, reduce=reduce)

    # ``with`` closes (and thereby flushes) the output file even when
    # reading the results fails; it was previously never closed.
    with open(OUTPUT_FILENAME, 'w') as out:
        # one JSON array "[word, count]" per line
        for word, count in result_iterator(job.wait(show=True)):
            out.write(json.dumps([word, count]) + '\n')

예제 #41

0

파일 보기

파일: wikiextract.py 프로젝트: pavlobaron/travolta

from disco.core import Job, result_iterator
from travolta.wiki import wiki_input_stream

def map(tin, params):
    """Split ``tin`` on "|||||" and yield ``(text, category)``.

    The category is the part before the separator and the text the part
    after it.  Nothing is yielded for empty input or when the text equals
    the string "None".
    """
    if not tin:
        return
    parts = tin.split("|||||")
    text = parts[1]
    category = parts[0]
    if text == "None":
        return
    yield text, category

if __name__ == '__main__':
    # Map-only job: read the wiki extract through the custom input stream
    # and print the text of every result pair.
    job = Job().run(input =
                    ["/Users/pb/code/travolta/examples/data/wikiextract.xml"],
                    map = map,
                    map_input_stream = [wiki_input_stream],
                    required_modules = [('travolta.wiki', '/Users/pb/code/travolta/python')],
                    params = {"tag": "text"})
    for text, _cat in result_iterator(job.wait(show=True)):
        # single-argument call form works on both Python 2 and Python 3;
        # the former print statement is a syntax error on Python 3
        print(text)

예제 #42

0

파일 보기

파일: run_disco_tst.py 프로젝트: kod3r/mltsp

def disco_word_count():
    """Run a word-count job on a sample text url and print each result pair."""
    source_urls = ["http://discoproject.org/media/text/chekhov.txt"]
    job = Job().run(input=source_urls, map=map, reduce=reduce)
    pairs = result_iterator(job.wait(show=True))
    for word, count in pairs:
        print(word, count)

예제 #43

0

파일 보기

파일: xml_reader.py 프로젝트: nagyistge/discoproject.org-disco

        item = q.get()
        if item == 0:
            return
        yield item
        q.task_done()


def map(line, params):
    """Yield ``(word, 1)`` for every word of ``line``.

    Each character listed in ``unwanted`` is replaced by a space and the
    text is lower-cased before it is split on whitespace.  ``line`` has to
    be a unicode string because the translation table is keyed by code
    point.
    """
    unwanted = u",!.#()][{}-><=|/\"'*:?"
    # A dict comprehension builds the table without the ``dict`` name, so
    # the Python 2-only ``__builtin__`` import is no longer needed and the
    # function also runs on Python 3.
    table = {ord(char): u" " for char in unwanted}
    words = line.translate(table).lower()
    for word in words.split():
        yield word, 1


def reduce(iter, params):
    """Add up the counts per word and yield ``(word, total)``."""
    from disco.util import kvgroup
    pairs_in_order = sorted(iter)
    for word, ones in kvgroup(pairs_in_order):
        word_total = sum(ones)
        yield word, word_total


if __name__ == '__main__':
    # Count the words of the data stored under the DDFS tag.
    tag_url = "tag://" + DDFS_TAG
    job = Job().run(input=[tag_url],
                    map=map,
                    reduce=reduce,
                    map_reader=chain_reader)

    results = result_iterator(job.wait(show=True))
    for line, count in results:
        print(line, count)

예제 #44

0

파일 보기

파일: logistic_regression.py 프로젝트: sb123456789sb/discomll

def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Start jobs that calculate the theta parameters iteratively.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
    except ValueError:
        raise Exception("Parameters should be numerical.")
    if max_iterations < 1:
        raise Exception(
            "Parameter max_iterations should be greater than 0.")

    # thetas start at 0, with one extra entry for the intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    costs = [0]  # values of the J cost function, one per iteration
    for iteration in range(1, max_iterations + 1):
        job = Job(worker=Worker(save_results=save_results))
        # parallel mappers whose output is joined by a single reducer
        map_stage = Stage("map",
                          input_chain=dataset.params["input_chain"],
                          init=simple_init,
                          process=map_fit)
        reduce_stage = Stage("reduce",
                             init=simple_init,
                             process=reduce_fit,
                             combine=True)
        job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # thetas of the previous iteration
        job.run(name="logreg_fit_iter_%d" % iteration,
                input=dataset.params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for key, value in result_iterator(fitmodel_url):
            if key == "J":
                costs.append(value)  # remember the cost of this iteration
            else:
                thetas = value  # thetas for the next iteration

        # converged when the cost changed by less than alpha
        converged = np.abs(costs[-2] - costs[-1]) < alpha
        if converged:
            if show:
                print("Converged at iteration %d" % iteration)
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url

예제 #45

0

파일 보기

def map(line, params):
    """Yield ``(char, 1)`` for every character of the lower-cased ``line``
    that lies between 'a' and 'z'."""
    lowered = line.lower()
    for letter in lowered:
        if 'a' <= letter <= 'z':
            yield letter, 1

def reduce(iter, params):
    """Yield ``(char, total)`` with the counts of each character summed."""
    from disco.util import kvgroup
    ordered = sorted(iter)
    for symbol, ones in kvgroup(ordered):
        yield symbol, sum(ones)

# run the disco job
from disco.core import Job, result_iterator
job = Job().run(input=["http://en.wikipedia.org/wiki/MapReduce"], map=map, reduce=reduce)

# plot the results with matplotlib
#%matplotlib inline
# xs holds the characters, ys the matching counts
xs, ys = zip(*result_iterator(job.wait()))
import scipy
import numpy
from matplotlib import pylab
# numpy is used directly: scipy.arange / scipy.array are deprecated aliases
# of the NumPy functions
x = numpy.arange(len(xs))
y = numpy.array(ys)
f = pylab.figure()
ax = f.add_axes([0, 0, 3, 1])
ax.bar(x, y, align='center')
# one tick per character, labelled with the character itself
ax.set_xticks(x)
ax.set_xticklabels(xs)
f.show()

예제 #46

0

파일 보기

파일: page_rank.py 프로젝트: nagyistge/discoproject.org-disco

                neighbors = v
        score = 1 - d + d * sum_v
        yield node_id, str(node_id) + " " + str(score) + " " + neighbors


if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--iterations', default=10, help='Numbers of iteration')
    parser.add_option(
        '--damping-factor',
        default=0.85,
        help='probability a web surfer will continue clicking on links')

    (options, input) = parser.parse_args()

    # the positional arguments are the inputs of the first iteration
    results = input

    params = Params(damping_factor=float(options.damping_factor))

    # every iteration consumes the output of the previous one
    for j in range(int(options.iterations)):
        job = Job().run(input=results,
                        map=send_score,
                        map_reader=chain_reader,
                        reduce=receive_score,
                        params=params)
        results = job.wait()

    for _, node in result_iterator(results):
        fields = node.split()
        # Explicit formatting prints "<id> : <score>" on Python 2 and 3;
        # the former print statement is a syntax error on Python 3.
        print("%s : %s" % (fields[0], fields[1]))

예제 #47

0

파일 보기

def fit(dataset,
        trees_per_chunk=1,
        bootstrap=True,
        max_tree_nodes=50,
        min_samples_leaf=10,
        min_samples_split=5,
        class_majority=1,
        separate_max=True,
        measure="info_gain",
        accuracy=1,
        random_state=None,
        save_results=True,
        show=False):
    """Start a map/reduce job that fits a forest of decision trees.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    trees_per_chunk - integer greater than 0; values above 1 require bootstrap
    bootstrap - bool, selects ``map_fit_bootstrap`` instead of ``map_fit``
    max_tree_nodes - integer or None
    min_samples_leaf - integer greater than 0
    min_samples_split - integer greater than 0
    class_majority - number greater than 0
    separate_max - passed to the job unchanged
    measure - "info_gain" or "mdl"
    accuracy - integer, must not be negative
    random_state - passed to the job as "seed"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dict with the key "fddt_fitmodel" holding the result of ``job.wait``.

    Raises
    ------
    Exception when a parameter is not numerical, out of range, or when
    ``measure`` is unknown.
    """
    # All validation happens before Disco is imported so that a bad call
    # fails fast.
    try:
        trees_per_chunk = int(trees_per_chunk)
        if max_tree_nodes is not None:
            max_tree_nodes = int(max_tree_nodes)
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if trees_per_chunk > 1 and bootstrap == False:
        raise Exception(
            "Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap."
        )
    # Every condition is checked on its own.  The previous expression
    # combined the min_samples_split and accuracy checks with ``and``,
    # which binds tighter than ``or``, so either bad value alone slipped
    # through.
    if (trees_per_chunk <= 0
            or min_samples_leaf <= 0
            or class_majority <= 0
            or min_samples_split <= 0
            or accuracy < 0
            or not isinstance(bootstrap, bool)):
        raise Exception("Parameters should be greater than 0.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll
    # directory of the ensemble core modules shipped with the job
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=map_init,
               process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all',
         Stage("reduce", init=simple_init, process=reduce_fit, combine=True))
    ]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url

예제 #48

0

파일 보기

파일: resolve.py 프로젝트: zzdhidden/wifioss

                    dest='date', help='date')
    (options, args) = parser.parse_args()

    # class, input and output are all required
    if not (options.clazz and options.input and options.output):
        print(usage)
        exit(1)
    if options.clazz not in ('c+w', 'fixed'):
        print('class should be \'c+w\' or \'fixed\'')
        exit(1)

    dirList = os.listdir(options.input)

    # every .TXT / .txt file of the input directory becomes a job input
    input = ["file:///" + options.input + "/" + name for name in dirList
             if name.endswith((".TXT", ".txt"))]

    if options.clazz == 'c+w':
        mapfun = cw_map
    elif options.province in fixed_map_funs:
        # ``in`` replaces dict.has_key(), which does not exist on Python 3
        mapfun = fixed_map_funs[options.province]
    else:
        mapfun = fixed_map
    job = Job().run(input=input, map=mapfun)

    ctl_path = options.output + "/" + options.clazz + "-" + options.date + ".ctl"
    # ``with`` closes the control file even when the job or a write fails
    with open(ctl_path, "w") as ctl_file:
        sqldr_header(ctl_file)
        for user, line in result_iterator(job.wait(show=True)):
            # one result line per row, same output as ``print >>file, line``
            ctl_file.write("%s\n" % (line,))

예제 #49

0

파일 보기

파일: page_rank.py 프로젝트: AlexArgus/disco

        for t, v in vals:
            if t == "s":
                sum_v += v
            else:
                neighbors = v
        score = 1 - d + d * sum_v
        yield node_id, str(node_id) + " " + str(score) + " " + neighbors

if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--iterations',
                      default=10,
                      help='Numbers of iteration')
    parser.add_option('--damping-factor',
                      default=0.85,
                      help='probability a web surfer will continue clicking on links')

    (options, input) = parser.parse_args()

    # the positional arguments are the inputs of the first iteration
    results = input

    params = Params(damping_factor=float(options.damping_factor))

    # every iteration consumes the output of the previous one
    for j in range(int(options.iterations)):
        job = Job().run(input=results,
                        map=send_score,
                        map_reader=chain_reader,
                        reduce=receive_score,
                        params=params)
        results = job.wait()

    for _, node in result_iterator(results):
        fields = node.split()
        # Explicit formatting prints "<id> : <score>" on Python 2 and 3;
        # the former print statement is a syntax error on Python 3.
        print("%s : %s" % (fields[0], fields[1]))

예제 #50

0

파일 보기

# This program estimates the value of pi (3.14...)
# Usage:
# python estimate_pi.py

from disco.core import Job, result_iterator


def map(line, params):
    """Draw one random point in the unit square and yield ``(0, hit)``.

    ``hit`` is 1 when x*x + y*y < 1 for the point and 0 otherwise.  The
    ``line`` and ``params`` arguments are not used.
    """
    from random import random
    x, y = random(), random()
    inside = x * x + y * y < 1
    yield 0, (1 if inside else 0)


if __name__ == '__main__':
    COUNT = 5000
    # one map call per input entry, so COUNT random samples are drawn
    job = Job().run(input=["raw://0"] * COUNT, map=map)
    # number of samples that fell inside the quarter circle
    tot = sum(v for k, v in result_iterator(job.wait()))
    # The whole expression is inside print(): on Python 3 the old form
    # ``print(4.0 * tot) / COUNT`` divided the return value of print.
    print((4.0 * tot) / COUNT)

예제 #51

0

파일 보기

파일: test_job.py 프로젝트: isabella232/mongo-disco

from mongoDisco_output import MongoDBoutput
from disco.worker.classic.func import task_output_stream
import logging


def map(record, params):
    """Log the record's ``_id`` and yield ``(name, 1)``.

    Records without a ``name`` entry are counted under "NoName".
    """
    record_id = record.get('_id')
    logging.info("%s" % record_id)
    name = record.get('name', "NoName")
    yield name, 1


def reduce(iter, params):
    """Yield ``(name, total)`` with the counts of every name summed up."""
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for name, counts in kvgroup(ordered_pairs):
        name_total = sum(counts)
        yield name, name_total


def mongodb_output(stream, partition, url, params):
    """Output-stream hook: wrap ``stream`` in a ``MongoDBoutput``.

    ``partition`` and ``url`` are accepted but not used here.
    """
    # NOTE(review): the visible imports bind only the name `MongoDBoutput`,
    # not the module `mongoDisco_output` referenced here -- presumably the
    # module is made available where this function actually runs; confirm.
    writer = mongoDisco_output.MongoDBoutput(stream, params)
    return writer


if __name__ == '__main__':
    # The reduce output stream consists of the single mongodb_output hook.
    mongodb_stream = (mongodb_output,)
    job = Job().run(
        input=["mongodb://localhost/test.modforty"],
        map=map,
        reduce=reduce,
        reduce_output_stream=mongodb_stream)

    # The return value of wait() is not used by this script.
    job.wait(show=True)
#    for word, count in result_iterator(job.wait(show=True)):
#       print word, count

예제 #52

0

파일 보기

def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs, or an empty list when the fit model holds
    fewer than two class labels.

    Raises
    ------
    Exception - if m cannot be converted to float, or if fitmodel_url has no
    "naivebayes_fitmodel" entry.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except (TypeError, ValueError):
        # float() raises ValueError for unparsable strings and TypeError for
        # objects such as None or a list; both mean m is not numerical.
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    # fit model is loaded from ddfs
    fit_model = dict(
        (k, v)
        for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
    if len(fit_model["y_labels"]) < 2:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("There is only one class in training data.")
        return []

    if dataset.params["X_meta"].count("d") > 0:
        # There are discrete features in the model: the logarithms are
        # calculated once here instead of by every mapper.
        # Division warnings are silenced; nan_to_num below cleans up the
        # non-finite values that the logarithm can produce.
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            # Pop the per-label entries for this key (0 when an entry is
            # missing); they are replaced by one precomputed array.
            dist = [
                fit_model.pop((y, ) + iv, 0) for y in fit_model["y_labels"]
            ]
            smoothed = np.true_divide(
                np.array(dist) + m * fit_model["prior"],
                np.sum(dist) + m)
            fit_model[iv] = (np.nan_to_num(np.log(smoothed)) -
                             fit_model["prior_log"])
        del fit_model["iv"]

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    # job parameters (dataset object)
    # NOTE(review): this looks like it aliases dataset.params, in which case
    # the "fit_model" entry added below is also stored on the dataset's
    # params -- confirm whether callers rely on that.
    job.params = dataset.params
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results

예제 #53

0

파일 보기

파일: disco_job.py 프로젝트: pooya/github_crawler

def map(line, params):
    """Analyze the repositories of all users with a given follower count.

    ``line`` is parsed as an integer follower count.  For every user returned
    by ``github_crawler.get_users_with_n_followers`` each repository from
    ``get_user_parent_repos`` is cloned, and every item produced by
    ``analyze_repo`` for the clone is yielded unchanged.  Progress is printed
    along the way.  ``params`` is not used.
    """
    import github_crawler
    n = int(line)
    users = github_crawler.get_users_with_n_followers(n)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(str(len(users)) + " users")
    for user in users:
        repos = github_crawler.get_user_parent_repos(user)
        print(str(len(repos)) + " repos")
        for owner, repo, branch in repos:
            print(owner + "/" + repo + "#" + branch)
            directory = github_crawler.clone_repo(owner, repo, branch)
            for item in github_crawler.analyze_repo(directory):
                yield item


def reduce(iter, params):
    """Emit ``(extension, mean ratio)`` for every extension in the input.

    The pairs are sorted first so that ``kvgroup`` sees equal keys next to
    each other.  ``params`` is not used.
    """
    from disco.util import kvgroup
    ordered = sorted(iter)
    for ext, group in kvgroup(ordered):
        # Materialise the group: it is needed for both the sum and the count.
        values = list(group)
        # NOTE(review): on Python 2 this is integer division if every ratio
        # is an int -- presumably the ratios are floats; confirm.
        yield ext, sum(values) / len(values)


if __name__ == "__main__":
    # One raw:// input per follower count 0, 10, 20, ..., 990; map() parses
    # the count back out with int(line).
    input = ["raw://" + str(i) for i in range(0, 1000, 10)]
    job = Job().run(input=input,
                    map=map,
                    reduce=reduce,
                    required_files=["github_crawler.py"])
    for extension, avg in result_iterator(job.wait(show=True)):
        # Same text as the Python 2 statement `print extension, ": ", avg`
        # (note the two spaces after the colon), but valid on Python 3 too.
        print("%s :  %s" % (extension, avg))

예제 #54

0

파일 보기

파일: naive_bayes.py 프로젝트: sajal/disco

def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    """Run the naive Bayes estimation job and return log-likelihood scores.

    Parameters:
        input      -- job input, passed straight to ``Job.run``
        ys         -- iterable of class labels
        splitter   -- separator used inside the result keys
        map_reader -- map reader passed to ``Job.run``

    The job results are ``(key, count)`` pairs.  A key with a single field is
    the grand total (empty key), a class count (key in ``ys``) or an item
    count (anything else); a key with several fields is a class/item pair
    count.

    Returns a dict that holds two kinds of entries:
        ``y + splitter + i`` -- log(count[y, i]) - log(count[not y, i])
        first field of such a key -- the sum, over all pair keys sharing
            that first field, of
            log(count[y, i] + count[y, not i]) -
            log(count[not y, i] + count[not y, not i])
    All counts include a pseudocount of one.
    """
    import math

    ys = dict.fromkeys(ys, 1)

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}

    # the number of times the classes have been observed.  For
    # example,  if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}

    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        parts = key.split(splitter)
        value = int(value)
        if len(parts) == 1:
            name = parts[0]
            if name == '':
                total = value
            elif name in ys:
                classes[name] = value
            else:
                items[name] = value
        else:
            pairs[key] = value

    # counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            both = pairs.get(key, 0)
            item_only = items[i] - both
            # a class that was never observed contributes nothing
            class_only = classes[y] - both if y in classes else 0
            neither = total - (both + item_only + class_only)

            # add pseudocounts
            counts[key] = [both + 1, item_only + 1, class_only + 1,
                           neither + 1]

    loglikelihoods = {}
    for key, value in counts.items():
        label = key.split(splitter)[0]
        if label not in loglikelihoods:
            loglikelihoods[label] = 0.0
        loglikelihoods[label] += (math.log(value[0] + value[2]) -
                                  math.log(value[1] + value[3]))
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods