def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    """
    Function starts two jobs that compare predictions with the test data.

    The first job parses the test data with map_test_data. The second job takes the
    output of the first job together with the predictions, maps them with
    map_predictions and reduces them with reduce_ca (measure "ca") or reduce_mse
    (measure "mse").

    Parameters
    ----------
    test_data - dataset object with input urls and other parameters
    predictions - list that is appended to the output of the first job
    measure - "ca" or "mse"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    First (key, value) pair produced by the second job, or ("No predictions", None)
    if predictions is an empty list.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # define a job and set save of results to ddfs
    # (the job used to be constructed twice; the first instance was never used)
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init,
                        process=map_test_data))]
    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                        process=map_predictions)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_proces, sort=True, combine=True))]
    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # separate local names so that the `measure` argument is not overwritten
    scores = [(name, value) for name, value in result_iterator(job.wait(show=show))]
    return scores[0]
class DiscoJob():
    """
    Wrapper that configures a Disco job reading its input through mongodb_input_stream.

    The results are either printed to stdout or written through
    mongodb_output_stream, depending on the 'print_to_stdout' setting.
    """

    # modules passed as required_modules with every job
    _REQUIRED_MODULES = ['mongodb_io', 'mongodb_input', 'config_util', 'mongo_util', 'mongodb_output']

    def __init__(self, config, map, reduce):
        """
        Parameters
        ----------
        config - mapping with user settings, copied over config_util.config
        map - map function of the job
        reduce - reduce function of the job
        """
        import config_util

        # NOTE(review): this is the module-level object of config_util, so the
        # settings written below are visible through config_util.config as well.
        self.config = config_util.config

        for item in config:
            self.config[item] = config[item]

        # if the user doesn't specify output, print to stdout
        # (set on self.config only, the caller's dict is left untouched)
        if not config.get('output_uri') and not config.get('print_to_stdout'):
            self.config['print_to_stdout'] = True

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params()
        for key in self.config:
            self.params.__dict__[key] = self.config[key]

    def run(self):
        """
        Start the job. With 'print_to_stdout' set the results are printed as
        "key value" lines, otherwise the reduce output goes through
        mongodb_output_stream and the job is waited for only if 'job_wait' is set.
        """
        to_stdout = self.config['print_to_stdout']

        # arguments shared by both output modes
        run_args = dict(input=do_split(self.config),
                        map=self.map,
                        reduce=self.reduce,
                        params=self.params,
                        map_input_stream=mongodb_input_stream,
                        required_modules=list(self._REQUIRED_MODULES))
        if not to_stdout:
            run_args['reduce_output_stream'] = mongodb_output_stream

        self.job.run(**run_args)

        if to_stdout:
            for key, value in result_iterator(self.job.wait(show=True)):
                print("%s %s" % (key, value))
        elif self.config.get("job_wait", False):
            self.job.wait(show=True)
def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    """
    Function starts two jobs that compare predictions with the test data.

    The first job parses the test data with map_test_data. The second job takes the
    output of the first job together with the predictions, maps them with
    map_predictions and reduces them with reduce_ca (measure "ca") or reduce_mse
    (measure "mse").

    Parameters
    ----------
    test_data - dataset object with input urls and other parameters
    predictions - list that is appended to the output of the first job
    measure - "ca" or "mse"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    First (key, value) pair produced by the second job, or ("No predictions", None)
    if predictions is an empty list.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # define a job and set save of results to ddfs
    # (the job used to be constructed twice; the first instance was never used)
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=test_data.params["input_chain"],
                           init=simple_init,
                           process=map_test_data))]
    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           init=simple_init,
                           input_chain=[task_input_stream, chain_reader],
                           process=map_predictions)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_proces,
                           sort=True,
                           combine=True))]
    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # separate local names so that the `measure` argument is not overwritten
    scores = [(name, value) for name, value in result_iterator(job.wait(show=show))]
    return scores[0]
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given forest model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dictionary with the key "drf_fitmodel"
    voting - use map_predict_voting instead of map_predict_dist as the map function
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait, or an empty list if the loaded forest is empty
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # <directory of discomll>/ensemble/core/ (with a trailing slash)
    package_parts = discomll.__file__.split("/")[:-1]
    path = "/".join(package_parts + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict_voting if voting else map_predict_dist)
    job.pipeline = [("split", map_stage)]

    # values of the fit model are stored next to the dataset parameters
    job.params = dataset.params
    for key, value in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[key] = value

    if len(job.params["forest"]) == 0:
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
def auth(clazz, province, input, output, date):
    """
    Run a map job over the auth files of one day and write the result lines
    to the file <output>/<clazz>-<date>.ctl.

    Parameters
    ----------
    clazz - 'c+w' selects a map function from cw_map_funs (default cw_map), any
            other value selects from fixed_map_funs (default fixed_map)
    province - key used to look up the map function
    input - directory that is searched for the auth files
    output - directory of the .ctl file
    date - date in the format %Y%m%d; files whose name matches the date in this
           format or in the format %Y-%m-%d are used as job input
    """
    dir_list = os.listdir(input)
    ptime = datetime.strptime(date, "%Y%m%d")
    file_filter = ptime.strftime('%Y-%m-%d')
    urls = ["file:///" + input + "/" + name
            for name in dir_list
            if re.search(date, name) or re.search(file_filter, name)]

    if not urls:
        print('resolve.py: Can not find any auth files.')
        return

    # province specific map function, with a fallback for each class
    if clazz == 'c+w':
        mapfun = cw_map_funs.get(province, cw_map)
    else:
        mapfun = fixed_map_funs.get(province, fixed_map)

    job = Job().run(input=urls, map=mapfun)

    # the file is closed even if the job or the iteration of results fails
    with open(output + "/" + clazz + "-" + date + ".ctl", "w") as ctl_file:
        sqldr_header(ctl_file)
        for user, line in result_iterator(job.wait(show=True)):
            ctl_file.write("%s\n" % (line,))
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given forest model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dictionary with the key "dwfr_fitmodel"
    coeff - converted to float and passed to the job, negative values are rejected
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait, or an empty list if the loaded forest is empty
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # <directory of discomll>/ensemble/core/ (with a trailing slash)
    package_parts = discomll.__file__.split("/")[:-1]
    path = "/".join(package_parts + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")
    # NOTE(review): 0 passes this check although the message says "greater than 0"
    if coeff < 0:
        raise Exception("Parameter coeff should be greater than 0.")

    # values of the fit model are stored next to the dataset parameters
    job.params = dataset.params
    job.params["coeff"] = coeff
    for key, value in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[key] = value

    if len(job.params["forest"]) == 0:
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_rand_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase, dictionary with the key "linsvm_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    job.params = dataset.params
    # first value stored in the fit model
    model_values = [value for _, value in result_iterator(fitmodel_url["linsvm_fitmodel"])]
    job.params["fit_params"] = model_values[0]

    job.run(name="linsvm_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. Mappers (map_fit) run in parallel, their
    intermediate pairs are sorted and joined by one reducer (reduce_fit).

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "naivebayes_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # results of the job are saved to ddfs if save_results is set
    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    # the dataset parameters are the job parameters
    job.params = dataset.params

    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    return {"naivebayes_fitmodel": job.wait(show=show)}
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    Function reads the samples of fitting_data with one job and then processes them
    in batches with _fit_predict. The results of all batches are merged into a
    single ddfs tag.

    Parameters
    ----------
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the
          query point x. Should be greater than 0.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0 (or
          less), algorithm will calculate number of samples per job.
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    List with one tag url ("tag://" + name of the read job)
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
    except ValueError:
        raise Exception("Parameter tau should be numerical.")
    if tau <= 0:
        raise Exception("Parameter tau should be > 0.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init,
                        process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job <= 0:  # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:  # there is more than 100 attributes
                # linear function; its value is a float and drops below 1 for a large
                # number of attributes, so it is converted to an int of at least 1
                samples_per_job = int(max(1, len(x) * -25 / 900.0 + 53))

        samples[test_id] = x
        # compare with >= on the batch size: an equality test against a float
        # never matched and every batch now has the same size
        if len(samples) >= samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase, dictionary with the key "naivebayes_fitmodel"
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs, or an empty list if the model has less than two labels
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    # fit model is loaded from ddfs
    fit_model = dict((key, value) for key, value in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
    if len(fit_model["y_labels"]) < 2:
        print("There is only one class in training data.")
        return []

    has_discrete_features = dataset.params["X_meta"].count("d") > 0
    if has_discrete_features:
        # logarithms are calculated here once, so that mappers do not repeat the calculation
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            # entries keyed by (label,) + iv are removed and replaced by one entry keyed by iv
            dist = [fit_model.pop((label,) + iv, 0) for label in fit_model["y_labels"]]
            numerator = np.array(dist) + m * fit_model["prior"]
            denominator = np.sum(dist) + m
            log_ratio = np.nan_to_num(np.log(np.true_divide(numerator, denominator)))
            fit_model[iv] = log_ratio - fit_model["prior_log"]
        del fit_model["iv"]

    # results of the job are saved to ddfs if save_results is set
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    # the dataset parameters extended with the model are the job parameters
    job.params = dataset.params
    job.params["fit_model"] = fit_model

    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts jobs for calculation of theta parameters. One job is started per
    iteration until the change of the J cost function is below alpha or
    max_iterations is reached.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results of the last iteration under the key "logreg_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")

    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
    except ValueError:
        raise Exception("Parameters should be numerical.")
    if max_iterations < 1:
        raise Exception("Parameter max_iterations should be greater than 0.")

    # thetas start at 0, one additional element is for the intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    cost_history = [0]  # values of J cost function, one is appended in every iteration
    for iteration in range(1, max_iterations + 1):
        job = Job(worker=Worker(save_results=save_results))

        # job parallelizes mappers and joins them with one reducer
        map_stage = Stage("map",
                          input_chain=dataset.params["input_chain"],
                          init=simple_init,
                          process=map_fit)
        reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
        job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

        job.params = dataset.params
        job.params["thetas"] = thetas  # thetas of the previous iteration

        job.run(name="logreg_fit_iter_%d" % iteration, input=dataset.params["data_tag"])
        fitmodel_url = job.wait(show=show)

        for key, value in result_iterator(fitmodel_url):
            if key != "J":
                thetas = value  # new thetas
            else:
                cost_history.append(value)  # value of J cost function

        # converged when the last two values of J differ by less than alpha
        if np.abs(cost_history[-2] - cost_history[-1]) < alpha:
            if show:
                print("Converged at iteration %d" % iteration)
            break

    return {"logreg_fitmodel": fitmodel_url}
def predict(input, loglikelihoods, ys, splitter=" ", map_reader=chain_reader):
    """
    Run the job "naive_bayes_predict" with predict_map as the map function.

    Parameters
    ----------
    input - input of the job
    loglikelihoods - passed to the map function through the job parameters
    ys - iterable of labels, passed to the map function as a dictionary label -> 1
    splitter - passed to the map function through the job parameters
    map_reader - reader of the map input

    Returns
    -------
    Result of job.wait
    """
    job = Job(name="naive_bayes_predict")

    # every label is a key with the value 1
    label_lookup = dict.fromkeys(ys, 1)
    job_params = Params(loglikelihoods=loglikelihoods, ys=label_lookup, splitter=splitter)

    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=job_params,
            clean=False)
    return job.wait()
def main():
    """
    Run a job with read_twitter as map and reduce as reduce function over the news
    file given in the command line arguments, and write the results to the file
    "output_result", one "word<TAB>count" record per line.
    """
    args = parse_args()
    news_file = args.news_file

    job = Job().run(
        input=news_file,
        map_reader=disco.worker.classic.func.chain_reader,
        map=read_twitter,
        reduce=reduce)

    with open("output_result", 'w') as out:
        for word, count in result_iterator(job.wait(show=False)):
            # terminate every record; without the newline all records were
            # written as one unbroken line
            out.write(word + "\t" + str(count) + "\n")
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    """
    Run the job 'naive_bayes_predict' with predict_map as the map function.

    Parameters
    ----------
    input - input of the job
    loglikelihoods - passed to the map function through the job parameters
    ys - iterable of labels, passed to the map function as a dictionary label -> 1
    splitter - passed to the map function through the job parameters
    map_reader - reader of the map input

    Returns
    -------
    Result of job.wait
    """
    job = Job(name='naive_bayes_predict')

    # every label is a key with the value 1
    label_lookup = dict.fromkeys(ys, 1)
    job_params = Params(loglikelihoods=loglikelihoods, ys=label_lookup, splitter=splitter)

    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=job_params,
            clean=False)
    return job.wait()
def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5,
        class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None, save_results=True,
        show=False):
    """
    Function starts a job that builds decision trees with parallel mappers and joins
    the results with one reducer.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    trees_per_chunk - converted to int, should be greater than 0 and 1 if bootstrap is disabled
    bootstrap - bool, selects map_fit_bootstrap (True) or map_fit (False) as the map function
    max_tree_nodes - converted to int unless it is None
    min_samples_leaf - converted to int, should be greater than 0
    min_samples_split - converted to int, should be greater than 0
    class_majority - converted to float, should be greater than 0
    separate_max - passed to the job unchanged
    measure - "info_gain" or "mdl"
    accuracy - converted to int, should not be negative
    random_state - passed to the job as the parameter "seed"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "fddt_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes is not None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if trees_per_chunk > 1 and bootstrap == False:
        raise Exception("Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap.")

    # every condition is checked on its own; `and` used to bind min_samples_split
    # and accuracy together, so a single invalid value of them passed the check
    if (trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0
            or accuracy < 0 or not isinstance(bootstrap, bool)):
        raise Exception("Parameters should be greater than 0.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init,
                        process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
def fit(dataset, save_results=True, show=False):
    """
    Function starts the job "linreg_fit". Mappers (map_fit) run in parallel and are
    joined by one reducer (reduce_fit).

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "linreg_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    return {"linreg_fitmodel": job.wait(show=show)}
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts the job "linreg_predict" that applies map_predict to input data.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dictionary with the key "linreg_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    job.params = dataset.params
    # first value stored in the fit model
    model_values = [value for _, value in result_iterator(fitmodel_url["linreg_fitmodel"])]
    job.params["thetas"] = model_values[0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
def measure(input, save_results=True, show=False):
    """
    Function starts the job "Distribution". Mappers (map_fit) run in parallel and
    are joined by one reducer (reduce_fit).

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait (results url)
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # results of the job are saved to ddfs if save_results is set
    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=input.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    # the dataset parameters are the job parameters
    job.params = input.params

    job.run(name="Distribution", input=input.params["data_tag"])
    return job.wait(show=show)
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier, should be greater than 0
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "linsvm_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception(
            "Linear proximal SVM requires a target label mapping parameter.")

    try:
        nu = float(nu)
    except ValueError:
        raise Exception("Parameter should be numerical.")
    if nu <= 0:
        raise Exception("Parameter nu should be greater than 0")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.params["nu"] = nu

    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    return {"linsvm_fitmodel": job.wait(show=show)}
def _fit_predict(fit_data, samples, tau, save_results, show):
    """
    Function starts the job "lwlr_fit_predict". Mappers (map_fit) run in parallel,
    their intermediate pairs are sorted and joined by one reducer (reduce_fit).

    Parameters
    ----------
    fit_data - dataset object with input urls and other parameters
    samples - passed to the job as the parameter "samples"
    tau - passed to the job as the parameter "tau"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=fit_data.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    # the dataset parameters extended with tau and samples are the job parameters
    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
def process_prediction_data_featurization_with_disco(input_list, params, partitions=4):
    '''
    Called from within featurize_prediction_data_in_parallel.

    Runs a Disco job with pred_map as map and pred_featurize_reduce as
    reduce function and returns disco.core.result_iterator over its results.

    Arguments:
        input_list: path to file listing filename,unused_string for each
            individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce
            function.
        partitions: Number of nodes/partitions in system.
    '''
    from disco.core import Job, result_iterator

    submitted_job = Job().run(
        input=input_list,
        map=pred_map,
        partitions=partitions,
        reduce=pred_featurize_reduce,
        params=params)

    # wait for the job with its progress shown
    result_urls = submitted_job.wait(show=True)
    return result_iterator(result_urls)
def process_featurization_with_disco(input_list, params, partitions=4):
    '''
    Called from within featurize_in_parallel.

    Runs a Disco job with map as map and featurize_reduce as reduce function
    and returns disco.core.result_iterator over its results.

    Arguments:
        input_list: path to file listing filename,class_name for each
            individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce
            function.
        partitions: Number of nodes/partitions in system.
    '''
    from disco.core import Job, result_iterator

    submitted_job = Job().run(
        input=input_list,
        map=map,
        partitions=partitions,
        reduce=featurize_reduce,
        params=params)

    # wait for the job with its progress shown
    result_urls = submitted_job.wait(show=True)
    return result_iterator(result_urls)
def main():
    """
    Run a sorted map-reduce job over TRAIN_IN and number the values of every
    category. Categories with more than MAX_CATEGORICAL_OPTIONS values are
    skipped. The mapping value -> number is written to CATEGORY_MAPPING_OUT
    and the number of values per category to CATEGORY_STATUS_OUT.
    """
    job = Job().run(input=[TRAIN_IN], map=mapper, reduce=reducer, sort=True)

    category_options = defaultdict(dict)  # category id -> {value: number}
    category_values = defaultdict(int)    # category id -> next free number

    for cat_id, counter in result_iterator(job.wait(show=True)):
        if len(counter) <= MAX_CATEGORICAL_OPTIONS:
            for cat_value in counter:
                if cat_value in category_options[cat_id]:
                    continue
                # a new value gets the next number of its category
                category_options[cat_id][cat_value] = category_values[cat_id]
                category_values[cat_id] += 1

    # save possible categorical data
    with open(CATEGORY_MAPPING_OUT, 'w') as mapping_file:
        mapping_file.write(dumps(category_options))
    with open(CATEGORY_STATUS_OUT, 'w') as status_file:
        status_file.write(dumps(category_values))
def fit_model_disco(data_dict, featureset_key, model_type):
    """Run a reduce-only Disco job and return the first element of its last result.

    The three arguments are passed to the reduce function through the job
    parameters under the keys "data_dict", "featureset_key" and "model_type".
    Returns None if the job produces no results.
    """
    from disco.core import Job, result_iterator

    job_params = {
        "data_dict": data_dict,
        "featureset_key": featureset_key,
        "model_type": model_type,
    }
    # single placeholder entry used as the job input
    input_list = ["placeholder"]

    package_root = os.path.dirname(os.path.dirname(__file__))
    job = Job('with_modules').run(
        input=input_list,
        reduce=reduce,
        params=job_params,
        required_modules=[("mltsp", package_root), "sklearn"])

    # the first element of the last (object, string) pair is returned
    fitted = [rf_obj for rf_obj, dummy_str in result_iterator(job.wait(show=True))]
    if fitted:
        return fitted[-1]
    return None
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts the job "linreg_predict" that applies map_predict to input data.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dictionary with the key "linreg_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    job.params = dataset.params
    # first value stored in the fit model
    model_values = [value for _, value in result_iterator(fitmodel_url["linreg_fitmodel"])]
    job.params["thetas"] = model_values[0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase, dictionary with the key "logreg_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    # the dataset parameters extended with thetas are the job parameters
    job.params = dataset.params
    # thetas are loaded from ddfs, the first value with the key "thetas" is used
    stored_thetas = [value
                     for key, value in result_iterator(fitmodel_url["logreg_fitmodel"])
                     if key == "thetas"]
    job.params["thetas"] = stored_thetas[0]

    job.run(name="logreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
def fit(dataset, save_results=True, show=False):
    """
    Function starts the job "linreg_fit". Mappers (map_fit) run in parallel and are
    joined by one reducer (reduce_fit).

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "linreg_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    return {"linreg_fitmodel": job.wait(show=show)}
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier, should be greater than 0
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "linsvm_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception("Linear proximal SVM requires a target label mapping parameter.")

    try:
        nu = float(nu)
    except ValueError:
        raise Exception("Parameter should be numerical.")
    if nu <= 0:
        raise Exception("Parameter nu should be greater than 0")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.params["nu"] = nu

    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    return {"linsvm_fitmodel": job.wait(show=show)}
def process_featurization_with_disco(input_list, params, partitions=4):
    """Featurize time-series data in parallel as a Disco job.

    Called from within the `featurize_in_parallel` function. The job reads
    its input with `custom_reader` and uses `map` and `featurize_reduce` as
    map and reduce functions.

    Parameters
    ----------
    input_list : str
        Path to file listing the file name and class name
        (comma-separated) for each individual time series data file,
        one per line.
    params : dict
        Dictionary of parameters to be passed to each map & reduce
        function.
    partitions : int, optional
        Number of nodes/partitions in system. Defaults to 4.

    Returns
    -------
    iterator
        disco.core.result_iterator(), an iterator of two-element
        tuples, each containing the file name of the original time
        series data file, and a dictionary of the associated features
        generated.

    """
    from disco.core import Job, result_iterator

    # directory two levels above this file, registered as the module "mltsp"
    package_root = os.path.dirname(os.path.dirname(__file__))

    submitted_job = Job('with_modules').run(
        input=input_list,
        map_reader=custom_reader,
        map=map,
        partitions=partitions,
        reduce=featurize_reduce,
        params=params,
        required_modules=[("mltsp", package_root)])

    result_urls = submitted_job.wait(show=True)
    return result_iterator(result_urls)
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. Mappers (map_fit) run in parallel, their
    intermediate pairs are sorted and joined by one reducer (reduce_fit).

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the key "naivebayes_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # results of the job are saved to ddfs if save_results is set
    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    # the dataset parameters are the job parameters
    job.params = dataset.params

    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    return {"naivebayes_fitmodel": job.wait(show=show)}
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given forest model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dictionary with the key "dwf_fitmodel"
    coeff - converted to float and passed to the job, negative values are rejected
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait, or an empty list if the loaded forest is empty
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # <directory of discomll>/ensemble/core/ (with a trailing slash)
    package_parts = discomll.__file__.split("/")[:-1]
    path = "/".join(package_parts + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict)
    job.pipeline = [("split", map_stage)]

    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")
    # NOTE(review): 0 passes this check although the message says "greater than 0"
    if coeff < 0:
        raise Exception("Parameter coeff should be greater than 0.")

    # values of the fit model are stored next to the dataset parameters
    job.params = dataset.params
    job.params["coeff"] = coeff
    for key, value in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[key] = value

    if len(job.params["forest"]) == 0:
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
def _fit_predict(fit_data, samples, tau, save_results, show):
    """
    Function starts the job "lwlr_fit_predict". Mappers (map_fit) run in parallel,
    their intermediate pairs are sorted and joined by one reducer (reduce_fit).

    Parameters
    ----------
    fit_data - dataset object with input urls and other parameters
    samples - passed to the job as the parameter "samples"
    tau - passed to the job as the parameter "tau"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=fit_data.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    # the dataset parameters extended with tau and samples are the job parameters
    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given forest model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - dictionary with the key "drf_fitmodel"
    voting - use map_predict_voting instead of map_predict_dist as the map function
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait, or an empty list if the loaded forest is empty
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # <directory of discomll>/ensemble/core/ (with a trailing slash)
    package_parts = discomll.__file__.split("/")[:-1]
    path = "/".join(package_parts + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_predict_voting if voting else map_predict_dist)
    job.pipeline = [("split", map_stage)]

    # values of the fit model are stored next to the dataset parameters
    job.params = dataset.params
    for key, value in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[key] = value

    if len(job.params["forest"]) == 0:
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)
from disco.core import Job, result_iterator


def map(line, params):
    """Emit (word, 1) for every whitespace-separated word of line."""
    for word in line.split():
        yield word, 1


def reduce(iter, params):
    """Sum the emitted counts per word and yield (word, total)."""
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


'''
def mongodb_output(stream,partition,url,params):
    return mongoDisco_output.MongoDBoutput(stream,params)
'''

if __name__ == '__main__':
    # NOTE(review): mongodb_input_stream and mongodb_output_stream are not
    # defined or imported in the visible part of this script - confirm where
    # they are expected to come from.
    # A comma was missing between the last two keyword arguments, which made
    # the whole script a SyntaxError.
    job = Job().run(input=["r"],
                    map=map,
                    reduce=reduce,
                    map_input_stream=mongodb_input_stream,
                    reduce_output_stream=mongodb_output_stream)

    job.wait(show=True)
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    Function reads the samples of fitting_data with a "lwlr_read_data" job and
    fits them to training_data in batches, one _fit_predict job per batch.

    Parameters
    ----------
    training_data - training samples
    fitting_data - dataset to be fitted to training data; its id_index has to be set
    tau - controls how quickly the weight of a training sample falls off with
        distance of its x(i) from the query point x. Has to be greater than 0.
    samples_per_job - define a number of samples that will be processed in single
        mapreduce job. If 0, algorithm will calculate number of samples per job.
    save_results - forwarded to the Worker of every job
    show - forwarded to job.wait of every job

    Returns
    -------
    List with a single "tag://<job name>" url under which the results of all
    batches are merged

    Raises
    ------
    Exception - if tau is not numerical, tau is not greater than 0 or
        fitting_data has no id_index
    """
    # validate the arguments before anything is imported or started
    try:
        tau = float(tau)
    except ValueError:
        raise Exception("Parameter tau should be numerical.")
    if tau <= 0:
        raise Exception("Parameter tau should be > 0.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:  # there is more than 100 attributes
                # linear function; truncated to a whole number (a fractional
                # size could never be matched by a sample count) and kept at
                # one sample or more for very wide data
                samples_per_job = max(int(len(x) * -25 / 900. + 53), 1)

        samples[test_id] = x
        # every batch, including the first one, holds samples_per_job samples
        if len(samples) >= samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
we know we do not know.\ But there are also unknown unknowns.\ There are things \ we do not know we don't know", map=map, reduce=reduce) sort_in_numerical_order =\ open('SortNumerical.txt', 'w') sort_in_alpbabetically_order = \ open('SortAlphabetical.txt', 'w') wordCount = [] for word, count in \ result_iterator(job.wait(show=True)): sort_in_alpbabetically_order.write('%s \t %d\n' % (str(word), int(count))) wordCount.append((word, count)) sortedWordCount =sorted(wordCount, \ key=lambda count: count[1],\ reverse=True) for word, count in sortedWordCount: sort_in_numerical_order.write('%s \t %d\n'\ % (str(word), int(count)) ) sort_in_alpbabetically_order.close() sort_in_numerical_order.close()
from disco.core import Job, result_iterator def map(line, params): for word in line.split(): yield word, 1 def reduce(iter, params): from disco.util import kvgroup for word, counts in kvgroup(sorted(iter)): yield word, sum(counts) if __name__ == '__main__': job = Job().run(input=["erl://erl_inputs:test/dummy"], map=map, reduce=reduce) for word, count in result_iterator(job.wait(show=True)): print word, count
class DiscoJob(): DEFAULT_CONFIG = { "job_output_key": "_id", "job_output_value": "value", "input_uri": "mongodb://localhost/test.in", "output_uri": "mongodb://localhost/test.out", "print_to_stdout": False, "job_wait": True, "split_size": 8, "split_key": { "_id": 1 }, "create_input_splits": True, "use_shards": False, "use_chunks": True, "slave_ok": False, "limit": 0, "skip": 0, "input_key": None, "sort": None, "timeout": False, "fields": None, "query": {} } def __init__(self, config, map, reduce): self.config = DiscoJob.DEFAULT_CONFIG.copy() self.config.update(config) self.map = map self.reduce = reduce self.job = Job() self.params = Params(**self.config) def run(self): if self.config['print_to_stdout']: self.job.run(input=do_split(self.config), map=self.map, reduce=self.reduce, params=self.params, map_input_stream=mongodb_input_stream, required_modules=[ 'mongodisco.mongodb_io', 'mongodisco.mongodb_input', 'mongodisco.mongo_util', 'mongodisco.mongodb_output' ]) for key, value in result_iterator(self.job.wait(show=True)): print key, value else: self.job.run(input=do_split(self.config), map=self.map, reduce=self.reduce, params=self.params, map_input_stream=mongodb_input_stream, reduce_output_stream=mongodb_output_stream, required_modules=[ 'mongodisco.mongodb_io', 'mongodisco.mongodb_input', 'mongodisco.mongo_util', 'mongodisco.mongodb_output' ]) if self.config.get("job_wait", False): self.job.wait(show=True)
def reduce(iter, params):
    """
    Yield (key, value) with the most frequent value of every key.

    The pairs are sorted and grouped by key. A key is only emitted when its
    most frequent value occurs more than once. Which value wins a tie is
    unspecified.
    """
    # imported locally so the function stays self-contained
    from collections import Counter
    from disco.util import kvgroup

    for key, values in kvgroup(sorted(iter)):
        # one counting pass instead of list.count() for every distinct value
        most_common = Counter(values).most_common(1)
        if not most_common:
            continue
        day, occurrences = most_common[0]
        if occurrences > 1:
            yield key, day


if __name__ == '__main__':
    job = Job().run(input=["data:vcobssplit"], map=map, reduce=reduce)

    # the output file can be overridden by the first command line argument
    output_filename = "output.csv"
    if len(sys.argv) > 1:
        output_filename = sys.argv[1]

    with open(output_filename, 'w') as fp:
        writer = csv.writer(fp)
        for key, date in result_iterator(job.wait(show=True)):
            writer.writerow([key, date])
tweeter, tweet = count_tweet_words.get_username_tweet(line) # return each word in the tweet (to count frequency of each term) for word in tweet.split(): yield word, 1 def reduce(iter, params): from disco.util import kvgroup for word, counts in kvgroup(sorted(iter)): yield word, sum(counts) if __name__ == '__main__': input_filename = "./tweet_data/tweets_357.json" #input_filename = "./tweet_data/tweets_859157.json" #input_filename = "/media/3TBStorage/tweets_all.json" # we need a fully qualified file name for the server fully_qualified_path = os.path.realpath(input_filename) input = [fully_qualified_path] # import this module so pickle knows what to send to workers import count_tweet_words job = Job().run(input=input, map=map, reduce=reduce) out = open(OUTPUT_FILENAME, 'w') for word, count in result_iterator(job.wait(show=True)): #print(word, count) out.write(json.dumps([word, count]) + '\n')
from disco.core import Job, result_iterator from travolta.wiki import wiki_input_stream def map(tin, params): if tin: l = tin.split("|||||") text = l[1] cat = l[0] if text != "None": yield text, cat if __name__ == '__main__': job = Job().run(input = ["/Users/pb/code/travolta/examples/data/wikiextract.xml"], map = map, map_input_stream = [wiki_input_stream], required_modules = [('travolta.wiki', '/Users/pb/code/travolta/python')], params = {"tag": "text"}) for text, _cat in result_iterator(job.wait(show=True)): print text
def disco_word_count():
    """Run the map/reduce job on the chekhov.txt url and print every result."""
    source_urls = ["http://discoproject.org/media/text/chekhov.txt"]
    job = Job().run(input=source_urls, map=map, reduce=reduce)
    results = job.wait(show=True)
    for word, count in result_iterator(results):
        print(word, count)
item = q.get() if item == 0: return yield item q.task_done() def map(line, params): import __builtin__ unwanted = u",!.#()][{}-><=|/\"'*:?" words = line.translate( __builtin__.dict.fromkeys([ord(x) for x in unwanted], u" ")).lower() for word in words.split(): yield word, 1 def reduce(iter, params): from disco.util import kvgroup for word, counts in kvgroup(sorted(iter)): yield word, sum(counts) if __name__ == '__main__': job = Job().run(input=["tag://" + DDFS_TAG], map=map, reduce=reduce, map_reader=chain_reader) for line, count in result_iterator(job.wait(show=True)): print(line, count)
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function runs one map/reduce job per iteration to calculate theta parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value; iterating stops when the last two J values differ by less
    max_iterations - maximum number of iterations (at least 1)
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with the result urls of the last iteration under "logreg_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")

    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
    except ValueError:
        raise Exception("Parameters should be numerical.")
    if max_iterations < 1:
        raise Exception("Parameter max_iterations should be greater than 0.")

    # thetas start at 0, with one extra slot for the intercept term
    n_features = len(dataset.params["X_indices"])
    thetas = np.zeros(n_features + 1)
    cost_history = [0]  # values of the J cost function, one per iteration

    for iteration in range(1, max_iterations + 1):
        job = Job(worker=Worker(save_results=save_results))

        # mappers run in parallel and are joined by one reducer
        map_stage = Stage("map",
                          input_chain=dataset.params["input_chain"],
                          init=simple_init,
                          process=map_fit)
        reduce_stage = Stage("reduce",
                             init=simple_init,
                             process=reduce_fit,
                             combine=True)
        job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

        job.params = dataset.params  # the dataset's own params object
        job.params["thetas"] = thetas  # thetas of the previous iteration

        job.run(name="logreg_fit_iter_%d" % iteration,
                input=dataset.params["data_tag"])
        fitmodel_url = job.wait(show=show)

        for key, value in result_iterator(fitmodel_url):
            if key == "J":
                cost_history.append(value)
            else:
                thetas = value

        # converged once the cost changed by less than alpha
        converged = np.abs(cost_history[-2] - cost_history[-1]) < alpha
        if converged:
            if show:
                print("Converged at iteration %d" % iteration)
            break

    return {"logreg_fitmodel": fitmodel_url}
def map(line, params):
    """Emit (char, 1) for every character of the lowercased line in 'a'..'z'."""
    for char in line.lower():
        if 'a' <= char <= 'z':
            yield char, 1


def reduce(iter, params):
    """Group the sorted pairs by character and yield (char, sum of its values)."""
    from disco.util import kvgroup
    grouped = kvgroup(sorted(iter))
    for char, ones in grouped:
        yield char, sum(ones)


# run the disco job
from disco.core import Job, result_iterator
job = Job().run(input=["http://en.wikipedia.org/wiki/MapReduce"],
                map=map,
                reduce=reduce)

# plot the results with matplotlib
#%matplotlib inline
results = result_iterator(job.wait())
xs, ys = zip(*results)

import scipy
from matplotlib import pylab
x = scipy.arange(len(xs))
y = scipy.array(ys)

# one bar per character, labelled with the character itself
f = pylab.figure()
ax = f.add_axes([0, 0, 3, 1])
ax.bar(x, y, align='center')
ax.set_xticks(x)
ax.set_xticklabels(xs)
f.show()
neighbors = v score = 1 - d + d * sum_v yield node_id, str(node_id) + " " + str(score) + " " + neighbors if __name__ == '__main__': parser = OptionParser(usage='%prog [options] inputs') parser.add_option('--iterations', default=10, help='Numbers of iteration') parser.add_option( '--damping-factor', default=0.85, help='probability a web surfer will continue clicking on links') (options, input) = parser.parse_args() results = input params = Params(damping_factor=float(options.damping_factor)) for j in range(int(options.iterations)): job = Job().run(input=results, map=send_score, map_reader=chain_reader, reduce=receive_score, params=params) results = job.wait() for _, node in result_iterator(results): fields = node.split() print fields[0], ":", fields[1]
def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10,
        min_samples_split=5, class_majority=1, separate_max=True, measure="info_gain",
        accuracy=1, random_state=None, save_results=True, show=False):
    """
    Function validates the parameters and starts the
    "forest_distributed_decision_trees_fit" map/reduce job

    Parameters
    ----------
    dataset - dataset object; params["input_chain"] and params["data_tag"] are read
    trees_per_chunk - integer greater than 0; has to be 1 when bootstrap is disabled
    bootstrap - boolean; selects map_fit_bootstrap (True) or map_fit (False) as mapper
    max_tree_nodes - integer or None
    min_samples_leaf - integer greater than 0
    min_samples_split - integer greater than 0
    class_majority - number greater than 0
    separate_max - passed to the job unchanged
    measure - "info_gain" or "mdl"
    accuracy - integer, not negative
    random_state - passed to the job as params['seed']
    save_results - forwarded to the Worker
    show - forwarded to job.wait

    Returns
    -------
    Dictionary with the job result urls under "fddt_fitmodel"

    Raises
    ------
    Exception - if a parameter is not numerical, is out of range, bootstrap is
        not a boolean or measure is unknown
    """
    # validate everything before importing disco or creating a job
    try:
        trees_per_chunk = int(trees_per_chunk)
        if max_tree_nodes is not None:
            max_tree_nodes = int(max_tree_nodes)
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
    except (TypeError, ValueError):
        # TypeError covers values such as None that int()/float() reject
        raise Exception("Parameters should be numerical.")

    if trees_per_chunk > 1 and bootstrap is False:
        raise Exception(
            "Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap."
        )

    # Each limit is checked on its own. The original chained these with a
    # stray "and", so min_samples_split <= 0 and accuracy < 0 were only
    # rejected when both were violated together.
    if (trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0
            or min_samples_split <= 0 or accuracy < 0):
        raise Exception("Parameters should be greater than 0.")

    if not isinstance(bootstrap, bool):
        raise Exception("Parameter bootstrap should be a boolean.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    # directory of discomll/ensemble/core/ (with trailing slash)
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=map_init,
               process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all',
         Stage("reduce", init=simple_init, process=reduce_fit, combine=True))
    ]

    # job.params is the dataset's own params object, so the settings below
    # are stored on the dataset as well
    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
dest='date', help='date') (options, args) = parser.parse_args() if not (options.clazz and options.input and options.output): print usage exit(1) if not (options.clazz == 'c+w' or options.clazz == 'fixed'): print 'class should be \'c+w\' or \'fixed\'' exit(1) dirList=os.listdir(options.input) input = ["file:///"+options.input+"/"+file for file in dirList if file.endswith(".TXT") or file.endswith(".txt")] if options.clazz == 'c+w': job = Job().run(input=input, map=cw_map) else: if fixed_map_funs.has_key(options.province): mapfun = fixed_map_funs[options.province] else: mapfun = fixed_map job = Job().run(input=input, map=mapfun) file = open(options.output + "/" + options.clazz + "-" + options.date + ".ctl", "w") sqldr_header(file) for user, line in result_iterator(job.wait(show=True)): print >>file, line file.close()
for t, v in vals: if t == "s": sum_v += v else: neighbors = v score = 1 - d + d * sum_v yield node_id, str(node_id) + " " + str(score) + " " + neighbors if __name__ == '__main__': parser = OptionParser(usage='%prog [options] inputs') parser.add_option('--iterations', default=10, help='Numbers of iteration') parser.add_option('--damping-factor', default=0.85, help='probability a web surfer will continue clicking on links') (options, input) = parser.parse_args() results = input params = Params(damping_factor=float(options.damping_factor)) for j in range(int(options.iterations)): job = Job().run(input=results, map=send_score, map_reader = chain_reader, reduce=receive_score, params = params) results = job.wait() for _, node in result_iterator(results): fields = node.split() print fields[0], ":", fields[1]
# This program estimates the value of pi (3.14...)
# Usage:
# python estimate_pi.py
from disco.core import Job, result_iterator


def map(line, params):
    """
    Draw one random point in the unit square and yield (0, hit).

    hit is 1 when the point lies inside the quarter circle of radius 1,
    otherwise 0. The input line is not used.
    """
    from random import random
    x, y = random(), random()
    yield 0, 1 if x * x + y * y < 1 else 0


if __name__ == '__main__':
    COUNT = 5000  # number of random points, one map input each
    job = Job().run(input=["raw://0"] * COUNT, map=map)
    tot = sum(v for _, v in result_iterator(job.wait()))
    # The division belongs inside the call: "print(4.0 * tot) / COUNT" only
    # works as a Python 2 print statement. As a function call it prints the
    # undivided value and then fails dividing None.
    print(4.0 * tot / COUNT)
from mongoDisco_output import MongoDBoutput
from disco.worker.classic.func import task_output_stream
import logging


def map(record, params):
    """Log the record's '_id' and yield (name, 1), with "NoName" as fallback."""
    logging.info("%s" % record.get('_id'))
    name = record.get('name', "NoName")
    yield name, 1


def reduce(iter, params):
    """Group the sorted pairs by word and yield (word, sum of its values)."""
    from disco.util import kvgroup
    grouped = kvgroup(sorted(iter))
    for word, ones in grouped:
        yield word, sum(ones)


def mongodb_output(stream, partition, url, params):
    """Output stream function: wrap stream in a MongoDBoutput."""
    # NOTE(review): the class is reached through the module name although
    # only the class is imported above - presumably the module is made
    # available by name where this function runs; confirm before changing.
    return mongoDisco_output.MongoDBoutput(stream, params)


if __name__ == '__main__':
    # the output stream chain is a one-element tuple
    mongodb_stream = tuple([mongodb_output])

    # NOTE(review): Job is not imported in the visible part of this script.
    job = Job().run(input=["mongodb://localhost/test.modforty"],
                    map=map,
                    reduce=reduce,
                    reduce_output_stream=mongodb_stream)
    job.wait(show=True)
    # for word, count in result_iterator(job.wait(show=True)):
    #     print word, count
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False): """ Function starts a job that makes predictions to input data with a given model Parameters ---------- input - dataset object with input urls and other parameters fitmodel_url - model created in fit phase m - m estimate is used with discrete features save_results - save results to ddfs show - show info about job execution Returns ------- Urls of predictions on ddfs """ from disco.worker.pipeline.worker import Worker, Stage from disco.core import Job, result_iterator import numpy as np try: m = float(m) except ValueError: raise Exception("Parameter m should be numerical.") if "naivebayes_fitmodel" in fitmodel_url: # fit model is loaded from ddfs fit_model = dict( (k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"])) if len(fit_model["y_labels"]) < 2: print "There is only one class in training data." return [] else: raise Exception("Incorrect fit model.") if dataset.params["X_meta"].count( "d") > 0: # if there are discrete features in the model # code calculates logarithms to optimize predict phase as opposed to calculation by every mapped. np.seterr(divide='ignore') for iv in fit_model["iv"]: dist = [ fit_model.pop((y, ) + iv, 0) for y in fit_model["y_labels"] ] fit_model[iv] = np.nan_to_num( np.log( np.true_divide( np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))) - fit_model["prior_log"] del (fit_model["iv"]) # define a job and set save of results to ddfs job = Job(worker=Worker(save_results=save_results)) # job parallelizes execution of mappers job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))] job.params = dataset.params # job parameters (dataset object) job.params["fit_model"] = fit_model # define name of a job and input data urls job.run(name="naivebayes_predict", input=dataset.params["data_tag"]) results = job.wait(show=show) return results
def map(line, params): import github_crawler n = int(line) users = github_crawler.get_users_with_n_followers(n) print str(len(users)) + " users" for user in users: repos = github_crawler.get_user_parent_repos(user) print str(len(repos)) + " repos" for owner, repo, branch in repos: print owner + "/" + repo + "#" + branch directory = github_crawler.clone_repo(owner, repo, branch) for item in github_crawler.analyze_repo(directory): yield item def reduce(iter, params): from disco.util import kvgroup for extension, ratios in kvgroup(sorted(iter)): l_ratios = [r for r in ratios] yield extension, sum(l_ratios) / len(l_ratios) if __name__ == "__main__": input = ["raw://" + str(i) for i in range(0, 1000, 10)] job = Job().run(input=input, map=map, reduce=reduce, required_files=["github_crawler.py"]) for extension, avg in result_iterator(job.wait(show=True)): print extension, ": ", avg
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    """
    Run the 'naive_bayes_estimate' job and turn its counts into log-likelihoods.

    Parameters
    ----------
    input - job input, forwarded to job.run
    ys - iterable of class ids
    splitter - string that separates class and item in the result keys
    map_reader - forwarded to job.run

    Returns
    -------
    Dictionary with one entry per "<class><splitter><item>" key and one
    entry per class (the sum over all items of that class). All counts
    carry a pseudocount of 1.
    """
    import math

    ys = dict.fromkeys(ys, 1)

    job = Job(name='naive_bayes_estimate')
    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}
    # the number of times the classes have been observed. For
    # example, if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}
    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        parts = key.split(splitter)
        value = int(value)
        if len(parts) > 1:
            pairs[key] = value
        elif parts[0] == '':
            # the empty key carries the total
            total = value
        elif parts[0] in ys:
            classes[parts[0]] = value
        else:
            items[parts[0]] = value

    # counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    # Every table is derived from the same total. The original also had a
    # "total += 4" after adding the pseudocounts, but total is never read
    # afterwards; it is dropped so no table can depend on iteration order.
    counts = {}
    for item, item_count in items.items():
        for y in ys:
            key = y + splitter + item
            both = pairs.get(key, 0)
            item_only = item_count - both
            class_only = classes[y] - both if y in classes else 0
            neither = total - (both + item_only + class_only)
            # add pseudocounts
            counts[key] = [both + 1, item_only + 1, class_only + 1, neither + 1]

    loglikelihoods = {}
    for key, (c_i, notc_i, c_noti, notc_noti) in counts.items():
        y = key.split(splitter)[0]
        class_term = math.log(c_i + c_noti) - math.log(notc_i + notc_noti)
        loglikelihoods[y] = loglikelihoods.get(y, 0.0) + class_term
        loglikelihoods[key] = math.log(c_i) - math.log(notc_i)

    return loglikelihoods