def predict(master, input, center, centers):
    """
    Predict the closest clusters for the datapoints in input.

    Builds a single-stage ``Estimate`` job named "k_cluster_predict" whose
    process function is ``predict_map``, runs it on `input` and waits for it.

    Args:
        master: Not used by this function; kept so existing callers keep
            working.
        input: Forwarded unchanged to ``job.run(input=...)``.
        center: Mapping of cluster center definitions used as the job
            parameters. It is shallow-copied, so the caller's object is
            left untouched.
        centers: Cluster centers stored in the job parameters under the
            ``'centers'`` key (presumably the value returned by
            ``estimate`` -- confirm against callers).

    Returns:
        Whatever ``job.wait()`` returns for the finished job. The job is
        not purged here.
    """
    import copy

    from kclustering_pipeline import Estimate

    # Work on a copy: assigning `center` itself to job.params and then adding
    # keys would write 'centers' into the caller's (possibly shared) mapping.
    params = copy.copy(center)
    params['centers'] = centers

    job = Estimate()
    job.pipeline = [
        ("split",
         Stage("k_cluster_predict",
               input_chain=[task_input_stream, reader],
               init=simple_init,
               process=predict_map)),
    ]
    job.params = params
    job.run(input=input)
    return job.wait()
def _run_and_collect_centers(job, job_input):
    """Run `job` on `job_input`, collect its results as pairs, then purge it.

    The results read from ``result_iterator(job.wait())`` are unpacked into
    2-tuples. The job is purged only after the results have been collected;
    if running or waiting raises, the job is left in place (unchanged from
    the previous inline code).
    """
    job.run(input=job_input)
    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()
    return centers


def estimate(master, input, center, k, iterations):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.

    An initialisation job ("k_cluster_init_map" / "k_cluster_init_reduce")
    produces the first set of centers; then `iterations` further jobs are
    run, each receiving the centers produced by the previous job under the
    ``'centers'`` parameter key.

    Args:
        master: Not used by this function; kept so existing callers keep
            working.
        input: Forwarded unchanged to every ``job.run(input=...)`` call.
        center: Mapping of cluster center definitions used as the base of
            the job parameters. It is shallow-copied, so the caller's
            object is left untouched.
        k: Stored in the job parameters under ``'k'`` (presumably the
            number of clusters -- confirm against the stage functions).
        iterations: Number of refinement jobs run after the initialisation
            job. With 0, the centers from the initialisation job are
            returned as-is.

    Returns:
        A list of ``(i, c)`` pairs read from the results of the last job.
    """
    import copy

    from kclustering_pipeline import Estimate

    # Work on a private copy: assigning `center` itself to job.params and then
    # adding keys would leak 'seed', 'k' and 'centers' into the caller's
    # (possibly shared) mapping, and a later call would start with stale
    # 'centers'. The same working copy is reused for every job, so each job
    # sees exactly the keys it saw before ('seed' stays set for iterations).
    params = copy.copy(center)
    params['seed'] = 0
    params['k'] = k

    job = Estimate()
    job.pipeline = [
        ("split",
         Stage("k_cluster_init_map",
               input_chain=[task_input_stream, reader],
               init=map_init,
               process=random_init_map)),
        ('group_label',
         Stage("k_cluster_init_reduce",
               process=estimate_reduce,
               init=simple_init)),
    ]
    job.params = params
    centers = _run_and_collect_centers(job, input)

    for j in range(iterations):
        # Feed the centers from the previous job into this one.
        params['centers'] = centers

        job = Estimate()
        job.params = params
        job.pipeline = [
            ('split',
             Stage("kcluster_map_iter_%s" % (j,),
                   input_chain=[task_input_stream, reader],
                   process=estimate_map,
                   init=simple_init)),
            ('group_label',
             Stage("kcluster_reduce_iter_%s" % (j,),
                   process=estimate_reduce,
                   init=simple_init)),
        ]
        centers = _run_and_collect_centers(job, input)

    return centers