class IndexJob(object):
    def __init__(self, spec, discodex,
                 disco_addr="disco://localhost", profile=False):
        # TODO(sqs): refactoring potential with PagerankJob
        self.spec = spec
        self.discodex = discodex
        self.docset = Docset(spec.docset_name)
        self.disco = Disco(DiscoSettings()["DISCO_MASTER"])
        self.nr_partitions = 8
        self.profile = profile

    def start(self):
        results = self.__run_job(self.__index_job())
        self.__run_discodex_index(results)

    def __run_job(self, job):
        results = job.wait()
        if self.profile:
            self.__profile_job(job)
        return results

    def __index_job(self):
        return self.disco.new_job(
            name="index_tfidf",
            input=["tag://" + self.docset.ddfs_tag],
            map_reader=docparse,
            map=TfIdf.map,
            reduce=TfIdf.reduce,
            sort=True,
            partitions=self.nr_partitions,
            partition=TfIdf.partition,
            merge_partitions=False,
            profile=self.profile,
            params=dict(doc_count=self.docset.doc_count),
        )

    def __run_discodex_index(self, results):
        opts = {
            "parser": "disco.func.chain_reader",
            "demuxer": "freequery.index.tf_idf.TfIdf_demux",
            "nr_ichunks": 1,  # TODO(sqs): after disco#181 fixed, increase this
        }
        ds = DataSet(input=results, options=opts)
        origname = self.discodex.index(ds)
        self.disco.wait(origname)  # origname is also the disco job name
        self.discodex.clone(origname, self.spec.invindex_name)
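# Usage sketch, not part of the original class: `spec` (an object exposing
# docset_name and invindex_name) and `discodex` (a discodex client) are
# assumed to be constructed by the caller.
#
#   job = IndexJob(spec, discodex)
#   job.start()  # runs the tf-idf map/reduce, then builds the discodex index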
class LinkParseJob(object):
    def __init__(self, spec, verbose=False, **kwargs):
        self.spec = spec
        self.docset = Docset(self.spec.docset_name)
        self.disco = Disco("disco://localhost")
        self.verbose = verbose

    def start(self):
        from disco import func
        job = self.disco.new_job(
            name="linkparse",
            input=self.docset.dump_uris(),
            map_reader=docparse,
            map=linkparse_map,
            map_output_stream=(func.map_output_stream,
                               func.disco_output_stream,
                               LinkFileOutputStream.disco_output_stream),
            partitions=0,
            save=True,
        )
        results = job.wait()
        self.__tag_results(results)
        if self.verbose:
            self.__print_results(results)

    def __tag_results(self, results):
        from disco.ddfs import DDFS
        ddfs = DDFS()
        results_tag = results[0]
        ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(results_tag)))
        # remove old, temporary tag
        ddfs.delete(results_tag)

    def __print_results(self, results):
        for doc in result_iterator(results, tempdir=False, reader=doclinksparse):
            print "%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris))
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(line, params):
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="wordcount",
                        input=["http://discoproject.org/media/text/chekhov.txt"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
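# Why sorted() before kvgroup: disco.util.kvgroup only groups *consecutive*
# pairs that share a key, so the reduce input must be sorted first. A minimal
# local illustration, no cluster needed:
#
#   from disco.util import kvgroup
#   pairs = [("a", 1), ("b", 2), ("a", 3)]
#   [(k, sum(vs)) for k, vs in kvgroup(sorted(pairs))]
#   # => [("a", 4), ("b", 2)]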
import tserver, sys
from disco.core import Disco, result_iterator

# NOTE: data_gen, fun_map and check_results are not part of this snippet;
# the definitions below are assumed stand-ins so the test is runnable.
def data_gen(path):
    # assumed: serve each "a:b" input path back as one line of data
    return path[1:] + "\n"

def fun_map(e, params):
    # assumed: split an "a:b" line into a key/value pair
    k, v = e.split(":")
    return [(k, int(v))]

def check_results(job):
    # assumed: the values for each key must sum to the expected total
    sums = {}
    for k, v in result_iterator(job.wait()):
        sums[k] = sums.get(k, 0) + int(v)
    if sums != results:
        raise Exception("Invalid results: %s" % sums)

tserver.run_server(data_gen)

N = 10
results = {}
inputs = []
for i in range(N):
    a = [i] * 10
    b = range(i, i + 10)
    inputs += ["%d:%d" % x for x in zip(a, b)]
    results[str(i)] = sum(b)

disco = Disco(sys.argv[1])

# map results in individual files, one per input file (default mode)
job1 = disco.new_job(name="test_partfile1",
                     input=tserver.makeurl(inputs),
                     map=fun_map)

# map results in one big partition file per host
job2 = disco.new_job(name="test_partfile2",
                     input=tserver.makeurl(inputs),
                     map=fun_map,
                     nr_reduces=1)

check_results(job1)
check_results(job2)

job1.purge()
job2.purge()
print "ok"
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(entry, params):
    for word in entry.split():
        yield word, 1

def reduce(iter, out, params):
    s = {}
    for word, freq in iter:
        s[word] = s.get(word, 0) + int(freq)
    for word, freq in s.iteritems():
        out.add(word, freq)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="wordcount",
                        input=["http://discoproject.org/chekhov.txt"],
                        map=map,
                        reduce=reduce).wait()

print "Job done. Results:"
for word, freq in result_iterator(results):
    print word, freq
from disco.core import Disco, result_iterator, Params
from disco.util import kvgroup
from disco.settings import DiscoSettings

def fun_map((key, value), params):
    bucket_range = (params.upper - params.lower) // params.num_buckets
    bucket = value // bucket_range
    # clamp out-of-range values into the last bucket
    if bucket >= params.num_buckets:
        yield params.num_buckets - 1, value
    else:
        yield bucket, value

def fun_reduce(iter, params):
    for k, v in kvgroup(sorted(iter)):
        yield k, sorted(v)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.. "

results = disco.new_job(name="Sorting job",
                        input=[(1, 1), (2, 2), (5, 5), (4, 4), (-1, -1)],
                        map=fun_map,
                        reduce=fun_reduce,
                        params=Params(lower=0, upper=10, num_buckets=3)).wait()

print "Job done. Results:"
for k, v in result_iterator(results):
    print k, v
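# Worked example for the bucketing above (lower=0, upper=10, num_buckets=3):
# bucket_range = (10 - 0) // 3 = 3, so value 5 lands in bucket 5 // 3 = 1,
# while value 9 would compute bucket 3 and the clamp folds it into bucket 2.
# Routing values to ordered buckets in the map and sorting each bucket in the
# reduce is what makes the combined output sorted.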
import json
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

# Fragment: the snippet begins in the middle of the reduce function; the map
# function, mymin(), and the loop that binds `node` and its incoming
# `distances` records are elided in the original.
    newdistances = {}

    def minFrom(d, a):
        # keep the smaller known distance for each target node
        for k, v in a.items():
            d[k] = mymin(d.get(k, -1), v)

    for d in distances:
        if d.get("nodes"):
            nodes = d["nodes"]
        minFrom(newdistances, d["distances"])
    yield node, json.dumps([node, newdistances, nodes])

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="shortestpath",
                        input=["file:///home/marko/tmp/disco/out.txt"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done"
out = file("out.txt", "w")
for node, data in result_iterator(results):
    print >> out, data
out.close()
import tserver, sys, time, random
from disco.core import Disco, result_iterator

# NOTE: data_gen and fun_map are not part of this snippet; the stand-ins
# below are assumptions chosen so that every key ends up with exactly 10
# occurrences, which is what the final check expects.
def data_gen(path):
    # assumed: serve each input number back 10 times, one per line
    return "\n".join([path[1:]] * 10)

def fun_map(e, params):
    # assumed: identity map, key = the input line
    return [(e, "")]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add("[%s]" % k, v)

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"

inputs = tserver.makeurl(range(num * 10))
random.shuffle(inputs)

jobs = []
for i in range(5):
    jobs.append(disco.new_job(name="test_async_%d" % i,
                              input=inputs[i * (num * 2):(i + 1) * (num * 2)],
                              map=fun_map,
                              reduce=fun_reduce,
                              nr_reduces=11,
                              sort=False))
    time.sleep(1)

all = dict(("[%s]" % i, 0) for i in range(num * 10))
while jobs:
    ready, jobs = disco.results(jobs)
    for name, results in ready:
        for k, v in result_iterator(results[1]):
            all[k] += 1
        disco.purge(name)

for v in all.values():
    if v != 10:
        raise Exception("Invalid results: %s" % all)
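# Disco.results() splits the polled job list into finished and still-running
# jobs, so the loop above drains whichever jobs finish first instead of
# blocking on wait() for each job in submission order.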
from datetime import datetime, timedelta
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings
from disco import func

# Fragment: the snippet begins in the middle of the map function; the code
# that parses `time`, `uid`, `query` and `frequency` out of each input record
# is elided in the original.
    date_obj = datetime.fromtimestamp(float(time[:-3]))  # timestamp has milliseconds, shave them off
    nearest_minute = date_obj - timedelta(minutes=date_obj.minute % 1,
                                          seconds=date_obj.second,
                                          microseconds=date_obj.microsecond)
    yield (nearest_minute, {'unique_id': uid, 'query': query, 'frequency': frequency})

def reduce(iter, params):
    # This doesn't work at all; it's from an old example.
    from disco.util import kvgroup
    for unique_id, counts in kvgroup(sorted(iter)):
        yield unique_id, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

# :clicks (ad id, people who clicked the ads)
results = disco.new_job(name="bartekc",
                        input=["tag://hackreduce:search:history"],
                        map_input_stream=(
                            func.map_input_stream,
                            func.chain_reader,
                        ),
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
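# Note on the timedelta arithmetic above: minute % 1 is always 0, so the
# subtraction only zeroes seconds and microseconds, i.e. it truncates the
# timestamp to the minute. An equivalent, simpler spelling:
#
#   nearest_minute = date_obj.replace(second=0, microsecond=0)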
import math
import tserver, sys
from disco.core import Disco, result_iterator
from disco import modutil
# `mod1` (providing plusceil) is a test fixture module that is not part of
# this snippet.
import mod1

def data_gen(path):
    # the original snippet opens with this return statement; the enclosing
    # def line is reconstructed to match data_gen in the sibling tests
    return path[1:] + "\n"

def fun_map(e, params):
    x, y = map(float, e.split("|"))
    return [(mod1.plusceil(x, y) + math.ceil(1.5), "")]

def checkl(name, got, expected):
    # assumed stand-in for the shared test helper: compare results to expected
    got = list(got)
    if got != expected:
        raise Exception("%s: got %s, expected %s" % (name, got, expected))

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

inputs = ["0.5|1.2"]
print "disco tests.."

# default
job = disco.new_job(name="test_modutil1",
                    input=tserver.makeurl(inputs),
                    map=fun_map)
checkl("test_modutil1", result_iterator(job.wait()), [("4.0", "")])
job.purge()
print "test_modutil1 ok"

job = disco.new_job(name="test_modutil2",
                    input=tserver.makeurl(inputs),
                    required_modules=modutil.find_modules([fun_map]),
                    map=fun_map)
checkl("test_modutil2", result_iterator(job.wait()), [("4.0", "")])
job.purge()
print "test_modutil2 ok"

job = disco.new_job(
# (the snippet is truncated here in the original)
import tserver, sys, math, base64
from disco.core import Disco, result_iterator

def data_gen(path):
    return path[1:] + "\n"

def fun_map(e, params):
    k = str(int(math.ceil(float(e))) ** 2)
    return [(base64.encodestring(k), "")]

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

inputs = [1, 485, 3245]
job = disco.new_job(name="test_reqmodules",
                    nr_reduces=1,
                    input=tserver.makeurl(inputs),
                    map=fun_map,
                    sort=False)

res = list(result_iterator(job.wait()))
if len(res) != len(inputs):
    raise Exception("Too few results: Got: %d Should be %d" %
                    (len(res), len(inputs)))

cor = map(lambda x: base64.encodestring(str(int(math.ceil(x)) ** 2)), inputs)
for k, v in res:
    if k not in cor:
        raise Exception("Invalid answer: %s" % k)
    cor.remove(k)
import tserver, sys, random
from disco.core import Disco

# NOTE: N, results, data_gen, fun_map and fun_partition are not defined in
# this snippet; the definitions below are assumed stand-ins (N and results
# mirror the identical input-building loop in the test_partfile snippet).
N = 10
results = {}

def data_gen(path):
    # assumed: serve each "a:b" input path back as one line of data
    return path[1:] + "\n"

def fun_map(e, params):
    # assumed: split an "a:b" line into a key/value pair
    return [tuple(e.split(":"))]

def fun_partition(key, nr_partitions, params):
    # assumed: route each key to the partition matching its integer value
    return int(key) % nr_partitions

tserver.run_server(data_gen)

inputs = []
for i in range(N):
    a = [i] * 10
    b = range(i, i + 10)
    inputs += ["%d:%d" % x for x in zip(a, b)]
    results[str(i)] = str(sum(b))
random.shuffle(inputs)

disco = Disco(sys.argv[1])

print "Running two map jobs.."
map1 = disco.new_job(name="test_onlyreduce1",
                     input=tserver.makeurl(inputs[:len(inputs) / 2]),
                     map=fun_map,
                     partition=fun_partition,
                     nr_reduces=N)
map2 = disco.new_job(name="test_onlyreduce2",
                     input=tserver.makeurl(inputs[len(inputs) / 2:]),
                     map=fun_map,
                     partition=fun_partition,
                     nr_reduces=N)

results1 = map1.wait()
print "map1 done"
results2 = map2.wait()
print "map2 done"
import tserver, sys
from disco.core import Disco, result_iterator
from disco.func import chain_reader
from disco.util import jobname

# NOTE: ani and data_gen are not part of this snippet; the values below are
# assumed stand-ins (the final check expects 5-character keys drawn from ani).
ani = ["horse", "sheep", "whale", "tiger"]

def data_gen(path):
    return "\n".join(ani)

def fun_map(e, params):
    if type(e) == tuple:
        return [(e[0] + params['suffix'], int(e[1]) + 1)]
    else:
        return [(e + params['suffix'], 0)]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add(k + "-", v)

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

results = disco.new_job(name="test_chain_0",
                        input=tserver.makeurl([""] * 100),
                        map=fun_map,
                        reduce=fun_reduce,
                        nr_reduces=4,
                        sort=False,
                        params={'suffix': '0'}).wait()

i = 1
while i < 10:
    nresults = disco.new_job(name="test_chain_%d" % i,
                             input=results,
                             map=fun_map,
                             reduce=fun_reduce,
                             nr_reduces=4,
                             map_reader=chain_reader,
                             sort=False,
                             params={'suffix': str(i)}).wait()
    disco.purge(jobname(results[0]))
    results = nresults
    i += 1

for key, value in result_iterator(results):
    if key[:5] not in ani or key[5:] != "0-1-2-3-4-5-6-7-8-9-":
        # assumed check body; the original snippet is truncated after the if
        raise Exception("Invalid key: %s" % key)
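# The chain above feeds each job's results into the next as input, using
# map_reader=chain_reader so the map sees the stored (key, value) pairs
# rather than raw lines, and purges the previous job once its output has
# been consumed.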
import time
import json
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings
from disco import func
from mapper import map
from reducer import reduce

name = "gap-%s" % int(time.time())
disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job (%s).." % name
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name=name,
                        input=["tag://gap:1million"],
                        map_input_stream=(
                            func.map_input_stream,
                            func.chain_reader,
                        ),
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done. Results:"
f = open('data.js', 'w')
for time_of_day, scores in result_iterator(results):
    str_time = time_of_day.strftime("%Y-%m-%d %H:%M")
    s = json.dumps({'time': str_time, 'scores': scores})
    f.write(s + ",\n")
f.close()
import tserver, sys, time
from disco.core import Disco

# NOTE: data_gen, fun_map1 and the head of fun_map2 are elided in this
# snippet; the definitions below are assumed stand-ins.
def data_gen(path):
    return "1 2 3\n"

def fun_map1(e, params):
    return []

def fun_map2(e, params):
    return []

def fun_map3(e, params):
    # intentional NameError: this job is expected to die
    fail

def fun_map4(e, params):
    import time
    time.sleep(4)
    return []

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

jobs = []
for i, m in enumerate([fun_map1, fun_map2, fun_map3, fun_map4]):
    jobs.append(disco.new_job(name="test_waitmany_%d" % (i + 1),
                              input=tserver.makeurl([""] * 5),
                              map=m))

res = []
while jobs:
    ready, jobs = disco.results(jobs, timeout=2000)
    res += ready

for n, r in res:
    if n.startswith("test_waitmany_3"):
        if r[0] != "dead":
            raise Exception("Invalid job status: %s" % n)
    elif r[0] != "ready":
        raise Exception("Invalid job status: %s" % n)
    disco.purge(n)
import tserver, sys, time
from disco.core import Disco

def data_gen(path):
    return "1 2 3\n"

def fun_map(e, params):
    import time
    time.sleep(100)
    return []

disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"

tserver.run_server(data_gen)
job = disco.new_job(name="test_kill",
                    input=tserver.makeurl([""] * num * 2),
                    map=fun_map)

time.sleep(10)
print >> sys.stderr, "Killing", job.name
job.kill()
time.sleep(5)

if job.jobinfo()['active'] == "dead":
    print "ok"
    job.purge()
else:
    raise Exception("Killing failed")
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

# Fragment: the snippet begins in the middle of the map function; the code
# that splits each input record into the `line` field list is elided in the
# original.
    title = line[-4]
    year = line[-1]
    yield year, title

def reduce(iter, params):
    from disco.util import kvgroup
    for year, titles in kvgroup(sorted(iter)):
        romantic_titles = [title for title in titles if "love" in title.lower()]
        yield year, len(romantic_titles)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="song-titles",
                        input=["tag://hackreduce:millionsongs:subset"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done. Results:"
chart_url = ("http://chart.apis.google.com/chart?chxr=0,0,15&chxt=y"
             "&chbh=a,4,10&chs=738x220&cht=bvs&chco=4D89F9&chds=0,15&chd=t:")
res_list = []
# Print result to user (the reduce yields per-year counts)
for year, count in result_iterator(results):
    res_list.append(str(count))
chart_url += ",".join(res_list)
chart_url += "&chdl=Songs+with+%22Love%22+in+their+titles&chtt=Most+Romantic+Year+by+Song+Titles"
print chart_url
class PagerankJob(object):
    def __init__(self, spec, disco_addr="disco://localhost", alpha=0.15,
                 niter=2, profile=False):
        self.spec = spec
        self.docset = Docset(spec.docset_name)
        self.disco = Disco("disco://localhost")
        self.alpha = alpha
        self.niter = niter
        self.nr_partitions = 16
        self.merge_partitions = False
        self.profile = profile

    def start(self):
        results = self.__first_mass_job()
        results = self.__teleport_distribute_job(0, results)
        for i in range(1, self.niter + 1):
            # print "Iteration %d:" % (i - 1)
            # print self.__result_stats(results)
            results = self.__mass_job(i, results)
            results = self.__teleport_distribute_job(i, results)
        self.__write_scores(results)

    def __write_scores(self, results):
        db = ScoreDBWriter(self.spec.scoredb_path)
        score_iter = ((doc.uri, doc.pagerank)
                      for doc, _ in result_iterator(results)
                      if isinstance(doc, Document))
        db.set_scores(score_iter)
        db.save_and_close()

    def __run_job(self, job):
        results = job.wait()
        if self.profile:
            self.__profile_job(job)
        return results

    def __first_mass_job(self):
        return self.__run_job(self.disco.new_job(
            name="pagerank_mass0",
            input=["tag://" + self.docset.ddfs_link_file_tag],
            map_reader=doclinksparse,
            map=Pagerank.map_mass,
            reduce=Pagerank.reduce_mass,
            sort=True,
            partitions=self.nr_partitions,
            partition=Pagerank.partition,
            merge_partitions=self.merge_partitions,
            profile=self.profile,
            params=dict(iter=0, doc_count=self.docset.doc_count),
        ))

    def __mass_job(self, i, results):
        return self.__run_job(self.disco.new_job(
            name="pagerank_mass%d" % i,
            input=results,
            map_reader=chain_reader,
            map=Pagerank.map_mass,
            reduce=Pagerank.reduce_mass,
            sort=True,
            partitions=self.nr_partitions,
            partition=Pagerank.partition,
            merge_partitions=self.merge_partitions,
            profile=self.profile,
            params=dict(iter=i),
        ))

    def __teleport_distribute_job(self, i, results):
        # redistribute the mass lost to dangling (outlink-free) pages
        lost_mass = sum(v for k, v in result_iterator(results)
                        if k == DANGLING_MASS_KEY)
        lost_mass_per = float(lost_mass) / self.docset.doc_count
        return self.__run_job(self.disco.new_job(
            name="pagerank_teleport_distribute%d" % (i - 1),
            input=results,
            map_reader=chain_reader,
            map=Pagerank.map_teleport_distribute,
            sort=True,
            partitions=self.nr_partitions,
            partition=Pagerank.partition,
            merge_partitions=self.merge_partitions,
            profile=self.profile,
            params=dict(iter=i, alpha=self.alpha,
                        doc_count=self.docset.doc_count,
                        lost_mass_per=lost_mass_per),
        ))

    def __profile_job(self, job):
        stats = job.profile_stats()
        stats.sort_stats("cumulative")
        stats.print_stats()

    def __result_stats(self, results):
        from disco.core import result_iterator
        o = []
        p_sum = 0.0
        for k, v in result_iterator(results):
            if hasattr(k, "pagerank"):
                doc = k
                o.append("%f\t%s" % (doc.pagerank, doc.uri))
                p_sum += doc.pagerank
            else:
                o.append("%f\t(dangling mass)" % v)
                p_sum += v
        o.append("%f\tSUM" % p_sum)
        return "\n".join(o)
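# For reference, the teleport/distribute step implements the usual PageRank
# update with dangling-mass redistribution (assuming Pagerank.map_mass and
# Pagerank.map_teleport_distribute follow the standard formulation, which
# this snippet does not show):
#
#   p'(v) = alpha / N + (1 - alpha) * (m(v) + lost_mass / N)
#
# where N = doc_count, m(v) is the mass flowing into v over its inlinks, and
# lost_mass is the total emitted under DANGLING_MASS_KEY by pages without
# outlinks.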