class IndexJob(object):
    """Run the "index_tfidf" Disco job for a docset and index its results.

    `start` launches the Disco job over the docset's DDFS tag, waits for it,
    feeds the results to discodex and finally clones the resulting index
    under ``spec.invindex_name``.
    """

    def __init__(self, spec, discodex, disco_addr="disco://localhost",
                 profile=False):
        """Store the collaborators and set up the Disco connection.

        spec      -- object providing `docset_name` and `invindex_name`.
        discodex  -- client used for `index` and `clone`.
        disco_addr -- accepted but not read by the code visible here; the
                      master address is taken from DiscoSettings instead.
        profile   -- forwarded to the Disco job; also triggers profiling of
                     the finished job in `__run_job`.
        """
        # TODO(sqs): refactoring potential with PagerankJob
        self.spec = spec
        self.discodex = discodex
        self.docset = Docset(spec.docset_name)
        self.disco = Disco(DiscoSettings()["DISCO_MASTER"])
        self.nr_partitions = 8
        self.profile = profile

    def start(self):
        """Run the indexing job, then build the discodex index from it."""
        job = self.__index_job()
        job_results = self.__run_job(job)
        self.__run_discodex_index(job_results)

    def __run_job(self, job):
        """Wait for `job` and return its results, profiling it if enabled."""
        job_results = job.wait()
        if self.profile:
            # NOTE(review): __profile_job is not defined in the visible part
            # of this class -- confirm it exists elsewhere.
            self.__profile_job(job)
        return job_results

    def __index_job(self):
        """Submit the TF-IDF map/reduce job and return the job handle."""
        job_args = {
            "name": "index_tfidf",
            "input": ["tag://" + self.docset.ddfs_tag],
            "map_reader": docparse,
            "map": TfIdf.map,
            "reduce": TfIdf.reduce,
            "sort": True,
            "partitions": self.nr_partitions,
            "partition": TfIdf.partition,
            "merge_partitions": False,
            "profile": self.profile,
            "params": {"doc_count": self.docset.doc_count},
        }
        return self.disco.new_job(**job_args)

    def __run_discodex_index(self, results):
        """Index `results` with discodex and clone it to the final name."""
        index_options = {
            "parser": "disco.func.chain_reader",
            "demuxer": "freequery.index.tf_idf.TfIdf_demux",
            # TODO(sqs): after disco#181 fixed, increase this
            "nr_ichunks": 1,
        }
        dataset = DataSet(input=results, options=index_options)
        index_name = self.discodex.index(dataset)
        # The name returned by discodex is also the disco job name.
        self.disco.wait(index_name)
        self.discodex.clone(index_name, self.spec.invindex_name)
def data_gen(path):
    """Return `path` minus its first character, ten times, newline-joined."""
    token = path[1:]
    return "\n".join([token for _ in range(10)])


def fun_map(e, params):
    """Emit one pair: the entry prefixed with "=" as key, the entry as value."""
    return [("=" + e, e)]


def fun_reduce(iter, out, params):
    """Multiply all values together and emit the product under "result".

    Raises Exception("Corrupted key") when a key is not "=" + its value.
    """
    product = 1
    for key, value in iter:
        if key != "=" + value:
            raise Exception("Corrupted key")
        product *= int(value)
    out.add("result", product)


tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
job = Disco(sys.argv[1]).new_job(
    name="test_simple",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    sort=False)

# ANS is defined outside the visible part of this script.
answers = list(result_iterator(job.wait()))
if answers != [("result", ANS)]:
    raise Exception("Invalid answer")

job.purge()
print("ok")
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ from disco.core import Disco, result_iterator from disco.util import external ext_map_exec = "java_map.sh" ext_reduce_exec = "java_reduce.sh" map_class = "rmaus.disco.external.sample.WordCountMap" reduce_class = "rmaus.disco.external.sample.WordCountReduce" job = Disco("http://discomaster-dr-01:8989").new_job( name = "java_wordcount", input = ["raw://foo", "raw://bar", "raw://foo"], ext_params = { "mapFunction" : map_class, "reduceFunction" : reduce_class, "testKey" : "testValue" }, map = external([ext_map_exec]), reduce = external([ext_reduce_exec])) results = job.wait(show=True) for result in sorted(result_iterator(results), key=lambda x:x[1]): print result
out.add(k, v) tserver.run_server(data_gen) job = Disco(sys.argv[1]).new_job(\ name = "test_profile",\ input = tserver.makeurl([""] * int(100)),\ map = really_unique_function_name,\ reduce = fun_reduce,\ nr_reduces = 30,\ sort = False,\ profile = True) ANS = {"gutta": int(1e4), "cavat": int(2e4), "capidem": int(1e4)} i = 0 for key, value in result_iterator(job.wait()): i += 1 if ANS[key] == int(value): print "Correct: %s %s" % (key, value) else: raise "Results don't match (%s): Got %d expected %d" %\ (key, int(value), ANS[key]) if i != 3: raise "Too few results" buf = cStringIO.StringIO() sys.stdout = buf job.profile_stats().print_stats() sys.stdout = sys.__stdout__ #stats = job.profile_stats()
import sys
from disco.core import Disco, result_iterator


def fun_map(e, params):
    """Log three marker messages, then emit the entry with an empty value."""
    # NOTE(review): `msg` is neither defined nor imported in the visible
    # code -- presumably provided by the Disco worker environment; confirm.
    for seq in range(3):
        msg("--special_test_string_%d--" % seq)
    return [(e, "")]


inputs = ["raw://discoapi"]
job = Disco(sys.argv[1]).new_job(
    name="test_discoapi",
    input=inputs,
    map=fun_map)

# The single raw input must come back as the only result.
r = list(result_iterator(job.wait()))
if r != [("discoapi", "")]:
    raise Exception("Invalid result: <%s> " % r)

# The job name reported by the jobspec must start with the requested name.
n = job.jobspec()["name"]
if not n.startswith("test_discoapi"):
    raise Exception(
        "Invalid jobspec: Expected name prefix test_discoapi, got %s" % n)

# Every marker message logged by fun_map must show up in the job events.
events = [ev[2] for offs, ev in job.events()]
for seq in range(3):
    m = "--special_test_string_%d--" % seq
    if not any(m in text for text in events):
        raise Exception("Message '%s' not found in events" % m)

job.purge()
tserver.run_server(data_gen)

inputs = ["01/11/1965", "14/03/1983", "12/12/2002"]

# Use the object reader/writer pair between map, reduce and the final results.
job = Disco(sys.argv[1]).new_job(
    name="test_objectrw",
    input=tserver.makeurl(inputs),
    map=fun_map,
    map_writer=func.object_writer,
    reduce=fun_reduce,
    reduce_reader=func.object_reader,
    reduce_writer=func.object_writer,
    required_modules=["math", "datetime", "time"],
    nr_reduces=1,
    sort=False)

# Each key must carry math.pi under "PI2" and each value must format back to
# one of the input dates. Failures raise real Exceptions: string exceptions
# are rejected by Python 2.6+ and would hide the message.
i = 0
for k, v in result_iterator(job.wait(), reader=func.object_reader):
    if k["PI2"] != math.pi:
        raise Exception("Invalid key: %s" % (k,))
    if v.strftime("%d/%m/%Y") not in inputs:
        raise Exception("Invalid value: %s" % (v,))
    i += 1

if i != 30:
    raise Exception("Wrong number of results, got %d, expected 30" % i)

job.purge()
print("ok")
tserver.run_server(data_gen)

inputs = tserver.makeurl([1])
master = sys.argv[1]

# (job name, map function, extra new_job keyword arguments) for the three
# jobs that are each given five seconds before being handed to check_dead.
checked_jobs = (
    ("test_ratelimit", fun_map, {}),
    ("test_ratelimit2", fun_map2, {"status_interval": 1}),
    ("test_ratelimit3", fun_map3, {"status_interval": 1}),
)
for job_name, map_func, extra in checked_jobs:
    job = Disco(master).new_job(
        name=job_name, input=inputs, map=map_func, **extra)
    time.sleep(5)
    check_dead(job)

# The last job uses status_interval=0 and is simply waited for.
job = Disco(master).new_job(
    name="test_ratelimit4",
    input=inputs,
    map=fun_map2,
    status_interval=0)
job.wait()
job.purge()

print("ok")
# Entries whose first request is answered with a failure.
fail = ["1", "2", "3"]


def data_gen(path):
    """Serve ten times the requested number, failing once for some entries.

    The entry is `path` without its first character. If it is still listed
    in `fail`, it is removed from the list and tserver.FailedReply is raised;
    otherwise the entry times ten is returned as a newline-terminated string.
    """
    # `lock` is defined outside the visible part of this script. try/finally
    # guarantees the lock is released even if something inside raises.
    lock.acquire()
    try:
        e = path[1:]
        rejected = e in fail
        if rejected:
            fail.remove(e)
    finally:
        lock.release()
    if rejected:
        raise tserver.FailedReply()
    return str(int(e) * 10) + "\n"


def fun_map(e, params):
    """Emit the entry times ten as key, with an empty value."""
    return [(int(e) * 10, "")]


tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
    name="test_tempfail",
    # Explicit list, so the input does not depend on map() returning one.
    input=tserver.makeurl([str(n) for n in range(10)]),
    map=fun_map)

res = sum(int(x) for x, y in result_iterator(job.wait()))
if res != 4500:
    raise Exception("Invalid result: Got %d, expected 4500" % res)

job.purge()
print("ok")
s = 1 for k, v in iter: if k != "=" + v: raise Exception("Corrupted key") s *= int(v) out.add("result", s) tserver.run_server(data_gen) inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31] job = Disco(sys.argv[1]).new_job( name = "test_writers", input = tserver.makeurl(inputs), map = fun_map, map_writer = fun_map_writer, reduce = fun_reduce, reduce_reader = fun_reduce_reader, reduce_writer = fun_reduce_writer, nr_reduces = 1, sort = False) res = list(result_iterator(job.wait(), reader = result_reader)) if res != [ANS]: raise Exception("Invalid answer: %s" % res) job.purge() print "ok"
import sys
from disco.core import Disco, result_iterator


def fun_map(e, params):
    """Emit the entry with ":map" appended as value, under an empty key."""
    return [("", e + ":map")]


inputs = ["raw://eeny", "raw://meeny", "raw://miny", "raw://moe"]
job = Disco(sys.argv[1]).new_job(
    name="test_raw",
    input=inputs,
    map=fun_map)

# Expected values: each input without its 6-character "raw://" prefix, plus
# the ":map" suffix. Entries are deleted as they are seen.
res = dict((x[6:] + ":map", True) for x in inputs)

for x in result_iterator(job.wait()):
    if x[1] not in res:
        # A real Exception is required: string exceptions are rejected by
        # Python 2.6+ and would replace this message with a TypeError.
        raise Exception("Invalid result: <%s> " % x[1])
    del res[x[1]]

# Anything left over was never produced by the job.
if res:
    raise Exception("Invalid number of results %d" %
                    (len(inputs) - len(res)))

job.purge()
print("ok")
tserver.run_server(data_gen)

inputs = ["apple", "orange", "pear"]

# The stream functions and map_reader are defined outside the visible part
# of this script.
job = Disco(sys.argv[1]).new_job(
    name="test_streams",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    map_reader=map_reader,
    map_input_stream=[map_input_stream, map_input1, map_input2, map_input3],
    reduce_output_stream=[reduce_output1, reduce_output2])

# Every key must be "red:cba" followed by one of the inputs; each input is
# removed once it has been seen.
for k, v in result_iterator(job.wait(),
                            input_stream=[resultiter_input1,
                                          map_input_stream]):
    if not k.startswith("red:cba"):
        raise Exception("Invalid prefix in key. Got '%s' "
                        "expected prefix 'red:cba'" % k)
    if k[7:] not in inputs:
        raise Exception("Invalid result '%s'" % k)
    inputs.remove(k[7:])

if inputs:
    # Parenthesize the subtraction: `%` binds tighter than `-`, so the
    # unparenthesized form tried to subtract an int from a str.
    raise Exception("Expected 3 results, got %d" % (3 - len(inputs)))

print("ok")
out.add("red_" + k, "red_" + v) tserver.run_server(data_gen) inputs = ["ape", "cat", "dog"] params = {"test1": "1,2,3",\ "one two three": "dim\ndam\n",\ "dummy": "value"} job = Disco(sys.argv[1]).new_job( name = "test_external", input = tserver.makeurl(inputs), map = external(["ext_test"]), reduce = fun_reduce, ext_params = params, nr_reduces = 1, sort = False) results = sorted([(v, k) for k, v in result_iterator(job.wait())]) for i, e in enumerate(results): v, k = e if k != "red_dkey" or v != "red_test_%s" % inputs[i / 3]: raise Exception("Invalid answer: %s, %s" % (k, v)) if len(results) != 9: raise Exception("Wrong number of results: %u vs. 9" % len(results)) job.purge() print "ok"
raise tserver.FailedReply() else: return str(int(path[2:]) * 10) + "\n" def fun_map(e, params): return [(e, "")] def fun_reduce(iter, out, params): s = 0 for k, v in iter: s += int(k) out.add(s, "") tserver.run_server(data_gen) inputs = ["X1", ["2_fail", "2_still_fail", "X200"], "X3", ["4_fail", "X400"]] job = Disco(sys.argv[1]).new_job( name = "test_redundant", input = tserver.makeurl(inputs), map = fun_map, reduce = fun_reduce, nr_reduces = 1) if result_iterator(job.wait()).next()[0] != "6040": raise Exception("Invalid result: Got %s, expected 6040" % res) job.purge() print "ok"
import sys
import tserver
from disco.core import Disco, result_iterator


def data_gen(path):
    """Return the request path without its first character."""
    return path[1:]


def fun_map(e, params):
    """Pass the integer entry through both extra modules and emit the result."""
    # extramodule1 / extramodule2 are not imported here; the job lists them
    # in required_modules and ships their sources via required_files.
    x = extramodule1.magic(int(e))
    y = extramodule2.kungfu(x)
    return [("", y)]


inputs = ["123"]
tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
    name="test_requiredfiles",
    input=tserver.makeurl(inputs),
    required_files=["extramodule1.py", "extramodule2.py"],
    required_modules=["extramodule1", "extramodule2"],
    map=fun_map)

exp = int(inputs[0]) ** 2 + 2
got = int([y for x, y in result_iterator(job.wait())][0])
if exp != got:
    # A real Exception is required: string exceptions are rejected by
    # Python 2.6+ and would replace this message with a TypeError.
    raise Exception("Wrong result! expected %d, got %d" % (exp, got))

job.purge()
print("ok")