예제 #1
0
파일: job.py 프로젝트: sqs/freequery
class IndexJob(object):
    """Run a TF-IDF indexing job on Disco and publish the result via discodex.

    ``start`` creates the "index_tfidf" job, waits for it, hands the results
    to ``discodex.index`` and finally clones the produced index under
    ``spec.invindex_name``.
    """

    def __init__(self, spec, discodex, disco_addr="disco://localhost", profile=False):
        # TODO(sqs): refactoring potential with PagerankJob
        # NOTE(review): ``disco_addr`` is accepted but never read; the master
        # address is taken from DiscoSettings()["DISCO_MASTER"] -- confirm.
        self.profile = profile
        self.nr_partitions = 8
        self.spec = spec
        self.discodex = discodex
        self.docset = Docset(spec.docset_name)
        self.disco = Disco(DiscoSettings()["DISCO_MASTER"])

    def start(self):
        """Create the index job, wait for it, then index its results."""
        job = self.__index_job()
        job_results = self.__run_job(job)
        self.__run_discodex_index(job_results)

    def __run_job(self, job):
        """Wait for ``job`` and return its results.

        When ``self.profile`` is truthy, ``self.__profile_job(job)`` is called
        after the wait (that method is not defined in the visible class).
        """
        finished = job.wait()
        if not self.profile:
            return finished
        self.__profile_job(job)
        return finished

    def __index_job(self):
        """Create the "index_tfidf" job through ``self.disco.new_job``."""
        tag_url = "tag://" + self.docset.ddfs_tag
        job_params = dict(doc_count=self.docset.doc_count)
        return self.disco.new_job(
            name="index_tfidf",
            input=[tag_url],
            map_reader=docparse,
            map=TfIdf.map,
            reduce=TfIdf.reduce,
            sort=True,
            partitions=self.nr_partitions,
            partition=TfIdf.partition,
            merge_partitions=False,
            profile=self.profile,
            params=job_params,
        )

    def __run_discodex_index(self, results):
        """Index ``results`` with discodex and clone it to the spec's name."""
        index_options = {
            "nr_ichunks": 1,  # TODO(sqs): after disco#181 fixed, increase this
            "demuxer": "freequery.index.tf_idf.TfIdf_demux",
            "parser": "disco.func.chain_reader",
        }
        dataset = DataSet(input=results, options=index_options)
        index_name = self.discodex.index(dataset)
        # The name returned by discodex is also the disco job name.
        self.disco.wait(index_name)
        self.discodex.clone(index_name, self.spec.invindex_name)
예제 #2
0
def data_gen(path):
    """Return ``path`` minus its first character, repeated on ten lines."""
    entry = path[1:]
    return "\n".join(entry for _ in range(10))


def fun_map(e, params):
    """Emit a single pair whose key is ``"=" + e`` and whose value is ``e``."""
    key = "=" + e
    return [(key, e)]


def fun_reduce(iter, out, params):
    """Multiply all values together and add the product under "result".

    Every key must equal ``"=" + value``; otherwise an ``Exception`` with the
    message "Corrupted key" is raised and nothing is added to ``out``.
    """
    product = 1
    for key, value in iter:
        if key == "=" + value:
            product *= int(value)
            continue
        raise Exception("Corrupted key")
    out.add("result", product)


# Start the test server with data_gen before creating the job.
tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
job = Disco(sys.argv[1]).new_job(
    name="test_simple",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    sort=False,
)

# The job must yield exactly one pair: ("result", ANS).
if list(result_iterator(job.wait())) != [("result", ANS)]:
    raise Exception("Invalid answer")

job.purge()
print("ok")
예제 #3
0
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from disco.core import Disco, result_iterator
from disco.util import external

# Executables handed to external() for the map and reduce phases.
ext_map_exec = "java_map.sh"
ext_reduce_exec = "java_reduce.sh"
# Class names forwarded to the job through ext_params.
map_class = "rmaus.disco.external.sample.WordCountMap"
reduce_class = "rmaus.disco.external.sample.WordCountReduce"

job = Disco("http://discomaster-dr-01:8989").new_job(
    name="java_wordcount",
    input=["raw://foo", "raw://bar", "raw://foo"],
    ext_params={
        "mapFunction": map_class,
        "reduceFunction": reduce_class,
        "testKey": "testValue",
    },
    map=external([ext_map_exec]),
    reduce=external([ext_reduce_exec]),
)

results = job.wait(show=True)

# Print the result pairs ordered by their second element.
for result in sorted(result_iterator(results), key=lambda pair: pair[1]):
    print(result)
예제 #4
0
                out.add(k, v)

# Start the test server, then run a profiled job over 100 empty inputs.
tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
    name="test_profile",
    input=tserver.makeurl([""] * 100),
    map=really_unique_function_name,
    reduce=fun_reduce,
    nr_reduces=30,
    sort=False,
    profile=True,
)

# Expected value for each result key.
ANS = {"gutta": int(1e4), "cavat": int(2e4), "capidem": int(1e4)}
i = 0
for key, value in result_iterator(job.wait()):
    i += 1
    if ANS[key] == int(value):
        print("Correct: %s %s" % (key, value))
    else:
        # A bare string is not a valid exception object; raising one yields
        # a TypeError that hides this message, so wrap it in Exception.
        raise Exception(
            "Results don't match (%s): Got %d expected %d"
            % (key, int(value), ANS[key])
        )
if i != 3:
    raise Exception("Too few results")

# Send the printed profile statistics to an in-memory buffer; always restore
# stdout, even when fetching or printing the statistics fails.
buf = cStringIO.StringIO()
sys.stdout = buf
try:
    job.profile_stats().print_stats()
finally:
    sys.stdout = sys.__stdout__

#stats = job.profile_stats()
예제 #5
0
import sys
from disco.core import Disco, result_iterator

def fun_map(e, params):
    """Send three numbered marker messages through ``msg``; emit ``(e, "")``."""
    for n in (0, 1, 2):
        msg("--special_test_string_%d--" % n)
    return [(e, "")]

inputs = ["raw://discoapi"]

job = Disco(sys.argv[1]).new_job(
    name="test_discoapi",
    input=inputs,
    map=fun_map,
)

# The single raw input must come back as one (key, "") pair.
r = list(result_iterator(job.wait()))
if r != [("discoapi", "")]:
    raise Exception("Invalid result: <%s> " % r)

# The job name reported by jobspec() must begin with the requested name.
n = job.jobspec()["name"]
if not n.startswith("test_discoapi"):
    raise Exception(
        "Invalid jobspec: Expected name prefix test_discoapi, got %s" % n
    )

# Third field of every event record returned by job.events().
events = [ev[2] for offs, ev in job.events()]

# Each marker message sent by fun_map must appear in at least one event.
for i in range(3):
    m = "--special_test_string_%d--" % i
    if not any(m in x for x in events):
        raise Exception("Message '%s' not found in events" % m)

job.purge()
예제 #6
0
# Start the test server before creating the job.
tserver.run_server(data_gen)

inputs = ["01/11/1965", "14/03/1983", "12/12/2002"]

job = Disco(sys.argv[1]).new_job(
    name="test_objectrw",
    input=tserver.makeurl(inputs),
    map=fun_map,
    map_writer=func.object_writer,
    reduce=fun_reduce,
    reduce_reader=func.object_reader,
    reduce_writer=func.object_writer,
    required_modules=["math", "datetime", "time"],
    nr_reduces=1,
    sort=False,
)

# Failures are raised as Exception instances: a bare string is not a valid
# exception object and would surface as a TypeError without the message.
i = 0
for k, v in result_iterator(job.wait(), reader=func.object_reader):
    if k["PI2"] != math.pi:
        raise Exception("Invalid key: %s" % (k,))
    if v.strftime("%d/%m/%Y") not in inputs:
        raise Exception("Invalid value: %s" % (v,))
    i += 1

if i != 30:
    raise Exception("Wrong number of results, got %d, expected 30" % i)

job.purge()

print("ok")
예제 #7
0
tserver.run_server(data_gen)
inputs = tserver.makeurl([1])

# For each of the first three jobs: create it, wait five seconds, then pass
# it to check_dead (defined elsewhere in the file).
for job_name, map_fn, extra in (
    ("test_ratelimit", fun_map, {}),
    ("test_ratelimit2", fun_map2, {"status_interval": 1}),
    ("test_ratelimit3", fun_map3, {"status_interval": 1}),
):
    job = Disco(sys.argv[1]).new_job(name=job_name, input=inputs, map=map_fn, **extra)
    time.sleep(5)
    check_dead(job)

# The last job uses status_interval=0 and is simply waited for and purged.
job = Disco(sys.argv[1]).new_job(
    name="test_ratelimit4",
    input=inputs,
    map=fun_map2,
    status_interval=0,
)
job.wait()
job.purge()

print("ok")



예제 #8
0
# Entries whose next request must fail; each is removed once it has failed.
fail = ["1", "2", "3"]


def data_gen(path):
    """Serve ``int(path[1:]) * 10`` as a line, failing once per ``fail`` entry.

    The first request for an entry still listed in ``fail`` removes it from
    the list and raises ``tserver.FailedReply``; any other request returns
    the entry multiplied by ten, followed by a newline.

    Raises:
        tserver.FailedReply: when the entry is still listed in ``fail``.
        ValueError: when the entry is not an integer literal.
    """
    e = path[1:]
    # ``with`` releases the lock on every exit path, including exceptions;
    # assumes ``lock`` supports the context-manager protocol, as
    # threading.Lock does.
    with lock:
        must_fail = e in fail
        if must_fail:
            fail.remove(e)
    if must_fail:
        raise tserver.FailedReply()
    return str(int(e) * 10) + "\n"

def fun_map(e, params):
    """Emit one pair: ``int(e) * 10`` as the key and ``""`` as the value."""
    scaled = int(e) * 10
    return [(scaled, "")]

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
    name="test_tempfail",
    input=tserver.makeurl([str(n) for n in range(10)]),
    map=fun_map,
)

# Add up the integer value of every result key.
res = 0
for x, y in result_iterator(job.wait()):
    res += int(x)
if res != 4500:
    raise Exception("Invalid result: Got %d, expected 4500" % res)

job.purge()
print("ok")

예제 #9
0
        s = 1
        for k, v in iter:
                if k != "=" + v:
                        raise Exception("Corrupted key")
                s *= int(v)
        out.add("result", s)

# Start the test server before creating the job.
tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]

job = Disco(sys.argv[1]).new_job(
    name="test_writers",
    input=tserver.makeurl(inputs),
    map=fun_map,
    map_writer=fun_map_writer,
    reduce=fun_reduce,
    reduce_reader=fun_reduce_reader,
    reduce_writer=fun_reduce_writer,
    nr_reduces=1,
    sort=False,
)

# Read the results back with result_reader; exactly one entry, ANS, is expected.
res = list(result_iterator(job.wait(), reader=result_reader))

if res != [ANS]:
    raise Exception("Invalid answer: %s" % res)

job.purge()

print("ok")
예제 #10
0
import sys
from disco.core import Disco, result_iterator

def fun_map(e, params):
    """Emit one pair with an empty key and ``e`` suffixed by ":map"."""
    tagged = e + ":map"
    return [("", tagged)]

inputs = ["raw://eeny", "raw://meeny", "raw://miny", "raw://moe"]

job = Disco(sys.argv[1]).new_job(
    name="test_raw",
    input=inputs,
    map=fun_map,
)

# Expected values: each input without its 6-character "raw://" prefix, with
# ":map" appended. Entries are deleted as they are seen, so leftovers are
# results that never arrived.
res = dict((x[6:] + ":map", True) for x in inputs)

for x in result_iterator(job.wait()):
    if x[1] not in res:
        # Raise a real Exception: a bare string is not a valid exception
        # object and would surface as a TypeError without the message.
        raise Exception("Invalid result: <%s> " % x[1])
    del res[x[1]]

if res:
    raise Exception("Invalid number of results %d" % (len(inputs) - len(res)))

job.purge()

print("ok")

예제 #11
0
파일: test_streams.py 프로젝트: davin/disco
# Start the test server before creating the job.
tserver.run_server(data_gen)

inputs = ["apple", "orange", "pear"]

job = Disco(sys.argv[1]).new_job(
    name="test_streams",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    map_reader=map_reader,
    map_input_stream=[map_input_stream, map_input1, map_input2, map_input3],
    reduce_output_stream=[reduce_output1, reduce_output2],
)

# Every key must be "red:cba" followed by one of the inputs; each input is
# removed once matched so leftovers indicate missing results.
for k, v in result_iterator(
    job.wait(), input_stream=[resultiter_input1, map_input_stream]
):
    if not k.startswith("red:cba"):
        raise Exception(
            "Invalid prefix in key. Got '%s' expected prefix 'red:cba'" % k
        )

    if k[7:] not in inputs:
        raise Exception("Invalid result '%s'" % k)
    inputs.remove(k[7:])

if inputs:
    # Parenthesised: "%" binds tighter than "-", so the unparenthesised form
    # formatted 3 and then tried to subtract an int from the string.
    raise Exception("Expected 3 results, got %d" % (3 - len(inputs)))

print("ok")
예제 #12
0
                out.add("red_" + k, "red_" + v)
        
# Start the test server before creating the job.
tserver.run_server(data_gen)

inputs = ["ape", "cat", "dog"]
params = {
    "test1": "1,2,3",
    "one two three": "dim\ndam\n",
    "dummy": "value",
}

job = Disco(sys.argv[1]).new_job(
    name="test_external",
    input=tserver.makeurl(inputs),
    map=external(["ext_test"]),
    reduce=fun_reduce,
    ext_params=params,
    nr_reduces=1,
    sort=False,
)

# Sort by value so that results line up with inputs in groups of three.
results = sorted([(v, k) for k, v in result_iterator(job.wait())])
for i, e in enumerate(results):
    v, k = e
    # Floor division keeps the index an int even under true division
    # (Python 3 or "from __future__ import division").
    if k != "red_dkey" or v != "red_test_%s" % inputs[i // 3]:
        raise Exception("Invalid answer: %s, %s" % (k, v))

if len(results) != 9:
    raise Exception("Wrong number of results: %u vs. 9" % len(results))

job.purge()

print("ok")
예제 #13
0
                raise tserver.FailedReply()
        else:
                return str(int(path[2:]) * 10) + "\n"

def fun_map(e, params):
    """Emit the entry unchanged as a key, paired with an empty value."""
    pair = (e, "")
    return [pair]

def fun_reduce(iter, out, params):
    """Add the integer sum of all keys to ``out``, paired with ``""``."""
    total = sum(int(key) for key, _value in iter)
    out.add(total, "")

# Start the test server before creating the job.
tserver.run_server(data_gen)

# Nested lists are passed to makeurl as a single input entry each.
inputs = ["X1", ["2_fail", "2_still_fail", "X200"], "X3", ["4_fail", "X400"]]

job = Disco(sys.argv[1]).new_job(
    name="test_redundant",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
)

# Bind the first result key to a name so the error message can report it;
# the message previously referenced an undefined ``res`` and raised a
# NameError instead of the intended failure.
res = next(result_iterator(job.wait()))[0]
if res != "6040":
    raise Exception("Invalid result: Got %s, expected 6040" % res)

job.purge()
print("ok")

예제 #14
0
import sys, tserver
from disco.core import Disco, result_iterator

def data_gen(path):
    """Return ``path`` without its first character as the served data."""
    entry = path[1:]
    return "\n".join((entry,))

def fun_map(e, params):
    """Emit ``("", extramodule2.kungfu(extramodule1.magic(int(e))))``."""
    number = int(e)
    transformed = extramodule2.kungfu(extramodule1.magic(number))
    return [("", transformed)]

inputs = ["123"]

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
    name="test_requiredfiles",
    input=tserver.makeurl(inputs),
    required_files=["extramodule1.py", "extramodule2.py"],
    required_modules=["extramodule1", "extramodule2"],
    map=fun_map,
)

# Expected value for the single input; compared with the first result value.
exp = int(inputs[0]) ** 2 + 2
got = int([y for x, y in result_iterator(job.wait())][0])
if exp != got:
    # Raise a real Exception: a bare string is not a valid exception object
    # and would surface as a TypeError without the message.
    raise Exception("Wrong result! expected %d, got %d" % (exp, got))

job.purge()

print("ok")