示例#1
0
def pack_kv(e):
    """Serialize an entry as two length-prefixed fields: key, then value.

    ``e`` is either a ``(key, value)`` tuple or a bare value; a bare value
    is packed with an empty key.  Each field is emitted as
    ``struct.pack("I", length)`` followed by the field's bytes.

    The length prefixes are measured on the converted byte strings, not on
    the original objects, so a text field whose encoded form is longer than
    its character count still gets a prefix that matches the payload that
    follows it.  (Assumes ``str_to_bytes`` returns a bytes object -- confirm
    against its definition.)
    """
    if isinstance(e, tuple):
        k, v = e
    else:
        k, v = b'', e
    key_bytes = str_to_bytes(k)
    value_bytes = str_to_bytes(v)
    return (struct.pack("I", len(key_bytes)) + key_bytes
            + struct.pack("I", len(value_bytes)) + value_bytes)
示例#2
0
def pack_kv(e):
    """Serialize an entry as two length-prefixed fields: key, then value.

    ``e`` is either a ``(key, value)`` tuple or a bare value; a bare value
    is packed with an empty key.  Each field is emitted as
    ``struct.pack("I", length)`` followed by the field's bytes.

    The length prefixes are measured on the converted byte strings, not on
    the original objects, so a text field whose encoded form is longer than
    its character count still gets a prefix that matches the payload that
    follows it.  (Assumes ``str_to_bytes`` returns a bytes object -- confirm
    against its definition.)
    """
    if isinstance(e, tuple):
        k, v = e
    else:
        k, v = b'', e
    key_bytes = str_to_bytes(k)
    value_bytes = str_to_bytes(v)
    return (struct.pack("I", len(key_bytes)) + key_bytes
            + struct.pack("I", len(value_bytes)) + value_bytes)
示例#3
0
文件: util.py 项目: JohnEmhoff/disco
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Spool (key, value) pairs to ``filename``, sort the file, yield it back.

    This is a generator, so nothing happens until it is first iterated.
    Every key must be a ``bytes`` object containing neither 0xFF nor 0x00,
    because those two bytes are written as the key and record terminators;
    a ``ValueError`` is raised otherwise.  Each value is written as
    ``encode(pickle_dumps(value, 0))``.  After the spool file is closed it
    is handed to ``unix_sort`` and then read back through ``sort_reader``.

    When ``worker`` is truthy, progress messages are sent to it with
    ``worker.send('MSG', ...)``.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader

    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if any(marker in key for marker in (b'\xff', b'\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII
        spool.write(key + b'\xff')
        pickled = pickle_dumps(value, 0)
        spool.write(encode(pickled) + b'\x00')
    spool.close()

    if worker:
        size_text = format_size(getsize(filename))
        worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    for sorted_key, raw_value in sort_reader(sorted_fd, sorted_fd.url):
        unpickled = pickle_loads(raw_value)
        yield sorted_key, bytes_to_str(decode(str_to_bytes(unpickled)))
示例#4
0
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Spool (key, value) pairs to ``filename``, sort the file, yield it back.

    This is a generator, so nothing happens until it is first iterated.
    Every key must be a ``bytes`` object containing neither 0xFF nor 0x00,
    because those two bytes are written as the key and record terminators;
    a ``ValueError`` is raised otherwise.  Each value is written as
    ``encode(pickle_dumps(value, 0))``.  After the spool file is closed it
    is handed to ``unix_sort`` and then read back through ``sort_reader``.

    When ``worker`` is truthy, progress messages are sent to it with
    ``worker.send('MSG', ...)``.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader

    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if any(marker in key for marker in (b'\xff', b'\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII
        spool.write(key + b'\xff')
        pickled = pickle_dumps(value, 0)
        spool.write(encode(pickled) + b'\x00')
    spool.close()

    if worker:
        size_text = format_size(getsize(filename))
        worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', "Finished sorting")

    sorted_fd = open_local(filename)
    for sorted_key, raw_value in sort_reader(sorted_fd, sorted_fd.url):
        unpickled = pickle_loads(raw_value)
        yield sorted_key, bytes_to_str(decode(str_to_bytes(unpickled)))
示例#5
0
文件: test_save.py 项目: caox/disco
 def test_save_map(self):
     """Run SaveMapJob over ten inputs and check the tag blobs and results.

     The job is fed the test server URLs for ``range(10)``.  The test then
     asserts that the result tag holds one blob per input and that the
     sorted results are ``(b'<n>!', '')`` pairs for every input ``n``.
     """
     numbers = range(10)
     self.job = SaveMapJob().run(input=self.test_server.urls(numbers))
     results = sorted(self.results(self.job))
     self.tag = self.disco.results(self.job.name)[1][0]
     # assertEqual is used instead of assertEquals: the latter is a
     # deprecated alias that newer unittest releases no longer provide.
     self.assertEqual(len(list(self.ddfs.blobs(self.tag))), len(numbers))
     self.assertEqual(results,
                      [(str_to_bytes(str(e) + '!'), '') for e in numbers])
示例#6
0
 def test_save_map(self):
     """Run SaveMapJob over ten inputs and check the tag blobs and results.

     The job is fed the test server URLs for ``range(10)``.  The test then
     asserts that the result tag holds one blob per input and that the
     sorted results are ``(b'<n>!', '')`` pairs for every input ``n``.
     """
     numbers = range(10)
     self.job = SaveMapJob().run(input=self.test_server.urls(numbers))
     results = sorted(self.results(self.job))
     self.tag = self.disco.results(self.job.name)[1][0]
     # assertEqual is used instead of assertEquals: the latter is a
     # deprecated alias that newer unittest releases no longer provide.
     self.assertEqual(len(list(self.ddfs.blobs(self.tag))), len(numbers))
     self.assertEqual(results,
                      [(str_to_bytes(str(e) + '!'), '') for e in numbers])
示例#7
0
 def test_save(self):
     """Chain SaveJob1 into SaveJob2 and verify the second job's results.

     SaveJob1 reads the test server URLs for the duck names and SaveJob2
     takes SaveJob1 as its input.  The sorted results of the second job
     must be ``(b'<duck>!?!?', '')`` for every duck.
     """
     ducks = ['dewey', 'huey', 'louie']
     first, second = SaveJob1(), SaveJob2()
     chain = {first: self.test_server.urls(ducks), second: first}
     self.job = JobChain(chain).wait()
     self.tag = self.disco.results(second)[1][0]
     actual = sorted(self.results(second))
     expected = [(str_to_bytes('{0}!?!?'.format(name)), '')
                 for name in ducks]
     self.assertAllEqual(actual, expected)
示例#8
0
 def test_save(self):
     """Chain SaveJob1 into SaveJob2 and verify the second job's results.

     SaveJob1 reads the test server URLs for the duck names and SaveJob2
     takes SaveJob1 as its input.  The sorted results of the second job
     must be ``(b'<duck>!?!?', '')`` for every duck.
     """
     ducks = ['dewey', 'huey', 'louie']
     first, second = SaveJob1(), SaveJob2()
     chain = {first: self.test_server.urls(ducks), second: first}
     self.job = JobChain(chain).wait()
     self.tag = self.disco.results(second)[1][0]
     actual = sorted(self.results(second))
     expected = [(str_to_bytes('{0}!?!?'.format(name)), '')
                 for name in ducks]
     self.assertAllEqual(actual, expected)
 def test_extreduce(self):
     """Run ExternalJob with an external reduce and check its ten results.

     The expected answer is the sum of the character codes of
     ``'test_<i>\\n'`` concatenated over ``self.inputs``, rendered as a
     decimal string and converted with ``str_to_bytes``.  The job must
     return ten ``(ans, ans)`` pairs.
     """
     self.job = ExternalJob().run(input=self.test_server.urls(self.inputs),
                                  map=lambda e, params: [('', e)],
                                  reduce=external([self.binary]))
     text = ''.join('test_{0}\n'.format(i) for i in self.inputs)
     ans = str_to_bytes(str(sum(map(ord, text))))
     # assertEqual is used instead of assertEquals: the latter is a
     # deprecated alias that newer unittest releases no longer provide.
     self.assertEqual([(ans, ans)] * 10, list(self.results(self.job)))
示例#10
0
def prepare(params, mode):
    """Start the external ``op`` program for ``mode`` and send its params.

    The executable is looked up at ``ext.<mode>/op``, its mode is set to
    ``stat.S_IEXEC``, and it is launched with piped stdin/stdout/stderr.
    The process handle is stored in the module-level ``proc`` and
    ``register_poll()`` is called.

    If ``params`` is a non-empty dict, its ``encode_netstring_fd`` encoding
    is written to the child's stdin; otherwise the line ``0`` is written.

    Returns the module-level object named ``mode`` (``globals()[mode]``).
    """
    global proc
    # op -> worker
    # find required files
    path = os.path.join('ext.{0}'.format(mode), 'op')
    os.chmod(path, stat.S_IEXEC)
    proc = Popen([path, mode], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    register_poll()

    if params and isinstance(params, dict):
        proc.stdin.write(str_to_bytes(encode_netstring_fd(params)))
    else:
        # The pipe is opened in binary mode, so the payload must be bytes
        # (a str literal raises TypeError on Python 3).
        proc.stdin.write(b'0\n')
    return globals()[mode]
示例#11
0
def prepare(params, mode):
    """Start the external ``op`` program for ``mode`` and send its params.

    The executable is looked up at ``ext.<mode>/op``, its mode is set to
    ``stat.S_IEXEC``, and it is launched with piped stdin/stdout/stderr.
    The process handle is stored in the module-level ``proc`` and
    ``register_poll()`` is called.

    If ``params`` is a non-empty dict, its ``encode_netstring_fd`` encoding
    is written to the child's stdin; otherwise the line ``0`` is written.

    Returns the module-level object named ``mode`` (``globals()[mode]``).
    """
    global proc
    # op -> worker
    # find required files
    path = os.path.join('ext.{0}'.format(mode), 'op')
    os.chmod(path, stat.S_IEXEC)
    proc = Popen([path, mode], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    register_poll()

    if params and isinstance(params, dict):
        proc.stdin.write(str_to_bytes(encode_netstring_fd(params)))
    else:
        # The pipe is opened in binary mode, so the payload must be bytes
        # (a str literal raises TypeError on Python 3).
        proc.stdin.write(b'0\n')
    return globals()[mode]
示例#12
0
    def test_save_map(self):
        """Run SaveMapJob over ten inputs and check the sorted results.

        The job is fed the test server URLs for ``range(10)``; the sorted
        results must be ``(b'<n>!', '')`` pairs for every input ``n``.
        """
        numbers = range(10)
        self.job = SaveMapJob().run(input=self.test_server.urls(numbers))
        results = sorted(self.results(self.job))
        self.tag = self.disco.results(self.job.name)[1][0]

        # Previously, each map would save one blob into DDFS.  Now,
        # the pipeline termination does it, using the output of the
        # shuffle stage.  So now, the number of blobs in the tag
        # depends on the grouping used for shuffle, and also the
        # number of nodes used.  Hence, we can no longer assert that
        # the number of blobs in the tag equals the number of inputs.

        # assertEqual is used instead of assertEquals: the latter is a
        # deprecated alias that newer unittest releases no longer provide.
        self.assertEqual(results,
                         [(str_to_bytes(str(e) + '!'), '') for e in numbers])
示例#13
0
    def test_save_map(self):
        """Run SaveMapJob over ten inputs and check the sorted results.

        The job is fed the test server URLs for ``range(10)``; the sorted
        results must be ``(b'<n>!', '')`` pairs for every input ``n``.
        """
        numbers = range(10)
        self.job = SaveMapJob().run(input=self.test_server.urls(numbers))
        results = sorted(self.results(self.job))
        self.tag = self.disco.results(self.job.name)[1][0]

        # Previously, each map would save one blob into DDFS.  Now,
        # the pipeline termination does it, using the output of the
        # shuffle stage.  So now, the number of blobs in the tag
        # depends on the grouping used for shuffle, and also the
        # number of nodes used.  Hence, we can no longer assert that
        # the number of blobs in the tag equals the number of inputs.

        # assertEqual is used instead of assertEquals: the latter is a
        # deprecated alias that newer unittest releases no longer provide.
        self.assertEqual(results,
                         [(str_to_bytes(str(e) + '!'), '') for e in numbers])
def read(interface, state, label, inp):
    """Read CSV tables named by the input URLs and hash-partition their rows.

    For each URL in ``inp`` the network-location part is split on ``?``
    into a file name (opened relative to ``PREFIX``) and a join-column
    index.  The first CSV row supplies the table name in its first field;
    every later row is added to ``interface.output(h)`` under the key
    ``'<table>?<column>'``, where ``h`` is the MD5 of the join-column
    value reduced modulo 160.

    Each file is opened in a ``with`` block so that it is closed as soon
    as its rows have been processed, including when a row raises.
    """
    from disco import util
    for e in inp:
        scheme, netloc, _ = util.urlsplit(e)
        fileName, joinColumn = str(netloc).split('?')
        with open(PREFIX + fileName, 'r') as table_file:
            col = int(joinColumn)
            reader = csv.reader(table_file)
            header = next(reader, None)
            if header is None:
                # Empty file: no header and therefore no rows to emit.
                continue
            # The output key is the same for every row of this table.
            fullName = header[0] + '?' + str(col)
            for row in reader:
                digest = hashlib.md5(str_to_bytes(row[col])).hexdigest()
                Hash = int(digest, 16) % 160
                interface.output(Hash).add(fullName, row)
示例#15
0
def read(interface, state, label, inp):
    """Read CSV tables named by the input URLs and hash-partition their rows.

    For each URL in ``inp`` the network-location part is split on ``?``
    into a file name (opened relative to ``PREFIX``) and a join-column
    index.  The first CSV row supplies the table name in its first field;
    every later row is added to ``interface.output(h)`` under the key
    ``'<table>?<column>'``, where ``h`` is the MD5 of the join-column
    value reduced modulo 160.

    Each file is opened in a ``with`` block so that it is closed as soon
    as its rows have been processed, including when a row raises.
    """
    from disco import util

    for e in inp:
        scheme, netloc, _ = util.urlsplit(e)
        fileName, joinColumn = str(netloc).split("?")
        with open(PREFIX + fileName, "r") as table_file:
            col = int(joinColumn)
            reader = csv.reader(table_file)
            header = next(reader, None)
            if header is None:
                # Empty file: no header and therefore no rows to emit.
                continue
            # The output key is the same for every row of this table.
            fullName = header[0] + "?" + str(col)
            for row in reader:
                digest = hashlib.md5(str_to_bytes(row[col])).hexdigest()
                Hash = int(digest, 16) % 160
                interface.output(Hash).add(fullName, row)
示例#16
0
 def runTest(self):
     """Run RawJob on the test inputs and compare the sorted results.

     Every input ``i`` is expected to come back as ``(str_to_bytes(i), '')``.
     """
     self.job = RawJob().run(input=self.test_server.urls(self.input))
     actual = sorted(self.results(self.job))
     expected = sorted((str_to_bytes(item), '') for item in self.input)
     self.assertEqual(actual, expected)
示例#17
0
文件: test.py 项目: AlexArgus/disco
 def send_data(self, data):
     """Send an OK response whose body is ``data`` converted to bytes.

     The body is converted once, up front, and ``Content-length`` is
     computed from the converted bytes so that the header always matches
     what is written (the character count of a text body can differ from
     its encoded length).  Falsy ``data`` (``None``, ``''``) produces an
     empty body.  Assumes ``str_to_bytes`` returns a bytes object --
     confirm against its definition.
     """
     payload = str_to_bytes(data) if data else b''
     self.send_response(OK)
     self.send_header('Content-length', len(payload))
     self.end_headers()
     self.wfile.write(payload)
示例#18
0
 def ansi_text(self, text, bgcolor=WHITE, fgcolor=BLACK):
     """Return ``text`` as bytes, prefixed by background and foreground codes.

     The prefix is ``self.background(bgcolor)`` followed by
     ``self.foreground(fgcolor)``.
     """
     background = self.background(bgcolor)
     foreground = self.foreground(fgcolor)
     body = str_to_bytes(text)
     return background + foreground + body
示例#19
0
 def runTest(self):
     """Run RawJob on the test inputs and compare the sorted results.

     Every input ``i`` is expected to come back as ``(str_to_bytes(i), '')``.
     """
     self.job = RawJob().run(input=self.test_server.urls(self.input))
     actual = sorted(self.results(self.job))
     expected = sorted((str_to_bytes(item), '') for item in self.input)
     self.assertEqual(actual, expected)
 def map(e, params):
     """Call ``put`` ten times for entry ``e`` and emit no records.

     For each index 0..9 the key is ``'<e>-<index>'`` and the value is the
     byte form of ``'val:<e>-<index>'``.
     """
     for index in range(10):
         key = '{0}-{1}'.format(e, index)
         value = str_to_bytes('val:{0}-{1}'.format(e, index))
         put(key, value)
     return []
 def map(e, params):
     """Store ``'value:<key>'`` under the entry's key via ``put`` and yield it.

     The key is ``bytes_to_str(e)``; the single yielded record is the same
     ``(key, value)`` pair that was passed to ``put``.
     """
     key = bytes_to_str(e)
     payload = str_to_bytes('value:{0}'.format(key))
     put(key, payload)
     yield key, payload
示例#22
0
 def ansi_text(self, text, bgcolor=WHITE, fgcolor=BLACK):
     """Return ``text`` as bytes, prefixed by background and foreground codes.

     The prefix is ``self.background(bgcolor)`` followed by
     ``self.foreground(fgcolor)``.
     """
     background = self.background(bgcolor)
     foreground = self.foreground(fgcolor)
     body = str_to_bytes(text)
     return background + foreground + body
示例#23
0
def Map(interface, state, label, inp):
    """Emit a base64-encoded key for every character of each input, x10.

    Each input is converted to bytes, repeated ten times and converted
    back to text; every character of that text is base64-encoded and
    paired with an empty value.  The pairs are passed through ``shuffled``
    and added to output 0.
    """
    # base64.encodestring was removed in Python 3.9; encodebytes is its
    # replacement.  Fall back to the old name where encodebytes is absent.
    b64_encode = getattr(base64, 'encodebytes', None) or base64.encodestring
    out = interface.output(0)
    for i in inp:
        repeated = bytes_to_str(str_to_bytes(i) * 10)
        for k, v in shuffled((b64_encode(str_to_bytes(c)), b'')
                             for c in repeated):
            out.add(k, v)
示例#24
0
文件: test_config.py 项目: yuj/disco
 def checkAnswers(self, job, input):
     """Assert that ``job`` returned ``(str_to_bytes(str(i)), '')`` per input.

     Both the actual results and the expected pairs are sorted before the
     comparison, so result order does not matter.
     """
     # assertEqual is used instead of assertEquals: the latter is a
     # deprecated alias that newer unittest releases no longer provide.
     self.assertEqual(sorted(self.results(job)),
                      sorted((str_to_bytes(str(i)), '') for i in input))
示例#25
0
def Map(interface, state, label, inp):
    """Emit a base64-encoded key for every character of each input, x10.

    Each input is converted to bytes, repeated ten times and converted
    back to text; every character of that text is base64-encoded and
    paired with an empty value.  The pairs are passed through ``shuffled``
    and added to output 0.
    """
    # base64.encodestring was removed in Python 3.9; encodebytes is its
    # replacement.  Fall back to the old name where encodebytes is absent.
    b64_encode = getattr(base64, 'encodebytes', None) or base64.encodestring
    out = interface.output(0)
    for i in inp:
        repeated = bytes_to_str(str_to_bytes(i) * 10)
        for k, v in shuffled((b64_encode(str_to_bytes(c)), b'')
                             for c in repeated):
            out.add(k, v)
示例#26
0
 def runTest(self):
     """Run SortJob on 100 empty inputs and check the per-character counts.

     The job's results are read through once, then ``assertResults`` is
     given the sorted ``(str_to_bytes(c), 1000)`` pairs for every
     character ``c`` in ``alphanum``.
     """
     self.job = SortJob().run(input=self.test_server.urls([''] * 100))
     # Read the results through once before asserting; the list is unused.
     _drained = list(self.results(self.job))
     expected = sorted((str_to_bytes(c), 1000) for c in alphanum)
     self.assertResults(self.job, expected)
 def map(string, params):
     """Return shuffled ``(base64(char), b'')`` pairs for ``string`` x10.

     ``string`` is repeated ten times and converted to text; each
     character is converted to bytes and base64-encoded to form the key.
     """
     # base64.encodestring was removed in Python 3.9; encodebytes is its
     # replacement.  Fall back to the old name where encodebytes is absent.
     b64_encode = getattr(base64, 'encodebytes', None) or base64.encodestring
     return shuffled((b64_encode(str_to_bytes(c)), b'')
                     for c in bytes_to_str(string * 10))
示例#28
0
def Map(interface, state, label, inp):
    """Add every input, converted to bytes, to output 0 with a two-NUL value."""
    sink = interface.output(0)
    for item in inp:
        key = str_to_bytes(item)
        sink.add(key, u'\x00\x00')
示例#29
0
def Reduce(interface, state, label, inp):
    """Add one ``(key bytes, 0)`` record to output 0 per ``kvgroup`` group.

    The grouped values themselves are ignored.
    """
    sink = interface.output(0)
    for key, _values in kvgroup(inp):
        sink.add(str_to_bytes(key), 0)
示例#30
0
文件: fileutils.py 项目: yuj/disco
 def add(self, k, v):
     """Write one ``'<len> <key> <len> <value>\\n'`` record to the stream.

     Key and value are first rendered with ``str()``; the record is then
     converted with ``str_to_bytes`` and written to ``self.stream``.

     NOTE(review): the lengths are character counts of the ``str()`` forms,
     taken before the byte conversion -- confirm non-ASCII input is not
     expected here.
     """
     key_text = str(k)
     value_text = str(v)
     record = "%d %s %d %s\n" % (len(key_text), key_text,
                                 len(value_text), value_text)
     self.stream.write(str_to_bytes(record))
示例#31
0
文件: comm.py 项目: Cheng--Li/disco
 def read(self):
     """Return a ``read`` callable for this object's source.

     When ``self.isopen`` is true the source itself is treated as the
     content and wrapped in an in-memory ``BytesIO``; otherwise the source
     is opened as a file path in binary mode.  The bound ``read`` method of
     the resulting stream is returned, not the data.
     """
     if not self.isopen:
         return open(self.source, 'rb').read
     buffer = BytesIO(str_to_bytes(self.source))
     return buffer.read
示例#32
0
文件: test_oob.py 项目: caox/disco
 def map(e, params):
     """Call ``put`` ten times for entry ``e`` and emit no records.

     For each index 0..9 the key is ``'<e>-<index>'`` and the value is the
     byte form of ``'val:<e>-<index>'``.
     """
     for index in range(10):
         key = '{0}-{1}'.format(e, index)
         value = str_to_bytes('val:{0}-{1}'.format(e, index))
         put(key, value)
     return []
示例#33
0
文件: test.py 项目: yuj/disco
 def send_data(self, data):
     """Send an OK response whose body is ``data`` converted to bytes.

     The body is converted once, up front, and ``Content-length`` is
     computed from the converted bytes so that the header always matches
     what is written (the character count of a text body can differ from
     its encoded length).  Falsy ``data`` (``None``, ``''``) produces an
     empty body.  Assumes ``str_to_bytes`` returns a bytes object --
     confirm against its definition.
     """
     payload = str_to_bytes(data) if data else b''
     self.send_response(OK)
     self.send_header('Content-length', len(payload))
     self.end_headers()
     self.wfile.write(payload)
示例#34
0
文件: test_oob.py 项目: caox/disco
 def reduce(iter, params):
     """Check every value against ``get(key)``, store a marker, yield a status.

     After all pairs have been checked, ``'value:reduce:<partition>'`` is
     stored via ``put`` under the key ``'reduce:<partition>'``, and the
     single record ``('all', 'ok')`` is yielded.
     """
     for key, value in iter:
         assert value == get(key)
     marker = 'reduce:{0}'.format(this_partition())
     marker_value = str_to_bytes('value:{0}'.format(marker))
     put(marker, marker_value)
     yield 'all', 'ok'
 def reduce(iter, params):
     """Check every value against ``get(key)``, store a marker, yield a status.

     After all pairs have been checked, ``'value:reduce:<partition>'`` is
     stored via ``put`` under the key ``'reduce:<partition>'``, and the
     single record ``('all', 'ok')`` is yielded.
     """
     for key, value in iter:
         assert value == get(key)
     marker = 'reduce:{0}'.format(this_partition())
     marker_value = str_to_bytes('value:{0}'.format(marker))
     put(marker, marker_value)
     yield 'all', 'ok'
def Map(interface, state, label, inp):
    """Add every input, converted to bytes, to output 0 with a two-NUL value."""
    sink = interface.output(0)
    for item in inp:
        key = str_to_bytes(item)
        sink.add(key, u'\x00\x00')
示例#37
0
文件: test_oob.py 项目: caox/disco
 def map(e, params):
     """Store ``'value:<key>'`` under the entry's key via ``put`` and yield it.

     The key is ``bytes_to_str(e)``; the single yielded record is the same
     ``(key, value)`` pair that was passed to ``put``.
     """
     key = bytes_to_str(e)
     payload = str_to_bytes('value:{0}'.format(key))
     put(key, payload)
     yield key, payload
def Reduce(interface, state, label, inp):
    """Add one ``(key bytes, 0)`` record to output 0 per ``kvgroup`` group.

    The grouped values themselves are ignored.
    """
    sink = interface.output(0)
    for key, _values in kvgroup(inp):
        sink.add(str_to_bytes(key), 0)
示例#39
0
 def test_extreduce(self):
     """Run ExternalJob with an external reduce and check its ten results.

     The expected answer is the sum of the character codes of
     ``'test_<i>\\n'`` concatenated over ``self.inputs``, rendered as a
     decimal string and converted with ``str_to_bytes``.  The job must
     return ten ``(ans, ans)`` pairs.
     """
     self.job = ExternalJob().run(input=self.test_server.urls(self.inputs),
                                  map=lambda e, params: [('', e)],
                                  reduce=external([self.binary]))
     text = ''.join('test_{0}\n'.format(i) for i in self.inputs)
     ans = str_to_bytes(str(sum(map(ord, text))))
     # assertEqual is used instead of assertEquals: the latter is a
     # deprecated alias that newer unittest releases no longer provide.
     self.assertEqual([(ans, ans)] * 10, list(self.results(self.job)))
示例#40
0
 def runTest(self):
     """Run SortJob on 100 empty inputs and check the per-character counts.

     The job's results are read through once, then ``assertResults`` is
     given the sorted ``(str_to_bytes(c), 1000)`` pairs for every
     character ``c`` in ``alphanum``.
     """
     self.job = SortJob().run(input=self.test_server.urls([''] * 100))
     # Read the results through once before asserting; the list is unused.
     _drained = list(self.results(self.job))
     expected = sorted((str_to_bytes(c), 1000) for c in alphanum)
     self.assertResults(self.job, expected)
示例#41
0
 def add(self, k, v):
     """Write one ``'<len> <key> <len> <value>\\n'`` record to the stream.

     Key and value are first rendered with ``str()``; the record is then
     converted with ``str_to_bytes`` and written to ``self.stream``.

     NOTE(review): the lengths are character counts of the ``str()`` forms,
     taken before the byte conversion -- confirm non-ASCII input is not
     expected here.
     """
     key_text = str(k)
     value_text = str(v)
     record = "%d %s %d %s\n" % (len(key_text), key_text,
                                 len(value_text), value_text)
     self.stream.write(str_to_bytes(record))
示例#42
0
 def map(string, params):
     """Return shuffled ``(base64(char), b'')`` pairs for ``string`` x10.

     ``string`` is repeated ten times and converted to text; each
     character is converted to bytes and base64-encoded to form the key.
     """
     # base64.encodestring was removed in Python 3.9; encodebytes is its
     # replacement.  Fall back to the old name where encodebytes is absent.
     b64_encode = getattr(base64, 'encodebytes', None) or base64.encodestring
     return shuffled((b64_encode(str_to_bytes(c)), b'')
                     for c in bytes_to_str(string * 10))
示例#43
0
 def read(self):
     """Return a ``read`` callable for this object's source.

     When ``self.isopen`` is true the source itself is treated as the
     content and wrapped in an in-memory ``BytesIO``; otherwise the source
     is opened as a file path in binary mode.  The bound ``read`` method of
     the resulting stream is returned, not the data.
     """
     if not self.isopen:
         return open(self.source, 'rb').read
     buffer = BytesIO(str_to_bytes(self.source))
     return buffer.read
示例#44
0
 def checkAnswers(self, job, input):
     """Assert that ``job`` returned ``(str_to_bytes(str(i)), '')`` per input.

     Both the actual results and the expected pairs are sorted before the
     comparison, so result order does not matter.
     """
     # assertEqual is used instead of assertEquals: the latter is a
     # deprecated alias that newer unittest releases no longer provide.
     self.assertEqual(sorted(self.results(job)),
                      sorted((str_to_bytes(str(i)), '') for i in input))
def getHash(line):
    """Return the MD5 of ``line`` (as bytes), read as an integer, modulo 128."""
    hex_digest = hashlib.md5(str_to_bytes(line)).hexdigest()
    as_integer = int(hex_digest, 16)
    return as_integer % 128
示例#46
0
def getHash(line):
    """Return the MD5 of ``line`` (as bytes), read as an integer, modulo 128."""
    hex_digest = hashlib.md5(str_to_bytes(line)).hexdigest()
    as_integer = int(hex_digest, 16)
    return as_integer % 128