def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Externally sort (key, value) pairs on disk and re-yield them in key order.

    Each pair is spooled to *filename* as a ``key 0xFF encoded-pickle 0x00``
    record, the file is sorted with unix sort, and the sorted records are
    read back and yielded.

    :param worker: progress sink with a ``send(type, payload)`` method, or
                   None to sort silently.
    :param input: iterable of ``(key, value)`` pairs; keys must be bytes and
                  must not contain the 0xFF / 0x00 delimiter bytes.
    :param filename: path of the on-disk sort file.
    :param sort_buffer_size: buffer size handed to unix sort (e.g. '10%').
    :raises ValueError: for a non-bytes key, or a key containing 0xFF or 0x00.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    # format_size was used below but never imported in this self-contained
    # function (the sibling variant imports it from disco.util).
    from disco.util import format_size
    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(encode(pickle_dumps(value, 0)) + b'\x00')
    out_fd.close()
    if worker:
        worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', ("Finished sorting"))
    fd = open_local(filename)
    for k, v in sort_reader(fd, fd.url):
        # Invert the write path: values were stored as encode(pickle_dumps(v, 0)),
        # so decode first, then unpickle.  The original applied pickle_loads to
        # the still-encoded bytes and then stringified the unpickled object.
        # NOTE(review): assumes sort_reader yields bytes values — confirm; if it
        # yields str, wrap v in str_to_bytes before decode.
        yield k, pickle_loads(decode(v))
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """Sort (key, value) pairs on disk via unix sort and yield them back in order.

    Records are spooled to *filename* delimited by 0xFF (after the key) and
    0x00 (after the protocol-0 pickled value), sorted in place, then parsed
    back out with a regex reader.  Progress is reported through *worker*.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile
    worker.send('MSG', "Downloading {0}".format(filename))
    spool = AtomicFile(filename)
    for k, v in input:
        # Guard clauses: only byte keys free of the delimiter bytes are sortable.
        if not isinstance(k, bytes):
            raise ValueError("Keys must be bytes for external sort", k)
        if b'\xff' in k or b'\x00' in k:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", k)
        # value pickled using protocol 0 will always be printable ASCII
        spool.write(k + b'\xff' + pickle_dumps(v, 0) + b'\x00')
    spool.close()
    worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', ("Finished sorting"))
    sorted_fd = open_local(filename)
    for key, value in re_reader(b"(?s)(.*?)\xff(.*?)\x00", sorted_fd, len(sorted_fd), sorted_fd.url):
        yield key, pickle_loads(value)
def append(self, record):
    """Pickle *record* onto the current hunk, flushing once the hunk grows past the minimum size."""
    payload = pickle_dumps(record, 1)
    self.hunk_write(payload)
    if self.min_hunk_size < self.hunk_size:
        self.flush()
def transform(self, s):
    """Assert that encode/decode round-trips the protocol-2 pickle of *s* unchanged."""
    original = pickle_dumps(s, 2)
    round_tripped = decode(encode(original))
    self.assertEqual(original, round_tripped)