def download_and_sort(self, params):
    """Spool all reduce inputs to disk, sort them externally, and
    return an iterator over the sorted records.

    Every (key, value) pair read from ``self.inputs`` is written to the
    download file as ``key``, one space, ``value`` and a terminating
    zero byte.  The file is then sorted with the external ``sort``
    command and read back through ``self.multi_file_iterator``.

    ``params`` is passed through to ``connect_input`` and to
    ``self.multi_file_iterator``.

    ``err`` is called when a key contains a space, when a value contains
    a zero byte (both would break the record format), and when ``sort``
    exits with a non-zero status.
    """
    download_path = Task.path("REDUCE_DL", Task.id)
    msg("Reduce will be downloaded to %s" % download_path)

    spool = AtomicFile(download_path, "w")
    for input_url in self.inputs:
        in_fd, in_size, in_url = connect_input(input_url, params)
        for key, value in fun_reader(in_fd, in_size, in_url):
            # A space separates key from value and a zero byte ends
            # the record, so neither may occur inside the data.
            if " " in key:
                err("Spaces are not allowed in keys "
                    "with external sort.")
            if "\0" in value:
                err("Zero bytes are not allowed in "
                    "values with external sort. "
                    "Consider using base64 encoding.")
            spool.write("%s %s\0" % (key, value))
    spool.close()
    msg("Reduce input downloaded ok")

    msg("Starting external sort")
    sorted_path = Task.path("REDUCE_SORTED", Task.id)
    ensure_path(os.path.dirname(sorted_path))
    # sort options: numeric order (-n) on the first field only (-k 1,1),
    # zero-terminated records (-z), fields separated by a space (-t " ").
    sort_cmd = ["sort", "-n", "-k", "1,1", "-z",
                "-t", " ", "-o", sorted_path, download_path]
    exit_status = subprocess.Popen(sort_cmd).wait()
    if exit_status:
        err("Sorting %s to %s failed (%d)" %
            (download_path, sorted_path, exit_status))
    msg("External sort done: %s" % sorted_path)

    def read_sorted(fd, sze, url):
        # (?s) lets "." match newlines, so records may span lines.
        return re_reader("(?s)(.*?) (.*?)\000", fd, sze, url)

    return self.multi_file_iterator([sorted_path], params,
                                    reader=read_sorted)
def sorted_entries(self):
    """Yield the (key, value) pairs of ``self.entries`` via an on-disk sort.

    Each pair is written to a spool file as the key, a 0xFF byte, the
    pickled value and a terminating 0x00 byte.  The file is then passed
    to ``self.disk_sort`` and read back from the same path
    (presumably sorted in place -- confirm against ``disk_sort``).

    Raises ValueError if a key is not a ``str`` or contains one of the
    two delimiter bytes.
    """
    spool_path = self.path('reduce-in-%d.dl' % self.id)
    Message("Downloading %s" % spool_path)

    spool = AtomicFile(spool_path, 'w')
    for key, value in self.entries:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort")
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort keys with 0xFF or 0x00 bytes")
        # value pickled using protocol 0 will always be printable ASCII,
        # so it cannot collide with either delimiter byte
        pickled = cPickle.dumps(value, 0)
        spool.write('%s\xff%s\x00' % (key, pickled))
    spool.close()
    Message("Downloaded OK")

    self.disk_sort(spool_path)

    fd, size, url = comm.open_local(spool_path)
    records = func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url)
    for sorted_key, pickled_value in records:
        yield sorted_key, cPickle.loads(pickled_value)
def sorted_entries(self):
    """Generate the (key, value) pairs of ``self.entries`` after a disk sort.

    The pairs are first dumped to a local file, one record per pair:
    key, 0xFF separator, protocol-0 pickle of the value, 0x00
    terminator.  ``self.disk_sort`` is then run on that file and the
    records are parsed back from the same path and unpickled.

    Raises ValueError (with the offending key as second argument) when
    a key is not a ``str`` or contains a 0xFF or 0x00 byte.
    """
    # key, separator, pickled value, terminator
    record_template = '%s\xff%s\x00'

    local_path = self.path('reduce-in-%d.dl' % self.id)
    Status("Downloading %s" % local_path)
    sink = AtomicFile(local_path, 'w')
    for key, value in self.entries:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        has_delimiter = '\xff' in key or '\x00' in key
        if has_delimiter:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # value pickled using protocol 0 will always be printable ASCII
        sink.write(record_template % (key, cPickle.dumps(value, 0)))
    sink.close()
    Status("Downloaded OK")

    self.disk_sort(local_path)

    handle, length, location = comm.open_local(local_path)
    for raw_key, raw_value in func.re_reader("(?s)(.*?)\xff(.*?)\x00",
                                             handle, length, location):
        yield raw_key, cPickle.loads(raw_value)
def download_and_sort(self):
    """Download all reduce inputs to a local file, sort it with the
    external ``sort`` command and return an iterator over the result.

    Every (key, value) pair read from ``self.inputs`` is written as
    ``key``, one space, ``value`` and a terminating zero byte.  The
    data is first written to ``<dlname>.partial`` and renamed to its
    final name only after all inputs were written successfully.

    ``err`` is called when a key contains a space, when a value contains
    a zero byte (both would break the record format), and when ``sort``
    exits with a non-zero status.
    """
    dlname = REDUCE_DL % (job_name, this_partition())
    ensure_path(dlname, False)
    msg("Reduce will be downloaded to %s" % dlname)

    partial_name = dlname + ".partial"
    out_fd = open(partial_name, "w")
    try:
        for fname in self.inputs:
            sze, fd = connect_input(fname)
            for k, v in netstr_reader(fd, sze, fname):
                if " " in k:
                    err("Spaces are not allowed in keys "
                        "with external sort.")
                if "\0" in v:
                    err("Zero bytes are not allowed in "
                        "values with external sort. "
                        "Consider using base64 encoding.")
                out_fd.write("%s %s\0" % (k, v))
    finally:
        # Close the handle even when reading or validation fails, so
        # the descriptor is not leaked; the rename below is skipped in
        # that case and only the .partial file is left behind.
        out_fd.close()
    os.rename(partial_name, dlname)
    msg("Reduce input downloaded ok")

    msg("Starting external sort")
    sortname = REDUCE_SORTED % (job_name, this_partition())
    ensure_path(sortname, False)
    # sort options: numeric (-n), stable (-s), on the first field only
    # (-k 1,1), zero-terminated records (-z), space as separator (-t " ").
    cmd = ["sort", "-n", "-s", "-k", "1,1", "-z",
           "-t", " ", "-o", sortname, dlname]
    proc = subprocess.Popen(cmd)
    ret = proc.wait()
    if ret:
        err("Sorting %s to %s failed (%d)" %
            (dlname, sortname, ret))
    msg("External sort done: %s" % sortname)

    # (?s) makes "." match newlines as well.  Records are terminated by
    # a zero byte, not by a newline, so keys and values may legally
    # contain newlines; without the flag such records would not be
    # parsed back correctly.
    return self.multi_file_iterator([sortname], reader=
        lambda fd, sze, fname:
            re_reader("(?s)(.*?) (.*?)\000", fd, sze, fname))
def sort_reader(self, url):
    """Yield (key, value) pairs parsed from the local file at ``url``.

    Records are expected in the form: key, 0xFF separator, pickled
    value, 0x00 terminator.  Each value is unpickled before being
    yielded.

    NOTE(review): values are passed to ``cPickle.loads``; confirm that
    ``url`` only ever points at files written by this process.
    """
    record_pattern = "(?s)(.*?)\xff(.*?)\x00"
    handle, length, location = comm.open_local(url, url)
    for key, pickled_value in func.re_reader(record_pattern,
                                             handle, length, location):
        yield key, cPickle.loads(pickled_value)