def result_iterator(results, notifier = None, proxy = None):
    """Iterate over the entries stored at a list of result URLs.

    Parameters:
        results  -- iterable of URL strings. Entries starting with
                    "dir://" are expanded with util.parse_dir(); all
                    other entries are used as they are.
        notifier -- optional callable; it is called with each URL right
                    before that URL's contents are read.
        proxy    -- optional proxy address. When it is not given, the
                    DISCO_PROXY environment variable is used instead. A
                    "disco://" prefix is replaced by "<host>:<MASTER_PORT>"
                    and an "http://" prefix is stripped.

    Yields:
        Every item produced by func.netstr_reader() for each URL, in order.

    Raises:
        IOError -- if an HTTP request does not answer with status 200.
    """
    if not proxy:
        proxy = os.environ.get("DISCO_PROXY", None)
    if proxy:
        if proxy.startswith("disco://"):
            proxy = "%s:%s" % (proxy[8:], util.MASTER_PORT)
        elif proxy.startswith("http://"):
            proxy = proxy[7:]

    # Flatten directory URLs into the list of concrete input URLs.
    res = []
    for dir_url in results:
        if dir_url.startswith("dir://"):
            res += util.parse_dir(dir_url, proxy)
        else:
            res.append(dir_url)

    for url in res:
        http = None
        fd = None
        try:
            if url.startswith("file://"):
                fname = url[7:]
                fd = open(fname)
                sze = os.stat(fname).st_size
            else:
                # Skips an 8-character scheme prefix; presumably the URL
                # looks like "disco://host/path" -- confirm with callers.
                host, fname = url[8:].split("/", 1)
                if proxy:
                    ext_host = proxy
                    fname = "/disco/node/%s/%s" % (host, fname)
                else:
                    ext_host = host + ":" + util.HTTP_PORT
                # NOTE(review): with a proxy, fname already starts with
                # "/", so the request path begins with "//". This is kept
                # exactly as before; confirm that the proxy expects it.
                ext_file = "/" + fname

                http = httplib.HTTPConnection(ext_host)
                http.request("GET", ext_file, "")
                fd = http.getresponse()
                if fd.status != 200:
                    # String exceptions are rejected by Python 2.6+, so a
                    # real exception object is raised instead.
                    raise IOError("HTTP error %d" % fd.status)
                sze = int(fd.getheader("content-length"))

            if notifier:
                notifier(url)

            for x in func.netstr_reader(fd, sze, fname):
                yield x
        finally:
            # Release the connection / file even when reading fails or the
            # consumer stops iterating early.
            if http is not None:
                http.close()
            elif fd is not None:
                fd.close()
def download_and_sort(self):
    """Download all reduce inputs, sort them externally and iterate them.

    Every (key, value) pair read from self.inputs is written to a local
    file as "<key> <value>\\0". The file is then sorted with the external
    `sort` command and the sorted file is handed to
    self.multi_file_iterator().

    Keys containing a space and values containing a zero byte are reported
    through err(), because those characters are used as the field and
    record separators of the intermediate file.

    Returns:
        The value of self.multi_file_iterator() for the sorted file, using
        a reader that splits records with re_reader().
    """
    dlname = REDUCE_DL % (job_name, this_partition())
    ensure_path(dlname, False)
    msg("Reduce will be downloaded to %s" % dlname)

    # Write to a ".partial" file first and rename it once it is complete.
    partial_name = dlname + ".partial"
    out_fd = open(partial_name, "w")
    try:
        for fname in self.inputs:
            sze, fd = connect_input(fname)
            for k, v in netstr_reader(fd, sze, fname):
                # NOTE(review): presumably err() aborts the task; if it
                # returns, the offending record is still written below.
                if " " in k:
                    err("Spaces are not allowed in keys "
                        "with external sort.")
                if "\0" in v:
                    err("Zero bytes are not allowed in "
                        "values with external sort. "
                        "Consider using base64 encoding.")
                out_fd.write("%s %s\0" % (k, v))
    finally:
        # Close the output file even if downloading or validation fails.
        out_fd.close()
    os.rename(partial_name, dlname)
    msg("Reduce input downloaded ok")

    msg("Starting external sort")
    sortname = REDUCE_SORTED % (job_name, this_partition())
    ensure_path(sortname, False)
    # Zero-terminated records (-z), space-separated fields (-t " "),
    # stable (-s) numeric (-n) sort on the first field only (-k 1,1).
    cmd = ["sort", "-n", "-s", "-k", "1,1", "-z",
           "-t", " ", "-o", sortname, dlname]
    ret = subprocess.Popen(cmd).wait()
    if ret:
        err("Sorting %s to %s failed (%d)" %
            (dlname, sortname, ret))
    msg("External sort done: %s" % sortname)

    def sorted_reader(fd, sze, fname):
        # Split the sorted file back into (key, value) records.
        return re_reader("(.*?) (.*?)\000", fd, sze, fname)

    return self.multi_file_iterator([sortname], reader = sorted_reader)
def json_reader(fd, size, filename):
    """Yield (key, value) pairs from netstr input, decoding both as JSON.

    The arguments are passed unchanged to disco.func.netstr_reader(); each
    key and each value it produces is decoded with json.loads().
    """
    from disco.func import netstr_reader
    from discodex import json

    for raw_key, raw_value in netstr_reader(fd, size, filename):
        decoded_key = json.loads(raw_key)
        decoded_value = json.loads(raw_value)
        yield decoded_key, decoded_value
def netstrparse(fd, size, fname, params):
    """Read (key, value) pairs directly from `netstr` input.

    Delegates to disco.func.netstr_reader(), passing all four arguments
    through unchanged, and returns its result.
    """
    from disco import func

    read_netstr = func.netstr_reader
    return read_netstr(fd, size, fname, params)
def reader(fd, size, fname, extra):
    """Yield netstr (key, value) pairs with `extra` appended to each key.

    fd, size and fname are passed unchanged to disco.func.netstr_reader().
    """
    from disco.func import netstr_reader

    for key, value in netstr_reader(fd, size, fname):
        extended_key = key + extra
        yield extended_key, value