def __init__(self, input_files, do_sort, mem_sort_limit):
    """Resolve reduce input addresses and choose an iteration strategy.

    Addresses starting with dir:// are expanded with parse_dir(); all
    other addresses are used verbatim.  When do_sort is set, the inputs
    are sorted on disk if their total size exceeds mem_sort_limit bytes,
    otherwise in memory.  Without do_sort, inputs are streamed as-is.
    """
    self.inputs = []
    for input in input_files:
        if input.startswith("dir://"):
            try:
                self.inputs += parse_dir(input, part_id = this_partition())
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt
                # and SystemExit are not swallowed here.
                data_err("Couldn't resolve address %s" % input, input)
        else:
            self.inputs.append(input)
    self.line_count = 0
    if do_sort:
        # Sum the input sizes to decide between disk- and memory-sort.
        # NOTE(review): connect_input() also returns a descriptor that
        # is unused here -- presumably connections are pooled; confirm.
        total_size = 0
        for input in self.inputs:
            sze, fd = connect_input(input)
            total_size += sze
        msg("Reduce[%d] input is %.2fMB" %
            (this_partition(), total_size / 1024.0**2))
        if total_size > mem_sort_limit:
            self.iterator = self.download_and_sort()
        else:
            msg("Sorting in memory")
            m = list(self.multi_file_iterator(self.inputs, False))
            m.sort(num_cmp)
            self.iterator = self.list_iterator(m)
    else:
        self.iterator = self.multi_file_iterator(self.inputs)
def re_reader(item_re_str, fd, content_len, fname, output_tail = False,
              read_buffer_size=8192):
    """Iterate over items in fd matching the regexp item_re_str.

    Yields the match groups of each consecutive match.  Reads at most
    content_len bytes when it is given; otherwise reads until EOF.  If
    the stream ends with unmatched bytes, they are yielded as a
    one-element list when output_tail is set, otherwise a warning is
    logged via msg().  A stream shorter than content_len is reported
    with data_err().
    """
    item_re = re.compile(item_re_str)
    buf = ""
    tot = 0
    while True:
        if content_len:
            r = fd.read(min(read_buffer_size, content_len - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        buf += r
        m = item_re.match(buf)
        while m:
            yield m.groups()
            buf = buf[m.end():]
            m = item_re.match(buf)
        # Stop at EOF or once the advertised length is consumed.
        # BUG FIX: the original tested "tot >= content_len" even when
        # content_len is None; under Python 2 ordering rules any int
        # compares >= None, so the no-length mode terminated after a
        # single read instead of reading to EOF.
        if not len(r) or (content_len != None and tot >= content_len):
            if content_len != None and tot < content_len:
                data_err("Truncated input (%s). "
                         "Expected %d bytes, got %d" %
                         (fname, content_len, tot), fname)
            if len(buf):
                if output_tail:
                    yield [buf]
                else:
                    msg("Couldn't match the last %d "
                        "bytes in %s. Some bytes may be "
                        "missing from input." %
                        (len(buf), fname))
            break
def get(key, job = None):
    """Fetch the out-of-band value stored under key from the master.

    job defaults to the current job name.  A communication failure is
    reported via data_err().
    """
    url = "http://" + this_master()
    try:
        job = job or this_name()
        return load_oob(url, job, key)
    except comm.CommException as x:
        # BUG FIX: the original referenced an undefined name "url" in
        # this message, raising a NameError instead of reporting the
        # actual failure; the URL is now bound before use.
        data_err("OOB key (%s) not found at %s: HTTP status '%s'" %
                 (key, url, x.http_code), key)
def open_local(input, fname):
    """Open local file fname, returning (size_in_bytes, file_object).

    input is the input address, used only for error reporting via
    data_err().
    """
    try:
        f = open(fname)
        # fstat() the descriptor we actually opened instead of
        # stat()ing the path a second time, avoiding a race between
        # open and stat.
        sze = os.fstat(f.fileno()).st_size
        return sze, f
    except (IOError, OSError):
        # Narrowed from a bare except: report only real I/O failures.
        data_err("Can't access a local input file (%s): %s"
                 % (input, fname), input)
def open_remote(input, ext_host, ext_file, is_chunk):
    """Open an HTTP input, returning (size_or_None, readable_response).

    Connections are pooled per host in http_pool.  With is_chunk set,
    a 16-byte (start, end) offset pair for this partition is fetched
    with a Range request first, then the data between those offsets;
    otherwise the whole file is fetched.
    """
    try:
        # We can't open a new HTTP connection for each intermediate
        # result -- this would result to M * R TCP connections where
        # M is the number of maps and R the number of reduces. Instead,
        # we pool connections and reuse them whenever possible. HTTP
        # 1.1 defaults to keep-alive anyway.
        if ext_host in http_pool:
            http = http_pool[ext_host]
            if http._HTTPConnection__response:
                # Drain any unread response so the connection can be
                # reused for the next request.
                http._HTTPConnection__response.read()
        else:
            http = httplib.HTTPConnection(ext_host)
            http_pool[ext_host] = http
        if is_chunk:
            pos = this_partition() * 8
            rge = "bytes=%d-%d" % (pos, pos + 15)
            http.request("GET", ext_file, None, {"Range": rge})
            fd = http.getresponse()
            if fd.status != 206:
                # BUG FIX: these were py2 string exceptions
                # (raise "HTTP error %d" % ...), which are invalid since
                # Python 2.6 (they raise TypeError instead).
                raise Exception("HTTP error %d" % fd.status)
            start, end = struct.unpack("QQ", fd.read())
            if start == end:
                # Empty chunk for this partition.
                return 0, cStringIO.StringIO()
            rge = "bytes=%d-%d" % (start, end - 1)
            http.request("GET", ext_file, None, {"Range": rge})
            fd = http.getresponse()
            if fd.status != 206:
                raise Exception("HTTP error %d" % fd.status)
        else:
            http.request("GET", ext_file, "")
            fd = http.getresponse()
            if fd.status != 200:
                raise Exception("HTTP error %d" % fd.status)
        sze = fd.getheader("content-length")
        if sze:
            sze = int(sze)
        return sze, fd
    except httplib.BadStatusLine:
        # BadStatusLine is caused by a closed connection. Re-open a new
        # connection by deleting this connection from the pool and
        # calling this function again. Note that this might result in
        # endless recursion if something went seriously wrong.
        http.close()
        del http_pool[ext_host]
        return open_remote(input, ext_host, ext_file, is_chunk)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed; the HTTP-error raises above are still
        # caught here, as before.
        data_err("Can't access an external input file (%s/%s): %s"
                 % (ext_host, ext_file, input), input)
def get(key, job = None):
    """Fetch the out-of-band value stored under key.

    job defaults to the module-level job_name.  A non-200 status is
    reported via data_err().
    """
    c = urllib.urlopen(OOB_URL % (job or job_name, key))
    try:
        if "status" in c.headers and not c.headers["status"].startswith("200"):
            data_err("OOB <%s> key (%s) not found" %
                     (c.headers["status"], key), key)
        else:
            return c.read()
    finally:
        # BUG FIX: the original leaked the connection when data_err()
        # aborted on a non-200 status; close it in every case.
        c.close()
def open_local(input, fname, is_chunk):
    """Open local input fname, returning (size_in_bytes, file_object).

    With is_chunk set, the file begins with a table of 8-byte offsets:
    the (start, end) pair for this partition is read and the file is
    positioned at start, with size end - start.  input is used only for
    error reporting.
    """
    try:
        f = open(fname)
        if is_chunk:
            f.seek(this_partition() * 8)
            start, end = struct.unpack("QQ", f.read(16))
            sze = end - start
            f.seek(start)
        else:
            sze = os.stat(fname).st_size
        return sze, f
    except (IOError, OSError, struct.error):
        # Narrowed from a bare except; struct.error covers a truncated
        # or malformed offset table.
        data_err("Can't access a local input file: %s"
                 % input, input)
def read_netstr(idx, data, tot):
    # Parse one netstring-style record ("<len> <payload><sep>") out of a
    # read-ahead buffer, refilling the buffer from the stream as needed.
    #
    # NOTE(review): this function uses the free variables fd, fname and
    # content_len from an enclosing scope that is not visible in this
    # chunk -- it only works as a nested function; confirm against the
    # full file.
    #
    #   idx  -- current parse offset inside data
    #   data -- read-ahead buffer
    #   tot  -- total bytes consumed so far (used in error messages)
    #
    # Returns (new_idx, new_data, new_tot, payload).
    ldata = len(data)
    i = 0
    lenstr = ""
    if ldata - idx < 11:
        # Not enough buffered bytes for a maximal length prefix (up to
        # 10 digits plus the separating space); refill the buffer.
        data = data[idx:] + fd.read(8192)
        ldata = len(data)
        idx = 0
    i = data.find(" ", idx, idx + 11)
    if i == -1:
        err("Corrupted input (%s). Could not "
            "parse a value length at %d bytes."
            % (fname, tot))
    else:
        # Keep the trailing space in lenstr so len(lenstr) counts it
        # when tot is advanced below.
        lenstr = data[idx:i + 1]
        idx = i + 1
    if ldata < i + 1:
        # NOTE(review): this message reuses content_len/tot from the
        # enclosing scope, so it may not describe this record's own
        # expected length -- verify against the full file.
        data_err("Truncated input (%s). "
                 "Expected %d bytes, got %d" %
                 (fname, content_len, tot), fname)
    try:
        llen = int(lenstr)
    except ValueError:
        err("Corrupted input (%s). Could not "
            "parse a value length at %d bytes."
            % (fname, tot))
    tot += len(lenstr)
    if ldata - idx < llen + 1:
        # Payload plus its one-byte terminator not fully buffered yet.
        data = data[idx:] + fd.read(llen + 8193)
        ldata = len(data)
        idx = 0
    # NOTE(review): the local "msg" shadows the module-level msg()
    # logging helper used elsewhere in this file.
    msg = data[idx:idx + llen]
    if idx + llen + 1 > ldata:
        data_err("Truncated input (%s). "
                 "Expected a value of %d bytes "
                 "(offset %u bytes)" %
                 (fname, llen + 1, tot), fname)
    # Advance past the payload and its terminator byte.
    tot += llen + 1
    idx += llen + 1
    return idx, data, tot, msg
def merge_chunks(partitions):
    """Concatenate partition files into a single chunk file.

    The chunk starts with len(partitions) + 1 offsets (8-byte "Q"
    values): the start offset of each partition's data followed by the
    end offset of the last one.  The chunk is assembled as
    <name>.partial and atomically renamed into place, after which the
    source partition files are removed.
    """
    mapout = CHUNK_OUTPUT % (job_name, this_partition())
    partial = mapout + ".partial"
    try:
        out = open(partial, "w")
        # The data begins immediately after the offset table itself.
        offset = (len(partitions) + 1) * 8
        for p in partitions:
            out.write(struct.pack("Q", offset))
            offset += os.stat(p.fname).st_size
        out.write(struct.pack("Q", offset))
        # SECURITY/ROBUSTNESS FIX: the original shelled out with
        # subprocess.call("cat %s >> %s.partial" ..., shell = True),
        # which breaks on (and is injectable through) file names
        # containing shell metacharacters.  Copy in-process instead.
        for p in partitions:
            src = open(p.fname, "rb")
            while True:
                buf = src.read(65536)
                if not buf:
                    break
                out.write(buf)
            src.close()
        out.close()
    except (IOError, OSError):
        data_err("Couldn't create a chunk", mapout)
    os.rename(partial, mapout)
    for p in partitions:
        os.remove(p.fname)
def ensure_file(fname, data = None, timeout = 60, mode = 500):
    """Atomically create fname with contents data, unless it exists.

    data may be a byte string or a callable producing one; the callable
    is invoked only if the file actually needs to be written.  Returns
    True if this call created the file, False if it already existed.
    Concurrent creators are serialized through an exclusive .partial
    file; losers retry once a second up to timeout times and fall off
    the loop (returning None) if the file never appears.

    NOTE(review): the default mode 500 is decimal (= 0o764); octal 0500
    was probably intended, but both copies of this function in the file
    use 500, so the default is kept for compatibility -- confirm.
    """
    while timeout > 0:
        if os.path.exists(fname):
            return False
        try:
            fd = os.open(fname + ".partial",
                         os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode)
            if callable(data):
                data = data()
            os.write(fd, data)
            os.close(fd)
            # Atomic publish: readers never observe a half-written file.
            os.rename(fname + ".partial", fname)
            return True
        except OSError as x:
            # "as x" replaces the py2-only "except OSError, x" syntax.
            if x.errno == errno.EEXIST:
                # Another process holds the .partial file; wait for it
                # to finish and publish.
                time.sleep(1)
                timeout -= 1
            else:
                data_err("Writing external file %s failed"
                         % fname, fname)
return outstream.write(buf) except Exception, x: # output file is inconsistent state # we must crash the job err("Updating file %s failed: %s" %\ (outfile, x)) except IOError, x: # Python doc guides us to check both the # EWOULDBLOCK (11) and EACCES (13) errors if x.errno == 11 or x.errno == 13: time.sleep(1) timeout -= 1 else: raise data_err("Timeout when updating file %s" % outfile, outfile) def ensure_file(fname, data = None, timeout = 60, mode = 500): while timeout > 0: if os.path.exists(fname): return False try: fd = os.open(fname + ".partial", os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode) if type(data) == str: os.write(fd, data) else: os.write(fd, data()) os.close(fd) os.rename(fname + ".partial", fname)
def open_remote(input, ext_host, ext_file):
    """Open an external HTTP input via comm, returning its handle.

    input is the original input address, used only for error reporting.
    """
    try:
        return comm.open_remote("http://%s%s" % (ext_host, ext_file))
    except Exception as x:
        # BUG FIX: the original passed the exception object x as
        # data_err()'s second argument; every other call site in this
        # file passes the offending input address there instead.
        data_err("Can't access an external input file (%s%s): %s"
                 % (ext_host, ext_file, x), input)