def unix_sort(filename, sort_buffer_size='10%'):
    """Sort *filename* in place using the external ``sort`` utility.

    :param sort_buffer_size: passed to ``sort -S`` (e.g. ``'10%'``).
    Raises :class:`DataError` if the input file is missing or the sort
    process exits with a non-zero status.
    """
    import subprocess, os.path
    if not os.path.isfile(filename):
        raise DataError("Invalid sort input file {0}".format(filename), filename)
    # Force byte-wise collation so key comparison is locale-independent.
    sort_env = dict(os.environ, LC_ALL='C')
    cmd, shell = sort_cmd(filename, sort_buffer_size)
    try:
        subprocess.check_call(cmd, env=sort_env, shell=shell)
    except subprocess.CalledProcessError as e:
        raise DataError("Sorting {0} failed: {1}".format(filename, e), filename)
def swap(self, error=None):
    """Advance to the next replica URL, resuming after the last record read.

    On :class:`DataError` the method recurses onto the next replica,
    carrying the failure text along; when ``self.urls`` is exhausted it
    raises a :class:`DataError` that includes the last error, if any.
    """
    def _resume_after(records, first):
        # Drop (index, record) pairs until index ``first`` is reached,
        # so re-opened replicas skip records already consumed.
        from itertools import dropwhile
        return dropwhile(lambda pair: pair[0] < first, enumerate(records))
    try:
        self.iter = _resume_after(self.open(next(self.urls)), self.last + 1)
    except DataError:
        self.swap(traceback.format_exc())
    except StopIteration:
        if error:
            raise DataError("Exhausted all available replicas, "
                            "last error was:\n\n{0}".format(error), self.input)
        raise DataError("Exhausted all available replicas", self.input)
def read_netstr(idx, data, tot):
    # Parse one "<length> <payload>"-style record from ``data`` starting at
    # offset ``idx``; ``tot`` is the running count of bytes consumed so far.
    # ``fd``, ``fname``, ``size`` and ``bytes_to_str`` are free variables from
    # the enclosing reader's scope (this is a nested helper).
    # Returns (new_idx, new_data, new_tot, payload).
    ldata = len(data)
    i = 0
    lenstr = ''
    if ldata - idx < 11:
        # Fewer than 11 buffered bytes may not hold a full length prefix
        # (up to 10 digits plus the space separator) -- refill from fd.
        data = data[idx:] + bytes_to_str(fd.read(8192))
        ldata = len(data)
        idx = 0
    # The length prefix is terminated by a space within the next 11 bytes.
    i = data.find(' ', idx, idx + 11)
    if i == -1:
        raise DataError(
            "Corrupted input: "
            "Could not parse a value length at {0} bytes.".format(tot),
            fname)
    else:
        # ``lenstr`` includes the trailing space separator.
        lenstr = data[idx:i + 1]
        idx = i + 1
    if ldata < i + 1:
        # Buffer ended inside the length prefix even after a refill.
        raise DataError(
            "Truncated input: "
            "Expected {0} bytes, got {1}".format(size, tot), fname)
    try:
        llen = int(lenstr)
    except ValueError:
        raise DataError(
            "Corrupted input: "
            "Could not parse a value length at {0} bytes.".format(tot),
            fname)
    tot += len(lenstr)
    if ldata - idx < llen + 1:
        # Payload plus its one-byte terminator extends past the buffer;
        # refill with enough slack (8193) to also cover the next header.
        data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
        ldata = len(data)
        idx = 0
    msg = data[idx:idx + llen]
    if idx + llen + 1 > ldata:
        raise DataError(
            "Truncated input: "
            "Expected a value of {0} bytes (offset {1} bytes)".format(
                llen + 1, tot), fname)
    # Account for the payload and its terminator byte.
    tot += llen + 1
    idx += llen + 1
    return idx, data, tot, msg
def delimited_reader(fd, size, fname, delimiter, line_terminator='\n', output_tail=False, read_buffer_size=8192): tail = [] tot = 0 while True: if size: r = fd.read(min(read_buffer_size, size - tot)) else: r = fd.read(read_buffer_size) tot += len(r) split_lines = r.split(line_terminator) if len(split_lines) > 1: tail.append(split_lines[0]) split_lines[0] = ''.join(tail) tail = [] if split_lines[-1] != '': tail.append(split_lines[-1]) for line in split_lines[:-1]: yield line.split(delimiter) if not len(r) or (size != None and tot >= size): if size != None and tot < size: raise DataError("Truncated input: "\ "Expected %d bytes, got %d" % (size, tot), fname) break if len(tail) > 0: if output_tail: yield tail else: print "Couldn't match the last %d bytes in %s. "\ "Some bytes may be missing from input." % (sum((len(chunk) for chunk in tail)), fname)
def ensure_free_space(fname):
    """Fail the task if the filesystem holding *fname* is low on disk space.

    Raises :class:`DataError` when fewer than ``MIN_DISK_SPACE`` bytes are
    available to unprivileged processes.
    """
    s = os.statvfs(fname)
    # f_bavail: blocks available to non-root users (not f_bfree).
    free = s.f_bsize * s.f_bavail
    if free < MIN_DISK_SPACE:
        # BUG FIX: a misplaced closing paren previously passed ``fname`` to
        # str.format() (where it was silently ignored) instead of to
        # DataError as its second argument, unlike every other DataError
        # call site in this module.
        raise DataError(
            "Only {0} KB disk space available. Task failed.".format(
                free / 1024),
            fname)
def sort_reader(fd, fname, read_buffer_size=8192):
    """Yield ``(key, value)`` byte-string pairs from a sorted stream.

    Records are NUL-terminated; within a record, key and value are
    separated by a single ``0xff`` byte. Raises :class:`DataError` when a
    record grows past one read buffer or trailing bytes remain at EOF.
    """
    leftover = b""
    while True:
        chunk = fd.read(read_buffer_size)
        if not len(chunk):
            break
        if len(leftover) > read_buffer_size:
            # A single record should never span more than one full buffer.
            raise DataError("Could not parse the sorted file.", fname)
        records = (leftover + chunk).split(b"\x00")
        # The final piece has no terminator yet; carry it to the next round.
        leftover = records.pop()
        for record in records:
            key, value = record.split(b"\xff")
            yield key, value
    if len(leftover):
        raise DataError("Could not parse the tail of the sorted file.", fname)
def get_input(cls, id):
    """Ask the master for the replicas of input *id*.

    Raises ``Wait`` while the input is still busy and :class:`DataError`
    when the master reports it as failed; otherwise returns a list of
    ``(replica_id, url_string)`` pairs.
    """
    _done, inputs = cls.send('INPUT', ['include', [id]])
    _id, status, replicas = inputs[0]
    if status == 'busy':
        raise Wait
    if status == 'failed':
        raise DataError("Can't handle broken input", id)
    return [(replica_id, str(url)) for replica_id, url in replicas]
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format.

    Yields unpickled items from *stream*. Hunks are framed by a one-byte
    marker (>= 128) followed by a ``<BIQ`` header of (is_compressed,
    crc32 checksum, hunk_size); a marker byte < 128 means the stream is in
    the legacy netstring format and is delegated to ``old_netstr_reader``.
    With ``ignore_corrupt=True`` checksum/unpickling failures are skipped
    instead of raising :class:`DataError`.
    """
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    import struct, gzip, zlib
    offset = 0
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            # Legacy uncompressed format: the byte already read is part of
            # a netstring length prefix, so hand it to the old reader.
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size =\
                struct.unpack('<BIQ', stream.read(13))
        except struct.error:
            # FIX: was a bare ``except:`` which also swallowed unrelated
            # errors (KeyboardInterrupt, I/O failures) as "truncated data";
            # only a short read makes unpack raise struct.error.
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        data = b''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes {0}-{1}: {2}".format(
                        offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = BytesIO(data)
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                break
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError(
                        "Corrupted data between bytes {0}-{1}: {2}".format(
                            offset - hunk_size, offset, e), url)
def data_err(message, url):
    """Signal a (presumably transient) failure while reading input data.

    Raising a data error tells the master to re-run the task on another
    node; if the same task raises data errors on several different nodes,
    the master terminates the job. Use this only when the failure is likely
    temporary -- typically from map readers that cannot access an input
    file.
    """
    raise DataError(message, url)
def data_err(message, url):
    """
    .. deprecated:: 0.4
       Raise :class:`disco.error.DataError` directly instead.

    Raise a :class:`disco.error.DataError` for *url* with reason *message*.
    A data error should only be raised when the failure is likely
    transient -- typically by map readers hitting a temporary problem while
    accessing an input file.
    """
    raise DataError(message, url)
def swap(self):
    """Switch to the next replica URL, resuming after the last record read.

    Retries recursively on :class:`DataError`; raises a final
    :class:`DataError` when every replica has been exhausted.
    (Python 2 code: uses the ``.next()`` iterator protocol.)
    """
    def _resume_after(records, first):
        # Drop (index, record) pairs until index ``first`` is reached.
        from itertools import dropwhile
        return dropwhile(lambda pair: pair[0] < first, enumerate(records))
    try:
        self.iter = _resume_after(self.open(self.urls.next()), self.last + 1)
    except DataError:
        self.swap()
    except StopIteration:
        raise DataError("Exhausted all available replicas", self.input)
def unix_sort(filename, sort_buffer_size='10%'):
    """Sort *filename* in place with GNU sort.

    Records are NUL-terminated (``-z``) and keyed on the first
    ``0xff``-delimited field. Raises :class:`DataError` if sort fails.
    (Python 2 code.)
    """
    import subprocess
    sort_env = os.environ.copy()
    sort_env['LC_ALL'] = 'C'  # byte-wise collation, independent of locale
    argv = ['sort', '-z', '-t', '\xff', '-k', '1,1',
            '-T', '.', '-S', sort_buffer_size,
            '-o', filename, filename]
    try:
        subprocess.check_call(argv, env=sort_env)
    except subprocess.CalledProcessError, e:
        raise DataError("Sorting %s failed: %s" % (filename, e), filename)
def disk_sort(self, filename):
    """Sort *filename* in place with the external ``sort`` utility,
    reporting progress via ``Status``. Raises :class:`DataError` on
    failure. (Python 2 code.)
    """
    Status("Sorting %s..." % filename)
    argv = ['sort', '-z', '-t', '\xff', '-k', '1,1', '-T', '.',
            '-S', self.sort_buffer_size, '-o', filename, filename]
    try:
        subprocess.check_call(argv)
    except subprocess.CalledProcessError, e:
        raise DataError("Sorting %s failed: %s" % (filename, e), filename)
def ensure_file(fname, data = None, timeout = 60, mode = 500): while timeout > 0: if os.path.exists(fname): return False try: fd = os.open(fname + ".partial", os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode) if callable(data): data = data() n = os.write(fd, data) if n != len(data): raise DataError("Writing file failed (only wrote %d/%d bytes)." " Out of disk space?" % (n, len(data)), fname) os.close(fd) os.rename(fname + ".partial", fname) return True except OSError, x: if x.errno == errno.EEXIST: time.sleep(1) timeout -= 1 else: raise DataError("Writing external file failed", fname)
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format."""
    # (Python 2 version.) Hunks are framed by a one-byte marker followed by
    # a '<BIQ' header of (is_compressed, crc32 checksum, hunk_size); each
    # hunk is a sequence of pickled items which are yielded one by one.
    import struct, cStringIO, gzip, cPickle, zlib
    offset = 0  # running byte offset of the current hunk, for error messages
    while True:
        header = stream.read(1)
        if not header:
            # Clean EOF.
            return
        if ord(header[0]) < 128:
            # Marker byte < 128: legacy netstring format; the byte already
            # read is part of a length prefix, so delegate it onward.
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size =\
                struct.unpack('<BIQ', stream.read(13))
        except:
            # NOTE(review): bare except -- a short read raises struct.error,
            # but this also swallows unrelated errors; preserved as-is.
            raise DataError("Truncated data at %d bytes" % offset, url)
        if not hunk_size:
            # Zero-sized hunk terminates the stream.
            return
        hunk = stream.read(hunk_size)
        data = ''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error), e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes %d-%d: %s" %
                    (offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = cStringIO.StringIO(data)
        while True:
            # Unpickle items until the hunk buffer is exhausted.
            try:
                yield cPickle.load(hunk)
            except EOFError:
                break
# NOTE(review): this chunk begins mid-function -- the lines below up to the
# final ``raise`` are the tail of an ``ensure_file`` definition whose header
# lies outside this view. The indentation below reconstructs its position
# inside that function's retry ``while`` loop.
        try:
            fd = os.open(fname + ".partial",
                         os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode)
            if callable(data):
                # ``data`` may be a thunk, evaluated lazily only when the
                # file actually needs to be written.
                data = data()
            os.write(fd, data)
            os.close(fd)
            os.rename(fname + ".partial", fname)
            return True
        except OSError, x:
            if x.errno == errno.EEXIST:
                # Another process holds the .partial file; poll and retry.
                time.sleep(1)
                timeout -= 1
            else:
                raise DataError("Writing external file failed", fname)
    raise DataError("Timeout in writing external file", fname)


def write_files(files, path):
    # Write every (fname -> data) mapping entry under *path*, refusing any
    # name that would resolve outside the target directory.
    if files:
        path = os.path.abspath(path)
        ensure_path(path)
        for fname, data in files.iteritems():
            # make sure that no files are written outside the given path
            p = os.path.abspath(os.path.join(path, fname))
            if os.path.dirname(p) == path:
                ensure_file(path + "/" + fname, data=data)
            else:
                raise ValueError("Unsafe filename %s" % fname)
def re_reader(item_re_str, fd, size, fname, output_tail=False,
              read_buffer_size=8192):
    """
    A map reader that uses an arbitrary regular expression to parse the input
    stream.

    :param item_re_str: regular expression for matching input items

    The reader works as follows:

    1. X bytes is read from *fd* and appended to an internal buffer *buf*.
    2. ``m = regexp.match(buf)`` is executed.
    3. If *buf* produces a match, ``m.groups()`` is yielded, which contains an
       input entry for the map function. Step 2. is executed for the remaining
       part of *buf*. If no match is made, go to step 1.
    4. If *fd* is exhausted before *size* bytes have been read, and *size*
       tests ``True``, a :class:`disco.error.DataError` is raised.
    5. When *fd* is exhausted but *buf* contains unmatched bytes, two modes are
       available: If ``output_tail=True``, the remaining *buf* is yielded as
       is. Otherwise, a message is sent that warns about trailing bytes. The
       remaining *buf* is discarded.

    Note that :func:`re_reader` fails if the input streams contains unmatched
    bytes between matched entries. Make sure that your *item_re_str* is
    constructed so that it covers all bytes in the input stream.

    :func:`re_reader` provides an easy way to construct parsers for textual
    input streams. For instance, the following reader produces full HTML
    documents as input entries::

        def html_reader(fd, size, fname):
            for x in re_reader("<HTML>(.*?)</HTML>", fd, size, fname):
                yield x[0]
    """
    item_re = re.compile(item_re_str)
    buf = ""
    tot = 0  # total bytes read so far
    while True:
        if size:
            # Never read past the declared size.
            r = fd.read(min(read_buffer_size, size - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        buf += r
        # Consume as many complete matches as the buffer currently holds.
        m = item_re.match(buf)
        while m:
            yield m.groups()
            buf = buf[m.end():]
            m = item_re.match(buf)
        if not len(r) or (size != None and tot >= size):
            if size != None and tot < size:
                raise DataError("Truncated input: "\
                                "Expected %d bytes, got %d" % (size, tot),
                                fname)
            if len(buf):
                # Unmatched trailing bytes at end of input (see step 5).
                if output_tail:
                    yield [buf]
                else:
                    print "Couldn't match the last %d bytes in %s. "\
                          "Some bytes may be missing from input." % (len(buf), fname)
            break
def corrupt_reader(fd, size, url, params):
    """Reader used to exercise replica fail-over: emits ``'hello'``, then
    either raises :class:`DataError` (when ``'corrupt'`` appears in *url*)
    or emits ``'there'``.
    """
    yield 'hello'
    if url.find('corrupt') >= 0:
        raise DataError("Corrupt!", url)
    yield 'there'