import os
import multiprocessing
import tempfile

import hadoopy

# Note: _find_hstreaming, _hadoop_fs_command, _hdfs_cat_tb, and ls are
# internal helpers defined elsewhere in this module.


def writetb(path, kvs, java_mem_mb=256):
    """Write a TypedBytes sequence file to HDFS given an iterator of key/value pairs.

    :param path: HDFS path (string)
    :param kvs: Iterator of (key, value)
    :param java_mem_mb: Integer of Java heap size in MB (default 256)
    :raises: IOError: An error occurred while saving the data.
    """
    read_fd, write_fd = os.pipe()
    read_fp = os.fdopen(read_fd, 'r')
    hstreaming = _find_hstreaming()
    cmd = 'hadoop jar %s loadtb %s' % (hstreaming, path)
    # Feed the 'loadtb' subprocess through a pipe: Hadoop reads from the
    # read end while we serialize pairs into the write end.
    p = _hadoop_fs_command(cmd, stdin=read_fp, java_mem_mb=java_mem_mb)
    read_fp.close()
    with hadoopy.TypedBytesFile(write_fd=write_fd) as tb_fp:
        for kv in kvs:
            if p.poll() is not None:
                raise IOError('writetb: Hadoop process quit while we were sending it data. Hadoop output below...\nstdout\n%s\nstderr\n%s' % p.communicate())
            tb_fp.write(kv)
        tb_fp.flush()
    p.wait()
    if p.returncode != 0:
        raise IOError('writetb: Hadoop process returned [%d]. Hadoop output below...\nstderr\n%s' % (p.returncode, p.stderr.read()))
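# Usage sketch (hedged): assumes a reachable Hadoop cluster with the
# streaming jar available; the HDFS destination path below is hypothetical.
# Since writetb streams pairs into the 'loadtb' subprocess as they are
# produced, kvs can be any iterator, including a generator.
def _example_writetb():
    kvs = (('key-%d' % i, {'square': i * i}) for i in xrange(10))
    hadoopy.writetb('tmp/example.tb', kvs)  # hypothetical HDFS path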
def cat(path, ignore_logs=True, procs=10):
    """Read TypedBytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_' as
    they are log files.  This allows you to cat a directory that may contain
    a variety of Hadoop outputs (e.g., _SUCCESS, _logs).

    Args:
        path: A string (potentially with wildcards).
        ignore_logs: If True, ignore all files whose names start with an
            underscore.  Defaults to True.
        procs: Number of processes to use.

    Returns:
        An iterator of key, value pairs.

    Raises:
        IOError: An error occurred listing the directory (e.g., not available).
    """
    max_files = 100
    hstreaming = _find_hstreaming()
    all_paths = ls(path)
    if ignore_logs:
        # Ignore any files that start with an underscore
        keep_file = lambda x: os.path.basename(x)[0] != '_'
        all_paths = filter(keep_file, all_paths)
    if not all_paths:  # Avoid creating an empty pool, which raises ValueError
        return
    p = multiprocessing.Pool(min(procs, max_files, len(all_paths)))
    while all_paths:
        paths = all_paths[:max_files]
        del all_paths[:max_files]
        # Dump up to max_files part files to local temporary files in
        # parallel, then iterate over the downloaded data sequentially.
        fps = [tempfile.NamedTemporaryFile() for x in paths]
        p.map(_hdfs_cat_tb, [(cur_path, hstreaming, fp.name)
                             for cur_path, fp in zip(paths, fps)])
        for fp in fps:
            for kv in hadoopy.TypedBytesFile(fp.name, 'r'):
                yield kv
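# Usage sketch (hedged): the HDFS glob below is hypothetical.  cat yields
# pairs file by file within each batch, so output order follows file order.
def _example_cat():
    for key, value in cat('tmp/example_output/part-*'):  # hypothetical path
        print key, value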
def readtb(paths, num_procs=10, java_mem_mb=256, ignore_logs=True):
    """Read TypedBytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_' as
    they are log files.  This allows you to read a directory that may contain
    a variety of Hadoop outputs (e.g., _SUCCESS, _logs).  This works on
    directories and files.  The KV pairs may be interleaved between files
    (they are read in parallel).

    :param paths: HDFS path (str) or paths (iterator)
    :param num_procs: Number of reading procs to open (default 10)
    :param java_mem_mb: Integer of Java heap size in MB (default 256)
    :param ignore_logs: If True, ignore all files whose names start with an
        underscore.  Defaults to True.
    :returns: An iterator of key, value pairs.
    :raises: IOError: An error occurred reading the directory (e.g., not available).
    """
    import select
    hstreaming = _find_hstreaming()
    if isinstance(paths, (str, unicode)):
        paths = [paths]
    read_fds = set()
    procs = {}
    tb_fps = {}

    def _open_tb(cur_path):
        # Spawn a 'dumptb' subprocess whose stdout feeds one end of a pipe;
        # we keep the read end and wrap it in a TypedBytesFile.
        cmd = 'hadoop jar %s dumptb %s' % (hstreaming, cur_path)
        read_fd, write_fd = os.pipe()
        write_fp = os.fdopen(write_fd, 'w')
        p = _hadoop_fs_command(cmd, stdout=write_fp, java_mem_mb=java_mem_mb)
        write_fp.close()
        read_fds.add(read_fd)
        procs[read_fd] = p
        tb_fps[read_fd] = hadoopy.TypedBytesFile(read_fd=read_fd)

    def _path_gen():
        # Lazily expand each root path and open one reader per file; yielding
        # after each open lets the caller throttle how many procs are live.
        for root_path in paths:
            try:
                all_paths = ls(root_path)
            except IOError:
                raise IOError("No such file or directory: '%s'" % root_path)
            if ignore_logs:
                # Ignore any files that start with an underscore
                keep_file = lambda x: os.path.basename(x)[0] != '_'
                all_paths = filter(keep_file, all_paths)
            for cur_path in all_paths:
                yield _open_tb(cur_path)

    try:
        path_gen = _path_gen()
        for x in range(num_procs):
            try:
                path_gen.next()
            except (AttributeError, StopIteration):
                path_gen = None
        while read_fds:
            cur_fds = select.select(read_fds, [], [])[0]
            for read_fd in cur_fds:
                p = procs[read_fd]
                tb_fp = tb_fps[read_fd]
                try:
                    yield tb_fp.next()
                except StopIteration:
                    # This reader is exhausted: reap the process, close the
                    # pipe, and open the next pending file (if any).
                    p.wait()
                    del procs[read_fd]
                    del tb_fps[read_fd]
                    del p
                    os.close(read_fd)
                    read_fds.remove(read_fd)
                    try:
                        path_gen.next()
                    except (AttributeError, StopIteration):
                        path_gen = None
    finally:
        # Clean up outstanding procs if the consumer stops iterating early
        for p in procs.values():
            p.kill()
            p.wait()
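# Usage sketch (hedged): the HDFS path is hypothetical.  Unlike cat, readtb
# multiplexes up to num_procs 'dumptb' subprocesses via select(), so pairs
# from different files may interleave; do not rely on file order.
def _example_readtb():
    total = 0
    for key, value in readtb('tmp/example_output', num_procs=4):
        total += 1
    print 'readtb: %d key/value pairs' % total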