def _cat_file(self, gcs_uri):
    tmp_fd, tmp_path = tempfile.mkstemp()

    with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
        self._download_io(gcs_uri, tmp_fileobj)

        tmp_fileobj.seek(0)

        for chunk in decompress(tmp_fileobj, gcs_uri):
            yield chunk
def _cat_file(self, filename):
    m = _SSH_URI_RE.match(filename)
    addr = m.group('hostname')
    path = m.group('filesystem_path')

    p = self._ssh_launch(addr, ['cat', path])

    for chunk in decompress(p.stdout, path):
        yield chunk

    self._ssh_finish_run(p)
def _cat_file(self, path):
    m = _SSH_URI_RE.match(path)
    addr = m.group('hostname')
    fs_path = m.group('filesystem_path')

    p = self._ssh_launch(addr, ['cat', fs_path])

    for chunk in decompress(p.stdout, fs_path):
        yield chunk

    self._ssh_finish_run(p)
def read_file(path, fileobj=None, yields_lines=True, cleanup=None):
    """Yields lines from a file, possibly decompressing it based on file
    extension.

    Currently we handle compressed files with the extensions ``.gz`` and
    ``.bz2``.

    :param string path: file path. Need not be a path on the local
                        filesystem (URIs are okay) as long as you specify
                        *fileobj* too.
    :param fileobj: file object to read from. Need not be seekable. If this
                    is omitted, we ``open(path)``.
    :param yields_lines: Does iterating over *fileobj* yield lines (like
                         file objects are supposed to)? If not, set this to
                         ``False`` (useful for objects that correspond to
                         objects on cluster filesystems)
    :param cleanup: Optional callback to call with no arguments when EOF is
                    reached or an exception is thrown.

    .. deprecated:: 0.6.0
    """
    log.warning('read_file() is deprecated and will be removed in v0.7.0.'
                ' Try mrjob.cat.decompress() and mrjob.util.to_lines()')

    # sometimes values declared in the ``try`` block aren't accessible
    # from the ``finally`` block. not sure why.
    f = None

    try:
        # open path if we need to
        if fileobj is None:
            f = open(path, 'rb')
        else:
            f = fileobj

        decompressed_f = decompress(f, path)

        if decompressed_f is f and yields_lines:
            # this could be important; iterating over to_lines(f) is about
            # 8x slower than iterating over f
            lines = f
        else:
            lines = to_lines(decompressed_f)

        for line in lines:
            yield line
    finally:
        try:
            if f and f is not fileobj:
                f.close()
        finally:
            if cleanup:
                cleanup()
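The deprecation warning above points at ``mrjob.cat.decompress()`` plus ``mrjob.util.to_lines()`` as the replacement. A minimal sketch of that pattern (the path ``data.txt.gz`` is just an illustrative example, not part of mrjob):

# Hedged sketch of the replacement recommended by the warning in read_file().
# 'data.txt.gz' is a hypothetical input file.
from mrjob.cat import decompress
from mrjob.util import to_lines

path = 'data.txt.gz'

with open(path, 'rb') as f:
    # decompress() picks the codec from the file extension;
    # to_lines() re-chunks the decompressed byte stream into lines
    for line in to_lines(decompress(f, path)):
        print(line)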
def _cat_file(self, gcs_uri):
    blob = self._get_blob(gcs_uri)

    if not blob:
        return  # don't cat nonexistent files

    with TemporaryFile(dir=self._local_tmp_dir) as temp:
        blob.download_to_file(temp)

        # now read from that file
        temp.seek(0)

        for chunk in decompress(temp, gcs_uri):
            yield chunk
def gz_test(self, dir_path_name):
    contents_gz = [
        b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n']
    contents_normal = [b'foo\n', b'bar\n', b'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = join(dir_path_name, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b''.join(contents_gz))
    input_gz.close()

    input_path2 = join(dir_path_name, 'input2')
    with open(input_path2, 'wb') as input_file:
        input_file.write(b''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name, 'rb') as f:
            lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)
def _read_input(self):
    """Read from stdin, or one or more files, or directories.
    Yield one line at a time.

    - Resolve globs (``foo_*.gz``).
    - Decompress ``.gz`` and ``.bz2`` files.
    - If path is ``-``, read from STDIN.
    - Recursively read all files in a directory
    """
    paths = self.options.args or ['-']

    for path in paths:
        if path == '-':
            for line in self.stdin:
                yield line
        else:
            with open(path, 'rb') as f:
                for line in to_lines(decompress(f, path)):
                    yield line
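The ``-`` convention above means "read raw lines from stdin"; only real file paths go through ``decompress()``. A hedged, standalone sketch of that dispatch (the ``read_lines`` helper is hypothetical, not part of mrjob):

# Hypothetical standalone version of the dispatch in _read_input() above:
# '-' reads binary lines from stdin; anything else is opened and possibly
# decompressed based on its extension.
import sys

from mrjob.cat import decompress
from mrjob.util import to_lines


def read_lines(paths):
    for path in (paths or ['-']):
        if path == '-':
            for line in sys.stdin.buffer:
                yield line
        else:
            with open(path, 'rb') as f:
                for line in to_lines(decompress(f, path)):
                    yield line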
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    for chunk in decompress(cat_proc.stdout, filename):
        yield chunk

    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
def _cat_file(self, path):
    return decompress(self._cat_blob(path), path)
def _cat_file(self, filename):
    # stream lines from the s3 key
    s3_key = self._get_s3_key(filename)
    body = s3_key.get()['Body']

    return decompress(body, filename)
def _cat_file(self, gcs_uri):
    return decompress(self._cat_blob(gcs_uri), gcs_uri)
def _split_mapper_input(self, input_paths, step_num):
    """Take one or more input paths (which may be compressed) and split
    them to create the input files for the map tasks.

    Returns a list of "splits", which are dictionaries with the following
    keys:

    input: path of input for one mapper
    file: path of original file
    start, length: chunk of original file in *input*

    Compressed files will not be split (even ``.bz2`` files); uncompressed
    files will be split so as to attempt to create twice as many input
    files as there are mappers.
    """
    input_paths = list(input_paths)

    manifest = (step_num == 0 and self._uses_input_manifest())

    # determine split size
    if manifest:
        split_size = 1  # one line per mapper
    else:
        split_size = self._pick_mapper_split_size(input_paths, step_num)

    # yield output fileobjs as needed
    split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

    results = []

    for path in input_paths:
        with open(path, 'rb') as src:
            if is_compressed(path):
                if manifest:
                    raise Exception('input manifest %s should not be'
                                    ' compressed!' % path)

                # if file is compressed, uncompress it into a single split.
                # Hadoop tracks the compressed file's size
                size = os.stat(path)[stat.ST_SIZE]

                with next(split_fileobj_gen) as dest:
                    for chunk in decompress(src, path):
                        dest.write(chunk)

                results.append(dict(
                    file=path,
                    start=0,
                    length=size,
                ))
            else:
                # otherwise, split into one or more input files
                start = 0
                length = 0

                for lines in _split_records(src, split_size):
                    with next(split_fileobj_gen) as dest:
                        for line in lines:
                            # simulate NLinesInputFormat by prefixing
                            # each line with byte number
                            if manifest:
                                i = start + length
                                dest.write(('%d\t' % i).encode('ascii'))

                            dest.write(line)
                            length += len(line)

                    results.append(dict(
                        file=path,
                        start=start,
                        length=length,
                    ))

                    start += length
                    length = 0

    return results
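For a compressed input, the whole file lands in one split and the recorded length is the compressed size on disk, while the bytes written to the split are decompressed. A rough, self-contained sketch of that branch in isolation (``input.gz`` and ``split_path`` are hypothetical names used only for illustration):

# Hedged sketch of the compressed-file branch of _split_mapper_input() above.
# 'input.gz' and split_path are hypothetical; mrjob manages these paths itself.
import os
import stat

from mrjob.cat import decompress

path = 'input.gz'
split_path = 'split-00000'

size = os.stat(path)[stat.ST_SIZE]  # Hadoop tracks the *compressed* size

with open(path, 'rb') as src, open(split_path, 'wb') as dest:
    for chunk in decompress(src, path):  # write decompressed bytes
        dest.write(chunk)

split = dict(file=path, start=0, length=size)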
def _split_mapper_input(self, input_paths, step_num):
    """Take one or more input paths (which may be compressed) and split
    them to create the input files for the map tasks.

    Returns a list of "splits", which are dictionaries with the following
    keys:

    input: path of input for one mapper
    file: path of original file
    start, length: chunk of original file in *input*

    Compressed files will not be split (even ``.bz2`` files); uncompressed
    files will be split so as to attempt to create twice as many input
    files as there are mappers.
    """
    input_paths = list(input_paths)

    # determine split size
    split_size = self._pick_mapper_split_size(input_paths, step_num)

    # yield output fileobjs as needed
    split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

    results = []

    for path in input_paths:
        with open(path, 'rb') as src:
            if is_compressed(path):
                # if file is compressed, uncompress it into a single split.
                # Hadoop tracks the compressed file's size
                size = os.stat(path)[stat.ST_SIZE]

                with next(split_fileobj_gen) as dest:
                    for chunk in decompress(src, path):
                        dest.write(chunk)

                results.append(dict(
                    file=path,
                    start=0,
                    length=size,
                ))
            else:
                # otherwise, split into one or more input files
                start = 0
                length = 0

                for lines in _split_records(src, split_size):
                    with next(split_fileobj_gen) as dest:
                        for line in lines:
                            dest.write(line)
                            length += len(line)

                    results.append(dict(
                        file=path,
                        start=start,
                        length=length,
                    ))

                    start += length
                    length = 0

    return results
def _cat_file(self, path):
    path = _from_file_uri(path)

    with open(path, 'rb') as f:
        for chunk in decompress(f, path):
            yield chunk
def _cat_file(self, filename):
    with open(filename, 'rb') as f:
        for chunk in decompress(f, filename):
            yield chunk