Пример #1
0
    def test_cat_compressed_stream(self):
        """read_file() should decompress .gz and .bz2 data given only a
        read()-able file object.

        Fixed for Python 3: GzipFile/BZ2File are binary streams, so they
        must be written bytes, and the fileobj handed to read_file() must
        be opened in binary mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\nbar\n')
        input_gz.close()

        # restrict a file object to only the read() method
        class OnlyReadWrapper(object):
            def __init__(self, fp):
                self.fp = fp

            def read(self, *args, **kwargs):
                return self.fp.read(*args, **kwargs)

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        with open(input_gz_path, 'rb') as f:
            for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
                output.append(line)

        self.assertEqual(output, [b'foo\n', b'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
        input_bz2.write(b'bar\nbar\nfoo\n')
        input_bz2.close()

        output = []
        for line in read_file(input_bz2_path,
                              fileobj=open(input_bz2_path, 'rb')):
            output.append(line)

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #2
0
    def test_cat_compressed_stream(self):
        """read_file() should decompress .gz and .bz2 data given only a
        read()-able file object.

        Fixed for Python 3: GzipFile/BZ2File are binary streams, so they
        must be written bytes, and the fileobj handed to read_file() must
        be opened in binary mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\nbar\n')
        input_gz.close()

        # restrict a file object to only the read() method
        class OnlyReadWrapper(object):

            def __init__(self, fp):
                self.fp = fp

            def read(self, *args, **kwargs):
                return self.fp.read(*args, **kwargs)

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        with open(input_gz_path, 'rb') as f:
            for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
                output.append(line)

        self.assertEqual(output, [b'foo\n', b'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
        input_bz2.write(b'bar\nbar\nfoo\n')
        input_bz2.close()

        output = []
        for line in read_file(input_bz2_path,
                              fileobj=open(input_bz2_path, 'rb')):
            output.append(line)

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #3
0
 def _cat_file(self, filename):
     """Stream the contents of *filename* from S3.

     The key's read() yields arbitrary chunks of bytes rather than
     whole lines, so read_file() is told not to treat them as lines.
     """
     s3_key = self.get_s3_key(filename)
     return read_file(
         s3_key_to_uri(s3_key), fileobj=s3_key, yields_lines=False)
Пример #4
0
    def _cat_file(self, filename):
        """Stream lines from *filename*.

        URIs are read from HDFS via ``hadoop fs -cat``; anything else is
        read from the local filesystem by the parent class.
        """
        if is_uri(filename):
            # stream from HDFS
            cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
            log.debug('> %s' % cmd_line(cat_args))

            cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

            def stream():
                # yield stdout first; stderr and the exit code are only
                # inspected once stdout is exhausted
                for line in cat_proc.stdout:
                    yield line

                # there shouldn't be any stderr
                for line in cat_proc.stderr:
                    log.error('STDERR: ' + line)

                # reap the subprocess after draining both pipes
                returncode = cat_proc.wait()

                if returncode != 0:
                    raise CalledProcessError(returncode, cat_args)

            return read_file(filename, stream())
        else:
            # read from local filesystem
            return super(HadoopJobRunner, self)._cat_file(filename)
Пример #5
0
    def _cat_file(self, filename):
        """Stream lines from *filename*.

        URIs are read from HDFS via ``hadoop fs -cat``; anything else is
        read from the local filesystem by the parent class.
        """
        if is_uri(filename):
            # stream from HDFS
            cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
            log.debug('> %s' % cmd_line(cat_args))

            cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

            def stream():
                # yield stdout first; stderr and the exit code are only
                # inspected once stdout is exhausted
                for line in cat_proc.stdout:
                    yield line

                # there shouldn't be any stderr
                for line in cat_proc.stderr:
                    log.error('STDERR: ' + line)

                # reap the subprocess after draining both pipes
                returncode = cat_proc.wait()

                if returncode != 0:
                    raise CalledProcessError(returncode, cat_args)

            return read_file(filename, stream())
        else:
            # read from local filesystem
            return super(HadoopJobRunner, self)._cat_file(filename)
Пример #6
0
    def test_read_file_uncompressed_stream(self):
        """Reading via an explicit fileobj should yield the file's lines."""
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "w") as input_file:
            input_file.write("bar\nfoo\n")

        output = list(read_file(input_path, fileobj=open(input_path)))

        self.assertEqual(output, ["bar\n", "foo\n"])
Пример #7
0
    def test_read_file_uncompressed_stream(self):
        """Reading via an explicit fileobj should yield the file's lines."""
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nfoo\n')

        output = list(read_file(input_path, fileobj=open(input_path)))

        assert_equal(output, ['bar\n', 'foo\n'])
Пример #8
0
    def test_read_uncompressed_file(self):
        """Plain files should come back as raw byte lines."""
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nfoo\n')

        output = list(read_file(input_path))

        self.assertEqual(output, [b'bar\n', b'foo\n'])
Пример #9
0
    def test_read_file_uncompressed_stream(self):
        """Reading via an explicit fileobj should yield the file's lines."""
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nfoo\n')

        output = list(read_file(input_path, fileobj=open(input_path)))

        self.assertEqual(output, ['bar\n', 'foo\n'])
Пример #10
0
    def test_read_uncompressed_file(self):
        """Plain files should come back as raw byte lines."""
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nfoo\n')

        output = list(read_file(input_path))

        self.assertEqual(output, [b'bar\n', b'foo\n'])
Пример #11
0
    def test_read_uncompressed_file(self):
        """Plain files should come back as raw byte lines."""
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "wb") as input_file:
            input_file.write(b"bar\nfoo\n")

        output = list(read_file(input_path))

        self.assertEqual(output, [b"bar\n", b"foo\n"])
Пример #12
0
    def test_read_bz2_file(self):
        """read_file() should transparently decompress .bz2 paths."""
        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        with bz2.BZ2File(input_bz2_path, 'wb') as input_bz2:
            input_bz2.write(b'bar\nbar\nfoo\n')

        output = list(read_file(input_bz2_path))

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #13
0
    def test_read_gz_file(self):
        """read_file() should transparently decompress .gz paths."""
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
            input_gz.write(b'foo\nbar\n')

        output = list(read_file(input_gz_path))

        self.assertEqual(output, [b'foo\n', b'bar\n'])
Пример #14
0
    def test_read_bz2_file(self):
        """read_file() should transparently decompress .bz2 paths."""
        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        with bz2.BZ2File(input_bz2_path, 'wb') as input_bz2:
            input_bz2.write(b'bar\nbar\nfoo\n')

        output = list(read_file(input_bz2_path))

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #15
0
    def _cat_file(self, filename):
        """Yield lines from the S3 key backing *filename*."""
        # stream lines from the s3 key
        s3_key = self.get_s3_key(filename)

        # download the whole key into an in-memory buffer, then rewind it
        stream = StringIO()
        s3_key.get_file(stream)
        stream.seek(0)

        # read_file() yields buffers here; convert them to single lines
        return buffer_iterator_to_line_iterator(
            read_file(s3_key_to_uri(s3_key), fileobj=stream))
Пример #16
0
    def test_read_gz_file(self):
        """read_file() should transparently decompress .gz paths."""
        input_gz_path = os.path.join(self.tmp_dir, "input.gz")
        with gzip.GzipFile(input_gz_path, "wb") as input_gz:
            input_gz.write(b"foo\nbar\n")

        output = list(read_file(input_gz_path))

        self.assertEqual(output, [b"foo\n", b"bar\n"])
Пример #17
0
    def test_read_bz2_file(self):
        """read_file() should transparently decompress .bz2 paths."""
        input_bz2_path = os.path.join(self.tmp_dir, "input.bz2")
        with bz2.BZ2File(input_bz2_path, "wb") as input_bz2:
            input_bz2.write(b"bar\nbar\nfoo\n")

        output = list(read_file(input_bz2_path))

        self.assertEqual(output, [b"bar\n", b"bar\n", b"foo\n"])
Пример #18
0
    def test_read_gz_file(self):
        """read_file() should transparently decompress .gz paths."""
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
            input_gz.write(b'foo\nbar\n')

        output = list(read_file(input_gz_path))

        self.assertEqual(output, [b'foo\n', b'bar\n'])
Пример #19
0
    def _cat_file(self, gcs_uri):
        """Download *gcs_uri* into a temp file and yield its lines."""
        tmp_fd, tmp_path = tempfile.mkstemp()

        with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
            # pull the object's bytes down into the temp file
            self._download_io(gcs_uri, tmp_fileobj)

            # rewind so read_file() starts from the beginning
            tmp_fileobj.seek(0)

            # the fileobj yields raw chunks, not lines
            for current_line in read_file(
                    gcs_uri, fileobj=tmp_fileobj, yields_lines=False):
                yield current_line
Пример #20
0
    def test_read_gz_file_from_fileobj(self):
        """Decompression should work given only a read()-able object.

        Fixed for Python 3: GzipFile is a binary stream, so it must be
        written bytes, and the wrapped fileobj must be opened in binary
        mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\nbar\n')
        input_gz.close()

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        with open(input_gz_path, 'rb') as f:
            for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
                output.append(line)

        self.assertEqual(output, [b'foo\n', b'bar\n'])
Пример #21
0
    def test_read_bz2_file_from_fileobj(self):
        """Decompression should work given only a read()-able object."""
        input_bz2_path = os.path.join(self.tmp_dir, "input.bz2")
        with bz2.BZ2File(input_bz2_path, "wb") as input_bz2:
            input_bz2.write(b"bar\nbar\nfoo\n")

        with open(input_bz2_path, "rb") as f:
            output = list(
                read_file(input_bz2_path, fileobj=OnlyReadWrapper(f)))

        self.assertEqual(output, [b"bar\n", b"bar\n", b"foo\n"])
Пример #22
0
    def test_read_gz_file_from_fileobj(self):
        """Decompression should work given only a read()-able object.

        Fixed for Python 3: GzipFile is a binary stream, so it must be
        written bytes, and the wrapped fileobj must be opened in binary
        mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\nbar\n')
        input_gz.close()

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        with open(input_gz_path, 'rb') as f:
            for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
                output.append(line)

        self.assertEqual(output, [b'foo\n', b'bar\n'])
Пример #23
0
    def test_read_bz2_file_from_fileobj(self):
        """Decompression should work given only a read()-able object."""
        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        with bz2.BZ2File(input_bz2_path, 'wb') as input_bz2:
            input_bz2.write(b'bar\nbar\nfoo\n')

        with open(input_bz2_path, 'rb') as f:
            output = list(
                read_file(input_bz2_path, fileobj=OnlyReadWrapper(f)))

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #24
0
    def test_read_bz2_file_from_fileobj(self):
        """Decompression should work given only a read()-able object."""
        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        with bz2.BZ2File(input_bz2_path, 'wb') as input_bz2:
            input_bz2.write(b'bar\nbar\nfoo\n')

        with open(input_bz2_path, 'rb') as f:
            output = list(
                read_file(input_bz2_path, fileobj=OnlyReadWrapper(f)))

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #25
0
    def _cat_file(self, gcs_uri):
        """Download *gcs_uri* into a temp file and yield its lines."""
        tmp_fd, tmp_path = tempfile.mkstemp()

        with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
            # pull the object's bytes down into the temp file
            self._download_io(gcs_uri, tmp_fileobj)

            # rewind so read_file() starts from the beginning
            tmp_fileobj.seek(0)

            # the fileobj yields raw chunks, not lines
            for current_line in read_file(
                    gcs_uri, fileobj=tmp_fileobj, yields_lines=False):
                yield current_line
Пример #26
0
    def test_cat_compressed_stream(self):
        """read_file() should decompress .gz and .bz2 data from a fileobj.

        Fixed for Python 3: GzipFile/BZ2File are binary streams, so they
        must be written bytes, and the fileobj handed to read_file() must
        be opened in binary mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, "input.gz")
        input_gz = gzip.GzipFile(input_gz_path, "wb")
        input_gz.write(b"foo\nbar\n")
        input_gz.close()

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        for line in read_file(input_gz_path,
                              fileobj=open(input_gz_path, "rb")):
            output.append(line)

        self.assertEqual(output, [b"foo\n", b"bar\n"])

        input_bz2_path = os.path.join(self.tmp_dir, "input.bz2")
        input_bz2 = bz2.BZ2File(input_bz2_path, "wb")
        input_bz2.write(b"bar\nbar\nfoo\n")
        input_bz2.close()

        output = []
        for line in read_file(input_bz2_path,
                              fileobj=open(input_bz2_path, "rb")):
            output.append(line)

        self.assertEqual(output, [b"bar\n", b"bar\n", b"foo\n"])
Пример #27
0
 def _cat_file(self, filename):
     """Cat a remote file over SSH and stream its lines."""
     ssh_match = SSH_URI_RE.match(filename)
     # no hostname in the URI means the file lives on the master node
     addr = ssh_match.group('hostname') or self._address_of_master()
     # NOTE(review): '!' in the address appears to mean hopping through
     # another host, which requires a named key — confirm in ssh_cat()
     if '!' in addr and self.ssh_key_name is None:
         raise ValueError('ssh_key_name must not be None')
     # fetch the whole file in one SSH invocation, then let read_file()
     # split (and possibly decompress) it
     output = ssh_cat(self._ssh_bin, addr, self._ec2_key_pair_file,
                      ssh_match.group('filesystem_path'),
                      self.ssh_key_name)
     return read_file(filename, fileobj=StringIO(output))
Пример #28
0
Файл: ssh.py Проект: Anihc/mrjob
 def _cat_file(self, filename):
     """Cat a remote file over SSH and stream its lines."""
     ssh_match = SSH_URI_RE.match(filename)
     # no hostname in the URI means the file lives on the master node
     addr = ssh_match.group('hostname') or self._address_of_master()
     # NOTE(review): '!' in the address appears to mean hopping through
     # another host, which requires a named key — confirm in ssh_cat()
     if '!' in addr and self.ssh_key_name is None:
         raise ValueError('ssh_key_name must not be None')
     # fetch the whole file in one SSH invocation, then let read_file()
     # split (and possibly decompress) it
     output = ssh_cat(self._ssh_bin, addr, self._ec2_key_pair_file,
                      ssh_match.group('filesystem_path'),
                      self.ssh_key_name)
     return read_file(filename, fileobj=StringIO(output))
Пример #29
0
    def test_cat_compressed_stream(self):
        """read_file() should decompress .gz and .bz2 data from a fileobj.

        Fixed for Python 3: GzipFile/BZ2File are binary streams, so they
        must be written bytes, and the fileobj handed to read_file() must
        be opened in binary mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\nbar\n')
        input_gz.close()

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        for line in read_file(input_gz_path,
                              fileobj=open(input_gz_path, 'rb')):
            output.append(line)

        assert_equal(output, [b'foo\n', b'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
        input_bz2.write(b'bar\nbar\nfoo\n')
        input_bz2.close()

        output = []
        for line in read_file(input_bz2_path,
                              fileobj=open(input_bz2_path, 'rb')):
            output.append(line)

        assert_equal(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #30
0
    def test_cat_compressed_stream(self):
        """read_file() should decompress .gz and .bz2 data from a fileobj.

        Fixed for Python 3: GzipFile/BZ2File are binary streams, so they
        must be written bytes, and the fileobj handed to read_file() must
        be opened in binary mode; decompressed lines come back as bytes.
        """
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\nbar\n')
        input_gz.close()

        output = []
        # binary mode: the decompressor needs raw bytes, not decoded text
        for line in read_file(input_gz_path,
                              fileobj=open(input_gz_path, 'rb')):
            output.append(line)

        self.assertEqual(output, [b'foo\n', b'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
        input_bz2.write(b'bar\nbar\nfoo\n')
        input_bz2.close()

        output = []
        for line in read_file(input_bz2_path,
                              fileobj=open(input_bz2_path, 'rb')):
            output.append(line)

        self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
Пример #31
0
    def _cat_file(self, filename):
        """Cat a remote file over SSH and stream its lines."""
        ssh_match = _SSH_URI_RE.match(filename)
        # no hostname in the URI means the file lives on the master node
        addr = ssh_match.group('hostname') or self._address_of_master()

        keyfile = self._key_filename_for(addr)

        # fetch the whole file in one SSH invocation, then let
        # read_file() split (and possibly decompress) it
        output = _ssh_cat(self._ssh_bin, addr, self._ec2_key_pair_file,
                          ssh_match.group('filesystem_path'), keyfile)
        return read_file(filename, fileobj=BytesIO(output))
Пример #32
0
    def _cat_file(self, filename):
        """Cat a remote file over SSH and stream its lines."""
        ssh_match = _SSH_URI_RE.match(filename)
        # no hostname in the URI means the file lives on the master node
        addr = ssh_match.group('hostname') or self._address_of_master()

        keyfile = self._key_filename_for(addr)

        # fetch the whole file in one SSH invocation, then let
        # read_file() split (and possibly decompress) it
        output = ssh_cat(self._ssh_bin, addr, self._ec2_key_pair_file,
                         ssh_match.group('filesystem_path'), keyfile)
        return read_file(filename, fileobj=BytesIO(output))
Пример #33
0
    def _cat_file(self, filename):
        """Stream lines of *filename* from HDFS via ``hadoop fs -cat``."""
        # stream from HDFS
        cat_args = self._hadoop_bin + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def cleanup():
            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            # reap the subprocess only after draining stderr
            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        # read_file() presumably invokes cleanup() once the stream ends —
        # see read_file's docs to confirm
        return read_file(filename, cat_proc.stdout, cleanup=cleanup)
Пример #34
0
    def _cat_file(self, filename):
        """Stream lines of *filename* from HDFS via ``hadoop fs -cat``."""
        # stream from HDFS
        cat_args = self._hadoop_bin + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def cleanup():
            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            # reap the subprocess only after draining stderr
            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        # read_file() presumably invokes cleanup() once the stream ends —
        # see read_file's docs to confirm
        return read_file(filename, cat_proc.stdout, cleanup=cleanup)
Пример #35
0
    def test_dont_split_gz(self):
        """A .gz input can't be split, so it must stay in one piece.

        Fixed for Python 3: GzipFile is a binary stream, so contents are
        written (and read back by read_file()) as bytes.
        """
        contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n',
                       b'foo\n']
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()

        input_path2 = os.path.join(self.tmp_dir, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            lines = list(read_file(file_name))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content),
                         all_contents_sorted)
Пример #36
0
    def gz_test(self, dir_path_name):
        """Shared check: a .gz input must be kept whole when splitting."""
        gz_lines = [
            b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n'
        ]
        plain_lines = [b'foo\n', b'bar\n', b'bar\n']
        expected_sorted = sorted(gz_lines + plain_lines)

        gz_path = os.path.join(dir_path_name, 'input.gz')
        with gzip.GzipFile(gz_path, 'wb') as gz_file:
            gz_file.write(b''.join(gz_lines))
        plain_path = os.path.join(dir_path_name, 'input2')
        with open(plain_path, 'wb') as plain_file:
            plain_file.write(b''.join(plain_lines))

        runner = LocalMRJobRunner(conf_paths=[])

        # ask for three splits across both input files
        file_splits = runner._get_file_splits([gz_path, plain_path], 3)

        # the .gz file must occupy exactly one split that covers the whole
        # file, from byte 0 to its full length
        for info in file_splits.values():
            if info['orig_name'] == gz_path:
                self.assertEqual(info['start'], 0)
                self.assertEqual(info['length'],
                                 os.stat(gz_path)[stat.ST_SIZE])

        # we asked for (and should get) three splits
        self.assertEqual(len(file_splits), 3)

        # no line may be lost or duplicated by splitting
        all_lines = []
        for split_name in file_splits:
            split_lines = list(read_file(split_name))

            # the .gz split must contain the complete original contents
            if split_name == gz_path:
                self.assertEqual(split_lines, gz_lines)

            all_lines.extend(split_lines)

        self.assertEqual(sorted(all_lines), expected_sorted)
Пример #37
0
    def test_read_large_bz2_file(self):
        """Catch incorrect use of the bz2 library (Issue #814).

        Fixed for Python 3: ``xrange`` no longer exists and BZ2File is a
        binary stream, so lines are written and compared as bytes.
        """
        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')

        # can't just repeat same value, because we need the file to be
        # compressed! 50000 lines is too few to catch the bug.
        random.seed(0)
        for _ in range(100000):
            input_bz2.write(
                ('%016x\n' % random.randint(0, 2 ** 64 - 1))
                .encode('ascii'))
        input_bz2.close()

        # re-seed so the same sequence is regenerated for comparison
        random.seed(0)
        num_lines = 0
        for line in read_file(input_bz2_path):
            self.assertEqual(
                line,
                ('%016x\n' % random.randint(0, 2 ** 64 - 1))
                .encode('ascii'))
            num_lines += 1

        self.assertEqual(num_lines, 100000)
Пример #38
0
    def test_read_large_bz2_file(self):
        """Catch incorrect use of the bz2 library (Issue #814).

        Fixed for Python 3: ``xrange`` no longer exists and BZ2File is a
        binary stream, so lines are written and compared as bytes.
        """
        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')

        # can't just repeat same value, because we need the file to be
        # compressed! 50000 lines is too few to catch the bug.
        random.seed(0)
        for _ in range(100000):
            input_bz2.write(
                ('%016x\n' % random.randint(0, 2 ** 64 - 1))
                .encode('ascii'))
        input_bz2.close()

        # re-seed so the same sequence is regenerated for comparison
        random.seed(0)
        num_lines = 0
        for line in read_file(input_bz2_path):
            self.assertEqual(
                line,
                ('%016x\n' % random.randint(0, 2 ** 64 - 1))
                .encode('ascii'))
            num_lines += 1

        self.assertEqual(num_lines, 100000)
Пример #39
0
    def test_read_large_bz2_file(self):
        # catch incorrect use of bz2 library (Issue #814)

        bz2_path = os.path.join(self.tmp_dir, "input.bz2")
        writer = bz2.BZ2File(bz2_path, "wb")

        # can't just repeat same value, because we need the file to be
        # compressed! 50000 lines is too few to catch the bug.
        with random_seed(0):
            for _ in range(100000):
                writer.write((random_identifier() + "\n").encode("ascii"))
            writer.close()

        # replaying the same seed regenerates the expected lines
        with random_seed(0):
            count = 0
            for line in read_file(bz2_path):
                self.assertEqual(
                    line, (random_identifier() + "\n").encode("ascii"))
                count += 1

            self.assertEqual(count, 100000)
Пример #40
0
    def _cat_file(self, filename):
        """Stream lines of *filename* from HDFS via ``hadoop fs -cat``."""
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

            # close both pipes before reaping the subprocess
            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        # read_file() presumably invokes cleanup() once the stream ends —
        # see read_file's docs to confirm
        return read_file(filename, cat_proc.stdout, cleanup=cleanup)
Пример #41
0
    def _cat_file(self, filename):
        """Stream lines of *filename* from HDFS via ``hadoop fs -cat``."""
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

            # close both pipes before reaping the subprocess
            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        # read_file() presumably invokes cleanup() once the stream ends —
        # see read_file's docs to confirm
        return read_file(filename, cat_proc.stdout, cleanup=cleanup)
Пример #42
0
    def gz_test(self, dir_path_name):
        """Shared check: a .gz input must be kept whole when splitting."""
        gz_lines = [b"bar\n", b"qux\n", b"foo\n", b"bar\n", b"qux\n",
                    b"foo\n"]
        plain_lines = [b"foo\n", b"bar\n", b"bar\n"]
        expected_sorted = sorted(gz_lines + plain_lines)

        gz_path = os.path.join(dir_path_name, "input.gz")
        with gzip.GzipFile(gz_path, "wb") as gz_file:
            gz_file.write(b"".join(gz_lines))
        plain_path = os.path.join(dir_path_name, "input2")
        with open(plain_path, "wb") as plain_file:
            plain_file.write(b"".join(plain_lines))

        runner = LocalMRJobRunner(conf_paths=[])

        # ask for three splits across both input files
        file_splits = runner._get_file_splits([gz_path, plain_path], 3)

        # the .gz file must occupy exactly one split that covers the whole
        # file, from byte 0 to its full length
        for info in file_splits.values():
            if info["orig_name"] == gz_path:
                self.assertEqual(info["start"], 0)
                self.assertEqual(info["length"],
                                 os.stat(gz_path)[stat.ST_SIZE])

        # we asked for (and should get) three splits
        self.assertEqual(len(file_splits), 3)

        # no line may be lost or duplicated by splitting
        all_lines = []
        for split_name in file_splits:
            split_lines = list(read_file(split_name))

            # the .gz split must contain the complete original contents
            if split_name == gz_path:
                self.assertEqual(split_lines, gz_lines)

            all_lines.extend(split_lines)

        self.assertEqual(sorted(all_lines), expected_sorted)
Пример #43
0
    def test_read_large_bz2_file(self):
        # catch incorrect use of bz2 library (Issue #814)

        bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        writer = bz2.BZ2File(bz2_path, 'wb')

        # can't just repeat same value, because we need the file to be
        # compressed! 50000 lines is too few to catch the bug.
        with random_seed(0):
            for _ in range(100000):
                writer.write((random_identifier() + '\n').encode('ascii'))
            writer.close()

        # replaying the same seed regenerates the expected lines
        with random_seed(0):
            count = 0
            for line in read_file(bz2_path):
                self.assertEqual(
                    line, (random_identifier() + '\n').encode('ascii'))
                count += 1

            self.assertEqual(count, 100000)
Пример #44
0
 def _cat_file(self, filename):
     """Yield lines from the S3 key backing *filename*."""
     # stream lines from the s3 key
     s3_key = self.get_s3_key(filename)
     # read_file() yields raw buffers here; turn them into single lines
     return buffer_iterator_to_line_iterator(
         read_file(s3_key_to_uri(s3_key), fileobj=s3_key))
Пример #45
0
 def _cat_file(self, filename):
     """cat a file, decompress if necessary."""
     # stay a generator so callers iterate lazily
     for current_line in read_file(filename):
         yield current_line
Пример #46
0
 def _cat_file(self, filename):
     """Yield the (possibly decompressed) lines of *filename*."""
     # stay a generator so callers iterate lazily
     for current_line in read_file(filename):
         yield current_line
Пример #47
0
 def _cat_file(self, filename):
     """Stream the contents of *filename* from S3.

     The key's read() yields arbitrary chunks of bytes rather than
     whole lines, so read_file() is told not to treat them as lines.
     """
     s3_key = self.get_s3_key(filename)
     return read_file(s3_key_to_uri(s3_key),
                      fileobj=s3_key,
                      yields_lines=False)
Пример #48
0
 def _cat_file(self, filename):
     """Stream the (possibly decompressed) contents of *filename*."""
     return read_file(filename)
Пример #49
0
    def _cat_file(self, filename):
        """Stream the contents of *filename* from S3 as chunks of bytes."""
        # stream lines from the s3 key
        key = self._get_s3_key(filename)
        # the streaming 'Body' object yields arbitrary byte chunks, not
        # whole lines, so read_file() is told not to treat them as lines
        return read_file(
            filename, fileobj=key.get()['Body'], yields_lines=False)
Пример #50
0
Файл: s3.py Проект: yuanda/mrjob
 def _cat_file(self, filename):
     """Yield lines from the S3 key backing *filename*."""
     # stream lines from the s3 key
     s3_key = self.get_s3_key(filename)
     # read_file() yields raw buffers here; turn them into single lines
     return buffer_iterator_to_line_iterator(
         read_file(s3_key_to_uri(s3_key), fileobj=s3_key))
Пример #51
0
 def _cat_file(self, filename):
     """Stream the (possibly decompressed) contents of *filename*."""
     return read_file(filename)