Example #1
File: gcs.py Project: okomestudio/mrjob
    def _cat_file(self, gcs_uri):
        tmp_fd, tmp_path = tempfile.mkstemp()

        with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
            self._download_io(gcs_uri, tmp_fileobj)

            tmp_fileobj.seek(0)

            for chunk in decompress(tmp_fileobj, gcs_uri):
                yield chunk
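
Note: every example on this page follows the same pattern: open a raw byte stream and hand it to decompress(), which picks a codec from the path's extension (.gz or .bz2, per the read_file() docstrings below) or passes the bytes through unchanged. A minimal stand-alone sketch of that idea (a generic illustration, not mrjob's actual implementation):

import bz2
import gzip


def decompress_chunks(readable, path, bufsize=4096):
    # pick a decompressor based on the path's extension; otherwise
    # stream the bytes through untouched
    if path.endswith('.gz'):
        readable = gzip.GzipFile(fileobj=readable, mode='rb')
    elif path.endswith('.bz2'):
        readable = bz2.BZ2File(readable, mode='rb')

    while True:
        chunk = readable.read(bufsize)
        if not chunk:
            return
        yield chunk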
Example #2
File: gcs.py Project: Alberne/mrjob
    def _cat_file(self, gcs_uri):
        tmp_fd, tmp_path = tempfile.mkstemp()

        with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
            self._download_io(gcs_uri, tmp_fileobj)

            tmp_fileobj.seek(0)

            for chunk in decompress(tmp_fileobj, gcs_uri):
                yield chunk
Example #3
File: ssh.py Project: Affirm/mrjob
    def _cat_file(self, filename):
        m = _SSH_URI_RE.match(filename)
        addr = m.group('hostname')
        path = m.group('filesystem_path')

        p = self._ssh_launch(addr, ['cat', path])

        for chunk in decompress(p.stdout, path):
            yield chunk

        self._ssh_finish_run(p)
Example #4
    def _cat_file(self, path):
        m = _SSH_URI_RE.match(path)
        addr = m.group('hostname')
        fs_path = m.group('filesystem_path')

        p = self._ssh_launch(addr, ['cat', fs_path])

        for chunk in decompress(p.stdout, fs_path):
            yield chunk

        self._ssh_finish_run(p)
Example #5
File: util.py Project: zhiaozhou/mrjob
def read_file(path, fileobj=None, yields_lines=True, cleanup=None):
    """Yields lines from a file, possibly decompressing it based on file
    extension.

    Currently we handle compressed files with the extensions ``.gz`` and
    ``.bz2``.

    :param string path: file path. Need not be a path on the local filesystem
                        (URIs are okay) as long as you specify *fileobj* too.
    :param fileobj: file object to read from. Need not be seekable. If this
                    is omitted, we ``open(path)``.
    :param yields_lines: Does iterating over *fileobj* yield lines (like
                         file objects are supposed to)? If not, set this to
                         ``False`` (useful for objects that correspond
                         to objects on cluster filesystems)
    :param cleanup: Optional callback to call with no arguments when EOF is
                    reached or an exception is thrown.

    .. deprecated:: 0.6.0
    """
    log.warning('read_file() is deprecated and will be removed in v0.7.0.'
                ' Try mrjob.cat.decompress() and mrjob.util.to_lines()')

    # sometimes values declared in the ``try`` block aren't accessible from the
    # ``finally`` block. not sure why.
    f = None
    try:
        # open path if we need to
        if fileobj is None:
            f = open(path, 'rb')
        else:
            f = fileobj

        decompressed_f = decompress(f, path)

        if decompressed_f is f and yields_lines:
            # this could be important; iterating over to_lines(f) is about 8x
            # slower than iterating over f
            lines = f
        else:
            lines = to_lines(decompressed_f)

        for line in lines:
            yield line
    finally:
        try:
            if f and f is not fileobj:
                f.close()
        finally:
            if cleanup:
                cleanup()
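
The deprecation warning above points to mrjob.cat.decompress() and mrjob.util.to_lines() as the replacement. A hedged sketch of how a caller might migrate away from read_file(), assuming decompress() yields byte chunks and to_lines() regroups them into lines as the other examples on this page suggest:

from mrjob.cat import decompress
from mrjob.util import to_lines


def read_lines(path):
    # roughly what read_file(path) did for a local file: yield lines,
    # decompressing .gz/.bz2 based on the extension
    with open(path, 'rb') as f:
        for line in to_lines(decompress(f, path)):
            yield line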
Example #6
File: util.py Project: Yelp/mrjob
def read_file(path, fileobj=None, yields_lines=True, cleanup=None):
    """Yields lines from a file, possibly decompressing it based on file
    extension.

    Currently we handle compressed files with the extensions ``.gz`` and
    ``.bz2``.

    :param string path: file path. Need not be a path on the local filesystem
                        (URIs are okay) as long as you specify *fileobj* too.
    :param fileobj: file object to read from. Need not be seekable. If this
                    is omitted, we ``open(path)``.
    :param yields_lines: Does iterating over *fileobj* yield lines (like
                         file objects are supposed to)? If not, set this to
                         ``False`` (useful for objects that correspond
                         to objects on cluster filesystems)
    :param cleanup: Optional callback to call with no arguments when EOF is
                    reached or an exception is thrown.

    .. deprecated:: 0.6.0
    """
    log.warning('read_file() is deprecated and will be removed in v0.7.0.'
                ' Try mrjob.cat.decompress() and mrjob.util.to_lines()')

    # sometimes values declared in the ``try`` block aren't accessible from the
    # ``finally`` block. not sure why.
    f = None
    try:
        # open path if we need to
        if fileobj is None:
            f = open(path, 'rb')
        else:
            f = fileobj

        decompressed_f = decompress(f, path)

        if decompressed_f is f and yields_lines:
            # this could be important; iterating over to_lines(f) is about 8x
            # slower than iterating over f
            lines = f
        else:
            lines = to_lines(decompressed_f)

        for line in lines:
            yield line
    finally:
        try:
            if f and f is not fileobj:
                f.close()
        finally:
            if cleanup:
                cleanup()
Example #7
File: gcs.py Project: honeyflyfish/mrjob
    def _cat_file(self, gcs_uri):
        blob = self._get_blob(gcs_uri)

        if not blob:
            return  # don't cat nonexistent files

        with TemporaryFile(dir=self._local_tmp_dir) as temp:
            blob.download_to_file(temp)

            # now read from that file
            temp.seek(0)

            for chunk in decompress(temp, gcs_uri):
                yield chunk
Example #8
    def gz_test(self, dir_path_name):
        contents_gz = [
            b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n'
        ]
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content), all_contents_sorted)
Example #9
File: test_local.py Project: Affirm/mrjob
    def gz_test(self, dir_path_name):
        contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n',
                       b'qux\n', b'foo\n']
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content),
                         all_contents_sorted)
Example #10
File: job.py Project: Yelp/mrjob
    def _read_input(self):
        """Read from stdin, or from one or more files or directories.
        Yield one line at a time.

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - If path is ``-``, read from STDIN.
        - Recursively read all files in a directory.
        """
        paths = self.options.args or ['-']

        for path in paths:
            if path == '-':
                for line in self.stdin:
                    yield line
            else:
                with open(path, 'rb') as f:
                    for line in to_lines(decompress(f, path)):
                        yield line
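
The docstring mentions glob resolution and recursive directory reads; those steps happen before this method sees the paths. A generic, hedged sketch of that pre-processing using only the standard library (an illustration of the described behavior, not mrjob's code):

import os
from glob import glob


def expand_input_paths(paths):
    # expand globs and walk directories, yielding concrete file paths;
    # '-' is passed through so the caller can read stdin
    for path in paths:
        if path == '-':
            yield path
            continue
        for match in (sorted(glob(path)) or [path]):
            if os.path.isdir(match):
                for dirpath, _, filenames in os.walk(match):
                    for name in sorted(filenames):
                        yield os.path.join(dirpath, name)
            else:
                yield match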
Example #11
    def _read_input(self):
        """Read from stdin, or from one or more files or directories.
        Yield one line at a time.

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - If path is ``-``, read from STDIN.
        - Recursively read all files in a directory.
        """
        paths = self.options.args or ['-']

        for path in paths:
            if path == '-':
                for line in self.stdin:
                    yield line
            else:
                with open(path, 'rb') as f:
                    for line in to_lines(decompress(f, path)):
                        yield line
Example #12
File: hadoop.py Project: Yelp/mrjob
    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for chunk in decompress(cat_proc.stdout, filename):
            yield chunk

        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)
Example #13
File: hadoop.py Project: espenwiik91/BDEM
    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for chunk in decompress(cat_proc.stdout, filename):
            yield chunk

        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)
Example #14
    def _cat_file(self, path):
        return decompress(self._cat_blob(path), path)
Example #15
File: s3.py Project: okomestudio/mrjob
    def _cat_file(self, filename):
        # stream lines from the s3 key
        s3_key = self._get_s3_key(filename)
        body = s3_key.get()['Body']

        return decompress(body, filename)
Example #16
    def _cat_file(self, filename):
        # stream lines from the s3 key
        s3_key = self._get_s3_key(filename)
        body = s3_key.get()['Body']

        return decompress(body, filename)
Example #17
File: gcs.py Project: Affirm/mrjob
    def _cat_file(self, gcs_uri):
        return decompress(self._cat_blob(gcs_uri), gcs_uri)
Example #18
File: sim.py Project: Affirm/mrjob
    def _split_mapper_input(self, input_paths, step_num):
        """Take one or more input paths (which may be compressed) and split
        them to create the input files for the map tasks.

        Returns a list of "splits", dictionaries with the following keys:

        input: path of input for one mapper
        file: path of original file
        start, length: chunk of original file in *input*

        Compressed files will not be split (even ``.bz2`` files);
        uncompressed files will be split so as to attempt to create
        twice as many input files as there are mappers.
        """
        input_paths = list(input_paths)
        manifest = (step_num == 0 and self._uses_input_manifest())

        # determine split size
        if manifest:
            split_size = 1  # one line per mapper
        else:
            split_size = self._pick_mapper_split_size(input_paths, step_num)

        # yield output fileobjs as needed
        split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

        results = []

        for path in input_paths:
            with open(path, 'rb') as src:
                if is_compressed(path):
                    if manifest:
                        raise Exception('input manifest %s should not be'
                                        ' compressed!' % path)

                    # if file is compressed, uncompress it into a single split

                    # Hadoop tracks the compressed file's size
                    size = os.stat(path)[stat.ST_SIZE]

                    with next(split_fileobj_gen) as dest:
                        for chunk in decompress(src, path):
                            dest.write(chunk)

                    results.append(dict(
                        file=path,
                        start=0,
                        length=size,
                    ))
                else:
                    # otherwise, split into one or more input files
                    start = 0
                    length = 0

                    for lines in _split_records(src, split_size):
                        with next(split_fileobj_gen) as dest:
                            for line in lines:
                                # simulate NLinesInputFormat by prefixing
                                # each line with byte number
                                if manifest:
                                    i = start + length
                                    dest.write(('%d\t' % i).encode('ascii'))
                                dest.write(line)
                                length += len(line)

                        results.append(dict(
                            file=path,
                            start=start,
                            length=length,
                        ))

                        start += length
                        length = 0

        return results
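
The splitting loop above leans on an internal _split_records() helper that is not shown on this page. A minimal stand-in with the behavior these callers appear to expect, i.e. grouping records into batches of roughly split_size bytes (an assumption, not mrjob's actual helper):

def split_records(record_gen, split_size):
    # group byte-string records into batches of about split_size bytes;
    # every record lands in exactly one batch
    batch, batch_len = [], 0
    for record in record_gen:
        batch.append(record)
        batch_len += len(record)
        if batch_len >= split_size:
            yield batch
            batch, batch_len = [], 0
    if batch:
        yield batch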
Example #19
File: sim.py Project: gaybro8777/mrjob
    def _split_mapper_input(self, input_paths, step_num):
        """Take one or more input paths (which may be compressed) and split
        them to create the input files for the map tasks.

        Returns a list of "splits", dictionaries with the following keys:

        input: path of input for one mapper
        file: path of original file
        start, length: chunk of original file in *input*

        Compressed files will not be split (even ``.bz2`` files);
        uncompressed files will be split so as to attempt to create
        twice as many input files as there are mappers.
        """
        input_paths = list(input_paths)

        # determine split size
        split_size = self._pick_mapper_split_size(input_paths, step_num)

        # yield output fileobjs as needed
        split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

        results = []

        for path in input_paths:
            with open(path, 'rb') as src:
                if is_compressed(path):
                    # if file is compressed, uncompress it into a single split

                    # Hadoop tracks the compressed file's size
                    size = os.stat(path)[stat.ST_SIZE]

                    with next(split_fileobj_gen) as dest:
                        for chunk in decompress(src, path):
                            dest.write(chunk)

                    results.append(dict(
                        file=path,
                        start=0,
                        length=size,
                    ))
                else:
                    # otherwise, split into one or more input files
                    start = 0
                    length = 0

                    for lines in _split_records(src, split_size):
                        with next(split_fileobj_gen) as dest:
                            for line in lines:
                                dest.write(line)
                                length += len(line)

                        results.append(dict(
                            file=path,
                            start=start,
                            length=length,
                        ))

                        start += length
                        length = 0

        return results
Example #20
    def _cat_file(self, path):
        path = _from_file_uri(path)
        with open(path, 'rb') as f:
            for chunk in decompress(f, path):
                yield chunk
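
Example #20 relies on a _from_file_uri() helper to turn a file:// URI back into a local path. A hedged standard-library sketch of such a helper (an illustration; mrjob's own helper may differ):

from urllib.parse import urlparse
from urllib.request import url2pathname


def from_file_uri(uri):
    # file:///tmp/input -> /tmp/input; plain paths are returned unchanged
    parts = urlparse(uri)
    if parts.scheme == 'file':
        return url2pathname(parts.path)
    return uri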
Example #21
    def _split_mapper_input(self, input_paths, step_num):
        """Take one or more input paths (which may be compressed) and split
        them to create the input files for the map tasks.

        Returns a list of "splits", dictionaries with the following keys:

        input: path of input for one mapper
        file: path of original file
        start, length: chunk of original file in *input*

        Compressed files will not be split (even ``.bz2`` files);
        uncompressed files will be split so as to attempt to create
        twice as many input files as there are mappers.
        """
        input_paths = list(input_paths)
        manifest = (step_num == 0 and self._uses_input_manifest())

        # determine split size
        if manifest:
            split_size = 1  # one line per mapper
        else:
            split_size = self._pick_mapper_split_size(input_paths, step_num)

        # yield output fileobjs as needed
        split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

        results = []

        for path in input_paths:
            with open(path, 'rb') as src:
                if is_compressed(path):
                    if manifest:
                        raise Exception('input manifest %s should not be'
                                        ' compressed!' % path)

                    # if file is compressed, uncompress it into a single split

                    # Hadoop tracks the compressed file's size
                    size = os.stat(path)[stat.ST_SIZE]

                    with next(split_fileobj_gen) as dest:
                        for chunk in decompress(src, path):
                            dest.write(chunk)

                    results.append(dict(
                        file=path,
                        start=0,
                        length=size,
                    ))
                else:
                    # otherwise, split into one or more input files
                    start = 0
                    length = 0

                    for lines in _split_records(src, split_size):
                        with next(split_fileobj_gen) as dest:
                            for line in lines:
                                # simulate NLinesInputFormat by prefixing
                                # each line with byte number
                                if manifest:
                                    i = start + length
                                    dest.write(('%d\t' % i).encode('ascii'))
                                dest.write(line)
                                length += len(line)

                        results.append(dict(
                            file=path,
                            start=start,
                            length=length,
                        ))

                        start += length
                        length = 0

        return results
Example #22
    def _cat_file(self, gcs_uri):
        return decompress(self._cat_blob(gcs_uri), gcs_uri)
Example #23
    def _cat_file(self, filename):
        with open(filename, 'rb') as f:
            for chunk in decompress(f, filename):
                yield chunk
Example #24
File: local.py Project: Yelp/mrjob
    def _cat_file(self, filename):
        with open(filename, 'rb') as f:
            for chunk in decompress(f, filename):
                yield chunk
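
All of these _cat_file() implementations share the same contract: they yield raw, already-decompressed byte chunks, and callers that want lines wrap them in mrjob.util.to_lines(). A small hedged usage sketch against that contract (the fs object stands for any of the filesystem classes above; _cat_file() is a private method, so treat this purely as an illustration):

from mrjob.util import to_lines


def count_lines(fs, uri):
    # count lines in a (possibly compressed) file behind any filesystem
    # that implements the _cat_file() chunk contract shown above
    return sum(1 for _ in to_lines(fs._cat_file(uri)))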