Пример #1
0
        def line_group_generator(input_path):
            """Yield groups of lines read from *input_path*.

            When the enclosing ``keep_sorted`` flag is true, consecutive
            lines sharing the same key (the text before the first tab) are
            yielded together as one ``itertools.groupby`` group.  Otherwise
            each line is yielded on its own as a 1-tuple.
            """
            all_lines = read_input(input_path)

            if not keep_sorted:
                # one line per group
                for single_line in all_lines:
                    yield (single_line,)
                return

            # input is assumed to be a collection of key <tab> value pairs;
            # the key is everything that precedes the first tab
            def key_of(line):
                return line.partition('\t')[0]

            for _key, same_key_lines in itertools.groupby(
                    all_lines, key=key_of):
                yield same_key_lines
Пример #2
0
Файл: job.py Проект: mtai/mrjob
    def _read_input(self):
        """Read from stdin, or one or more files, or directories.
        Yield one line at a time.

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - If path is ``-``, read from STDIN.
        - Recursively read all files in a directory
        """
        input_paths = self.args
        if not input_paths:
            # no paths given: fall back to reading stdin
            input_paths = ['-']

        for input_path in input_paths:
            for input_line in read_input(input_path, stdin=self.stdin):
                yield input_line
Пример #3
0
    def _read_input(self):
        """Read from stdin, or one or more files, or directories.
        Yield one line at a time.

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - If path is ``-``, read from STDIN.
        - Recursively read all files in a directory
        """
        sources = self.args
        if not sources:
            # nothing on the command line, so use the stdin marker
            sources = ['-']

        for source in sources:
            for text_line in read_input(source, stdin=self.stdin):
                yield text_line
Пример #4
0
 def test_stdin(self):
     # Path '-' with an in-memory bytes stream passed as stdin.
     fake_stdin = BytesIO(self.BEAVER_DATA)
     actual = list(read_input('-', stdin=fake_stdin))
     self.assertEqual(actual, [self.BEAVER_DATA])
Пример #5
0
 def test_glob(self):
     # The 'beavers.*' pattern should produce BEAVER_DATA three times.
     pattern = os.path.join(self.tmpdir, 'beavers.*')
     found = list(read_input(pattern))
     assert_equal(found, [self.BEAVER_DATA] * 3)
Пример #6
0
 def test_stdin(self):
     # Path '-' with an in-memory text stream passed as stdin.
     stream = StringIO(self.BEAVER_DATA)
     got = list(read_input('-', stdin=stream))
     assert_equal(got, [self.BEAVER_DATA])
Пример #7
0
 def test_bz2_file(self):
     # Reading the .bz2 fixture should give back BEAVER_DATA once.
     bz2_path = os.path.join(self.tmpdir, 'beavers.bz2')
     contents = list(read_input(bz2_path))
     assert_equal(contents, [self.BEAVER_DATA])
Пример #8
0
 def test_dir_recursion(self):
     # Reading the whole tmpdir should produce BEAVER_DATA four times.
     everything = list(read_input(self.tmpdir))
     self.assertEqual(everything, [self.BEAVER_DATA] * 4)
Пример #9
0
 def test_bad_glob(self):
     # read_input is a generator, so the error only surfaces once
     # something (here, list()) starts consuming it
     unmatched = read_input(os.path.join(self.tmpdir, 'lions*'))
     self.assertRaises(IOError, list, unmatched)
Пример #10
0
 def test_dir_recursion(self):
     # Passing the tmpdir itself should yield BEAVER_DATA four times.
     collected = list(read_input(self.tmpdir))
     self.assertEqual(collected, [self.BEAVER_DATA] * 4)
Пример #11
0
 def test_glob_including_dir(self):
     # The 'beavers*' pattern should produce BEAVER_DATA four times.
     glob_path = os.path.join(self.tmpdir, 'beavers*')
     matched = list(read_input(glob_path))
     self.assertEqual(matched, [self.BEAVER_DATA] * 4)
Пример #12
0
 def test_stdin_can_be_iterator(self):
     # stdin is given as a plain list of lines rather than a file object.
     stdin_lines = [self.BEAVER_DATA] * 5
     produced = list(read_input('-', stdin=stdin_lines))
     self.assertEqual(produced, [self.BEAVER_DATA] * 5)
Пример #13
0
 def test_dir(self):
     # Reading the 'beavers/' directory should give BEAVER_DATA once.
     dir_path = os.path.join(self.tmpdir, 'beavers/')
     dir_lines = list(read_input(dir_path))
     self.assertEqual(dir_lines, [self.BEAVER_DATA])
Пример #14
0
 def test_stdin(self):
     # Feed BEAVER_DATA through a bytes buffer acting as stdin.
     stdin_buffer = BytesIO(self.BEAVER_DATA)
     result = list(read_input('-', stdin=stdin_buffer))
     self.assertEqual(result, [self.BEAVER_DATA])
Пример #15
0
 def test_glob(self):
     # Expect three copies of BEAVER_DATA from the 'beavers.*' pattern.
     glob_pattern = os.path.join(self.tmpdir, 'beavers.*')
     results = list(read_input(glob_pattern))
     assert_equal(results, [self.BEAVER_DATA] * 3)
Пример #16
0
 def test_bz2_file(self):
     # The .bz2 fixture should read back as a single BEAVER_DATA entry.
     compressed = os.path.join(self.tmpdir, 'beavers.bz2')
     read_back = list(read_input(compressed))
     assert_equal(read_back, [self.BEAVER_DATA])
Пример #17
0
 def test_stdin_can_be_iterator(self):
     # A list of five entries stands in for stdin here.
     fake_stdin = [self.BEAVER_DATA] * 5
     output = list(read_input('-', stdin=fake_stdin))
     self.assertEqual(output, [self.BEAVER_DATA] * 5)
Пример #18
0
 def test_bad_glob(self):
     # Because read_input is a generator, no error is raised at call
     # time; it appears only when list() tries to read from it.
     no_match_pattern = os.path.join(self.tmpdir, 'lions*')
     pending_lines = read_input(no_match_pattern)
     self.assertRaises(IOError, list, pending_lines)
Пример #19
0
 def test_dir(self):
     # A directory path with a trailing slash should yield BEAVER_DATA once.
     beavers_dir = os.path.join(self.tmpdir, 'beavers/')
     seen = list(read_input(beavers_dir))
     self.assertEqual(seen, [self.BEAVER_DATA])
Пример #20
0
 def test_bz2_file(self):
     # Read the bz2 fixture and expect exactly one BEAVER_DATA entry.
     archive = os.path.join(self.tmpdir, "beavers.bz2")
     decoded = list(read_input(archive))
     self.assertEqual(decoded, [self.BEAVER_DATA])
Пример #21
0
 def test_glob_including_dir(self):
     # Expect four copies of BEAVER_DATA from the 'beavers*' pattern.
     wide_pattern = os.path.join(self.tmpdir, 'beavers*')
     gathered = list(read_input(wide_pattern))
     self.assertEqual(gathered, [self.BEAVER_DATA] * 4)
Пример #22
0
 def test_stdin(self):
     # Use a StringIO wrapping BEAVER_DATA in place of real stdin.
     text_stdin = StringIO(self.BEAVER_DATA)
     yielded = list(read_input('-', stdin=text_stdin))
     assert_equal(yielded, [self.BEAVER_DATA])
Пример #23
0
 def test_stdin(self):
     # Supply BEAVER_DATA via StringIO as stdin and read path "-".
     stdin_stub = StringIO(self.BEAVER_DATA)
     lines_read = list(read_input("-", stdin=stdin_stub))
     self.assertEqual(lines_read, [self.BEAVER_DATA])
Пример #24
0
 def test_glob(self):
     # The "beavers.*" pattern should give three BEAVER_DATA entries.
     dotted_pattern = os.path.join(self.tmpdir, "beavers.*")
     globbed = list(read_input(dotted_pattern))
     self.assertEqual(globbed, [self.BEAVER_DATA] * 3)