def line_group_generator(input_path):
    """Yield groups of lines read from *input_path*.

    Relies on ``keep_sorted`` from the surrounding scope. When it is
    truthy, consecutive lines that share a key (the text before the
    first tab) are yielded together as one group; otherwise each line
    is yielded on its own as a one-element tuple.
    """
    if not keep_sorted:
        # Ungrouped mode: every line forms its own group.
        for single_line in read_input(input_path):
            yield (single_line,)
        return

    # Input is assumed to be a collection of key <tab> value pairs, so
    # the key is whatever precedes the first tab character.
    def reducer_key(record):
        fields = record.split('\t')
        return fields[0]

    grouped = itertools.groupby(read_input(input_path), key=reducer_key)
    for _key, same_key_lines in grouped:
        yield same_key_lines
def _read_input(self):
    """Yield input lines, one at a time, from every path in ``self.args``.

    Falls back to the single path ``-`` when ``self.args`` is falsy
    (e.g. empty). Each path is handed to ``read_input()``, which is
    documented to:

    - resolve globs (``foo_*.gz``),
    - decompress ``.gz`` and ``.bz2`` files,
    - read from STDIN (``self.stdin``) when the path is ``-``,
    - recursively read all files in a directory.
    """
    for source in (self.args or ['-']):
        for text_line in read_input(source, stdin=self.stdin):
            yield text_line
def test_stdin(self):
    """Reading ``-`` from a BytesIO stdin should yield BEAVER_DATA once."""
    fake_stdin = BytesIO(self.BEAVER_DATA)
    observed = list(read_input('-', stdin=fake_stdin))
    self.assertEqual(observed, [self.BEAVER_DATA])
def test_glob(self):
    """The ``beavers.*`` glob in tmpdir should yield BEAVER_DATA three times."""
    pattern = os.path.join(self.tmpdir, 'beavers.*')
    observed = list(read_input(pattern))
    assert_equal(observed, [self.BEAVER_DATA] * 3)
def test_stdin(self):
    """Reading ``-`` from a StringIO stdin should yield BEAVER_DATA once."""
    fake_stdin = StringIO(self.BEAVER_DATA)
    observed = list(read_input('-', stdin=fake_stdin))
    assert_equal(observed, [self.BEAVER_DATA])
def test_bz2_file(self):
    """Reading ``beavers.bz2`` from tmpdir should yield BEAVER_DATA once."""
    bz2_path = os.path.join(self.tmpdir, 'beavers.bz2')
    observed = list(read_input(bz2_path))
    assert_equal(observed, [self.BEAVER_DATA])
def test_dir_recursion(self):
    """Reading tmpdir itself should yield BEAVER_DATA four times."""
    observed = list(read_input(self.tmpdir))
    expected = [self.BEAVER_DATA] * 4
    self.assertEqual(observed, expected)
def test_bad_glob(self):
    """Consuming read_input() for the ``lions*`` glob should raise IOError."""
    bad_pattern = os.path.join(self.tmpdir, 'lions*')
    # read_input is a generator, so no error surfaces when it is
    # created; it only appears once list() starts reading from it.
    pending_lines = read_input(bad_pattern)
    self.assertRaises(IOError, list, pending_lines)
def test_glob_including_dir(self):
    """The ``beavers*`` glob in tmpdir should yield BEAVER_DATA four times."""
    pattern = os.path.join(self.tmpdir, 'beavers*')
    observed = list(read_input(pattern))
    self.assertEqual(observed, [self.BEAVER_DATA] * 4)
def test_stdin_can_be_iterator(self):
    """A plain list passed as stdin should be yielded back item by item."""
    stdin_items = [self.BEAVER_DATA] * 5
    observed = list(read_input('-', stdin=stdin_items))
    self.assertEqual(observed, [self.BEAVER_DATA] * 5)
def test_dir(self):
    """Reading the ``beavers/`` directory in tmpdir should yield BEAVER_DATA once."""
    dir_path = os.path.join(self.tmpdir, 'beavers/')
    observed = list(read_input(dir_path))
    self.assertEqual(observed, [self.BEAVER_DATA])
def test_bz2_file(self):
    """Reading ``beavers.bz2`` from tmpdir should yield BEAVER_DATA once."""
    bz2_path = os.path.join(self.tmpdir, "beavers.bz2")
    observed = list(read_input(bz2_path))
    self.assertEqual(observed, [self.BEAVER_DATA])
def test_stdin(self):
    """Reading ``-`` from a StringIO stdin should yield BEAVER_DATA once."""
    fake_stdin = StringIO(self.BEAVER_DATA)
    observed = list(read_input("-", stdin=fake_stdin))
    self.assertEqual(observed, [self.BEAVER_DATA])
def test_glob(self):
    """The ``beavers.*`` glob in tmpdir should yield BEAVER_DATA three times."""
    pattern = os.path.join(self.tmpdir, "beavers.*")
    observed = list(read_input(pattern))
    self.assertEqual(observed, [self.BEAVER_DATA] * 3)