def line_group_generator(input_path):
    # Yield groups of lines read from input_path.
    #
    # When keep_sorted is truthy, consecutive lines that share a key are
    # yielded together as one group; otherwise every line is yielded as
    # its own one-element tuple.
    if not keep_sorted:
        for single_line in read_text_input(input_path):
            yield (single_line, )
        return

    # Input is treated as key <tab> value pairs: the key is everything
    # before the first tab (the whole line when there is no tab).
    def key_of(raw_line):
        return raw_line.partition(b'\t')[0]

    grouped = itertools.groupby(read_text_input(input_path), key=key_of)
    for _key, same_key_lines in grouped:
        # each group is groupby's lazy sub-iterator, not a list
        yield same_key_lines
def test_bad_glob(self):
    # The reader is a generator, so calling it is not expected to fail;
    # the IOError should only surface once list() starts consuming it.
    missing_glob = os.path.join(self.tmpdir, 'lions*')
    lazy_lines = read_text_input(missing_glob)
    self.assertRaises(IOError, list, lazy_lines)
def test_glob_including_dir(self):
    # Reading everything that matches 'beavers*' under tmpdir is expected
    # to produce four copies of BEAVER_DATA.
    pattern = os.path.join(self.tmpdir, 'beavers*')
    actual = list(read_text_input(pattern))
    self.assertEqual(actual, [self.BEAVER_DATA] * 4)
def test_dir_recursion(self):
    # Passing the fixture directory itself is expected to produce four
    # copies of BEAVER_DATA.
    actual = list(read_text_input(self.tmpdir))
    self.assertEqual(actual, [self.BEAVER_DATA] * 4)
def test_bz2_file(self):
    # Reading the 'beavers.bz2' fixture is expected to give back exactly
    # one BEAVER_DATA line.
    bz2_path = os.path.join(self.tmpdir, 'beavers.bz2')
    actual = list(read_text_input(bz2_path))
    self.assertEqual(actual, [self.BEAVER_DATA])
def test_stdin_can_be_iterator(self):
    # With path '-', input comes from the stdin argument; here that is a
    # plain list of five lines rather than a file object.
    fake_stdin = [self.BEAVER_DATA] * 5
    actual = list(read_text_input('-', stdin=fake_stdin))
    # expected list is built separately from the one handed to the reader
    self.assertEqual(actual, [self.BEAVER_DATA] * 5)
def test_stdin(self):
    # With path '-', input comes from the stdin argument; here that is an
    # in-memory bytes stream holding a single BEAVER_DATA line.
    fake_stdin = BytesIO(self.BEAVER_DATA)
    actual = list(read_text_input('-', stdin=fake_stdin))
    self.assertEqual(actual, [self.BEAVER_DATA])