def test_sort_values(self):
    # Run the sort-and-group job on the spark runner and check that each
    # key's values come back in ascending order.
    stdin_bytes = b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n'
    expected = {
        'a': ['actuary', 'alligator', 'artichoke'],
        'b': ['baby', 'balloon', 'bowling'],
    }

    job = MRSortAndGroup(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(stdin_bytes))

    with job.make_runner() as runner:
        runner.run()

        # key order is irrelevant here, so compare as a dict
        actual = dict(job.parse_output(runner.cat_output()))
        self.assertEqual(actual, expected)
def test_sort_values(self):
    # Feed the shared input through the runner under test (self.RUNNER)
    # and check that each key's values come back in ascending order.
    expected = [
        ('a', ['actuary', 'alligator', 'artichoke']),
        ('b', ['baby', 'balloon', 'bowling']),
    ]

    job = MRSortAndGroup(['-r', self.RUNNER])
    job.sandbox(stdin=BytesIO(self._INPUT))

    with job.make_runner() as runner:
        runner.run()

        # sort the (key, values) pairs so the comparison does not depend
        # on the order in which keys are emitted
        pairs = sorted(job.parse_output(runner.cat_output()))

        self.assertEqual(pairs, expected)
def test_sorting_is_case_sensitive(self):
    # 'A' and 'a' must be treated as different keys, and within the 'A'
    # group 'ABC' is expected before 'Aaron' (uppercase sorts first).
    expected = [
        ('A', ['ABC', 'Aaron']),
        ('a', ['aardvark']),
    ]

    job = MRSortAndGroup(['-r', self.RUNNER])
    job.sandbox(stdin=BytesIO(b'Aaron\naardvark\nABC\n'))

    with job.make_runner() as runner:
        runner.run()

        pairs = sorted(job.parse_output(runner.cat_output()))

        self.assertEqual(pairs, expected)
def test_custom_sort_bin_overrides_sort_values(self):
    # A user-supplied --sort-bin is used as given, so with 'sort -r' the
    # values come out in reverse order.
    #
    # this breaks SORT_VALUES; see #1699 for a fix
    stdin_bytes = b'apples\nbabies\nbuffaloes\nbears\nbicycles'
    expected = [
        ('a', ['apples']),
        ('b', ['buffaloes', 'bicycles', 'bears', 'babies']),
    ]

    job = MRSortAndGroup(['-r', 'local', '--sort-bin', 'sort -r'])
    job.sandbox(stdin=BytesIO(stdin_bytes))

    with job.make_runner() as runner:
        runner.run()

        actual = sorted(job.parse_output(runner.cat_output()))
        self.assertEqual(actual, expected)

    # sorting must have gone through the external command, not the
    # in-memory fallback
    self.assertTrue(self.check_call.called)
    self.assertFalse(self._sort_lines_in_memory.called)

    # first positional argument of the most recent check_call() is the
    # sort command line
    sort_cmd = self.check_call.call_args[0][0]
    self.assertEqual(sort_cmd[:2], ['sort', '-r'])
def test_default_sort_bin_sort_values(self):
    # With no --sort-bin given, the local runner should shell out to
    # plain 'sort' and values should come back in ascending order.
    expected = [
        ('a', ['apples']),
        ('b', ['bears', 'buffaloes']),
    ]

    job = MRSortAndGroup(['-r', 'local'])
    job.sandbox(stdin=BytesIO(b'apples\nbuffaloes\nbears'))

    with job.make_runner() as runner:
        runner.run()

        actual = sorted(job.parse_output(runner.cat_output()))
        self.assertEqual(actual, expected)

    # sorting must have gone through the external command, not the
    # in-memory fallback
    self.assertTrue(self.check_call.called)
    self.assertFalse(self._sort_lines_in_memory.called)

    sort_cmd = self.check_call.call_args[0][0]
    self.assertEqual(sort_cmd[:1], ['sort'])

    # the command must not be the key-only stable sort; presumably that
    # form would leave values unsorted -- confirm against the runner
    key_only_sort = ['sort', '-t', '\t', '-k', '1,1', '-s']
    self.assertNotEqual(sort_cmd[:6], key_only_sort)
def test_ignore_format_and_sort_kwargs(self):
    # hadoop formats and SORT_VALUES are read directly from the job,
    # so the runner's constructor ignores the corresponding kwargs
    #
    # see #2022

    # same input and expected output as test_sort_values()
    stdin_bytes = b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n'
    expected = dict(a=['actuary', 'alligator', 'artichoke'],
                    b=['baby', 'balloon', 'bowling'])

    # use the runner as a context manager so it is cleaned up even if
    # run() or the assertion fails, matching the other tests here
    with SparkMRJobRunner(
            mr_job_script=MRSortAndGroup.mr_job_script(),
            mrjob_cls=MRSortAndGroup,
            stdin=BytesIO(stdin_bytes),
            hadoop_input_format='TerribleInputFormat',
            hadoop_output_format='AwfulOutputFormat',
            sort_values=False) as runner:

        runner.run()

        # values are still sorted, so sort_values=False was ignored
        self.assertEqual(
            dict(MRSortAndGroup().parse_output(runner.cat_output())),
            expected)
def test_sorting_is_case_sensitive(self):
    # Names differing only in the case of their first letter must land
    # in separate groups ('A' vs. 'a'); within 'A', 'ABC' is expected
    # ahead of 'Aaron'.
    stdin_bytes = b'Aaron\naardvark\nABC\n'

    job = MRSortAndGroup(['-r', self.RUNNER])
    job.sandbox(stdin=BytesIO(stdin_bytes))

    with job.make_runner() as runner:
        runner.run()

        grouped = list(job.parse_output(runner.cat_output()))
        # order the (key, values) pairs so the check does not depend on
        # the order in which keys are emitted
        grouped.sort()

        self.assertEqual(
            grouped,
            [('A', ['ABC', 'Aaron']),
             ('a', ['aardvark'])])
def test_sort_values(self):
    # Run the job on self.RUNNER with the class-level input and verify
    # that the values under each key are sorted ascending.
    job = MRSortAndGroup(['-r', self.RUNNER])
    job.sandbox(stdin=BytesIO(self._INPUT))

    with job.make_runner() as runner:
        runner.run()

        grouped = list(job.parse_output(runner.cat_output()))
        # order the (key, values) pairs so the check does not depend on
        # the order in which keys are emitted
        grouped.sort()

        self.assertEqual(
            grouped,
            [('a', ['actuary', 'alligator', 'artichoke']),
             ('b', ['baby', 'balloon', 'bowling'])])
def test_default_sort_bin_sort_values(self):
    # Without --sort-bin, the local runner is expected to call the
    # external 'sort' command and still return values in ascending order.
    stdin_bytes = b'apples\nbuffaloes\nbears'

    job = MRSortAndGroup(['-r', 'local'])
    job.sandbox(stdin=BytesIO(stdin_bytes))

    with job.make_runner() as runner:
        runner.run()

        grouped = sorted(job.parse_output(runner.cat_output()))
        self.assertEqual(
            grouped,
            [('a', ['apples']),
             ('b', ['bears', 'buffaloes'])])

    # the external sort was used; the in-memory fallback was not
    self.assertTrue(self.check_call.called)
    self.assertFalse(self._sort_lines_in_memory.called)

    # command line passed to the most recent check_call()
    argv = self.check_call.call_args[0][0]

    self.assertEqual(argv[:1], ['sort'])
    # must differ from the key-only stable sort invocation
    # NOTE(review): presumably because that form would not order the
    # values -- confirm against the runner implementation
    self.assertNotEqual(
        argv[:6], ['sort', '-t', '\t', '-k', '1,1', '-s'])
def test_custom_sort_bin_overrides_sort_values(self):
    # Passing '--sort-bin "sort -r"' makes the runner use that command,
    # so the 'b' values are expected in reverse order.
    #
    # this breaks SORT_VALUES; see #1699 for a fix
    runner_args = ['-r', 'local', '--sort-bin', 'sort -r']

    job = MRSortAndGroup(runner_args)
    job.sandbox(
        stdin=BytesIO(b'apples\nbabies\nbuffaloes\nbears\nbicycles'))

    with job.make_runner() as runner:
        runner.run()

        grouped = sorted(job.parse_output(runner.cat_output()))
        self.assertEqual(
            grouped,
            [('a', ['apples']),
             ('b', ['buffaloes', 'bicycles', 'bears', 'babies'])])

    # the external sort was used; the in-memory fallback was not
    self.assertTrue(self.check_call.called)
    self.assertFalse(self._sort_lines_in_memory.called)

    # command line passed to the most recent check_call()
    argv = self.check_call.call_args[0][0]
    self.assertEqual(argv[:2], ['sort', '-r'])