示例#1
0
    def test_sort_values(self):
        # Feed six unsorted words through the Spark runner and expect each
        # first letter to map to its words in ascending order.
        raw_words = b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n'
        expected = {
            'a': ['actuary', 'alligator', 'artichoke'],
            'b': ['baby', 'balloon', 'bowling'],
        }

        mr_job = MRSortAndGroup(['-r', 'spark'])
        mr_job.sandbox(stdin=BytesIO(raw_words))

        with mr_job.make_runner() as spark_runner:
            spark_runner.run()

            # read the output while the runner is still open
            grouped = dict(mr_job.parse_output(spark_runner.cat_output()))
            self.assertEqual(grouped, expected)
示例#2
0
    def test_sort_values(self):
        # Run the job on self._INPUT with the runner named by self.RUNNER.
        # Pairs are sorted before comparing, so key order is not checked,
        # but each key's list of values must match exactly.
        expected_pairs = [
            ('a', ['actuary', 'alligator', 'artichoke']),
            ('b', ['baby', 'balloon', 'bowling']),
        ]

        mr_job = MRSortAndGroup(['-r', self.RUNNER])
        mr_job.sandbox(stdin=BytesIO(self._INPUT))

        with mr_job.make_runner() as active_runner:
            active_runner.run()
            pairs = sorted(mr_job.parse_output(active_runner.cat_output()))

            self.assertEqual(pairs, expected_pairs)
示例#3
0
    def test_sorting_is_case_sensitive(self):
        # 'ABC' and 'Aaron' are expected under 'A' (in that order), while
        # 'aardvark' is expected under a separate lowercase 'a' key.
        expected_pairs = [
            ('A', ['ABC', 'Aaron']),
            ('a', ['aardvark']),
        ]

        mr_job = MRSortAndGroup(['-r', self.RUNNER])
        mr_job.sandbox(stdin=BytesIO(b'Aaron\naardvark\nABC\n'))

        with mr_job.make_runner() as active_runner:
            active_runner.run()

            # only the order of the (key, values) pairs is normalized here;
            # the order inside each value list is what is under test
            pairs = list(mr_job.parse_output(active_runner.cat_output()))
            pairs.sort()

            self.assertEqual(pairs, expected_pairs)
示例#4
0
    def test_custom_sort_bin_overrides_sort_values(self):
        # this breaks SORT_VALUES; see #1699 for a fix
        raw_lines = b'apples\nbabies\nbuffaloes\nbears\nbicycles'
        # with 'sort -r' the 'b' values are expected in descending order
        expected_pairs = [
            ('a', ['apples']),
            ('b', ['buffaloes', 'bicycles', 'bears', 'babies']),
        ]

        mr_job = MRSortAndGroup(['-r', 'local', '--sort-bin', 'sort -r'])
        mr_job.sandbox(stdin=BytesIO(raw_lines))

        with mr_job.make_runner() as local_runner:
            local_runner.run()
            pairs = sorted(mr_job.parse_output(local_runner.cat_output()))

            self.assertEqual(pairs, expected_pairs)

        # check_call must have been invoked and _sort_lines_in_memory must
        # not (both presumably mocks installed by the test fixture)
        self.assertTrue(self.check_call.called)
        self.assertFalse(self._sort_lines_in_memory.called)

        # first positional argument of the most recent check_call() call
        positional_args = self.check_call.call_args[0]
        sort_cmd = positional_args[0]

        self.assertEqual(sort_cmd[:2], ['sort', '-r'])
示例#5
0
    def test_default_sort_bin_sort_values(self):
        # With no --sort-bin given, the local runner is expected to hand
        # back each key's values in ascending order.
        raw_lines = b'apples\nbuffaloes\nbears'
        expected_pairs = [
            ('a', ['apples']),
            ('b', ['bears', 'buffaloes']),
        ]

        mr_job = MRSortAndGroup(['-r', 'local'])
        mr_job.sandbox(stdin=BytesIO(raw_lines))

        with mr_job.make_runner() as local_runner:
            local_runner.run()
            pairs = sorted(mr_job.parse_output(local_runner.cat_output()))

            self.assertEqual(pairs, expected_pairs)

        # check_call must have been invoked and _sort_lines_in_memory must
        # not (both presumably mocks installed by the test fixture)
        self.assertTrue(self.check_call.called)
        self.assertFalse(self._sort_lines_in_memory.called)

        # first positional argument of the most recent check_call() call
        positional_args = self.check_call.call_args[0]
        sort_cmd = positional_args[0]

        # the command starts with 'sort' but must not be the variant that
        # restricts sorting to the first tab-separated field
        key_only_prefix = ['sort', '-t', '\t', '-k', '1,1', '-s']
        self.assertEqual(sort_cmd[:1], ['sort'])
        self.assertNotEqual(sort_cmd[:6], key_only_prefix)
示例#6
0
    def test_ignore_format_and_sort_kwargs(self):
        # hadoop formats and SORT_VALUES are read directly from the job,
        # so the runner's constructor ignores the corresponding kwargs
        #
        # see #2022

        # same set up as test_sort_values(), above
        runner = SparkMRJobRunner(
            mr_job_script=MRSortAndGroup.mr_job_script(),
            mrjob_cls=MRSortAndGroup,
            stdin=BytesIO(
                b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n'),
            hadoop_input_format='TerribleInputFormat',
            hadoop_output_format='AwfulOutputFormat',
            sort_values=False)

        # use the runner as a context manager, like the other tests in this
        # file, so it is cleaned up even if run() or the assertion fails
        with runner:
            runner.run()

            # values still come back sorted, i.e. sort_values=False (and the
            # bogus format names) passed to the constructor had no effect
            self.assertEqual(
                dict(MRSortAndGroup().parse_output(runner.cat_output())),
                dict(a=['actuary', 'alligator', 'artichoke'],
                     b=['baby', 'balloon', 'bowling']))
示例#7
0
    def test_sorting_is_case_sensitive(self):
        # Upper- and lowercase first letters are expected to land under
        # different keys, with 'ABC' ordered before 'Aaron' under 'A'.
        mixed_case_input = b'Aaron\naardvark\nABC\n'
        wanted = [('A', ['ABC', 'Aaron']), ('a', ['aardvark'])]

        sort_job = MRSortAndGroup(['-r', self.RUNNER])
        sort_job.sandbox(stdin=BytesIO(mixed_case_input))

        with sort_job.make_runner() as job_runner:
            job_runner.run()

            # sorting the pairs only normalizes the order of the keys
            results = sorted(sort_job.parse_output(job_runner.cat_output()))
            self.assertEqual(results, wanted)
示例#8
0
    def test_sort_values(self):
        # Run the job over self._INPUT using the runner named by
        # self.RUNNER and expect each key's values in ascending order.
        wanted = [
            ('a', ['actuary', 'alligator', 'artichoke']),
            ('b', ['baby', 'balloon', 'bowling']),
        ]

        sort_job = MRSortAndGroup(['-r', self.RUNNER])
        sort_job.sandbox(stdin=BytesIO(self._INPUT))

        with sort_job.make_runner() as job_runner:
            job_runner.run()

            # sorting the pairs only normalizes the order of the keys
            results = list(sort_job.parse_output(job_runner.cat_output()))
            results.sort()

            self.assertEqual(results, wanted)
示例#9
0
    def test_sort_values(self):
        # Spark runner: the unsorted input words are expected back keyed
        # by first letter, with each key's words in ascending order.
        unsorted_words = (
            b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n')
        wanted = {
            'a': ['actuary', 'alligator', 'artichoke'],
            'b': ['baby', 'balloon', 'bowling'],
        }

        sort_job = MRSortAndGroup(['-r', 'spark'])
        sort_job.sandbox(stdin=BytesIO(unsorted_words))

        with sort_job.make_runner() as job_runner:
            job_runner.run()

            parsed = sort_job.parse_output(job_runner.cat_output())
            self.assertEqual(dict(parsed), wanted)
示例#10
0
    def test_default_sort_bin_sort_values(self):
        # Local runner with the default sort binary: each key's values
        # are expected in ascending order.
        wanted = [('a', ['apples']), ('b', ['bears', 'buffaloes'])]

        sort_job = MRSortAndGroup(['-r', 'local'])
        sort_job.sandbox(stdin=BytesIO(b'apples\nbuffaloes\nbears'))

        with sort_job.make_runner() as job_runner:
            job_runner.run()

            results = sorted(sort_job.parse_output(job_runner.cat_output()))
            self.assertEqual(results, wanted)

        # check_call must have been invoked and _sort_lines_in_memory must
        # not (both presumably mocks set up by the test fixture)
        self.assertTrue(self.check_call.called)
        self.assertFalse(self._sort_lines_in_memory.called)

        # first positional argument of the most recent check_call() call
        command = self.check_call.call_args[0][0]

        # it must be a 'sort' command, but not the variant that only
        # compares the first tab-separated field
        self.assertEqual(command[:1], ['sort'])
        self.assertNotEqual(
            command[:6], ['sort', '-t', '\t', '-k', '1,1', '-s'])
示例#11
0
    def test_custom_sort_bin_overrides_sort_values(self):
        # this breaks SORT_VALUES; see #1699 for a fix
        job_args = ['-r', 'local', '--sort-bin', 'sort -r']
        # with 'sort -r' the 'b' values are expected in descending order
        wanted = [
            ('a', ['apples']),
            ('b', ['buffaloes', 'bicycles', 'bears', 'babies']),
        ]

        sort_job = MRSortAndGroup(job_args)
        sort_job.sandbox(
            stdin=BytesIO(b'apples\nbabies\nbuffaloes\nbears\nbicycles'))

        with sort_job.make_runner() as job_runner:
            job_runner.run()

            results = sorted(sort_job.parse_output(job_runner.cat_output()))
            self.assertEqual(results, wanted)

        # check_call must have been invoked and _sort_lines_in_memory must
        # not (both presumably mocks set up by the test fixture)
        self.assertTrue(self.check_call.called)
        self.assertFalse(self._sort_lines_in_memory.called)

        # first positional argument of the most recent check_call() call
        command = self.check_call.call_args[0][0]

        self.assertEqual(command[:2], ['sort', '-r'])