Пример #1
0
    def test_hadoop_extra_args_comes_first(self):
        # Build the command line in labelled pieces; the --hadoop-arg values
        # are the ones expected to lead the hadoop args for the step.
        env_switches = ['--cmdenv', 'FOO=bar']
        passthrough_switches = ['--hadoop-arg', '-libjar',
                                '--hadoop-arg', 'qux.jar']
        jobconf_switches = ['--jobconf', 'baz=qux']
        partitioner_switches = ['--partitioner', 'java.lang.Object']

        job = MRWordCount(env_switches + passthrough_switches +
                          jobconf_switches + partitioner_switches)

        # input/output formats are set as attributes on the job itself
        job.HADOOP_INPUT_FORMAT = 'FooInputFormat'
        job.HADOOP_OUTPUT_FORMAT = 'BarOutputFormat'

        with job.make_runner() as runner:
            step_args = runner._hadoop_args_for_step(0)

            # the pass-through args occupy the first two slots ...
            leading = step_args[:2]
            self.assertEqual(leading, ['-libjar', 'qux.jar'])

            # ... and the step gets twelve hadoop args in total
            total = len(step_args)
            self.assertEqual(total, 12)
Пример #2
0
    def test_hadoop_extra_args_comes_first(self):
        # One switch/value pair per line so the pass-through hadoop args
        # are easy to spot among the other options.
        argv = [
            '--cmdenv', 'FOO=bar',
            '--hadoop-arg', '-libjar',
            '--hadoop-arg', 'qux.jar',
            '--jobconf', 'baz=qux',
            '--partitioner', 'java.lang.Object',
        ]
        job = MRWordCount(argv)
        job.HADOOP_INPUT_FORMAT = 'FooInputFormat'
        job.HADOOP_OUTPUT_FORMAT = 'BarOutputFormat'

        # values given through --hadoop-arg, in the order they were passed
        expected_prefix = ['-libjar', 'qux.jar']
        expected_count = 12

        with job.make_runner() as runner:
            args_for_first_step = runner._hadoop_args_for_step(0)
            self.assertEqual(args_for_first_step[:2], expected_prefix)
            self.assertEqual(len(args_for_first_step), expected_count)
Пример #3
0
    def test_hadoop_output_format(self):
        output_format = "org.apache.hadoop.mapred.SequenceFileOutputFormat"
        # hadoop args expected on a step that carries the output format
        with_output_format = ["-outputformat", output_format]

        # --- single-step job ---
        # The format is assigned on the job object: it is part of the job's
        # semantics, so there is no command-line argument for it.
        single_step_job = MRWordCount()
        single_step_job.HADOOP_OUTPUT_FORMAT = output_format
        with single_step_job.make_runner() as single_runner:
            only_step_args = single_runner._hadoop_args_for_step(0)
            self.assertEqual(only_step_args, with_output_format)

        # --- two-step job ---
        # -outputformat should appear on the last step only.
        two_step_job = MRTwoStepJob()
        two_step_job.HADOOP_OUTPUT_FORMAT = output_format
        with two_step_job.make_runner() as two_step_runner:
            first_step_args = two_step_runner._hadoop_args_for_step(0)
            self.assertEqual(first_step_args, [])

            last_step_args = two_step_runner._hadoop_args_for_step(1)
            self.assertEqual(last_step_args, with_output_format)
Пример #4
0
    def test_hadoop_output_format(self):
        output_format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
        output_format_args = ['-outputformat', output_format]

        # One-step job. HADOOP_OUTPUT_FORMAT is set directly on the job
        # (no cmd-line argument exists for it; it's part of job semantics).
        one_step = MRWordCount()
        one_step.HADOOP_OUTPUT_FORMAT = output_format
        with one_step.make_runner() as runner:
            self.assertEqual(runner._hadoop_args_for_step(0),
                             output_format_args)

        # Multi-step job: expected hadoop args listed per step, in step
        # order. Only the final step should carry -outputformat.
        multi_step = MRTwoStepJob()
        multi_step.HADOOP_OUTPUT_FORMAT = output_format
        expected_by_step = [[], output_format_args]
        with multi_step.make_runner() as runner:
            for step_num, expected in enumerate(expected_by_step):
                self.assertEqual(runner._hadoop_args_for_step(step_num),
                                 expected)
Пример #5
0
    def test_hadoop_extra_args_comes_first(self):
        # (switch, value) pairs, flattened below into the argv list that
        # the job is constructed with.
        switch_pairs = (
            ("--cmdenv", "FOO=bar"),
            ("--hadoop-arg", "-libjar"),
            ("--hadoop-arg", "qux.jar"),
            ("--jobconf", "baz=qux"),
            ("--partitioner", "java.lang.Object"),
        )
        argv = [token for pair in switch_pairs for token in pair]

        job = MRWordCount(argv)
        job.HADOOP_INPUT_FORMAT = "FooInputFormat"
        job.HADOOP_OUTPUT_FORMAT = "BarOutputFormat"

        with job.make_runner() as runner:
            args = runner._hadoop_args_for_step(0)

            # args supplied through --hadoop-arg must lead the list
            first_two = args[:2]
            self.assertEqual(first_two, ["-libjar", "qux.jar"])

            # overall number of hadoop args produced for the step
            self.assertEqual(len(args), 12)