Example #1
    def _hadoop_streaming_commands(self, step_num):
        version = self.get_hadoop_version()

        # Hadoop streaming stuff
        mapper, bash_wrap_mapper = self._render_substep(
            step_num, 'mapper')

        combiner, bash_wrap_combiner = self._render_substep(
            step_num, 'combiner')

        reducer, bash_wrap_reducer = self._render_substep(
            step_num, 'reducer')

        if (combiner is not None and
            not supports_combiners_in_hadoop_streaming(version)):

            # krazy hack to support combiners on hadoop <0.20
            bash_wrap_mapper = True
            mapper = "%s | sort | %s" % (mapper, combiner)

            # take the combiner away, hadoop will just be confused
            combiner = None
            bash_wrap_combiner = False

        if bash_wrap_mapper:
            mapper = bash_wrap(mapper)

        if bash_wrap_combiner:
            combiner = bash_wrap(combiner)

        if bash_wrap_reducer:
            reducer = bash_wrap(reducer)

        return mapper, combiner, reducer
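
Every example on this page calls mrjob's bash_wrap() helper. For reference, a minimal sketch of that helper, assuming the single-quote escaping implied by the pre_filter tests further down (Examples #8 and #10-#13), might look like this:

def bash_wrap(cmd_str):
    """Wrap a shell command line in bash -c '...', escaping single quotes
    so that pipes, redirects, etc. survive Hadoop streaming.

    Sketch only: the real helper lives in mrjob.util; this version is
    inferred from the escaping the tests below expect.
    """
    return "bash -c '%s'" % cmd_str.replace("'", "'\\''")

With that definition, bash_wrap('wc -l | tr -Cd "[:digit:]"') yields bash -c 'wc -l | tr -Cd "[:digit:]"', which is the reducer command the tests in Examples #5, #14 and #15 pass through to the runner.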
Example #2
 def steps(self):
     steps = []
     for step in self.options.steps:
         step_kwargs = {}
         if 'mapper' in step:
             step_kwargs['mapper_cmd'] = bash_wrap(step['mapper'])
         if 'combiner' in step:
             step_kwargs['combiner_cmd'] = bash_wrap(step['combiner'])
         if 'reducer' in step:
             step_kwargs['reducer_cmd'] = bash_wrap(step['reducer'])
         steps.append(self.mr(**step_kwargs))
     return steps
Example #3
File: mr_cmd.py Project: hophacker/mrjob
 def steps(self):
     steps = []
     for step in self.options.steps:
         step_kwargs = {}
         if "mapper" in step:
             step_kwargs["mapper_cmd"] = bash_wrap(step["mapper"])
         if "combiner" in step:
             step_kwargs["combiner_cmd"] = bash_wrap(step["combiner"])
         if "reducer" in step:
             step_kwargs["reducer_cmd"] = bash_wrap(step["reducer"])
         steps.append(MRStep(**step_kwargs))
     return steps
Example #4
File: mr_cmd.py Project: Anihc/mrjob
 def steps(self):
     steps = []
     for step in self.options.steps:
         step_kwargs = {}
         if 'mapper' in step:
             step_kwargs['mapper_cmd'] = bash_wrap(step['mapper'])
         if 'combiner' in step:
             step_kwargs['combiner_cmd'] = bash_wrap(step['combiner'])
         if 'reducer' in step:
             step_kwargs['reducer_cmd'] = bash_wrap(step['reducer'])
         steps.append(self.mr(**step_kwargs))
     return steps
Example #5
    def test_multiple(self):
        data = b'x\nx\nx\nx\nx\nx\n'
        mapper_cmd = 'cat -e'
        reducer_cmd = bash_wrap('wc -l | tr -Cd "[:digit:]"')
        job = CmdJob([
            '--runner', 'local', '--mapper-cmd', mapper_cmd, '--combiner-cmd',
            'uniq', '--reducer-cmd', reducer_cmd
        ])
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(r._get_steps(), [{
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': mapper_cmd
                },
                'combiner': {
                    'type': 'command',
                    'command': 'uniq'
                },
                'reducer': {
                    'type': 'command',
                    'command': reducer_cmd
                },
            }])

            r.run()

            self.assertEqual(list(r.stream_output()), [b'2'])
Example #6
File: runner.py Project: irskep/mrjob
    def _hadoop_streaming_commands(self, step_num):
        # Hadoop streaming stuff
        mapper, bash_wrap_mapper = self._render_substep(step_num, "mapper")

        combiner, bash_wrap_combiner = self._render_substep(step_num, "combiner")

        reducer, bash_wrap_reducer = self._render_substep(step_num, "reducer")

        if bash_wrap_mapper:
            mapper = bash_wrap(mapper)

        if bash_wrap_combiner:
            combiner = bash_wrap(combiner)

        if bash_wrap_reducer:
            reducer = bash_wrap(reducer)

        return mapper, combiner, reducer
Example #7
    def _hadoop_streaming_commands(self, step_num):
        # Hadoop streaming stuff
        mapper, bash_wrap_mapper = self._render_substep(step_num, 'mapper')

        combiner, bash_wrap_combiner = self._render_substep(
            step_num, 'combiner')

        reducer, bash_wrap_reducer = self._render_substep(step_num, 'reducer')

        if bash_wrap_mapper:
            mapper = bash_wrap(mapper)

        if bash_wrap_combiner:
            combiner = bash_wrap(combiner)

        if bash_wrap_reducer:
            reducer = bash_wrap(reducer)

        return mapper, combiner, reducer
Example #8
 def test_pre_filter_escaping(self):
     # ESCAPE ALL THE THINGS!!!
     self._assert_streaming_step(
         {"type": "streaming", "mapper": {"type": "script", "pre_filter": bash_wrap("grep 'anything'")}},
         [
             "-mapper",
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
             " python my_job.py --step-num=0 --mapper'",
             "-jobconf",
             "mapred.reduce.tasks=0",
         ],
     )
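
The deeply nested quoting in the expected value above comes from bash_wrap() being applied twice: the test wraps the pre_filter itself, and the runner then wraps the whole "filter | script" pipeline a second time. Assuming the bash_wrap() sketch shown after Example #1, the expected -mapper argument can be reproduced like this (the pipeline string is a hypothetical reconstruction of what the runner builds):

pre_filter = bash_wrap("grep 'anything'")  # first layer of quoting
pipeline = "%s | python my_job.py --step-num=0 --mapper" % pre_filter
print(bash_wrap(pipeline))                 # second layer of quoting
# bash -c 'bash -c '\''grep '\''\'\'''\''anything'\''\'\'''\'''\'' | python my_job.py --step-num=0 --mapper'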
Example #9
File: runner.py Project: Milkigit/mrjob
    def _hadoop_streaming_commands(self, step_num):
        version = self.get_hadoop_version()

        # Hadoop streaming stuff
        mapper, bash_wrap_mapper = self._render_substep(
            step_num, 'mapper')

        combiner, bash_wrap_combiner = self._render_substep(
            step_num, 'combiner')

        reducer, bash_wrap_reducer = self._render_substep(
            step_num, 'reducer')

        if bash_wrap_mapper:
            mapper = bash_wrap(mapper)

        if bash_wrap_combiner:
            combiner = bash_wrap(combiner)

        if bash_wrap_reducer:
            reducer = bash_wrap(reducer)

        return mapper, combiner, reducer
Example #10
 def test_pre_filter_escaping(self):
     # ESCAPE ALL THE THINGS!!!
     self._assert_streaming_step(
         {
             'type': 'streaming',
             'mapper': {
                 'type': 'script',
                 'pre_filter': bash_wrap("grep 'anything'"),
             },
         },
         ['-mapper',
          "bash -c 'bash -c '\\''grep"
              " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
              " python my_job.py --step-num=0 --mapper'",
          '-jobconf', 'mapred.reduce.tasks=0'])
Example #11
 def test_pre_filter_escaping(self):
     # ESCAPE ALL THE THINGS!!!
     self._assert_streaming_step(
         {
             'type': 'streaming',
             'mapper': {
                 'type': 'script',
                 'pre_filter': bash_wrap("grep 'anything'"),
             },
         }, [
             '-mapper', "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
             " python my_job.py --step-num=0 --mapper'", '-jobconf',
             'mapred.reduce.tasks=0'
         ])
Example #12
    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper', "bash -c 'bash -c '\\''grep"
                 " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
                 PYTHON_BIN + " my_job.py --step-num=0 --mapper'"
             ]))
Example #13
    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 "bash -c 'bash -c '\\''grep"
                 " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
                 PYTHON_BIN +
                 " my_job.py --step-num=0 --mapper'"]))
Example #14
    def test_multiple(self):
        data = 'x\nx\nx\nx\nx\nx\n'
        mapper_cmd = 'cat -e'
        reducer_cmd = bash_wrap('wc -l | tr -Cd "[:digit:]"')
        job = CmdJob([
            '--runner', 'local',
            '--mapper-cmd', mapper_cmd,
            '--combiner-cmd', 'uniq',
            '--reducer-cmd', reducer_cmd])
        job.sandbox(stdin=StringIO(data))
        with job.make_runner() as r:
            self.assertEqual(
                r._get_steps(),
                [{
                    'type': 'streaming',
                    'mapper': {'type': 'command', 'command': mapper_cmd},
                    'combiner': {'type': 'command', 'command': 'uniq'},
                    'reducer': {'type': 'command', 'command': reducer_cmd},
                }])

            r.run()

            self.assertEqual(list(r.stream_output()), ['2'])
Example #15
    def test_multiple(self):
        data = b"x\nx\nx\nx\nx\nx\n"
        mapper_cmd = "cat -e"
        reducer_cmd = bash_wrap('wc -l | tr -Cd "[:digit:]"')
        job = CmdJob(
            ["--runner", "local", "--mapper-cmd", mapper_cmd, "--combiner-cmd", "uniq", "--reducer-cmd", reducer_cmd]
        )
        job.sandbox(stdin=BytesIO(data))
        with job.make_runner() as r:
            self.assertEqual(
                r._get_steps(),
                [
                    {
                        "type": "streaming",
                        "mapper": {"type": "command", "command": mapper_cmd},
                        "combiner": {"type": "command", "command": "uniq"},
                        "reducer": {"type": "command", "command": reducer_cmd},
                    }
                ],
            )

            r.run()

            self.assertEqual(list(r.stream_output()), [b"2"])
Example #16
 def mapper_cmd(self):
     return bash_wrap('./wordcount.sh mapper')
Example #17
 def reducer_cmd(self):
     return bash_wrap('./wordcount.sh reducer')
Example #18
 def reducer_cmd(self):
     return bash_wrap('./wordcount.sh reducer')
Example #19
 def mapper_cmd(self):
     return bash_wrap('./wordcount.sh mapper')
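
Examples #16-#19 show how these hooks are typically used inside a job class. A minimal, self-contained job along those lines (the wordcount.sh script comes from the examples above; the class name and the import of bash_wrap from mrjob.util are assumptions) might look like:

from mrjob.job import MRJob
from mrjob.util import bash_wrap


class MRShellWordCount(MRJob):
    """Hypothetical job that delegates both phases to a shell script,
    wrapping each command with bash_wrap() so it is run via bash -c."""

    def mapper_cmd(self):
        return bash_wrap('./wordcount.sh mapper')

    def reducer_cmd(self):
        return bash_wrap('./wordcount.sh reducer')


if __name__ == '__main__':
    MRShellWordCount.run()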