Example #1
 def test_supports_combiners(self):
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.19'), False)
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.19.2'),
                      False)
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.20'), True)
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.20.203'),
                      True)
Example #2
File: test_compat.py  Project: Anihc/mrjob
 def test_supports_combiners(self):
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.19'),
                      False)
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.19.2'),
                      False)
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.20'),
                      True)
     self.assertEqual(supports_combiners_in_hadoop_streaming('0.20.203'),
                      True)
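
Examples #1 and #2 only pin the expected return values; the helper under test is not shown on this page. A minimal sketch that would satisfy these assertions (the function name is taken from the tests, but the tuple comparison and the bare 0.20 cutoff are assumptions, not mrjob's actual implementation):

    def supports_combiners_in_hadoop_streaming(version):
        # Compare dotted version strings numerically; combiners appear
        # to be supported from Hadoop 0.20 on. The cutoff is inferred
        # from the assertions above, not taken from mrjob itself.
        return tuple(int(p) for p in version.split('.')) >= (0, 20)
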
Example #3
    def _hadoop_streaming_commands(self, step_num):
        version = self.get_hadoop_version()

        # Hadoop streaming stuff
        mapper, bash_wrap_mapper = self._render_substep(
            step_num, 'mapper')

        combiner, bash_wrap_combiner = self._render_substep(
            step_num, 'combiner')

        reducer, bash_wrap_reducer = self._render_substep(
            step_num, 'reducer')

        if (combiner is not None and
            not supports_combiners_in_hadoop_streaming(version)):

            # krazy hack to support combiners on hadoop <0.20
            bash_wrap_mapper = True
            mapper = "%s | sort | %s" % (mapper, combiner)

            # take the combiner away, hadoop will just be confused
            combiner = None
            bash_wrap_combiner = False

        if bash_wrap_mapper:
            mapper = bash_wrap(mapper)

        if bash_wrap_combiner:
            combiner = bash_wrap(combiner)

        if bash_wrap_reducer:
            reducer = bash_wrap(reducer)

        return mapper, combiner, reducer
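
Example #3 hands shell pipelines to a bash_wrap() helper that is not shown here. A minimal sketch, assuming it simply shell-quotes the command string and prefixes it with bash -c (the bash -c '%s | sort | %s' fallback in Example #4 points the same way):

    import pipes  # on Python 3, shlex.quote plays the same role

    def bash_wrap(cmd_str):
        # Quote the whole pipeline and hand it to a shell, so constructs
        # like "mapper | sort | combiner" survive as a single -mapper value.
        return "bash -c %s" % pipes.quote(cmd_str)
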
Example #4
    def _run_job_in_hadoop(self):
        # figure out local names for our files
        self._name_files()

        # send script and wrapper script (if any) to working dir
        assert self._script  # shouldn't be able to run if no script
        self._script['upload'] = 'file'
        if self._wrapper_script:
            self._wrapper_script['upload'] = 'file'

        self._counters = []
        steps = self._get_steps()

        version = self.get_hadoop_version()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = (self._opts['hadoop_bin'] +
                              ['jar', self._opts['hadoop_streaming_jar']])

            # -files/-archives (generic options, new-style)
            if compat.supports_new_distributed_cache_options(version):
                # set up uploading from HDFS to the working dir
                streaming_args.extend(self._upload_args())

            # Add extra Hadoop args first, since they may include
            # Hadoop-specific arguments (e.g. -libjars) that must come
            # before job-specific args.
            streaming_args.extend(
                self._hadoop_conf_args(step_num, len(steps)))

            # set up input
            for input_uri in self._hdfs_step_input_files(step_num):
                streaming_args.extend(['-input', input_uri])

            # set up output
            streaming_args.append('-output')
            streaming_args.append(self._hdfs_step_output_dir(step_num))

            # -cacheFile/-cacheArchive (streaming options, old-style)
            if not compat.supports_new_distributed_cache_options(version):
                # set up uploading from HDFS to the working dir
                streaming_args.extend(self._upload_args())

            # set up mapper and reducer
            if 'M' not in step:
                mapper = 'cat'
            else:
                mapper = cmd_line(self._mapper_args(step_num))

            if 'C' in step:
                combiner_cmd = cmd_line(self._combiner_args(step_num))
                if compat.supports_combiners_in_hadoop_streaming(version):
                    combiner = combiner_cmd
                else:
                    mapper = ("bash -c '%s | sort | %s'" %
                              (mapper, combiner_cmd))
                    combiner = None
            else:
                combiner = None

            streaming_args.append('-mapper')
            streaming_args.append(mapper)

            if combiner:
                streaming_args.append('-combiner')
                streaming_args.append(combiner)

            if 'R' in step:
                streaming_args.append('-reducer')
                streaming_args.append(cmd_line(self._reducer_args(step_num)))
            else:
                streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

            log.debug('> %s' % cmd_line(streaming_args))
            step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

            # TODO: use a pty or something so that the hadoop binary
            # won't buffer the status messages
            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + line.strip('\n'))

            returncode = step_proc.wait()
            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (step_proc.returncode, streaming_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                               cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise Exception(msg)
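
Example #4 relies on a cmd_line() helper to turn argument lists such as streaming_args or self._mapper_args(step_num) into a single printable command string. Its body is not part of this page; a minimal sketch, assuming it just shell-quotes each argument and joins them with spaces:

    import pipes  # on Python 3, shlex.quote plays the same role

    def cmd_line(args):
        # Shell-quote every argument and join, so the result can be
        # logged or passed as a single Hadoop streaming option value.
        return ' '.join(pipes.quote(str(arg)) for arg in args)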