Example #1
    def test_cache_opts(self):
        self.assertEqual(supports_new_distributed_cache_options('0.18'), False)
        self.assertEqual(supports_new_distributed_cache_options('0.20'), False)
        self.assertEqual(supports_new_distributed_cache_options('0.20.203'),
                         True)

        # default to True
        self.assertEqual(supports_new_distributed_cache_options(None), True)
Example #2
    def test_cache_opts(self):
        self.assertEqual(supports_new_distributed_cache_options('0.18'), False)
        self.assertEqual(supports_new_distributed_cache_options('0.20'), False)
        self.assertEqual(
            supports_new_distributed_cache_options('0.20.203'), True)

        # default to True
        self.assertEqual(
            supports_new_distributed_cache_options(None), True)
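
The two tests above pin down the expected behaviour: the new-style distributed cache options appear as of Hadoop 0.20.203, and an unknown version defaults to True. A minimal sketch consistent with those tests (not necessarily mrjob's actual implementation in mrjob.compat) could compare version strings like this:

from distutils.version import LooseVersion


def supports_new_distributed_cache_options(version):
    """Sketch: can -files/-archives be used instead of the old
    -cacheFile/-cacheArchive streaming options?"""
    if version is None:
        # unknown version: default to True, as the test expects
        return True
    return LooseVersion(version) >= LooseVersion('0.20.203')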
Example #3
    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # arguments (e.g. -libjars) that must come before job-specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._pre_0_20_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args
Example #4
    def _streaming_args(self, step, step_num, num_steps):
        version = self.get_hadoop_version()

        streaming_args = (self._opts['hadoop_bin'] +
                          ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(
                self._new_upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # arguments (e.g. -libjars) that must come before job-specific args.
        streaming_args.extend(
            self._hadoop_conf_args(step, step_num, num_steps))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            streaming_args.extend(['-input', input_uri])

        # set up output
        streaming_args.append('-output')
        streaming_args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(
                self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step, step_num))

        streaming_args.append('-mapper')
        streaming_args.append(mapper)

        if combiner:
            streaming_args.append('-combiner')
            streaming_args.append(combiner)

        if reducer:
            streaming_args.append('-reducer')
            streaming_args.append(reducer)
        else:
            streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return streaming_args
Example #5
    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        args = (self._opts['hadoop_bin'] +
                ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._new_upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # arguments (e.g. -libjars) that must come before job-specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args
Example #6
    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        args = self._opts["hadoop_bin"] + ["jar", self._opts["hadoop_streaming_jar"]]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._new_upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # arguments (e.g. -libjars) that must come before job-specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(["-input", input_uri])

        # set up output
        args.append("-output")
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        args.append("-mapper")
        args.append(mapper)

        if combiner:
            args.append("-combiner")
            args.append(combiner)

        if reducer:
            args.append("-reducer")
            args.append(reducer)
        else:
            args.extend(["-jobconf", "mapred.reduce.tasks=0"])

        return args
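
The _args_for_streaming_step() / _streaming_args() variants above all assemble the same shape of command line: jar invocation, upload args, extra Hadoop args, input/output, then the streaming commands. Purely for orientation, here is an invented example of the resulting argument list for a single step on Hadoop 0.20.203+ (every path and value below is made up):

# Invented illustration of the argument list these builders produce
# (new-style -files upload, one extra jobconf, mapper + reducer).
example_args = [
    'hadoop', 'jar', '/path/to/hadoop-streaming.jar',
    '-files', 'hdfs:///user/me/wd/mr_job.py#mr_job.py',
    '-D', 'mapred.reduce.tasks=2',
    '-input', 'hdfs:///user/me/input',
    '-output', 'hdfs:///user/me/output/step-0',
    '-mapper', 'python mr_job.py --step-num=0 --mapper',
    '-reducer', 'python mr_job.py --step-num=0 --reducer',
]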
Example #7
    def _upload_args(self):
        """Args to upload files from HDFS to the hadoop nodes."""
        args = []

        version = self.get_hadoop_version()

        if compat.supports_new_distributed_cache_options(version):

            # return a list of strings ready to be comma-joined and passed to
            # the hadoop binary
            def escaped_paths(file_dicts):
                return [
                    "%s#%s" % (fd['hdfs_uri'], fd['name']) for fd in file_dicts
                ]

            # index by type
            all_files = {}
            for fd in self._files:
                all_files.setdefault(fd.get('upload'), []).append(fd)

            if 'file' in all_files:
                args.append('-files')
                args.append(','.join(escaped_paths(all_files['file'])))

            if 'archive' in all_files:
                args.append('-archives')
                args.append(','.join(escaped_paths(all_files['archive'])))

        else:
            for file_dict in self._files:
                if file_dict.get('upload') == 'file':
                    args.append('-cacheFile')
                    args.append('%s#%s' %
                                (file_dict['hdfs_uri'], file_dict['name']))

                elif file_dict.get('upload') == 'archive':
                    args.append('-cacheArchive')
                    args.append('%s#%s' %
                                (file_dict['hdfs_uri'], file_dict['name']))

        return args
Example #8
    def _upload_args(self):
        """Args to upload files from HDFS to the hadoop nodes."""
        args = []

        version = self.get_hadoop_version()

        if compat.supports_new_distributed_cache_options(version):

            # return a list of strings ready to be comma-joined and passed to
            # the hadoop binary
            def escaped_paths(file_dicts):
                return ["%s#%s" % (fd['hdfs_uri'], fd['name'])
                        for fd in file_dicts]

            # index by type
            all_files = {}
            for fd in self._files:
                all_files.setdefault(fd.get('upload'), []).append(fd)

            if 'file' in all_files:
                args.append('-files')
                args.append(','.join(escaped_paths(all_files['file'])))

            if 'archive' in all_files:
                args.append('-archives')
                args.append(','.join(escaped_paths(all_files['archive'])))

        else:
            for file_dict in self._files:
                if file_dict.get('upload') == 'file':
                    args.append('-cacheFile')
                    args.append(
                        '%s#%s' % (file_dict['hdfs_uri'], file_dict['name']))

                elif file_dict.get('upload') == 'archive':
                    args.append('-cacheArchive')
                    args.append(
                        '%s#%s' % (file_dict['hdfs_uri'], file_dict['name']))

        return args
Example #9
    def _upload_args(self):
        """Args to upload files from HDFS to the hadoop nodes."""
        args = []

        version = self.get_hadoop_version()

        if compat.supports_new_distributed_cache_options(version):

            # return a list of strings ready to be comma-joined and passed to
            # the hadoop binary
            def escaped_paths(file_dicts):
                return ["%s#%s" % (fd["hdfs_uri"], fd["name"]) for fd in file_dicts]

            # index by type
            all_files = {}
            for fd in self._files:
                all_files.setdefault(fd.get("upload"), []).append(fd)

            if "file" in all_files:
                args.append("-files")
                args.append(",".join(escaped_paths(all_files["file"])))

            if "archive" in all_files:
                args.append("-archives")
                args.append(",".join(escaped_paths(all_files["archive"])))

        else:
            for file_dict in self._files:
                if file_dict.get("upload") == "file":
                    args.append("-cacheFile")
                    args.append("%s#%s" % (file_dict["hdfs_uri"], file_dict["name"]))

                elif file_dict.get("upload") == "archive":
                    args.append("-cacheArchive")
                    args.append("%s#%s" % (file_dict["hdfs_uri"], file_dict["name"]))

        return args
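
In each _upload_args() variant the uri#name fragment tells Hadoop to make the HDFS file available in the task's working directory under that name, and the new-style -files/-archives options take comma-joined lists of such fragments. A standalone sketch of the same grouping and joining, using made-up file dicts:

# Made-up file dicts; the grouping and joining mirror the new-style branch above.
files = [
    {'upload': 'file', 'hdfs_uri': 'hdfs:///tmp/wd/mr_job.py', 'name': 'mr_job.py'},
    {'upload': 'file', 'hdfs_uri': 'hdfs:///tmp/wd/helpers.py', 'name': 'helpers.py'},
    {'upload': 'archive', 'hdfs_uri': 'hdfs:///tmp/wd/libs.tar.gz', 'name': 'libs.tar.gz'},
]

uploads = {}
for fd in files:
    uploads.setdefault(fd['upload'], []).append(
        '%s#%s' % (fd['hdfs_uri'], fd['name']))

args = []
if 'file' in uploads:
    args += ['-files', ','.join(uploads['file'])]
if 'archive' in uploads:
    args += ['-archives', ','.join(uploads['archive'])]

# args == ['-files',
#          'hdfs:///tmp/wd/mr_job.py#mr_job.py,hdfs:///tmp/wd/helpers.py#helpers.py',
#          '-archives', 'hdfs:///tmp/wd/libs.tar.gz#libs.tar.gz']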
Example #10
    def _run_job_in_hadoop(self):
        # figure out local names for our files
        self._name_files()

        # send script and wrapper script (if any) to working dir
        assert self._script  # shouldn't be able to run if no script
        self._script['upload'] = 'file'
        if self._wrapper_script:
            self._wrapper_script['upload'] = 'file'

        self._counters = []
        steps = self._get_steps()

        version = self.get_hadoop_version()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = (self._opts['hadoop_bin'] +
                              ['jar', self._opts['hadoop_streaming_jar']])

            # -files/-archives (generic options, new-style)
            if compat.supports_new_distributed_cache_options(version):
                # set up uploading from HDFS to the working dir
                streaming_args.extend(self._upload_args())

            # Add extra Hadoop args first, since they may include
            # Hadoop-specific arguments (e.g. -libjars) that must come before
            # job-specific args.
            streaming_args.extend(
                self._hadoop_conf_args(step_num, len(steps)))

            # set up input
            for input_uri in self._hdfs_step_input_files(step_num):
                streaming_args.extend(['-input', input_uri])

            # set up output
            streaming_args.append('-output')
            streaming_args.append(self._hdfs_step_output_dir(step_num))

            # -cacheFile/-cacheArchive (streaming options, old-style)
            if not compat.supports_new_distributed_cache_options(version):
                # set up uploading from HDFS to the working dir
                streaming_args.extend(self._upload_args())

            # set up mapper and reducer
            if 'M' not in step:
                mapper = 'cat'
            else:
                mapper = cmd_line(self._mapper_args(step_num))

            if 'C' in step:
                combiner_cmd = cmd_line(self._combiner_args(step_num))
                version = self.get_hadoop_version()
                if compat.supports_combiners_in_hadoop_streaming(version):
                    combiner = combiner_cmd
                else:
                    mapper = ("bash -c '%s | sort | %s'" %
                              (mapper, combiner_cmd))
                    combiner = None
            else:
                combiner = None

            streaming_args.append('-mapper')
            streaming_args.append(mapper)

            if combiner:
                streaming_args.append('-combiner')
                streaming_args.append(combiner)

            if 'R' in step:
                streaming_args.append('-reducer')
                streaming_args.append(cmd_line(self._reducer_args(step_num)))
            else:
                streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

            log.debug('> %s' % cmd_line(streaming_args))
            step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

            # TODO: use a pty or something so that the hadoop binary
            # won't buffer the status messages
            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + line.strip('\n'))

            returncode = step_proc.wait()
            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (step_proc.returncode, streaming_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise Exception(msg)