Example #1
    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:
            return

        # don't bother if we're not starting a cluster
        if self._cluster_id:
            return

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):
            return

        # create mrjob.zip if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file', 'name': None, 'path': self._mrjob_zip_path}
            self._bootstrap_dir_mgr.add(**path_dict)

            # find out where python keeps its libraries
            mrjob_bootstrap.append([
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" %
                cmd_line(self._python_bin())])
            # unzip mrjob.zip
            mrjob_bootstrap.append(
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])
            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
            mrjob_bootstrap.append(
                ['sudo %s -m compileall -q'
                 ' -f $__mrjob_PYTHON_LIB/mrjob && true' %
                 cmd_line(self._python_bin())])

        # we call the script b.py because there's a character limit on
        # bootstrap script names (or there was at one time, anyway)
        path = os.path.join(self._get_local_tmp_dir(), 'b.py')
        log.info('writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(
            self._bootstrap + mrjob_bootstrap)
        for line in contents:
            log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))

        with open(path, 'w') as f:
            for line in contents:
                f.write(line)

        self._master_bootstrap_script_path = path
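
A minimal sketch of what a cmd_line helper like the one used throughout these examples might look like (an assumption for illustration; mrjob's actual mrjob.util.cmd_line may differ in details):

import pipes  # shlex.quote on Python 3

def cmd_line(args):
    """Build a shell-safe command line from a list of arguments."""
    return ' '.join(pipes.quote(str(arg)) for arg in args)

# e.g. cmd_line(['echo', 'a b']) -> "echo 'a b'"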
Example #2
    def _spark_cmdenv(self, step_num):
        """Returns a dictionary mapping environment variable to value,
        including mapping PYSPARK_PYTHON to self._python_bin()
        """
        step = self._get_step(step_num)

        cmdenv = {}

        if step['type'] in ('spark', 'spark_script'):  # not spark_jar
            driver_python = cmd_line(self._python_bin())

            if self._spark_python_wrapper_path:
                executor_python = './%s' % self._working_dir_mgr.name(
                    'file', self._spark_python_wrapper_path)
            else:
                executor_python = cmd_line(self._task_python_bin())

            if self._spark_deploy_mode() == 'cluster':
                # treat driver like executors (they run in same environment)
                cmdenv['PYSPARK_PYTHON'] = executor_python
            elif driver_python == executor_python:
                # no difference, just set $PYSPARK_PYTHON
                cmdenv['PYSPARK_PYTHON'] = driver_python
            else:
                # set different pythons for driver and executor
                cmdenv['PYSPARK_PYTHON'] = executor_python
                cmdenv['PYSPARK_DRIVER_PYTHON'] = driver_python

        cmdenv.update(self._opts['cmdenv'])
        return cmdenv
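
The deploy-mode branching above can be exercised in isolation with plain values (a standalone sketch; the python paths are hypothetical, not the runner's actual configuration):

def spark_cmdenv_sketch(deploy_mode, driver_python, executor_python):
    # mirrors the branches in _spark_cmdenv above
    cmdenv = {}
    if deploy_mode == 'cluster':
        # driver runs alongside the executors, so one setting covers both
        cmdenv['PYSPARK_PYTHON'] = executor_python
    elif driver_python == executor_python:
        cmdenv['PYSPARK_PYTHON'] = driver_python
    else:
        cmdenv['PYSPARK_PYTHON'] = executor_python
        cmdenv['PYSPARK_DRIVER_PYTHON'] = driver_python
    return cmdenv

# client mode with a wrapper for executors sets both variables
assert spark_cmdenv_sketch('client', 'python3', './python_wrapper.sh') == {
    'PYSPARK_PYTHON': './python_wrapper.sh',
    'PYSPARK_DRIVER_PYTHON': 'python3',
}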
Example #3
    def _run_job_in_hadoop(self):
        # figure out local names for our files
        self._name_files()

        # send script and wrapper script (if any) to working dir
        assert self._script # shouldn't be able to run if no script
        self._script['upload'] = 'file'
        if self._wrapper_script:
            self._wrapper_script['upload'] = 'file'

        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = [self._opts['hadoop_bin'], 'jar',
                              self._opts['hadoop_streaming_jar']]

            # Add extra hadoop args first as hadoop args could be a hadoop
            # specific argument (e.g. -libjar) which must come before job
            # specific args.
            streaming_args.extend(
                self._hadoop_conf_args(step_num, len(steps)))

            # setup input
            for input_uri in self._hdfs_step_input_files(step_num):
                streaming_args.extend(['-input', input_uri])

            # setup output
            streaming_args.append('-output')
            streaming_args.append(self._hdfs_step_output_dir(step_num))

            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._upload_args())

            # set up mapper and reducer
            streaming_args.append('-mapper')
            streaming_args.append(cmd_line(self._mapper_args(step_num)))
            if 'R' in step:
                streaming_args.append('-reducer')
                streaming_args.append(cmd_line(self._reducer_args(step_num)))
            else:
                streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

            log.debug('> %s' % cmd_line(streaming_args))
            step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

            # TODO: use a pty or something so that the hadoop binary
            # won't buffer the status messages
            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + line.strip('\n'))

            returncode = step_proc.wait()
            if returncode != 0:
                raise CalledProcessError(step_proc.returncode, streaming_args)
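
For orientation, the streaming invocation the loop above assembles for a mapper-only step looks roughly like this (the jar path, URIs, and mapper command are hypothetical; cmd_line is the same helper used throughout):

streaming_args = [
    'hadoop', 'jar', '/path/to/hadoop-streaming.jar',
    '-input', 'hdfs:///user/me/input/',
    '-output', 'hdfs:///user/me/step-0-output/',
    '-mapper', 'python my_job.py --step-num=0 --mapper',
    '-jobconf', 'mapred.reduce.tasks=0',
]
print(cmd_line(streaming_args))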
Example #4
    def _setup_wrapper_script_content(
            self, setup, manifest=False, wrap_python=False):
        """Return a (Bourne) shell script that runs the setup commands and then
        executes whatever is passed to it (this will be our mapper/reducer),
        as a list of strings (one for each line, including newlines).

        We obtain a file lock so that two copies of the setup commands
        cannot run simultaneously on the same machine (this helps for running
        :command:`make` on a shared source code archive, for example).
        """
        lines = []

        # TODO: this is very similar to _start_of_sh_script() in cloud.py

        if wrap_python:
            # start with shebang
            sh_bin = self._sh_bin()

            if os.path.isabs(sh_bin[0]):
                shebang_bin = sh_bin
            else:
                shebang_bin = ['/usr/bin/env'] + list(sh_bin)

            if len(shebang_bin) > 2:
                # Linux limits shebang to one binary and one arg
                shebang_bin = shebang_bin[:2]
                log.warning('Limiting shebang to two arguments:'
                            '#!%s' % cmd_line(shebang_bin))

            lines.append('#!%s' % cmd_line(shebang_bin))

        # hook for 'set -e', etc.
        pre_commands = self._sh_pre_commands()
        if pre_commands:
            for cmd in pre_commands:
                lines.append(cmd)
            lines.append('')

        if setup:
            lines.extend(self._setup_cmd_content(setup))

        # handle arguments to the script
        if wrap_python:
            # pretend to be python ($@ is arguments to the python binary)
            python_bin = self._task_python_bin()
            lines.append('%s "$@"' % cmd_line(python_bin))
        elif manifest:
            # arguments ($@) are a command
            # eventually runs: "$@" $INPUT_PATH $INPUT_URI
            lines.extend(self._manifest_download_content())
        else:
            # arguments ($@) are a command, just run it
            lines.append('"$@"')

        return lines
Example #5
    def _invoke_process(self, args, outfile_name, env, combiner_args=None):
        """invoke the process described by *args* and write to *outfile_name*

        :param combiner_args: If this mapper has a combiner, we need to do
                              some extra shell wrangling, so pass the combiner
                              arguments in separately.

        :return: list of dicts of the form
                 ``dict(proc=Popen, args=[process args])``
        """
        if combiner_args:
            log.info('> %s | sort | %s' %
                     (cmd_line(args), cmd_line(combiner_args)))
        else:
            log.info('> %s' % cmd_line(args))

        # set up outfile
        outfile = os.path.join(self._get_local_tmp_dir(), outfile_name)
        log.info('writing to %s' % outfile)

        self._prev_outfiles.append(outfile)

        with open(outfile, 'w') as write_to:
            if combiner_args:
                # set up a pipeline: mapper | sort | combiner
                mapper_proc = Popen(args, stdout=PIPE, stderr=PIPE,
                                    cwd=self._working_dir, env=env)

                sort_proc = Popen(['sort'], stdin=mapper_proc.stdout,
                                  stdout=PIPE, stderr=PIPE,
                                  cwd=self._working_dir, env=env)

                combiner_proc = Popen(combiner_args, stdin=sort_proc.stdout,
                                      stdout=write_to, stderr=PIPE,
                                      cwd=self._working_dir, env=env)

                # this process shouldn't read from the pipes
                mapper_proc.stdout.close()
                sort_proc.stdout.close()

                return [
                    {'proc': mapper_proc, 'args': args},
                    {'proc': sort_proc, 'args': ['sort']},
                    {'proc': combiner_proc, 'args': combiner_args},
                ]
            else:
                # just run the mapper process
                proc = Popen(args, stdout=write_to, stderr=PIPE,
                             cwd=self._working_dir, env=env)
                return [{'proc': proc, 'args': args}]
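
The mapper | sort | combiner wiring above follows the standard subprocess pipeline pattern. A self-contained sketch with placeholder commands (cat and uniq stand in for the real mapper and combiner):

from subprocess import Popen, PIPE

with open('pipeline_out.txt', 'wb') as write_to:
    mapper = Popen(['cat'], stdin=PIPE, stdout=PIPE)        # stand-in mapper
    sorter = Popen(['sort'], stdin=mapper.stdout, stdout=PIPE)
    combiner = Popen(['uniq', '-c'], stdin=sorter.stdout, stdout=write_to)

    # close the parent's copies of the pipe ends, as in the code above,
    # so each downstream process sees EOF correctly
    mapper.stdout.close()
    sorter.stdout.close()

    mapper.stdin.write(b'b\na\nb\n')
    mapper.stdin.close()

    for proc in (mapper, sorter, combiner):
        proc.wait()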
Example #6
    def _substep_args(self, step_num, mrc):
        step = self._get_step(step_num)

        if step[mrc]['type'] == 'command':
            cmd = step[mrc]['command']

            # never wrap custom hadoop streaming commands in bash
            if isinstance(cmd, string_types):
                return shlex_split(cmd)
            else:
                return cmd

        elif step[mrc]['type'] == 'script':
            script_args = self._script_args_for_step(
                step_num, mrc, input_manifest=step.get('input_manifest'))

            if 'pre_filter' in step[mrc]:
                return self._sh_wrap(
                    '%s | %s' % (step[mrc]['pre_filter'],
                                 cmd_line(script_args)))
            else:
                return script_args
        else:
            raise ValueError("Invalid %s step %d: %r" % (
                mrc, step_num, step[mrc]))
Example #7
    def _load_steps(self):
        args = (self._executable(True) + ['--steps'] +
                self._mr_job_extra_args(local=True))
        log.debug('> %s' % cmd_line(args))

        # add . to PYTHONPATH (in case mrjob isn't actually installed)
        env = combine_local_envs(os.environ,
                                 {'PYTHONPATH': os.path.abspath('.')})
        steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
        stdout, stderr = steps_proc.communicate()

        if steps_proc.returncode != 0:
            raise Exception(
                'error getting step information: \n%s' % stderr)

        # on Python 3, convert stdout to str so we can json.loads() it
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf_8')

        try:
            steps = json.loads(stdout)
        except ValueError:
            raise ValueError("Bad --steps response: \n%s" % stdout)

        # verify that this is a proper step description
        if not steps or not stdout:
            raise ValueError('step description is empty!')

        return steps
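
For context, the --steps subprocess prints a JSON list of step descriptions. A hypothetical payload for a job with a full first step and a map-only second step (shown only to illustrate the json.loads() call above; see mrjob's steps-format docs for the real schema):

import json

stdout = (
    '[{"type": "streaming", "mapper": {"type": "script"},'
    ' "reducer": {"type": "script"}},'
    ' {"type": "streaming", "mapper": {"type": "script"}}]'
)
steps = json.loads(stdout)
assert len(steps) == 2
assert 'reducer' not in steps[1]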
Example #8
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                               '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with mr_job.make_runner() as runner:
            runner.run()

            # expect python -v crud in stderr

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any(
                    'import mrjob' in line or  # Python 2
                    "import 'mrjob'" in line
                    for line in lines))

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any(
                    '#' in line for line in lines))

            # should still get expected results
            self.assertEqual(
                sorted(to_lines(runner.cat_output())),
                sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #9
    def _cat_file(self, filename):
        if is_uri(filename):
            # stream from HDFS
            cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
            log.debug('> %s' % cmd_line(cat_args))

            cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

            def stream():
                for line in cat_proc.stdout:
                    yield line

                # there shouldn't be any stderr
                for line in cat_proc.stderr:
                    log.error('STDERR: ' + line)

                returncode = cat_proc.wait()

                if returncode != 0:
                    raise CalledProcessError(returncode, cat_args)

            return read_file(filename, stream())
        else:
            # read from local filesystem
            return super(HadoopJobRunner, self)._cat_file(filename)
Example #10
    def _parse_setup(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmd*, and *setup_script*
        as setup commands.
        """
        setup = []

        # python_archives
        for path in self._opts['python_archives']:
            path_dict = parse_legacy_hash_path('archive', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:
            setup.append(parse_setup_cmd(cmd))

        # setup_cmds
        for cmd in self._opts['setup_cmds']:
            if not isinstance(cmd, basestring):
                cmd = cmd_line(cmd)
            setup.append([cmd])

        # setup_scripts
        for path in self._opts['setup_scripts']:
            path_dict = parse_legacy_hash_path('file', path)
            setup.append([path_dict])

        return setup
Example #11
    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug("running step %d of %d" % (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug("> %s" % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error("STDOUT: " + to_string(line.strip(b"\n")))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, "rb") as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = "Job failed with return code %d: %s" % (returncode, step_args)
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure([step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"])
                    cause_msg.extend(line.strip("\n") for line in cause["lines"])
                    if cause["input_uri"]:
                        cause_msg.append("(while reading from %s)" % cause["input_uri"])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += "\n" + "\n".join(cause_msg) + "\n"

                raise CalledProcessError(returncode, step_args)
Example #12
    def archive_and_unarchive(self, extension, archive_template,
                              added_files=[]):
        join = os.path.join

        # archive it up
        archive_name = 'a.' + extension
        variables = dict(archive_name=join('..', archive_name),
                         files_to_archive='.')
        archive_command = [arg % variables for arg in archive_template]

        # sometime the relevant command isn't available or doesn't work;
        # if so, skip the test
        try:
            proc = Popen(archive_command, cwd=join(self.tmp_dir, 'a'),
                         stdout=PIPE, stderr=PIPE)
        except OSError as e:
            if e.errno == 2:
                self.skipTest("No %s command" % archive_command[0])
            else:
                raise
        proc.communicate()  # discard output
        if proc.returncode != 0:
            self.skipTest("Can't run `%s` to create archive." %
                          cmd_line(archive_command))

        # unarchive it into b/
        unarchive(join(self.tmp_dir, archive_name), join(self.tmp_dir, 'b'))

        self.ensure_expected_results(added_files=added_files)
Example #13
    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark submit binary in a subprocess, using a PTY if possible

        :param spark_submit_args: spark-submit binary and arguments, as a list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records`)

        :return: the subprocess's return code
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        returncode = 0  # should always be set, but just in case

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(
                spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

            for record in _parse_hadoop_log4j_records(
                    _yield_lines_from_pty_or_pipe(step_proc.stderr)):
                record_callback(record)

            # there shouldn't be much output on STDOUT
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(spark_submit_args[0], spark_submit_args, env)
                # now this process is no longer Python
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    for record in _parse_hadoop_log4j_records(
                            _yield_lines_from_pty_or_pipe(master)):
                        record_callback(record)
                    _, returncode = os.waitpid(pid, 0)

        return returncode
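
The PTY branch above is the classic pty.fork() pattern: the child exec()s the real program while the parent reads the merged stdout/stderr from the master side. A stripped-down standalone sketch (run_with_pty and the echo command are purely illustrative; the code above additionally falls back to Popen when no PTY is available):

import os
import pty
import sys

def run_with_pty(argv):
    pid, master_fd = pty.fork()
    if pid == 0:
        # child: replace this Python process with the target program
        os.execvp(argv[0], argv)
    # parent: the master side carries the child's stdout and stderr, unbuffered
    with os.fdopen(master_fd, 'rb') as master:
        try:
            for line in master:
                sys.stdout.write(line.decode('utf-8', 'replace'))
        except OSError:
            pass  # Linux raises EIO once the child closes its side of the PTY
        _, status = os.waitpid(pid, 0)
    return status

run_with_pty(['echo', 'hello from a pty'])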
Example #14
    def _run_job_in_hadoop(self):
        self._counters = []
        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = self._streaming_args(step, step_num, len(steps))

            log.debug('> %s' % cmd_line(streaming_args))

            master, slave = pty.openpty()

            step_proc = Popen(streaming_args, stdout=PIPE, stderr=slave)

            stderr = os.fdopen(master)

            self._process_stderr_from_streaming(step_proc, stderr)

            stderr.close()

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + line.strip('\n'))

            returncode = step_proc.wait()
            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (step_proc.returncode, streaming_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                               cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise Exception(msg)
Example #15
    def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None):
        """Return a (Bourne) shell script that runs the setup commands and then
        executes whatever is passed to it (this will be our mapper/reducer).

        We obtain a file lock so that two copies of the setup commands
        cannot run simultaneously on the same machine (this helps for running
        :command:`make` on a shared source code archive, for example).
        """
        out = StringIO()

        def writeln(line=''):
            out.write(line + '\n')

        # we're always going to execute this script as an argument to
        # sh, so there's no need to add a shebang (e.g. #!/bin/sh)

        writeln('# store $PWD')
        writeln('__mrjob_PWD=$PWD')
        writeln('')

        writeln('# obtain exclusive file lock')
        # Basically, we're going to tie file descriptor 9 to our lockfile,
        # use a subprocess to obtain a lock (which we somehow inherit too),
        # and then release the lock by closing the file descriptor.
        # File descriptors 10 and higher are used internally by the shell,
        # so 9 is as out-of-the-way as we can get.
        writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_name)
        # would use flock(1), but it's not always available
        writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                cmd_line(self._opts['python_bin']))
        writeln()

        writeln('# setup commands')
        for cmd in setup:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = ''
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._working_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            writeln(line)
        writeln()

        writeln('# release exclusive file lock')
        writeln('exec 9>&-')
        writeln()

        writeln('# run job from the original working directory')
        writeln('cd $__mrjob_PWD')
        writeln('"$@"')

        return out.getvalue()
Example #16
    def _setup_cmd_content(self, setup):
        """Write setup script content to obtain a file lock, run setup
        commands in a way that doesn't perturb the script, and then
        release the lock and return to the original working directory."""
        lines = []

        lines.append('# store $PWD')
        lines.append('__mrjob_PWD=$PWD')
        lines.append('')

        lines.append('# obtain exclusive file lock')
        # Basically, we're going to tie file descriptor 9 to our lockfile,
        # use a subprocess to obtain a lock (which we somehow inherit too),
        # and then release the lock by closing the file descriptor.
        # File descriptors 10 and higher are used internally by the shell,
        # so 9 is as out-of-the-way as we can get.
        lines.append('exec 9>/tmp/wrapper.lock.%s' % self._job_key)
        # would use flock(1), but it's not always available
        lines.append("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                     cmd_line(self._python_bin()))
        lines.append('')

        lines.append('# setup commands')
        # group setup commands so we can redirect their input/output (see
        # below). Don't use parens; this would invoke a subshell, which would
        # keep us from exporting environment variables to the task.
        lines.append('{')
        for cmd in setup:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '  # indent, since these commands are in a group
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._working_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            lines.append(line)
        # redirect setup commands' input/output so they don't interfere
        # with the task (see Issue #803).
        lines.append('} 0</dev/null 1>&2')
        lines.append('')

        lines.append('# release exclusive file lock')
        lines.append('exec 9>&-')
        lines.append('')

        lines.append('# run task from the original working directory')
        lines.append('cd $__mrjob_PWD')

        return lines
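
The generated script takes the lock by tying shell file descriptor 9 to a lock file and calling fcntl.flock(9, ...) from a short python -c command. The same exclusive-lock idea, done entirely in Python (the lock path is a hypothetical example):

import fcntl

with open('/tmp/wrapper.lock.example', 'w') as lock_file:
    fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)  # blocks until the lock is free
    # ... run the setup commands here, one holder at a time per machine ...
# closing the file descriptor releases the lock, like `exec 9>&-` above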
Example #17
    def _invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                       return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        if args[0] == 'fs':
            if self._opts['hdfs_namenode']:
                args = [args[0]] + ['-fs', self._opts['hdfs_namenode']] + args[1:]

        args = self._opts['hadoop_bin'] + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in StringIO(stdout):
                log_func('STDOUT: ' + line.rstrip('\r\n'))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in StringIO(stderr):
                log_func('STDERR: ' + line.rstrip('\r\n'))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
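
The ok_stderr handling above simply matches the captured stderr against each regex. A tiny self-contained illustration (the mkdir message is a hypothetical stderr line a caller might whitelist):

import re

ok_stderr = [re.compile(r'mkdir: .*: File exists')]
stderr = 'mkdir: /user/hadoop/tmp: File exists'

stderr_is_ok = any(stderr_re.match(stderr) for stderr_re in ok_stderr)
assert stderr_is_ok  # a match suppresses both the STDERR logging and CalledProcessError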
Example #18
    def _spark_cmdenv(self, step_num):
        """Returns a dictionary mapping environment variable to value,
        including mapping PYSPARK_PYTHON to self._python_bin()
        """
        step = self._get_step(step_num)

        cmdenv = {}

        if step['type'] in ('spark', 'spark_script'):  # not spark_jar
            cmdenv = dict(PYSPARK_PYTHON=cmd_line(self._python_bin()))
        cmdenv.update(self._opts['cmdenv'])
        return cmdenv
Example #19
    def _ssh_launch(self, address, cmd_args, stdin=None):
        """Copy SSH keys if necessary, then launch the given command
        over SSH and return a Popen."""
        self._ssh_copy_key(address)

        args = self._ssh_cmd_args(address, cmd_args)

        log.debug('  > ' + cmd_line(args))
        try:
            return Popen(args, stdout=PIPE, stderr=PIPE, stdin=stdin)
        except OSError as ex:
            raise IOError(ex.strerror)
Example #20
 def _render_substep(self, cmd_key, pre_filter_key=None):
     if self._steps[cmd_key]:
         cmd = self._steps[cmd_key]
         if not isinstance(cmd, string_types):
             cmd = cmd_line(cmd)
         if pre_filter_key and self._steps[pre_filter_key]:
             raise ValueError("Cannot specify both %s and %s" % (cmd_key, pre_filter_key))
         return {"type": "command", "command": cmd}
     else:
         substep = {"type": "script"}
         if pre_filter_key and self._steps[pre_filter_key]:
             substep["pre_filter"] = self._steps[pre_filter_key]
         return substep
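
The two shapes this method can return look like the following standalone sketch (the _steps dict and keys are hypothetical stand-ins mirroring the branches above):

def render_substep_sketch(steps, cmd_key, pre_filter_key=None):
    if steps[cmd_key]:
        if pre_filter_key and steps.get(pre_filter_key):
            raise ValueError("Cannot specify both %s and %s" % (cmd_key, pre_filter_key))
        return {"type": "command", "command": steps[cmd_key]}
    substep = {"type": "script"}
    if pre_filter_key and steps.get(pre_filter_key):
        substep["pre_filter"] = steps[pre_filter_key]
    return substep

assert render_substep_sketch(
    {"mapper_cmd": "grep error", "mapper_pre_filter": None},
    "mapper_cmd", "mapper_pre_filter") == {"type": "command", "command": "grep error"}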
Example #21
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf'])
        mr_job.sandbox(stdin=['bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr
        assert_in('import mrjob', mr_job.stderr.getvalue())
        assert_in('#', mr_job.stderr.getvalue())

        # should still get expected results
        assert_equal(sorted(mr_job.parse_output()), [(1, None), (1, 'bar')])
Example #22
 def _render_substep(self, cmd_key, pre_filter_key=None):
     if self._steps[cmd_key]:
         cmd = self._steps[cmd_key]
         if not isinstance(cmd, basestring):
             cmd = cmd_line(cmd)
         if (pre_filter_key and self._steps[pre_filter_key]):
             raise ValueError('Cannot specify both %s and %s' %
                              (cmd_key, pre_filter_key))
         return {'type': 'command', 'command': cmd}
     else:
         substep = {'type': 'script'}
         if (pre_filter_key and self._steps[pre_filter_key]):
             substep['pre_filter'] = self._steps[pre_filter_key]
         return substep
Example #23
def _sort_lines_with_sort_bin(input_paths, output_path, sort_bin,
                              sort_values=False, tmp_dir=None):
    """Sort lines the given *input_paths* into *output_path*,
    using *sort_bin*. If there is a problem, fall back to in-memory sort.

    This is a helper for :py:meth:`LocalMRJobRunner._sort_input_func`.

    *tmp_dir* determines the value of :envvar:`$TMP` and :envvar:`$TMPDIR`
    that *sort_bin* sees.
    """
    if input_paths:
        env = os.environ.copy()

        # ignore locale when sorting
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = tmp_dir
        env['TMPDIR'] = tmp_dir

        with open(output_path, 'wb') as output:
            args = sort_bin + list(input_paths)
            log.debug('> %s' % cmd_line(args))

            try:
                check_call(args, stdout=output, env=env)
                return
            except CalledProcessError:
                log.error(
                    '`%s` failed, falling back to in-memory sort' %
                    cmd_line(sort_bin))
            except OSError:
                log.error(
                    'no sort binary, falling back to in-memory sort')

    _sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)
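
A hypothetical call to the helper above (it assumes the module's own imports; the input files are created just for the demonstration):

import os
import tempfile

tmp = tempfile.mkdtemp()
paths = []
for name, data in [('a.txt', b'pear\napple\n'), ('b.txt', b'mango\n')]:
    path = os.path.join(tmp, name)
    with open(path, 'wb') as f:
        f.write(data)
    paths.append(path)

# sorts with the system `sort`; falls back to the in-memory sort on failure
_sort_lines_with_sort_bin(paths, os.path.join(tmp, 'sorted.txt'),
                          sort_bin=['sort'], tmp_dir=tmp)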
Example #24
def _sort_lines_with_sort_bin(input_paths, output_path, sort_bin,
                              sort_values=False, tmp_dir=None):
    """Sort lines the given *input_paths* into *output_path*,
    using *sort_bin*. If there is a problem, fall back to in-memory sort.

    This is a helper for :py:meth:`LocalMRJobRunner._sort_input_func`.

    *tmp_dir* determines the value of :envvar:`$TMP` and :envvar:`$TMPDIR`
    that *sort_bin* sees.
    """
    if input_paths:
        env = os.environ.copy()

        # ignore locale when sorting
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = tmp_dir
        env['TMPDIR'] = tmp_dir

        with open(output_path, 'wb') as output:
            args = sort_bin + list(input_paths)
            log.debug('> %s' % cmd_line(args))

            try:
                check_call(args, stdout=output, env=env)
                return
            except CalledProcessError:
                log.error(
                    '`%s` failed, falling back to in-memory sort' %
                    cmd_line(sort_bin))
            except OSError:
                log.error(
                    'no sort binary, falling back to in-memory sort')

    _sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)
Example #25
    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self._hadoop_bin + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in StringIO(stdout):
                log_func('STDOUT: ' + line.rstrip('\r\n'))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in StringIO(stderr):
                log_func('STDERR: ' + line.rstrip('\r\n'))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
Example #26
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf'])
        mr_job.sandbox(stdin=['bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr
        self.assertIn('import mrjob', mr_job.stderr.getvalue())
        self.assertIn('#', mr_job.stderr.getvalue())

        # should still get expected results
        self.assertEqual(sorted(mr_job.parse_output()), [(1, None),
                                                         (1, 'bar')])
Example #27
    def _start_of_sh_script(self):
        """Return a list of lines (without trailing newlines) containing the
        shell script shebang and pre-commands."""
        out = []

        # shebang
        sh_bin = self._sh_bin()
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        out.append('#!' + cmd_line(sh_bin))

        # hook for 'set -e', etc. (see #1549)
        out.extend(self._sh_pre_commands())

        return out
Example #28
    def _start_of_sh_script(self):
        """Return a list of lines (without trailing newlines) containing the
        shell script shebang and pre-commands."""
        out = []

        # shebang
        sh_bin = self._sh_bin()
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        out.append('#!' + cmd_line(sh_bin))

        # hook for 'set -e', etc. (see #1549)
        out.extend(self._sh_pre_commands())

        return out
Example #29
def _ssh_run(ssh_bin, address, ec2_key_pair_file, cmd_args, stdin=''):
    """Shortcut to call ssh on a Hadoop node via ``subprocess``.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param cmd_args: The command you want to run
    :param stdin: String to pass to the process's standard input

    :return: (stdout, stderr)
    """
    args = _ssh_args(ssh_bin, address, ec2_key_pair_file) + list(cmd_args)
    log.debug('> %s' % cmd_line(args))
    p = Popen(args, stdout=PIPE, stderr=PIPE, stdin=PIPE)
    return p.communicate(stdin)
Example #30
File: step.py Project: mtai/mrjob
 def _render_substep(self, cmd_key, pre_filter_key=None):
     if self._steps[cmd_key]:
         cmd = self._steps[cmd_key]
         if not isinstance(cmd, string_types):
             cmd = cmd_line(cmd)
         if (pre_filter_key and self._steps[pre_filter_key]):
             raise ValueError('Cannot specify both %s and %s' % (
                 cmd_key, pre_filter_key))
         return {'type': 'command', 'command': cmd}
     else:
         substep = {'type': 'script'}
         if (pre_filter_key and
                 self._steps[pre_filter_key]):
             substep['pre_filter'] = self._steps[pre_filter_key]
         return substep
Example #31
def _ssh_run(ssh_bin, address, ec2_key_pair_file, cmd_args, stdin=''):
    """Shortcut to call ssh on a Hadoop node via ``subprocess``.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param cmd_args: The command you want to run
    :param stdin: String to pass to the process's standard input

    :return: (stdout, stderr)
    """
    args = _ssh_args(ssh_bin, address, ec2_key_pair_file) + list(cmd_args)
    log.debug('> %s' % cmd_line(args))
    p = Popen(args, stdout=PIPE, stderr=PIPE, stdin=PIPE)
    return p.communicate(stdin)
Example #32
    def _harness_job(self, job_class, input_bytes=b'', input_paths=(),
                     runner_alias='inline', compression_codec=None,
                     job_args=None, spark_conf=None, first_step_num=None,
                     last_step_num=None, counter_output_dir=None,
                     num_reducers=None, max_output_files=None,
                     emulate_map_input_file=False,
                     skip_internal_protocol=False):
        from tests.mr_spark_harness import MRSparkHarness

        job_class_path = '%s.%s' % (job_class.__module__, job_class.__name__)

        harness_job_args = ['-r', runner_alias, '--job-class', job_class_path]
        if spark_conf:
            for key, value in spark_conf.items():
                harness_job_args.append('--jobconf')
                harness_job_args.append('%s=%s' % (key, value))
        if compression_codec:
            harness_job_args.append('--compression-codec')
            harness_job_args.append(compression_codec)
        if job_args:
            harness_job_args.extend(['--job-args', cmd_line(job_args)])
        if first_step_num is not None:
            harness_job_args.extend(['--first-step-num', str(first_step_num)])
        if last_step_num is not None:
            harness_job_args.extend(['--last-step-num', str(last_step_num)])
        if counter_output_dir is not None:
            harness_job_args.extend(
                ['--counter-output-dir', counter_output_dir])
        if num_reducers is not None:
            harness_job_args.extend(
                ['--num-reducers', str(num_reducers)])

        if max_output_files is not None:
            harness_job_args.extend(
                ['--max-output-files', str(max_output_files)])

        if emulate_map_input_file:
            harness_job_args.append('--emulate-map-input-file')

        if skip_internal_protocol:
            harness_job_args.append('--skip-internal-protocol')

        harness_job_args.extend(input_paths)

        harness_job = MRSparkHarness(harness_job_args)
        harness_job.sandbox(stdin=BytesIO(input_bytes))

        return harness_job
Example #33
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=['bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr
        self.assertIn('import mrjob', mr_job.stderr.getvalue())
        self.assertIn('#', mr_job.stderr.getvalue())

        # should still get expected results
        self.assertItemsEqual(mr_job.stdout.getvalue().splitlines(),
                              ['1\tnull', '1\t"bar"'])
Example #34
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                               '-r', 'local'])
        mr_job.sandbox(stdin=['bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr
        self.assertIn('import mrjob', mr_job.stderr.getvalue())
        self.assertIn('#', mr_job.stderr.getvalue())

        # should still get expected results
        self.assertItemsEqual(mr_job.stdout.getvalue().splitlines(),
                              ['1\tnull', '1\t"bar"'])
Example #35
    def _ssh_add_key(self):
        """Add ``self._ec2_key_pair_file`` to the ssh agent with ``ssh-add``.
        """
        args = self._ssh_add_bin + ['-t', '60', self._ec2_key_pair_file]

        log.debug('  > ' + cmd_line(args))

        try:
            p = Popen(args, stdout=PIPE, stderr=PIPE)
        except OSError as ex:
            raise IOError(ex.strerror)

        stdout, stderr = p.communicate()

        if p.returncode != 0:
            raise IOError(to_unicode(stderr))
Example #36
    def _run_job_in_hadoop(self):
        self._counters = []
        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug("running step %d of %d" % (step_num + 1, len(steps)))

            streaming_args = self._streaming_args(step, step_num, len(steps))

            log.debug("> %s" % cmd_line(streaming_args))
            step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

            # TODO: use a pty or something so that the hadoop binary
            # won't buffer the status messages
            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error("STDOUT: " + line.strip("\n"))

            returncode = step_proc.wait()
            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = "Job failed with return code %d: %s" % (step_proc.returncode, streaming_args)
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure([step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"])
                    cause_msg.extend(line.strip("\n") for line in cause["lines"])
                    if cause["input_uri"]:
                        cause_msg.append("(while reading from %s)" % cause["input_uri"])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += "\n" + "\n".join(cause_msg) + "\n"

                raise Exception(msg)
Example #37
    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output as described in :ref:`steps-format`.

        Results are cached, so call this as many times as you want.
        """
        if self._steps is None:
            if not self._script_path:
                self._steps = []
            else:
                args = (self._executable(True) + ['--steps'] +
                        self._mr_job_extra_args(local=True))
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                         {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception(
                        'error getting step information: \n%s' % stderr)

                # on Python 3, convert stdout to str so we can json.loads() it
                if not isinstance(stdout, str):
                    stdout = stdout.decode('utf_8')

                try:
                    steps = json.loads(stdout)
                except ValueError:
                    raise ValueError("Bad --steps response: \n%s" % stdout)

                # verify that this is a proper step description
                if not steps or not stdout:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step['type'] not in STEP_TYPES:
                        raise ValueError(
                            'unexpected step type %r in steps %r' % (
                                step['type'], stdout))

                self._steps = steps

        return self._steps
Example #38
    def _parse_setup(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmd*, and *setup_script*
        as setup commands.
        """
        setup = []

        # python_archives
        for path in self._opts["python_archives"]:
            path_dict = parse_legacy_hash_path("archive", path)
            setup.append(["export PYTHONPATH=", path_dict, ":$PYTHONPATH"])

        # setup
        for cmd in self._opts["setup"]:
            setup.append(parse_setup_cmd(cmd))

        # setup_cmds
        if self._opts["setup_cmds"]:
            log.warning(
                "setup_cmds is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead."
            )

        for cmd in self._opts["setup_cmds"]:
            if not isinstance(cmd, string_types):
                cmd = cmd_line(cmd)
            setup.append([cmd])

        # setup_scripts
        if self._opts["setup_scripts"]:
            log.warning(
                "setup_scripts is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead."
            )

        for path in self._opts["setup_scripts"]:
            path_dict = parse_legacy_hash_path("file", path)
            setup.append([path_dict])

        return setup
Example #39
    def _substep_cmd_line(self, step, step_num, mrc):
        if step[mrc]['type'] == 'command':
            # never wrap custom hadoop streaming commands in bash
            return step[mrc]['command'], False

        elif step[mrc]['type'] == 'script':
            cmd = cmd_line(self._script_args_for_step(step_num, mrc))

            # filter input and pipe for great speed, if user asks
            # but we have to wrap the command in bash
            if 'pre_filter' in step[mrc]:
                return '%s | %s' % (step[mrc]['pre_filter'], cmd), True
            else:
                return cmd, False
        else:
            raise ValueError("Invalid %s step %d: %r" % (
                mrc, step_num, step[mrc]))
Example #40
    def _substep_cmd_line(self, step, step_num, mrc):
        if step[mrc]['type'] == 'command':
            # never wrap custom hadoop streaming commands in bash
            return step[mrc]['command'], False

        elif step[mrc]['type'] == 'script':
            cmd = cmd_line(self._script_args_for_step(step_num, mrc))

            # filter input and pipe for great speed, if user asks
            # but we have to wrap the command in bash
            if 'pre_filter' in step[mrc]:
                return '%s | %s' % (step[mrc]['pre_filter'], cmd), True
            else:
                return cmd, False
        else:
            raise ValueError("Invalid %s step %d: %r" %
                             (mrc, step_num, step[mrc]))
Example #41
    def _get_steps(self):
        """Call the mr_job to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output like ['MR', 'M']
        (two steps, second only has a mapper)

        We'll cache the result (so you can call _get_steps() as many times
        as you want)
        """
        if self._steps is None:
            if not self._script:
                self._steps = []
            else:
                # don't use self._opts['python_bin'] because that
                # refers to the python binary to use inside Hadoop
                python_bin = sys.executable or 'python'
                args = ([python_bin, self._script['path'], '--steps'] +
                        self._mr_job_extra_args(local=True))
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                         {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception('error getting step information: %s' %
                                    stderr)

                steps = stdout.strip().split(' ')

                # verify that this is a proper step description
                if not steps:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step not in ('MR', 'M'):
                        raise ValueError(
                            'unexpected step type %r in steps %r' %
                            (step, stdout))

                self._steps = steps

        return self._steps
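
This older runner expects a plain space-separated step string rather than JSON. Parsing such a response, per the docstring above, looks like this (the output string is hypothetical):

stdout = 'MR M\n'
steps = stdout.strip().split(' ')

assert steps == ['MR', 'M']          # two steps, the second with a mapper only
assert all(step in ('MR', 'M') for step in steps)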
Example #42
    def _parse_setup(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmd*, and *setup_script*
        as setup commands.
        """
        setup = []

        # python_archives
        for path in self._opts['python_archives']:
            path_dict = parse_legacy_hash_path('archive', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:
            setup.append(parse_setup_cmd(cmd))

        # setup_cmds
        if self._opts['setup_cmds']:
            log.warning(
                "setup_cmds is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead.")

        for cmd in self._opts['setup_cmds']:
            if not isinstance(cmd, string_types):
                cmd = cmd_line(cmd)
            setup.append([cmd])

        # setup_scripts
        if self._opts['setup_scripts']:
            log.warning(
                "setup_scripts is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead.")

        for path in self._opts['setup_scripts']:
            path_dict = parse_legacy_hash_path('file', path)
            setup.append([path_dict])

        return setup
Example #43
    def _invoke_processes(self, procs_args, output_path, working_dir, env):
        """invoke the process described by *args* and write to *output_path*

        :param combiner_args: If this mapper has a combiner, we need to do
                              some extra shell wrangling, so pass the combiner
                              arguments in separately.

        :return: dict(proc=Popen, args=[process args], write_to=file)
        """
        log.info('> %s > %s' % (' | '.join(
            args if isinstance(args, basestring) else cmd_line(args)
            for args in procs_args), output_path))

        with open(output_path, 'w') as write_to:
            procs = _chain_procs(procs_args, stdout=write_to, stderr=PIPE,
                                 cwd=working_dir, env=env)
            return [{'args': a, 'proc': proc, 'write_to': write_to}
                    for a, proc in zip(procs_args, procs)]
Example #44
0
    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self._hadoop_bin + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def cleanup():
            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        return read_file(filename, cat_proc.stdout, cleanup=cleanup)
Example #45
0
def _invoke_task_in_subprocess(
        task_type, step_num, task_num,
        args, num_steps,
        stdin, stdout, stderr, wd, env):
    """A pickleable function that invokes a task in a subprocess."""
    log.debug('> %s' % cmd_line(args))

    try:
        check_call(args, stdin=stdin, stdout=stdout, stderr=stderr,
                   cwd=wd, env=env)
    except Exception as ex:
        raise _TaskFailedException(
            reason=str(ex),
            step_num=step_num,
            num_steps=num_steps,
            task_type=task_type,
            task_num=task_num,
        )
Example #46
0
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_type = step['type']
            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            if self._step_type_uses_spark(step_type):
                returncode, step_interpretation = self._run_spark_submit(
                    step_args, env, record_callback=_log_log4j_record)
            else:
                returncode, step_interpretation = self._run_hadoop(
                    step_args, env, record_callback=_log_record_from_hadoop)

            # make sure output_dir is filled (used for history log)
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
Example #47
0
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr.
        stderr = mr_job.stderr.getvalue()

        # stderr is huge, so don't use assertIn()
        self.assertTrue(b'import mrjob' in stderr or  # Python 2
                        b"import 'mrjob'" in stderr)  # Python 3
        self.assertTrue(b'#' in stderr)

        # should still get expected results
        self.assertEqual(sorted(mr_job.stdout.getvalue().splitlines()),
                         sorted([b'1\tnull', b'1\t"bar"']))
Example #48
0
    def _stream_output(self):
        output_dir = posixpath.join(self._output_dir, 'part-*')
        log.info('Streaming output from %s from HDFS' % output_dir)

        cat_args = [self._opts['hadoop_bin'], 'fs', '-cat', output_dir]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for line in cat_proc.stdout:
            yield line

        # there shouldn't be any stderr
        for line in cat_proc.stderr:
            log.error('STDERR: ' + line)

        returncode = cat_proc.wait()

        if returncode != 0:
            raise CalledProcessError(returncode, cat_args)
Example #49
0
    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for chunk in decompress(cat_proc.stdout, filename):
            yield chunk

        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)
Example #50
0
    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        return read_file(filename, cat_proc.stdout, cleanup=cleanup)
Example #51
0
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with mr_job.make_runner() as runner:
            runner.run()

            # expect python -v crud in stderr

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(
                    any('import mrjob' in line or  # Python 2
                        "import 'mrjob'" in line for line in lines))

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any('#' in line for line in lines))

            # should still get expected results
            self.assertEqual(sorted(to_lines(runner.cat_output())),
                             sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #52
0
    def _parse_setup(self):
        """Helper for :py:meth:`_create_setup_wrapper_script`.

        Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmd*, and *setup_script*
        as setup commands.
        """
        setup = []

        # python_archives
        for path in self._opts['python_archives']:
            path_dict = parse_legacy_hash_path('archive', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:
            setup.append(parse_setup_cmd(cmd))

        # setup_cmds
        for cmd in self._opts['setup_cmds']:
            if not isinstance(cmd, basestring):
                cmd = cmd_line(cmd)
            setup.append([cmd])

        # setup_scripts
        for path in self._opts['setup_scripts']:
            path_dict = parse_legacy_hash_path('file', path)
            setup.append([path_dict])

        return setup
Example #53
0
    def _substep_args(self, step_num, mrc):
        step = self._get_step(step_num)

        if step[mrc]['type'] == 'command':
            cmd = step[mrc]['command']

            # never wrap custom hadoop streaming commands in bash
            if isinstance(cmd, string_types):
                return shlex_split(cmd)
            else:
                return cmd

        elif step[mrc]['type'] == 'script':
            script_args = self._script_args_for_step(step_num, mrc)

            if 'pre_filter' in step[mrc]:
                return self._sh_wrap(
                    '%s | %s' %
                    (step[mrc]['pre_filter'], cmd_line(script_args)))
            else:
                return script_args
        else:
            raise ValueError("Invalid %s step %d: %r" %
                             (mrc, step_num, step[mrc]))
Example #54
0
    def _load_steps(self):
        if not self._script_path:
            return []

        args = (self._executable(True) + ['--steps'] +
                self._mr_job_extra_args(local=True))
        log.debug('> %s' % cmd_line(args))
        # add . to PYTHONPATH (in case mrjob isn't actually installed)
        env = combine_local_envs(os.environ,
                                 {'PYTHONPATH': os.path.abspath('.')})
        steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
        stdout, stderr = steps_proc.communicate()

        if steps_proc.returncode != 0:
            raise Exception(
                'error getting step information: \n%s' % stderr)

        # on Python 3, convert stdout to str so we can json.loads() it
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf_8')

        try:
            steps = json.loads(stdout)
        except ValueError:
            raise ValueError("Bad --steps response: \n%s" % stdout)

        # verify that this is a proper step description
        if not steps or not stdout:
            raise ValueError('step description is empty!')
        for step in steps:
            if step['type'] not in STEP_TYPES:
                raise ValueError(
                    'unexpected step type %r in steps %r' % (
                        step['type'], stdout))

        return steps
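For illustration only (the exact keys shown are assumptions, not taken from the source): the --steps output parsed here is a JSON list of step dicts, and _load_steps() only checks each dict's 'type' key against STEP_TYPES:

import json

# Hypothetical --steps JSON for a two-step streaming job
example_stdout = '''
[
  {"type": "streaming",
   "mapper": {"type": "script"},
   "reducer": {"type": "script"}},
  {"type": "streaming",
   "mapper": {"type": "script"}}
]
'''
steps = json.loads(example_stdout)
assert all('type' in step for step in steps)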
Example #55
0
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = self._env_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())
Example #56
0
    def _invoke_sort(self, input_paths, output_path):
        """Use the local sort command to sort one or more input files. Raise
        an exception if there is a problem.

        This is just a wrapper to handle limitations of Windows sort
        (see Issue #288).

        :type input_paths: list of str
        :param input_paths: paths of one or more input files
        :type output_path: str
        :param output_path: where to pipe sorted output into
        """
        if not input_paths:
            raise ValueError('Must specify at least one input path.')

        # ignore locale when sorting
        env = os.environ.copy()
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = self._opts['local_tmp_dir']
        env['TMPDIR'] = self._opts['local_tmp_dir']
        env['TEMP'] = self._opts['local_tmp_dir']

        log.info('writing to %s' % output_path)

        err_path = os.path.join(self._get_local_tmp_dir(), 'sort-stderr')

        # assume we're using UNIX sort unless we know otherwise
        if (not self._sort_is_windows_sort) or len(input_paths) == 1:
            with open(output_path, 'wb') as output:
                with open(err_path, 'wb') as err:
                    args = ['sort'] + list(input_paths)
                    log.info('> %s' % cmd_line(args))
                    try:
                        check_call(args, stdout=output, stderr=err, env=env)
                        return
                    except CalledProcessError:
                        pass

        # Looks like we're using Windows sort
        self._sort_is_windows_sort = True

        log.info('Piping files into sort for Windows compatibility')
        with open(output_path, 'wb') as output:
            with open(err_path, 'wb') as err:
                args = ['sort']
                log.info('> %s' % cmd_line(args))
                proc = Popen(args, stdin=PIPE, stdout=output, stderr=err,
                             env=env)

                # shovel bytes into the sort process
                for input_path in input_paths:
                    with open(input_path, 'rb') as input:
                        while True:
                            buf = input.read(_BUFFER_SIZE)
                            if not buf:
                                break
                            proc.stdin.write(buf)

                proc.stdin.close()
                proc.wait()

                if proc.returncode == 0:
                    return

        # looks like there was a problem. log it and raise an error
        with open(err_path) as err:
            for line in err:
                log.error('STDERR: %s' % line.rstrip('\r\n'))
        raise CalledProcessError(proc.returncode, args)
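As a side note, a minimal sketch of why LC_ALL=C is forced above: it makes sort compare raw bytes rather than locale collation order, so results are deterministic across machines (this uses only standard UNIX sort, nothing mrjob-specific):

import os
from subprocess import check_output

env = dict(os.environ, LC_ALL='C')

# bytewise ordering: uppercase letters sort before lowercase ones
out = check_output(['sort'], input=b'apple\nBanana\n', env=env)
assert out == b'Banana\napple\n'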
Example #57
0
    def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None):
        """Return a (Bourne) shell script that runs the setup commands and then
        executes whatever is passed to it (this will be our mapper/reducer),
        as a list of strings (one for each line, including newlines).

        We obtain a file lock so that two copies of the setup commands
        cannot run simultaneously on the same machine (this helps for running
        :command:`make` on a shared source code archive, for example).
        """
        out = []

        def writeln(line=''):
            out.append(line + '\n')

        # we're always going to execute this script as an argument to
        # sh, so there's no need to add a shebang (e.g. #!/bin/sh)

        writeln('# store $PWD')
        writeln('__mrjob_PWD=$PWD')
        writeln()

        writeln('# obtain exclusive file lock')
        # Basically, we're going to tie file descriptor 9 to our lockfile,
        # use a subprocess to obtain a lock (which we somehow inherit too),
        # and then release the lock by closing the file descriptor.
        # File descriptors 10 and higher are used internally by the shell,
        # so 9 is as out-of-the-way as we can get.
        writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_key)
        # would use flock(1), but it's not always available
        writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                cmd_line(self._python_bin()))
        writeln()

        writeln('# setup commands')
        # group setup commands so we can redirect their input/output (see
        # below). Don't use parens; this would invoke a subshell, which would
        # keep us from exporting environment variables to the task.
        writeln('{')
        for cmd in setup:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '  # indent, since these commands are in a group
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._working_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            writeln(line)
        # redirect setup commands' input/output so they don't interfere
        # with the task (see Issue #803).
        writeln('} 0</dev/null 1>&2')
        writeln()

        writeln('# release exclusive file lock')
        writeln('exec 9>&-')
        writeln()

        writeln('# run task from the original working directory')
        writeln('cd $__mrjob_PWD')
        writeln('"$@"')

        return out
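A hedged illustration, reconstructed from the writeln() calls above with a made-up job key and a single raw setup command; the actual output depends on the configured python_bin, job key, and working-dir names:

# Hypothetical rendering for setup command ['make -C src'] and job key
# 'my_job.user.20200101.000000.000000'
_EXAMPLE_WRAPPER_SCRIPT = """\
# store $PWD
__mrjob_PWD=$PWD

# obtain exclusive file lock
exec 9>/tmp/wrapper.lock.my_job.user.20200101.000000.000000
python -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'

# setup commands
{
  make -C src
} 0</dev/null 1>&2

# release exclusive file lock
exec 9>&-

# run task from the original working directory
cd $__mrjob_PWD
"$@"
"""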
Example #58
0
    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark submit binary in a subprocess, using a PTY if possible

        :param spark_submit_args: spark-submit binary and arguments, as a list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records`)

        :return: tuple of the subprocess's return code and a
                 step interpretation dictionary
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        # these should always be set, but just in case
        returncode = 0
        step_interpretation = {}

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(
                spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

            # parse driver output
            step_interpretation = _parse_spark_log(
                step_proc.stderr, record_callback=record_callback)

            # there shouldn't be much output on STDOUT, just echo it
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(spark_submit_args[0], spark_submit_args, env)
                    # now this process is no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we get some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    step_interpretation = (
                        _parse_spark_log(
                            _eio_to_eof(master),
                            record_callback=record_callback))

                    _, returncode = os.waitpid(pid, 0)

        return (returncode, step_interpretation)
Example #59
0
    def test_cmd_line(self):
        self.assertEqual(cmd_line(['cut', '-f', 2, '-d', ' ']),
                         "cut -f 2 -d ' '")
        self.assertIn(
            cmd_line(['grep', '-e', "# DON'T USE$"]),
            ("grep -e \"# DON'T USE\\$\"", 'grep -e \'# DON\'"\'"\'T USE$\''))