def _create_master_bootstrap_script_if_needed(self):
    """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

    Create the master bootstrap script and write it into our local
    temp directory. Set self._master_bootstrap_script_path.

    This will do nothing if there are no bootstrap scripts or commands,
    or if it has already been called."""
    if self._master_bootstrap_script_path:
        return

    # don't bother if we're not starting a cluster
    if self._cluster_id:
        return

    # Also don't bother if we're not bootstrapping
    if not (self._bootstrap or self._bootstrap_mrjob()):
        return

    # create mrjob.zip if we need it, and add commands to install it
    mrjob_bootstrap = []
    if self._bootstrap_mrjob():
        assert self._mrjob_zip_path
        path_dict = {
            'type': 'file', 'name': None, 'path': self._mrjob_zip_path}
        self._bootstrap_dir_mgr.add(**path_dict)

        # find out where python keeps its libraries
        mrjob_bootstrap.append([
            "__mrjob_PYTHON_LIB=$(%s -c "
            "'from distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')" % cmd_line(self._python_bin())])

        # unzip mrjob.zip
        mrjob_bootstrap.append(
            ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])

        # re-compile pyc files now, since mappers/reducers can't
        # write to this directory. Don't fail if there is extra
        # un-compileable crud in the tarball (this would matter if
        # sh_bin were 'sh -e')
        mrjob_bootstrap.append(
            ['sudo %s -m compileall -q'
             ' -f $__mrjob_PYTHON_LIB/mrjob && true' %
             cmd_line(self._python_bin())])

    # we call the script b.py because there's a character limit on
    # bootstrap script names (or there was at one time, anyway)
    path = os.path.join(self._get_local_tmp_dir(), 'b.py')
    log.info('writing master bootstrap script to %s' % path)

    contents = self._master_bootstrap_script_content(
        self._bootstrap + mrjob_bootstrap)
    for line in contents:
        log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))

    with open(path, 'w') as f:
        for line in contents:
            f.write(line)

    self._master_bootstrap_script_path = path
def _spark_cmdenv(self, step_num):
    """Returns a dictionary mapping environment variable to value,
    including mapping PYSPARK_PYTHON to self._python_bin()
    """
    step = self._get_step(step_num)

    cmdenv = {}

    if step['type'] in ('spark', 'spark_script'):  # not spark_jar
        driver_python = cmd_line(self._python_bin())

        if self._spark_python_wrapper_path:
            executor_python = './%s' % self._working_dir_mgr.name(
                'file', self._spark_python_wrapper_path)
        else:
            executor_python = cmd_line(self._task_python_bin())

        if self._spark_deploy_mode() == 'cluster':
            # treat driver like executors (they run in same environment)
            cmdenv['PYSPARK_PYTHON'] = executor_python
        elif driver_python == executor_python:
            # no difference, just set $PYSPARK_PYTHON
            cmdenv['PYSPARK_PYTHON'] = driver_python
        else:
            # set different pythons for driver and executor
            cmdenv['PYSPARK_PYTHON'] = executor_python
            cmdenv['PYSPARK_DRIVER_PYTHON'] = driver_python

    cmdenv.update(self._opts['cmdenv'])

    return cmdenv
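# Illustrative sketch (not part of mrjob): for a client-mode Spark step
# whose driver and executors resolve to different Python binaries, the
# dict returned above would look roughly like this. The wrapper file
# name and binary names are assumptions, not values from the code.
#
#   {
#       'PYSPARK_PYTHON': './python_wrapper.sh',   # executors
#       'PYSPARK_DRIVER_PYTHON': 'python3',        # driver
#   }
#
# In cluster deploy mode only PYSPARK_PYTHON is set, since the driver
# runs in the same environment as the executors.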
def _run_job_in_hadoop(self): # figure out local names for our files self._name_files() # send script and wrapper script (if any) to working dir assert self._script # shouldn't be able to run if no script self._script['upload'] = 'file' if self._wrapper_script: self._wrapper_script['upload'] = 'file' steps = self._get_steps() for step_num, step in enumerate(steps): log.debug('running step %d of %d' % (step_num+1, len(steps))) streaming_args = [self._opts['hadoop_bin'], 'jar', self._opts['hadoop_streaming_jar']] # Add extra hadoop args first as hadoop args could be a hadoop # specific argument (e.g. -libjar) which must come before job # specific args. streaming_args.extend( self._hadoop_conf_args(step_num, len(steps))) # setup input for input_uri in self._hdfs_step_input_files(step_num): streaming_args.extend(['-input', input_uri]) # setup output streaming_args.append('-output') streaming_args.append(self._hdfs_step_output_dir(step_num)) # set up uploading from HDFS to the working dir streaming_args.extend(self._upload_args()) # set up mapper and reducer streaming_args.append('-mapper') streaming_args.append(cmd_line(self._mapper_args(step_num))) if 'R' in step: streaming_args.append('-reducer') streaming_args.append(cmd_line(self._reducer_args(step_num))) else: streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0']) log.debug('> %s' % cmd_line(streaming_args)) step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE) # TODO: use a pty or something so that the hadoop binary # won't buffer the status messages self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + line.strip('\n')) returncode = step_proc.wait() if returncode != 0: raise CalledProcessError(step_proc.returncode, streaming_args)
def _setup_wrapper_script_content(
        self, setup, manifest=False, wrap_python=False):
    """Return a (Bourne) shell script that runs the setup commands and
    then executes whatever is passed to it (this will be our
    mapper/reducer), as a list of strings (one for each line, including
    newlines).

    We obtain a file lock so that two copies of the setup commands
    cannot run simultaneously on the same machine (this helps for
    running :command:`make` on a shared source code archive, for
    example).
    """
    lines = []

    # TODO: this is very similar to _start_of_sh_script() in cloud.py

    if wrap_python:
        # start with shebang
        sh_bin = self._sh_bin()

        if os.path.isabs(sh_bin[0]):
            shebang_bin = sh_bin
        else:
            shebang_bin = ['/usr/bin/env'] + list(sh_bin)

        if len(shebang_bin) > 2:
            # Linux limits shebang to one binary and one arg
            shebang_bin = shebang_bin[:2]
            log.warning('Limiting shebang to two arguments:'
                        '#!%s' % cmd_line(shebang_bin))

        lines.append('#!%s' % cmd_line(shebang_bin))

    # hook for 'set -e', etc.
    pre_commands = self._sh_pre_commands()
    if pre_commands:
        for cmd in pre_commands:
            lines.append(cmd)
        lines.append('')

    if setup:
        lines.extend(self._setup_cmd_content(setup))

    # handle arguments to the script
    if wrap_python:
        # pretend to be python ($@ is arguments to the python binary)
        python_bin = self._task_python_bin()
        lines.append('%s "$@"' % cmd_line(python_bin))
    elif manifest:
        # arguments ($@) are a command
        # eventually runs: "$@" $INPUT_PATH $INPUT_URI
        lines.extend(self._manifest_download_content())
    else:
        # arguments ($@) are a command, just run it
        lines.append('"$@"')

    return lines
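# Rough shape of the script the method above produces with
# wrap_python=True (a sketch: the shebang assumes sh_bin=['/bin/sh', '-ex']
# and python3 as the task Python; the middle block comes from
# _setup_cmd_content()):
#
#   #!/bin/sh -ex
#   ... obtain file lock, run setup commands, release lock ...
#   python3 "$@"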
def _invoke_process(self, args, outfile_name, env,
                    combiner_args=None):
    """invoke the process described by *args* and write to
    *outfile_name*

    :param combiner_args: If this mapper has a combiner, we need to do
                          some extra shell wrangling, so pass the
                          combiner arguments in separately.

    :return: list of dicts with the format
             {'proc': Popen, 'args': [process args]}
    """
    if combiner_args:
        log.info('> %s | sort | %s' %
                 (cmd_line(args), cmd_line(combiner_args)))
    else:
        log.info('> %s' % cmd_line(args))

    # set up outfile
    outfile = os.path.join(self._get_local_tmp_dir(), outfile_name)
    log.info('writing to %s' % outfile)

    self._prev_outfiles.append(outfile)

    with open(outfile, 'w') as write_to:
        if combiner_args:
            # set up a pipeline: mapper | sort | combiner
            mapper_proc = Popen(args, stdout=PIPE, stderr=PIPE,
                                cwd=self._working_dir, env=env)

            sort_proc = Popen(['sort'], stdin=mapper_proc.stdout,
                              stdout=PIPE, stderr=PIPE,
                              cwd=self._working_dir, env=env)

            combiner_proc = Popen(combiner_args, stdin=sort_proc.stdout,
                                  stdout=write_to, stderr=PIPE,
                                  cwd=self._working_dir, env=env)

            # this process shouldn't read from the pipes
            mapper_proc.stdout.close()
            sort_proc.stdout.close()

            return [
                {'proc': mapper_proc, 'args': args},
                {'proc': sort_proc, 'args': ['sort']},
                {'proc': combiner_proc, 'args': combiner_args},
            ]
        else:
            # just run the mapper process
            proc = Popen(args, stdout=write_to, stderr=PIPE,
                         cwd=self._working_dir, env=env)

            return [{'proc': proc, 'args': args}]
def _substep_args(self, step_num, mrc): step = self._get_step(step_num) if step[mrc]['type'] == 'command': cmd = step[mrc]['command'] # never wrap custom hadoop streaming commands in bash if isinstance(cmd, string_types): return shlex_split(cmd) else: return cmd elif step[mrc]['type'] == 'script': script_args = self._script_args_for_step( step_num, mrc, input_manifest=step.get('input_manifest')) if 'pre_filter' in step[mrc]: return self._sh_wrap( '%s | %s' % (step[mrc]['pre_filter'], cmd_line(script_args))) else: return script_args else: raise ValueError("Invalid %s step %d: %r" % ( mrc, step_num, step[mrc]))
def _load_steps(self): args = (self._executable(True) + ['--steps'] + self._mr_job_extra_args(local=True)) log.debug('> %s' % cmd_line(args)) # add . to PYTHONPATH (in case mrjob isn't actually installed) env = combine_local_envs(os.environ, {'PYTHONPATH': os.path.abspath('.')}) steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env) stdout, stderr = steps_proc.communicate() if steps_proc.returncode != 0: raise Exception( 'error getting step information: \n%s' % stderr) # on Python 3, convert stdout to str so we can json.loads() it if not isinstance(stdout, str): stdout = stdout.decode('utf_8') try: steps = json.loads(stdout) except ValueError: raise ValueError("Bad --steps response: \n%s" % stdout) # verify that this is a proper step description if not steps or not stdout: raise ValueError('step description is empty!') return steps
def test_python_dash_v_as_python_bin(self): python_cmd = cmd_line([sys.executable or 'python', '-v']) mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf', '-r', 'local']) mr_job.sandbox(stdin=[b'bar\n']) with mr_job.make_runner() as runner: runner.run() # expect python -v crud in stderr with open(runner._task_stderr_path('mapper', 0, 0)) as lines: self.assertTrue(any( 'import mrjob' in line or # Python 2 "import 'mrjob'" in line for line in lines)) with open(runner._task_stderr_path('mapper', 0, 0)) as lines: self.assertTrue(any( '#' in line for line in lines)) # should still get expected results self.assertEqual( sorted(to_lines(runner.cat_output())), sorted([b'1\tnull\n', b'1\t"bar"\n']))
def _cat_file(self, filename):
    if is_uri(filename):
        # stream from HDFS
        cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()

            if returncode != 0:
                raise CalledProcessError(returncode, cat_args)

        return read_file(filename, stream())
    else:
        # read from local filesystem
        return super(HadoopJobRunner, self)._cat_file(filename)
def _parse_setup(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`. If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both true, create mrjob.tar.gz (if it doesn't exist already) and prepend a setup command that adds it to PYTHONPATH. Also patch in the deprecated options *python_archives*, *setup_cmd*, and *setup_script* as setup commands. """ setup = [] # python_archives for path in self._opts['python_archives']: path_dict = parse_legacy_hash_path('archive', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) # setup_cmds for cmd in self._opts['setup_cmds']: if not isinstance(cmd, basestring): cmd = cmd_line(cmd) setup.append([cmd]) # setup_scripts for path in self._opts['setup_scripts']: path_dict = parse_legacy_hash_path('file', path) setup.append([path_dict]) return setup
def _run_job_in_hadoop(self): self._counters = [] for step_num in range(self._num_steps()): log.debug("running step %d of %d" % (step_num + 1, self._num_steps())) step_args = self._args_for_step(step_num) log.debug("> %s" % cmd_line(step_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error("STDOUT: " + to_string(line.strip(b"\n"))) returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], step_args) else: with os.fdopen(master_fd, "rb") as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) self._process_stderr_from_streaming(master) _, returncode = os.waitpid(pid, 0) if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = "Job failed with return code %d: %s" % (returncode, step_args) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure([step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"]) cause_msg.extend(line.strip("\n") for line in cause["lines"]) if cause["input_uri"]: cause_msg.append("(while reading from %s)" % cause["input_uri"]) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += "\n" + "\n".join(cause_msg) + "\n" raise CalledProcessError(returncode, step_args)
def archive_and_unarchive(self, extension, archive_template, added_files=[]): join = os.path.join # archive it up archive_name = 'a.' + extension variables = dict(archive_name=join('..', archive_name), files_to_archive='.') archive_command = [arg % variables for arg in archive_template] # sometime the relevant command isn't available or doesn't work; # if so, skip the test try: proc = Popen(archive_command, cwd=join(self.tmp_dir, 'a'), stdout=PIPE, stderr=PIPE) except OSError as e: if e.errno == 2: self.skipTest("No %s command" % archive_command[0]) else: raise proc.communicate() # discard output if proc.returncode != 0: self.skipTest("Can't run `%s` to create archive." % cmd_line(archive_command)) # unarchive it into b/ unarchive(join(self.tmp_dir, archive_name), join(self.tmp_dir, 'b')) self.ensure_expected_results(added_files=added_files)
def _run_spark_submit(self, spark_submit_args, env, record_callback):
    """Run the spark submit binary in a subprocess, using a PTY
    if possible

    :param spark_submit_args: spark-submit binary and arguments, as a
                              list
    :param env: environment variables, as a dict
    :param record_callback: a function that takes a single log4j record
                            as its argument (see
                            :py:func:`~mrjob.logs.log4j\
                            ._parse_hadoop_log4j_records)

    :return: the subprocess's return code
    """
    log.debug('> %s' % cmd_line(spark_submit_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    returncode = 0  # should always be set, but just in case

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(
            spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

        for record in _parse_hadoop_log4j_records(
                _yield_lines_from_pty_or_pipe(step_proc.stderr)):
            record_callback(record)

        # there shouldn't be much output on STDOUT
        for record in _parse_hadoop_log4j_records(step_proc.stdout):
            record_callback(record)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:  # we are the child process
            os.execvpe(spark_submit_args[0], spark_submit_args, env)
            # now this process is no longer Python
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                for record in _parse_hadoop_log4j_records(
                        _yield_lines_from_pty_or_pipe(master)):
                    record_callback(record)
                _, returncode = os.waitpid(pid, 0)

    return returncode
def _run_job_in_hadoop(self): self._counters = [] steps = self._get_steps() for step_num, step in enumerate(steps): log.debug('running step %d of %d' % (step_num + 1, len(steps))) streaming_args = self._streaming_args(step, step_num, len(steps)) log.debug('> %s' % cmd_line(streaming_args)) master, slave = pty.openpty() step_proc = Popen(streaming_args, stdout=PIPE, stderr=slave) stderr = os.fdopen(master) self._process_stderr_from_streaming(step_proc, stderr) stderr.close() # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + line.strip('\n')) returncode = step_proc.wait() if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = ('Job failed with return code %d: %s' % (step_proc.returncode, streaming_args)) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure( [step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append('Probable cause of failure (from %s):' % cause['log_file_uri']) cause_msg.extend(line.strip('\n') for line in cause['lines']) if cause['input_uri']: cause_msg.append('(while reading from %s)' % cause['input_uri']) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += '\n' + '\n'.join(cause_msg) + '\n' raise Exception(msg) raise CalledProcessError(step_proc.returncode, streaming_args)
def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None): """Return a (Bourne) shell script that runs the setup commands and then executes whatever is passed to it (this will be our mapper/reducer). We obtain a file lock so that two copies of the setup commands cannot run simultaneously on the same machine (this helps for running :command:`make` on a shared source code archive, for example). """ out = StringIO() def writeln(line=''): out.write(line + '\n') # we're always going to execute this script as an argument to # sh, so there's no need to add a shebang (e.g. #!/bin/sh) writeln('# store $PWD') writeln('__mrjob_PWD=$PWD') writeln('') writeln('# obtain exclusive file lock') # Basically, we're going to tie file descriptor 9 to our lockfile, # use a subprocess to obtain a lock (which we somehow inherit too), # and then release the lock by closing the file descriptor. # File descriptors 10 and higher are used internally by the shell, # so 9 is as out-of-the-way as we can get. writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_name) # would use flock(1), but it's not always available writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" % cmd_line(self._opts['python_bin'])) writeln() writeln('# setup commands') for cmd in setup: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = '' for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._working_dir_mgr.name(**token)) else: # it's raw script line += token writeln(line) writeln() writeln('# release exclusive file lock') writeln('exec 9>&-') writeln() writeln('# run job from the original working directory') writeln('cd $__mrjob_PWD') writeln('"$@"') return out.getvalue()
def _setup_cmd_content(self, setup):
    """Write setup script content to obtain a file lock, run setup
    commands in a way that doesn't perturb the script, and then
    release the lock and return to the original working directory."""
    lines = []

    lines.append('# store $PWD')
    lines.append('__mrjob_PWD=$PWD')
    lines.append('')

    lines.append('# obtain exclusive file lock')
    # Basically, we're going to tie file descriptor 9 to our lockfile,
    # use a subprocess to obtain a lock (which we somehow inherit too),
    # and then release the lock by closing the file descriptor.
    # File descriptors 10 and higher are used internally by the shell,
    # so 9 is as out-of-the-way as we can get.
    lines.append('exec 9>/tmp/wrapper.lock.%s' % self._job_key)
    # would use flock(1), but it's not always available
    lines.append("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                 cmd_line(self._python_bin()))
    lines.append('')

    lines.append('# setup commands')
    # group setup commands so we can redirect their input/output (see
    # below). Don't use parens; this would invoke a subshell, which would
    # keep us from exporting environment variables to the task.
    lines.append('{')
    for cmd in setup:
        # reconstruct the command line, substituting $__mrjob_PWD/<name>
        # for path dicts
        line = '  '  # indent, since these commands are in a group
        for token in cmd:
            if isinstance(token, dict):
                # it's a path dictionary
                line += '$__mrjob_PWD/'
                line += pipes.quote(self._working_dir_mgr.name(**token))
            else:
                # it's raw script
                line += token
        lines.append(line)
    # redirect setup commands' input/output so they don't interfere
    # with the task (see Issue #803).
    lines.append('} 0</dev/null 1>&2')
    lines.append('')

    lines.append('# release exclusive file lock')
    lines.append('exec 9>&-')
    lines.append('')

    lines.append('# run task from the original working directory')
    lines.append('cd $__mrjob_PWD')

    return lines
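# Example of the shell lines _setup_cmd_content() emits for a single
# setup command such as `make -C src` (the job key and python binary
# shown here are made up for illustration):
#
#   # store $PWD
#   __mrjob_PWD=$PWD
#
#   # obtain exclusive file lock
#   exec 9>/tmp/wrapper.lock.mr_my_job.user.20240101.000000.000000
#   python3 -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'
#
#   # setup commands
#   {
#     make -C src
#   } 0</dev/null 1>&2
#
#   # release exclusive file lock
#   exec 9>&-
#
#   # run task from the original working directory
#   cd $__mrjob_PWD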
def _invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None, return_stdout=False): """Run the given hadoop command, raising an exception on non-zero return code. This only works for commands whose output we don't care about. Args: ok_returncodes -- a list/tuple/set of return codes we expect to get back from hadoop (e.g. [0,1]). By default, we only expect 0. If we get an unexpected return code, we raise a CalledProcessError. ok_stderr -- don't log STDERR or raise CalledProcessError if stderr matches a regex in this list (even if the returncode is bad) return_stdout -- return the stdout from the hadoop command rather than logging it. If this is False, we return the returncode instead. """ if args[0] == 'fs': if self._opts['hdfs_namenode']: args = [args[0]] + ['-fs', self._opts['hdfs_namenode']] + args[1:] args = self._opts['hadoop_bin'] + args log.debug('> %s' % cmd_line(args)) proc = Popen(args, stdout=PIPE, stderr=PIPE) stdout, stderr = proc.communicate() log_func = log.debug if proc.returncode == 0 else log.error if not return_stdout: for line in StringIO(stdout): log_func('STDOUT: ' + line.rstrip('\r\n')) # check if STDERR is okay stderr_is_ok = False if ok_stderr: for stderr_re in ok_stderr: if stderr_re.match(stderr): stderr_is_ok = True break if not stderr_is_ok: for line in StringIO(stderr): log_func('STDERR: ' + line.rstrip('\r\n')) ok_returncodes = ok_returncodes or [0] if not stderr_is_ok and proc.returncode not in ok_returncodes: raise CalledProcessError(proc.returncode, args) if return_stdout: return stdout else: return proc.returncode
def _spark_cmdenv(self, step_num):
    """Returns a dictionary mapping environment variable to value,
    including mapping PYSPARK_PYTHON to self._python_bin()
    """
    step = self._get_step(step_num)

    cmdenv = {}

    if step['type'] in ('spark', 'spark_script'):  # not spark_jar
        cmdenv = dict(PYSPARK_PYTHON=cmd_line(self._python_bin()))

    cmdenv.update(self._opts['cmdenv'])

    return cmdenv
def _ssh_launch(self, address, cmd_args, stdin=None):
    """Copy SSH keys if necessary, then launch the given command
    over SSH and return a Popen."""
    self._ssh_copy_key(address)

    args = self._ssh_cmd_args(address, cmd_args)

    log.debug('  > ' + cmd_line(args))
    try:
        return Popen(args, stdout=PIPE, stderr=PIPE, stdin=stdin)
    except OSError as ex:
        raise IOError(ex.strerror)
def _render_substep(self, cmd_key, pre_filter_key=None):
    if self._steps[cmd_key]:
        cmd = self._steps[cmd_key]
        if not isinstance(cmd, string_types):
            cmd = cmd_line(cmd)
        if pre_filter_key and self._steps[pre_filter_key]:
            raise ValueError("Cannot specify both %s and %s" %
                             (cmd_key, pre_filter_key))
        return {"type": "command", "command": cmd}
    else:
        substep = {"type": "script"}
        if pre_filter_key and self._steps[pre_filter_key]:
            substep["pre_filter"] = self._steps[pre_filter_key]
        return substep
def test_python_dash_v_as_python_bin(self): python_cmd = cmd_line([sys.executable or 'python', '-v']) mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf']) mr_job.sandbox(stdin=['bar\n']) with no_handlers_for_logger(): mr_job.run_job() # expect debugging messages in stderr assert_in('import mrjob', mr_job.stderr.getvalue()) assert_in('#', mr_job.stderr.getvalue()) # should still get expected results assert_equal(sorted(mr_job.parse_output()), [(1, None), (1, 'bar')])
def _render_substep(self, cmd_key, pre_filter_key=None): if self._steps[cmd_key]: cmd = self._steps[cmd_key] if not isinstance(cmd, basestring): cmd = cmd_line(cmd) if (pre_filter_key and self._steps[pre_filter_key]): raise ValueError('Cannot specify both %s and %s' % (cmd_key, pre_filter_key)) return {'type': 'command', 'command': cmd} else: substep = {'type': 'script'} if (pre_filter_key and self._steps[pre_filter_key]): substep['pre_filter'] = self._steps[pre_filter_key] return substep
def _sort_lines_with_sort_bin(input_paths, output_path, sort_bin,
                              sort_values=False, tmp_dir=None):
    """Sort lines from the given *input_paths* into *output_path*,
    using *sort_bin*. If there is a problem, fall back to in-memory
    sort.

    This is a helper for :py:meth:`LocalMRJobRunner._sort_input_func`.

    *tmp_dir* determines the value of :envvar:`$TMP` and
    :envvar:`$TMPDIR` that *sort_bin* sees.
    """
    if input_paths:
        env = os.environ.copy()

        # ignore locale when sorting
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = tmp_dir
        env['TMPDIR'] = tmp_dir

        with open(output_path, 'wb') as output:
            args = sort_bin + list(input_paths)
            log.debug('> %s' % cmd_line(args))

            try:
                check_call(args, stdout=output, env=env)
                return
            except CalledProcessError:
                log.error(
                    '`%s` failed, falling back to in-memory sort' %
                    cmd_line(sort_bin))
            except OSError:
                log.error(
                    'no sort binary, falling back to in-memory sort')

    _sort_lines_in_memory(input_paths, output_path,
                          sort_values=sort_values)
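# A minimal standalone sketch of the same pattern (not mrjob code):
# shell out to `sort` with LC_ALL=C so bytes compare consistently, and
# fall back to an in-memory sort if the binary is missing or fails.
# The function and variable names below are made up for illustration.
import os
from subprocess import CalledProcessError, check_call


def sort_files_with_sort_bin(input_paths, output_path, sort_bin=('sort',)):
    env = os.environ.copy()
    env['LC_ALL'] = 'C'  # ignore locale; compare raw bytes

    try:
        with open(output_path, 'wb') as output:
            # e.g. ['sort', 'part-00000', 'part-00001']
            check_call(list(sort_bin) + list(input_paths),
                       stdout=output, env=env)
        return
    except (CalledProcessError, OSError):
        pass  # fall through to in-memory sort

    lines = []
    for path in input_paths:
        with open(path, 'rb') as f:
            lines.extend(f)

    with open(output_path, 'wb') as output:
        output.writelines(sorted(lines))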
def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None, return_stdout=False): """Run the given hadoop command, raising an exception on non-zero return code. This only works for commands whose output we don't care about. Args: ok_returncodes -- a list/tuple/set of return codes we expect to get back from hadoop (e.g. [0,1]). By default, we only expect 0. If we get an unexpected return code, we raise a CalledProcessError. ok_stderr -- don't log STDERR or raise CalledProcessError if stderr matches a regex in this list (even if the returncode is bad) return_stdout -- return the stdout from the hadoop command rather than logging it. If this is False, we return the returncode instead. """ args = self._hadoop_bin + args log.debug('> %s' % cmd_line(args)) proc = Popen(args, stdout=PIPE, stderr=PIPE) stdout, stderr = proc.communicate() log_func = log.debug if proc.returncode == 0 else log.error if not return_stdout: for line in StringIO(stdout): log_func('STDOUT: ' + line.rstrip('\r\n')) # check if STDERR is okay stderr_is_ok = False if ok_stderr: for stderr_re in ok_stderr: if stderr_re.match(stderr): stderr_is_ok = True break if not stderr_is_ok: for line in StringIO(stderr): log_func('STDERR: ' + line.rstrip('\r\n')) ok_returncodes = ok_returncodes or [0] if not stderr_is_ok and proc.returncode not in ok_returncodes: raise CalledProcessError(proc.returncode, args) if return_stdout: return stdout else: return proc.returncode
def test_python_dash_v_as_python_bin(self): python_cmd = cmd_line([sys.executable or 'python', '-v']) mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf']) mr_job.sandbox(stdin=['bar\n']) with no_handlers_for_logger(): mr_job.run_job() # expect debugging messages in stderr self.assertIn('import mrjob', mr_job.stderr.getvalue()) self.assertIn('#', mr_job.stderr.getvalue()) # should still get expected results self.assertEqual(sorted(mr_job.parse_output()), [(1, None), (1, 'bar')])
def _start_of_sh_script(self):
    """Return a list of lines (without trailing newlines) containing the
    shell script shebang and pre-commands."""
    out = []

    # shebang
    sh_bin = self._sh_bin()
    if not sh_bin[0].startswith('/'):
        sh_bin = ['/usr/bin/env'] + sh_bin
    out.append('#!' + cmd_line(sh_bin))

    # hook for 'set -e', etc. (see #1549)
    out.extend(self._sh_pre_commands())

    return out
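# For example, with sh_bin=['sh', '-ex'] (an assumed setting) the list
# returned above starts with:
#
#   #!/usr/bin/env sh -ex
#
# followed by any configured pre-commands, such as 'set -e'.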
def _ssh_run(ssh_bin, address, ec2_key_pair_file, cmd_args, stdin=''):
    """Shortcut to call ssh on a Hadoop node via ``subprocess``.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to
                              ``-i``)
    :param cmd_args: The command you want to run
    :param stdin: String to pass to the process's standard input

    :return: (stdout, stderr)
    """
    args = _ssh_args(ssh_bin, address, ec2_key_pair_file) + list(cmd_args)

    log.debug('> %s' % cmd_line(args))

    p = Popen(args, stdout=PIPE, stderr=PIPE, stdin=PIPE)
    return p.communicate(stdin)
def _render_substep(self, cmd_key, pre_filter_key=None): if self._steps[cmd_key]: cmd = self._steps[cmd_key] if not isinstance(cmd, string_types): cmd = cmd_line(cmd) if (pre_filter_key and self._steps[pre_filter_key]): raise ValueError('Cannot specify both %s and %s' % ( cmd_key, pre_filter_key)) return {'type': 'command', 'command': cmd} else: substep = {'type': 'script'} if (pre_filter_key and self._steps[pre_filter_key]): substep['pre_filter'] = self._steps[pre_filter_key] return substep
def _harness_job(self, job_class, input_bytes=b'', input_paths=(), runner_alias='inline', compression_codec=None, job_args=None, spark_conf=None, first_step_num=None, last_step_num=None, counter_output_dir=None, num_reducers=None, max_output_files=None, emulate_map_input_file=False, skip_internal_protocol=False): from tests.mr_spark_harness import MRSparkHarness job_class_path = '%s.%s' % (job_class.__module__, job_class.__name__) harness_job_args = ['-r', runner_alias, '--job-class', job_class_path] if spark_conf: for key, value in spark_conf.items(): harness_job_args.append('--jobconf') harness_job_args.append('%s=%s' % (key, value)) if compression_codec: harness_job_args.append('--compression-codec') harness_job_args.append(compression_codec) if job_args: harness_job_args.extend(['--job-args', cmd_line(job_args)]) if first_step_num is not None: harness_job_args.extend(['--first-step-num', str(first_step_num)]) if last_step_num is not None: harness_job_args.extend(['--last-step-num', str(last_step_num)]) if counter_output_dir is not None: harness_job_args.extend( ['--counter-output-dir', counter_output_dir]) if num_reducers is not None: harness_job_args.extend( ['--num-reducers', str(num_reducers)]) if max_output_files is not None: harness_job_args.extend( ['--max-output-files', str(max_output_files)]) if emulate_map_input_file: harness_job_args.append('--emulate-map-input-file') if skip_internal_protocol: harness_job_args.append('--skip-internal-protocol') harness_job_args.extend(input_paths) harness_job = MRSparkHarness(harness_job_args) harness_job.sandbox(stdin=BytesIO(input_bytes)) return harness_job
def test_python_dash_v_as_python_bin(self): python_cmd = cmd_line([sys.executable or 'python', '-v']) mr_job = MRTwoStepJob( ['--python-bin', python_cmd, '--no-conf', '-r', 'local']) mr_job.sandbox(stdin=['bar\n']) with no_handlers_for_logger(): mr_job.run_job() # expect debugging messages in stderr self.assertIn('import mrjob', mr_job.stderr.getvalue()) self.assertIn('#', mr_job.stderr.getvalue()) # should still get expected results self.assertItemsEqual(mr_job.stdout.getvalue().splitlines(), ['1\tnull', '1\t"bar"'])
def _ssh_add_key(self):
    """Add ``self._ec2_key_pair_file`` to the ssh agent with
    ``ssh-add``."""
    args = self._ssh_add_bin + ['-t', '60', self._ec2_key_pair_file]

    log.debug('  > ' + cmd_line(args))
    try:
        p = Popen(args, stdout=PIPE, stderr=PIPE)
    except OSError as ex:
        raise IOError(ex.strerror)

    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise IOError(to_unicode(stderr))
def _run_job_in_hadoop(self): self._counters = [] steps = self._get_steps() for step_num, step in enumerate(steps): log.debug("running step %d of %d" % (step_num + 1, len(steps))) streaming_args = self._streaming_args(step, step_num, len(steps)) log.debug("> %s" % cmd_line(streaming_args)) step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE) # TODO: use a pty or something so that the hadoop binary # won't buffer the status messages self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error("STDOUT: " + line.strip("\n")) returncode = step_proc.wait() if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = "Job failed with return code %d: %s" % (step_proc.returncode, streaming_args) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure([step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"]) cause_msg.extend(line.strip("\n") for line in cause["lines"]) if cause["input_uri"]: cause_msg.append("(while reading from %s)" % cause["input_uri"]) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += "\n" + "\n".join(cause_msg) + "\n" raise Exception(msg) raise CalledProcessError(step_proc.returncode, streaming_args)
def _get_steps(self): """Call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. Results are cached, so call this as many times as you want. """ if self._steps is None: if not self._script_path: self._steps = [] else: args = (self._executable(True) + ['--steps'] + self._mr_job_extra_args(local=True)) log.debug('> %s' % cmd_line(args)) # add . to PYTHONPATH (in case mrjob isn't actually installed) env = combine_local_envs(os.environ, {'PYTHONPATH': os.path.abspath('.')}) steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env) stdout, stderr = steps_proc.communicate() if steps_proc.returncode != 0: raise Exception( 'error getting step information: \n%s' % stderr) # on Python 3, convert stdout to str so we can json.loads() it if not isinstance(stdout, str): stdout = stdout.decode('utf_8') try: steps = json.loads(stdout) except ValueError: raise ValueError("Bad --steps response: \n%s" % stdout) # verify that this is a proper step description if not steps or not stdout: raise ValueError('step description is empty!') for step in steps: if step['type'] not in STEP_TYPES: raise ValueError( 'unexpected step type %r in steps %r' % ( step['type'], stdout)) self._steps = steps return self._steps
def _parse_setup(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`. If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both true, create mrjob.tar.gz (if it doesn't exist already) and prepend a setup command that adds it to PYTHONPATH. Also patch in the deprecated options *python_archives*, *setup_cmd*, and *setup_script* as setup commands. """ setup = [] # python_archives for path in self._opts["python_archives"]: path_dict = parse_legacy_hash_path("archive", path) setup.append(["export PYTHONPATH=", path_dict, ":$PYTHONPATH"]) # setup for cmd in self._opts["setup"]: setup.append(parse_setup_cmd(cmd)) # setup_cmds if self._opts["setup_cmds"]: log.warning( "setup_cmds is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead." ) for cmd in self._opts["setup_cmds"]: if not isinstance(cmd, string_types): cmd = cmd_line(cmd) setup.append([cmd]) # setup_scripts if self._opts["setup_scripts"]: log.warning( "setup_scripts is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead." ) for path in self._opts["setup_scripts"]: path_dict = parse_legacy_hash_path("file", path) setup.append([path_dict]) return setup
def _substep_cmd_line(self, step, step_num, mrc):
    if step[mrc]['type'] == 'command':
        # never wrap custom hadoop streaming commands in bash
        return step[mrc]['command'], False

    elif step[mrc]['type'] == 'script':
        cmd = cmd_line(self._script_args_for_step(step_num, mrc))

        # filter input and pipe for great speed, if user asks
        # but we have to wrap the command in bash
        if 'pre_filter' in step[mrc]:
            return '%s | %s' % (step[mrc]['pre_filter'], cmd), True
        else:
            return cmd, False
    else:
        raise ValueError("Invalid %s step %d: %r" % (
            mrc, step_num, step[mrc]))
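# Sketch of what the method above returns for a script mapper (the
# command strings are examples, not output captured from mrjob); the
# second element tells the caller whether it must wrap the command in
# a shell. With a pre_filter:
#
#   ('grep -v "^#" | python my_job.py --step-num=0 --mapper', True)
#
# and without one:
#
#   ('python my_job.py --step-num=0 --mapper', False)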
def _get_steps(self):
    """Call the mr_job to find out how many steps it has, and whether
    there are mappers and reducers for each step. Validate its
    output.

    Returns output like ['MR', 'M']
    (two steps, second only has a mapper)

    We'll cache the result (so you can call _get_steps() as many times
    as you want)
    """
    if self._steps is None:
        if not self._script:
            self._steps = []
        else:
            # don't use self._opts['python_bin'] because that
            # refers to the python binary to use inside Hadoop
            python_bin = sys.executable or 'python'
            args = ([python_bin, self._script['path'], '--steps'] +
                    self._mr_job_extra_args(local=True))
            log.debug('> %s' % cmd_line(args))

            # add . to PYTHONPATH (in case mrjob isn't actually installed)
            env = combine_local_envs(os.environ,
                                     {'PYTHONPATH': os.path.abspath('.')})

            steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
            stdout, stderr = steps_proc.communicate()

            if steps_proc.returncode != 0:
                raise Exception(
                    'error getting step information: %s' % stderr)

            steps = stdout.strip().split(' ')

            # verify that this is a proper step description
            if not steps:
                raise ValueError('step description is empty!')
            for step in steps:
                if step not in ('MR', 'M'):
                    raise ValueError(
                        'unexpected step type %r in steps %r' %
                        (step, stdout))

            self._steps = steps

    return self._steps
def _parse_setup(self): """Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`. If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both true, create mrjob.tar.gz (if it doesn't exist already) and prepend a setup command that adds it to PYTHONPATH. Also patch in the deprecated options *python_archives*, *setup_cmd*, and *setup_script* as setup commands. """ setup = [] # python_archives for path in self._opts['python_archives']: path_dict = parse_legacy_hash_path('archive', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) # setup_cmds if self._opts['setup_cmds']: log.warning( "setup_cmds is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead.") for cmd in self._opts['setup_cmds']: if not isinstance(cmd, string_types): cmd = cmd_line(cmd) setup.append([cmd]) # setup_scripts if self._opts['setup_scripts']: log.warning( "setup_scripts is deprecated since v0.4.2 and will be removed" " in v0.6.0. Consider using setup instead.") for path in self._opts['setup_scripts']: path_dict = parse_legacy_hash_path('file', path) setup.append([path_dict]) return setup
def _invoke_processes(self, procs_args, output_path, working_dir, env):
    """Invoke the pipeline of processes described by *procs_args* (one
    entry per process, each either a command string or a list of args)
    and write the final process's output to *output_path*.

    :return: list of dicts with the format
             {'proc': Popen, 'args': [process args], 'write_to': file}
    """
    log.info('> %s > %s' % (
        ' | '.join(
            args if isinstance(args, basestring) else cmd_line(args)
            for args in procs_args),
        output_path))

    with open(output_path, 'w') as write_to:
        procs = _chain_procs(procs_args, stdout=write_to, stderr=PIPE,
                             cwd=working_dir, env=env)

        return [{'args': a, 'proc': proc, 'write_to': write_to}
                for a, proc in zip(procs_args, procs)]
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self._hadoop_bin + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    def cleanup():
        # there shouldn't be any stderr
        for line in cat_proc.stderr:
            log.error('STDERR: ' + line)

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)

    return read_file(filename, cat_proc.stdout, cleanup=cleanup)
def _invoke_task_in_subprocess(
        task_type, step_num, task_num,
        args, num_steps,
        stdin, stdout, stderr, wd, env):
    """A pickleable function that invokes a task in a subprocess."""
    log.debug('> %s' % cmd_line(args))

    try:
        check_call(args, stdin=stdin, stdout=stdout, stderr=stderr,
                   cwd=wd, env=env)
    except Exception as ex:
        raise _TaskFailedException(
            reason=str(ex),
            step_num=step_num,
            num_steps=num_steps,
            task_type=task_type,
            task_num=task_num,
        )
def _run_job_in_hadoop(self): for step_num, step in enumerate(self._get_steps()): self._warn_about_spark_archives(step) step_type = step['type'] step_args = self._args_for_step(step_num) env = _fix_env(self._env_for_step(step_num)) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d...' % (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) log.debug(' with environment: %r' % sorted(env.items())) log_interpretation = {} self._log_interpretations.append(log_interpretation) if self._step_type_uses_spark(step_type): returncode, step_interpretation = self._run_spark_submit( step_args, env, record_callback=_log_log4j_record) else: returncode, step_interpretation = self._run_hadoop( step_args, env, record_callback=_log_record_from_hadoop) # make sure output_dir is filled (used for history log) if 'output_dir' not in step_interpretation: step_interpretation['output_dir'] = ( self._step_output_uri(step_num)) log_interpretation['step'] = step_interpretation self._log_counters(log_interpretation, step_num) if returncode: error = self._pick_error(log_interpretation, step_type) if error: _log_probable_cause_of_failure(log, error) # use CalledProcessError's well-known message format reason = str(CalledProcessError(returncode, step_args)) raise StepFailedException( reason=reason, step_num=step_num, num_steps=self._num_steps())
def test_python_dash_v_as_python_bin(self): python_cmd = cmd_line([sys.executable or 'python', '-v']) mr_job = MRTwoStepJob( ['--python-bin', python_cmd, '--no-conf', '-r', 'local']) mr_job.sandbox(stdin=[b'bar\n']) with no_handlers_for_logger(): mr_job.run_job() # expect debugging messages in stderr. stderr = mr_job.stderr.getvalue() # stderr is huge, so don't use assertIn() self.assertTrue(b'import mrjob' in stderr or # Python 2 b"import 'mrjob'" in stderr) # Python 3 self.assertTrue(b'#' in stderr) # should still get expected results self.assertEqual(sorted(mr_job.stdout.getvalue().splitlines()), sorted([b'1\tnull', b'1\t"bar"']))
def _stream_output(self):
    output_dir = posixpath.join(self._output_dir, 'part-*')
    log.info('Streaming output from %s from HDFS' % output_dir)

    cat_args = [self._opts['hadoop_bin'], 'fs', '-cat', output_dir]
    log.debug('> %s' % cmd_line(cat_args))
    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    for line in cat_proc.stdout:
        yield line

    # there shouldn't be any stderr
    for line in cat_proc.stderr:
        log.error('STDERR: ' + line)

    returncode = cat_proc.wait()

    if returncode != 0:
        raise CalledProcessError(returncode, cat_args)
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    for chunk in decompress(cat_proc.stdout, filename):
        yield chunk

    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    def cleanup():
        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)

    return read_file(filename, cat_proc.stdout, cleanup=cleanup)
def test_python_dash_v_as_python_bin(self): python_cmd = cmd_line([sys.executable or 'python', '-v']) mr_job = MRTwoStepJob( ['--python-bin', python_cmd, '--no-conf', '-r', 'local']) mr_job.sandbox(stdin=[b'bar\n']) with mr_job.make_runner() as runner: runner.run() # expect python -v crud in stderr with open(runner._task_stderr_path('mapper', 0, 0)) as lines: self.assertTrue( any('import mrjob' in line or # Python 2 "import 'mrjob'" in line for line in lines)) with open(runner._task_stderr_path('mapper', 0, 0)) as lines: self.assertTrue(any('#' in line for line in lines)) # should still get expected results self.assertEqual(sorted(to_lines(runner.cat_output())), sorted([b'1\tnull\n', b'1\t"bar"\n']))
def _parse_setup(self): """Helper for :py:meth:`_create_setup_wrapper_script`. Parse the *setup* option with :py:func:`mrjob.setup.parse_setup_cmd()`. If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both true, create mrjob.tar.gz (if it doesn't exist already) and prepend a setup command that adds it to PYTHONPATH. Also patch in the deprecated options *python_archives*, *setup_cmd*, and *setup_script* as setup commands. """ setup = [] # python_archives for path in self._opts['python_archives']: path_dict = parse_legacy_hash_path('archive', path) setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH']) # setup for cmd in self._opts['setup']: setup.append(parse_setup_cmd(cmd)) # setup_cmds for cmd in self._opts['setup_cmds']: if not isinstance(cmd, basestring): cmd = cmd_line(cmd) setup.append([cmd]) # setup_scripts for path in self._opts['setup_scripts']: path_dict = parse_legacy_hash_path('file', path) setup.append([path_dict]) return setup
def _substep_args(self, step_num, mrc): step = self._get_step(step_num) if step[mrc]['type'] == 'command': cmd = step[mrc]['command'] # never wrap custom hadoop streaming commands in bash if isinstance(cmd, string_types): return shlex_split(cmd) else: return cmd elif step[mrc]['type'] == 'script': script_args = self._script_args_for_step(step_num, mrc) if 'pre_filter' in step[mrc]: return self._sh_wrap( '%s | %s' % (step[mrc]['pre_filter'], cmd_line(script_args))) else: return script_args else: raise ValueError("Invalid %s step %d: %r" % (mrc, step_num, step[mrc]))
def _load_steps(self): if not self._script_path: return [] args = (self._executable(True) + ['--steps'] + self._mr_job_extra_args(local=True)) log.debug('> %s' % cmd_line(args)) # add . to PYTHONPATH (in case mrjob isn't actually installed) env = combine_local_envs(os.environ, {'PYTHONPATH': os.path.abspath('.')}) steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env) stdout, stderr = steps_proc.communicate() if steps_proc.returncode != 0: raise Exception( 'error getting step information: \n%s' % stderr) # on Python 3, convert stdout to str so we can json.loads() it if not isinstance(stdout, str): stdout = stdout.decode('utf_8') try: steps = json.loads(stdout) except ValueError: raise ValueError("Bad --steps response: \n%s" % stdout) # verify that this is a proper step description if not steps or not stdout: raise ValueError('step description is empty!') for step in steps: if step['type'] not in STEP_TYPES: raise ValueError( 'unexpected step type %r in steps %r' % ( step['type'], stdout)) return steps
def _run_job_in_hadoop(self): for step_num, step in enumerate(self._get_steps()): self._warn_about_spark_archives(step) step_args = self._args_for_step(step_num) env = self._env_for_step(step_num) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d...' % (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) log.debug(' with environment: %r' % sorted(env.items())) log_interpretation = {} self._log_interpretations.append(log_interpretation) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # Hadoop is running log.debug('No PTY available, using Popen() to invoke Hadoop') step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env) step_interpretation = _interpret_hadoop_jar_command_stderr( step_proc.stderr, record_callback=_log_record_from_hadoop) # there shouldn't be much output to STDOUT for line in step_proc.stdout: _log_line_from_hadoop(to_string(line).strip('\r\n')) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvpe(step_args[0], step_args, env) else: log.debug('Invoking Hadoop via PTY') with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) step_interpretation = ( _interpret_hadoop_jar_command_stderr( master, record_callback=_log_record_from_hadoop)) _, returncode = os.waitpid(pid, 0) # make sure output_dir is filled if 'output_dir' not in step_interpretation: step_interpretation['output_dir'] = ( self._step_output_uri(step_num)) log_interpretation['step'] = step_interpretation step_type = step['type'] if not _is_spark_step_type(step_type): counters = self._pick_counters(log_interpretation, step_type) if counters: log.info(_format_counters(counters)) else: log.warning('No counters found') if returncode: error = self._pick_error(log_interpretation, step_type) if error: log.error('Probable cause of failure:\n\n%s\n' % _format_error(error)) # use CalledProcessError's well-known message format reason = str(CalledProcessError(returncode, step_args)) raise StepFailedException(reason=reason, step_num=step_num, num_steps=self._num_steps())
def _invoke_sort(self, input_paths, output_path): """Use the local sort command to sort one or more input files. Raise an exception if there is a problem. This is is just a wrapper to handle limitations of Windows sort (see Issue #288). :type input_paths: list of str :param input_paths: paths of one or more input files :type output_path: str :param output_path: where to pipe sorted output into """ if not input_paths: raise ValueError('Must specify at least one input path.') # ignore locale when sorting env = os.environ.copy() env['LC_ALL'] = 'C' # Make sure that the tmp dir environment variables are changed if # the default is changed. env['TMP'] = self._opts['local_tmp_dir'] env['TMPDIR'] = self._opts['local_tmp_dir'] env['TEMP'] = self._opts['local_tmp_dir'] log.info('writing to %s' % output_path) err_path = os.path.join(self._get_local_tmp_dir(), 'sort-stderr') # assume we're using UNIX sort unless we know otherwise if (not self._sort_is_windows_sort) or len(input_paths) == 1: with open(output_path, 'wb') as output: with open(err_path, 'wb') as err: args = ['sort'] + list(input_paths) log.info('> %s' % cmd_line(args)) try: check_call(args, stdout=output, stderr=err, env=env) return except CalledProcessError: pass # Looks like we're using Windows sort self._sort_is_windows_sort = True log.info('Piping files into sort for Windows compatibility') with open(output_path, 'wb') as output: with open(err_path, 'wb') as err: args = ['sort'] log.info('> %s' % cmd_line(args)) proc = Popen(args, stdin=PIPE, stdout=output, stderr=err, env=env) # shovel bytes into the sort process for input_path in input_paths: with open(input_path, 'rb') as input: while True: buf = input.read(_BUFFER_SIZE) if not buf: break proc.stdin.write(buf) proc.stdin.close() proc.wait() if proc.returncode == 0: return # looks like there was a problem. log it and raise an error with open(err_path) as err: for line in err: log.error('STDERR: %s' % line.rstrip('\r\n')) raise CalledProcessError(proc.returncode, args)
def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None): """Return a (Bourne) shell script that runs the setup commands and then executes whatever is passed to it (this will be our mapper/reducer), as a list of strings (one for each line, including newlines). We obtain a file lock so that two copies of the setup commands cannot run simultaneously on the same machine (this helps for running :command:`make` on a shared source code archive, for example). """ out = [] def writeln(line=''): out.append(line + '\n') # we're always going to execute this script as an argument to # sh, so there's no need to add a shebang (e.g. #!/bin/sh) writeln('# store $PWD') writeln('__mrjob_PWD=$PWD') writeln() writeln('# obtain exclusive file lock') # Basically, we're going to tie file descriptor 9 to our lockfile, # use a subprocess to obtain a lock (which we somehow inherit too), # and then release the lock by closing the file descriptor. # File descriptors 10 and higher are used internally by the shell, # so 9 is as out-of-the-way as we can get. writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_key) # would use flock(1), but it's not always available writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" % cmd_line(self._python_bin())) writeln() writeln('# setup commands') # group setup commands so we can redirect their input/output (see # below). Don't use parens; this would invoke a subshell, which would # keep us from exporting environment variables to the task. writeln('{') for cmd in setup: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = ' ' # indent, since these commands are in a group for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._working_dir_mgr.name(**token)) else: # it's raw script line += token writeln(line) # redirect setup commands' input/output so they don't interfere # with the task (see Issue #803). writeln('} 0</dev/null 1>&2') writeln() writeln('# release exclusive file lock') writeln('exec 9>&-') writeln() writeln('# run task from the original working directory') writeln('cd $__mrjob_PWD') writeln('"$@"') return out
def _run_spark_submit(self, spark_submit_args, env, record_callback): """Run the spark submit binary in a subprocess, using a PTY if possible :param spark_submit_args: spark-submit binary and arguments, as as list :param env: environment variables, as a dict :param record_callback: a function that takes a single log4j record as its argument (see :py:func:`~mrjob.logs.log4j\ ._parse_hadoop_log4j_records) :return: tuple of the subprocess's return code and a step interpretation dictionary """ log.debug('> %s' % cmd_line(spark_submit_args)) log.debug(' with environment: %r' % sorted(env.items())) # these should always be set, but just in case returncode = 0 step_interpretation = {} # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # spark-submit is running log.debug('No PTY available, using Popen() to invoke spark-submit') step_proc = Popen( spark_submit_args, stdout=PIPE, stderr=PIPE, env=env) # parse driver output step_interpretation = _parse_spark_log( step_proc.stderr, record_callback=record_callback) # there shouldn't be much output on STDOUT, just echo it for record in _parse_hadoop_log4j_records(step_proc.stdout): record_callback(record) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process try: os.execvpe(spark_submit_args[0], spark_submit_args, env) # now this process is no longer Python except OSError as ex: # use _exit() so we don't do cleanup, etc. that's # the parent process's job os._exit(ex.errno) finally: # if we get some other exception, still exit hard os._exit(-1) else: log.debug('Invoking spark-submit via PTY') with os.fdopen(master_fd, 'rb') as master: step_interpretation = ( _parse_spark_log( _eio_to_eof(master), record_callback=record_callback)) _, returncode = os.waitpid(pid, 0) return (returncode, step_interpretation)
def test_cmd_line(self):
    self.assertEqual(cmd_line(['cut', '-f', 2, '-d', ' ']),
                     "cut -f 2 -d ' '")
    self.assertIn(
        cmd_line(['grep', '-e', "# DON'T USE$"]),
        ("grep -e \"# DON'T USE\\$\"",
         'grep -e \'# DON\'"\'"\'T USE$\''))
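# cmd_line() (from mrjob.util) shell-quotes each argument, converting
# non-string arguments along the way, and joins them with spaces. A
# quick illustration with assumed values:
#
#   >>> cmd_line(['echo', 'hello world'])
#   "echo 'hello world'"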