def _ls_task_logs(self, step_type,
                  application_id=None, job_id=None, output_dir=None,
                  error_attempt_ids=None, attempt_to_container_id=None):
    """Yield task log matches."""
    if not self._read_logs():
        return

    if _is_spark_step_type(step_type):
        ls_func = _ls_spark_task_logs
    else:
        ls_func = _ls_task_logs

    # logging messages are handled by a callback in _interpret_task_logs()
    matches = ls_func(
        self.fs,
        self._stream_task_log_dirs(
            application_id=application_id, output_dir=output_dir),
        application_id=application_id,
        job_id=job_id,
        error_attempt_ids=error_attempt_ids,
        attempt_to_container_id=attempt_to_container_id,
    )

    for match in matches:
        yield match

def _run_step(self, step, step_num):
    """Run an individual step.

    You can assume that setup wrapper scripts are created and
    self._counters has a dictionary for that step already.
    """
    if _is_spark_step_type(step['type']):
        self._run_step_on_spark(step, step_num)
    else:
        self._run_streaming_step(step, step_num)

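# Every method in this section dispatches on _is_spark_step_type(), which
# is defined elsewhere. A minimal sketch of what such a helper might look
# like, assuming the Spark step types are 'spark', 'spark_jar', and
# 'spark_script' (as the docstrings below suggest); this is illustrative,
# not the actual implementation:

def _is_spark_step_type(step_type):
    """Return True if *step_type* is a Spark step type
    (e.g. 'spark', 'spark_jar', 'spark_script')."""
    return step_type.split('_')[0] == 'spark'
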
def _has_spark_steps(self):
    """Are any of our steps Spark steps?
    (e.g. spark, spark_jar, spark_script)

    Generally used to determine if we need to install Spark on a cluster.
    """
    return any(_is_spark_step_type(step['type'])
               for step in self._get_steps())

def _warn_about_spark_archives(self, step):
    """If *step* is a Spark step, the *upload_archives* option is set,
    and *spark_master* is not ``'yarn'``, warn that *upload_archives*
    will be ignored by Spark."""
    if (_is_spark_step_type(step['type']) and
            self._opts['spark_master'] != 'yarn' and
            self._opts['upload_archives']):
        log.warning('Spark will probably ignore archives because'
                    " spark_master is not set to 'yarn'")

def _spark_submit_args(self, step_num):
    """Build a list of extra args to the spark-submit binary for the
    given spark or spark_script step."""
    step = self._get_step(step_num)

    if not _is_spark_step_type(step['type']):
        raise TypeError('non-Spark step: %r' % step)

    args = []

    # add --master
    if self._spark_master():
        args.extend(['--master', self._spark_master()])

    # add --deploy-mode
    if self._spark_deploy_mode():
        args.extend(['--deploy-mode', self._spark_deploy_mode()])

    # add --class (JAR steps)
    if step.get('main_class'):
        args.extend(['--class', step['main_class']])

    # add --jars, if any
    libjar_paths = self._libjar_paths()
    if libjar_paths:
        args.extend(['--jars', ','.join(libjar_paths)])

    # --conf arguments include python bin, cmdenv, jobconf. Make sure
    # that we can always override these manually
    jobconf = {}
    for key, value in self._spark_cmdenv(step_num).items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value
    jobconf.update(self._jobconf_for_step(step_num))

    for key, value in sorted(jobconf.items()):
        args.extend(['--conf', '%s=%s' % (key, value)])

    # --files and --archives
    args.extend(self._spark_upload_args())

    # --py-files (Python only)
    if step['type'] in ('spark', 'spark_script'):
        py_file_uris = self._upload_uris(self._py_files())
        if py_file_uris:
            args.extend(['--py-files', ','.join(py_file_uris)])

    # spark_args option
    args.extend(self._opts['spark_args'])

    # step spark_args
    if step.get('spark_args'):
        args.extend(step['spark_args'])

    return args

def _env_for_step(self, step_num):
    step = self._get_step(step_num)

    env = dict(os.environ)

    # when running spark-submit, set its environment directly. See #1464
    if _is_spark_step_type(step['type']):
        env.update(self._spark_cmdenv(step_num))

    return env

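# Merge semantics above: the step's cmdenv overrides any inherited
# os.environ values, but only for Spark steps (streaming steps pass their
# cmdenv another way, e.g. via Hadoop's -cmdenv switch). Values here are
# illustrative, not from the code above:
#
#     os.environ                    = {'PATH': '/usr/bin', ...}
#     self._spark_cmdenv(step_num)  = {'PYSPARK_PYTHON': 'python3'}
#     env                           = os.environ plus PYSPARK_PYTHON,
#                                     with cmdenv winning on overlaps
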
def _log_counters(self, log_interpretation, step_num):
    """Utility for logging counters (if any) for a step."""
    step_type = self._get_step(step_num)['type']

    if not _is_spark_step_type(step_type):
        counters = self._pick_counters(log_interpretation, step_type)
        if counters:
            log.info(_format_counters(counters))
        else:
            log.warning('No counters found')

def _args_for_step(self, step_num):
    step = self._get_step(step_num)

    if step['type'] == 'streaming':
        return self._args_for_streaming_step(step_num)
    elif step['type'] == 'jar':
        return self._args_for_jar_step(step_num)
    elif _is_spark_step_type(step['type']):
        return self._args_for_spark_step(step_num)
    else:
        raise AssertionError('Bad step type: %r' % (step['type'],))

def _log_counters(self, log_interpretation, step_num):
    """Utility for logging counters (if any) for a step."""
    step_type = self._get_step(step_num)['type']

    if not _is_spark_step_type(step_type):
        counters = self._pick_counters(log_interpretation, step_type)
        if counters:
            log.info(_format_counters(counters))
        elif self._read_logs():
            # should only log this if we actually looked for counters
            log.warning('No counters found')

def _spark_submit_args(self, step_num):
    """Build a list of extra args to the spark-submit binary for the
    given spark or spark_script step."""
    step = self._get_step(step_num)

    if not _is_spark_step_type(step['type']):
        raise TypeError('non-Spark step: %r' % step)

    args = []

    # add runner-specific args
    args.extend(self._spark_submit_arg_prefix())

    # add --class (JAR steps)
    if step.get('main_class'):
        args.extend(['--class', step['main_class']])

    # add --jars, if any
    libjar_paths = self._libjar_paths()
    if libjar_paths:
        args.extend(['--jars', ','.join(libjar_paths)])

    # --conf arguments include python bin, cmdenv, jobconf. Make sure
    # that we can always override these manually
    jobconf = {}
    for key, value in self._spark_cmdenv(step_num).items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value
    jobconf.update(self._jobconf_for_step(step_num))

    for key, value in sorted(jobconf.items()):
        if value is not None:
            args.extend(['--conf', '%s=%s' % (key, value)])

    # --files and --archives
    args.extend(self._spark_upload_args())

    # --py-files (Python only)
    if step['type'] in ('spark', 'spark_script'):
        py_files_arg = ','.join(self._spark_py_files())
        if py_files_arg:
            args.extend(['--py-files', py_files_arg])

    # spark_args option
    args.extend(self._opts['spark_args'])

    # step spark_args
    args.extend(step['spark_args'])

    return args

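# For context, a hedged sketch of how the args built above might be
# combined into a complete spark-submit command line. _spark_submit_bin(),
# _spark_script_uri(), and _spark_script_args() are hypothetical helper
# names used for illustration, not part of the code above:

def _spark_submit_cmd(self, step_num):
    # e.g. ['spark-submit', '--master', 'yarn', '--conf', 'k=v', ...,
    #       'script.py', 'arg1', ...]
    return (
        self._spark_submit_bin() +
        self._spark_submit_args(step_num) +
        [self._spark_script_uri(step_num)] +
        self._spark_script_args(step_num))
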
def _interpret_task_logs(self, log_interpretation, step_type,
                         error_attempt_ids=(), partial=True):
    """Fetch task syslogs and stderr, and add 'task' to interpretation."""
    if 'task' in log_interpretation and (
            partial or not log_interpretation['task'].get('partial')):
        return  # already interpreted

    if not self._read_logs():
        return

    step_interpretation = log_interpretation.get('step') or {}

    application_id = step_interpretation.get('application_id')
    job_id = step_interpretation.get('job_id')
    output_dir = step_interpretation.get('output_dir')

    yarn = uses_yarn(self.get_hadoop_version())

    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    if yarn:
        if not application_id:
            if not log_interpretation.get('no_job'):
                log.warning(
                    "Can't fetch task logs; missing application ID")
            return
    else:
        if not job_id:
            if not log_interpretation.get('no_job'):
                log.warning("Can't fetch task logs; missing job ID")
            return

    if _is_spark_step_type(step_type):
        interpret_func = _interpret_spark_task_logs
    else:
        interpret_func = _interpret_task_logs

    log_interpretation['task'] = interpret_func(
        self.fs,
        self._ls_task_logs(
            step_type,
            application_id=application_id,
            job_id=job_id,
            output_dir=output_dir,
            error_attempt_ids=error_attempt_ids,
            attempt_to_container_id=attempt_to_container_id,
        ),
        partial=partial,
        log_callback=_log_parsing_task_log)

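# For reference, the shape of the log_interpretation dict that
# _interpret_task_logs() reads and writes, inferred purely from the
# accesses above:
#
#     log_interpretation = {
#         'step': {
#             'application_id': ...,   # YARN
#             'job_id': ...,           # pre-YARN
#             'output_dir': ...,
#         },
#         'history': {'attempt_to_container_id': {...}},
#         'no_job': ...,               # set when no job was started
#         'task': {...},               # filled in by this method
#     }
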
def _ls_task_logs(self, step_type,
                  application_id=None, job_id=None, output_dir=None):
    """Yield task log matches."""
    if _is_spark_step_type(step_type):
        ls_func = _ls_spark_task_logs
    else:
        ls_func = _ls_task_logs

    # logging messages are handled by a callback in _interpret_task_logs()
    for match in ls_func(
            self.fs,
            self._stream_task_log_dirs(
                application_id=application_id, output_dir=output_dir),
            application_id=application_id,
            job_id=job_id):
        yield match

def _pick_counters(self, log_interpretation, step_type):
    """Pick counters from our log interpretation, interpreting history
    logs if need be."""
    if _is_spark_step_type(step_type):
        return {}

    counters = _pick_counters(log_interpretation)

    if not counters:
        log.info('Attempting to fetch counters from logs...')

        self._interpret_step_logs(log_interpretation, step_type)
        counters = _pick_counters(log_interpretation)

    if not counters:
        self._interpret_history_log(log_interpretation)
        counters = _pick_counters(log_interpretation)

    return counters

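# Note the escalation order above: counters already in the interpretation
# win; otherwise step logs are interpreted, and only then the (slower to
# fetch) history logs. Spark steps short-circuit to {}, presumably because
# Spark jobs don't emit Hadoop-style counters.
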
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())

def _has_spark_steps(self):
    """Are any of our steps Spark steps (either spark or spark_script)?"""
    return any(_is_spark_step_type(step['type'])
               for step in self._get_steps())

def _run_step(self, step, step_num):
    if _is_spark_step_type(step['type']):
        self._run_step_on_spark(step, step_num)
    else:
        super(LocalMRJobRunner, self)._run_step(step, step_num)

def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = self._env_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(reason=reason, step_num=step_num,
                                      num_steps=self._num_steps())