def _hadoop_log_dirs(self, output_dir=None):
    """Yield all possible places to look for hadoop logs."""
    # hadoop_log_dirs opt overrides all this
    if self._opts['hadoop_log_dirs']:
        for path in self._opts['hadoop_log_dirs']:
            yield path
        return

    hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
    if hadoop_log_dir:
        yield hadoop_log_dir

    yarn = uses_yarn(self.get_hadoop_version())

    if yarn:
        yarn_log_dir = os.environ.get('YARN_LOG_DIR')
        if yarn_log_dir:
            yield yarn_log_dir

        yield _DEFAULT_YARN_HDFS_LOG_DIR

    if output_dir:
        # Cloudera style of logging
        yield posixpath.join(output_dir, '_logs')

    for hadoop_dir in self._hadoop_dirs():
        yield posixpath.join(hadoop_dir, 'logs')

    # hard-coded fallback paths
    if yarn:
        for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
            yield path

    for path in _FALLBACK_HADOOP_LOG_DIRS:
        yield path
def _hadoop_log_dirs(self, output_dir=None):
    """Yield all possible places to look for hadoop logs."""
    # hadoop_log_dirs opt overrides all this
    if self._opts['hadoop_log_dirs']:
        for path in self._opts['hadoop_log_dirs']:
            yield path
        return

    hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
    if hadoop_log_dir:
        yield hadoop_log_dir

    if uses_yarn(self.get_hadoop_version()):
        yarn_log_dir = os.environ.get('YARN_LOG_DIR')
        if yarn_log_dir:
            yield yarn_log_dir

    if output_dir:
        # Cloudera style of logging
        yield posixpath.join(output_dir, '_logs')

    for hadoop_dir in self._hadoop_dirs():
        yield posixpath.join(hadoop_dir, 'logs')

    # hard-coded log paths for EMR, so this can work out-of-the-box
    for path in _EMR_HADOOP_LOG_DIRS:
        yield path
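# The module-level constants referenced by the two _hadoop_log_dirs()
# variants above (_DEFAULT_YARN_HDFS_LOG_DIR, _FALLBACK_HADOOP_LOG_DIRS,
# _FALLBACK_HADOOP_YARN_LOG_DIRS, _EMR_HADOOP_LOG_DIRS) aren't shown in
# these snippets. The values below are a hedged sketch of plausible
# defaults, purely for illustration; they are assumptions, not the
# library's actual definitions.

_DEFAULT_YARN_HDFS_LOG_DIR = 'hdfs:///tmp/hadoop-yarn/staging'  # assumed

_FALLBACK_HADOOP_LOG_DIRS = [       # assumed values
    '/var/log/hadoop',
    '/mnt/var/log/hadoop',
]

_FALLBACK_HADOOP_YARN_LOG_DIRS = [  # assumed values
    '/var/log/hadoop-yarn',
    '/mnt/var/log/hadoop-yarn',
]

_EMR_HADOOP_LOG_DIRS = [            # assumed values
    '/mnt/var/log/hadoop',
    '/var/log/hadoop',
]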
def _interpret_task_logs(self, log_interpretation, partial=True):
    """Fetch task syslogs and stderr, and add 'task' to interpretation."""
    if 'task' in log_interpretation and (
            partial or not log_interpretation['task'].get('partial')):
        return  # already interpreted

    step_interpretation = log_interpretation.get('step') or {}

    application_id = step_interpretation.get('application_id')
    job_id = step_interpretation.get('job_id')
    output_dir = step_interpretation.get('output_dir')

    yarn = uses_yarn(self.get_hadoop_version())

    if yarn:
        if not application_id:
            log.warning("Can't fetch task logs; missing application ID")
            return
    else:
        if not job_id:
            log.warning("Can't fetch task logs; missing job ID")
            return

    log_interpretation['task'] = _interpret_task_logs(
        self.fs,
        self._ls_task_syslogs(
            application_id=application_id,
            job_id=job_id,
            output_dir=output_dir),
        partial=partial,
        stderr_callback=_log_parsing_task_stderr)
def _interpret_task_logs(self, log_interpretation):
    """Find and interpret the task logs, storing the interpretation
    in ``log_interpretation['task']``."""
    if 'task' not in log_interpretation:
        # get job/application ID from output of hadoop command
        step_interpretation = log_interpretation.get('step') or {}
        application_id = step_interpretation.get('application_id')
        job_id = step_interpretation.get('job_id')

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn and application_id is None:
            log.warning("Can't fetch task logs without application ID")
            return {}
        elif not yarn and job_id is None:
            log.warning("Can't fetch task logs without job ID")
            return {}

        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        def stream_task_log_dirs():
            for log_dir in unique(
                    self._hadoop_log_dirs(
                        output_dir=step_interpretation.get('output_dir'))):
                if yarn:
                    path = self.fs.join(
                        log_dir, 'userlogs', application_id)
                else:
                    # sometimes pre-YARN attempt logs are organized by
                    # job_id, sometimes not. Play it safe
                    path = self.fs.join(log_dir, 'userlogs')

                if self.fs.exists(path):
                    log.info('Scanning task syslogs in %s' % path)
                    yield [path]

        # wrap _ls_task_syslogs() to add logging
        def ls_task_syslogs():
            # there should be at most one history log
            for match in _ls_task_syslogs(
                    self.fs, stream_task_log_dirs(),
                    application_id=application_id, job_id=job_id):
                # TODO: this isn't really correct because
                # _interpret_task_logs() sorts the logs paths and
                # scans starting at the most recent one. Probably
                # should have _ls_task_syslogs() do the sorting.
                log.info(' Scanning for errors: %s' % match['path'])
                yield match

        log_interpretation['task'] = _interpret_task_logs(
            self.fs, ls_task_syslogs())

    return log_interpretation['task']
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick the one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_unicode(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
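# A minimal usage sketch for ls() above. The constructor argument (the
# hadoop binary to shell out to) and the path glob are assumptions for
# illustration; a working Hadoop install is required for this to run.
fs = HadoopFilesystem(['hadoop'])               # hypothetical args

for uri in fs.ls('hdfs:///user/dave/logs/*'):   # made-up path glob
    print(uri)  # paths are yielded as fully qualified URIs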
def mkdir(self, path):
    version = self.get_hadoop_version()

    # use -p on Hadoop 2 (see #991, #845)
    if uses_yarn(version):
        args = ['fs', '-mkdir', '-p', path]
    else:
        args = ['fs', '-mkdir', path]

    try:
        self.invoke_hadoop(args, ok_stderr=[_HADOOP_FILE_EXISTS_RE])
    except CalledProcessError:
        raise IOError("Could not mkdir %s" % path)
def _interpret_task_logs(
        self, log_interpretation, step_type,
        error_attempt_ids=(), partial=True):
    """Fetch task syslogs and stderr, and add 'task' to interpretation."""
    if 'task' in log_interpretation and (
            partial or not log_interpretation['task'].get('partial')):
        return  # already interpreted

    if not self._read_logs():
        return

    step_interpretation = log_interpretation.get('step') or {}

    application_id = step_interpretation.get('application_id')
    job_id = step_interpretation.get('job_id')
    output_dir = step_interpretation.get('output_dir')

    yarn = uses_yarn(self.get_hadoop_version())

    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    if yarn:
        if not application_id:
            if not log_interpretation.get('no_job'):
                log.warning(
                    "Can't fetch task logs; missing application ID")
            return
    else:
        if not job_id:
            if not log_interpretation.get('no_job'):
                log.warning("Can't fetch task logs; missing job ID")
            return

    if _is_spark_step_type(step_type):
        interpret_func = _interpret_spark_task_logs
    else:
        interpret_func = _interpret_task_logs

    log_interpretation['task'] = interpret_func(
        self.fs,
        self._ls_task_logs(
            step_type,
            application_id=application_id,
            job_id=job_id,
            output_dir=output_dir,
            error_attempt_ids=error_attempt_ids,
            attempt_to_container_id=attempt_to_container_id,
        ),
        partial=partial,
        log_callback=_log_parsing_task_log)
def _interpret_task_logs(self, log_interpretation, step_type,
                         error_attempt_ids=(), partial=True):
    """Fetch task syslogs and stderr, and add 'task' to interpretation."""
    if 'task' in log_interpretation and (
            partial or not log_interpretation['task'].get('partial')):
        return  # already interpreted

    step_interpretation = log_interpretation.get('step') or {}

    application_id = step_interpretation.get('application_id')
    job_id = step_interpretation.get('job_id')
    output_dir = step_interpretation.get('output_dir')

    yarn = uses_yarn(self.get_hadoop_version())

    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    if yarn:
        if not application_id:
            if not log_interpretation.get('no_job'):
                log.warning(
                    "Can't fetch task logs; missing application ID")
            return
    else:
        if not job_id:
            if not log_interpretation.get('no_job'):
                log.warning("Can't fetch task logs; missing job ID")
            return

    if _is_spark_step_type(step_type):
        interpret_func = _interpret_spark_task_logs
    else:
        interpret_func = _interpret_task_logs

    log_interpretation['task'] = interpret_func(
        self.fs,
        self._ls_task_logs(
            step_type,
            application_id=application_id,
            job_id=job_id,
            output_dir=output_dir,
            error_attempt_ids=error_attempt_ids,
            attempt_to_container_id=attempt_to_container_id,
        ),
        partial=partial,
        log_callback=_log_parsing_task_log)
def rm(self, path_glob):
    if not is_uri(path_glob):
        super(HadoopFilesystem, self).rm(path_glob)

    version = self.get_hadoop_version()
    if uses_yarn(version):
        args = ['fs', '-rm', '-R', '-f', '-skipTrash', path_glob]
    else:
        args = ['fs', '-rmr', '-skipTrash', path_glob]

    try:
        self.invoke_hadoop(args, return_stdout=True,
                           ok_stderr=[_HADOOP_RM_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not rm %s" % path_glob)
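# mkdir() and rm() above follow the same pattern: dispatch on uses_yarn()
# to choose the Hadoop 2 flags (-mkdir -p, -rm -R -f) vs. the older
# commands (-mkdir, -rmr). A hedged usage sketch; the constructor args and
# path are made up for illustration:
fs = HadoopFilesystem(['hadoop'])        # hypothetical args

fs.mkdir('hdfs:///tmp/mrjob/scratch')    # hadoop fs -mkdir -p ... on YARN
fs.rm('hdfs:///tmp/mrjob/scratch')       # hadoop fs -rm -R -f -skipTrash ... on YARN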
def _find_probable_cause_of_failure(self, application_id=None, job_id=None,
                                    output_dir=None, **ignored):
    """Find probable cause of failure. Currently we just scan task logs.

    On YARN, you must set application_id, and pre-YARN, you must set
    job_id.
    """
    # package up logs for _find_error_in_task_logs(),
    # and log where we're looking.
    hadoop_version = self.get_hadoop_version()
    yarn = uses_yarn(hadoop_version)

    if yarn and application_id is None:
        log.warning("No application ID!")
        return None

    if not yarn and job_id is None:
        log.warning("No job ID!")
        return None

    # Note: this is unlikely to be super-helpful on "real" (multi-node)
    # pre-YARN Hadoop because task logs aren't generally shipped to a local
    # directory. It's a start, anyways. See #1201.
    def stream_task_log_dirs():
        for log_dir in unique(
                self._hadoop_log_dirs(output_dir=output_dir)):
            if yarn:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                # sometimes pre-YARN attempt logs are organized by job_id,
                # sometimes not. Play it safe
                path = self.fs.join(log_dir, 'userlogs')

            if self.fs.exists(path):
                log.info('looking for logs in %s' % path)
                yield [path]

    return _find_error_in_task_logs(self.fs, stream_task_log_dirs(),
                                    hadoop_version,
                                    application_id=application_id,
                                    job_id=job_id)
def test_uses_yarn(self):
    self.assertEqual(uses_yarn('0.22'), False)
    self.assertEqual(uses_yarn('0.23'), True)
    self.assertEqual(uses_yarn('0.24'), True)
    self.assertEqual(uses_yarn('1.0.0'), False)
    self.assertEqual(uses_yarn('2.0.0'), True)
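# The assertions above pin down uses_yarn(): YARN for the 0.23.x line and
# for 2.x and later, but not for 0.22 or 1.x. A minimal sketch that
# satisfies those assertions (an assumption, not necessarily the actual
# implementation):
def uses_yarn(version):
    """Return True if this Hadoop version runs on YARN."""
    # compare on the first two numeric components of the version string
    major, minor = (int(p) for p in (version.split('.') + ['0'])[:2])
    return major >= 2 or (major == 0 and minor >= 23)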
def mock_hadoop_uses_yarn(environ):
    return uses_yarn(environ['MOCK_HADOOP_VERSION'])
def _find_error_in_task_logs(fs, log_dirs_stream, hadoop_version,
                             application_id=None, job_id=None):
    """Given a filesystem and a stream of lists of log dirs to search in,
    find the last error and return details about it. *hadoop_version*
    is required, as task logs have very different paths in YARN.

    In YARN, you must set *application_id*, and pre-YARN, you must set
    *job_id*, or we'll bail out and return None.

    Returns a dictionary with the following keys ("optional" means
    that something may be None):

    syslog: dict with keys:
        path: path of syslog we found error in
        error: error details; dict with keys:
            exception: Java exception (as string)
            stack_trace: array of lines with Java stack trace
        split: optional input split we were reading; dict with keys:
            path: path of input file
            start_line: first line of split (0-indexed)
            num_lines: number of lines in split
    stderr: optional dict with keys:
        path: path of stderr corresponding to syslog
        error: optional error details; dict with keys:
            exception: string (Python exception)
            traceback: array of lines with Python stack trace
    type: always set to 'task'
    """
    syslog_paths = []

    yarn = uses_yarn(hadoop_version)

    if ((yarn and application_id is None) or
            (not yarn and job_id is None)):
        return None

    # we assume that each set of log paths contains the same copies
    # of syslogs, so stop once we find any non-empty set of log dirs
    for log_dirs in log_dirs_stream:
        if yarn:
            syslog_paths = _ls_yarn_task_syslogs(
                fs, log_dirs, application_id=application_id)
        else:
            syslog_paths = _ls_pre_yarn_task_syslogs(
                fs, log_dirs, job_id=job_id)

        if syslog_paths:
            break

    for syslog_path in syslog_paths:
        log.debug('Looking for error in %s' % syslog_path)

        syslog_info = _parse_task_syslog(_cat_log(fs, syslog_path))
        if not syslog_info['error']:
            continue

        # found error! see if we can explain it

        # TODO: don't bother if error wasn't due to child process
        stderr_path = _stderr_for_syslog(syslog_path)

        stderr_info = _parse_python_task_stderr(_cat_log(fs, stderr_path))

        # output error info
        syslog_info['path'] = syslog_path
        stderr_info['path'] = stderr_path

        return dict(type='task', syslog=syslog_info, stderr=stderr_info)

    return None
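# For orientation, a non-None return value from _find_error_in_task_logs()
# has the nested shape described in the docstring above. Every value below
# is an invented placeholder following that docstring, not real output:
example_result = dict(
    type='task',
    syslog=dict(
        path='/var/log/hadoop/userlogs/attempt_0/syslog',    # made up
        error=dict(
            exception='java.lang.RuntimeException: subprocess failed',
            stack_trace=['    at org.apache.hadoop.streaming...'],
        ),
        split=dict(path='hdfs:///user/dave/input.txt',        # made up
                   start_line=0, num_lines=100),
    ),
    stderr=dict(
        path='/var/log/hadoop/userlogs/attempt_0/stderr',     # made up
        error=dict(
            exception='ValueError: bad input line',
            traceback=['Traceback (most recent call last):'],
        ),
    ),
)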