def _parse_task_attempts(fs, log_paths):
    """Like :py:func:`_parse_simple_logs()`, but with lots of special cases
    for task attempt logs
    """
    # (step_num, task_type, task_num, stream) combos already handled;
    # only the first matching log for each task is examined
    seen = set()

    for log_path in log_paths:
        match = _TASK_LOG_PATH_RE.match(log_path)
        if match is None:
            continue

        gd = match.groupdict()
        key = (gd.get('step_num'), gd.get('task_type'),
               gd.get('task_num'), gd.get('stream'))
        if key in seen:
            continue
        seen.add(key)

        # Python tracebacks should win in a single file, but Java tracebacks
        # should win for later attempts
        if log_path.endswith('stderr'):
            error_lines = _parsed_error(fs, log_path, find_python_traceback)
            if not error_lines:
                error_lines = _parsed_error(
                    fs, log_path, find_hadoop_java_stack_trace)
        else:
            error_lines = _parsed_error(
                fs, log_path, find_hadoop_java_stack_trace)

        if error_lines:
            return dict(
                lines=error_lines,
                log_file_uri=log_path,
                input_uri=_scan_for_input_uri(log_path, fs),
            )
def test_task_re_on_3_x_ami(self):
    # 3.x AMIs use YARN-style container paths (no attempt_num/task_type)
    uri = 's3://mrjob-35cdec11663cb1cb/tmp/logs/j-21QKHYM5WJJHS/task-attempts/application_1441057410014_0001/container_1441057410014_0001_01_000004/stderr.gz'  # noqa
    m = _TASK_LOG_PATH_RE.match(uri)
    self.assertTrue(m)

    expected_groups = dict(
        timestamp='1441057410014',
        step_num='0001',
        task_type=None,
        yarn_attempt_num='01',
        task_num='000004',
        attempt_num=None,
        stream='stderr',
    )
    for group_name, expected in expected_groups.items():
        self.assertEqual(m.group(group_name), expected)
def test_task_re_on_2_x_ami(self):
    # 2.x AMIs use pre-YARN attempt paths (no yarn_attempt_num)
    uri = 's3://mrjob-35cdec11663cb1cb/tmp/logs/j-3J3Y9EBUUBRFW/task-attempts/job_201508312315_0002/attempt_201508312315_0002_m_000000_0/syslog'  # noqa
    m = _TASK_LOG_PATH_RE.match(uri)
    self.assertTrue(m)

    expected_groups = dict(
        timestamp='201508312315',
        step_num='0002',
        task_type='m',
        yarn_attempt_num=None,
        task_num='000000',
        attempt_num='0',
        stream='syslog',
    )
    for group_name, expected in expected_groups.items():
        self.assertEqual(m.group(group_name), expected)
def _parse_task_attempts(fs, log_paths):
    """Like :py:func:`_parse_simple_logs()`, but with lots of special cases
    for task attempt logs
    """
    # fields identifying a task; later logs for the same task are skipped
    key_fields = ('step_num', 'task_type', 'task_num', 'stream')
    handled = set()

    for uri in log_paths:
        m = _TASK_LOG_PATH_RE.match(uri)
        if not m:
            continue

        groups = m.groupdict()
        task_key = tuple(groups.get(field) for field in key_fields)
        if task_key in handled:
            continue
        handled.add(task_key)

        # Python tracebacks should win in a single file, but Java tracebacks
        # should win for later attempts
        if uri.endswith('stderr'):
            lines = (_parsed_error(fs, uri, find_python_traceback) or
                     _parsed_error(fs, uri, find_hadoop_java_stack_trace))
        else:
            lines = _parsed_error(fs, uri, find_hadoop_java_stack_trace)

        if lines:
            return {
                'lines': lines,
                'log_file_uri': uri,
                'input_uri': _scan_for_input_uri(uri, fs),
            }