Exemplo n.º 1
0
def _parse_task_attempts(fs, log_paths):
    """Like :py:func:`_parse_simple_logs()`, but with lots of special cases for
    task attempt logs.

    Walks *log_paths* in order and stops at the first log that yields an
    error.

    :param fs: filesystem object; passed through to ``_parsed_error()`` and
               ``_scan_for_input_uri()``
    :param log_paths: iterable of task attempt log paths/URIs. Paths that
                      don't match ``_TASK_LOG_PATH_RE`` are ignored, and only
                      the first path seen for each (step, task type, task,
                      stream) combination is parsed -- presumably callers
                      list the preferred attempt first (TODO confirm).

    :return: a dict with the keys ``lines``, ``log_file_uri`` and
             ``input_uri`` describing the first error found, or ``None`` if
             no log yields one.
    """
    tasks_seen = set()
    for path in log_paths:
        m = _TASK_LOG_PATH_RE.match(path)
        if not m:
            continue

        m_groups = m.groupdict()

        # skip subsequent logs for same task
        task_key = tuple(m_groups.get(k) for k in ["step_num", "task_type", "task_num", "stream"])
        if task_key in tasks_seen:
            continue

        tasks_seen.add(task_key)

        # Compressed logs are named e.g. "stderr.gz", so checking only the
        # path suffix would miss them. The regex's "stream" group holds the
        # stream name without the extension; the suffix check is kept as a
        # fallback so every path that used to count as stderr still does.
        is_stderr = m_groups.get("stream") == "stderr" or path.endswith("stderr")

        # Python tracebacks should win in a single file, but Java tracebacks
        # should win for later attempts
        if is_stderr:
            lines = _parsed_error(fs, path, find_python_traceback) or _parsed_error(
                fs, path, find_hadoop_java_stack_trace
            )
        else:
            lines = _parsed_error(fs, path, find_hadoop_java_stack_trace)

        if lines:
            input_uri = _scan_for_input_uri(path, fs)

            return {"lines": lines, "log_file_uri": path, "input_uri": input_uri}

    # no log contained a recognizable error
    return None
Exemplo n.º 2
0
    def test_task_re_on_3_x_ami(self):
        # A container-style log URI (compressed stderr) should match, and
        # each named group should capture the value listed below. Groups
        # that don't apply to this path layout come back as None.
        log_uri = 's3://mrjob-35cdec11663cb1cb/tmp/logs/j-21QKHYM5WJJHS/task-attempts/application_1441057410014_0001/container_1441057410014_0001_01_000004/stderr.gz'  # noqa

        match = _TASK_LOG_PATH_RE.match(log_uri)
        self.assertTrue(match)

        # (group name, expected capture), checked in this order
        expected_groups = (
            ('timestamp', '1441057410014'),
            ('step_num', '0001'),
            ('task_type', None),
            ('yarn_attempt_num', '01'),
            ('task_num', '000004'),
            ('attempt_num', None),
            ('stream', 'stderr'),
        )
        for group_name, expected in expected_groups:
            self.assertEqual(match.group(group_name), expected)
Exemplo n.º 3
0
    def test_task_re_on_2_x_ami(self):
        # An attempt-style log URI (uncompressed syslog) should match, and
        # each named group should capture the value listed below. Groups
        # that don't apply to this path layout come back as None.
        log_uri = 's3://mrjob-35cdec11663cb1cb/tmp/logs/j-3J3Y9EBUUBRFW/task-attempts/job_201508312315_0002/attempt_201508312315_0002_m_000000_0/syslog'  # noqa

        match = _TASK_LOG_PATH_RE.match(log_uri)
        self.assertTrue(match)

        # (group name, expected capture), checked in this order
        expected_groups = (
            ('timestamp', '201508312315'),
            ('step_num', '0002'),
            ('task_type', 'm'),
            ('yarn_attempt_num', None),
            ('task_num', '000000'),
            ('attempt_num', '0'),
            ('stream', 'syslog'),
        )
        for group_name, expected in expected_groups:
            self.assertEqual(match.group(group_name), expected)
Exemplo n.º 4
0
    def test_task_re_on_3_x_ami(self):
        # Match a container-style task log URI and verify what each named
        # group of _TASK_LOG_PATH_RE captures from it.
        container_log_uri = 's3://mrjob-35cdec11663cb1cb/tmp/logs/j-21QKHYM5WJJHS/task-attempts/application_1441057410014_0001/container_1441057410014_0001_01_000004/stderr.gz'  # noqa

        result = _TASK_LOG_PATH_RE.match(container_log_uri)

        self.assertTrue(result)

        # only reached when there is a match, so binding .group is safe
        group = result.group

        # identifiers shared by both path layouts
        self.assertEqual(group('timestamp'), '1441057410014')
        self.assertEqual(group('step_num'), '0001')
        # this layout has no task type; its attempt number is captured
        # as yarn_attempt_num instead of attempt_num
        self.assertEqual(group('task_type'), None)
        self.assertEqual(group('yarn_attempt_num'), '01')
        self.assertEqual(group('task_num'), '000004')
        self.assertEqual(group('attempt_num'), None)
        # the ".gz" suffix is not part of the stream name
        self.assertEqual(group('stream'), 'stderr')
Exemplo n.º 5
0
    def test_task_re_on_2_x_ami(self):
        # Match an attempt-style task log URI and verify what each named
        # group of _TASK_LOG_PATH_RE captures from it.
        attempt_log_uri = 's3://mrjob-35cdec11663cb1cb/tmp/logs/j-3J3Y9EBUUBRFW/task-attempts/job_201508312315_0002/attempt_201508312315_0002_m_000000_0/syslog'  # noqa

        result = _TASK_LOG_PATH_RE.match(attempt_log_uri)

        self.assertTrue(result)

        # only reached when there is a match, so binding .group is safe
        group = result.group

        # identifiers shared by both path layouts
        self.assertEqual(group('timestamp'), '201508312315')
        self.assertEqual(group('step_num'), '0002')
        # this layout carries a task type and a plain attempt number,
        # and leaves yarn_attempt_num unset
        self.assertEqual(group('task_type'), 'm')
        self.assertEqual(group('yarn_attempt_num'), None)
        self.assertEqual(group('task_num'), '000000')
        self.assertEqual(group('attempt_num'), '0')
        self.assertEqual(group('stream'), 'syslog')
Exemplo n.º 6
0
def _parse_task_attempts(fs, log_paths):
    """Like :py:func:`_parse_simple_logs()`, but with lots of special cases for
    task attempt logs.

    Walks *log_paths* in order and stops at the first log that yields an
    error.

    :param fs: filesystem object; passed through to ``_parsed_error()`` and
               ``_scan_for_input_uri()``
    :param log_paths: iterable of task attempt log paths/URIs. Paths that
                      don't match ``_TASK_LOG_PATH_RE`` are ignored, and only
                      the first path seen for each (step, task type, task,
                      stream) combination is parsed -- presumably callers
                      list the preferred attempt first (TODO confirm).

    :return: a dict with the keys ``lines``, ``log_file_uri`` and
             ``input_uri`` describing the first error found, or ``None`` if
             no log yields one.
    """
    tasks_seen = set()
    for path in log_paths:
        m = _TASK_LOG_PATH_RE.match(path)
        if not m:
            continue

        m_groups = m.groupdict()

        # skip subsequent logs for same task
        task_key = tuple(
            m_groups.get(k)
            for k in ['step_num', 'task_type', 'task_num', 'stream'])
        if task_key in tasks_seen:
            continue

        tasks_seen.add(task_key)

        # Compressed logs are named e.g. 'stderr.gz', so checking only the
        # path suffix would miss them. The regex's 'stream' group holds the
        # stream name without the extension; the suffix check is kept as a
        # fallback so every path that used to count as stderr still does.
        is_stderr = (m_groups.get('stream') == 'stderr'
                     or path.endswith('stderr'))

        # Python tracebacks should win in a single file, but Java tracebacks
        # should win for later attempts
        if is_stderr:
            lines = (_parsed_error(fs, path, find_python_traceback)
                     or _parsed_error(fs, path, find_hadoop_java_stack_trace))
        else:
            lines = _parsed_error(fs, path, find_hadoop_java_stack_trace)

        if lines:
            input_uri = _scan_for_input_uri(path, fs)

            return {
                'lines': lines,
                'log_file_uri': path,
                'input_uri': input_uri,
            }

    # no log contained a recognizable error
    return None