Example #1
    def test_yarn_error(self):
        lines = [
            '2015-12-21 14:06:18,538 WARN [main]'
            ' org.apache.hadoop.mapred.YarnChild: Exception running child'
            ' : java.lang.RuntimeException: PipeMapRed.waitOutputThreads():'
            ' subprocess failed with code 1\n',
            '        at org.apache.hadoop.streaming.PipeMapRed'
            '.waitOutputThreads(PipeMapRed.java:322)\n',
            '        at org.apache.hadoop.streaming.PipeMapRed'
            '.mapRedFinished(PipeMapRed.java:535)\n',
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(split=None,
                 error=dict(
                     exception=('java.lang.RuntimeException:'
                                ' PipeMapRed.waitOutputThreads():'
                                ' subprocess failed with code 1'),
                     stack_trace=[
                         '        at org.apache.hadoop.streaming.PipeMapRed'
                         '.waitOutputThreads(PipeMapRed.java:322)',
                         '        at org.apache.hadoop.streaming.PipeMapRed'
                         '.mapRedFinished(PipeMapRed.java:535)',
                     ])))
Example #2
    def test_yarn_error(self):
        lines = [
            "2015-12-21 14:06:18,538 WARN [main]"
            " org.apache.hadoop.mapred.YarnChild: Exception running child"
            " : java.lang.RuntimeException: PipeMapRed.waitOutputThreads():"
            " subprocess failed with code 1\n",
            "        at org.apache.hadoop.streaming.PipeMapRed" ".waitOutputThreads(PipeMapRed.java:322)\n",
            "        at org.apache.hadoop.streaming.PipeMapRed" ".mapRedFinished(PipeMapRed.java:535)\n",
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(
                split=None,
                error=dict(
                    exception=(
                        "java.lang.RuntimeException:"
                        " PipeMapRed.waitOutputThreads():"
                        " subprocess failed with code 1"
                    ),
                    stack_trace=[
                        "        at org.apache.hadoop.streaming.PipeMapRed" ".waitOutputThreads(PipeMapRed.java:322)",
                        "        at org.apache.hadoop.streaming.PipeMapRed" ".mapRedFinished(PipeMapRed.java:535)",
                    ],
                ),
            ),
        )
Example #3
    def test_opening_file(self):
        lines = [
            '2010-07-27 17:54:54,344 INFO'
            ' org.apache.hadoop.fs.s3native.NativeS3FileSystem (main):'
            " Opening 's3://yourbucket/logs/2010/07/23/log2-00077.gz'"
            ' for reading\n'
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(error=None, split=dict(
                path='s3://yourbucket/logs/2010/07/23/log2-00077.gz',
                start_line=None,
                num_lines=None)))
Example #4
File: test_parse.py Project: sebratt/mrjob
    def test_opening_file(self):
        lines = [
            '2010-07-27 17:54:54,344 INFO'
            ' org.apache.hadoop.fs.s3native.NativeS3FileSystem (main):'
            " Opening 's3://yourbucket/logs/2010/07/23/log2-00077.gz'"
            ' for reading\n'
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(error=None, split=dict(
                path='s3://yourbucket/logs/2010/07/23/log2-00077.gz',
                start_line=None,
                num_lines=None)))
Example #5
    def test_split(self):
        lines = [
            '2015-12-21 14:06:17,707 INFO [main]'
            ' org.apache.hadoop.mapred.MapTask: Processing split:'
            ' hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
            '/mr_boom.root.20151221.190511.059097/files/bootstrap.sh:0+335\n',
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(error=None, split=dict(
                path=('hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
                     '/mr_boom.root.20151221.190511.059097/files'
                     '/bootstrap.sh'),
                start_line=0,
                num_lines=335)))
Example #6
File: test_parse.py Project: sebratt/mrjob
    def test_split(self):
        lines = [
            '2015-12-21 14:06:17,707 INFO [main]'
            ' org.apache.hadoop.mapred.MapTask: Processing split:'
            ' hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
            '/mr_boom.root.20151221.190511.059097/files/bootstrap.sh:0+335\n',
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(error=None, split=dict(
                path=('hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
                     '/mr_boom.root.20151221.190511.059097/files'
                     '/bootstrap.sh'),
                start_line=0,
                num_lines=335)))
Example #7
    def test_pre_yarn_error(self):
        lines = [
            '2015-12-30 19:21:39,980 WARN'
            ' org.apache.hadoop.mapred.Child (main): Error running child\n',
            'java.lang.RuntimeException: PipeMapRed.waitOutputThreads():'
            ' subprocess failed with code 1\n',
            '        at org.apache.hadoop.streaming.PipeMapRed'
            '.waitOutputThreads(PipeMapRed.java:372)\n',
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(split=None, error=dict(
                exception=('java.lang.RuntimeException:'
                           ' PipeMapRed.waitOutputThreads():'
                           ' subprocess failed with code 1'),
                stack_trace=[
                    '        at org.apache.hadoop.streaming.PipeMapRed'
                    '.waitOutputThreads(PipeMapRed.java:372)',
                ])))
Example #8
File: test_parse.py Project: sebratt/mrjob
    def test_pre_yarn_error(self):
        lines = [
            '2015-12-30 19:21:39,980 WARN'
            ' org.apache.hadoop.mapred.Child (main): Error running child\n',
            'java.lang.RuntimeException: PipeMapRed.waitOutputThreads():'
            ' subprocess failed with code 1\n',
            '        at org.apache.hadoop.streaming.PipeMapRed'
            '.waitOutputThreads(PipeMapRed.java:372)\n',
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(split=None, error=dict(
                exception=('java.lang.RuntimeException:'
                           ' PipeMapRed.waitOutputThreads():'
                           ' subprocess failed with code 1'),
                stack_trace=[
                    '        at org.apache.hadoop.streaming.PipeMapRed'
                    '.waitOutputThreads(PipeMapRed.java:372)',
                ])))
Example #9
    def test_pre_yarn_error(self):
        lines = [
            "2015-12-30 19:21:39,980 WARN" " org.apache.hadoop.mapred.Child (main): Error running child\n",
            "java.lang.RuntimeException: PipeMapRed.waitOutputThreads():" " subprocess failed with code 1\n",
            "        at org.apache.hadoop.streaming.PipeMapRed" ".waitOutputThreads(PipeMapRed.java:372)\n",
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(
                split=None,
                error=dict(
                    exception=(
                        "java.lang.RuntimeException:"
                        " PipeMapRed.waitOutputThreads():"
                        " subprocess failed with code 1"
                    ),
                    stack_trace=[
                        "        at org.apache.hadoop.streaming.PipeMapRed" ".waitOutputThreads(PipeMapRed.java:372)"
                    ],
                ),
            ),
        )
Example #10
File: test_parse.py Project: sebratt/mrjob
    def test_yarn_error(self):
        lines = [
            '2015-12-21 14:06:18,538 WARN [main]'
            ' org.apache.hadoop.mapred.YarnChild: Exception running child'
            ' : java.lang.RuntimeException: PipeMapRed.waitOutputThreads():'
            ' subprocess failed with code 1\n',
            '        at org.apache.hadoop.streaming.PipeMapRed'
            '.waitOutputThreads(PipeMapRed.java:322)\n',
            '        at org.apache.hadoop.streaming.PipeMapRed'
            '.mapRedFinished(PipeMapRed.java:535)\n',
        ]

        self.assertEqual(
            _parse_task_syslog(lines),
            dict(split=None, error=dict(
                exception=('java.lang.RuntimeException:'
                           ' PipeMapRed.waitOutputThreads():'
                           ' subprocess failed with code 1'),
                stack_trace=[
                    '        at org.apache.hadoop.streaming.PipeMapRed'
                    '.waitOutputThreads(PipeMapRed.java:322)',
                    '        at org.apache.hadoop.streaming.PipeMapRed'
                    '.mapRedFinished(PipeMapRed.java:535)',
                ])))
Example #11
    def test_empty(self):
        self.assertEqual(_parse_task_syslog([]), dict(error=None, split=None))
Example #12
def _find_error_in_task_logs(fs,
                             log_dirs_stream,
                             hadoop_version,
                             application_id=None,
                             job_id=None):
    """Given a filesystem and a stream of lists of log dirs to search in,
    find the last error and return details about it. *hadoop_version*
    is required, as task logs have very different paths in YARN.

    In YARN, you must set *application_id*, and pre-YARN, you must set
    *job_id*, or we'll bail out and return None.

    Returns a dictionary with the following keys ("optional" means
    that something may be None):

    syslog: dict with keys:
       path: path of syslog we found error in
       error: error details; dict with keys:
           exception: Java exception (as string)
           stack_trace: array of lines with Java stack trace
       split: optional input split we were reading; dict with keys:
           path: path of input file
           start_line: first line of split (0-indexed)
           num_lines: number of lines in split
    stderr: optional dict with keys:
       path: path of stderr corresponding to syslog
       error: optional error details; dict with keys:
           exception: string  (Python exception)
           traceback: array of lines with Python stack trace
    type: always set to 'task'
    """
    syslog_paths = []

    yarn = uses_yarn(hadoop_version)

    if ((yarn and application_id is None) or (not yarn and job_id is None)):
        return None

    # we assume that each set of log paths contains the same copies
    # of syslogs, so stop once we find any non-empty set of log dirs
    for log_dirs in log_dirs_stream:
        if yarn:
            syslog_paths = _ls_yarn_task_syslogs(fs,
                                                 log_dirs,
                                                 application_id=application_id)
        else:
            syslog_paths = _ls_pre_yarn_task_syslogs(fs,
                                                     log_dirs,
                                                     job_id=job_id)

        if syslog_paths:
            break

    for syslog_path in syslog_paths:
        log.debug('Looking for error in %s' % syslog_path)
        syslog_info = _parse_task_syslog(_cat_log(fs, syslog_path))

        if not syslog_info['error']:
            continue

        # found error! see if we can explain it

        # TODO: don't bother if error wasn't due to child process
        stderr_path = _stderr_for_syslog(syslog_path)

        stderr_info = _parse_python_task_stderr(_cat_log(fs, stderr_path))

        # output error info
        syslog_info['path'] = syslog_path
        stderr_info['path'] = stderr_path

        return dict(type='task', syslog=syslog_info, stderr=stderr_info)

    return None
Example #13
def _find_error_in_task_logs(fs, log_dirs_stream, hadoop_version,
                             application_id=None, job_id=None):
    """Given a filesystem and a stream of lists of log dirs to search in,
    find the last error and return details about it. *hadoop_version*
    is required, as task logs have very different paths in YARN.

    In YARN, you must set *application_id*, and pre-YARN, you must set
    *job_id*, or we'll bail out and return None.

    Returns a dictionary with the following keys ("optional" means
    that something may be None):

    syslog: dict with keys:
       path: path of syslog we found error in
       error: error details; dict with keys:
           exception: Java exception (as string)
           stack_trace: array of lines with Java stack trace
       split: optional input split we were reading; dict with keys:
           path: path of input file
           start_line: first line of split (0-indexed)
           num_lines: number of lines in split
    stderr: optional dict with keys:
       path: path of stderr corresponding to syslog
       error: optional error details; dict with keys:
           exception: string  (Python exception)
           traceback: array of lines with Python stack trace
    type: always set to 'task'
    """
    syslog_paths = []

    yarn = uses_yarn(hadoop_version)

    if ((yarn and application_id is None) or (not yarn and job_id is None)):
        return None

    # we assume that each set of log paths contains the same copies
    # of syslogs, so stop once we find any non-empty set of log dirs
    for log_dirs in log_dirs_stream:
        if yarn:
            syslog_paths = _ls_yarn_task_syslogs(fs, log_dirs,
                                                 application_id=application_id)
        else:
            syslog_paths = _ls_pre_yarn_task_syslogs(fs, log_dirs,
                                                     job_id=job_id)

        if syslog_paths:
            break

    for syslog_path in syslog_paths:
        log.debug('Looking for error in %s' % syslog_path)
        syslog_info = _parse_task_syslog(_cat_log(fs, syslog_path))

        if not syslog_info['error']:
            continue

        # found error! see if we can explain it

        # TODO: don't bother if error wasn't due to child process
        stderr_path = _stderr_for_syslog(syslog_path)

        stderr_info = _parse_python_task_stderr(_cat_log(fs, stderr_path))

        # output error info
        syslog_info['path'] = syslog_path
        stderr_info['path'] = stderr_path

        return dict(type='task', syslog=syslog_info, stderr=stderr_info)

    return None
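
A minimal usage sketch, not part of the mrjob source, showing how the dictionary returned by _find_error_in_task_logs() might be consumed. The literal below is a hypothetical return value shaped according to the docstring above; the syslog/stderr paths are made-up placeholders, and the Java exception text is borrowed from the test cases earlier in this listing.

# A hypothetical result dict, shaped per the docstring of
# _find_error_in_task_logs(); the paths below are placeholders, not real logs.
error_info = dict(
    type='task',
    syslog=dict(
        path='<log dir>/userlogs/<attempt id>/syslog',  # placeholder path
        error=dict(
            exception=('java.lang.RuntimeException:'
                       ' PipeMapRed.waitOutputThreads():'
                       ' subprocess failed with code 1'),
            stack_trace=[
                '        at org.apache.hadoop.streaming.PipeMapRed'
                '.waitOutputThreads(PipeMapRed.java:322)',
            ],
        ),
        split=None,
    ),
    stderr=dict(
        path='<log dir>/userlogs/<attempt id>/stderr',  # placeholder path
        error=None,  # no Python traceback found in this hypothetical case
    ),
)

# _find_error_in_task_logs() returns None when no error is found, so check first
if error_info is not None:
    print('Java exception: %s' % error_info['syslog']['error']['exception'])
    if error_info['stderr'] and error_info['stderr']['error'] is not None:
        print('Python traceback:\n%s' %
              '\n'.join(error_info['stderr']['error']['traceback']))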