def test_read_logs_from_at_most_one_dir(self):
    """Syslogs should be listed from a single log dir, never merged."""
    local_syslog = (
        '/log/dir/userlogs/application_1450486922681_0004'
        '/container_1450486922681_0005_01_000003/syslog')
    hdfs_syslog = (
        'hdfs:///output/_logs/userlogs/application_1450486922681_0004'
        '/container_1450486922681_0005_01_000003/syslog')
    log_dirs = ['hdfs:///output/_logs', '/log/dir']

    # at first, only the second log dir contains a syslog
    self.mock_paths = [local_syslog]
    found = _ls_yarn_task_syslogs(self.mock_fs, list(log_dirs))
    self.assertEqual(found, [local_syslog])

    # once the first log dir has a syslog as well, only that one
    # is expected back
    self.mock_paths.append(hdfs_syslog)
    found = _ls_yarn_task_syslogs(self.mock_fs, list(log_dirs))
    self.assertEqual(found, [hdfs_syslog])
def test_filter_and_sort(self):
    """Check syslog filtering, ordering, app ID filter, and subdirs."""
    app_0004_container_3_syslog = (
        '/log/dir/userlogs/application_1450486922681_0004'
        '/container_1450486922681_0005_01_000003/syslog')
    app_0005_container_4_syslog = (
        '/log/dir/userlogs/application_1450486922681_0005'
        '/container_1450486922681_0005_01_000004/syslog')
    app_0005_container_3_syslog = (
        '/log/dir/userlogs/application_1450486922681_0005'
        '/container_1450486922681_0005_01_000003/syslog')
    app_0005_container_3_stderr = (
        '/log/dir/userlogs/application_1450486922681_0005'
        '/container_1450486922681_0005_01_000003/stderr')

    self.mock_paths = [
        app_0004_container_3_syslog,
        app_0005_container_4_syslog,
        app_0005_container_3_syslog,
        app_0005_container_3_stderr,
        '/log/dir/random-crud',
    ]

    # should be sorted in reverse order by app and container ID
    # (the stderr file and the unrelated path are not expected back)
    everything = _ls_yarn_task_syslogs(self.mock_fs, ['/log/dir'])
    self.assertEqual(
        everything,
        [
            app_0005_container_4_syslog,
            app_0005_container_3_syslog,
            app_0004_container_3_syslog,
        ])

    # test filter by application ID
    only_app_0004 = _ls_yarn_task_syslogs(
        self.mock_fs, ['/log/dir'],
        application_id='application_1450486922681_0004')
    self.assertEqual(only_app_0004, [app_0004_container_3_syslog])

    # test subdir
    from_subdir = _ls_yarn_task_syslogs(
        self.mock_fs,
        ['/log/dir/userlogs/application_1450486922681_0005'])
    self.assertEqual(
        from_subdir,
        [app_0005_container_4_syslog, app_0005_container_3_syslog])
def test_filter_and_sort(self):
    """Listing keeps only syslogs, newest app/container first."""
    def ls_syslogs(log_dirs, **kwargs):
        # thin wrapper so each assertion below reads as one line
        return _ls_yarn_task_syslogs(self.mock_fs, log_dirs, **kwargs)

    older_app_syslog = (
        '/log/dir/userlogs/application_1450486922681_0004'
        '/container_1450486922681_0005_01_000003/syslog')
    newer_app_syslog_4 = (
        '/log/dir/userlogs/application_1450486922681_0005'
        '/container_1450486922681_0005_01_000004/syslog')
    newer_app_syslog_3 = (
        '/log/dir/userlogs/application_1450486922681_0005'
        '/container_1450486922681_0005_01_000003/syslog')
    newer_app_stderr_3 = (
        '/log/dir/userlogs/application_1450486922681_0005'
        '/container_1450486922681_0005_01_000003/stderr')
    unrelated_path = '/log/dir/random-crud'

    self.mock_paths = [
        older_app_syslog,
        newer_app_syslog_4,
        newer_app_syslog_3,
        newer_app_stderr_3,
        unrelated_path,
    ]

    # should be sorted in reverse order by app and container ID
    self.assertEqual(
        ls_syslogs(['/log/dir']),
        [newer_app_syslog_4, newer_app_syslog_3, older_app_syslog])

    # test filter by application ID
    self.assertEqual(
        ls_syslogs(['/log/dir'],
                   application_id='application_1450486922681_0004'),
        [older_app_syslog])

    # test subdir
    self.assertEqual(
        ls_syslogs(['/log/dir/userlogs/application_1450486922681_0005']),
        [newer_app_syslog_4, newer_app_syslog_3])
def test_read_logs_from_at_most_one_dir(self):
    """Results should come from one log dir even if several match."""
    def ls_syslogs():
        # always search the same two dirs, in the same order
        return _ls_yarn_task_syslogs(
            self.mock_fs, ['hdfs:///output/_logs', '/log/dir'])

    syslog_in_local_dir = (
        '/log/dir/userlogs/application_1450486922681_0004'
        '/container_1450486922681_0005_01_000003/syslog')
    syslog_in_hdfs_dir = (
        'hdfs:///output/_logs/userlogs/application_1450486922681_0004'
        '/container_1450486922681_0005_01_000003/syslog')

    # with nothing under the HDFS dir, the local syslog is returned
    self.mock_paths = [syslog_in_local_dir]
    self.assertEqual(ls_syslogs(), [syslog_in_local_dir])

    # with a syslog under both dirs, only the HDFS one is expected
    self.mock_paths.append(syslog_in_hdfs_dir)
    self.assertEqual(ls_syslogs(), [syslog_in_hdfs_dir])
def _find_error_in_task_logs(fs, log_dirs_stream, hadoop_version,
                             application_id=None, job_id=None):
    """Given a filesystem and a stream of lists of log dirs to search in,
    find the last error and return details about it.

    Syslogs are examined in the order the listing helper returns them,
    and the first one that contains an error is reported.

    *hadoop_version* is required, as task logs have very different paths
    in YARN. In YARN, you must set *application_id*, and pre-YARN, you
    must set *job_id*, or we'll bail out and return None.

    Returns None if the required ID is missing, if no syslogs were found,
    or if none of the syslogs contained an error. Otherwise, returns a
    dictionary with the following keys ("optional" means that something
    may be None):

    syslog: dict with keys:
        path: path of syslog we found error in
        error: error details; dict with keys:
            exception: Java exception (as string)
            stack_trace: array of lines with Java stack trace
        split: optional input split we were reading; dict with keys:
            path: path of input file
            start_line: first line of split (0-indexed)
            num_lines: number of lines in split
    stderr: optional dict with keys:
        path: path of stderr corresponding to syslog
        error: optional error details; dict with keys:
            exception: string (Python exception)
            traceback: array of lines with Python stack trace
    type: always set to 'task'
    """
    yarn = uses_yarn(hadoop_version)

    # without the relevant ID, we can't pick out this job's task logs
    if yarn and application_id is None:
        return None
    if not yarn and job_id is None:
        return None

    syslog_paths = []

    # we assume that each set of log paths contains the same copies
    # of syslogs, so stop once we find any non-empty set of log dirs
    for log_dirs in log_dirs_stream:
        if yarn:
            syslog_paths = _ls_yarn_task_syslogs(
                fs, log_dirs, application_id=application_id)
        else:
            syslog_paths = _ls_pre_yarn_task_syslogs(
                fs, log_dirs, job_id=job_id)

        if syslog_paths:
            break

    for syslog_path in syslog_paths:
        # pass the path as a logger argument so the message is only
        # formatted when debug logging is actually enabled
        log.debug('Looking for error in %s', syslog_path)
        syslog_info = _parse_task_syslog(_cat_log(fs, syslog_path))

        if not syslog_info['error']:
            continue

        # found error! see if we can explain it

        # TODO: don't bother if error wasn't due to child process
        stderr_path = _stderr_for_syslog(syslog_path)
        stderr_info = _parse_python_task_stderr(_cat_log(fs, stderr_path))

        # output error info
        syslog_info['path'] = syslog_path
        stderr_info['path'] = stderr_path

        return dict(type='task', syslog=syslog_info, stderr=stderr_info)

    return None
def test_no_log_dirs(self):
    """With no log dirs to search, the listing should be empty."""
    found = _ls_yarn_task_syslogs(self.mock_fs, [])
    self.assertEqual(found, [])