def test_single_line_error(self):
    self.assertEqual(
        _parse_spark_log(_SINGLE_LINE_ERROR.split('\n')),
        dict(errors=[
            dict(spark_error=dict(
                message=_SINGLE_LINE_ERROR[49:],
                start_line=0,
                num_lines=1,
            ))
        ]))

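# The slice offsets used in these tests (e.g. _SINGLE_LINE_ERROR[49:],
# _MULTI_LINE_ERROR[37:]) presumably skip the log4j prefix (timestamp,
# level, and logger name) of each fixture, so that only the message text
# itself is compared. A purely hypothetical fixture consistent with a
# 49-character prefix (not the real test data) might look like:
#
#   _SINGLE_LINE_ERROR = (
#       '19/09/16 22:39:48 ERROR TransportRequestHandler: Error sending'
#       ' result to /172.31.30.71:36254; closing connection')
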
def test_multi_line_error(self):
    self.assertEqual(
        _parse_spark_log(_MULTI_LINE_ERROR.split('\n')),
        dict(errors=[
            dict(spark_error=dict(
                message=_MULTI_LINE_ERROR[37:],
                start_line=0,
                num_lines=10,
            ))
        ]))

def test_multi_line_warning(self):
    # on the local-cluster master, Python tracebacks are only available
    # from warnings, not errors
    self.assertEqual(
        _parse_spark_log(_MULTI_LINE_WARNING.split('\n')),
        dict(errors=[
            dict(spark_error=dict(
                message=_MULTI_LINE_WARNING[180:],
                start_line=1,
                num_lines=13,
            ))
        ]))

def test_multiple_errors(self):
    ERRORS = '\n'.join(
        [_SINGLE_LINE_ERROR, _MULTI_LINE_ERROR, _MULTI_LINE_WARNING])

    self.assertEqual(
        _parse_spark_log(ERRORS.split('\n')),
        dict(errors=[
            dict(spark_error=dict(
                message=_SINGLE_LINE_ERROR[49:],
                start_line=0,
                num_lines=1,
            )),
            dict(spark_error=dict(
                message=_MULTI_LINE_ERROR[37:],
                start_line=1,
                num_lines=10,
            )),
            dict(spark_error=dict(
                message=_MULTI_LINE_WARNING[180:],
                start_line=12,
                num_lines=13,
            ))
        ]))

def _run_spark_submit(self, spark_submit_args, env, record_callback):
    """Run the spark-submit binary in a subprocess, using a PTY if possible.

    :param spark_submit_args: spark-submit binary and arguments, as a list
    :param env: environment variables, as a dict
    :param record_callback: a function that takes a single log4j record
                            as its argument (see
                            :py:func:`~mrjob.logs.log4j\
                            ._parse_hadoop_log4j_records`)

    :return: tuple of the subprocess's return code and a step
             interpretation dictionary
    """
    log.debug('> %s' % cmd_line(spark_submit_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    # these should always be set, but just in case
    returncode = 0
    step_interpretation = {}

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(
            spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

        # parse driver output
        step_interpretation = _parse_spark_log(
            step_proc.stderr, record_callback=record_callback)

        # there shouldn't be much output on STDOUT, just echo it
        for record in _parse_hadoop_log4j_records(step_proc.stdout):
            record_callback(record)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:
            # we are the child process
            try:
                os.execvpe(spark_submit_args[0], spark_submit_args, env)
                # now this process is no longer Python
            except OSError as ex:
                # use _exit() so we don't do cleanup, etc. that's
                # the parent process's job
                os._exit(ex.errno)
            finally:
                # if we get some other exception, still exit hard
                os._exit(-1)
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                step_interpretation = _parse_spark_log(
                    _eio_to_eof(master),
                    record_callback=record_callback)

                _, returncode = os.waitpid(pid, 0)

    return (returncode, step_interpretation)

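# _eio_to_eof() works around the fact that on Linux, reading from the master
# side of a PTY after the child process has exited raises OSError with errno
# EIO rather than returning a clean EOF. A minimal sketch of such a wrapper
# (an illustration of the idea, not necessarily mrjob's actual
# implementation) might look like:
#
#   def _eio_to_eof(f):
#       """Yield lines from *f*, treating EIO as end-of-file."""
#       try:
#           for line in f:
#               yield line
#       except OSError as ex:
#           if ex.errno != errno.EIO:
#               raise
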
def test_ignore_single_line_warning(self):
    # single-line warnings can be all sorts of irrelevant things
    self.assertEqual(
        _parse_spark_log(_SINGLE_LINE_WARNING.split('\n')), {})

def test_empty(self):
    self.assertEqual(_parse_spark_log([]), {})

def test_application_id(self):
    self.assertEqual(
        _parse_spark_log(_APPLICATION_ID_LINE.split('\n')),
        dict(application_id='application_1568415025507_0001'))
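
# _APPLICATION_ID_LINE is expected to be a line of driver output that
# mentions the YARN application ID. A hypothetical line (not the real
# fixture) might look like:
#
#   _APPLICATION_ID_LINE = (
#       '19/09/13 22:10:26 INFO Client: Submitted application'
#       ' application_1568415025507_0001')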