def test_simple_log4j_parsing():
    res = parse_hadoop_log4j_records(SIMPLE_LOG4J)
    expected = [
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='SparkContext',
            message='Running Spark version 2.4.4',
            num_lines=1,
            start_line=0,
            thread=None,
            timestamp='20/02/05 17:26:43',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='SparkContext',
            message='Submitted application: blah',
            num_lines=1,
            start_line=1,
            thread=None,
            timestamp='20/02/05 17:26:43',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='SecurityManager',
            message='Changing view acls to: hadoop',
            num_lines=1,
            start_line=2,
            thread=None,
            timestamp='20/02/05 17:26:43',
        ),
    ]
    assert list(res) == expected
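
# NOTE: the SIMPLE_LOG4J fixture is not shown in this section. The sketch below is a
# hypothetical reconstruction, inferred only from the expected records above and the
# default Spark/YARN log4j layout ('yy/MM/dd HH:mm:ss LEVEL Logger: message'); the exact
# text of the real fixture is an assumption.
SIMPLE_LOG4J_SKETCH = '\n'.join(
    [
        '20/02/05 17:26:43 INFO SparkContext: Running Spark version 2.4.4',
        '20/02/05 17:26:43 INFO SparkContext: Submitted application: blah',
        '20/02/05 17:26:43 INFO SecurityManager: Changing view acls to: hadoop',
    ]
)
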
def _log_logs_from_s3(self, log, emr_step_id):
    '''Retrieves the logs from the remote PySpark process that EMR posted to S3 and
    logs them to the given log.'''
    stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
        log, self.cluster_id, emr_step_id
    )
    # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
    # Dagster's logging system.
    records = parse_hadoop_log4j_records(stderr_log)
    for record in records:
        log._log(  # pylint: disable=protected-access
            record.level, record.logger + ': ' + record.message, {}
        )
    log.info(stdout_log)
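
# NOTE: Log4jRecord is defined by the library, not in this section. Judging from the
# keyword arguments used in the tests and the record.level / record.logger /
# record.message accesses above, one plausible shape is the NamedTuple sketched below;
# the field types are assumptions inferred from the test values, not the actual
# library definition.
from typing import NamedTuple, Optional


class Log4jRecordSketch(NamedTuple):
    caller_location: str      # '' when the log4j pattern does not emit a caller location
    level: Optional[str]      # 'INFO', 'WARN', ...; None for lines with no parsed level
    logger: str               # e.g. 'SparkContext'
    message: str              # may span multiple raw lines, joined with newlines
    num_lines: int            # number of raw lines covered by this record
    start_line: int           # 0-based index of the record's first raw line
    thread: Optional[str]     # thread name, if the pattern includes one
    timestamp: Optional[str]  # e.g. '20/02/05 17:26:43'
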
def test_multiline_log4j_parsing():
    res = parse_hadoop_log4j_records(MULTILINE_LOG4J)
    expected = [
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='Client',
            message='Application report for application_1580918830280_0002 (state: ACCEPTED)',
            num_lines=1,
            start_line=0,
            thread=None,
            timestamp='20/02/05 17:26:50',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='Client',
            message='\n'.join(
                [
                    '',
                    ' client token: N/A',
                    ' diagnostics: AM container is launched, waiting for AM container to Register with RM',
                    ' ApplicationMaster host: N/A',
                    ' ApplicationMaster RPC port: -1',
                    ' queue: default',
                    ' start time: 1580923609467',
                    ' final status: UNDEFINED',
                    ' tracking URL: http://ip-172-31-2-74.us-west-1.compute.internal:20888/proxy/application_1580918830280_0002/',
                    ' user: hadoop',
                ]
            ),
            num_lines=10,
            start_line=1,
            thread=None,
            timestamp='20/02/05 17:26:50',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='Client',
            message='Application report for application_1580918830280_0002 (state: ACCEPTED)',
            num_lines=1,
            start_line=11,
            thread=None,
            timestamp='20/02/05 17:26:51',
        ),
    ]
    assert list(res) == expected
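
# NOTE: MULTILINE_LOG4J is likewise not shown here. The expected records above imply raw
# log text in which lines that do not start with a timestamp are folded into the previous
# record as continuation lines; a hypothetical reconstruction (exact whitespace is an
# assumption) might look like:
MULTILINE_LOG4J_SKETCH = '\n'.join(
    [
        '20/02/05 17:26:50 INFO Client: Application report for application_1580918830280_0002 (state: ACCEPTED)',
        '20/02/05 17:26:50 INFO Client: ',
        ' client token: N/A',
        ' diagnostics: AM container is launched, waiting for AM container to Register with RM',
        ' ApplicationMaster host: N/A',
        ' ApplicationMaster RPC port: -1',
        ' queue: default',
        ' start time: 1580923609467',
        ' final status: UNDEFINED',
        ' tracking URL: http://ip-172-31-2-74.us-west-1.compute.internal:20888/proxy/application_1580918830280_0002/',
        ' user: hadoop',
        '20/02/05 17:26:51 INFO Client: Application report for application_1580918830280_0002 (state: ACCEPTED)',
    ]
)
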
def new_compute_fn(context, *args, **kwargs):  # pylint: disable=unused-argument
    self._sync_code_to_s3(context, solid_name)
    step_defs = self._get_execute_steps(context, solid_name)
    step_ids = self.emr_job_runner.add_job_flow_steps(
        context, self.config['cluster_id'], step_defs
    )
    self.emr_job_runner.wait_for_steps_to_complete(
        context, self.config['cluster_id'], step_ids
    )
    if self.config['wait_for_logs']:
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            context, self.config['cluster_id'], step_ids[1]
        )
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            context.log._log(  # pylint: disable=protected-access
                record.level, record.logger + ': ' + record.message, {}
            )
        context.log.info(stdout_log)
def _log_logs_from_s3(self, log, emr_step_id):
    """Retrieves the logs from the remote PySpark process that EMR posted to S3 and
    logs them to the given log."""
    stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
        log, self.cluster_id, emr_step_id
    )
    # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
    # Dagster's logging system.
    records = parse_hadoop_log4j_records(stderr_log)
    for record in records:
        if record.level:
            log._log(  # pylint: disable=protected-access
                record.level,
                "".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),
                {},
            )
        else:
            log.debug(f"Spark Driver stderr: {record.message}")
    log.info("Spark Driver stdout: " + stdout_log)
import sys


def _log_logs_from_s3(self, log, emr_step_id):
    """Retrieves the logs from the remote PySpark process that EMR posted to S3 and
    logs them to the given log."""
    stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
        log, self.cluster_id, emr_step_id
    )
    # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
    # Dagster's logging system.
    records = parse_hadoop_log4j_records(stderr_log)
    for record in records:
        if record.level:
            log.log(
                level=record.level,
                msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),
            )
        else:
            log.debug(f"Spark Driver stderr: {record.message}")
    sys.stdout.write(
        "---------- Spark Driver stdout: ----------\n"
        + stdout_log
        + "\n"
        + "---------- End of Spark Driver stdout ----------\n"
    )
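
# NOTE: parse_hadoop_log4j_records is provided by the library; the sketch below is NOT its
# implementation. It is a minimal illustration, under the assumptions above, of the
# behaviour the tests exercise: a line matching the default Spark/YARN log4j layout starts
# a new record, and any non-matching line is folded into the record in progress as a
# continuation of its message. It reuses the hypothetical Log4jRecordSketch defined
# earlier in this section.
import re

_LOG4J_LINE = re.compile(
    r'^(?P<timestamp>\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) '
    r'(?P<level>[A-Z]+) (?P<logger>\S+): (?P<message>.*)$'
)


def parse_log4j_records_sketch(text):
    '''Yield Log4jRecordSketch objects for raw Log4J text, folding continuation lines.'''
    current = None
    for line_num, line in enumerate(text.splitlines()):
        match = _LOG4J_LINE.match(line)
        if match:
            if current is not None:
                yield current
            current = Log4jRecordSketch(
                caller_location='',
                level=match.group('level'),
                logger=match.group('logger'),
                message=match.group('message'),
                num_lines=1,
                start_line=line_num,
                thread=None,
                timestamp=match.group('timestamp'),
            )
        elif current is not None:
            # A line without a timestamp prefix continues the previous record's message.
            current = current._replace(
                message=current.message + '\n' + line,
                num_lines=current.num_lines + 1,
            )
    if current is not None:
        yield current
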