Example #1
def test_simple_log4j_parsing():
    res = parse_hadoop_log4j_records(SIMPLE_LOG4J)
    expected = [
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='SparkContext',
            message='Running Spark version 2.4.4',
            num_lines=1,
            start_line=0,
            thread=None,
            timestamp='20/02/05 17:26:43',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='SparkContext',
            message='Submitted application: blah',
            num_lines=1,
            start_line=1,
            thread=None,
            timestamp='20/02/05 17:26:43',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='SecurityManager',
            message='Changing view acls to: hadoop',
            num_lines=1,
            start_line=2,
            thread=None,
            timestamp='20/02/05 17:26:43',
        ),
    ]
    assert list(res) == expected
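For context, the SIMPLE_LOG4J fixture and the Log4jRecord type are not shown on this page; the definitions below are only a sketch reconstructed from the expected records above, assuming the standard Hadoop/YARN Log4j console format ("YY/MM/DD HH:MM:SS LEVEL Logger: message").

from collections import namedtuple

# Assumed shape of the record type: a namedtuple carrying the fields the tests compare on.
Log4jRecord = namedtuple(
    "Log4jRecord",
    "caller_location level logger message num_lines start_line thread timestamp",
)

# Assumed fixture: three single-line Log4j records matching the expected output above.
SIMPLE_LOG4J = """\
20/02/05 17:26:43 INFO SparkContext: Running Spark version 2.4.4
20/02/05 17:26:43 INFO SparkContext: Submitted application: blah
20/02/05 17:26:43 INFO SecurityManager: Changing view acls to: hadoop"""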
Example #2
def test_simple_log4j_parsing():
    res = parse_hadoop_log4j_records(SIMPLE_LOG4J)
    expected = [
        Log4jRecord(
            caller_location="",
            level="INFO",
            logger="SparkContext",
            message="Running Spark version 2.4.4",
            num_lines=1,
            start_line=0,
            thread=None,
            timestamp="20/02/05 17:26:43",
        ),
        Log4jRecord(
            caller_location="",
            level="INFO",
            logger="SparkContext",
            message="Submitted application: blah",
            num_lines=1,
            start_line=1,
            thread=None,
            timestamp="20/02/05 17:26:43",
        ),
        Log4jRecord(
            caller_location="",
            level="INFO",
            logger="SecurityManager",
            message="Changing view acls to: hadoop",
            num_lines=1,
            start_line=2,
            thread=None,
            timestamp="20/02/05 17:26:43",
        ),
    ]
    assert list(res) == expected
Example #3
def _log_logs_from_s3(self, log, emr_step_id):
    '''Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
    them to the given log.'''
    stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
        log, self.cluster_id, emr_step_id)
    # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
    # Dagster's logging system.
    records = parse_hadoop_log4j_records(stderr_log)
    for record in records:
        log._log(  # pylint: disable=protected-access
            record.level, record.logger + ': ' + record.message, {})
    log.info(stdout_log)
Example #4
def test_multiline_log4j_parsing():
    res = parse_hadoop_log4j_records(MULTILINE_LOG4J)

    expected = [
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='Client',
            message='Application report for application_1580918830280_0002 (state: ACCEPTED)',
            num_lines=1,
            start_line=0,
            thread=None,
            timestamp='20/02/05 17:26:50',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='Client',
            message='\n'.join(
                [
                    '',
                    '         client token: N/A',
                    '         diagnostics: AM container is launched, waiting for AM container to Register with RM',
                    '         ApplicationMaster host: N/A',
                    '         ApplicationMaster RPC port: -1',
                    '         queue: default',
                    '         start time: 1580923609467',
                    '         final status: UNDEFINED',
                    '         tracking URL: http://ip-172-31-2-74.us-west-1.compute.internal:20888/proxy/application_1580918830280_0002/',
                    '         user: hadoop',
                ]
            ),
            num_lines=10,
            start_line=1,
            thread=None,
            timestamp='20/02/05 17:26:50',
        ),
        Log4jRecord(
            caller_location='',
            level='INFO',
            logger='Client',
            message='Application report for application_1580918830280_0002 (state: ACCEPTED)',
            num_lines=1,
            start_line=11,
            thread=None,
            timestamp='20/02/05 17:26:51',
        ),
    ]
    assert list(res) == expected
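The MULTILINE_LOG4J fixture is likewise not reproduced here. Below is a plausible reconstruction from the expected records, not the original test data: continuation lines carrying no timestamp header are folded into the preceding record, which is why the middle record spans num_lines=10 starting at line 1.

# Assumed fixture, reconstructed from the expected records above.
MULTILINE_LOG4J = """\
20/02/05 17:26:50 INFO Client: Application report for application_1580918830280_0002 (state: ACCEPTED)
20/02/05 17:26:50 INFO Client: 
         client token: N/A
         diagnostics: AM container is launched, waiting for AM container to Register with RM
         ApplicationMaster host: N/A
         ApplicationMaster RPC port: -1
         queue: default
         start time: 1580923609467
         final status: UNDEFINED
         tracking URL: http://ip-172-31-2-74.us-west-1.compute.internal:20888/proxy/application_1580918830280_0002/
         user: hadoop
20/02/05 17:26:51 INFO Client: Application report for application_1580918830280_0002 (state: ACCEPTED)"""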
Example #5
def test_multiline_log4j_parsing():
    res = parse_hadoop_log4j_records(MULTILINE_LOG4J)

    expected = [
        Log4jRecord(
            caller_location="",
            level="INFO",
            logger="Client",
            message=
            "Application report for application_1580918830280_0002 (state: ACCEPTED)",
            num_lines=1,
            start_line=0,
            thread=None,
            timestamp="20/02/05 17:26:50",
        ),
        Log4jRecord(
            caller_location="",
            level="INFO",
            logger="Client",
            message="\n".join([
                "",
                "         client token: N/A",
                "         diagnostics: AM container is launched, waiting for AM container to Register with RM",
                "         ApplicationMaster host: N/A",
                "         ApplicationMaster RPC port: -1",
                "         queue: default",
                "         start time: 1580923609467",
                "         final status: UNDEFINED",
                "         tracking URL: http://ip-172-31-2-74.us-west-1.compute.internal:20888/proxy/application_1580918830280_0002/",
                "         user: hadoop",
            ]),
            num_lines=10,
            start_line=1,
            thread=None,
            timestamp="20/02/05 17:26:50",
        ),
        Log4jRecord(
            caller_location="",
            level="INFO",
            logger="Client",
            message=
            "Application report for application_1580918830280_0002 (state: ACCEPTED)",
            num_lines=1,
            start_line=11,
            thread=None,
            timestamp="20/02/05 17:26:51",
        ),
    ]
    assert list(res) == expected
Example #6
def new_compute_fn(context, *args, **kwargs):  # pylint: disable=unused-argument
    self._sync_code_to_s3(context, solid_name)
    step_defs = self._get_execute_steps(context, solid_name)
    step_ids = self.emr_job_runner.add_job_flow_steps(
        context, self.config['cluster_id'], step_defs)
    self.emr_job_runner.wait_for_steps_to_complete(
        context, self.config['cluster_id'], step_ids)
    if self.config['wait_for_logs']:
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            context, self.config['cluster_id'], step_ids[1])
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            context.log._log(  # pylint: disable=protected-access
                record.level, record.logger + ': ' + record.message,
                {})
        context.log.info(stdout_log)
Example #7
def _log_logs_from_s3(self, log, emr_step_id):
    """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
    them to the given log."""
    stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
        log, self.cluster_id, emr_step_id)
    # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
    # Dagster's logging system.
    records = parse_hadoop_log4j_records(stderr_log)
    for record in records:
        if record.level:
            log._log(  # pylint: disable=protected-access
                record.level,
                "".join([
                    "Spark Driver stderr: ", record.logger, ": ",
                    record.message
                ]),
                {},
            )
        else:
            log.debug(f"Spark Driver stderr: {record.message}")
    log.info("Spark Driver stdout: " + stdout_log)
Example #8
    def _log_logs_from_s3(self, log, emr_step_id):
        """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
        them to the given log."""
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            log, self.cluster_id, emr_step_id)
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            if record.level:
                log.log(
                    level=record.level,
                    msg="".join([
                        "Spark Driver stderr: ", record.logger, ": ",
                        record.message
                    ]),
                )
            else:
                log.debug(f"Spark Driver stderr: {record.message}")

        sys.stdout.write("---------- Spark Driver stdout: ----------\n" +
                         stdout_log + "\n" +
                         "---------- End of Spark Driver stdout ----------\n")