Example No. 1
    def test_syslog_with_empty_corresponding_stderr(self):
        syslog_path = '/userlogs/attempt_201512232143_0008_m_000001_3/syslog'
        stderr_path = '/userlogs/attempt_201512232143_0008_m_000001_3/stderr'

        self.mock_paths = [syslog_path, stderr_path]

        self.path_to_mock_result = {
            syslog_path: dict(hadoop_error=dict(message='BOOM')),
        }

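        # the error should be attributed to the syslog, with the attempt
        # and task IDs parsed out of its path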
        self.assertEqual(
            self.interpret_task_logs(),
            dict(
                errors=[
                    dict(
                        attempt_id='attempt_201512232143_0008_m_000001_3',
                        hadoop_error=dict(
                            message='BOOM',
                            path=syslog_path,
                        ),
                        task_id='task_201512232143_0008_m_000001',
                    ),
                ],
                partial=True,
            )
        )

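        # stderr was read first (and found empty) before the syslog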
        self.assertEqual(
            self.mock_log_callback.call_args_list,
            [call(stderr_path), call(syslog_path)])
Example No. 2
    def test_mixed_job(self):
        self._run_job(MRStreamingAndSpark)

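        # a streaming step and a spark step can't be grouped, so each gets
        # its own call (identical first and last step numbers)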
        self.run_step_on_spark.assert_has_calls([
            call(ANY, 0, 0),
            call(ANY, 1, 1),
        ])
Example No. 3
    def test_stderr_with_application_exited_and_empty_stdout(self):
        stderr_path = ('/log/dir/userlogs/application_1450486922681_0005'
                       '/container_1450486922681_0005_01_000004/stderr')
        stdout_path = ('/log/dir/userlogs/application_1450486922681_0005'
                       '/container_1450486922681_0005_01_000004/stdout')

        self.mock_paths = [stderr_path, stdout_path]

        self.path_to_mock_result = {
            stderr_path: dict(
                check_stdout=True,
                hadoop_error=dict(message='application exited')),
        }

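        # check_stdout sends the interpreter to stdout for a task error,
        # but stdout is empty, so only the hadoop error is reported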
        self.assertEqual(
            self.interpret_spark_task_logs(),
            dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000004',
                        hadoop_error=dict(
                            message='application exited',
                            path=stderr_path,
                        ),
                    ),
                ],
                partial=True,
            ))

        self.assertEqual(
            self.mock_log_callback.call_args_list,
            [call(stderr_path), call(stdout_path)])
Example No. 4
    def test_expected(self):

        values = [self.GOOD_LIST_OUTPUT, self.GOOD_KILL_OUTPUT]

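        # each Popen() consumes the next canned output: the process
        # listing first, then the kill confirmation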
        def fake_popen(*args, **kwargs):
            m = Mock()
            m.communicate.return_value = (values.pop(0), b'')
            return m

        with patch.object(ssh, 'Popen', side_effect=fake_popen) as m:
            ssh.ssh_terminate_single_job(['ssh_bin'], 'address', 'key.pem')
            self.assertEqual(m.call_args_list[0],
                             call(self.EXPECTED_LIST_CALL,
                                  stdin=PIPE, stdout=PIPE, stderr=PIPE))
            self.assertEqual(m.call_args_list[1],
                             call(self.EXPECTED_KILL_CALL,
                                  stdin=PIPE, stdout=PIPE, stderr=PIPE))
Example No. 5
    def test_main_no_conf(self, mock_collect_active_jobflows,
                          mock_job_flows_to_stats):

        mock_collect_active_jobflows.return_value = []
        mock_job_flows_to_stats.return_value = {}
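        # -q silences log output; --no-conf skips loading mrjob.conf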
        main(['-q', '--no-conf'])

        # check if args for calling collect_active_jobflows are correct
        self.assertEqual(mock_collect_active_jobflows.call_count, 1)
        self.assertEqual(mock_collect_active_jobflows.call_args_list,
                         [call([])])
        self.assertEqual(mock_job_flows_to_stats.call_count, 1)
Example No. 6
    def test_log_messages(self):
        self.get_lines.return_value = [
            '18/04/17 22:06:15 INFO mapreduce.Job:  map 100% reduce 0%\n',
            '18/04/17 22:07:34 INFO mapreduce.Job: Counters: 1\n',
            '\tFile System Counters\n',
            '\t\tFILE: Number of bytes read=819\n',
        ]

        mr_job = MRWordCount(['-r', 'dataproc'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()

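        # the progress line is logged on its own; the counter lines are
        # rolled up into a single multi-line message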
        self.assertIn(call('  map 100% reduce 0%'),
                      self.log.info.call_args_list)

        self.assertIn(
            call('Counters: 1\n\tFile System Counters\n\t\tFILE:'
                 ' Number of bytes read=819'), self.log.info.call_args_list)
Example No. 7
    def tests_streaming_steps_with_different_jobconf(self):
        class MRDifferentJobconfJob(MRJob):
            def mapper(self, key, value):
                yield key, value

            def steps(self):
                return [
                    MRStep(mapper=self.mapper),
                    MRStep(mapper=self.mapper, jobconf=dict(foo='bar')),
                    MRStep(mapper=self.mapper, jobconf=dict(foo='bar')),
                    MRStep(mapper=self.mapper, jobconf=dict(foo='baz')),
                ]

        self._run_job(MRDifferentJobconfJob)

        # steps 1 and 2 should be grouped together
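        # because they share the same jobconf; steps 0 and 3 each run alone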
        self.run_step_on_spark.assert_has_calls([
            call(ANY, 0, 0),
            call(ANY, 1, 2),
            call(ANY, 3, 3),
        ])
Example No. 8
    def test_setup_wrapper_script_uses_local_line_endings(self):
        job = MRTwoStepJob(["-r", "local", "--setup", "true"])
        job.sandbox(stdin=BytesIO())

        # tests #1071. Unfortunately, we mostly run these tests on machines
        # that use unix line endings anyway. So monitor open() instead
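        # writing in text mode ('w') is what maps '\n' to the local
        # line ending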
        with patch("mrjob.runner.open", create=True, side_effect=open) as m_open:
            with logger_disabled("mrjob.local"):
                with job.make_runner() as runner:
                    runner.run()

                    self.assertIn(call(runner._setup_wrapper_script_path, "w"), m_open.mock_calls)
Example No. 9
    def test_setup_wrapper_script_uses_local_line_endings(self):
        job = MRTwoStepJob(['-r', 'local', '--setup', 'true'])
        job.sandbox(stdin=BytesIO())

        # tests #1071. Unfortunately, we mostly run these tests on machines
        # that use unix line endings anyway. So monitor open() instead
        with patch('mrjob.sim.open', create=True, side_effect=open) as m_open:
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(call(runner._setup_wrapper_script_path, 'w'),
                              m_open.mock_calls)
Example No. 10
    def test_collect_active_job_flows(self, mock_job_runner,
                                      mock_describe_jobflows):

        collect_active_job_flows(conf_paths=[])

        # check if args for creating the job runner are correct
        self.assertEqual(mock_job_runner.call_count, 1)
        self.assertEqual(mock_job_runner.call_args_list, [call(conf_paths=[])])
        self.assertEqual(mock_describe_jobflows.call_count, 1)

        # check if args for calling describe_jobflows are correct
        active_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']
        args, kwargs = mock_describe_jobflows.call_args
        self.assertEqual(active_states, kwargs['states'])
Example No. 11
    def test_error_in_stdout_only(self):
        stderr_path = ('/log/dir/userlogs/application_1450486922681_0005'
                       '/container_1450486922681_0005_01_000004/stderr')
        stdout_path = ('/log/dir/userlogs/application_1450486922681_0005'
                       '/container_1450486922681_0005_01_000004/stdout')

        self.mock_paths = [stderr_path, stdout_path]

        self.path_to_mock_result = {
            stdout_path: dict(message='because, exploding code')
        }

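        # a task error in stdout with no hadoop error in stderr is
        # ignored; stdout isn't even opened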
        self.assertEqual(self.interpret_spark_task_logs(), {})

        self.assertEqual(self.mock_log_callback.call_args_list,
                         [call(stderr_path)])
Example No. 12
    def test_setup_wrapper_script_uses_local_line_endings(self):
        job = MRTwoStepJob(['-r', 'hadoop', '--setup', 'true'])
        job.sandbox()

        add_mock_hadoop_output([b''])
        add_mock_hadoop_output([b''])
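        # (one batch of fake output for each of the job's two steps)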

        # tests #1071. Unfortunately, we mostly run these tests on machines
        # that use unix line endings anyway. So monitor open() instead
        with patch(
                'mrjob.runner.open', create=True, side_effect=open) as m_open:
            with logger_disabled('mrjob.hadoop'):
                with job.make_runner() as runner:
                    runner.run()

                    self.assertIn(
                        call(runner._setup_wrapper_script_path, 'wb'),
                        m_open.mock_calls)
Example No. 13
    def test_multiple_logs(self):
        stdout1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000001/stdout')
        stderr1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000001/stderr')
        stdout2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000002/stdout')
        stderr2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000002/stderr')
        stdout3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000003/stdout')
        stderr3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000003/stderr')
        stderr4_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000004/stderr')

        self.mock_paths = [
            stdout1_path,
            stderr1_path,
            stdout2_path,
            stderr2_path,
            stdout3_path,
            stderr3_path,
            stderr4_path,
        ]

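        # note that container 4 has no stdout log at all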
        self.path_to_mock_result = {
            stderr1_path: dict(
                hadoop_error=dict(message='BOOM1')),
            stderr2_path: dict(
                check_stdout=True,
                hadoop_error=dict(message='exited with status 2')),
            stdout2_path: dict(message='BoomException'),
            stderr4_path: dict(
                check_stdout=True,
                hadoop_error=dict(message='exited with status 4')),
            # no errors for stdout1_path, stdout3_path, or stderr3_path
        }

        # we should read from stderr4_path first (later task number)
        self.assertEqual(
            self.interpret_spark_task_logs(),
            dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000004',
                        hadoop_error=dict(
                            message='exited with status 4',
                            path=stderr4_path,
                        ),
                    ),
                ],
                partial=True,
            ))

        self.assertEqual(self.mock_log_callback.call_args_list,
                         [call(stderr4_path)])

        # try again, with partial=False
        self.mock_log_callback.reset_mock()

        # paths still get sorted by _ls_logs()
        self.assertEqual(
            self.interpret_spark_task_logs(partial=False),
            dict(errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    hadoop_error=dict(
                        message='exited with status 4',
                        path=stderr4_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000002',
                    hadoop_error=dict(
                        message='exited with status 2',
                        path=stderr2_path,
                    ),
                    task_error=dict(
                        message='BoomException',
                        path=stdout2_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000001',
                    hadoop_error=dict(
                        message='BOOM1',
                        path=stderr1_path,
                    ),
                ),
            ]))

        self.assertEqual(self.mock_log_callback.call_args_list, [
            call(stderr4_path),
            call(stderr3_path),
            call(stderr2_path),
            call(stdout2_path),
            call(stderr1_path),
        ])
Example No. 14
    def test_multiple_logs(self):
        stdout1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000001/stdout')
        stderr1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000001/stderr')
        stdout2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000002/stdout')
        stderr2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000002/stderr')
        stdout3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000003/stdout')
        stderr3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000003/stderr')
        stderr4_path = ('/log/dir/userlogs/application_1450486922681_0005'
                        '/container_1450486922681_0005_01_000004/stderr')

        self.mock_paths = [
            stdout1_path,
            stderr1_path,
            stdout2_path,
            stderr2_path,
            stdout3_path,
            stderr3_path,
            stderr4_path,
        ]

        self.path_to_mock_result = {
            stderr1_path: dict(
                hadoop_error=dict(message='BOOM1')),
            stderr2_path: dict(
                check_stdout=True,
                hadoop_error=dict(message='exited with status 2')),
            stdout2_path: dict(message='BoomException'),
            stderr4_path: dict(
                check_stdout=True,
                hadoop_error=dict(message='exited with status 4')),
            # no errors for stdout1_path, stdout3_path, or stderr3_path
        }

        # we should yield from stderr2_path first (latest task number that
        # has a corresponding stdout)
        self.assertEqual(self.interpret_spark_task_logs(), dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000002',
                    hadoop_error=dict(
                        message='exited with status 2',
                        path=stderr2_path,
                    ),
                    task_error=dict(
                        message='BoomException',
                        path=stdout2_path,
                    ),
                ),
            ],
            partial=True,
        ))

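        # stderr3_path is read first but contains no error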
        self.assertEqual(self.mock_log_callback.call_args_list, [
            call(stderr3_path),
            call(stderr2_path),
            call(stdout2_path),
        ])

        # try again, with partial=False
        self.mock_log_callback.reset_mock()

        # paths still get sorted by _ls_logs()
        self.assertEqual(self.interpret_spark_task_logs(partial=False), dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000002',
                    hadoop_error=dict(
                        message='exited with status 2',
                        path=stderr2_path,
                    ),
                    task_error=dict(
                        message='BoomException',
                        path=stdout2_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000001',
                    hadoop_error=dict(
                        message='BOOM1',
                        path=stderr1_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    hadoop_error=dict(
                        message='exited with status 4',
                        path=stderr4_path,
                    ),
                ),
            ],
        ))

        self.assertEqual(
            self.mock_log_callback.call_args_list,
            [
                call(stderr3_path),
                call(stderr2_path),
                call(stdout2_path),
                call(stderr1_path),
                call(stderr4_path),
            ]
        )