def test_syslog_with_empty_corresponding_stderr(self):
    syslog_path = '/userlogs/attempt_201512232143_0008_m_000001_3/syslog'
    stderr_path = '/userlogs/attempt_201512232143_0008_m_000001_3/stderr'

    self.mock_paths = [syslog_path, stderr_path]
    self.path_to_mock_result = {
        syslog_path: dict(hadoop_error=dict(message='BOOM')),
    }

    self.assertEqual(
        self.interpret_task_logs(),
        dict(
            errors=[
                dict(
                    attempt_id='attempt_201512232143_0008_m_000001_3',
                    hadoop_error=dict(
                        message='BOOM',
                        path=syslog_path,
                    ),
                    task_id='task_201512232143_0008_m_000001',
                ),
            ],
            partial=True,
        ))

    self.assertEqual(
        self.mock_log_callback.call_args_list,
        [call(stderr_path), call(syslog_path)])
def test_mixed_job(self):
    self._run_job(MRStreamingAndSpark)

    self.run_step_on_spark.assert_has_calls([
        call(ANY, 0, 0),
        call(ANY, 1, 1),
    ])
def test_stderr_with_application_exited_and_empty_stdout(self):
    stderr_path = ('/log/dir/userlogs/application_1450486922681_0005'
                   '/container_1450486922681_0005_01_000004/stderr')
    stdout_path = ('/log/dir/userlogs/application_1450486922681_0005'
                   '/container_1450486922681_0005_01_000004/stdout')

    self.mock_paths = [stderr_path, stdout_path]
    self.path_to_mock_result = {
        stderr_path: dict(
            check_stdout=True,
            hadoop_error=dict(message='application exited')),
    }

    self.assertEqual(
        self.interpret_spark_task_logs(),
        dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    hadoop_error=dict(
                        message='application exited',
                        path=stderr_path,
                    ),
                ),
            ],
            partial=True,
        ))

    self.assertEqual(
        self.mock_log_callback.call_args_list,
        [call(stderr_path), call(stdout_path)])
def test_expected(self):
    values = [self.GOOD_LIST_OUTPUT, self.GOOD_KILL_OUTPUT]

    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (values.pop(0), b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen) as m:
        ssh.ssh_terminate_single_job(['ssh_bin'], 'address', 'key.pem')

    self.assertEqual(
        m.call_args_list[0],
        call(self.EXPECTED_LIST_CALL,
             stdin=PIPE, stdout=PIPE, stderr=PIPE))
    self.assertEqual(
        m.call_args_list[1],
        call(self.EXPECTED_KILL_CALL,
             stdin=PIPE, stdout=PIPE, stderr=PIPE))
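# An aside on the fake_popen pattern above: mock's side_effect also
# accepts an iterable, so a fixed sequence of fake processes can be
# built without a closure. A minimal standalone sketch, assuming only
# the standard unittest.mock API (subprocess and the byte strings here
# are stand-ins for the ssh module and GOOD_LIST_OUTPUT/GOOD_KILL_OUTPUT):
import subprocess
from unittest.mock import Mock, patch

outputs = [b'list output', b'kill output']
popen_mocks = [Mock(**{'communicate.return_value': (out, b'')})
               for out in outputs]

with patch.object(subprocess, 'Popen', side_effect=popen_mocks):
    # each Popen() call returns the next pre-built mock, in order
    for expected in outputs:
        proc = subprocess.Popen(['fake-cmd'])
        assert proc.communicate() == (expected, b'')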
def test_main_no_conf(self, mock_collect_active_jobflows,
                      mock_job_flows_to_stats):
    mock_collect_active_jobflows.return_value = []
    mock_job_flows_to_stats.return_value = {}

    main(['-q', '--no-conf'])

    # check that collect_active_jobflows() was called with the correct args
    self.assertEqual(mock_collect_active_jobflows.call_count, 1)
    self.assertEqual(mock_collect_active_jobflows.call_args_list,
                     [call([])])
    self.assertEqual(mock_job_flows_to_stats.call_count, 1)
def test_log_messages(self):
    # note the two spaces after 'Job:' in the first line; the parsed
    # message keeps one leading space, as asserted below
    self.get_lines.return_value = [
        '18/04/17 22:06:15 INFO mapreduce.Job:  map 100% reduce 0%\n',
        '18/04/17 22:07:34 INFO mapreduce.Job: Counters: 1\n',
        '\tFile System Counters\n',
        '\t\tFILE: Number of bytes read=819\n',
    ]

    mr_job = MRWordCount(['-r', 'dataproc'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()

    self.assertIn(call(' map 100% reduce 0%'),
                  self.log.info.call_args_list)
    self.assertIn(
        call('Counters: 1\n\tFile System Counters\n\t\tFILE:'
             ' Number of bytes read=819'),
        self.log.info.call_args_list)
def test_streaming_steps_with_different_jobconf(self):
    class MRDifferentJobconfJob(MRJob):
        def mapper(self, key, value):
            yield key, value

        def steps(self):
            return [
                MRStep(mapper=self.mapper),
                MRStep(mapper=self.mapper, jobconf=dict(foo='bar')),
                MRStep(mapper=self.mapper, jobconf=dict(foo='bar')),
                MRStep(mapper=self.mapper, jobconf=dict(foo='baz')),
            ]

    self._run_job(MRDifferentJobconfJob)

    # steps 1 and 2 should be grouped together (see the sketch below)
    self.run_step_on_spark.assert_has_calls([
        call(ANY, 0, 0),
        call(ANY, 1, 2),
        call(ANY, 3, 3),
    ])
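# A minimal standalone sketch (hypothetical helper, not mrjob's actual
# implementation) of the grouping semantics the test above asserts:
# consecutive steps with equal jobconf collapse into one (start, end)
# range, which is why steps 1 and 2 run as a single Spark step.
from itertools import groupby


def group_steps_by_jobconf(jobconfs):
    """Yield (start, end) index ranges of runs of consecutive equal
    jobconf dicts."""
    i = 0
    for _, run in groupby(jobconfs):
        n = len(list(run))
        yield (i, i + n - 1)
        i += n


# list(group_steps_by_jobconf(
#     [{}, {'foo': 'bar'}, {'foo': 'bar'}, {'foo': 'baz'}]))
# -> [(0, 0), (1, 2), (3, 3)]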
def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(['-r', 'local', '--setup', 'true'])
    job.sandbox(stdin=BytesIO())

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch('mrjob.runner.open', create=True,
               side_effect=open) as m_open:
        with logger_disabled('mrjob.local'):
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, 'w'),
                    m_open.mock_calls)
def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(['-r', 'local', '--setup', 'true'])
    job.sandbox(stdin=BytesIO())

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch('mrjob.sim.open', create=True, side_effect=open) as m_open:
        with job.make_runner() as runner:
            runner.run()

            self.assertIn(
                call(runner._setup_wrapper_script_path, 'w'),
                m_open.mock_calls)
def test_collect_active_job_flows(self, mock_job_runner,
                                  mock_describe_jobflows):
    collect_active_job_flows(conf_paths=[])

    # check that the job runner was created with the correct args
    self.assertEqual(mock_job_runner.call_count, 1)
    self.assertEqual(mock_job_runner.call_args_list,
                     [call(conf_paths=[])])

    self.assertEqual(mock_describe_jobflows.call_count, 1)

    # check that describe_jobflows() was called with the correct states
    active_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']
    args, kwargs = mock_describe_jobflows.call_args
    self.assertEqual(active_states, kwargs['states'])
def test_error_in_stdout_only(self):
    stderr_path = ('/log/dir/userlogs/application_1450486922681_0005'
                   '/container_1450486922681_0005_01_000004/stderr')
    stdout_path = ('/log/dir/userlogs/application_1450486922681_0005'
                   '/container_1450486922681_0005_01_000004/stdout')

    self.mock_paths = [stderr_path, stdout_path]
    self.path_to_mock_result = {
        stdout_path: dict(message='because, exploding code')
    }

    self.assertEqual(self.interpret_spark_task_logs(), {})

    self.assertEqual(self.mock_log_callback.call_args_list,
                     [call(stderr_path)])
def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(['-r', 'hadoop', '--setup', 'true'])
    job.sandbox()

    add_mock_hadoop_output([b''])
    add_mock_hadoop_output([b''])

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch('mrjob.runner.open', create=True,
               side_effect=open) as m_open:
        with logger_disabled('mrjob.hadoop'):
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, 'wb'),
                    m_open.mock_calls)
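# The open()-monitoring trick in the line-endings tests above
# generalizes: patch the name `open` in the module under test with
# create=True (the module never defines it, it just uses the builtin)
# and delegate to the real builtin via side_effect, so every call is
# recorded without changing behavior. A minimal standalone sketch that
# patches builtins.open directly rather than a specific module:
import os
import tempfile
from unittest.mock import call, patch

path = os.path.join(tempfile.mkdtemp(), 'wrapper.sh')

# `open` in side_effect is bound to the real builtin before the patch
# takes effect, so calls are recorded, then handled as usual
with patch('builtins.open', side_effect=open) as m_open:
    with open(path, 'w') as f:
        f.write('#!/bin/sh\n')

assert call(path, 'w') in m_open.mock_calls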
def test_multiple_logs(self):
    stdout1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000001/stdout')
    stderr1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000001/stderr')
    stdout2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000002/stdout')
    stderr2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000002/stderr')
    stdout3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000003/stdout')
    stderr3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000003/stderr')
    stderr4_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000004/stderr')

    self.mock_paths = [
        stdout1_path,
        stderr1_path,
        stdout2_path,
        stderr2_path,
        stdout3_path,
        stderr3_path,
        stderr4_path,
    ]

    self.path_to_mock_result = {
        stderr1_path: dict(hadoop_error=dict(message='BOOM1')),
        stderr2_path: dict(
            check_stdout=True,
            hadoop_error=dict(message='exited with status 2')),
        stdout2_path: dict(message='BoomException'),
        stderr4_path: dict(
            check_stdout=True,
            hadoop_error=dict(message='exited with status 4')),
        # no errors for stdout1_path, stdout3_path, or stderr3_path
    }

    # we should read from stderr4_path first (later task number)
    self.assertEqual(
        self.interpret_spark_task_logs(),
        dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    hadoop_error=dict(
                        message='exited with status 4',
                        path=stderr4_path,
                    ),
                ),
            ],
            partial=True,
        ))

    self.assertEqual(self.mock_log_callback.call_args_list,
                     [call(stderr4_path)])

    # try again, with partial=False
    self.mock_log_callback.reset_mock()

    # paths still get sorted by _ls_logs()
    self.assertEqual(
        self.interpret_spark_task_logs(partial=False),
        dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    hadoop_error=dict(
                        message='exited with status 4',
                        path=stderr4_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000002',
                    hadoop_error=dict(
                        message='exited with status 2',
                        path=stderr2_path,
                    ),
                    task_error=dict(
                        message='BoomException',
                        path=stdout2_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000001',
                    hadoop_error=dict(
                        message='BOOM1',
                        path=stderr1_path,
                    ),
                ),
            ],
        ))

    self.assertEqual(self.mock_log_callback.call_args_list, [
        call(stderr4_path),
        call(stderr3_path),
        call(stderr2_path),
        call(stdout2_path),
        call(stderr1_path),
    ])
def test_multiple_logs(self):
    stdout1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000001/stdout')
    stderr1_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000001/stderr')
    stdout2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000002/stdout')
    stderr2_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000002/stderr')
    stdout3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000003/stdout')
    stderr3_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000003/stderr')
    stderr4_path = ('/log/dir/userlogs/application_1450486922681_0005'
                    '/container_1450486922681_0005_01_000004/stderr')

    self.mock_paths = [
        stdout1_path,
        stderr1_path,
        stdout2_path,
        stderr2_path,
        stdout3_path,
        stderr3_path,
        stderr4_path,
    ]

    self.path_to_mock_result = {
        stderr1_path: dict(hadoop_error=dict(message='BOOM1')),
        stderr2_path: dict(
            check_stdout=True,
            hadoop_error=dict(message='exited with status 2')),
        stdout2_path: dict(message='BoomException'),
        stderr4_path: dict(
            check_stdout=True,
            hadoop_error=dict(message='exited with status 4')),
        # no errors for stdout1_path, stdout3_path, or stderr3_path
    }

    # we should yield from stderr2_path first (latest task number that
    # has a corresponding stdout); see the ordering sketch after this test
    self.assertEqual(
        self.interpret_spark_task_logs(),
        dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000002',
                    hadoop_error=dict(
                        message='exited with status 2',
                        path=stderr2_path,
                    ),
                    task_error=dict(
                        message='BoomException',
                        path=stdout2_path,
                    ),
                ),
            ],
            partial=True,
        ))

    self.assertEqual(self.mock_log_callback.call_args_list, [
        call(stderr3_path),
        call(stderr2_path),
        call(stdout2_path),
    ])

    # try again, with partial=False
    self.mock_log_callback.reset_mock()

    # paths still get sorted by _ls_logs()
    self.assertEqual(
        self.interpret_spark_task_logs(partial=False),
        dict(
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000002',
                    hadoop_error=dict(
                        message='exited with status 2',
                        path=stderr2_path,
                    ),
                    task_error=dict(
                        message='BoomException',
                        path=stdout2_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000001',
                    hadoop_error=dict(
                        message='BOOM1',
                        path=stderr1_path,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    hadoop_error=dict(
                        message='exited with status 4',
                        path=stderr4_path,
                    ),
                ),
            ],
        ))

    self.assertEqual(self.mock_log_callback.call_args_list, [
        call(stderr3_path),
        call(stderr2_path),
        call(stdout2_path),
        call(stderr1_path),
        call(stderr4_path),
    ])
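# A minimal standalone sketch (hypothetical helper, not mrjob's actual
# log-interpretation code) of the descending-container ordering the
# assertions above rely on; the real code layers further rules on top
# (e.g. preferring containers that have a corresponding stdout):
import re


def sort_latest_container_first(paths):
    """Sort task-log paths so the highest container number comes first."""
    def container_num(path):
        m = re.search(r'_(\d+)/(?:stderr|stdout)$', path)
        return int(m.group(1)) if m else -1

    return sorted(paths, key=container_num, reverse=True)


# e.g. ..._000004/stderr sorts before ..._000001/stderr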