def test_non_log_lines(self):
    # two non-log lines, then one real log record, then a trailing
    # non-log line
    raw_log = StringIO(
        'foo\n'
        'bar\n'
        '15/12/11 13:26:08 ERROR streaming.StreamJob:'
        ' Error Launching job :'
        ' Output directory already exists\n'
        'Streaming Command Failed!')

    # leading non-log lines are ignored. There's no way to know that
    # "Streaming Command Failed!" wasn't part of a multi-line message,
    # so it ends up appended to the record's message
    expected_records = [
        {
            'timestamp': '15/12/11 13:26:08',
            'level': 'ERROR',
            'logger': 'streaming.StreamJob',
            'thread': None,
            'message': ('Error Launching job :'
                        ' Output directory already exists\n'
                        'Streaming Command Failed!'),
        },
    ]

    with no_handlers_for_logger('mrjob.logs.parse'):
        captured = StringIO()
        log_to_stream('mrjob.logs.parse', captured)

        parsed_records = list(_parse_hadoop_log_lines(raw_log))
        self.assertEqual(parsed_records, expected_records)

        # should be one warning for each leading non-log line
        warning_lines = captured.getvalue().splitlines()
        self.assertEqual(len(warning_lines), 2)
def test_log_lines(self):
    # one INFO record followed by one ERROR record, one line each
    raw_log = StringIO(
        '15/12/11 13:26:07 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\n'
        '15/12/11 13:26:08 ERROR streaming.StreamJob:'
        ' Error Launching job :'
        ' Output directory already exists\n')

    info_record = {
        'level': 'INFO',
        'logger': 'client.RMProxy',
        'message': 'Connecting to ResourceManager at /0.0.0.0:8032',
        'num_lines': 1,
        'start_line': 0,
        'thread': '',
        'timestamp': '15/12/11 13:26:07',
    }
    error_record = {
        'level': 'ERROR',
        'logger': 'streaming.StreamJob',
        'message': ('Error Launching job :'
                    ' Output directory already exists'),
        'num_lines': 1,
        'start_line': 1,
        'thread': '',
        'timestamp': '15/12/11 13:26:08',
    }

    parsed_records = list(_parse_hadoop_log4j_records(raw_log))
    self.assertEqual(parsed_records, [info_record, error_record])
def test_hadoop_runner_option_store(self):
    # options passed under their deprecated names
    deprecated_opts = {
        'base_tmp_dir': '/scratch',
        'hadoop_home': 'required',
        'hdfs_scratch_dir': 'hdfs:///scratch',
    }

    captured = StringIO()
    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', captured)

        # HadoopRunnerOptionStore really wants to find the streaming jar
        jar_patch = patch.object(
            mrjob.hadoop, 'find_hadoop_streaming_jar', return_value='found')
        with jar_patch:
            opts = HadoopRunnerOptionStore('hadoop', deprecated_opts, [])

        # base_tmp_dir -> local_tmp_dir
        self.assertEqual(opts['local_tmp_dir'], '/scratch')
        self.assertNotIn('base_tmp_dir', opts)
        self.assertIn(
            'Deprecated option base_tmp_dir has been renamed'
            ' to local_tmp_dir',
            captured.getvalue())

        # hdfs_scratch_dir -> hadoop_tmp_dir
        self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
        self.assertNotIn('hdfs_scratch_dir', opts)
        self.assertIn(
            'Deprecated option hdfs_scratch_dir has been renamed'
            ' to hadoop_tmp_dir',
            captured.getvalue())
def test_yarn_output(self):
    # abbreviated version of real output from Hadoop 2.7.0.
    # Including things that might be interesting to parse later on
    raw_log = StringIO(
        '15/12/11 13:32:44 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\n'
        '15/12/11 13:32:45 INFO mapreduce.JobSubmitter:'
        ' Submitting tokens for job: job_1449857544442_0002\n'
        '15/12/11 13:32:45 INFO impl.YarnClientImpl:'
        ' Submitted application application_1449857544442_0002\n'
        '15/12/11 13:32:45 INFO mapreduce.Job:'
        ' The url to track the job:'
        ' http://0a7802e19139:8088/proxy/application_1449857544442_0002/\n'
        '15/12/11 13:33:11 INFO mapreduce.Job: map 100% reduce 100%\n'
        '15/12/11 13:33:11 INFO mapreduce.Job:'
        ' Job job_1449857544442_0002 completed successfully\n'
        '15/12/11 13:33:11 INFO mapreduce.Job: Counters: 49\n'
        ' File System Counters\n'
        ' FILE: Number of bytes read=86\n'
        '15/12/11 13:33:11 INFO streaming.StreamJob:'
        ' Output directory:'
        ' hdfs:///user/root/tmp/mrjob/mr_wc.root.20151211.181326.984074'
        '/output\n')

    expected = {
        'application_id': 'application_1449857544442_0002',
        'counters': {
            'File System Counters': {
                'FILE: Number of bytes read': 86,
            },
        },
        'job_id': 'job_1449857544442_0002',
        'output_dir': ('hdfs:///user/root/tmp/mrjob'
                       '/mr_wc.root.20151211.181326.984074/output'),
    }

    self.assertEqual(_parse_hadoop_streaming_log(raw_log), expected)
def test_failed_job(self):
    job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    job.sandbox()

    # the mock client will report these job states, ending in ERROR
    scripted_states = collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR'])

    with no_handlers_for_logger('mrjob.dataproc'):
        captured = StringIO()
        log_to_stream('mrjob.dataproc', captured)

        self._dataproc_client.job_get_advances_states = scripted_states

        with job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)

            self.assertIn(' => ERROR\n', captured.getvalue())

            cluster_id = runner.get_cluster_id()

    # job should get terminated
    clusters_in_project = self._dataproc_client._cache_clusters[_TEST_PROJECT]
    cluster = clusters_in_project[cluster_id]
    self.assertEqual(self._dataproc_client.get_state(cluster), 'DELETING')
def test_dry_run(self):
    captured_stdout = StringIO()
    terminate_kwargs = {
        'stdout': captured_stdout,
        'max_mins_idle': 0.6,
        'dry_run': True,
    }
    self.maybe_terminate_quietly(**terminate_kwargs)

    # shouldn't *actually* terminate clusters
    terminated_ids = self.ids_of_terminated_clusters()
    self.assertEqual(terminated_ids, [])
def test_cleanup_options(self):
    # cleanup values given under their deprecated names
    deprecated_opts = {
        'cleanup': ['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
        'cleanup_on_failure': ['JOB_FLOW', 'SCRATCH'],
    }

    captured = StringIO()
    with no_handlers_for_logger('mrjob.runner'):
        log_to_stream('mrjob.runner', captured)
        opts = RunnerOptionStore('inline', deprecated_opts, [])

        # cleanup: values are translated, with one warning per value
        self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
        for cleanup_warning in (
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP',
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to CLOUD_TMP'):
            self.assertIn(cleanup_warning, captured.getvalue())

        # cleanup_on_failure: same treatment
        self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])
        for failure_warning in (
                'Deprecated cleanup_on_failure option JOB_FLOW has been'
                ' renamed to CLUSTER',
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP'):
            self.assertIn(failure_warning, captured.getvalue())
def test_pre_yarn_output(self):
    # actual output from Hadoop 1.0.3 on EMR AMI 2.4.9
    # Including things that might be interesting to parse later on
    raw_log = StringIO(
        '15/12/11 23:08:37 INFO streaming.StreamJob:'
        ' getLocalDirs(): [/mnt/var/lib/hadoop/mapred]\n'
        '15/12/11 23:08:37 INFO streaming.StreamJob:'
        ' Running job: job_201512112247_0003\n'
        '15/12/11 23:08:37 INFO streaming.StreamJob:'
        ' Tracking URL:'
        ' http://ip-172-31-27-129.us-west-2.compute.internal:9100'
        '/jobdetails.jsp?jobid=job_201512112247_0003\n'
        '15/12/11 23:09:16 INFO streaming.StreamJob:'
        ' map 100% reduce 100%\n'
        '15/12/11 23:09:22 INFO streaming.StreamJob:'
        ' Output: hdfs:///user/hadoop/tmp/mrjob'
        '/mr_wc.hadoop.20151211.230352.433691/output\n')

    # this log contains no application ID and no counters
    expected = {
        'application_id': None,
        'counters': None,
        'job_id': 'job_201512112247_0003',
        'output_dir': ('hdfs:///user/hadoop/tmp/mrjob'
                       '/mr_wc.hadoop.20151211.230352.433691/output'),
    }

    self.assertEqual(_parse_hadoop_streaming_log(raw_log), expected)
def test_with_all_job_flows(self):
    self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)

    # start one extra job flow on top of the canned ones
    runner = EMRJobRunner(conf_paths=[])
    conn = runner.make_emr_conn()
    conn.run_jobflow('no name', log_uri=None)

    main(['-q', '--no-conf'])

    # count the lines the tool printed to stdout
    printed_lines = list(StringIO(self.stdout.getvalue()))
    expected_count = len(JOB_FLOWS_BY_ID) - 1
    self.assertEqual(len(printed_lines), expected_count)
def test_verbose(self):
    # With verbose=True, both INFO and DEBUG messages logged to
    # '__main__' should reach stderr.
    #
    # Run with '__main__' stripped of handlers (as test_default_options
    # does) so that handlers installed elsewhere can't add output, and so
    # the handler bound to the patched stderr doesn't outlive this test.
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)

            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')

            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
def assert_hadoop_version(self, JobClass, version_string):
    """Assert that an instance of *JobClass* reports *version_string* as
    its ``hadoop_version`` jobconf value, and that a message containing
    'should be a string' is logged to ``mrjob.job``."""
    job = JobClass()
    captured = StringIO()

    with no_handlers_for_logger('mrjob.job'):
        log_to_stream('mrjob.job', captured)

        actual_version = job.jobconf()['hadoop_version']
        self.assertEqual(actual_version, version_string)

        self.assertIn('should be a string', captured.getvalue())
def test_default_options(self):
    # with default options, only the INFO message should reach stderr
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as fake_stderr:
            MRJob.set_up_logging()

            main_logger = logging.getLogger('__main__')
            main_logger.info('INFO')
            main_logger.debug('DEBUG')

            logged = fake_stderr.getvalue()
            self.assertEqual(logged, 'INFO\n')
def test_messy_error(self):
    # the COUNTERS field is not in a parseable counter format
    bad_line = (b'Job JOBID="_001" FAILED_REDUCES="0" '
                b'COUNTERS="THIS IS NOT ACTUALLY A COUNTER"')

    with no_handlers_for_logger(''):
        captured = StringIO()
        log_to_stream('mrjob.parse', captured, level=logging.WARN)

        result = parse_hadoop_counters_from_line(bad_line)
        self.assertEqual(({}, 1), result)

        self.assertIn('Cannot parse Hadoop counter string',
                      captured.getvalue())
def updated_and_warnings(self, jobconf, hadoop_version):
    """Call the runner's ``_update_jobconf_for_hadoop_version()`` on a
    copy of *jobconf* and return ``(jobconf_copy, log_text)``, where
    *log_text* is whatever was logged to ``mrjob.runner`` meanwhile."""
    # work on a copy so the caller's dict is left alone
    jobconf_copy = jobconf.copy()
    captured = StringIO()

    with no_handlers_for_logger('mrjob.runner'):
        log_to_stream('mrjob.runner', captured)
        self.runner._update_jobconf_for_hadoop_version(
            jobconf_copy, hadoop_version)

    return jobconf_copy, captured.getvalue()
def test_trailing_carriage_return(self):
    # the line ends in \r\n; the expected message has no trailing \r
    raw_log = StringIO(
        '15/12/11 13:26:07 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\r\n')

    expected_record = {
        'timestamp': '15/12/11 13:26:07',
        'level': 'INFO',
        'logger': 'client.RMProxy',
        'thread': None,
        'message': 'Connecting to ResourceManager at /0.0.0.0:8032',
    }

    parsed_records = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed_records, [expected_record])
def test_exclude(self):
    for mock_cluster in CLUSTERS:
        self.add_mock_emr_cluster(mock_cluster)

    # -x takes a "key,value" pair
    main(['-q', '--no-conf', '-x', 'my_key,my_value'])

    output = self.stdout.getvalue()
    printed_lines = list(StringIO(output))

    # two clusters should be left out of the report
    self.assertEqual(len(printed_lines), len(CLUSTERS_BY_ID) - 2)
    for excluded_id in ('j-COMPLETED', 'j-RUNNING1STEP'):
        self.assertNotIn(excluded_id, self.stdout.getvalue())
def test_option_debug_printout(self):
    captured = StringIO()

    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', captured, debug=True)

        # constructing the runner is all that's needed here
        InlineMRJobRunner(owner='dave')

    # both the option name and its value should show up in the log
    for quoted in ("'owner'", "'dave'"):
        self.assertIn(quoted, captured.getvalue())
def get_debug_printout(self, opt_store_class, alias, opts):
    """Instantiate *opt_store_class* with *alias*, *opts* and no conf
    paths, and return everything logged to ``mrjob.runner`` (debug
    enabled) while doing so."""
    captured = StringIO()

    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', captured, debug=True)

        # debug printout happens in constructor
        opt_store_class(alias, opts, [])

    return captured.getvalue()
def test_empty_runner_error(self):
    # the conf file only has a section for the local runner
    local_only_conf = {'runner': {'local': {'local_tmp_dir': '/tmp'}}}
    conf_path = self.save_conf('basic', local_only_conf)

    captured = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', captured)

        # ...but options are requested for the inline runner
        RunnerOptionStore('inline', {}, [conf_path])

        self.assertEqual(
            "No configs specified for inline runner\n",
            captured.getvalue())
def test_runner_option_store(self):
    captured = StringIO()

    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', captured)

        # pass the option under its deprecated name
        deprecated_opts = {'base_tmp_dir': '/scratch'}
        opts = RunnerOptionStore('inline', deprecated_opts, [])

        # value is available under the new name only
        self.assertEqual(opts['local_tmp_dir'], '/scratch')
        self.assertNotIn('base_tmp_dir', opts)

        expected_warning = ('Deprecated option base_tmp_dir has been renamed'
                            ' to local_tmp_dir')
        self.assertIn(expected_warning, captured.getvalue())
def test_with_all_clusters(self):
    for mock_cluster in CLUSTERS:
        self.add_mock_emr_cluster(mock_cluster)

    # start one extra cluster on top of the canned ones
    conn = self.connect_emr()
    conn.run_jobflow(
        'no name',
        job_flow_role='fake-instance-profile',
        service_role='fake-service-role')

    main(['-q', '--no-conf'])

    # count the lines the tool printed to stdout
    printed_lines = list(StringIO(self.stdout.getvalue()))
    expected_count = len(CLUSTERS_BY_ID) - 1
    self.assertEqual(len(printed_lines), expected_count)
def test_recurse(self):
    # write a conf file that includes itself
    conf_path = os.path.join(self.tmp_dir, 'LOL.conf')
    self_including_conf = {'include': conf_path}
    with open(conf_path, 'w') as conf_file:
        dump_mrjob_conf(self_including_conf, conf_file)

    captured = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.conf', captured)
        RunnerOptionStore('inline', {}, [conf_path])

        expected_message = (
            '%s tries to recursively include %s!' % (conf_path, conf_path))
        self.assertIn(expected_message, captured.getvalue())
def test_attrs_should_be_classes(self):
    with no_handlers_for_logger('mrjob.job'):
        captured = StringIO()
        log_to_stream('mrjob.job', captured)

        strange_job = self.StrangeJob()

        # every protocol getter should still hand back a JSONProtocol
        self.assertIsInstance(strange_job.input_protocol(), JSONProtocol)
        self.assertIsInstance(strange_job.internal_protocol(), JSONProtocol)
        self.assertIsInstance(strange_job.output_protocol(), JSONProtocol)

        # ...and each attribute should have been complained about
        log_output = captured.getvalue()
        for attr_name in ('INPUT_PROTOCOL',
                          'INTERNAL_PROTOCOL',
                          'OUTPUT_PROTOCOL'):
            self.assertIn(attr_name + ' should be a class', log_output)
def test_thread(self):
    # this log format carries a thread name in parentheses
    raw_log = StringIO(
        '2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver'
        ' (main): Thread 1 created MetricsLockFreeSaver 1\n')

    expected_record = {
        'timestamp': '2015-08-22 00:46:18,411',
        'level': 'INFO',
        'logger': 'amazon.emr.metrics.MetricsSaver',
        'thread': 'main',
        'message': 'Thread 1 created MetricsLockFreeSaver 1',
    }

    parsed_records = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed_records, [expected_record])
class CollectEMRStatsTestCase(TestCase):
    # Tests for the collect_emr_stats tool; its collaborators are mocked.

    @patch('mrjob.tools.emr.collect_emr_stats.describe_all_job_flows')
    @patch('mrjob.tools.emr.collect_emr_stats.EMRJobRunner')
    def test_collect_active_job_flows(self, mock_job_runner,
                                      mock_describe_jobflows):
        collect_active_job_flows(conf_paths=[])

        # check if args for constructing EMRJobRunner are correct
        self.assertEqual(mock_job_runner.call_count, 1)
        expected_runner_calls = [call(conf_paths=[])]
        self.assertEqual(mock_job_runner.call_args_list,
                         expected_runner_calls)

        self.assertEqual(mock_describe_jobflows.call_count, 1)

        # check if args for calling describe_jobflows are correct
        active_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']
        _, describe_kwargs = mock_describe_jobflows.call_args
        self.assertEqual(active_states, describe_kwargs['states'])

    def test_job_flows_to_stats(self):
        # mock jobflows; each jobflow has a different instance count
        NUM_JOB_FLOWS = 30
        job_flows = [
            MockEmrObject(jobflowid='j-%04d' % index, instancecount=index)
            for index in range(NUM_JOB_FLOWS)
        ]

        stats = job_flows_to_stats(job_flows)

        self.assertEqual(stats['num_jobflows'], NUM_JOB_FLOWS)
        expected_total = sum(range(NUM_JOB_FLOWS))
        self.assertEqual(stats['total_instance_count'], expected_total)

    @patch('mrjob.tools.emr.collect_emr_stats.job_flows_to_stats')
    @patch('mrjob.tools.emr.collect_emr_stats.collect_active_job_flows')
    @patch('sys.stdout', StringIO())
    def test_main_no_conf(self, mock_collect_active_jobflows,
                          mock_job_flows_to_stats):
        mock_collect_active_jobflows.return_value = []
        mock_job_flows_to_stats.return_value = {}

        main(['-q', '--no-conf'])

        # check if args for calling collect_active_jobflows are correct
        self.assertEqual(mock_collect_active_jobflows.call_count, 1)
        expected_collect_calls = [call([])]
        self.assertEqual(mock_collect_active_jobflows.call_args_list,
                         expected_collect_calls)

        self.assertEqual(mock_job_flows_to_stats.call_count, 1)
def _test_recoverable_error(self, ex):
    """Put *ex* in ``self.mock_paths`` after one good path, then check
    that listing logs still returns the good path and logs a
    "couldn't ls()" message to ``mrjob.logs.wrap``."""
    self.mock_paths = ['/path/to/logs/oak', ex]

    with no_handlers_for_logger('mrjob.logs.wrap'):
        captured = StringIO()
        log_to_stream('mrjob.logs.wrap', captured)

        found = self._ls_logs([['/path/to/logs']])
        self.assertEqual(found, [{'path': '/path/to/logs/oak'}])

        self.mock_fs.ls.assert_called_once_with('/path/to/logs')

        self.assertIn("couldn't ls() /path/to/logs", captured.getvalue())
def test_io_error(self):
    # the only mock "path" is an IOError
    self.mock_paths = [IOError()]

    with no_handlers_for_logger('mrjob.logs.ls'):
        captured = StringIO()
        log_to_stream('mrjob.logs.ls', captured)

        found = list(_ls_logs(self.mock_fs, '/path/to/logs'))
        self.assertEqual(found, [])

        self.mock_fs.ls.assert_called_once_with('/path/to/logs')

        self.assertIn("couldn't ls() /path/to/logs", captured.getvalue())
def test_deprecated_alias(self):
    # chunk boundaries deliberately fall in the middle of lines
    chunks = [
        b'The quick\nbrown fox\nju',
        b'mped over\nthe lazy\ndog',
        b's.\n',
    ]
    expected_lines = [
        b'The quick\n',
        b'brown fox\n',
        b'jumped over\n',
        b'the lazy\n',
        b'dogs.\n',
    ]

    with no_handlers_for_logger('mrjob.util'):
        captured = StringIO()
        log_to_stream('mrjob.util', captured)

        chunk_gen = (chunk for chunk in chunks)
        actual_lines = list(buffer_iterator_to_line_iterator(chunk_gen))
        self.assertEqual(actual_lines, expected_lines)

        # using the old name should log a rename notice
        self.assertIn('has been renamed', captured.getvalue())
def test_indentation_is_required(self):
    counter_lines = [
        'File System Counters',
        ' FILE: Number of bytes read=8',
    ]

    with no_handlers_for_logger('mrjob.logs.step'):
        captured = StringIO()
        log_to_stream('mrjob.logs.step', captured)

        # counter line is interpreted as group
        parsed_counters = _parse_indented_counters(counter_lines)
        self.assertEqual(parsed_counters, {})

        # should complain
        self.assertNotEqual(captured.getvalue(), '')
def test_dry_run(self):
    captured_stdout = StringIO()
    terminate_kwargs = {
        'stdout': captured_stdout,
        'max_mins_idle': 0.6,
        'dry_run': True,
    }
    self.maybe_terminate_quietly(**terminate_kwargs)

    # dry_run doesn't actually try to lock
    locked_cluster_line = (
        'Terminated cluster j-IDLE_AND_LOCKED (IDLE_AND_LOCKED);'
        ' was idle for 2:00:00')
    expected_stdout_lines = self.EXPECTED_STDOUT_LINES + [locked_cluster_line]

    # compare as sets: line order doesn't matter
    actual_stdout_lines = captured_stdout.getvalue().splitlines()
    self.assertEqual(set(actual_stdout_lines), set(expected_stdout_lines))

    # shouldn't *actually* terminate clusters
    terminated_ids = self.ids_of_terminated_clusters()
    self.assertEqual(terminated_ids, [])