def test_messy_error(self):
    counter_string = ('Job JOBID="_001" FAILED_REDUCES="0" '
                      'COUNTERS="THIS IS NOT ACTUALLY A COUNTER"')
    with no_handlers_for_logger(''):
        stderr = StringIO()
        log_to_stream('mrjob.parse', stderr, level=logging.WARN)
        assert_equal((None, None),
                     parse_hadoop_counters_from_line(counter_string))
        assert_in('Cannot parse Hadoop counter line', stderr.getvalue())
def scan_for_counters_in_files(log_file_uris, fs, hadoop_version):
    """Scan *log_file_uris* for counters, using *fs* for file system access
    """
    counters = {}
    relevant_logs = []  # list of (sort key, URI)

    for log_file_uri in log_file_uris:
        m = _JOB_LOG_PATH_RE.match(log_file_uri)
        if not m:
            continue

        relevant_logs.append((int(m.group("step_num")), log_file_uri))

    relevant_logs.sort()

    for _, log_file_uri in relevant_logs:
        log_lines = fs.cat(log_file_uri)
        if not log_lines:
            continue

        for line in log_lines:
            new_counters, step_num = parse_hadoop_counters_from_line(
                line, hadoop_version)
            if new_counters:
                counters[step_num] = new_counters

    return counters
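# A minimal usage sketch, not part of the original source: the only file
# system behavior scan_for_counters_in_files relies on is a cat() method
# that returns an iterable of log lines, so a hypothetical stub object is
# enough to exercise it (the StubFS name is an assumption).
class StubFS(object):
    """Hypothetical stand-in for *fs*: maps URIs to lists of log lines."""

    def __init__(self, uri_to_lines):
        self._uri_to_lines = uri_to_lines

    def cat(self, uri):
        # unknown URIs yield no lines, which the scanner skips
        return self._uri_to_lines.get(uri, [])

# Example call (each URI must match _JOB_LOG_PATH_RE for the mrjob version
# in use, so treat this as a sketch rather than a working invocation):
#
#     counters = scan_for_counters_in_files(
#         log_file_uris, StubFS({uri: [counter_line]}), '0.20')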
def test_freaky_counter_names(self):
    freaky_name = r'\\\\\{\}\(\)\[\]\.\\\\'
    counter_string = (r'Job JOBID="_001" FAILED_REDUCES="0" '
                      r'COUNTERS="{(%s)(%s)[(a)(a)(1)]}"' %
                      (freaky_name, freaky_name))
    self.assertIn('\\{}()[].\\',
                  parse_hadoop_counters_from_line(counter_string)[0])
def scan_for_counters_in_files(log_file_uris, runner, hadoop_version):
    """Scan *log_file_uris* for counters, using *runner* for file system
    access
    """
    counters = {}
    relevant_logs = []  # list of (sort key, URI)

    for log_file_uri in log_file_uris:
        match = EMR_JOB_LOG_URI_RE.match(log_file_uri)
        if match is None:
            match = HADOOP_JOB_LOG_URI_RE.match(log_file_uri)
        if not match:
            continue

        relevant_logs.append((match.group('step_num'), log_file_uri))

    relevant_logs.sort()

    for _, log_file_uri in relevant_logs:
        log_lines = runner.cat(log_file_uri)
        if not log_lines:
            continue

        for line in log_lines:
            new_counters, step_num = parse_hadoop_counters_from_line(
                line, hadoop_version)
            if new_counters:
                counters[step_num] = new_counters

    return counters
def test_find_counters_0_18(self):
    counters, step_num = parse_hadoop_counters_from_line(
        'Job JOBID="job_201106061823_0001" FINISH_TIME="1307384737542" JOB_STATUS="SUCCESS" FINISHED_MAPS="2" FINISHED_REDUCES="1" FAILED_MAPS="0" FAILED_REDUCES="0" COUNTERS="File Systems.S3N bytes read:3726,File Systems.Local bytes read:4164,File Systems.S3N bytes written:1663,File Systems.Local bytes written:8410,Job Counters .Launched reduce tasks:1,Job Counters .Rack-local map tasks:2,Job Counters .Launched map tasks:2,Map-Reduce Framework.Reduce input groups:154,Map-Reduce Framework.Combine output records:0,Map-Reduce Framework.Map input records:68,Map-Reduce Framework.Reduce output records:154,Map-Reduce Framework.Map output bytes:3446,Map-Reduce Framework.Map input bytes:2483,Map-Reduce Framework.Map output records:336,Map-Reduce Framework.Combine input records:0,Map-Reduce Framework.Reduce input records:336,profile.reducer step 0 estimated IO time: 0.00:1,profile.mapper step 0 estimated IO time: 0.00:2,profile.reducer step 0 estimated CPU time: 0.00:1,profile.mapper step ☃ estimated CPU time: 0.00:2"')
    self.assertEqual(
        counters['profile']['reducer step 0 estimated IO time: 0.00'], 1)
    self.assertEqual(
        counters['profile']['mapper step ☃ estimated CPU time: 0.00'], 2)
    self.assertEqual(step_num, 1)
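# For reference (inferred from the line exercised above): in the 0.18 log
# format, COUNTERS is one flat string of comma-separated
# "Group.Counter name:value" entries. Counter names may themselves contain
# '.' and ':' (e.g. "reducer step 0 estimated IO time: 0.00"), so the
# delimiters alone do not split the string unambiguously.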
def test_find_weird_counters_0_20(self):
    counters, step_num = parse_hadoop_counters_from_line(
        r'Job JOBID="job_201106132124_0001" FINISH_TIME="1308000435810" JOB_STATUS="SUCCESS" FINISHED_MAPS="2" FINISHED_REDUCES="1" FAILED_MAPS="0" FAILED_REDUCES="0" COUNTERS="{(org\.apache\.hadoop\.mapred\.JobInProgress$Counter)(Job Counters )[(TOTAL_LAUNCHED_REDUCES)(Launched reduce tasks)(1)][(RACK_LOCAL_MAPS)(Rack-local map tasks)(2)][(TOTAL_LAUNCHED_MAPS)(Launched map tasks)(2)]}{(FileSystemCounters)(FileSystemCounters)[(FILE_BYTES_READ)(FILE_BYTES_READ)(1494)][(S3_BYTES_READ)(S3_BYTES_READ)(3726)][(FILE_BYTES_WRITTEN)(FILE_BYTES_WRITTEN)(3459)][(S3_BYTES_WRITTEN)(S3_BYTES_WRITTEN)(1663)]}{(weird counters)(weird counters)[(\\[\\])(\\[\\])(68)][(\\\\)(\\\\)(68)][(\\{\\})(\\{\\})(68)][(\\(\\))(\\(\\))(68)][(\.)(\.)(68)]}{(org\.apache\.hadoop\.mapred\.Task$Counter)(Map-Reduce Framework)[(REDUCE_INPUT_GROUPS)(Reduce input groups)(154)][(COMBINE_OUTPUT_RECORDS)(Combine output records)(0)][(MAP_INPUT_RECORDS)(Map input records)(68)][(REDUCE_SHUFFLE_BYTES)(Reduce shuffle bytes)(1901)][(REDUCE_OUTPUT_RECORDS)(Reduce output records)(154)][(SPILLED_RECORDS)(Spilled Records)(672)][(MAP_OUTPUT_BYTES)(Map output bytes)(3446)][(MAP_INPUT_BYTES)(Map input bytes)(2483)][(MAP_OUTPUT_RECORDS)(Map output records)(336)][(COMBINE_INPUT_RECORDS)(Combine input records)(0)][(REDUCE_INPUT_RECORDS)(Reduce input records)(336)]}" .')
    self.assertIn('{}', counters['weird counters'])
    self.assertIn('()', counters['weird counters'])
    self.assertIn('.', counters['weird counters'])
    self.assertIn('[]', counters['weird counters'])
    self.assertIn('\\', counters['weird counters'])
    self.assertEqual(step_num, 1)
def test_find_counters_0_18_explicit(self):
    counters, step_num = parse_hadoop_counters_from_line(
        self.TEST_COUNTERS_0_18, hadoop_version='0.18')
    self.assertEqual(
        counters['profile']['reducer step 0 estimated IO time: 0.00'], 1)
    self.assertEqual(
        counters['profile']['mapper step ☃ estimated CPU time: 0.00'], 2)
    self.assertEqual(step_num, 1)
def test_correct_counters_parsed(self):
    map_counters = '{(map_counters)(map_counters)[(a)(a)(1)]}'
    reduce_counters = '{(red_counters)(red_counters)[(b)(b)(1)]}'
    all_counters = '{(all_counters)(all_counters)[(c)(c)(1)]}'

    tricksy_line = (
        'Job JOBID="job_201106092314_0001" '
        'MAP_COUNTERS="%s" REDUCE_COUNTERS="%s" COUNTERS="%s"' %
        (map_counters, reduce_counters, all_counters))

    counters = parse_hadoop_counters_from_line(tricksy_line, '0.20')[0]

    self.assertEqual(counters, {'all_counters': {'c': 1}})
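# As asserted above, when a line carries MAP_COUNTERS and REDUCE_COUNTERS
# alongside COUNTERS, only the job-wide COUNTERS field is parsed; the
# per-phase fields are ignored.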
def test_ambiguous_version_counter(self):
    # minimum text required to match counter line regex
    line = r'JOBID="_1" COUNTERS="{(a.b:1,)(c)[(.d:2)(,e.f:2)(1)]}"'

    counters_018, _ = parse_hadoop_counters_from_line(
        line, hadoop_version='0.18')
    counters_020, _ = parse_hadoop_counters_from_line(
        line, hadoop_version='0.20')
    counters_inf, _ = parse_hadoop_counters_from_line(line)

    self.assertEqual(counters_018, {
        '{(a': {'b': 1},
        ')(c)[(': {'d': 2},
        'e': {'f': 2},
    })
    self.assertEqual(counters_020, {'c': {',e.f:2': 1}})
    # if no version given, should default to 0.20 if possible
    self.assertEqual(counters_020, counters_inf)
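# The same COUNTERS string can be well-formed under both grammars: the 0.18
# parser splits "Group.Counter:value" entries on '.', ':' and ',', while the
# 0.20 parser reads the {(id)(name)[(id)(name)(value)]} structure. As the
# test above asserts, parse_hadoop_counters_from_line prefers the 0.20
# interpretation when hadoop_version is omitted and the line parses that way.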
def test_find_counters_2_0_explicit(self):
    counters, step_num = parse_hadoop_counters_from_line(
        self.TEST_COUNTERS_2_0, hadoop_version='2.4.0')

    self.assertEqual(step_num, 1)
    self.assertEqual(counters, {
        'File System Counters': {
            'FILE: Number of bytes read': 83,
            'FILE: Number of bytes written': 103064,
        },
        'File Output Format Counters ': {
            'Bytes Written': 34,
        }
    })
def test_counters_fuzz(self):
    # test some strings that should break badly formulated parsing regexps
    freakquences = [
        ('\\[\\]\\(\\}\\[\\{\\\\\\\\\\[\\]\\(', '[](}[{\\[]('),
        ('\\)\\}\\\\\\\\\\[\\[\\)\\{\\{\\}\\]', ')}\\[[){{}]'),
        ('\\(\\{\\(\\[\\(\\]\\\\\\\\\\(\\\\\\\\\\\\\\\\', '({([(]\\(\\\\'),
        ('\\)\\{\\[\\)\\)\\(\\}\\(\\\\\\\\\\\\\\\\', '){[))(}(\\\\'),
        ('\\}\\(\\{\\)\\]\\]\\(\\]\\[\\\\\\\\', '}({)]](][\\'),
        ('\\[\\{\\\\\\\\\\)\\\\\\\\\\{\\{\\]\\]\\(', '[{\\)\\{{]]('),
        ('\\\\\\\\\\(\\(\\)\\\\\\\\\\\\\\\\\\\\\\\\\\[\\{\\]', '\\(()\\\\\\[{]'),
        ('\\]\\(\\[\\)\\{\\(\\)\\)\\{\\]', ']([){()){]'),
        ('\\(\\[\\{\\[\\[\\(\\{\\}\\(\\{', '([{[[({}({'),
        ('\\(\\{\\(\\{\\[\\{\\(\\{\\}\\}', '({({[{({}}')]
    for in_str, out_str in freakquences:
        counter_string = (r'Job JOBID="_001" FAILED_REDUCES="0" '
                          r'COUNTERS="{(%s)(%s)[(a)(a)(1)]}"' %
                          (in_str, in_str))
        self.assertIn(out_str,
                      parse_hadoop_counters_from_line(counter_string)[0])
def test_find_counters_0_20_explicit(self):
    counters, step_num = parse_hadoop_counters_from_line(
        self.TEST_COUNTERS_0_20, hadoop_version='0.20')
    self.assertIn('reducer time (processing): 2.51', counters['profile'])
    self.assertEqual(step_num, 3)
def test_find_counters_0_20(self):
    counters, step_num = parse_hadoop_counters_from_line(
        r'Job JOBID="job_201106092314_0003" FINISH_TIME="1307662284564" JOB_STATUS="SUCCESS" FINISHED_MAPS="2" FINISHED_REDUCES="1" FAILED_MAPS="0" FAILED_REDUCES="0" COUNTERS="{(org\.apache\.hadoop\.mapred\.JobInProgress$Counter)(Job Counters )[(TOTAL_LAUNCHED_REDUCES)(Launched reduce tasks)(1)][(TOTAL_LAUNCHED_MAPS)(Launched map tasks)(2)][(DATA_LOCAL_MAPS)(Data-local map tasks)(2)]}{(FileSystemCounters)(FileSystemCounters)[(FILE_BYTES_READ)(FILE_BYTES_READ)(10547174)][(HDFS_BYTES_READ)(HDFS_BYTES_READ)(49661008)][(FILE_BYTES_WRITTEN)(FILE_BYTES_WRITTEN)(21773078)][(S3_BYTES_WRITTEN)(S3_BYTES_WRITTEN)(49526580)]}{(org\.apache\.hadoop\.mapred\.Task$Counter)(Map-Reduce Framework)[(REDUCE_INPUT_GROUPS)(Reduce input groups)(18843)][(COMBINE_OUTPUT_RECORDS)(Combine output records)(0)][(MAP_INPUT_RECORDS)(Map input records)(29884)][(REDUCE_SHUFFLE_BYTES)(Reduce shuffle bytes)(11225840)][(REDUCE_OUTPUT_RECORDS)(Reduce output records)(29884)][(SPILLED_RECORDS)(Spilled Records)(59768)][(MAP_OUTPUT_BYTES)(Map output bytes)(50285563)][(MAP_INPUT_BYTES)(Map input bytes)(49645726)][(MAP_OUTPUT_RECORDS)(Map output records)(29884)][(COMBINE_INPUT_RECORDS)(Combine input records)(0)][(REDUCE_INPUT_RECORDS)(Reduce input records)(29884)]}{(profile)(profile)[(reducer time \\(processing\\): 2\.51)(reducer time \\(processing\\): 2\.51)(1)][(mapper time \\(processing\\): 0\.50)(mapper time \\(processing\\): 0\.50)(1)][(mapper time \\(other\\): 3\.78)(mapper time \\(other\\): 3\.78)(1)][(mapper time \\(processing\\): 0\.46)(mapper time \\(processing\\): 0\.46)(1)][(reducer time \\(other\\): 6\.31)(reducer time \\(other\\): 6\.31)(1)][(mapper time \\(other\\): 3\.72)(mapper time \\(other\\): 3\.72)(1)]}" .')
    self.assertIn('reducer time (processing): 2.51', counters['profile'])
    self.assertEqual(step_num, 3)