# Assumed imports for the test methods below (the original excerpt does
# not show them). The methods reference self, so they presumably belong
# to a TestCase for the Spark runner; mrjob_conf_patcher and
# self.start() are assumed to come from the test suite's sandbox
# helpers.
from io import BytesIO
from os import listdir
from os.path import join
from unittest.mock import patch

from mrjob.examples.mr_word_freq_count import MRWordFreqCount


def test_max_output_files(self):
    job = MRWordFreqCount(['-r', 'spark', '--max-output-files', '1'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self._num_output_files(runner), 1)

def test_num_reducers(self):
    jobconf_args = ['--jobconf', 'mapreduce.job.reduces=1']

    job = MRWordFreqCount(['-r', 'spark'] + jobconf_args)
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self._num_output_files(runner), 1)

def test_basic_job(self):
    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(
        stdin=BytesIO(b'one fish\ntwo fish\nred fish\nblue fish\n'))

    with job.make_runner() as runner:
        runner.run()

        output = dict(job.parse_output(runner.cat_output()))

        self.assertEqual(output,
                         dict(blue=1, fish=4, one=1, red=1, two=1))

def test_max_output_files_is_cmd_line_only(self):
    self.start(mrjob_conf_patcher(
        dict(runners=dict(spark=dict(max_output_files=1)))))

    log = self.start(patch('mrjob.runner.log'))

    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        # by default there should be at least 2 output files
        self.assertNotEqual(self._num_output_files(runner), 1)

    self.assertTrue(log.warning.called)

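# The tests above call self._num_output_files(), which is not defined in
# this excerpt. A minimal sketch, assuming it simply counts the part-*
# files Spark wrote to the runner's output directory:
def _num_output_files(self, runner):
    # one part-* file is written per output partition
    return sum(
        1 for f in listdir(runner.get_output_dir())
        if f.startswith('part'))
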
import re

from mrjob import protocol
from mrjob.job import MRJob

WORD_RE = re.compile(r"[\w']+")


class MRWordFreqCount(MRJob):

    OUTPUT_PROTOCOL = protocol.JSONProtocol

    def mapper(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def reducer(self, word, counts):
        yield (word, sum(counts))


if __name__ == '__main__':
    MRWordFreqCount().run()

def test_file_uris_as_input(self):
    input1 = self.makefile('input1.txt', b'cat rat bat')
    input2 = 'file://' + self.makefile('input2.txt', b'dog dog dog')

    job = MRWordFreqCount([input1, input2])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(dict(job.parse_output(runner.cat_output())),
                         dict(bat=1, cat=1, dog=3, rat=1))

def test_the_wheels_on_the_bus(self):
    RAW_INPUT = b"""
        The wheels on the bus go round and round,
        round and round,
        round and round
        The wheels on the bus go round and round,
        all through the town.
        """

    EXPECTED_OUTPUT = {
        u'all': 1,
        u'and': 4,
        u'bus': 2,
        u'go': 2,
        u'on': 2,
        u'round': 8,
        u'the': 5,
        u'through': 1,
        u'town': 1,
        u'wheels': 2,
    }

    self.assertEqual(run_job(MRWordFreqCount(), RAW_INPUT),
                     EXPECTED_OUTPUT)

def test_compression(self):
    # deliberately mix Hadoop 1 and 2 config properties
    jobconf_args = [
        '--jobconf',
        ('mapred.output.compression.codec='
         'org.apache.hadoop.io.compress.GzipCodec'),
        '--jobconf',
        'mapreduce.output.fileoutputformat.compress=true',
    ]

    job = MRWordFreqCount(['-r', 'spark'] + jobconf_args)
    job.sandbox(stdin=BytesIO(b'fa la la la la\nla la la la\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertTrue(
            runner.fs.exists(join(runner.get_output_dir(), 'part*.gz')))

        self.assertEqual(dict(job.parse_output(runner.cat_output())),
                         dict(fa=1, la=8))

def test_empty(self):
    self.assertEqual(run_job(MRWordFreqCount()), {})

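# run_job() is also not defined in this excerpt. A minimal sketch,
# assuming it sandboxes the job on the given raw input, runs it, and
# returns the parsed output as a dict, consistent with how
# test_the_wheels_on_the_bus and test_empty call it:
def run_job(job, raw_input=b''):
    job.sandbox(stdin=BytesIO(raw_input))
    with job.make_runner() as runner:
        runner.run()
        return dict(job.parse_output(runner.cat_output()))
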
# Driver script: run the word-frequency job and print the ten most
# frequent words.
import sys
from operator import itemgetter

from mrjob.examples.mr_word_freq_count import MRWordFreqCount

if __name__ == '__main__':
    job = MRWordFreqCount(args=sys.argv[1:])

    results = []

    with job.make_runner() as runner:
        runner.run()

        # collect (word, count) pairs from the job's output
        for key, value in job.parse_output(runner.cat_output()):
            results.append((key, value))

    # sort by count, highest first, and print the top ten
    sorted_results = sorted(results, key=itemgetter(1), reverse=True)
    print(sorted_results[:10])