def test_max_output_files(self):
    job = MRWordFreqCount(['-r', 'spark', '--max-output-files', '1'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self._num_output_files(runner), 1)

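# Several of these tests call a _num_output_files() helper that is not
# defined in this excerpt. A minimal sketch, assuming it simply counts the
# part-* files Spark wrote to the runner's output directory (and that
# os.listdir is imported as listdir):
def _num_output_files(self, runner):
    # count output partitions (files named part-*) in the output dir
    return sum(1 for f in listdir(runner.get_output_dir())
               if f.startswith('part'))
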
def test_num_reducers(self):
    jobconf_args = ['--jobconf', 'mapreduce.job.reduces=1']

    job = MRWordFreqCount(['-r', 'spark'] + jobconf_args)
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self._num_output_files(runner), 1)

def test_basic_job(self):
    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(
        stdin=BytesIO(b'one fish\ntwo fish\nred fish\nblue fish\n'))

    with job.make_runner() as runner:
        runner.run()

        output = dict(job.parse_output(runner.cat_output()))

        self.assertEqual(output,
                         dict(blue=1, fish=4, one=1, red=1, two=1))

def test_file_uris_as_input(self):
    input1 = self.makefile('input1.txt', b'cat rat bat')
    input2 = 'file://' + self.makefile('input2.txt', b'dog dog dog')

    job = MRWordFreqCount([input1, input2])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(dict(job.parse_output(runner.cat_output())),
                         dict(bat=1, cat=1, dog=3, rat=1))

def test_max_output_files_is_cmd_line_only(self):
    self.start(mrjob_conf_patcher(
        dict(runners=dict(spark=dict(max_output_files=1)))))

    log = self.start(patch('mrjob.runner.log'))

    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        # by default there should be at least 2 output files
        self.assertNotEqual(self._num_output_files(runner), 1)

        self.assertTrue(log.warning.called)

def test_compression(self):
    # deliberately mix Hadoop 1 and 2 config properties
    jobconf_args = [
        '--jobconf',
        ('mapred.output.compression.codec='
         'org.apache.hadoop.io.compress.GzipCodec'),
        '--jobconf',
        'mapreduce.output.fileoutputformat.compress=true',
    ]

    job = MRWordFreqCount(['-r', 'spark'] + jobconf_args)
    job.sandbox(stdin=BytesIO(b'fa la la la la\nla la la la\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertTrue(
            runner.fs.exists(join(runner.get_output_dir(), 'part*.gz')))

        self.assertEqual(dict(job.parse_output(runner.cat_output())),
                         dict(fa=1, la=8))