def test_jobconf_simulated_by_runner(self):
    """Check every jobconf variable the runner simulates for a mapper task.

    Runs MRTestJobConf (which emits jobconf key/value pairs) with a single
    line of input and mapred.map.tasks=1, then asserts on each simulated
    jobconf value.
    """
    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "wb") as input_file:
        # BUG FIX: file is opened in binary mode, so we must write bytes;
        # writing str here raises TypeError on Python 3
        input_file.write(b"foo\n")

    upload_path = os.path.join(self.tmp_dir, "upload")
    with open(upload_path, "wb") as upload_file:
        # BUG FIX: bytes, not str, for a binary-mode file
        upload_file.write(b"PAYLOAD")

    mr_job = MRTestJobConf(
        [
            "-r", self.RUNNER,
            "--jobconf=user.defined=something",
            "--jobconf=mapred.map.tasks=1",
            "--file", upload_path,
            input_path,
        ]
    )
    mr_job.sandbox()

    results = {}

    # between the single line of input and setting mapred.map.tasks to 1,
    # we should be restricted to only one task, which will give more
    # predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results[key] = value

    working_dir = results["mapreduce.job.local.dir"]
    self.assertEqual(
        working_dir,
        os.path.join(
            runner._get_local_tmp_dir(), "job_local_dir", "0", "mapper", "0"))

    self.assertEqual(results["mapreduce.job.cache.archives"], "")
    self.assertEqual(
        results["mapreduce.job.cache.files"],
        script_path + "#mr_test_jobconf.py" + "," + upload_path + "#upload")
    self.assertEqual(results["mapreduce.job.cache.local.archives"], "")
    self.assertEqual(
        results["mapreduce.job.cache.local.files"],
        os.path.join(working_dir, "mr_test_jobconf.py") + "," +
        os.path.join(working_dir, "upload"),
    )
    self.assertEqual(results["mapreduce.job.id"], runner._job_name)
    self.assertEqual(results["mapreduce.map.input.file"], input_path)
    self.assertEqual(results["mapreduce.map.input.length"], "4")
    self.assertEqual(results["mapreduce.map.input.start"], "0")
    self.assertEqual(
        results["mapreduce.task.attempt.id"],
        "attempt_%s_mapper_000000_0" % runner._job_name)
    self.assertEqual(
        results["mapreduce.task.id"],
        "task_%s_mapper_000000" % runner._job_name)
    self.assertEqual(results["mapreduce.task.ismap"], "true")
    self.assertEqual(results["mapreduce.task.output.dir"], runner._output_dir)
    self.assertEqual(results["mapreduce.task.partition"], "0")
    self.assertEqual(results["user.defined"], "something")
def test_others(self):
    """Check jobconf variables the inline runner simulates by itself."""
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('foo\n')

    mr_job = MRTestJobConf(
        ['-r', 'inline', '--jobconf=user.defined=something', input_path])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()
        # map each emitted jobconf name to its simulated value
        results = dict(
            mr_job.parse_output_line(line)
            for line in runner.stream_output())

    expected = {
        'mapreduce.job.id': runner._job_name,
        'mapreduce.job.local.dir': runner._working_dir,
        'mapreduce.map.input.file': input_path,
        'mapreduce.map.input.length': '4',
        'mapreduce.map.input.start': '0',
        'mapreduce.task.attempt.id':
            'attempt_%s_mapper_000000_0' % runner._job_name,
        'mapreduce.task.id': 'task_%s_mapper_000000' % runner._job_name,
        'mapreduce.task.ismap': 'true',
        'mapreduce.task.output.dir': runner._output_dir,
        'mapreduce.task.partition': '0',
        'user.defined': 'something',
    }
    for jobconf, value in sorted(expected.items()):
        self.assertEqual(results[jobconf], value)
def test_jobconf_simulated_by_runner(self):
    """Check every jobconf variable the runner simulates for a mapper task.

    Runs MRTestJobConf (which emits jobconf key/value pairs) with a single
    line of input and mapred.map.tasks=1, then asserts on each simulated
    jobconf value.
    """
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        # BUG FIX: file is opened in binary mode, so we must write bytes;
        # writing str here raises TypeError on Python 3
        input_file.write(b'foo\n')

    upload_path = os.path.join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as upload_file:
        # BUG FIX: bytes, not str, for a binary-mode file
        upload_file.write(b'PAYLOAD')

    mr_job = MRTestJobConf(
        ['-r', self.RUNNER,
         '--jobconf=user.defined=something',
         '--jobconf=mapred.map.tasks=1',
         '--file', upload_path,
         input_path])
    mr_job.sandbox()

    results = {}

    # between the single line of input and setting mapred.map.tasks to 1,
    # we should be restricted to only one task, which will give more
    # predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results[key] = value

    working_dir = results['mapreduce.job.local.dir']
    self.assertEqual(
        working_dir,
        os.path.join(
            runner._get_local_tmp_dir(), 'job_local_dir', '0', 'mapper', '0'))

    self.assertEqual(results['mapreduce.job.cache.archives'], '')
    self.assertEqual(
        results['mapreduce.job.cache.files'],
        script_path + '#mr_test_jobconf.py' + ',' + upload_path + '#upload')
    self.assertEqual(results['mapreduce.job.cache.local.archives'], '')
    self.assertEqual(
        results['mapreduce.job.cache.local.files'],
        os.path.join(working_dir, 'mr_test_jobconf.py') + ',' +
        os.path.join(working_dir, 'upload'))
    self.assertEqual(results['mapreduce.job.id'], runner._job_name)
    self.assertEqual(results['mapreduce.map.input.file'], input_path)
    self.assertEqual(results['mapreduce.map.input.length'], '4')
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(
        results['mapreduce.task.attempt.id'],
        'attempt_%s_mapper_000000_0' % runner._job_name)
    self.assertEqual(
        results['mapreduce.task.id'],
        'task_%s_mapper_000000' % runner._job_name)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'], runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')
def test_mapper_init(self):
    """mapreduce.map.input.file should be emulated for the input file."""
    lines_path = self.makefile('two_lines', b'line\nother line\n')

    job = MRTestJobConf(
        ['-r', 'spark', '--emulate-map-input-file', lines_path])

    with job.make_runner() as runner:
        runner.run()
        jobconf_to_value = dict(job.parse_output(runner.cat_output()))

    expected_uri = 'file://' + lines_path
    self.assertEqual(
        jobconf_to_value['mapreduce.map.input.file'], expected_uri)
def test_empty_file(self):
    """An empty input file shouldn't crash --emulate-map-input-file."""
    two_lines_path = self.makefile('two_lines', b'line\nother line\n')
    no_lines_path = self.makefile('no_lines', b'')

    # BUG FIX: no_lines_path was created but never passed to the job,
    # so the empty-file case this test exists for was never exercised
    job = MRTestJobConf(
        ['-r', 'spark', '--emulate-map-input-file',
         two_lines_path, no_lines_path])

    with job.make_runner() as runner:
        runner.run()

        paths = [
            path for jobconf, path
            in job.parse_output(runner.cat_output())
            if jobconf == 'mapreduce.map.input.file'
        ]

    # ideally, no_lines_path would appear too, but what we care
    # about is that we don't get a crash from trying to read
    # the "first" line of the file
    self.assertEqual(paths, ['file://' + two_lines_path])
def test_others(self):
    """Check jobconf variables the inline runner simulates by itself."""
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as f:
        f.write('foo\n')

    mr_job = MRTestJobConf(
        ['-r', 'inline', '--jobconf=user.defined=something', input_path])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()
        results = {}
        for line in runner.stream_output():
            jobconf, value = mr_job.parse_output_line(line)
            results[jobconf] = value

    job_name = runner._job_name
    self.assertEqual(results['mapreduce.job.id'], job_name)
    self.assertEqual(results['mapreduce.job.local.dir'], runner._working_dir)
    self.assertEqual(results['mapreduce.map.input.file'], input_path)
    self.assertEqual(results['mapreduce.map.input.length'], '4')
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(results['mapreduce.task.attempt.id'],
                     'attempt_%s_mapper_000000_0' % job_name)
    self.assertEqual(results['mapreduce.task.id'],
                     'task_%s_mapper_000000' % job_name)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'],
                     runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')
def test_jobconf_simulated_by_runner(self):
    """Check every jobconf variable the runner simulates for a mapper task.

    Runs MRTestJobConf (which emits jobconf key/value pairs) with a single
    line of input and mapred.map.tasks=1, then asserts on each simulated
    jobconf value.
    """
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        # BUG FIX: file is opened in binary mode, so we must write bytes;
        # writing str here raises TypeError on Python 3
        input_file.write(b'foo\n')

    upload_path = os.path.join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as upload_file:
        # BUG FIX: bytes, not str, for a binary-mode file
        upload_file.write(b'PAYLOAD')

    mr_job = MRTestJobConf([
        '-r', self.RUNNER,
        '--jobconf=user.defined=something',
        '--jobconf=mapred.map.tasks=1',
        '--file', upload_path,
        input_path
    ])
    mr_job.sandbox()

    results = {}

    # between the single line of input and setting mapred.map.tasks to 1,
    # we should be restricted to only one task, which will give more
    # predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results[key] = value

    working_dir = results['mapreduce.job.local.dir']
    self.assertEqual(
        working_dir,
        os.path.join(runner._get_local_tmp_dir(),
                     'job_local_dir', '0', 'mapper', '0'))

    self.assertEqual(results['mapreduce.job.cache.archives'], '')
    self.assertEqual(
        results['mapreduce.job.cache.files'],
        script_path + '#mr_test_jobconf.py' + ',' + upload_path + '#upload')
    self.assertEqual(results['mapreduce.job.cache.local.archives'], '')
    self.assertEqual(
        results['mapreduce.job.cache.local.files'],
        os.path.join(working_dir, 'mr_test_jobconf.py') + ',' +
        os.path.join(working_dir, 'upload'))
    self.assertEqual(results['mapreduce.job.id'], runner._job_name)
    self.assertEqual(results['mapreduce.map.input.file'], input_path)
    self.assertEqual(results['mapreduce.map.input.length'], '4')
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(
        results['mapreduce.task.attempt.id'],
        'attempt_%s_mapper_000000_0' % runner._job_name)
    self.assertEqual(
        results['mapreduce.task.id'],
        'task_%s_mapper_000000' % runner._job_name)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'], runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')
def test_jobconf_simulated_by_runner(self):
    """Check every jobconf variable the runner simulates for a mapper task."""
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as f:
        f.write(b'foo\n')

    upload_path = os.path.join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as f:
        f.write(b'PAYLOAD')

    # use --no-bootstrap-mrjob so we don't have to worry about
    # mrjob.tar.gz and the setup wrapper script
    self.add_mrjob_to_pythonpath()

    mr_job = MRTestJobConf([
        '-r', self.RUNNER,
        '--no-bootstrap-mrjob',
        '--jobconf=user.defined=something',
        '--jobconf=mapred.map.tasks=1',
        '--file', upload_path,
        input_path
    ])
    mr_job.sandbox()

    # between the single line of input and setting mapred.map.tasks to 1,
    # we should be restricted to only one task, which will give more
    # predictable results
    results = {}
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        for line in runner.stream_output():
            jobconf, value = mr_job.parse_output_line(line)
            results[jobconf] = value

    working_dir = results['mapreduce.job.local.dir']
    self.assertEqual(
        working_dir,
        os.path.join(runner._get_local_tmp_dir(),
                     'job_local_dir', '0', 'mapper', '0'))

    self.assertEqual(results['mapreduce.job.cache.archives'], '')

    # the script and the uploaded file, plus any runner-specific extras
    cache_files = [script_path + '#mr_test_jobconf.py',
                   upload_path + '#upload']
    cache_files += ['%s#%s' % (path, name)
                    for path, name in self._extra_expected_local_files(runner)]
    self.assertEqual(
        sorted(results['mapreduce.job.cache.files'].split(',')),
        sorted(cache_files))

    self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

    local_files = [os.path.join(working_dir, 'mr_test_jobconf.py'),
                   os.path.join(working_dir, 'upload')]
    local_files += [os.path.join(working_dir, name)
                    for path, name in self._extra_expected_local_files(runner)]
    self.assertEqual(
        sorted(results['mapreduce.job.cache.local.files'].split(',')),
        sorted(local_files))

    self.assertEqual(results['mapreduce.job.id'], runner._job_key)
    self.assertEqual(results['mapreduce.map.input.file'], input_path)
    self.assertEqual(results['mapreduce.map.input.length'], '4')
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(
        results['mapreduce.task.attempt.id'],
        'attempt_%s_mapper_00000_0' % runner._job_key)
    self.assertEqual(
        results['mapreduce.task.id'],
        'task_%s_mapper_00000' % runner._job_key)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'], runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')
def test_jobconf_simulated_by_runner(self):
    """Check every jobconf variable the runner simulates for a mapper task."""
    # use a .gz file so there's only one split
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')
    input_gz_size = os.stat(input_gz_path)[stat.ST_SIZE]

    upload_path = os.path.join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as f:
        f.write(b'PAYLOAD')

    # use --no-bootstrap-mrjob so we don't have to worry about
    # mrjob.tar.gz and the setup wrapper script
    self.add_mrjob_to_pythonpath()

    mr_job = MRTestJobConf([
        '-r', self.RUNNER,
        '--no-bootstrap-mrjob',
        '--jobconf=user.defined=something',
        '--file', upload_path,
        input_gz_path,
    ])
    mr_job.sandbox()

    results = {}

    # the single-split .gz input restricts us to one task, which gives
    # more predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        results.update(dict(mr_job.parse_output(runner.cat_output())))

    working_dir = results['mapreduce.job.local.dir']
    self.assertEqual(
        working_dir,
        os.path.join(runner._get_local_tmp_dir(),
                     'step', '000', 'mapper', '00000', 'wd'))

    self.assertEqual(results['mapreduce.job.cache.archives'], '')

    # the script and the uploaded file, plus any runner-specific extras
    cache_files = [script_path + '#mr_test_jobconf.py',
                   upload_path + '#upload']
    cache_files += ['%s#%s' % (path, name)
                    for path, name in self._extra_expected_local_files(runner)]
    self.assertEqual(
        sorted(results['mapreduce.job.cache.files'].split(',')),
        sorted(cache_files))

    self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

    local_files = [os.path.join(working_dir, 'mr_test_jobconf.py'),
                   os.path.join(working_dir, 'upload')]
    local_files += [os.path.join(working_dir, name)
                    for path, name in self._extra_expected_local_files(runner)]
    self.assertEqual(
        sorted(results['mapreduce.job.cache.local.files'].split(',')),
        sorted(local_files))

    self.assertEqual(results['mapreduce.job.id'], runner._job_key)
    self.assertEqual(results['mapreduce.map.input.file'], input_gz_path)
    self.assertEqual(
        results['mapreduce.map.input.length'], str(input_gz_size))
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(
        results['mapreduce.task.attempt.id'],
        'attempt_%s_mapper_00000_0' % runner._job_key)
    self.assertEqual(
        results['mapreduce.task.id'],
        'task_%s_mapper_00000' % runner._job_key)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'], runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')
def test_jobconf_simulated_by_runner(self):
    """Check every jobconf variable the runner simulates for a mapper task."""
    # use a .gz file so there's only one split
    input_gz_path = join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')
    input_gz_size = os.stat(input_gz_path)[stat.ST_SIZE]

    upload_path = join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as f:
        f.write(b'PAYLOAD')

    # use --no-bootstrap-mrjob so we don't have to worry about
    # mrjob.tar.gz and the setup wrapper script
    self.add_mrjob_to_pythonpath()

    mr_job = MRTestJobConf([
        '-r', self.RUNNER,
        '--no-bootstrap-mrjob',
        '-D=user.defined=something',
        '--files', upload_path,
        input_gz_path
    ])
    mr_job.sandbox()

    results = {}

    # the single-split .gz input restricts us to one task, which gives
    # more predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        results.update(dict(mr_job.parse_output(runner.cat_output())))

    working_dir = results['mapreduce.job.local.dir']
    self.assertEqual(
        working_dir,
        join(runner._get_local_tmp_dir(),
             'step', '000', 'mapper', '00000', 'wd'))

    self.assertEqual(results['mapreduce.job.cache.archives'], '')

    # the script and the uploaded file, plus any runner-specific extras
    cache_files = [script_path + '#mr_test_jobconf.py',
                   upload_path + '#upload']
    cache_files += ['%s#%s' % (path, name)
                    for path, name in self._extra_expected_local_files(runner)]
    self.assertEqual(
        sorted(results['mapreduce.job.cache.files'].split(',')),
        sorted(cache_files))

    self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

    local_files = [join(working_dir, 'mr_test_jobconf.py'),
                   join(working_dir, 'upload')]
    local_files += [join(working_dir, name)
                    for path, name in self._extra_expected_local_files(runner)]
    self.assertEqual(
        sorted(results['mapreduce.job.cache.local.files'].split(',')),
        sorted(local_files))

    self.assertEqual(results['mapreduce.job.id'], runner._job_key)
    self.assertEqual(
        results['mapreduce.map.input.file'], 'file://' + input_gz_path)
    self.assertEqual(
        results['mapreduce.map.input.length'], str(input_gz_size))
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(
        results['mapreduce.task.attempt.id'],
        'attempt_%s_mapper_00000_0' % runner._job_key)
    self.assertEqual(
        results['mapreduce.task.id'],
        'task_%s_mapper_00000' % runner._job_key)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'], runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')