class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_args_for_step',
                          return_value=['hadoop_args_for_step'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        self.runner._hadoop_version = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper', 'python my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer', 'python my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper'",
             '-combiner',
             "bash -c 'grep nothing | python my_job.py --step-num=0"
             " --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ['-mapper',
             "bash -c 'cat | sort | python my_job.py --step-num=0"
             " --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper | sort | grep nothing | python my_job.py"
             " --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
             " python my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])
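# The nested quoting asserted in test_pre_filter_escaping falls out of
# single-quote escaping applied twice: once by bash_wrap() on the filter
# itself, and once more when the runner wraps the whole filter-plus-script
# pipeline in bash -c '...'. A minimal sketch of that escaping idiom
# (the real bash_wrap lives in mrjob.util; this sketch assumes the usual
# close-quote/escaped-quote/reopen-quote trick rather than quoting its
# exact source):

def sketch_bash_wrap(cmd_str):
    """Wrap a command in bash -c '...', escaping embedded single quotes.

    Inside single quotes bash has no escape character, so each embedded
    quote becomes '\\'' -- close the quote, emit a literal quote, reopen.
    """
    return "bash -c '%s'" % cmd_str.replace("'", "'\\''")

# Applied once (actual characters shown, not Python literals):
#   sketch_bash_wrap("grep 'anything'")
#   -> bash -c 'grep '\''anything'\'''
# Wrapping that result again multiplies the escapes, which is exactly the
# quote explosion the test expects around 'anything'.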
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.2.0'))
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        HadoopFilesystem.get_hadoop_version.return_value = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper', PYTHON_BIN + ' my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer', PYTHON_BIN + ' my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             '-combiner',
             "bash -c 'grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ['-mapper',
             "bash -c 'cat | sort | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0 --mapper | sort |"
             " grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
             PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])
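# This revision drops the local simple_patch() helper in favor of
# self.start(patcher), presumably a convenience on the base test case that
# starts a mock patcher and schedules its stop() for teardown. A minimal
# sketch of that idiom (an assumption about the test sandbox, not its
# verbatim source):

from unittest import TestCase
from unittest.mock import patch


class PatcherTestCase(TestCase):

    def start(self, patcher):
        """Start a patcher, undo it at test teardown, return the mock."""
        mock = patcher.start()
        self.addCleanup(patcher.stop)
        return mock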
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    BASIC_HADOOP_ARGS = [
        'hadoop',
        'jar', '<streaming jar>',
        '<upload args>',
        '<hadoop args for step>',
    ]

    BASIC_JOB_ARGS = [
        '-input', '<hdfs step input files>',
        '-output', '<hdfs step output dir>',
    ]

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['<upload args>']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['<hadoop args for step>']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['<hdfs step input files>']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='<hdfs step output dir>'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='2.7.1'))
        self.runner._script_path = 'my_job.py'

    def test_basic_mapper(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))

    def test_basic_mapper_pre_yarn(self):
        # use a different jobconf (-D) on pre-YARN
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.0.3'))

        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapred.reduce.tasks=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))

    def test_basic_reducer(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
                '-mapper', 'cat',
                '-reducer',
                PYTHON_BIN + ' my_job.py --step-num=0 --reducer']))

    def test_pre_filters(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
                '-mapper',
                "bash -c 'grep anything | " + PYTHON_BIN +
                " my_job.py --step-num=0 --mapper'",
                '-combiner',
                "bash -c 'grep nothing | " + PYTHON_BIN +
                " my_job.py --step-num=0 --combiner'",
                '-reducer',
                "bash -c 'grep something | " + PYTHON_BIN +
                " my_job.py --step-num=0 --reducer'"]))

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 "bash -c 'bash -c '\\''grep"
                 " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
                 PYTHON_BIN +
                 " my_job.py --step-num=0 --mapper'"]))
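# test_basic_mapper vs. test_basic_mapper_pre_yarn above hinge on the
# property rename that came with YARN: mapred.reduce.tasks became
# mapreduce.job.reduces. A sketch of the version gate the runner presumably
# applies when forcing a map-only job (illustrative helper name; mrjob's
# real jobconf translation is more general than this):

def reduces_jobconf(hadoop_version):
    """Return the -D property that disables reducers for this version.

    YARN (Hadoop 2.x, plus the 0.23 line it grew out of) renamed
    mapred.reduce.tasks to mapreduce.job.reduces; older versions keep
    the original name.
    """
    parts = tuple(int(p) for p in hadoop_version.split('.')[:2])
    uses_yarn = parts >= (2,) or (0, 23) <= parts < (1,)
    return ('mapreduce.job.reduces=0' if uses_yarn
            else 'mapred.reduce.tasks=0')

# reduces_jobconf('2.7.1') -> 'mapreduce.job.reduces=0'
# reduces_jobconf('1.0.3') -> 'mapred.reduce.tasks=0'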
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {
        'runners': {
            'hadoop': {
                'hadoop_home': 'kansas',
                'hadoop_streaming_jar': 'binks.jar.jar',
            }
        }
    }

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_conf_args',
                          return_value=['hadoop_conf_args'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_conf_args',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_conf_args',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args, step_num=0, num_steps=1):
        self.assertEqual(
            self.runner._streaming_args(step, step_num, num_steps),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args, step_num=0,
                                   num_steps=1):
        self.runner._hadoop_version = '0.18'
        self.assertEqual(
            self.runner._streaming_args(step, step_num, num_steps),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper', 'python my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer', 'python my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper'",
             '-combiner',
             "bash -c 'grep nothing | python my_job.py --step-num=0"
             " --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ['-mapper',
             "bash -c 'cat | sort | python my_job.py --step-num=0"
             " --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper | sort | grep nothing | python my_job.py"
             " --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
             " python my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])