class FindProbableCauseOfFailureTestCase(MockHadoopTestCase):

    # integration tests for _find_probable_cause_of_failure()

    def setUp(self):
        super(FindProbableCauseOfFailureTestCase, self).setUp()
        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'
        self.runner = HadoopJobRunner()

    def test_empty(self):
        self.assertEqual(self.runner._find_probable_cause_of_failure(), None)

    def test_yarn_python_exception(self):
        APPLICATION_ID = 'application_1450486922681_0004'
        CONTAINER_ID = 'container_1450486922681_0005_01_000003'

        log_subdir = os.path.join(
            os.environ['HADOOP_HOME'], 'logs',
            'userlogs', APPLICATION_ID, CONTAINER_ID)
        os.makedirs(log_subdir)

        syslog_path = os.path.join(log_subdir, 'syslog')
        with open(syslog_path, 'w') as syslog:
            syslog.write(
                '2015-12-21 14:06:17,707 INFO [main]'
                ' org.apache.hadoop.mapred.MapTask: Processing split:'
                ' hdfs://e4270474c8ee:9000/user/root/tmp/mrjob'
                '/mr_boom.root.20151221.190511.059097/files'
                '/bootstrap.sh:0+335\n')
            syslog.write(
                '2015-12-21 14:06:18,538 WARN [main]'
                ' org.apache.hadoop.mapred.YarnChild: Exception running child'
                ' : java.lang.RuntimeException:'
                ' PipeMapRed.waitOutputThreads(): subprocess failed with'
                ' code 1\n')
            syslog.write(
                ' at org.apache.hadoop.streaming.PipeMapRed'
                '.waitOutputThreads(PipeMapRed.java:322)\n')

        stderr_path = os.path.join(log_subdir, 'stderr')
        with open(stderr_path, 'w') as stderr:
            stderr.write('Traceback (most recent call last):\n')
            stderr.write('  File "mr_boom.py", line 10, in <module>\n')
            stderr.write('    MRBoom.run()\n')
            stderr.write('Exception: BOOM\n')

        # need application_id
        self.assertIsNone(self.runner._find_probable_cause_of_failure())

        cause = self.runner._find_probable_cause_of_failure(
            application_id=APPLICATION_ID)
        self.assertTrue(cause)

        self.assertEqual(cause['syslog']['path'], syslog_path)
        self.assertTrue(cause['syslog']['error'])

        self.assertEqual(cause['stderr']['path'], stderr_path)
        self.assertTrue(cause['stderr']['error'])
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=StringIO())
    self.runner._add_job_files_for_upload()
    self.runner._hadoop_version = '0.20.204'
    self.simple_patch(self.runner, '_new_upload_args',
                      return_value=['new_upload_args'])
    self.simple_patch(self.runner, '_old_upload_args',
                      return_value=['old_upload_args'])
    self.simple_patch(self.runner, '_hadoop_args_for_step',
                      return_value=['hadoop_args_for_step'])
    self.simple_patch(self.runner, '_hdfs_step_input_files',
                      return_value=['hdfs_step_input_files'])
    self.simple_patch(self.runner, '_hdfs_step_output_dir',
                      return_value='hdfs_step_output_dir')
    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(
        self.runner, '_upload_args', return_value=['<upload args>']))
    self.start(patch.object(
        self.runner, '_hadoop_args_for_step',
        return_value=['<hadoop args for step>']))
    self.start(patch.object(
        self.runner, '_hdfs_step_input_files',
        return_value=['<hdfs step input files>']))
    self.start(patch.object(
        self.runner, '_hdfs_step_output_dir',
        return_value='<hdfs step output dir>'))
    self.start(patch.object(
        HadoopFilesystem, 'get_hadoop_version', return_value='2.7.1'))

    self.runner._script_path = 'my_job.py'
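# The setUp() above (and several below) relies on a start() helper from the
# test base class, which ties each mock.patch patcher to the test's cleanup
# phase. A minimal sketch of how such a helper is typically written, assuming
# only the standard unittest/mock APIs (the helper in mrjob's own test
# sandbox may differ in detail):
def start(self, patcher):
    """Start a patcher, arrange for it to be stopped at test teardown,
    and return the mock object it creates."""
    mock_obj = patcher.start()
    self.addCleanup(patcher.stop)
    return mock_obj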
def test_du(self):
    root = os.environ['MOCK_HDFS_ROOT']

    data_path_1 = os.path.join(root, 'data1')
    with open(data_path_1, 'w') as f:
        f.write("abcd")
    remote_data_1 = 'hdfs:///data1'

    data_dir = os.path.join(root, 'more')
    os.mkdir(data_dir)
    remote_dir = 'hdfs:///more'

    data_path_2 = os.path.join(data_dir, 'data2')
    with open(data_path_2, 'w') as f:
        f.write("defg")
    remote_data_2 = 'hdfs:///more/data2'

    data_path_3 = os.path.join(data_dir, 'data3')
    with open(data_path_3, 'w') as f:
        f.write("hijk")
    remote_data_3 = 'hdfs:///more/data3'

    runner = HadoopJobRunner(conf_path=False)

    self.assertEqual(runner.du(root), 12)
    self.assertEqual(runner.du(remote_dir), 8)
    self.assertEqual(runner.du(remote_dir + '/*'), 8)
    self.assertEqual(runner.du(remote_data_1), 4)
    self.assertEqual(runner.du(remote_data_2), 4)
    self.assertEqual(runner.du(remote_data_3), 4)
def test_infer_from_hadoop_bin_realpath(self):
    with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
        self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming.jar')
def setUp(self):
    super(HadoopLogDirsTestCase, self).setUp()

    os.environ.clear()

    self.mock_hadoop_version = '2.7.0'
    # the result of _hadoop_dirs(). This handles non-log-specific
    # environment variables, such as $HADOOP_PREFIX, and also guesses
    # based on the path of the Hadoop binary
    self.mock_hadoop_dirs = []

    def mock_get_hadoop_version():
        return self.mock_hadoop_version

    def mock_hadoop_dirs_method():
        return (d for d in self.mock_hadoop_dirs)

    self.start(
        patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
              side_effect=mock_get_hadoop_version))
    self.start(
        patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
              side_effect=mock_hadoop_dirs_method))

    self.runner = HadoopJobRunner()
def test_infer_from_hadoop_bin_parent_dir(self):
    self.runner = HadoopJobRunner(
        hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
    self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                     '/ha/do/op/bin-parent/hadoop-streaming.jar')
def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
    self.runner = HadoopJobRunner(
        hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
    self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

    self.test_hadoop_mapred_home()
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(self.runner, '_upload_args',
                            return_value=['new_upload_args']))
    self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                            return_value=['old_upload_args']))
    self.start(patch.object(self.runner, '_hadoop_args_for_step',
                            return_value=['hadoop_args_for_step']))
    self.start(patch.object(self.runner, '_hdfs_step_input_files',
                            return_value=['hdfs_step_input_files']))
    self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                            return_value='hdfs_step_output_dir'))
    self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                            return_value='1.2.0'))

    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
def test_hadoop_runner_cluster_mode(self):
    runner = HadoopJobRunner(spark_deploy_mode='cluster')

    self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                     ('step', 'history', 'task'))
    self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                     ('step', 'task'))
def test_hadoop_runner_client_mode(self):
    runner = HadoopJobRunner()

    self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                     ('step', 'history', 'task'))
    self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                     ('step',))
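# A sketch of the behavior the two tests above pin down: which log types
# _logs_needed_to_pick_error() must examine per step type and Spark deploy
# mode. This is an illustrative reimplementation under those assumptions,
# not mrjob's actual code; the function name here is hypothetical.
def logs_needed_to_pick_error(step_type, spark_deploy_mode='client'):
    if step_type == 'spark':
        # in client mode the Spark driver's output already lands in the
        # step log; in cluster mode errors can surface in task logs too
        if spark_deploy_mode == 'cluster':
            return ('step', 'task')
        return ('step',)
    # streaming steps: job/attempt IDs come from history logs,
    # Python tracebacks from task logs
    return ('step', 'history', 'task')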
def test_hadoop_log_dirs_opt(self):
    self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

    os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

    # setting hadoop_log_dirs short-circuits automatic discovery of logs
    self.assertEqual(list(self.runner._hadoop_log_dirs()),
                     ['/logs1', '/logs2'])
def test_uris(self):
    runner = HadoopJobRunner()
    list(runner.ls('hdfs://tmp/waffles'))
    list(runner.ls('lego://my/ego'))
    list(runner.ls('/tmp'))

    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    assert_equal(hadoop_cmd_args, [
        ['fs', '-lsr', 'hdfs://tmp/waffles'],
        ['fs', '-lsr', 'lego://my/ego'],
    ])
def setUp(self):
    super(HadoopStreamingJarTestCase, self).setUp()

    self.mock_paths = []

    def mock_ls(path):  # don't bother to support globs
        return (p for p in sorted(self.mock_paths) if p.startswith(path))

    self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                     side_effect=mock_ls))

    os.environ.clear()

    self.runner = HadoopJobRunner()
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar')
    self.runner._hadoop_version = '0.20.204'
    self.simple_patch(self.runner, '_new_upload_args',
                      return_value=['new_upload_args'])
    self.simple_patch(self.runner, '_old_upload_args',
                      return_value=['old_upload_args'])
    self.simple_patch(self.runner, '_hadoop_conf_args',
                      return_value=['hadoop_conf_args'])
    self.simple_patch(self.runner, '_hdfs_step_input_files',
                      return_value=['hdfs_step_input_files'])
    self.simple_patch(self.runner, '_hdfs_step_output_dir',
                      return_value='hdfs_step_output_dir')
    self.runner._script = {'name': 'my_job.py'}

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_conf_args',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_conf_args',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
def make_runner(self):
    """Make a runner based on command-line arguments, so we can
    launch this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    if self.options.runner == 'emr':
        # avoid requiring dependencies (such as boto3) for other runners
        from mrjob.emr import EMRJobRunner
        return EMRJobRunner(**self.emr_job_runner_kwargs())

    elif self.options.runner == 'dataproc':
        from mrjob.dataproc import DataprocJobRunner
        return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

    elif self.options.runner == 'hadoop':
        from mrjob.hadoop import HadoopJobRunner
        return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

    elif self.options.runner == 'inline':
        raise ValueError("inline is not supported in the multi-lingual"
                         " launcher.")

    else:
        # run locally by default
        from mrjob.local import LocalMRJobRunner
        return LocalMRJobRunner(**self.local_job_runner_kwargs())
def make_runner(self):
    """Make a runner based on command-line arguments, so we can
    launch this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    # have to import here so that we can still run the MRJob
    # without importing boto
    from mrjob.emr import EMRJobRunner
    from mrjob.hadoop import HadoopJobRunner
    from mrjob.local import LocalMRJobRunner

    if self.options.runner == 'emr':
        return EMRJobRunner(**self.emr_job_runner_kwargs())

    elif self.options.runner == 'hadoop':
        return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

    elif self.options.runner == 'inline':
        raise ValueError("inline is not supported in the multi-lingual"
                         " launcher.")

    else:
        # run locally by default
        return LocalMRJobRunner(**self.local_job_runner_kwargs())
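# Typical use of make_runner(), following mrjob's documented pattern.
# MRYourJob and 'input.txt' are placeholders; runners are context managers,
# so the with-block guarantees cleanup:
mr_job = MRYourJob(args=['-r', 'hadoop', 'input.txt'])
with mr_job.make_runner() as runner:
    runner.run()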
def setUp(self):
    super(StreamingLogDirsTestCase, self).setUp()

    self.log = self.start(patch('mrjob.hadoop.log'))

    self.runner = HadoopJobRunner()
    self.runner._hadoop_log_dirs = Mock(return_value=[])
    self.runner.fs.exists = Mock(return_value=True)

    self.log.reset_mock()  # ignore logging from HadoopJobRunner init
def test_hadoop_home_regression(self):
    # kill $HADOOP_HOME if it exists
    try:
        del os.environ['HADOOP_HOME']
    except KeyError:
        pass

    with patch('mrjob.hadoop.find_hadoop_streaming_jar',
               return_value='some.jar'):
        HadoopJobRunner(hadoop_home=self.tmp_dir, conf_paths=[])
def test_cat_compressed(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    with HadoopJobRunner(cleanup=['NONE']) as runner:
        output = []
        for line in runner.cat(input_gz_path):
            output.append(line)

    assert_equal(output, ['foo\n', 'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
    input_bz2.write('bar\nbar\nfoo\n')
    input_bz2.close()

    with HadoopJobRunner(cleanup=['NONE']) as runner:
        output = []
        for line in runner.cat(input_bz2_path):
            output.append(line)

    assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])
def test_pass_through_fields(self):
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(hadoop_bin='hadoooooooooop',
                             hadoop_home='kansas',
                             hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

        # deprecation warning is different for non-functions
        self.assertIn(
            'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
            stderr.getvalue())
def test_prefer_own_methods(self):
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(hadoop_bin='hadoop',
                             hadoop_home='kansas',
                             hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner.ls, runner.fs.ls)

        # Hadoop Runner has its own version
        self.assertNotEqual(runner.get_hadoop_version,
                            runner.fs.get_hadoop_version)

        self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                      stderr.getvalue())
        self.assertNotIn('get_hadoop_version', stderr.getvalue())
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin="hadoop",
        hadoop_streaming_jar="streaming.jar",
        mr_job_script="my_job.py",
        stdin=StringIO(),
    )
    self.runner._add_job_files_for_upload()
    self.runner._hadoop_version = "0.20.204"
    self.simple_patch(self.runner, "_new_upload_args",
                      return_value=["new_upload_args"])
    self.simple_patch(self.runner, "_old_upload_args",
                      return_value=["old_upload_args"])
    self.simple_patch(self.runner, "_hadoop_args_for_step",
                      return_value=["hadoop_args_for_step"])
    self.simple_patch(self.runner, "_hdfs_step_input_files",
                      return_value=["hdfs_step_input_files"])
    self.simple_patch(self.runner, "_hdfs_step_output_dir",
                      return_value="hdfs_step_output_dir")
    self.runner._script_path = "my_job.py"

    self._new_basic_args = [
        "hadoop", "jar", "streaming.jar",
        "new_upload_args", "hadoop_args_for_step",
        "-input", "hdfs_step_input_files",
        "-output", "hdfs_step_output_dir",
    ]

    self._old_basic_args = [
        "hadoop", "jar", "streaming.jar",
        "hadoop_args_for_step",
        "-input", "hdfs_step_input_files",
        "-output", "hdfs_step_output_dir",
        "old_upload_args",
    ]
def test_cat_uncompressed(self):
    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as input_file:
        input_file.write('bar\nfoo\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\nfoo\n')
    remote_input_path = 'hdfs:///data/foo'

    check_call([
        self.hadoop_bin,
        'fs', '-put', input_to_upload, remote_input_path])

    with HadoopJobRunner(cleanup=['NONE']) as runner:
        local_output = []
        for line in runner.cat(local_input_path):
            local_output.append(line)

        remote_output = []
        for line in runner.cat(remote_input_path):
            remote_output.append(line)

    assert_equal(local_output, ['bar\n', 'foo\n'])
    assert_equal(remote_output, ['foo\n', 'foo\n'])
def setUp(self):
    super(FindProbableCauseOfFailureTestCase, self).setUp()
    os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'
    self.runner = HadoopJobRunner()
def test_deprecated_hadoop_home_option(self):
    self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')
    self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                     '/ha/do/op/home-option/hadoop-streaming.jar')
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {
        'runners': {
            'hadoop': {
                'hadoop_home': 'kansas',
                'hadoop_streaming_jar': 'binks.jar.jar',
            }
        }
    }

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()
        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_conf_args',
                          return_value=['hadoop_conf_args'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_conf_args',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_conf_args',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args, step_num=0, num_steps=1):
        self.assertEqual(
            self.runner._streaming_args(step, step_num, num_steps),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args, step_num=0,
                                   num_steps=1):
        self.runner._hadoop_version = '0.18'
        self.assertEqual(
            self._old_basic_args + args,
            self.runner._streaming_args(step, step_num, num_steps))

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper', 'python my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer', 'python my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ["-mapper",
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper'",
             "-combiner",
             "bash -c 'grep nothing | python my_job.py --step-num=0"
             " --combiner'",
             "-reducer",
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ["-mapper",
             "bash -c 'cat | sort | python my_job.py --step-num=0"
             " --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper | sort | grep nothing | python my_job.py"
             " --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
             " python my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])
class HadoopStreamingJarTestCase(SandboxedTestCase):

    def setUp(self):
        super(HadoopStreamingJarTestCase, self).setUp()

        self.mock_paths = []

        def mock_ls(path):  # don't bother to support globs
            return (p for p in sorted(self.mock_paths)
                    if p.startswith(path))

        self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                         side_effect=mock_ls))

        os.environ.clear()

        self.runner = HadoopJobRunner()

    def test_empty_fs(self):
        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_deprecated_hadoop_home_option(self):
        self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')
        self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home-option/hadoop-streaming.jar')

    def test_deprecated_hadoop_home_option_beats_hadoop_prefix(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/prefix'
        self.mock_paths.append('/ha/do/op/prefix/hadoop-streaming.jar')

        self.test_deprecated_hadoop_home_option()

    # tests of well-known environment variables

    def test_hadoop_prefix(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/prefix'
        self.mock_paths.append('/ha/do/op/prefix/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/prefix/hadoop-streaming.jar')

    def test_hadoop_prefix_beats_hadoop_home(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/home'
        self.mock_paths.append('/ha/do/op/home/hadoop-streaming.jar')

        self.test_hadoop_prefix()

    def test_hadoop_home(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/home'
        self.mock_paths.append('/ha/do/op/home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home/hadoop-streaming.jar')

    def test_hadoop_home_beats_hadoop_install(self):
        os.environ['HADOOP_INSTALL'] = '/ha/do/op/install'
        self.mock_paths.append('/ha/do/op/install/hadoop-streaming.jar')

        self.test_hadoop_home()

    def test_hadoop_install(self):
        os.environ['HADOOP_INSTALL'] = '/ha/do/op/install'
        self.mock_paths.append('/ha/do/op/install/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/install/hadoop-streaming.jar')

    def test_hadoop_install_beats_hadoop_mapred_home(self):
        os.environ['HADOOP_MAPRED_HOME'] = '/ha/do/op/mapred-home'
        self.mock_paths.append('/ha/do/op/mapred-home/hadoop-streaming.jar')

        self.test_hadoop_install()

    def test_hadoop_mapred_home(self):
        os.environ['HADOOP_MAPRED_HOME'] = '/ha/do/op/mapred-home'
        self.mock_paths.append('/ha/do/op/mapred-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/mapred-home/hadoop-streaming.jar')

    def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

        self.test_hadoop_mapred_home()

    # infer from hadoop_bin

    def test_infer_from_hadoop_bin_parent_dir(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/bin-parent/hadoop-streaming.jar')

    def test_hadoop_bin_beats_hadoop_anything_home(self):
        os.environ['HADOOP_ANYTHING_HOME'] = '/ha/do/op/anything-home'
        self.mock_paths.append('/ha/do/op/anything-home/hadoop-streaming.jar')

        self.test_infer_from_hadoop_bin_parent_dir()

    def test_dont_infer_from_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/bin/hadoop'])
        self.mock_paths.append('/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_dont_infer_from_usr_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
        self.mock_paths.append('/usr/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_dont_infer_from_usr_local_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/local/bin/hadoop'])
        self.mock_paths.append('/usr/local/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    def test_infer_from_hadoop_bin_realpath(self):
        with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
            self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
            self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

            self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                             '/ha/do/op/hadoop-streaming.jar')

    # tests of fallback environment variables ($HADOOP_*_HOME)

    def test_hadoop_anything_home(self):
        os.environ['HADOOP_WHATEVER_HOME'] = '/ha/do/op/whatever-home'
        self.mock_paths.append('/ha/do/op/whatever-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/whatever-home/hadoop-streaming.jar')

        # $HADOOP_ANYTHING_HOME comes before $HADOOP_WHATEVER_HOME
        os.environ['HADOOP_ANYTHING_HOME'] = '/ha/do/op/anything-home'
        self.mock_paths.append('/ha/do/op/anything-home/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/anything-home/hadoop-streaming.jar')

    def test_hadoop_anything_home_beats_hard_coded_paths(self):
        self.mock_paths.append('/home/hadoop/contrib/hadoop-streaming.jar')
        self.mock_paths.append(
            '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

        self.test_hadoop_anything_home()

    # hard-coded paths (for Hadoop inside EMR)

    def test_hard_coded_emr_paths(self):
        self.mock_paths.append(
            '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

        # /home/hadoop/contrib takes precedence
        self.mock_paths.append('/home/hadoop/contrib/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/home/hadoop/contrib/hadoop-streaming.jar')

    # invalid environment variables

    def test_other_environment_variable(self):
        os.environ['HADOOP_YARN_MRJOB_DIR'] = '/ha/do/op/yarn-mrjob-dir'
        self.mock_paths.append(
            '/ha/do/op/yarn-mrjob-dir/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    # alternate jar names and paths

    def test_subdirs(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/contrib/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/contrib/hadoop-streaming.jar')

    def test_hadoop_streaming_jar_name_with_version(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/hadoop-streaming-2.6.0-amzn-0.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-2.6.0-amzn-0.jar')

    def test_skip_hadoop_streaming_source_jar(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        # Googled it; it really is named *-sources.jar, not *-source.jar
        self.mock_paths.append(
            '/ha/do/op/hadoop-streaming-2.0.0-mr1-cdh4.3.1-sources.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)

    # multiple matching jars in same directory

    def test_pick_shortest_name(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/hadoop-streaming-1.0.3.jar')
        self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

        # hadoop-streaming-1.0.3.jar comes first in alphabetical order
        self.assertEqual(sorted(self.mock_paths), self.mock_paths)

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming.jar')

    def test_pick_shallowest_subpath(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/hadoop-streaming-1.0.3.jar')
        self.mock_paths.append('/ha/do/op/old/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-1.0.3.jar')

    def test_fall_back_to_alphabetical_order(self):
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'
        self.mock_paths.append('/ha/do/op/hadoop-streaming-a.jar')
        self.mock_paths.append('/ha/do/op/hadoop-streaming-b.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming-a.jar')

    # sanity-check that directory order overrides path sort order

    def test_directory_order_overrides_path_sort_order(self):
        os.environ['HADOOP_HOME'] = '/ha/do/op/a'
        os.environ['HADOOP_PREFIX'] = '/ha/do/op/b'

        self.mock_paths.append('/ha/do/op/a/hadoop-streaming-a.jar')
        self.mock_paths.append('/ha/do/op/b/hadoop-streaming-b.jar')

        # $HADOOP_PREFIX takes precedence over $HADOOP_HOME, so sort
        # order doesn't matter
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/b/hadoop-streaming-b.jar')

        # now search in parent dir (/ha/do/op) to invoke sort order
        os.environ['HADOOP_PREFIX'] = '/ha/do/op'

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/a/hadoop-streaming-a.jar')
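# A condensed sketch of the search behavior the tests above specify for
# _find_hadoop_streaming_jar(): walk candidate dirs in precedence order
# (hadoop_home opt, $HADOOP_PREFIX, $HADOOP_HOME, $HADOOP_INSTALL,
# $HADOOP_MAPRED_HOME, the hadoop binary's parent dir, other $HADOOP_*_HOME
# vars, then hard-coded EMR paths), and within a dir prefer the shallowest,
# shortest-named jar. Illustrative only; this is not mrjob's actual code
# and the function name is hypothetical.
import posixpath

def find_streaming_jar(dirs_in_precedence_order, ls):
    for d in dirs_in_precedence_order:
        jars = [
            p for p in ls(d)
            if posixpath.basename(p).startswith('hadoop-streaming')
            and p.endswith('.jar')
            and not p.endswith('-sources.jar')  # skip source jars
        ]
        if jars:
            # shallowest path first, then shortest basename, then
            # alphabetical order as the final tiebreaker
            return min(jars, key=lambda p: (
                p.count('/'), len(posixpath.basename(p)), p))
    return None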
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.2.0'))

        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        HadoopFilesystem.get_hadoop_version.return_value = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper',
             PYTHON_BIN + ' my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer',
             PYTHON_BIN + ' my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ["-mapper",
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             "-combiner",
             "bash -c 'grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             "-reducer",
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ["-mapper",
             "bash -c 'cat | sort | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | " + PYTHON_BIN +
             " my_job.py --step-num=0"
             " --mapper | sort | grep nothing | " + PYTHON_BIN +
             " my_job.py --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | " + PYTHON_BIN +
             " my_job.py --step-num=0 --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
             PYTHON_BIN +
             " my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {"runners": {"hadoop": {
        "hadoop_home": "kansas",
        "hadoop_streaming_jar": "binks.jar.jar",
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin="hadoop",
            hadoop_streaming_jar="streaming.jar",
            mr_job_script="my_job.py",
            stdin=StringIO(),
        )
        self.runner._add_job_files_for_upload()
        self.runner._hadoop_version = "0.20.204"
        self.simple_patch(self.runner, "_new_upload_args",
                          return_value=["new_upload_args"])
        self.simple_patch(self.runner, "_old_upload_args",
                          return_value=["old_upload_args"])
        self.simple_patch(self.runner, "_hadoop_args_for_step",
                          return_value=["hadoop_args_for_step"])
        self.simple_patch(self.runner, "_hdfs_step_input_files",
                          return_value=["hdfs_step_input_files"])
        self.simple_patch(self.runner, "_hdfs_step_output_dir",
                          return_value="hdfs_step_output_dir")
        self.runner._script_path = "my_job.py"

        self._new_basic_args = [
            "hadoop", "jar", "streaming.jar",
            "new_upload_args", "hadoop_args_for_step",
            "-input", "hdfs_step_input_files",
            "-output", "hdfs_step_output_dir",
        ]

        self._old_basic_args = [
            "hadoop", "jar", "streaming.jar",
            "hadoop_args_for_step",
            "-input", "hdfs_step_input_files",
            "-output", "hdfs_step_output_dir",
            "old_upload_args",
        ]

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(self.runner._args_for_streaming_step(0),
                         self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        self.runner._hadoop_version = "0.18"
        self.runner._steps = [step]
        self.assertEqual(self.runner._args_for_streaming_step(0),
                         self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {"type": "streaming", "mapper": {"type": "script"}},
            ["-mapper", "python my_job.py --step-num=0 --mapper",
             "-jobconf", "mapred.reduce.tasks=0"],
        )

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {"type": "streaming", "reducer": {"type": "script"}},
            ["-mapper", "cat",
             "-reducer", "python my_job.py --step-num=0 --reducer"],
        )

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                "type": "streaming",
                "mapper": {"type": "script", "pre_filter": "grep anything"},
                "combiner": {"type": "script", "pre_filter": "grep nothing"},
                "reducer": {"type": "script",
                            "pre_filter": "grep something"},
            },
            [
                "-mapper",
                "bash -c 'grep anything | python my_job.py --step-num=0"
                " --mapper'",
                "-combiner",
                "bash -c 'grep nothing | python my_job.py --step-num=0"
                " --combiner'",
                "-reducer",
                "bash -c 'grep something | python my_job.py --step-num=0"
                " --reducer'",
            ],
        )

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {"type": "streaming",
             "mapper": {"type": "command", "command": "cat"},
             "combiner": {"type": "script"}},
            [
                "-mapper",
                "bash -c 'cat | sort | python my_job.py --step-num=0"
                " --combiner'",
                "-jobconf", "mapred.reduce.tasks=0",
            ],
        )

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                "type": "streaming",
                "mapper": {"type": "script", "pre_filter": "grep anything"},
                "combiner": {"type": "script", "pre_filter": "grep nothing"},
                "reducer": {"type": "script",
                            "pre_filter": "grep something"},
            },
            [
                "-mapper",
                "bash -c 'grep anything | python my_job.py --step-num=0"
                " --mapper | sort | grep nothing | python my_job.py"
                " --step-num=0 --combiner'",
                "-reducer",
                "bash -c 'grep something | python my_job.py --step-num=0"
                " --reducer'",
            ],
        )

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {"type": "streaming",
             "mapper": {"type": "script",
                        "pre_filter": bash_wrap("grep 'anything'")}},
            [
                "-mapper",
                "bash -c 'bash -c '\\''grep"
                " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
                " python my_job.py --step-num=0 --mapper'",
                "-jobconf", "mapred.reduce.tasks=0",
            ],
        )
def test_missing_hadoop_version(self):
    with patch.dict('os.environ', MOCK_HADOOP_VERSION=''):
        runner = HadoopJobRunner()
        self.assertRaises(Exception, runner.get_hadoop_version)
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    BASIC_HADOOP_ARGS = [
        'hadoop',
        'jar', '<streaming jar>',
        '<upload args>',
        '<hadoop args for step>',
    ]

    BASIC_JOB_ARGS = [
        '-input', '<hdfs step input files>',
        '-output', '<hdfs step output dir>',
    ]

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['<upload args>']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['<hadoop args for step>']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['<hdfs step input files>']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='<hdfs step output dir>'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='2.7.1'))

        self.runner._script_path = 'my_job.py'

    def test_basic_mapper(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))

    def test_basic_mapper_pre_yarn(self):
        # use a different jobconf (-D) on pre-YARN
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.0.3'))

        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapred.reduce.tasks=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 PYTHON_BIN + ' my_job.py --step-num=0 --mapper']))

    def test_basic_reducer(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
                '-mapper', 'cat',
                '-reducer',
                PYTHON_BIN + ' my_job.py --step-num=0 --reducer']))

    def test_pre_filters(self):
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + self.BASIC_JOB_ARGS + [
                '-mapper',
                "bash -c 'grep anything | " + PYTHON_BIN +
                " my_job.py --step-num=0 --mapper'",
                '-combiner',
                "bash -c 'grep nothing | " + PYTHON_BIN +
                " my_job.py --step-num=0 --combiner'",
                '-reducer',
                "bash -c 'grep something | " + PYTHON_BIN +
                " my_job.py --step-num=0 --reducer'"]))

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self.runner._steps = [
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
        ]

        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            (self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
             self.BASIC_JOB_ARGS + [
                 '-mapper',
                 "bash -c 'bash -c '\\''grep"
                 " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
                 PYTHON_BIN +
                 " my_job.py --step-num=0 --mapper'"]))
class HadoopLogDirsTestCase(SandboxedTestCase):

    def setUp(self):
        super(HadoopLogDirsTestCase, self).setUp()

        os.environ.clear()

        self.mock_hadoop_version = '2.7.0'
        # the result of _hadoop_dirs(). This handles non-log-specific
        # environment variables, such as $HADOOP_PREFIX, and also guesses
        # based on the path of the Hadoop binary
        self.mock_hadoop_dirs = []

        def mock_get_hadoop_version():
            return self.mock_hadoop_version

        def mock_hadoop_dirs_method():
            return (d for d in self.mock_hadoop_dirs)

        self.start(patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
                         side_effect=mock_get_hadoop_version))
        self.start(patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
                         side_effect=mock_hadoop_dirs_method))

        self.runner = HadoopJobRunner()

    def test_empty(self):
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['hdfs:///tmp/hadoop-yarn/staging',
                          '/mnt/var/log/hadoop'])

    def test_precedence(self):
        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'
        os.environ['YARN_LOG_DIR'] = '/path/to/yarn-log-dir'
        self.mock_hadoop_dirs = ['/path/to/hadoop-prefix',
                                 '/path/to/hadoop-home']

        self.assertEqual(
            list(self.runner._hadoop_log_dirs(output_dir='hdfs:///output/')),
            ['/path/to/hadoop-log-dir',
             '/path/to/yarn-log-dir',
             'hdfs:///tmp/hadoop-yarn/staging',
             'hdfs:///output/_logs',
             '/path/to/hadoop-prefix/logs',
             '/path/to/hadoop-home/logs',
             '/mnt/var/log/hadoop'])

    def test_hadoop_log_dirs_opt(self):
        self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

        # setting hadoop_log_dirs short-circuits automatic discovery of logs
        self.assertEqual(
            list(self.runner._hadoop_log_dirs()),
            ['/logs1', '/logs2'])

    def test_need_yarn_for_yarn_log_dir_and_hdfs_log_dir(self):
        os.environ['YARN_LOG_DIR'] = '/path/to/yarn-log-dir'

        self.mock_hadoop_version = '2.0.0'
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['/path/to/yarn-log-dir',
                          'hdfs:///tmp/hadoop-yarn/staging',
                          '/mnt/var/log/hadoop'])

        self.mock_hadoop_version = '1.0.3'
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['/mnt/var/log/hadoop'])
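# A sketch of the precedence order the tests above establish for
# _hadoop_log_dirs(). This is an illustrative reimplementation under stated
# assumptions, not mrjob's actual code; the function and parameter names
# here are hypothetical.
import posixpath

def hadoop_log_dirs(opt_dirs, environ, uses_yarn, hadoop_dirs,
                    output_dir=None):
    if opt_dirs:
        # the hadoop_log_dirs opt short-circuits all automatic discovery
        for d in opt_dirs:
            yield d
        return
    if 'HADOOP_LOG_DIR' in environ:
        yield environ['HADOOP_LOG_DIR']
    if uses_yarn:
        if 'YARN_LOG_DIR' in environ:
            yield environ['YARN_LOG_DIR']
        yield 'hdfs:///tmp/hadoop-yarn/staging'  # YARN job staging dir
    if output_dir:
        yield posixpath.join(output_dir, '_logs')
    for d in hadoop_dirs:  # e.g. $HADOOP_PREFIX, hadoop binary's dir
        yield posixpath.join(d, 'logs')
    yield '/mnt/var/log/hadoop'  # hard-coded fallback (Hadoop on EMR)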
def test_dont_infer_from_usr_local_bin_hadoop(self):
    self.runner = HadoopJobRunner(hadoop_bin=['/usr/local/bin/hadoop'])
    self.mock_paths.append('/usr/local/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)
def setUp(self):
    super(PickErrorTestCase, self).setUp()
    os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'
    self.runner = HadoopJobRunner()
def test_get_hadoop_version(self):
    runner = HadoopJobRunner()
    self.assertEqual(runner.get_hadoop_version(), '1.2.0')
class StreamingArgsTestCase(EmptyMrjobConfTestCase):

    MRJOB_CONF_CONTENTS = {'runners': {'hadoop': {
        'hadoop_home': 'kansas',
        'hadoop_streaming_jar': 'binks.jar.jar',
    }}}

    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()
        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_args_for_step',
                          return_value=['hadoop_args_for_step'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']

    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj, attr, side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)

    def _assert_streaming_step(self, step, args):
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._new_basic_args + args)

    def _assert_streaming_step_old(self, step, args):
        self.runner._hadoop_version = '0.18'
        self.runner._steps = [step]
        self.assertEqual(
            self.runner._args_for_streaming_step(0),
            self._old_basic_args + args)

    def test_basic_mapper(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
            },
            ['-mapper', 'python my_job.py --step-num=0 --mapper',
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_basic_reducer(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'reducer': {
                    'type': 'script',
                },
            },
            ['-mapper', 'cat',
             '-reducer', 'python my_job.py --step-num=0 --reducer'])

    def test_pre_filters(self):
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ["-mapper",
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper'",
             "-combiner",
             "bash -c 'grep nothing | python my_job.py --step-num=0"
             " --combiner'",
             "-reducer",
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_combiner_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat',
                },
                'combiner': {
                    'type': 'script',
                },
            },
            ["-mapper",
             "bash -c 'cat | sort | python my_job.py --step-num=0"
             " --combiner'",
             '-jobconf', 'mapred.reduce.tasks=0'])

    def test_pre_filters_018(self):
        self._assert_streaming_step_old(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'grep anything',
                },
                'combiner': {
                    'type': 'script',
                    'pre_filter': 'grep nothing',
                },
                'reducer': {
                    'type': 'script',
                    'pre_filter': 'grep something',
                },
            },
            ['-mapper',
             "bash -c 'grep anything | python my_job.py --step-num=0"
             " --mapper | sort | grep nothing | python my_job.py"
             " --step-num=0 --combiner'",
             '-reducer',
             "bash -c 'grep something | python my_job.py --step-num=0"
             " --reducer'"])

    def test_pre_filter_escaping(self):
        # ESCAPE ALL THE THINGS!!!
        self._assert_streaming_step(
            {
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': bash_wrap("grep 'anything'"),
                },
            },
            ['-mapper',
             "bash -c 'bash -c '\\''grep"
             " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
             " python my_job.py --step-num=0 --mapper'",
             '-jobconf', 'mapred.reduce.tasks=0'])