def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(self.runner, '_upload_args',
                            return_value=['<upload args>']))
    self.start(patch.object(self.runner, '_hadoop_args_for_step',
                            return_value=['<hadoop args for step>']))
    self.start(patch.object(self.runner, '_hdfs_step_input_files',
                            return_value=['<hdfs step input files>']))
    self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                            return_value='<hdfs step output dir>'))
    self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                            return_value='2.7.1'))

    self.runner._script_path = 'my_job.py'
def make_runner(self):
    """Make a runner based on command-line arguments, so we can
    launch this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    if self.options.runner == 'emr':
        # avoid requiring dependencies (such as boto3) for other runners
        from mrjob.emr import EMRJobRunner
        return EMRJobRunner(**self.emr_job_runner_kwargs())

    elif self.options.runner == 'dataproc':
        from mrjob.dataproc import DataprocJobRunner
        return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

    elif self.options.runner == 'hadoop':
        from mrjob.hadoop import HadoopJobRunner
        return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

    elif self.options.runner == 'inline':
        raise ValueError("inline is not supported in the multi-lingual"
                         " launcher.")

    else:
        # run locally by default
        from mrjob.local import LocalMRJobRunner
        return LocalMRJobRunner(**self.local_job_runner_kwargs())
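# A hedged sketch of driving make_runner() (above) from a separate
# launcher script; mrjob re-invokes the job file itself on the cluster,
# so runner code conventionally lives outside it. MRWordCount, the
# my_word_count module, and input.txt are hypothetical names; passing
# '-r hadoop' makes make_runner() return a HadoopJobRunner.
from my_word_count import MRWordCount  # hypothetical job module

job = MRWordCount(args=['-r', 'hadoop', 'input.txt'])
with job.make_runner() as runner:
    runner.run()  # scratch files are cleaned up when the block exits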
def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
    self.runner = HadoopJobRunner(
        hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
    self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

    self.test_hadoop_mapred_home()
def test_infer_from_hadoop_bin_parent_dir(self):
    self.runner = HadoopJobRunner(
        hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])
    self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                     '/ha/do/op/bin-parent/hadoop-streaming.jar')
def make_runner(self):
    """Make a runner based on command-line arguments, so we can
    launch this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    # have to import here so that we can still run the MRJob
    # without importing boto
    from mrjob.emr import EMRJobRunner
    from mrjob.hadoop import HadoopJobRunner
    from mrjob.local import LocalMRJobRunner

    if self.options.runner == 'emr':
        return EMRJobRunner(**self.emr_job_runner_kwargs())

    elif self.options.runner == 'hadoop':
        return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

    elif self.options.runner == 'inline':
        raise ValueError("inline is not supported in the multi-lingual"
                         " launcher.")

    else:
        # run locally by default
        return LocalMRJobRunner(**self.local_job_runner_kwargs())
def setUp(self):
    super(HadoopLogDirsTestCase, self).setUp()

    os.environ.clear()

    self.mock_hadoop_version = '2.7.0'
    # the result of _hadoop_dir(). This handles non-log-specific
    # environment variables, such as $HADOOP_PREFIX, and also guesses
    # based on the path of the Hadoop binary
    self.mock_hadoop_dirs = []

    def mock_get_hadoop_version():
        return self.mock_hadoop_version

    def mock_hadoop_dirs_method():
        return (d for d in self.mock_hadoop_dirs)

    self.start(
        patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
              side_effect=mock_get_hadoop_version))
    self.start(
        patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
              side_effect=mock_hadoop_dirs_method))

    self.runner = HadoopJobRunner()
def test_hadoop_runner_cluster_mode(self):
    runner = HadoopJobRunner(spark_deploy_mode='cluster')

    self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                     ('step', 'history', 'task'))
    self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                     ('step', 'task'))
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=StringIO())
    self.runner._add_job_files_for_upload()
    self.runner._hadoop_version = '0.20.204'

    self.simple_patch(self.runner, '_new_upload_args',
                      return_value=['new_upload_args'])
    self.simple_patch(self.runner, '_old_upload_args',
                      return_value=['old_upload_args'])
    self.simple_patch(self.runner, '_hadoop_args_for_step',
                      return_value=['hadoop_args_for_step'])
    self.simple_patch(self.runner, '_hdfs_step_input_files',
                      return_value=['hdfs_step_input_files'])
    self.simple_patch(self.runner, '_hdfs_step_output_dir',
                      return_value='hdfs_step_output_dir')

    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(self.runner, '_upload_args',
                            return_value=['new_upload_args']))
    self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                            return_value=['old_upload_args']))
    self.start(patch.object(self.runner, '_hadoop_args_for_step',
                            return_value=['hadoop_args_for_step']))
    self.start(patch.object(self.runner, '_hdfs_step_input_files',
                            return_value=['hdfs_step_input_files']))
    self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                            return_value='hdfs_step_output_dir'))
    self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                            return_value='1.2.0'))

    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
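# A hedged sketch of how the fixture above is typically exercised. The
# method name _args_for_streaming_step is an assumption (it has varied
# across mrjob versions); the point is comparing the assembled streaming
# command against the canned arg lists from setUp().
def test_streaming_args_on_hadoop_1(self):
    # Hadoop 1.2.0 is post-0.20, so upload args should come before the
    # step's other options, i.e. the "new" arg order
    self.assertEqual(self.runner._args_for_streaming_step(0),
                     self._new_basic_args)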
def test_hadoop_runner_client_mode(self):
    runner = HadoopJobRunner()

    self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                     ('step', 'history', 'task'))
    self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                     ('step',))
def test_infer_from_hadoop_bin_realpath(self):
    with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
        self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/hadoop-streaming.jar')
def test_hadoop_log_dirs_opt(self):
    self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

    os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

    # setting hadoop_log_dirs short-circuits automatic discovery of logs
    self.assertEqual(list(self.runner._hadoop_log_dirs()),
                     ['/logs1', '/logs2'])
def setUp(self):
    super(StreamingLogDirsTestCase, self).setUp()

    self.log = self.start(patch('mrjob.hadoop.log'))

    self.runner = HadoopJobRunner()
    self.runner._hadoop_log_dirs = Mock(return_value=[])
    self.runner.fs.exists = Mock(return_value=True)

    self.log.reset_mock()  # ignore logging from HadoopJobRunner init
def test_hadoop_home_regression(self):
    # kill $HADOOP_HOME if it exists
    try:
        del os.environ['HADOOP_HOME']
    except KeyError:
        pass

    with patch('mrjob.hadoop.find_hadoop_streaming_jar',
               return_value='some.jar'):
        HadoopJobRunner(hadoop_home=self.tmp_dir, conf_paths=[])
def test_uris(self):
    runner = HadoopJobRunner()
    list(runner.ls('hdfs://tmp/waffles'))
    list(runner.ls('lego://my/ego'))
    list(runner.ls('/tmp'))

    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    assert_equal(hadoop_cmd_args, [
        ['fs', '-lsr', 'hdfs://tmp/waffles'],
        ['fs', '-lsr', 'lego://my/ego'],
    ])
def setUp(self):
    super(HadoopStreamingJarTestCase, self).setUp()

    self.mock_paths = []

    def mock_ls(path):
        # don't bother to support globs
        return (p for p in sorted(self.mock_paths)
                if p.startswith(path))

    self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                     side_effect=mock_ls))

    os.environ.clear()

    self.runner = HadoopJobRunner()
def test_cat_compressed(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    with HadoopJobRunner(cleanup=['NONE']) as runner:
        output = []
        for line in runner.cat(input_gz_path):
            output.append(line)

    assert_equal(output, ['foo\n', 'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
    input_bz2.write('bar\nbar\nfoo\n')
    input_bz2.close()

    with HadoopJobRunner(cleanup=['NONE']) as runner:
        output = []
        for line in runner.cat(input_bz2_path):
            output.append(line)

    assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])
def test_pass_through_fields(self):
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(hadoop_bin='hadoooooooooop',
                             hadoop_home='kansas',
                             hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

        # deprecation warning is different for non-functions
        self.assertIn(
            'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
            stderr.getvalue())
def test_prefer_own_methods(self):
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(hadoop_bin='hadoop',
                             hadoop_home='kansas',
                             hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner.ls, runner.fs.ls)

        # Hadoop Runner has its own version
        self.assertNotEqual(runner.get_hadoop_version,
                            runner.fs.get_hadoop_version)

        self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                      stderr.getvalue())
        self.assertNotIn('get_hadoop_version', stderr.getvalue())
def test_cat_uncompressed(self):
    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as input_file:
        input_file.write('bar\nfoo\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\nfoo\n')

    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin, 'fs', '-put',
                input_to_upload, remote_input_path])

    with HadoopJobRunner(cleanup=['NONE']) as runner:
        local_output = []
        for line in runner.cat(local_input_path):
            local_output.append(line)

        remote_output = []
        for line in runner.cat(remote_input_path):
            remote_output.append(line)

    assert_equal(local_output, ['bar\n', 'foo\n'])
    assert_equal(remote_output, ['foo\n', 'foo\n'])
def setUp(self):
    super(PickErrorTestCase, self).setUp()

    os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

    self.runner = HadoopJobRunner()
def setUp(self):
    super(FindProbableCauseOfFailureTestCase, self).setUp()

    os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

    self.runner = HadoopJobRunner()
def test_missing_hadoop_version(self):
    with patch.dict('os.environ', MOCK_HADOOP_VERSION=''):
        runner = HadoopJobRunner()
        self.assertRaises(Exception, runner.get_hadoop_version)
def test_get_hadoop_version(self):
    runner = HadoopJobRunner()
    self.assertEqual(runner.get_hadoop_version(), '1.2.0')
def test_dont_infer_from_usr_local_bin_hadoop(self):
    self.runner = HadoopJobRunner(hadoop_bin=['/usr/local/bin/hadoop'])
    self.mock_paths.append('/usr/local/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)
from mrjob.hadoop import HadoopJobRunner

# x = HadoopJobRunner(
#     conf_path="/nfs/ruby/calvin/.mrjob",
#     mr_job_script="mr_sha1.py",
#     hadoop_input_format=(
#         "org.apache.hadoop.mapred.SequenceFileAsTextInputFormat"))

x = HadoopJobRunner(
    conf_path="/nfs/ruby/calvin/.mrjob",
    mr_job_script="mr_sha1.py",
    hadoop_input_format=(
        "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat"))

# x = HadoopJobRunner(
#     hadoop_input_format=(
#         "org.apache.hadoop.mapred.SequenceFileAsTextInputFormat"))

x.run()
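# A hedged variant of the launch above, using the runner as a context
# manager so scratch files are cleaned up even if the job fails (the
# with-runner pattern appears in the tests in this section). Note that
# conf_paths= (plural, taking a list) assumes a later mrjob release
# that replaced the conf_path= option used above.
from mrjob.hadoop import HadoopJobRunner

with HadoopJobRunner(
        conf_paths=["/nfs/ruby/calvin/.mrjob"],
        mr_job_script="mr_sha1.py",
        hadoop_input_format=(
            "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat"),
        ) as runner:
    runner.run()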
def test_deprecated_hadoop_home_option(self):
    self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')
    self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')

    self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                     '/ha/do/op/home-option/hadoop-streaming.jar')