def test_two_files(self):
    """Sorting two input files merges all their lines into one sorted output."""
    runner = MRJobRunner(conf_paths=[])
    # make sure the runner's scratch space is removed, pass or fail
    self.addCleanup(runner.cleanup)
    runner._invoke_sort([self.a, self.b], self.out)
    # read via a context manager so the handle is closed promptly
    # (the original leaked it through list(open(...)))
    with open(self.out) as out_f:
        self.assertEqual(
            list(out_f),
            ['A\n', 'B\n', 'alligator\n', 'apple\n', 'ball\n', 'banana\n'])
def test_jobconf(self):
    """jobconf entries become -jobconf KEY=VALUE pairs, sorted by key."""
    jobconf = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
    # conf_paths=[] is the non-deprecated spelling of conf_path=False,
    # matching the rest of this suite
    runner = MRJobRunner(conf_paths=[], jobconf=jobconf)
    self.assertEqual(
        runner._hadoop_conf_args(0, 1),
        ['-jobconf', 'BAX=Arnold',
         '-jobconf', 'BAZ=qux',
         '-jobconf', 'FOO=bar'])
def test_cmdenv(self):
    """cmdenv entries become -cmdenv KEY=VALUE pairs, sorted by key."""
    cmdenv = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
    # conf_paths=[] is the non-deprecated spelling of conf_path=False,
    # matching the rest of this suite
    runner = MRJobRunner(conf_paths=[], cmdenv=cmdenv)
    self.assertEqual(
        runner._hadoop_conf_args(0, 1),
        ['-cmdenv', 'BAX=Arnold',
         '-cmdenv', 'BAZ=qux',
         '-cmdenv', 'FOO=bar'])
def test_bad_sort(self):
    """A broken sort binary surfaces as an error from _invoke_sort()."""
    self.use_bad_sort()
    runner = MRJobRunner(conf_paths=[])
    # clean up the runner's scratch space even though the sort fails
    self.addCleanup(runner.cleanup)
    with no_handlers_for_logger():
        # sometimes we get a broken pipe error (IOError) on PyPy,
        # so accept either exception (matches the sibling version
        # of this test elsewhere in the suite)
        self.assertRaises(
            (CalledProcessError, IOError),
            runner._invoke_sort, [self.a, self.b], self.out)
def test_one_file(self):
    """Sorting a single input file yields its lines in sorted order."""
    runner = MRJobRunner(conf_paths=[])
    # make sure the runner's scratch space is removed, pass or fail
    self.addCleanup(runner.cleanup)
    runner._invoke_sort([self.a], self.out)
    # read via a context manager so the handle is closed promptly
    # (the original leaked it through list(open(...)))
    with open(self.out) as out_f:
        self.assertEqual(list(out_f), ['A\n', 'alligator\n', 'apple\n'])
def test_one_file(self):
    """A single input file sorts to its own lines in order."""
    sorter = MRJobRunner(conf_paths=[])
    self.addCleanup(sorter.cleanup)
    sorter._invoke_sort([self.a], self.out)
    # collect the sorted output, closing the file as soon as it's read
    with open(self.out) as result:
        sorted_lines = list(result)
    self.assertEqual(sorted_lines, ['A\n', 'alligator\n', 'apple\n'])
def test_hadoop_output_format(self):
    """-outputformat is emitted only for the final step of a job."""
    # renamed from 'format' so we don't shadow the builtin
    output_format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
    runner = MRJobRunner(conf_paths=[], hadoop_output_format=output_format)
    self.assertEqual(runner._hadoop_conf_args(0, 1),
                     ['-outputformat', output_format])
    # test multi-step job: only the last step gets the output format
    self.assertEqual(runner._hadoop_conf_args(0, 2), [])
    self.assertEqual(runner._hadoop_conf_args(1, 2),
                     ['-outputformat', output_format])
def test_bad_sort(self):
    """A broken sort binary surfaces as an error from _invoke_sort()."""
    self.use_bad_sort()
    sorter = MRJobRunner(conf_paths=[])
    self.addCleanup(sorter.cleanup)
    # sometimes we get a broken pipe error (IOError) on PyPy
    expected_errors = (CalledProcessError, IOError)
    with no_handlers_for_logger():
        self.assertRaises(expected_errors,
                          sorter._invoke_sort,
                          [self.a, self.b],
                          self.out)
def test_hadoop_extra_args_comes_first(self):
    """hadoop_extra_args must precede every other generated conf arg."""
    runner = MRJobRunner(
        conf_paths=[],  # non-deprecated spelling of conf_path=False
        cmdenv={'FOO': 'bar'},
        hadoop_input_format='FooInputFormat',
        hadoop_output_format='BarOutputFormat',
        jobconf={'baz': 'quz'},
        hadoop_extra_args=['-libjar', 'qux.jar'])
    # hadoop_extra_args should come first
    conf_args = runner._hadoop_conf_args(0, 1)
    self.assertEqual(conf_args[:2], ['-libjar', 'qux.jar'])
    # extra args (2) + cmdenv (2) + input/output format (4) + jobconf (2)
    self.assertEqual(len(conf_args), 10)
def test_python_bin(self):
    """python_bin controls the task interpreter but not the steps one."""
    custom_bin = ['python', '-v']
    job_runner = MRJobRunner(python_bin=custom_bin)
    self.assertEqual(job_runner._interpreter(), custom_bin)
    # steps still run under the Python that launched this process
    self.assertEqual(job_runner._interpreter(steps=True), [sys.executable])
def test_bootstrap_mrjob_overrides_interpreter(self):
    """An explicit bootstrap_mrjob=True wins over a non-Python interpreter."""
    job_runner = MRJobRunner(conf_paths=[],
                             interpreter=['ruby'],
                             bootstrap_mrjob=True)
    self.assertEqual(job_runner._bootstrap_mrjob(), True)
def test_interpreter(self):
    """Setting a non-Python interpreter disables mrjob bootstrapping."""
    job_runner = MRJobRunner(conf_paths=[], interpreter=['ruby'])
    self.assertEqual(job_runner._bootstrap_mrjob(), False)
def test_no_bootstrap_mrjob(self):
    """bootstrap_mrjob=False is honored as given."""
    job_runner = MRJobRunner(conf_paths=[], bootstrap_mrjob=False)
    self.assertEqual(job_runner._bootstrap_mrjob(), False)
def test_default(self):
    """With no options set, mrjob is bootstrapped by default."""
    job_runner = MRJobRunner(conf_paths=[])
    self.assertEqual(job_runner._bootstrap_mrjob(), True)
def test_hadoop_extra_args(self):
    """hadoop_extra_args are passed through to the conf args verbatim."""
    extra_args = ['-foo', 'bar']
    # conf_paths=[] is the non-deprecated spelling of conf_path=False
    runner = MRJobRunner(conf_paths=[], hadoop_extra_args=extra_args)
    self.assertEqual(runner._hadoop_conf_args(0, 1), extra_args)
def test_environment_variables_non_windows(self):
    """Non-Windows sort should honor TEMP and TMPDIR."""
    # conf_paths=[] is the non-deprecated spelling of conf_path=False
    runner = MRJobRunner(conf_paths=[])
    # clean up the runner's scratch space (matches sibling test elsewhere
    # in the suite, which the original version omitted)
    self.addCleanup(runner.cleanup)
    self.environment_variable_checks(runner, ['TEMP', 'TMPDIR'])
def test_environment_variables_windows(self):
    """Windows-style sort should honor TMP."""
    job_runner = MRJobRunner(conf_paths=[])
    self.addCleanup(job_runner.cleanup)
    # force the Windows code path regardless of the host platform
    job_runner._sort_is_windows_sort = True
    self.environment_variable_checks(job_runner, ['TMP'])
def test_default(self):
    """With no interpreter options, defaults apply to both interpreters."""
    job_runner = MRJobRunner()
    task_interp = job_runner._interpreter()
    steps_interp = job_runner._interpreter(steps=True)
    self.assertEqual(task_interp, self.default_python_bin())
    # steps run under the Python that launched this process
    self.assertEqual(steps_interp, [sys.executable])
def test_bad_sort(self):
    """A broken sort binary surfaces as CalledProcessError."""
    self.use_bad_sort()
    # conf_paths=[] is the non-deprecated spelling of conf_path=False
    runner = MRJobRunner(conf_paths=[])
    # clean up the runner's scratch space even though the sort fails
    self.addCleanup(runner.cleanup)
    # self.assertRaises replaces the bare assert_raises helper,
    # matching the rest of this suite
    self.assertRaises(CalledProcessError,
                      runner._invoke_sort, [self.a, self.b], self.out)
def test_no_files(self):
    """Sorting an empty list of input files raises ValueError."""
    # conf_paths=[] is the non-deprecated spelling of conf_path=False;
    # self.assertRaises replaces the bare assert_raises helper
    runner = MRJobRunner(conf_paths=[])
    self.assertRaises(ValueError, runner._invoke_sort, [], self.out)
def test_environment_variables_non_windows(self):
    """Non-Windows sort should honor TEMP and TMPDIR."""
    job_runner = MRJobRunner(conf_paths=[])
    self.addCleanup(job_runner.cleanup)
    expected_vars = ['TEMP', 'TMPDIR']
    self.environment_variable_checks(job_runner, expected_vars)
def test_steps_python_bin(self):
    """steps_python_bin controls only the steps interpreter."""
    custom_bin = ['python', '-v']
    job_runner = MRJobRunner(steps_python_bin=custom_bin)
    # the task interpreter is untouched by steps_python_bin
    self.assertEqual(job_runner._interpreter(), self.default_python_bin())
    self.assertEqual(job_runner._interpreter(steps=True), custom_bin)
def test_no_files(self):
    """Sorting an empty list of input files raises ValueError."""
    job_runner = MRJobRunner(conf_paths=[])
    with self.assertRaises(ValueError):
        job_runner._invoke_sort([], self.out)
def test_interpreter(self):
    """interpreter applies to both the task and steps interpreters."""
    ruby_interp = ['ruby']
    job_runner = MRJobRunner(interpreter=ruby_interp)
    self.assertEqual(job_runner._interpreter(), ruby_interp)
    self.assertEqual(job_runner._interpreter(steps=True), ruby_interp)
def test_steps_interpreter(self):
    """steps_interpreter overrides interpreter for steps only."""
    job_runner = MRJobRunner(interpreter=['ruby', '-v'],
                             steps_interpreter=['ruby'])
    # the task interpreter keeps the base interpreter setting
    self.assertEqual(job_runner._interpreter(), ['ruby', '-v'])
    # steps use the override
    self.assertEqual(job_runner._interpreter(steps=True), ['ruby'])
def test_interpreter_overrides_steps_python_bin(self):
    """interpreter beats steps_python_bin for both interpreters."""
    ruby_interp = ['ruby']
    job_runner = MRJobRunner(interpreter=ruby_interp,
                             steps_python_bin=['python', '-v'])
    self.assertEqual(job_runner._interpreter(), ruby_interp)
    self.assertEqual(job_runner._interpreter(steps=True), ruby_interp)
def test_environment_variables_windows(self):
    """Windows-style sort should honor TMP."""
    # conf_paths=[] is the non-deprecated spelling of conf_path=False
    runner = MRJobRunner(conf_paths=[])
    # clean up the runner's scratch space (matches sibling test elsewhere
    # in the suite, which the original version omitted)
    self.addCleanup(runner.cleanup)
    # force the Windows code path regardless of the host platform
    runner._sort_is_windows_sort = True
    self.environment_variable_checks(runner, ['TMP'])
def test_empty(self):
    """With no hadoop-related options, no conf args are generated."""
    # conf_paths=[] is the non-deprecated spelling of conf_path=False;
    # self.assertEqual replaces the bare assert_equal helper
    runner = MRJobRunner(conf_paths=[])
    self.assertEqual(runner._hadoop_conf_args(0, 1), [])