def test_set_runner_class(self):
    """``-r emr`` should select the runner class aliased 'emr' and run it."""
    argv = ['-r', 'emr', 'foo.py', 'arg1']
    spark_submit_main(argv)

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'emr')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)
def test_end_to_end(self):
    """Run the wordcount script through the local runner and verify output."""
    script_path = spark_wordcount_script.__file__
    if script_path.endswith('.pyc'):
        script_path = script_path[:-1]

    input_path = self.makefile(
        'input', b'one fish\ntwo fish\nred fish\nblue fish\n')

    # don't create this path, let Spark do it
    output_path = join(self.tmp_dir, 'output')
    self.assertFalse(exists(output_path))

    spark_submit_main(
        ['-r', 'local', script_path, input_path, output_path])

    self.assertTrue(exists(output_path))

    # each output line parses to a (word, count) pair
    word_counts = {}
    for part_path in glob(join(output_path, 'part-*')):
        with open(part_path) as part_file:
            word_counts.update(
                safeeval(line) for line in part_file)

    self.assertEqual(
        word_counts,
        {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1})
def test_pass_through_to_step_spark_args(self):
    """Non-mrjob switches should pass through to the step's spark_args."""
    argv = [
        '--class', 'Backpack',
        '--name', 'Backpack',
        '--num-executors', '3',
        '--conf', 'foo=BAR',
        '--name', 'Mochila',
        'dora.jar', 'arg1',
    ]
    spark_submit_main(argv)

    # --class becomes part of step
    # --conf is an alias for a mrjob opt, goes to runner
    # other args end up in spark-args as-is
    kwargs = self.get_runner_kwargs()

    expected_step = {
        'args': ['arg1'],
        'jar': 'dora.jar',
        'jobconf': {},
        'main_class': 'Backpack',
        'spark_args': [
            '--name', 'Backpack',
            '--num-executors', '3',
            '--name', 'Mochila',
        ],
        'type': 'spark_jar',
    }
    self.assertEqual(kwargs['steps'], [expected_step])
    self.assertEqual(kwargs['jobconf'], {'foo': 'BAR'})
def test_pass_through_to_step_spark_args(self):
    """Non-mrjob switches should pass through to the step's spark_args."""
    argv = [
        '--class', 'Backpack',
        '--name', 'Backpack',
        '--num-executors', '3',
        '--conf', 'foo=BAR',
        '--name', 'Mochila',
        'dora.jar', 'arg1',
    ]
    spark_submit_main(argv)

    # --class becomes part of step
    # --conf is an alias for a mrjob opt, goes to runner
    # other args end up in spark-args as-is
    kwargs = self.get_runner_kwargs()

    expected_step = {
        'args': ['arg1'],
        'jar': 'dora.jar',
        'jobconf': {},
        'main_class': 'Backpack',
        'spark_args': [
            '--name', 'Backpack',
            '--num-executors', '3',
            '--name', 'Mochila',
        ],
        'type': 'spark_jar',
    }
    self.assertEqual(kwargs['steps'], [expected_step])
    self.assertEqual(kwargs['jobconf'], {'foo': 'BAR'})
def test_end_to_end(self):
    """Run the wordcount script through the local runner and verify output."""
    script_path = spark_wordcount_script.__file__
    if script_path.endswith('.pyc'):
        script_path = script_path[:-1]

    input_path = self.makefile(
        'input', b'one fish\ntwo fish\nred fish\nblue fish\n')

    # don't create this path, let Spark do it
    output_path = join(self.tmp_dir, 'output')
    self.assertFalse(exists(output_path))

    spark_submit_main(
        ['-r', 'local', script_path, input_path, output_path])

    self.assertTrue(exists(output_path))

    # each output line parses to a (word, count) pair
    word_counts = {}
    for part_path in glob(join(output_path, 'part-*')):
        with open(part_path) as part_file:
            word_counts.update(
                safeeval(line) for line in part_file)

    self.assertEqual(
        word_counts,
        {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1})
def test_set_runner_class(self):
    """``-r emr`` should select the runner class aliased 'emr' and run it."""
    argv = ['-r', 'emr', 'foo.py', 'arg1']
    spark_submit_main(argv)

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'emr')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)
def test_hard_coded_kwargs(self):
    """spark-submit always hard-codes these runner kwargs."""
    spark_submit_main(['foo.py', 'arg1'])

    kwargs = self.get_runner_kwargs()

    expected = {
        'check_input_paths': False,
        'input_paths': [os.devnull],
        'output_dir': None,
    }
    for key, value in expected.items():
        self.assertEqual(kwargs[key], value)
def test_filters_runner_kwargs(self):
    """Only kwargs recognized by the chosen runner should be passed in."""
    # may want to change this behavior; see #1898
    spark_submit_main(['-r', 'emr', 'foo.py', 'arg1'])

    runner_kwargs = self.get_runner_kwargs()

    self.assertIn('region', runner_kwargs)
    self.assertNotIn('hadoop_bin', runner_kwargs)
def test_hard_coded_kwargs(self):
    """spark-submit always hard-codes these runner kwargs."""
    spark_submit_main(['foo.py', 'arg1'])

    kwargs = self.get_runner_kwargs()

    expected = {
        'check_input_paths': False,
        'input_paths': [os.devnull],
        'output_dir': None,
    }
    for key, value in expected.items():
        self.assertEqual(kwargs[key], value)
def test_filters_runner_kwargs(self):
    """Only kwargs recognized by the chosen runner should be passed in."""
    # may want to change this behavior; see #1898
    spark_submit_main(['-r', 'emr', 'foo.py', 'arg1'])

    runner_kwargs = self.get_runner_kwargs()

    self.assertIn('region', runner_kwargs)
    self.assertNotIn('hadoop_bin', runner_kwargs)
def test_allow_py3_extension(self):
    """A .py3 extension should still produce a spark_script step."""
    spark_submit_main(['foo.py3', 'arg1', 'arg2'])

    expected_step = {
        'args': ['arg1', 'arg2'],
        'jobconf': {},
        'script': 'foo.py3',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_no_script_args_okay(self):
    """A script with no trailing args should yield an empty args list."""
    spark_submit_main(['foo.py'])

    expected_step = {
        'args': [],
        'jobconf': {},
        'script': 'foo.py',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_allow_py3_extension(self):
    """A .py3 extension should still produce a spark_script step."""
    spark_submit_main(['foo.py3', 'arg1', 'arg2'])

    expected_step = {
        'args': ['arg1', 'arg2'],
        'jobconf': {},
        'script': 'foo.py3',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_no_script_args_okay(self):
    """A script with no trailing args should yield an empty args list."""
    spark_submit_main(['foo.py'])

    expected_step = {
        'args': [],
        'jobconf': {},
        'script': 'foo.py',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_jar_main_class(self):
    """--class should set main_class on a spark_jar step."""
    spark_submit_main(
        ['--class', 'Backpack', 'dora.jar', 'arg1', 'arg2', 'arg3'])

    expected_step = {
        'args': ['arg1', 'arg2', 'arg3'],
        'jar': 'dora.jar',
        'jobconf': {},
        'main_class': 'Backpack',
        'spark_args': [],
        'type': 'spark_jar',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_runner_kwargs(self):
    """Switches should map to runner kwargs (plain, alias, custom parser)."""
    argv = [
        '--hadoop-bin', 'super-hadoop',
        '--master', 'local',
        '--py-files', 'bar.py,baz.py',
        'foo.py', 'arg1',
    ]
    spark_submit_main(argv)

    kwargs = self.get_runner_kwargs()

    # regular old runner arg
    self.assertEqual(kwargs['hadoop_bin'], 'super-hadoop')
    # spark alias for mrjob opt
    self.assertEqual(kwargs['spark_master'], 'local')
    # arg with custom parser
    self.assertEqual(kwargs['py_files'], ['bar.py', 'baz.py'])
def test_runner_kwargs(self):
    """Switches should map to runner kwargs (plain, alias, custom parser)."""
    argv = [
        '--hadoop-bin', 'super-hadoop',
        '--master', 'local',
        '--py-files', 'bar.py,baz.py',
        'foo.py', 'arg1',
    ]
    spark_submit_main(argv)

    kwargs = self.get_runner_kwargs()

    # regular old runner arg
    self.assertEqual(kwargs['hadoop_bin'], 'super-hadoop')
    # spark alias for mrjob opt
    self.assertEqual(kwargs['spark_master'], 'local')
    # arg with custom parser
    self.assertEqual(kwargs['py_files'], ['bar.py', 'baz.py'])
def test_jar_main_class(self):
    """--class should set main_class on a spark_jar step."""
    spark_submit_main(
        ['--class', 'Backpack', 'dora.jar', 'arg1', 'arg2', 'arg3'])

    expected_step = {
        'args': ['arg1', 'arg2', 'arg3'],
        'jar': 'dora.jar',
        'jobconf': {},
        'main_class': 'Backpack',
        'spark_args': [],
        'type': 'spark_jar',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_basic(self):
    """By default, a .py script becomes a spark_script step on the
    'spark' runner."""
    spark_submit_main(['foo.py', 'arg1', 'arg2'])

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'spark')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)

    expected_step = {
        'args': ['arg1', 'arg2'],
        'jobconf': {},
        'script': 'foo.py',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_basic(self):
    """By default, a .py script becomes a spark_script step on the
    'spark' runner."""
    spark_submit_main(['foo.py', 'arg1', 'arg2'])

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'spark')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)

    expected_step = {
        'args': ['arg1', 'arg2'],
        'jobconf': {},
        'script': 'foo.py',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_jar_step(self):
    """A .jar becomes a spark_jar step with no main class by default."""
    spark_submit_main(['dora.jar', 'arg1', 'arg2', 'arg3'])

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'spark')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)

    expected_step = {
        'args': ['arg1', 'arg2', 'arg3'],
        'jar': 'dora.jar',
        'jobconf': {},
        'main_class': None,
        'spark_args': [],
        'type': 'spark_jar',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_switches_to_spark_script(self):
    """Switches after the script path belong to the script, not mrjob."""
    # regression test for #2070
    spark_submit_main(['foo.py', '--bar', 'baz'])

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'spark')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)

    expected_step = {
        'args': ['--bar', 'baz'],
        'jobconf': {},
        'script': 'foo.py',
        'spark_args': [],
        'type': 'spark_script',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_jar_step(self):
    """A .jar becomes a spark_jar step with no main class by default."""
    spark_submit_main(['dora.jar', 'arg1', 'arg2', 'arg3'])

    runner_cls = self.runner_class
    self.assertEqual(runner_cls.alias, 'spark')
    self.assertTrue(runner_cls.called)
    self.assertTrue(self.runner.run.called)

    expected_step = {
        'args': ['arg1', 'arg2', 'arg3'],
        'jar': 'dora.jar',
        'jobconf': {},
        'main_class': None,
        'spark_args': [],
        'type': 'spark_jar',
    }
    self.assertEqual(
        self.get_runner_kwargs()['steps'], [expected_step])
def test_end_to_end(self):
    """Submitting with -r emr should launch one cluster with one
    completed spark-submit step."""
    script_path = self.makefile('foo.py')

    spark_submit_main(['-r', 'emr', script_path, 'arg1'])

    emr_client = self.client('emr')

    cluster_ids = [
        cluster['Id']
        for cluster in emr_client.list_clusters()['Clusters']]
    self.assertEqual(len(cluster_ids), 1)
    cluster_id = cluster_ids[0]

    steps = emr_client.list_steps(ClusterId=cluster_id)['Steps']
    self.assertEqual(len(steps), 1)
    step = steps[0]
    self.assertEqual(step['Status']['State'], 'COMPLETED')

    # step args look like: spark-submit ... <script path> arg1
    step_args = step['Config']['Args']
    self.assertEqual(step_args[0], 'spark-submit')
    self.assertEqual(step_args[-1], 'arg1')
    self.assertTrue(step_args[-2].endswith('/foo.py'))
def test_end_to_end(self):
    """Submitting with -r emr should launch one cluster with one
    completed spark-submit step."""
    script_path = self.makefile('foo.py')

    spark_submit_main(['-r', 'emr', script_path, 'arg1'])

    emr_client = self.client('emr')

    cluster_ids = [
        cluster['Id']
        for cluster in emr_client.list_clusters()['Clusters']]
    self.assertEqual(len(cluster_ids), 1)
    cluster_id = cluster_ids[0]

    steps = emr_client.list_steps(ClusterId=cluster_id)['Steps']
    self.assertEqual(len(steps), 1)
    step = steps[0]
    self.assertEqual(step['Status']['State'], 'COMPLETED')

    # step args look like: spark-submit ... <script path> arg1
    step_args = step['Config']['Args']
    self.assertEqual(step_args[0], 'spark-submit')
    self.assertEqual(step_args[-1], 'arg1')
    self.assertTrue(step_args[-2].endswith('/foo.py'))
def test_cleanup_called(self):
    """The runner's cleanup() should be invoked after the job runs."""
    argv = ['-r', 'emr', 'foo.py', 'arg1']
    spark_submit_main(argv)

    self.assertTrue(self.runner.cleanup.called)
def test_cleanup_called(self):
    """The runner's cleanup() should be invoked after the job runs."""
    argv = ['-r', 'emr', 'foo.py', 'arg1']
    spark_submit_main(argv)

    self.assertTrue(self.runner.cleanup.called)