def test_no_bootstrap_script_if_not_needed(self):
    # with both mrjob and Python bootstrapping switched off, the
    # master bootstrap script path should stay unset
    runner_kwargs = dict(
        conf_paths=[],
        bootstrap_mrjob=False,
        bootstrap_python=False,
    )
    runner = DataprocJobRunner(**runner_kwargs)

    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNone(script_path)
def test_bootstrap_mrjob_uses_python_bin(self):
    # bootstrap mrjob with a custom python_bin; the compileall command
    # in the generated script should invoke that binary
    runner = DataprocJobRunner(
        conf_paths=[],
        bootstrap_mrjob=True,
        python_bin=["anaconda"],
    )
    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)

    with open(script_path, "r") as script:
        script_text = script.read()

    expected_cmd = "sudo anaconda -m compileall -q -f"
    self.assertIn(expected_cmd, script_text)
def test_bootstrap_script_respects_sh_bin(self):
    # the first line of the script should be a shebang built from
    # whatever the runner's _sh_bin() returns
    runner = DataprocJobRunner(conf_paths=[])

    sh_bin_patch = patch(
        'mrjob.dataproc.DataprocJobRunner._sh_bin',
        return_value=['/bin/bash'])
    self.start(sh_bin_patch)

    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)

    with open(script_path) as script:
        script_lines = script.readlines()

    shebang = script_lines[0].strip()
    self.assertEqual(shebang, '#!/bin/bash')
def test_bootstrap_mrjob_uses_python_bin(self):
    # bootstrap mrjob with a custom python_bin; the compileall command
    # in the generated script should invoke that binary
    runner = DataprocJobRunner(
        conf_paths=[],
        bootstrap_mrjob=True,
        python_bin=['anaconda'],
    )
    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)

    with open(script_path, 'r') as script:
        script_text = script.read()

    expected_cmd = 'sudo anaconda -m compileall -q -f'
    self.assertIn(expected_cmd, script_text)
def test_bootstrap_script_respects_sh_pre_commands(self):
    # whatever _sh_pre_commands() returns should appear, in order, on
    # the second and third lines of the script
    runner = DataprocJobRunner(conf_paths=[])

    pre_commands_patch = patch(
        'mrjob.dataproc.DataprocJobRunner._sh_pre_commands',
        return_value=['garply', 'quux'])
    self.start(pre_commands_patch)

    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)

    with open(script_path) as script:
        script_lines = script.readlines()

    after_shebang = [text.strip() for text in script_lines[1:3]]
    self.assertEqual(after_shebang, ['garply', 'quux'])
def test_usr_bin_env(self):
    # a non-absolute sh_bin should produce a "#!/usr/bin/env" shebang
    runner = DataprocJobRunner(
        conf_paths=[],
        bootstrap_mrjob=True,
        sh_bin="bash -e",
    )
    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)
    self.assertTrue(os.path.exists(script_path))

    with open(script_path) as script:
        script_lines = [text.rstrip() for text in script]

    shebang = script_lines[0]
    self.assertEqual(shebang, "#!/usr/bin/env bash -e")
def test_usr_bin_env(self):
    # a non-absolute sh_bin should produce a "#!/usr/bin/env" shebang
    runner = DataprocJobRunner(
        conf_paths=[],
        bootstrap_mrjob=True,
        sh_bin='bash -e',
    )
    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)
    self.assertTrue(os.path.exists(script_path))

    with open(script_path) as script:
        script_lines = [text.rstrip() for text in script]

    shebang = script_lines[0]
    self.assertEqual(shebang, '#!/usr/bin/env bash -e')
def test_create_master_bootstrap_script(self):
    # create an empty local file to reference from a bootstrap command
    foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
    with open(foo_py_path, 'w'):
        pass

    bootstrap_cmds = [
        PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
        'gs://walrus/scripts/ohnoes.sh#',
        'echo "Hi!"',
        'true',
        'ls',
        'speedups.sh',
        '/tmp/s.sh',
    ]
    runner = DataprocJobRunner(
        conf_paths=[], bootstrap=bootstrap_cmds, bootstrap_mrjob=True)

    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)
    self.assertTrue(os.path.exists(script_path))

    with open(script_path) as script:
        script_lines = [text.rstrip() for text in script]

    self.assertEqual(script_lines[0], '#!/bin/sh -ex')

    # the working directory must be saved before anything else uses it
    self.assertIn('__mrjob_PWD=$PWD', script_lines)

    def assert_downloaded(path, name=None):
        # the script must both fetch the file and make it executable
        remote_uri = runner._upload_mgr.uri(path)
        local_name = runner._bootstrap_dir_mgr.name(
            'file', path, name=name)
        copy_cmd = (
            ' hadoop fs -copyToLocal %s $__mrjob_PWD/%s'
            % (remote_uri, local_name))
        chmod_cmd = ' chmod u+rx $__mrjob_PWD/%s' % (local_name, )
        for expected in (copy_cmd, chmod_cmd):
            self.assertIn(expected, script_lines)

    # every referenced file gets downloaded
    downloads = [
        (foo_py_path, 'bar.py'),
        ('gs://walrus/scripts/ohnoes.sh', None),
        (runner._mrjob_zip_path, None),
    ]
    for path, name in downloads:
        assert_downloaded(path, name)

    # every bootstrap command gets run
    expected_bootstrap_lines = [
        ' ' + PYTHON_BIN + ' $__mrjob_PWD/bar.py',
        ' $__mrjob_PWD/ohnoes.sh',
        ' echo "Hi!"',
        ' true',
        ' ls',
        ' speedups.sh',
        ' /tmp/s.sh',
    ]
    for expected in expected_bootstrap_lines:
        self.assertIn(expected, script_lines)

    # mrjob itself gets unpacked into the Python lib dir and compiled
    mrjob_zip_name = runner._bootstrap_dir_mgr.name(
        'file', runner._mrjob_zip_path)
    find_lib_cmd = (
        " __mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
        " distutils.sysconfig import get_python_lib;"
        " print(get_python_lib())')")
    self.assertIn(find_lib_cmd, script_lines)

    unzip_cmd = (
        ' sudo unzip $__mrjob_PWD/' + mrjob_zip_name +
        ' -d $__mrjob_PYTHON_LIB')
    self.assertIn(unzip_cmd, script_lines)

    compile_cmd = (
        ' sudo ' + PYTHON_BIN + ' -m compileall -q -f'
        ' $__mrjob_PYTHON_LIB/mrjob && true')
    self.assertIn(compile_cmd, script_lines)

    # the Python packages installed depend on the major version
    if PY2:
        install_cmd = ' sudo apt-get install -y python-pip python-dev'
    else:
        install_cmd = (
            ' sudo apt-get install -y python3 python3-pip python3-dev')
    self.assertIn(install_cmd, script_lines)
def test_create_master_bootstrap_script(self):
    # create an empty local file to reference from a bootstrap command
    foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
    with open(foo_py_path, 'w'):
        pass

    bootstrap_cmds = [
        # commands that reference files to upload
        PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
        'gs://walrus/scripts/ohnoes.sh#',
        # bare commands
        'echo "Hi!"',
        'true',
        'ls',
        # scripts referenced by name or path only
        'speedups.sh',
        '/tmp/s.sh',
    ]
    runner = DataprocJobRunner(
        conf_paths=[], bootstrap=bootstrap_cmds, bootstrap_mrjob=True)

    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)
    self.assertTrue(os.path.exists(script_path))

    with open(script_path) as script:
        script_lines = [text.rstrip() for text in script]

    self.assertEqual(script_lines[0], '#!/bin/sh -ex')

    # the working directory must be saved before anything else uses it
    self.assertIn('__mrjob_PWD=$PWD', script_lines)

    def assert_downloaded(path, name=None):
        # the script must both fetch the file and make it executable
        remote_uri = runner._upload_mgr.uri(path)
        local_name = runner._bootstrap_dir_mgr.name(
            'file', path, name=name)
        copy_cmd = (
            'hadoop fs -copyToLocal %s $__mrjob_PWD/%s'
            % (remote_uri, local_name))
        chmod_cmd = 'chmod a+x $__mrjob_PWD/%s' % (local_name,)
        for expected in (copy_cmd, chmod_cmd):
            self.assertIn(expected, script_lines)

    # every referenced file gets downloaded
    downloads = [
        (foo_py_path, 'bar.py'),
        ('gs://walrus/scripts/ohnoes.sh', None),
        (runner._mrjob_tar_gz_path, None),
    ]
    for path, name in downloads:
        assert_downloaded(path, name)

    # every bootstrap command gets run
    expected_bootstrap_lines = [
        PYTHON_BIN + ' $__mrjob_PWD/bar.py',
        '$__mrjob_PWD/ohnoes.sh',
        'echo "Hi!"',
        'true',
        'ls',
        'speedups.sh',
        '/tmp/s.sh',
    ]
    for expected in expected_bootstrap_lines:
        self.assertIn(expected, script_lines)

    # mrjob itself gets unpacked into the Python lib dir and compiled
    mrjob_tar_gz_name = runner._bootstrap_dir_mgr.name(
        'file', runner._mrjob_tar_gz_path)
    find_lib_cmd = (
        "__mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
        " distutils.sysconfig import get_python_lib;"
        " print(get_python_lib())')")
    self.assertIn(find_lib_cmd, script_lines)

    untar_cmd = (
        'sudo tar xfz $__mrjob_PWD/' + mrjob_tar_gz_name +
        ' -C $__mrjob_PYTHON_LIB')
    self.assertIn(untar_cmd, script_lines)

    compile_cmd = (
        'sudo ' + PYTHON_BIN + ' -m compileall -f'
        ' $__mrjob_PYTHON_LIB/mrjob && true')
    self.assertIn(compile_cmd, script_lines)

    # the Python packages installed depend on the major version
    if PY2:
        install_cmd = 'sudo apt-get install -y python-pip python-dev'
    else:
        install_cmd = (
            'sudo apt-get install -y python3 python3-pip python3-dev')
    self.assertIn(install_cmd, script_lines)
def test_create_master_bootstrap_script(self):
    # create an empty local file to reference from a bootstrap command
    foo_py_path = os.path.join(self.tmp_dir, "foo.py")
    with open(foo_py_path, "w"):
        pass

    bootstrap_cmds = [
        # commands that reference files to upload
        PYTHON_BIN + " " + foo_py_path + "#bar.py",
        "gs://walrus/scripts/ohnoes.sh#",
        # bare commands
        'echo "Hi!"',
        "true",
        "ls",
        # scripts referenced by name or path only
        "speedups.sh",
        "/tmp/s.sh",
    ]
    runner = DataprocJobRunner(
        conf_paths=[], bootstrap=bootstrap_cmds, bootstrap_mrjob=True)

    runner._add_bootstrap_files_for_upload()

    script_path = runner._master_bootstrap_script_path
    self.assertIsNotNone(script_path)
    self.assertTrue(os.path.exists(script_path))

    with open(script_path) as script:
        script_lines = [text.rstrip() for text in script]

    self.assertEqual(script_lines[0], "#!/bin/sh -ex")

    # the working directory must be saved before anything else uses it
    self.assertIn("__mrjob_PWD=$PWD", script_lines)

    def assert_downloaded(path, name=None):
        # the script must both fetch the file and make it executable
        remote_uri = runner._upload_mgr.uri(path)
        local_name = runner._bootstrap_dir_mgr.name(
            "file", path, name=name)
        copy_cmd = (
            "hadoop fs -copyToLocal %s $__mrjob_PWD/%s"
            % (remote_uri, local_name))
        chmod_cmd = "chmod a+x $__mrjob_PWD/%s" % (local_name,)
        for expected in (copy_cmd, chmod_cmd):
            self.assertIn(expected, script_lines)

    # every referenced file gets downloaded
    downloads = [
        (foo_py_path, "bar.py"),
        ("gs://walrus/scripts/ohnoes.sh", None),
        (runner._mrjob_zip_path, None),
    ]
    for path, name in downloads:
        assert_downloaded(path, name)

    # every bootstrap command gets run
    expected_bootstrap_lines = [
        PYTHON_BIN + " $__mrjob_PWD/bar.py",
        "$__mrjob_PWD/ohnoes.sh",
        'echo "Hi!"',
        "true",
        "ls",
        "speedups.sh",
        "/tmp/s.sh",
    ]
    for expected in expected_bootstrap_lines:
        self.assertIn(expected, script_lines)

    # mrjob itself gets unpacked into the Python lib dir and compiled
    mrjob_zip_name = runner._bootstrap_dir_mgr.name(
        "file", runner._mrjob_zip_path)
    find_lib_cmd = (
        "__mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
        " distutils.sysconfig import get_python_lib;"
        " print(get_python_lib())')")
    self.assertIn(find_lib_cmd, script_lines)

    unzip_cmd = (
        "sudo unzip $__mrjob_PWD/" + mrjob_zip_name +
        " -d $__mrjob_PYTHON_LIB")
    self.assertIn(unzip_cmd, script_lines)

    compile_cmd = (
        "sudo " + PYTHON_BIN + " -m compileall -q -f"
        " $__mrjob_PYTHON_LIB/mrjob && true")
    self.assertIn(compile_cmd, script_lines)

    # the Python packages installed depend on the major version
    if PY2:
        install_cmd = "sudo apt-get install -y python-pip python-dev"
    else:
        install_cmd = (
            "sudo apt-get install -y python3 python3-pip python3-dev")
    self.assertIn(install_cmd, script_lines)