def test_no_name(self):
    # an empty name after '#' is reported as None; a trailing '/'
    # marks the path as an archive and is kept as its own token
    cases = [
        ('foo#', [{'type': 'file', 'path': 'foo', 'name': None}]),
        ('foo#/', [{'type': 'archive', 'path': 'foo', 'name': None}, '/']),
    ]

    for setup_cmd, expected in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), expected)
def test_hash_path_alone(self):
    # a command that is nothing but a hash path: without a trailing
    # slash it is a file, with one it is an archive followed by "/"
    cases = [
        ("foo#bar", [{"type": "file", "path": "foo", "name": "bar"}]),
        ("/dir/foo#bar", [{"type": "file", "path": "/dir/foo", "name": "bar"}]),
        ("foo#bar/", [{"type": "archive", "path": "foo", "name": "bar"}, "/"]),
        ("/dir/foo#bar/", [{"type": "archive", "path": "/dir/foo", "name": "bar"}, "/"]),
    ]

    for setup_cmd, expected in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), expected)
def test_archive_hash_path_alone(self):
    # each command is a lone archive hash path (name ends in '/');
    # the slash comes back as a separate trailing token
    cases = [
        ('foo#/', {'type': 'archive', 'path': 'foo', 'name': None}),
        ('foo#bar/', {'type': 'archive', 'path': 'foo', 'name': 'bar'}),
        ('/dir/foo#bar/',
         {'type': 'archive', 'path': '/dir/foo', 'name': 'bar'}),
    ]

    for setup_cmd, path_dict in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), [path_dict, '/'])
def test_file_hash_path_alone(self):
    # each command is a lone file hash path, so the parsed result is
    # a single path dict and nothing else
    cases = [
        ('foo#', {'type': 'file', 'path': 'foo', 'name': None}),
        ('foo#bar', {'type': 'file', 'path': 'foo', 'name': 'bar'}),
        ('/dir/foo#bar',
         {'type': 'file', 'path': '/dir/foo', 'name': 'bar'}),
    ]

    for setup_cmd, path_dict in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), [path_dict])
def _parse_setup(self):
    """Build the list of parsed setup commands from this runner's options.

    Each returned command is a list of strings and path dicts. Commands
    are emitted in this order:

    * one ``export PYTHONPATH=...`` command per entry of the deprecated
      *python_archives* option
    * each *setup* command, parsed with
      :py:func:`mrjob.setup.parse_setup_cmd()`
    * each entry of the deprecated *setup_cmds* option (non-string
      entries are first joined with ``cmd_line()``)
    * each entry of the deprecated *setup_scripts* option, as a lone
      path dict
    """
    opts = self._opts
    parsed = []

    # python_archives: put each archive on PYTHONPATH
    parsed.extend(
        ['export PYTHONPATH=',
         parse_legacy_hash_path('archive', archive_path),
         ':$PYTHONPATH']
        for archive_path in opts['python_archives'])

    # setup: the regular setup commands
    parsed.extend(parse_setup_cmd(raw_cmd) for raw_cmd in opts['setup'])

    # setup_cmds: plain commands, possibly given as lists of args
    for raw_cmd in opts['setup_cmds']:
        if isinstance(raw_cmd, basestring):
            parsed.append([raw_cmd])
        else:
            parsed.append([cmd_line(raw_cmd)])

    # setup_scripts: each script is uploaded and run by itself
    parsed.extend(
        [parse_legacy_hash_path('file', script_path)]
        for script_path in opts['setup_scripts'])

    return parsed
def __init__(self, **kwargs):
    """Initialize runner state and register files used by *setup*.

    Raises :py:class:`ValueError` if any *py_files* path contains ``#``.
    """
    super(MRJobBinRunner, self).__init__(**kwargs)

    # local path of the zip file of the mrjob library
    self._mrjob_zip_path = None

    # the setup wrapper scripts are created later
    self._setup_wrapper_script_path = None
    self._manifest_setup_script_path = None

    # self._setup holds shell commands with path dicts interleaved;
    # see mrjob.setup.parse_setup_cmd() for details
    parsed_setup = [
        parse_setup_cmd(raw_cmd) for raw_cmd in self._opts['setup']]
    self._setup = parsed_setup

    for parsed_cmd in self._setup:
        for token in parsed_cmd:
            if not isinstance(token, dict):
                continue

            # a 'dir' token is rewritten in place as an archive whose
            # path is whatever self._dir_archive_path() returns
            if token['type'] == 'dir':
                token['path'] = self._dir_archive_path(token['path'])
                token['type'] = 'archive'

            self._working_dir_mgr.add(**token)

    # --py-files on Spark doesn't allow '#' (see #1375)
    for py_file in self._opts['py_files']:
        if '#' in py_file:
            raise ValueError("py_files cannot contain '#'")
def test_colon_after_name(self):
    # the ':' right after '#' is not part of the name; it stays in
    # the trailing command text
    tokens = parse_setup_cmd('echo foo.egg#:$PYTHONPATH')

    self.assertEqual(tokens, [
        'echo ',
        {'type': 'file', 'path': 'foo.egg', 'name': None},
        ':$PYTHONPATH',
    ])
def test_start_path_after_equals(self):
    # the path begins after '=', so the assignment prefix is kept as
    # plain command text
    tokens = parse_setup_cmd('export PYTHONPATH=foo.egg#')

    self.assertEqual(tokens, [
        'export PYTHONPATH=',
        {'type': 'file', 'path': 'foo.egg', 'name': None},
    ])
def test_named_dir(self):
    # a path ending in '/' before the '#' is a dir; the parsed
    # command ends with a '/' token after the path dict
    tokens = parse_setup_cmd('cd src/#awesome-dir')

    self.assertEqual(tokens, [
        'cd ',
        {'type': 'dir', 'path': 'src', 'name': 'awesome-dir'},
        '/',
    ])
def test_start_path_after_colon(self):
    # the path begins after the ':' separator, so everything up to
    # and including the colon is plain command text
    tokens = parse_setup_cmd('export PYTHONPATH=$PYTHONPATH:foo.tar.gz#/')

    self.assertEqual(tokens, [
        'export PYTHONPATH=$PYTHONPATH:',
        {'type': 'archive', 'path': 'foo.tar.gz', 'name': None},
        '/',
    ])
def test_allow_colons_in_uris(self):
    # the ':' inside 's3://' must not split the path, even though
    # the ':' after $PATH does
    tokens = parse_setup_cmd('export PATH=$PATH:s3://foo/script.sh#')

    self.assertEqual(tokens, [
        'export PATH=$PATH:',
        {'type': 'file', 'path': 's3://foo/script.sh', 'name': None},
    ])
def test_file_inside_dir(self):
    # text after '#/' names a file inside the dir and is kept as
    # command text following the path dict
    tokens = parse_setup_cmd('sudo dpkg -i my_pkgs/#/fooify.deb')

    self.assertEqual(tokens, [
        'sudo dpkg -i ',
        {'type': 'dir', 'path': 'my_pkgs', 'name': None},
        '/fooify.deb',
    ])
def test_name_slash_included_in_command(self):
    # the '/' that marks the archive stays at the front of the
    # command text that follows the path dict
    tokens = parse_setup_cmd('sudo dpkg -i my_pkgs.tar#/fooify.deb')

    self.assertEqual(tokens, [
        'sudo dpkg -i ',
        {'type': 'archive', 'path': 'my_pkgs.tar', 'name': None},
        '/fooify.deb',
    ])
def test_resolve_path_but_not_name(self):
    # ~ and $USER in the path are expected to be resolved, the escaped
    # \$BAR is expected to come through as a literal $BAR, and the name
    # after '#' is expected to be left untouched.
    #
    # USER must be 'foo' for the expected path '/home/foo/tmp/foo/...'
    # to be reachable; the previous value '******' could never produce it.
    fake_env = {'HOME': '/home/foo', 'USER': 'foo', 'BAR': 'bar'}

    with patch.dict(os.environ, fake_env, clear=True):
        self.assertEqual(
            parse_setup_cmd(r'. ~/tmp/$USER/\$BAR.sh#$USER.sh'),
            ['. ',
             {'path': '/home/foo/tmp/foo/$BAR.sh',
              'name': '$USER.sh',
              'type': 'file'}])
def test_shell_punctuation_after_name(self):
    # ';', '>' and '|' each end an (empty) name and remain in the
    # command text between path dicts
    def nameless_file(path):
        return {'type': 'file', 'path': path, 'name': None}

    tokens = parse_setup_cmd(
        'touch foo#; cat bar#>baz; cat qux#|grep quux')

    self.assertEqual(tokens, [
        'touch ',
        nameless_file('foo'),
        '; cat ',
        nameless_file('bar'),
        '>baz; cat ',
        nameless_file('qux'),
        '|grep quux',
    ])
def test_dir_hash_path_alone(self):
    # a lone dir hash path always parses to [path dict, '/'], whether
    # or not the name itself is followed by a slash
    cases = [
        ('foo/#', {'type': 'dir', 'path': 'foo', 'name': None}),
        ('foo/#/', {'type': 'dir', 'path': 'foo', 'name': None}),
        ('foo/#bar', {'type': 'dir', 'path': 'foo', 'name': 'bar'}),
        ('foo/#bar/', {'type': 'dir', 'path': 'foo', 'name': 'bar'}),
        ('/dir/foo/#bar',
         {'type': 'dir', 'path': '/dir/foo', 'name': 'bar'}),
        ('/dir/foo/#bar/',
         {'type': 'dir', 'path': '/dir/foo', 'name': 'bar'}),
    ]

    for setup_cmd, path_dict in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), [path_dict, '/'])
def test_shell_punctuation_after_name(self):
    # ";", ">" and "|" each end an (empty) name and remain in the
    # command text between path dicts
    def nameless_file(path):
        return {"type": "file", "path": path, "name": None}

    tokens = parse_setup_cmd("touch foo#; cat bar#>baz; cat qux#|grep quux")

    self.assertEqual(
        tokens,
        [
            "touch ",
            nameless_file("foo"),
            "; cat ",
            nameless_file("bar"),
            ">baz; cat ",
            nameless_file("qux"),
            "|grep quux",
        ],
    )
def _parse_setup(self):
    """Build the list of parsed setup commands from this runner's options.

    Each returned command is a list of strings and path dicts. Commands
    are emitted in this order:

    * one ``export PYTHONPATH=...`` command per entry of
      *python_archives*
    * each *setup* command, parsed with
      :py:func:`mrjob.setup.parse_setup_cmd()`
    * each entry of *setup_cmds* (non-string entries are first joined
      with ``cmd_line()``)
    * each entry of *setup_scripts*, as a lone path dict

    A deprecation warning is logged when *setup_cmds* or *setup_scripts*
    is non-empty.
    """
    opts = self._opts
    parsed = []

    # python_archives: put each archive on PYTHONPATH
    parsed.extend(
        ["export PYTHONPATH=", parse_legacy_hash_path("archive", archive_path), ":$PYTHONPATH"]
        for archive_path in opts["python_archives"]
    )

    # setup: the regular setup commands
    parsed.extend(parse_setup_cmd(raw_cmd) for raw_cmd in opts["setup"])

    # setup_cmds: plain commands, possibly given as lists of args
    if opts["setup_cmds"]:
        log.warning(
            "setup_cmds is deprecated since v0.4.2 and will be removed"
            " in v0.6.0. Consider using setup instead."
        )
    for raw_cmd in opts["setup_cmds"]:
        if isinstance(raw_cmd, string_types):
            parsed.append([raw_cmd])
        else:
            parsed.append([cmd_line(raw_cmd)])

    # setup_scripts: each script becomes a command by itself
    if opts["setup_scripts"]:
        log.warning(
            "setup_scripts is deprecated since v0.4.2 and will be removed"
            " in v0.6.0. Consider using setup instead."
        )
    parsed.extend(
        [parse_legacy_hash_path("file", script_path)]
        for script_path in opts["setup_scripts"]
    )

    return parsed
def _parse_setup_and_py_files(self):
    """Return parsed setup commands, preceded by one
    ``export PYTHONPATH=...`` command for each of *py_files*.

    *setup* commands are parsed with
    :py:func:`mrjob.setup.parse_setup_cmd()`.

    Raises :py:class:`ValueError` if a *py_files* path contains ``#``.
    """
    parsed = []

    # py_files
    for py_file in self._opts['py_files']:
        # Spark (at least v1.3.1) doesn't work with # and --py-files,
        # see #1375
        if '#' in py_file:
            raise ValueError("py_files cannot contain '#'")

        parsed.append([
            'export PYTHONPATH=',
            parse_legacy_hash_path('file', py_file),
            ':$PYTHONPATH',
        ])

    # setup
    parsed.extend(
        parse_setup_cmd(raw_cmd) for raw_cmd in self._opts['setup'])

    return parsed
def __init__(self, **kwargs):
    """Initialize runner state and register files used by *setup*.

    Logs a warning if there are setup commands and pyspark steps but
    ``self._spark_executors_have_own_wd()`` is false.

    Raises :py:class:`ValueError` if any *py_files* path contains ``#``.
    """
    super(MRJobBinRunner, self).__init__(**kwargs)

    # local path of the zip file of the mrjob library
    self._mrjob_zip_path = None

    # the setup wrapper scripts are created later
    self._setup_wrapper_script_path = None
    self._manifest_setup_script_path = None
    self._spark_python_wrapper_path = None

    # self._setup holds shell commands with path dicts interleaved;
    # see mrjob.setup.parse_setup_cmd() for details
    parsed_setup = [
        parse_setup_cmd(raw_cmd) for raw_cmd in self._opts['setup']]
    self._setup = parsed_setup

    for parsed_cmd in self._setup:
        for token in parsed_cmd:
            if not isinstance(token, dict):
                continue

            # a 'dir' token is rewritten in place as an archive whose
            # path is whatever self._dir_archive_path() returns
            if token['type'] == 'dir':
                token['path'] = self._dir_archive_path(token['path'])
                token['type'] = 'archive'

            self._working_dir_mgr.add(**token)

    # warning: no setup scripts on Spark when no working dir
    setup_unsupported = (
        self._setup and
        self._has_pyspark_steps() and
        not self._spark_executors_have_own_wd())
    if setup_unsupported:
        log.warning("setup commands aren't supported on Spark master %r" %
                    self._spark_master())

    # --py-files on Spark doesn't allow '#' (see #1375)
    for py_file in self._opts['py_files']:
        if '#' in py_file:
            raise ValueError("py_files cannot contain '#'")

    # Keep track of where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']
def __init__(self, **kwargs):
    """Initialize runner state and register files used by *setup*.

    Logs a warning if there are setup commands and pyspark steps but
    ``self._spark_setup_is_supported()`` is false.

    Raises :py:class:`ValueError` if any *py_files* path contains ``#``.
    """
    super(MRJobBinRunner, self).__init__(**kwargs)

    # local path of the zip file of the mrjob library
    self._mrjob_zip_path = None

    # the setup wrapper scripts are created later
    self._setup_wrapper_script_path = None
    self._manifest_setup_script_path = None
    self._spark_python_wrapper_path = None

    # self._setup holds shell commands with path dicts interleaved;
    # see mrjob.setup.parse_setup_cmd() for details
    parsed_setup = [
        parse_setup_cmd(raw_cmd) for raw_cmd in self._opts['setup']]
    self._setup = parsed_setup

    # warn before registering any files
    setup_unsupported = (
        self._setup and
        self._has_pyspark_steps() and
        not self._spark_setup_is_supported())
    if setup_unsupported:
        log.warning("setup commands aren't supported on Spark master %r" %
                    self._spark_master())

    for parsed_cmd in self._setup:
        for token in parsed_cmd:
            if not isinstance(token, dict):
                continue

            # a 'dir' token is rewritten in place as an archive whose
            # path is whatever self._dir_archive_path() returns
            if token['type'] == 'dir':
                token['path'] = self._dir_archive_path(token['path'])
                token['type'] = 'archive'

            self._working_dir_mgr.add(**token)

    # --py-files on Spark doesn't allow '#' (see #1375)
    for py_file in self._opts['py_files']:
        if '#' in py_file:
            raise ValueError("py_files cannot contain '#'")

    # Keep track of where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']
def test_dir_hash_path_alone(self):
    # a lone dir hash path always parses to [path dict, '/'], whether
    # or not the name itself is followed by a slash
    def dir_dict(path, name):
        return {'type': 'dir', 'path': path, 'name': name}

    cases = [
        ('foo/#', dir_dict('foo', None)),
        ('foo/#/', dir_dict('foo', None)),
        ('foo/#bar', dir_dict('foo', 'bar')),
        ('foo/#bar/', dir_dict('foo', 'bar')),
        ('/dir/foo/#bar', dir_dict('/dir/foo', 'bar')),
        ('/dir/foo/#bar/', dir_dict('/dir/foo', 'bar')),
    ]

    for setup_cmd, path_dict in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), [path_dict, '/'])
def test_no_hash(self):
    # a command with no '#' comes back as a single string token
    plain_cmd = 'foo'
    tokens = parse_setup_cmd(plain_cmd)

    self.assertEqual(tokens, ['foo'])
def test_name_slash_included_in_command(self):
    # the '/' that marks the archive stays at the front of the
    # command text that follows the path dict
    archive = {'type': 'archive', 'path': 'my_pkgs.tar', 'name': None}

    tokens = parse_setup_cmd('sudo dpkg -i my_pkgs.tar#/fooify.deb')

    self.assertEqual(tokens, ['sudo dpkg -i ', archive, '/fooify.deb'])
def test_double_hash(self):
    # only the last '#' separates path from name; the earlier one
    # stays in the path
    tokens = parse_setup_cmd('foo#bar#baz')

    self.assertEqual(
        tokens,
        [{'type': 'file', 'path': 'foo#bar', 'name': 'baz'}])
def test_no_hash(self):
    # a command with no "#" comes back as a single string token
    plain_cmd = "foo"
    tokens = parse_setup_cmd(plain_cmd)

    self.assertEqual(tokens, ["foo"])
def test_colon_after_name(self):
    # the ':' right after '#' is not part of the name; it stays in
    # the trailing command text
    egg = {'type': 'file', 'path': 'foo.egg', 'name': None}

    tokens = parse_setup_cmd('echo foo.egg#:$PYTHONPATH')

    self.assertEqual(tokens, ['echo ', egg, ':$PYTHONPATH'])
def test_no_path(self):
    # a '#' with nothing in front of it is not a hash path; the
    # command is returned unparsed
    tokens = parse_setup_cmd('#bar')

    self.assertEqual(tokens, ['#bar'])
def test_colon_after_name(self):
    # the ":" right after "#" is not part of the name; it stays in
    # the trailing command text
    egg = {"type": "file", "path": "foo.egg", "name": None}

    tokens = parse_setup_cmd("echo foo.egg#:$PYTHONPATH")

    self.assertEqual(tokens, ["echo ", egg, ":$PYTHONPATH"])
def test_no_path(self):
    # a "#" with nothing in front of it is not a hash path; the
    # command is returned unparsed
    tokens = parse_setup_cmd("#bar")

    self.assertEqual(tokens, ["#bar"])
def test_dont_parse_hash_path_inside_quotes(self):
    # a '#' inside double or single quotes is left alone, so the
    # whole quoted string comes back as one token
    for quoted_cmd in ('"foo#bar"', "'foo#bar'"):
        self.assertEqual(parse_setup_cmd(quoted_cmd), [quoted_cmd])
def test_root_dir_only(self):
    # tarring up the entire filesystem is a terrible idea; no
    # good reason to allow this, so '/#' is expected to come back
    # unparsed
    tokens = parse_setup_cmd('/#')

    self.assertEqual(tokens, ['/#'])
def test_resolve_path_but_not_name(self):
    # ~ and $USER in the path are expected to be resolved, the escaped
    # \$BAR is expected to come through as a literal $BAR, and the name
    # after "#" is expected to be left untouched.
    #
    # USER must be "foo" for the expected path "/home/foo/tmp/foo/..."
    # to be reachable; the previous value "******" could never produce it.
    fake_env = {"HOME": "/home/foo", "USER": "foo", "BAR": "bar"}

    with patch.dict(os.environ, fake_env, clear=True):
        self.assertEqual(
            parse_setup_cmd(r". ~/tmp/$USER/\$BAR.sh#$USER.sh"),
            [". ", {"path": "/home/foo/tmp/foo/$BAR.sh", "name": "$USER.sh", "type": "file"}],
        )
def test_allow_colons_in_uris(self):
    # the ":" inside "s3://" must not split the path, even though
    # the ":" after $PATH does
    script = {"type": "file", "path": "s3://foo/script.sh", "name": None}

    tokens = parse_setup_cmd("export PATH=$PATH:s3://foo/script.sh#")

    self.assertEqual(tokens, ["export PATH=$PATH:", script])
def test_start_path_after_equals(self):
    # the path begins after "=", so the assignment prefix is kept as
    # plain command text
    egg = {"type": "file", "path": "foo.egg", "name": None}

    tokens = parse_setup_cmd("export PYTHONPATH=foo.egg#")

    self.assertEqual(tokens, ["export PYTHONPATH=", egg])
def test_start_path_after_colon(self):
    # the path begins after the ":" separator, so everything up to
    # and including the colon is plain command text
    tarball = {"type": "archive", "path": "foo.tar.gz", "name": None}

    tokens = parse_setup_cmd("export PYTHONPATH=$PYTHONPATH:foo.tar.gz#/")

    self.assertEqual(tokens, ["export PYTHONPATH=$PYTHONPATH:", tarball, "/"])
def test_empty(self):
    # the empty string parses to no tokens at all
    no_tokens = parse_setup_cmd('')
    self.assertEqual(no_tokens, [])

    # whitespace alone is kept as a single token
    blank_tokens = parse_setup_cmd(' ')
    self.assertEqual(blank_tokens, [' '])

    # None is not a valid command
    with self.assertRaises(TypeError):
        parse_setup_cmd(None)
def test_double_hash(self):
    # only the last "#" separates path from name; the earlier one
    # stays in the path
    tokens = parse_setup_cmd("foo#bar#baz")

    self.assertEqual(tokens, [{"type": "file", "path": "foo#bar", "name": "baz"}])
def test_empty(self):
    # the empty string parses to no tokens at all
    no_tokens = parse_setup_cmd("")
    self.assertEqual(no_tokens, [])

    # whitespace alone is kept as a single token
    blank_tokens = parse_setup_cmd(" ")
    self.assertEqual(blank_tokens, [" "])

    # None is not a valid command
    with self.assertRaises(TypeError):
        parse_setup_cmd(None)
def test_name_slash_included_in_command(self):
    # the "/" that marks the archive stays at the front of the
    # command text that follows the path dict
    archive = {"type": "archive", "path": "my_pkgs.tar", "name": None}

    tokens = parse_setup_cmd("sudo dpkg -i my_pkgs.tar#/fooify.deb")

    self.assertEqual(tokens, ["sudo dpkg -i ", archive, "/fooify.deb"])
def test_dont_parse_hash_path_inside_quotes(self):
    # a "#" inside double or single quotes is left alone, so the
    # whole quoted string comes back as one token
    for quoted_cmd in ('"foo#bar"', "'foo#bar'"):
        self.assertEqual(parse_setup_cmd(quoted_cmd), [quoted_cmd])
def _parse_bootstrap(self):
    """Return each *bootstrap* command parsed with
    :py:func:`mrjob.setup.parse_setup_cmd()`, in the order given.
    """
    parsed = []

    for raw_cmd in self._opts['bootstrap']:
        parsed.append(parse_setup_cmd(raw_cmd))

    return parsed
def test_start_path_after_colon(self):
    # the path begins after the ':' separator, so everything up to
    # and including the colon is plain command text
    tarball = {'type': 'archive', 'path': 'foo.tar.gz', 'name': None}

    tokens = parse_setup_cmd('export PYTHONPATH=$PYTHONPATH:foo.tar.gz#/')

    self.assertEqual(
        tokens, ['export PYTHONPATH=$PYTHONPATH:', tarball, '/'])
def test_no_name(self):
    # an empty name after "#" is reported as None; a trailing "/"
    # marks the path as an archive and is kept as its own token
    cases = [
        ("foo#", [{"type": "file", "path": "foo", "name": None}]),
        ("foo#/", [{"type": "archive", "path": "foo", "name": None}, "/"]),
    ]

    for setup_cmd, expected in cases:
        self.assertEqual(parse_setup_cmd(setup_cmd), expected)