def _syslog_to_stderr_path(path):
    """Get the path/uri of the stderr log corresponding to the given syslog.

    If the syslog is gzipped (/path/to/syslog.gz), we'll expect
    stderr to be gzipped too (/path/to/stderr.gz).
    """
    stem, filename = posixpath.split(path)
    return posixpath.join(stem, 'stderr' + file_ext(filename))
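# These snippets all rely on a file_ext() helper that is not shown here.
# A minimal sketch, assuming behavior consistent with the tests further
# down (leading dots ignored, then everything from the first remaining
# dot onward returned). This is an illustrative reconstruction, not the
# original implementation:
def file_ext(filename):
    """Return the file extension, including the ``.``

    >>> file_ext('foo.tar.gz')
    '.tar.gz'
    """
    filename = filename.lstrip('.')
    dot_index = filename.find('.')
    if dot_index == -1:
        return ''
    return filename[dot_index:]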
def filter_path(path):
    filename = os.path.basename(path)
    return not (file_ext(filename).lower() in ('.pyc', '.pyo') or
                # filter out emacs backup files
                filename.endswith('~') or
                # filter out emacs lock files
                filename.startswith('.#') or
                # filter out MacFuse resource forks
                filename.startswith('._'))
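# For example (hypothetical paths, shown for illustration):
#
#   >>> filter_path('src/job.py')
#   True
#   >>> filter_path('src/job.pyc')
#   False
#   >>> filter_path('src/.#job.py')  # emacs lock file
#   False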
def _unarchive_file(self, path, dest):
    path = os.path.abspath(path)

    # figure out how to unarchive the file, based on its extension
    unarchive_args = HOW_TO_UNARCHIVE.get(file_ext(path))
    if not unarchive_args:
        raise ValueError("Don't know how to unarchive %s" % path)

    log.debug('unarchiving %s -> %s' % (path, dest))
    self.mkdir(dest)
    check_call(unarchive_args + [path], cwd=dest)
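# HOW_TO_UNARCHIVE is referenced above but not shown. A plausible sketch,
# assuming each value is an argv prefix that unpacks an archive into the
# current working directory (check_call() runs it with cwd=dest). The
# exact table is an assumption, not the original mapping:
HOW_TO_UNARCHIVE = {
    '.tar': ['tar', 'xf'],
    '.tar.gz': ['tar', 'xfz'],
    '.tgz': ['tar', 'xfz'],
    '.zip': ['unzip', '-o'],
}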
def name_uniquely(path, names_taken=(), proposed_name=None,
                  unhide=False, strip_ext=False, suffix=''):
    """Come up with a unique name for *path*.

    :param names_taken: a dictionary or set of names not to use.
    :param proposed_name: name to use if it is not taken. If this is not
                          set, we propose a name based on the filename.
    :param unhide: make sure final name doesn't start with periods or
                   underscores
    :param strip_ext: if we propose a name, it shouldn't have a file
                      extension
    :param suffix: if set to a string, add this to the end of any filename
                   we propose. Should include the ``.``.

    If the proposed name is taken, we add a number to the end of the
    filename, keeping the extension the same. For example:

    >>> name_uniquely('foo.txt', {'foo.txt'})
    'foo-1.txt'
    >>> name_uniquely('bar.tar.gz', {'bar'}, strip_ext=True)
    'bar-1'
    """
    filename = proposed_name or os.path.basename(path.rstrip('/' + os.sep))

    ext = file_ext(filename)
    prefix = filename[:-len(ext) or None]

    if strip_ext and not proposed_name:
        ext = ''

    if suffix and not proposed_name:
        ext += suffix

    if unhide:
        prefix = prefix.lstrip('.').lstrip('_')

    # is our proposed name taken?
    name = prefix + ext
    if prefix and name not in names_taken:
        return name

    # add 1, 2, etc. to the name until it's not taken
    for i in itertools.count(1):
        if prefix:
            name = '%s-%d%s' % (prefix, i, ext)
        else:
            # if no prefix is left (due to empty filename or unhiding),
            # just use numbers; don't start filenames with '-'
            name = '%d%s' % (i, ext)

        if name not in names_taken:
            return name
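# Two more cases, derived by tracing the code above: unhide strips leading
# dots/underscores from the proposed prefix, and suffix is appended to the
# proposed extension (hypothetical inputs, for illustration):
#
#   >>> name_uniquely('_hidden.txt', set(), unhide=True)
#   'hidden.txt'
#   >>> name_uniquely('log', set(), suffix='.txt')
#   'log.txt'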
def test_file_ext(self):
    self.assertEqual(file_ext('foo.zip'), '.zip')
    self.assertEqual(file_ext('foo.Z'), '.Z')
    self.assertEqual(file_ext('foo.tar.gz'), '.tar.gz')
    self.assertEqual(file_ext('README'), '')
    self.assertEqual(file_ext('README,v'), '')
    self.assertEqual(file_ext('README.txt,v'), '.txt,v')
def test_file_ext(self): self.assertEqual(file_ext("foo.zip"), ".zip") self.assertEqual(file_ext("foo.Z"), ".Z") self.assertEqual(file_ext("foo.tar.gz"), ".tar.gz") self.assertEqual(file_ext("README"), "") self.assertEqual(file_ext("README,v"), "") self.assertEqual(file_ext("README.txt,v"), ".txt,v")
def parse_doc_filename(input_uri):
    """Parse a filename like ``some_id-cat1-cat2-not_cat3.txt`` into
    ``dict(id='some_id', cats=dict(cat1=True, cat2=True, cat3=False))``
    """
    # get filename without extension
    name_with_ext = posixpath.basename(input_uri)
    name = name_with_ext[:-len(file_ext(name_with_ext))]

    parts = name.split('-')

    doc_id = parts[0]
    cats = {}
    for part in parts[1:]:
        if part.startswith('not_'):
            cats[part[4:]] = False
        else:
            cats[part] = True

    return dict(id=doc_id, cats=cats)
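# For example (hypothetical URI, shown for illustration):
#
#   >>> parse_doc_filename('docs/abc123-sports-not_politics.txt')
#   {'id': 'abc123', 'cats': {'sports': True, 'politics': False}}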
def _master_bootstrap_script_content(self, bootstrap):
    """Return a list containing the lines of the master bootstrap script.
    (without trailing newlines)
    """
    out = []

    # shebang, precommands
    out.extend(self._start_of_sh_script())
    out.append('')

    # for example, create a tmp dir and cd to it
    if self._bootstrap_pre_commands():
        out.extend(self._bootstrap_pre_commands())
        out.append('')

    # store $PWD
    out.append('# store $PWD')
    out.append('__mrjob_PWD=$PWD')
    out.append('')

    # special case for PWD being in /, which happens on Dataproc
    # (really we should cd to tmp or something)
    out.append('if [ $__mrjob_PWD = "/" ]; then')
    out.append('  __mrjob_PWD=""')
    out.append('fi')
    out.append('')

    # run commands in a block so we can redirect stdout to stderr
    # (e.g. to catch errors from compileall). See #370
    out.append('{')

    # download files
    out.append('  # download files and mark them executable')

    cp_to_local = self._cp_to_local_cmd()

    # TODO: why bother with $__mrjob_PWD here, since we're already in it?
    for name, path in sorted(
            self._bootstrap_dir_mgr.name_to_path('file').items()):
        uri = self._upload_mgr.uri(path)
        out.append('  %s %s $__mrjob_PWD/%s' % (
            cp_to_local, pipes.quote(uri), pipes.quote(name)))
        # imitate Hadoop Distributed Cache (see #1602)
        out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))

    out.append('')

    # download and unarchive archives
    archive_names_and_paths = sorted(
        self._bootstrap_dir_mgr.name_to_path('archive').items())
    if archive_names_and_paths:
        # make tmp dir if needed
        out.append('  # download and unpack archives')
        out.append('  __mrjob_TMP=$(mktemp -d)')
        out.append('')

        for name, path in archive_names_and_paths:
            uri = self._upload_mgr.uri(path)
            ext = file_ext(basename(path))

            # copy file to tmp dir
            quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)
            out.append('  %s %s %s' % (
                cp_to_local, pipes.quote(uri), quoted_archive_path))

            # unarchive file
            if ext not in _EXT_TO_UNARCHIVE_CMD:
                raise KeyError('unknown archive file extension: %s' % path)
            unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

            out.append('  ' + unarchive_cmd % dict(
                file=quoted_archive_path,
                dir='$__mrjob_PWD/' + pipes.quote(name)))

            # imitate Hadoop Distributed Cache (see #1602)
            out.append(
                '  chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name))

            out.append('')

    # run bootstrap commands
    out.append('  # bootstrap commands')
    for cmd in bootstrap:
        # reconstruct the command line, substituting $__mrjob_PWD/<name>
        # for path dicts
        line = '  '
        for token in cmd:
            if isinstance(token, dict):
                # it's a path dictionary
                line += '$__mrjob_PWD/'
                line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
            else:
                # it's raw script
                line += token
        out.append(line)

    out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

    return out
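# _EXT_TO_UNARCHIVE_CMD is referenced above but not shown. The code
# interpolates it with %(file)s and %(dir)s, so each value must be a shell
# command template taking those two keys. A minimal sketch; the exact
# commands are assumptions, only the template interface comes from the
# code above:
_EXT_TO_UNARCHIVE_CMD = {
    '.tar': 'mkdir %(dir)s; tar xf %(file)s -C %(dir)s',
    '.tar.gz': 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s',
    '.tgz': 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s',
    '.zip': 'mkdir %(dir)s; unzip %(file)s -d %(dir)s',
}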
def test_ignore_initial_dots(self):
    self.assertEqual(file_ext('.emacs'), '')
    self.assertEqual(file_ext('.mrjob.conf'), '.conf')
    self.assertEqual(file_ext('...dots.txt'), '.txt')