Пример #1
0
def find_corpus_fileids(root, regexp):
    """Return the sorted fileids found under ``root`` that match ``regexp``.

    :param root: A ``PathPointer``.  ``ZipFilePathPointer`` and
        ``FileSystemPathPointer`` instances are handled.
    :param regexp: Regular-expression source.  ``"$"`` is appended and the
        result is applied with ``re.match`` to each candidate path.
    :return: A sorted list of matching paths.  For a zipfile these are
        member names with ``root.entry`` stripped from the front; for a
        directory they are "/"-joined paths built from
        ``_path_from(root.path, dirname)`` plus the file name.
    :raise TypeError: If ``root`` is not a ``PathPointer``.
    :raise AssertionError: If ``root`` is a ``PathPointer`` of a kind that
        is not handled here.
    """
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    # NOTE(review): with a top-level alternation such as "a|b" the appended
    # "$" binds only to the last alternative; callers should group
    # alternations themselves.  Behaviour intentionally left unchanged.
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        entry = root.entry
        # Only members located under ``entry`` belong to this root.  Without
        # the prefix check, unrelated members would be sliced at an
        # arbitrary offset and could be reported as bogus fileids.
        fileids = [
            name[len(entry) :]
            for name in root.zipfile.namelist()
            if name.startswith(entry) and not name.endswith("/")
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        # workaround for py25 which doesn't support followlinks
        kwargs = {}
        if not py25():
            kwargs = {"followlinks": True}
        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            items += [prefix + fileid for fileid in fileids if re.match(regexp, prefix + fileid)]
            # Don't visit svn directories: pruning ``subdirs`` in place
            # stops os.walk from descending into them.
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
Пример #2
0
def find_corpus_fileids(root, regexp):
    """List, in sorted order, the fileids below ``root`` matching ``regexp``.

    :param root: A ``PathPointer``; zipfile-backed and filesystem-backed
        pointers are supported.
    :param regexp: Regular-expression source.  A trailing ``'$'`` is
        appended before it is applied with ``re.match``.
    :return: Sorted list of matching fileids.  Zipfile members are reported
        with ``root.entry`` removed from the front; filesystem files are
        reported as '/'-joined paths built from
        ``_path_from(root.path, dirname)`` plus the file name.
    :raise TypeError: If ``root`` is not a ``PathPointer``.
    :raise AssertionError: If the kind of ``PathPointer`` is not handled.
    """
    if not isinstance(root, PathPointer):
        raise TypeError('find_corpus_fileids: expected a PathPointer')
    # NOTE(review): for an ungrouped alternation ('a|b') the '$' added here
    # anchors only the final alternative.  Left as-is for compatibility.
    regexp += '$'

    # Zipfile: walk the archive's namelist.
    if isinstance(root, ZipFilePathPointer):
        base = root.entry
        matched = []
        for name in root.zipfile.namelist():
            # Names ending in '/' are directories.  Names that do not start
            # with ``base`` live elsewhere in the archive; slicing them
            # would produce meaningless fileids, so skip them.
            if name.endswith('/') or not name.startswith(base):
                continue
            relative = name[len(base):]
            if re.match(regexp, relative):
                matched.append(relative)
        matched.sort()
        return matched

    # Directory: use os.walk to search all (proper or symlinked)
    # subdirectories, and match paths against the regexp.
    if isinstance(root, FileSystemPathPointer):
        # workaround for py25 which doesn't support followlinks
        walk_options = {} if py25() else {'followlinks': True}
        matched = []
        for dirname, subdirs, filenames in os.walk(root.path, **walk_options):
            prefix = ''.join('%s/' % part
                             for part in _path_from(root.path, dirname))
            for filename in filenames:
                candidate = prefix + filename
                if re.match(regexp, candidate):
                    matched.append(candidate)
            # Don't visit svn directories: removing the name from
            # ``subdirs`` in place keeps os.walk from descending into it.
            if '.svn' in subdirs:
                subdirs.remove('.svn')
        matched.sort()
        return matched

    raise AssertionError("Don't know how to handle %r" % root)