Example #1
    def assertEqualDirs(self, want, got, suffix=None, subset=False, filterdir="entries"):
        """Assert that two directory trees contains identical files

        :param want: The expected directory tree
        :type  want: str
        :param got: The actual directory tree
        :type  got: str
        :param suffix: If given, only check files ending in suffix (otherwise check all files)
        :type  suffix: str
        :param subset: If True, require only that files in want are a subset of files in got (otherwise require that the sets are identical)
        :type subset: bool
        :param filterdir: If given, don't compare the parts of the tree that start with filterdir
        :type  filterdir: str
        """
        wantfiles = [x[len(want) + 1:]
                     for x in util.list_dirs(want, suffix) if not x.startswith(want + os.sep + filterdir)]
        gotfiles = [x[len(got) + 1:]
                    for x in util.list_dirs(got, suffix) if not x.startswith(got + os.sep + filterdir)]
        self.maxDiff = None
        if subset:
            self.assertTrue(set(wantfiles).issubset(set(gotfiles)))
        else:
            self.assertEqual(wantfiles, gotfiles)  # or assertIn?
        for f in wantfiles:
            if not filecmp.cmp(os.path.join(want, f),
                               os.path.join(got, f),
                               shallow=False):
                self.assertEqual(util.readfile(os.path.join(want, f), mode="rb"),
                                 util.readfile(os.path.join(got, f), mode="rb"))
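A minimal usage sketch of the helper above, written as a unittest-style test method. The directory names and the self.datadir attribute are illustrative assumptions, not taken from the original code:

    def test_generated_output(self):
        # Compare only .html files, tolerate extra files in the generated tree
        # (subset=True), and skip the "entries" subtree (the default filterdir).
        wantdir = os.path.join(self.datadir, "want")
        gotdir = os.path.join(self.datadir, "generated")
        self.assertEqualDirs(wantdir, gotdir, suffix=".html", subset=True)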
Example #2
    def list_attachments(self, basefile, action, version=None):
        """Get all attachments for a basefile in a specified state

        :param action: The state (type of file) to look for (either
                       ``downloaded``, ``parsed`` or ``generated``). If
                       ``None``, look for all types.
        :type action: str
        :param basefile: The basefile to list attachments for
        :type  basefile: str
        :param version: The version of the basefile to list attachments for. If None, list attachments for the current version.
        :type  version: str
        :returns: All available attachments for the basefile 
        :rtype: generator
        """

        basedir = self.datadir
        pathfrag = self.basefile_to_pathfrag(basefile)
        if version:
            v_pathfrag = self.basefile_to_pathfrag(version)
            directory = os.sep.join((basedir, "archive", action, pathfrag, v_pathfrag))
        else:
            directory = os.sep.join((basedir, action, pathfrag))
        # FIXME: Similar map exists in list_basefiles_for and in other
        # places throughout the code. Should subclasses be able to
        # control suffixes beyond the simple self.downloaded_suffix
        # mechanism?
        suffixmap = {'downloaded': self.downloaded_suffix,
                     'parsed': '.xhtml',
                     'generated': '.html'}
        mainfile = "index" + suffixmap[action]
        for x in util.list_dirs(directory, reverse=False):
            # /datadir/base/downloaded/basefile/attachment.txt => attachment.txt
            x = x[len(directory) + 1:]
            if x != mainfile:
                yield x
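A hedged usage sketch of the method above; the store variable and the basefile value are assumptions for illustration only:

    # List every file stored alongside the main downloaded document.
    for attachment in store.list_attachments("2014:913", "downloaded"):
        print(attachment)  # e.g. "attachment.txt", relative to the basefile's directory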
Example #3
 def list_basefiles_for(self, action, basedir=None):
     if not basedir:
         basedir = self.datadir
     if action == "parse":
         # Note: This pulls everything into memory before first
         # value is yielded. A more nifty variant is at
         # http://code.activestate.com/recipes/491285/
         d = os.path.sep.join((basedir, "downloaded"))
         for x in sorted(itertools.chain(util.list_dirs(d, ".doc"),
                                         util.list_dirs(d, ".docx"))):
             suffix = os.path.splitext(x)[1]
             pathfrag = x[len(d) + 1:-len(suffix)]
             yield self.pathfrag_to_basefile(pathfrag)
     else:
         for x in super(DVStore, self).list_basefiles_for(action, basedir):
             yield x
Example #4
 def list_basefiles_for(self, action, basedir=None, force=True):
     if action == "parse":
         for x in util.list_dirs(self.staticdir, self.downloaded_suffixes[0]):
             pathfrag = x[len(self.staticdir) + 1:-len(self.downloaded_suffixes[0])]
             yield self.pathfrag_to_basefile(pathfrag)
     else:
         for x in super(StaticStore, self).list_basefiles_for(action, basedir, force):
             yield x
Example #5
 def list_basefiles_for(self, action, basedir=None, force=True):
     if action == "parse":
         for x in util.list_dirs(self.staticdir,
                                 self.downloaded_suffixes[0]):
             pathfrag = x[len(self.staticdir) +
                          1:-len(self.downloaded_suffixes[0])]
             yield self.pathfrag_to_basefile(pathfrag)
     else:
         for x in super(StaticStore,
                        self).list_basefiles_for(action, basedir, force):
             yield x
Example #6
 def test_listdirs(self):
     util.writefile(self.p("foo.txt"), "Hello")
     util.writefile(self.p("bar.txt"), "Hello")
     util.writefile(self.p("foo/2.txt"), "Hello")
     util.writefile(self.p("foo/10.txt"), "Hello")
     util.writefile(self.datadir+"/foo/baz.text", "Hello")
     generator = util.list_dirs(self.datadir, ".txt")
     self.assertEqual(self.p("bar.txt"), next(generator))
     self.assertEqual([self.p("foo.txt"),
                       self.p("foo/2.txt"),
                       self.p("foo/10.txt")], list(generator))
Example #7
    def extractdir(self, resourcedir, target, suffixes=None):
        """Extract all file resources contained in the specified
        resource directory to the target directory.
        
        Searches all loadpaths and optionally the Resources API for
        any file contained within. This means the target dir may end
        up with eg. one file from a high-priority path and other files
        from the system dirs/resources. This in turn makes it easy to
        just override a single file in a larger set of resource files.

        Even if the resourcedir might contain resources in
        subdirectories (eg "source/sub/dir/resource.xml"), the
        extraction will be to the top-level target directory (eg
        "target/resource.xml").

        """
        if not suffixes:
            suffixes = []
        extracted = set()
        for path in self.loadpath:
            if resourcedir and resourcedir != ".":
                path = path+os.sep+resourcedir
            if not os.path.exists(path):
                continue
            # for f in os.listdir(path):
            for f in util.list_dirs(path, suffixes):
                f = f[len(path)+1:]
                basef = os.path.basename(f)
                src = os.sep.join([path, f])
                dest = os.sep.join([target, basef])
                if dest not in extracted and os.path.isfile(src):
                    util.ensure_dir(dest)
                    shutil.copy2(src, dest)
                    extracted.add(dest)

        if self.use_pkg_resources:
            self._check_module_path()
            path = self.resourceprefix
            if resourcedir:
                path = path + os.sep + resourcedir
            for f in pkg_resources.resource_listdir(self.modulename, path):
                src = path + os.sep + f
                dest = target
                dest += os.sep + f
                if (dest not in extracted and not
                    pkg_resources.resource_isdir(self.modulename,
                                                 self.resourceprefix + os.sep + f)):
                    util.ensure_dir(dest)
                    with open(dest, "wb") as fp:
                        readfp = pkg_resources.resource_stream(self.modulename,
                                                               src)
                        fp.write(readfp.read())
                        readfp.close()
                    extracted.add(dest)
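A hedged usage sketch of extractdir; the loader variable and the target path are assumptions for illustration only:

    # Copy every *.xsl resource found on any loadpath (or bundled with the
    # package, if use_pkg_resources is set) from the "xsl" resource directory
    # into one flat target directory.
    loader.extractdir("xsl", "/tmp/extracted-xsl", suffixes=[".xsl"])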
Example #8
    def extractdir(self, resourcedir, target, suffixes=None):
        """Extract all file resources contained in the specified
        resource directory to the target directory.
        
        Searches all loadpaths and optionally the Resources API for
        any file contained within. This means the target dir may end
        up with eg. one file from a high-priority path and other files
        from the system dirs/resources. This in turn makes it easy to
        just override a single file in a larger set of resource files.

        Even if the resourcedir might contain resources in
        subdirectories (eg "source/sub/dir/resource.xml"), the
        extraction will be to the top-level target directory (eg
        "target/resource.xml").

        """
        if not suffixes:
            suffixes = []
        extracted = set()
        for path in self.loadpath:
            if resourcedir and resourcedir != ".":
                path = path + os.sep + resourcedir
            if not os.path.exists(path):
                continue
            # for f in os.listdir(path):
            for f in util.list_dirs(path, suffixes):
                f = f[len(path) + 1:]
                basef = os.path.basename(f)
                src = os.sep.join([path, f])
                dest = os.sep.join([target, basef])
                if dest not in extracted and os.path.isfile(src):
                    util.ensure_dir(dest)
                    shutil.copy2(src, dest)
                    extracted.add(dest)

        if self.use_pkg_resources:
            self._check_module_path()
            path = self.resourceprefix
            if resourcedir:
                path = path + os.sep + resourcedir
            for f in pkg_resources.resource_listdir(self.modulename, path):
                src = path + os.sep + f
                dest = target
                dest += os.sep + f
                if (dest not in extracted and not pkg_resources.resource_isdir(
                        self.modulename, self.resourceprefix + os.sep + f)):
                    util.ensure_dir(dest)
                    with open(dest, "wb") as fp:
                        readfp = pkg_resources.resource_stream(
                            self.modulename, src)
                        fp.write(readfp.read())
                        readfp.close()
                    extracted.add(dest)
Example #9
 def test_extractdir_newwd(self):
     dest = self.tempdir + os.sep + "dest"
     os.mkdir(dest)
     prevdir = os.getcwd()
     os.chdir(self.tempdir)
     if "FERENDA_HOME" not in os.environ:
         os.environ["FERENDA_HOME"] = prevdir
     try:
         self.repo.resourceloader.extractdir("xsl", dest)
         extracted = [x[len(dest) + 1:] for x in util.list_dirs(dest)]
         self.assertEqual(self.expected, set(extracted))
     finally:
         os.chdir(prevdir)
Example #10
    def assertEqualDirs(self, want, got, suffix=None, filterdir="entries"):
        """Assert that two directory trees contains identical files

        :param want: The expected directory tree
        :type  want: str
        :param got: The actual directory tree
        :type  got: str
        :param suffix: If given, only check files ending in suffix (otherwise check all files)
        :type  suffix: str
        :param filterdir: If given, don't compare the parts of the tree that start with filterdir
        :type  filterdir: str
        """
        wantfiles = [x[len(want) + 1:]
                     for x in util.list_dirs(want, suffix) if not x.startswith(want + os.sep + filterdir)]
        gotfiles = [x[len(got) + 1:]
                    for x in util.list_dirs(got, suffix) if not x.startswith(got + os.sep + filterdir)]
        self.maxDiff = None
        self.assertEqual(wantfiles, gotfiles)  # or assertIn?
        for f in gotfiles:
            self.assertTrue(filecmp.cmp(os.path.join(want, f),
                                        os.path.join(got, f),
                                        shallow=False))
Example #11
    def assertEqualDirs(self,
                        want,
                        got,
                        suffix=None,
                        subset=False,
                        filterdir="entries"):
        """Assert that two directory trees contains identical files

        :param want: The expected directory tree
        :type  want: str
        :param got: The actual directory tree
        :type  got: str
        :param suffix: If given, only check files ending in suffix (otherwise check all files)
        :type  suffix: str
        :param subset: If True, require only that files in want are a subset of files in got (otherwise require that the sets are identical)
        :type subset: bool
        :param filterdir: If given, don't compare the parts of the tree that start with filterdir
        :type  filterdir: str
        """
        wantfiles = [
            x[len(want) + 1:] for x in util.list_dirs(want, suffix)
            if not x.startswith(want + os.sep + filterdir)
        ]
        gotfiles = [
            x[len(got) + 1:] for x in util.list_dirs(got, suffix)
            if not x.startswith(got + os.sep + filterdir)
        ]
        self.maxDiff = None
        if subset:
            self.assertTrue(set(wantfiles).issubset(set(gotfiles)))
        else:
            self.assertEqual(wantfiles, gotfiles)  # or assertIn?
        for f in wantfiles:
            if not filecmp.cmp(os.path.join(want, f),
                               os.path.join(got, f),
                               shallow=False):
                self.assertEqual(
                    util.readfile(os.path.join(want, f), mode="rb"),
                    util.readfile(os.path.join(got, f), mode="rb"))
Example #12
 def test_extractdir_newwd(self):
     dest = self.tempdir + os.sep + "dest"
     os.mkdir(dest)
     prevdir = os.getcwd()
     os.chdir(self.tempdir)
     if "FERENDA_HOME" not in os.environ:
         os.environ["FERENDA_HOME"] = prevdir
     try:
         self.repo.resourceloader.extractdir("xsl", dest)
         extracted = [x[len(dest)+1:] for x in util.list_dirs(dest)]
         self.assertEqual(self.expected, set(extracted))
     finally:
         os.chdir(prevdir)
Example #13
    def list_versions(self, basefile, action=None):
        """Get all archived versions of a given basefile.

        :param basefile: The basefile to list archived versions for
        :type  basefile: str
        :param action: The type of file to look for (either
                       ``downloaded``, ``parsed`` or ``generated``). If
                       ``None``, look for all types.
        :type action: str
        :returns: All available versions for that basefile
        :rtype: generator
        """

        if action:
            assert action in (
                'downloaded', 'parsed', 'generated'), "Action %s invalid" % action
            actions = (action,)
        else:
            actions = ('downloaded', 'parsed', 'generated')

        basedir = self.datadir
        pathfrag = self.basefile_to_pathfrag(basefile)
        yielded_basefiles = []
        for action in actions:
            directory = os.sep.join((basedir, "archive",
                                     action, pathfrag))
            if not os.path.exists(directory):
                continue
            for x in util.list_dirs(directory, reverse=False):
                if os.path.exists(x):
                    # /datadir/base/archive/downloaded/basefile/version.html
                    # => version.html
                    x = x[len(directory) + 1:]
                    if self.storage_policy == "dir":
                        # version/index.html => version
                        x = os.sep.join(x.split(os.sep)[:-1])
                    else:
                        # version.html => version
                        x = os.path.splitext(x)[0]
                    if os.sep in x:
                        # we didn't find an archived file for
                        # basefile, instead we found an archived file
                        # for another basefile that startswith our
                        # basefile (eg '123' and '123/a', and we found
                        # '123/a/4.html')
                        continue
                    # print("Found file %r %r" % (x, self.pathfrag_to_basefile(x)))
                    basefile = self.pathfrag_to_basefile(x)
                    if basefile not in yielded_basefiles:
                        yielded_basefiles.append(basefile)
                        yield basefile
Example #14
    def list_versions(self, basefile, action=None):
        """Get all archived versions of a given basefile.

        :param basefile: The basefile to list archived versions for
        :type  basefile: str
        :param action: The type of file to look for (either
                       ``downloaded``, ``parsed`` or ``generated``). If
                       ``None``, look for all types.
        :type action: str
        :returns: All available versions for that basefile
        :rtype: generator
        """

        if action:
            assert action in ('downloaded', 'parsed',
                              'generated'), "Action %s invalid" % action
            actions = (action, )
        else:
            actions = ('downloaded', 'parsed', 'generated')

        basedir = self.datadir
        pathfrag = self.basefile_to_pathfrag(basefile)
        yielded_basefiles = []
        for action in actions:
            directory = os.sep.join((basedir, "archive", action, pathfrag))
            if not os.path.exists(directory):
                continue
            for x in util.list_dirs(directory, reverse=False):
                if os.path.exists(x):
                    # /datadir/base/archive/downloaded/basefile/version.html
                    # => version.html
                    x = x[len(directory) + 1:]
                    if self.storage_policy == "dir":
                        # version/index.html => version
                        x = os.sep.join(x.split(os.sep)[:-1])
                    else:
                        # version.html => version
                        x = os.path.splitext(x)[0]
                    if os.sep in x:
                        # we didn't find an archived file for
                        # basefile, instead we found an archived file
                        # for another basefile that startswith our
                        # basefile (eg '123' and '123/a', and we found
                        # '123/a/4.html')
                        continue
                    # print("Found file %r %r" % (x, self.pathfrag_to_basefile(x)))
                    basefile = self.pathfrag_to_basefile(x)
                    if basefile not in yielded_basefiles:
                        yielded_basefiles.append(basefile)
                        yield basefile
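A hedged usage sketch of list_versions; the store variable and the basefile value are assumptions for illustration only:

    # Enumerate the archived versions of one document, looking only at
    # downloaded files.
    for version in store.list_versions("1962:700", action="downloaded"):
        print(version)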
Example #15
def load_files(path, graph=None):
    # loads all the n3 files found under path into a graph
    if graph is None:
        graph = rdflib.Graph()
    if os.path.isfile(path):
        return load_file(path, graph)
    elif os.path.isdir(path):
        print("loading all n3 files in %s" % path)
        for f in util.list_dirs(path, suffix=".n3"):
            # FIXME: ugly hack to avoid reading one particular n3 file
            if f.endswith("sources.n3"):
                continue
            load_file(f, graph)
        return graph
    else:
        print("ERROR: can't load %s" % path)
Example #16
    def list_attachments(self, basefile, action, version=None):
        """Get all attachments for a basefile in a specified state

        :param action: The state (type of file) to look for (either
                       ``downloaded``, ``parsed`` or ``generated``). If
                       ``None``, look for all types.
        :type action: str
        :param basefile: The basefile to list attachments for
        :type  basefile: str
        :param version: The version of the basefile to list attachments for. If None, list attachments for the current version.
        :type  version: str
        :returns: All available attachments for the basefile
        :rtype: generator
        """
        if self.storage_policy != "dir":
            raise errors.AttachmentPolicyError(
                "Can't list attachments if storage_policy != 'dir'")

        basedir = self.datadir
        # pathfrag = self.pathfrag_to_basefile(basefile) # that can't be right?
        pathfrag = self.basefile_to_pathfrag(basefile)
        if version:
            v_pathfrag = self.basefile_to_pathfrag(version)
            directory = os.sep.join((basedir, "archive", action, pathfrag,
                                     ".versions", v_pathfrag))
        else:
            directory = os.sep.join((basedir, action, pathfrag))
        # FIXME: Similar map exists in list_basefiles_for and in other
        # places throughout the code. Should subclasses be able to
        # control suffixes beyond the simple self.downloaded_suffix
        # mechanism?
        suffixmap = {
            'downloaded': self.downloaded_suffixes,
            'parsed': ['.xhtml'],
            'generated': ['.html']
        }
        mainfiles = ["index" + s for s in suffixmap[action]]
        for x in util.list_dirs(directory, reverse=False):
            # /datadir/base/downloaded/basefile/attachment.txt => attachment.txt
            x = x[len(directory) + 1:]
            if x not in mainfiles:
                if not [
                        suffix for suffix in self.invalid_suffixes
                        if x.endswith(suffix)
                ]:
                    yield x
Example #17
 def add_downloaded_files(filelist, spec, url):
     downloaddir = os.sep.join(
         [self.datadir, self.repoclass.alias, "downloaded"])
     for f in list(util.list_dirs(downloaddir)):
         if f.endswith(".etag"):
             continue  # FIXME: this is ugly
         if f not in filelist:
             # print("Fetching %s resulted in downloaded file %s" % (url, f))
             filelist.append(f)
             expect = "downloaded" + f.replace(downloaddir, "")
             if os.sep != "/":
                 expect = expect.replace(os.sep, "/")
             spec[url]['expect'] = expect
             reldest = os.path.relpath(".." + os.sep + "downloaded",
                                       os.path.dirname(f))
             dest = os.path.normpath(
                 os.path.join(os.path.dirname(specfile), reldest))
             util.ensure_dir(dest)
             shutil.copy2(f, dest)
Example #18
    def list_attachments(self, basefile, action, version=None):
        """Get all attachments for a basefile in a specified state

        :param action: The state (type of file) to look for (either
                       ``downloaded``, ``parsed`` or ``generated``). If
                       ``None``, look for all types.
        :type action: str
        :param basefile: The basefile to list attachments for
        :type  basefile: str
        :param version: The version of the basefile to list attachments for. If None, list attachments for the current version.
        :type  version: str
        :returns: All available attachments for the basefile
        :rtype: generator
        """
        if self.storage_policy != "dir":
            raise errors.AttachmentPolicyError(
                "Can't list attachments if storage_policy != 'dir'")

        basedir = self.datadir
        # pathfrag = self.pathfrag_to_basefile(basefile) # that can't be right?
        pathfrag = self.basefile_to_pathfrag(basefile)
        if version:
            v_pathfrag = self.basefile_to_pathfrag(version)
            directory = os.sep.join((basedir, "archive", action, pathfrag, v_pathfrag))
        else:
            directory = os.sep.join((basedir, action, pathfrag))
        # FIXME: Similar map exists in list_basefiles_for and in other
        # places throughout the code. Should subclasses be able to
        # control suffixes beyond the simple self.downloaded_suffix
        # mechanism?
        suffixmap = {'downloaded': self.downloaded_suffixes,
                     'parsed': ['.xhtml'],
                     'generated': ['.html']}
        mainfiles = ["index" + s for s in suffixmap[action]]
        for x in util.list_dirs(directory, reverse=False):
            # /datadir/base/downloaded/basefile/attachment.txt => attachment.txt
            x = x[len(directory) + 1:]
            if x not in mainfiles:
                if not [suffix for suffix in self.invalid_suffixes if x.endswith(suffix)]:
                    yield x
Example #19
 def add_downloaded_files(filelist, spec, url):
     downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                "downloaded"])
     for f in list(util.list_dirs(downloaddir)):
         if f.endswith(".etag"):
             continue  # FIXME: this is ugly
         if f not in filelist:
             # print("Fetching %s resulted in downloaded file %s" % (url, f))
             filelist.append(f)
             expect = "downloaded" + f.replace(downloaddir, "")
             if os.sep != "/":
                 expect = expect.replace(os.sep, "/")
             spec[url]['expect'] = expect
             reldest = os.path.relpath(
                 ".." +
                 os.sep +
                 "downloaded",
                 os.path.dirname(f))
             dest = os.path.normpath(
                 os.path.join(
                     os.path.dirname(specfile),
                     reldest))
             util.ensure_dir(dest)
             shutil.copy2(f, dest)
Example #20
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if not f.startswith("downloaded/sfs"):  # sfst or sfsr
                continue
            for regex in self.templ:
                m = re.match(regex, f)
                if not m:
                    continue
                if "vcheck" in m.groupdict():  # silently ignore
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version
                # text = t.extractfile(f).read(4000).decode("latin-1")
                text = open(f, "rb").read(4000).decode("latin-1")
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    version = None
                    current += 1
                    de = DocumentEntry()
                    de.basefile = basefile
                    de.id = self.canonical_uri(basefile, updated_to)
                    # fudge timestamps best as we can
                    de.orig_created = datetime.fromtimestamp(
                        os.path.getctime(f))
                    de.orig_updated = datetime.fromtimestamp(
                        os.path.getmtime(f))
                    de.orig_updated = datetime.now()
                    de.orig_url = self.document_url_template % locals()
                    de.published = datetime.now()
                    de.url = self.generated_url(basefile)
                    de.title = "SFS %s" % basefile
                    # de.set_content()
                    # de.set_link()
                    de.save(self.store.documententry_path(basefile))
                # this yields more reasonable basefiles, but they are not
                # backwards compatible -- skip them for now
                # basefile = basefile.replace("_", "").replace(".", "")
                if "type" in m.groupdict() and m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile)
                    current -= 1  # to offset the previous increment
                else:
                    dest = self.store.downloaded_path(basefile, version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions" %
            (current, archived))
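A hedged usage sketch of importarchive; the repo variable and the archive path are assumptions for illustration only:

    # Import legacy lagen.nu downloads, creating proper archive storage for
    # older versions of each text.
    repo.importarchive("/path/to/legacy/lagen.nu/data")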
Example #21
    def list_basefiles_for(self, action, basedir=None, force=True):
        """Get all available basefiles that can be used for the
        specified action.

        :param action: The action for which to get available
                       basefiles (``parse``, ``relate``, ``generate``
                       or ``news``)
        :type action: str
        :param basedir: The base directory in which to search for
                        available files. If not provided, defaults to
                        ``self.datadir``.
        :type basedir: str
        :returns: All available basefiles
        :rtype: generator
        """
        def prepend_index(suffixes):
            prepend = self.storage_policy == "dir"
            # If each document is stored in a separate directory
            # (storage_policy = "dir"), there are usually other
            # auxiliary files (attachments and whatnot) in that
            # directory as well. Make sure we only yield a single file
            # from each directory. By convention, the main file is
            # called index.html, index.pdf or whatever.
            return [os.sep + "index" + s if prepend else s for s in suffixes]

        if not basedir:
            basedir = self.datadir
        directory = None
        if action == "parse":
            directory = os.path.sep.join((basedir, "downloaded"))
            suffixes = prepend_index(self.downloaded_suffixes)
        elif action == "relate":
            directory = os.path.sep.join((basedir, "distilled"))
            suffixes = [".rdf"]
        elif action == "generate":
            directory = os.path.sep.join((basedir, "parsed"))
            suffixes = prepend_index([".xhtml"])
        elif action == "news":
            directory = os.path.sep.join((basedir, "entries"))
            suffixes = [".json"]
        # FIXME: fake action, needed for get_status. replace with
        # something more elegant
        elif action in ("_postgenerate"):
            directory = os.path.sep.join((basedir, "generated"))
            suffixes = [".html"]

        if not directory:
            raise ValueError("No directory calculated for action %s" % action)

        if not os.path.exists(directory):
            return

        # if we have information about how long each basefile took the
        # last time, use that to yield the most demanding basefiles
        # first. This improves throughput when processing files in
        # parallel
        durations_path = self.path(".durations",
                                   "entries",
                                   ".json",
                                   storage_policy="file")
        durations = {}
        if os.path.exists(durations_path):
            with open(durations_path) as fp:
                d = json.load(fp)
                if action in d:
                    durations = d[action]
        yielded_paths = set()
        for basefile, duration in sorted(durations.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True):
            if duration == -1 and not force:
                # Skip files that will raise DocumentRemovedError ?
                pass
            elif not force and not self.needed(basefile, action):
                # Skip files for which no action will be performed
                pass
            else:
                # make sure the underlying file really still exists
                path = None
                if action == "parse":
                    path = self.downloaded_path(basefile)
                elif action == "relate":
                    path = self.distilled_path(basefile)
                elif action == "generate":
                    path = self.parsed_path(basefile)
                if os.path.exists(path):
                    yielded_paths.add(path)
                    yield basefile

        for x in util.list_dirs(directory, suffixes, reverse=True):
            # ignore empty files placed by download (which may
            # have done that in order to avoid trying to
            # re-download nonexistent resources)
            if x in yielded_paths:
                continue
            if os.path.exists(x) and os.path.getsize(x) > 0 and not x.endswith(
                (".root.json", ".durations.json")):
                # get a pathfrag from full path
                # suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1
                suffixlen = 0
                for s in suffixes:
                    if x.endswith(s):
                        suffixlen = len(s)
                        break
                else:
                    raise ValueError(
                        "%s doesn't end with a valid suffix (%s)" %
                        (x, ", ".join(suffixes)))
                x = x[len(directory) + 1:-suffixlen]
                yield self.pathfrag_to_basefile(x)
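A hedged usage sketch of list_basefiles_for; the repo object and its parse method are assumptions for illustration only:

    # Drive parsing from the store: basefiles with recorded durations are
    # yielded first (slowest first), followed by the remaining files on disk.
    for basefile in repo.store.list_basefiles_for("parse"):
        repo.parse(basefile)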
Example #22
    def download_test(self, specfile, basefile=None):
        """This test is run for each json file found in docroot/source."""
        # this function can run in normal test mode or in
        # FERENDA_SET_TESTFILE mode. In the latter, all the normal download
        # code, including net access, is run. Calls to requests.get
        # are intercepted and notes are made of which URLs are
        # requested, and if this results in files on disk. The end
        # result is a JSON file and a set of cached files, all placed under
        # "source/"
        def add_downloaded_files(filelist, spec, url):
            downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                       "downloaded"])
            for f in list(util.list_dirs(downloaddir)):
                if f.endswith(".etag"):
                    continue  # FIXME: this is ugly
                if f not in filelist:
                    # print("Fetching %s resulted in downloaded file %s" % (url, f))
                    filelist.append(f)
                    expect = "downloaded" + f.replace(downloaddir, "")
                    if os.sep != "/":
                        expect = expect.replace(os.sep, "/")
                    spec[url]['expect'] = expect
                    reldest = os.path.relpath(
                        ".." +
                        os.sep +
                        "downloaded",
                        os.path.dirname(f))
                    dest = os.path.normpath(
                        os.path.join(
                            os.path.dirname(specfile),
                            reldest))
                    util.ensure_dir(dest)
                    shutil.copy2(f, dest)

        with codecs.open(specfile, encoding="utf-8") as fp:
            spec = json.load(fp)
        for k in list(spec.keys()):
            # NB: This exposes the encoded, possibly non-ascii, values
            # of the URL as byte strings. The encoding of these is
            # unknown (and we cannot generally assume UTF-8). Let's see
            # if this bites us.
            nk = unquote(k)
            if k != nk:
                spec[nk] = spec[k]
                del spec[k]

            # process the special '@settings' key (FIXME: didn't I already
            # implement this somewhere else?)
            #
            # a @settings like this:
            #     "@settings": {
            # 	"config": {"next_sfsnr": "2014:913"}
            #     },
            #
            # will have the effect of this:
            #
            # self.repo.config.next_sfsnr = "2014:913"
            if '@settings' in spec:
                for attribute in spec['@settings']:
                    if isinstance(spec['@settings'][attribute], dict):
                        thing = getattr(self.repo, attribute)
                        for key, value in spec['@settings'][attribute].items():
                            setattr(thing, key, value)
                    else:
                        setattr(self.repo, attribute,
                                spec['@settings'][attribute])

        if os.environ.get("FERENDA_SET_TESTFILE"):
            downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                       "downloaded"])
            state = {'downloaded':  list(util.list_dirs(downloaddir)),
                     'previous_url': None,
                     'requests': 0}
            try:
                rc = int(os.environ.get("FERENDA_SET_TESTFILE"))
                state['total_requests'] = rc
            except (ValueError, TypeError):
                state['total_requests'] = 2  # search page, single payload

            def callback(req):
                # clean up after last callback
                add_downloaded_files(state['downloaded'], spec, state['previous_url'])
                if state['requests'] == state['total_requests']:
                    raise MaxDownloadsReached()
                # make a real requests call somehow
                responses.stop()
                # when testing this testing function
                # (testTestutil.RepoTester.test_download_setfile) we
                # still want to disable responses, but we don't want
                # to make an actual HTTP call. Detect if we are
                # running that test by examining the stack, and if so,
                # mock the requests.get call in a different way.
                frames = [f for f in inspect.stack() if f[3] == "test_download_setfile"]
                if frames:
                    frame = frames[0][0]
                    resp = frame.f_locals['self']._myget(req.url)
                else:
                    resp = requests.get(req.url)
                responses.start()
                # create a filename. use .html as suffix unless we
                # should use something else
                contenttype = resp.headers["Content-type"]
                stem = os.path.splitext(specfile)[0]
                suffix = {'application/pdf': 'pdf',
                          'application/json': 'json',
                          'text/plain': 'txt'}.get(contenttype, "html")
                outfile = "%s-%s.%s" % (stem, state['requests'], suffix)
                with open(outfile, "wb") as fp:
                    fp.write(resp.content)

                if not frames and os.environ.get("TRAVIS") != "true":
                    if suffix == "html":
                        print(
                            "requested %s, saved as %s. Edit if needed, then press enter" %
                            (req.url, outfile))
                        x = input()
                    else:
                        print("requested %s, saved %s" % (req.url, outfile))

                with open(outfile, "rb") as fp:
                    content = fp.read()
                spec[req.url] = {'file': os.path.basename(outfile)}
                if resp.encoding != 'utf-8':
                    spec[req.url]['encoding'] = resp.encoding

                state['requests'] += 1
                state['previous_url'] = req.url
                return (resp.status_code, resp.headers, content)
        else:
            def callback(req):
                headers = {'Content-type': 'text/html'}
                try:
                    # normalize req.url. req.url might be a (byte)str
                    # but keys in spec will be (and should be)
                    # unicode. Assume that req.url is all ascii
                    if isinstance(req.url, bytes):
                        url = req.url.decode()
                    else:
                        url = req.url
                    urlspec = spec[unquote(url)]
                    if isinstance(urlspec, str):
                        urlspec = {'file': urlspec}
                    url_location = os.path.join(os.path.dirname(specfile),
                                                urlspec['file'])
                    # load the .content property
                    with open(url_location, "rb") as fp:
                        content = fp.read()
                    return (200, headers, content)
                except KeyError:
                    return (404, headers, "Not found")
        responses.add_callback(responses.GET,
                               re.compile("(.*)"),
                               callback)
        # PERFORM THE TEST
        try:
            self.repo.download(basefile)
        except MaxDownloadsReached:
            pass

        if os.environ.get("FERENDA_SET_TESTFILE"):
            # process final file and save specfile
            add_downloaded_files(state['downloaded'], spec,
                                 state['previous_url'])
            with open(specfile, "w") as fp:
                j = json.dumps(spec, indent=4, separators=(', ', ': '))
                fp.write(j)

        # organize a temporary copy of files that we can compare our results to
        wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
        expected = False
        for url in spec:
            if url == "@settings":
                continue
            if "expect" in spec[url]:
                expected = True
                sourcefile = os.path.join(os.path.dirname(specfile),
                                          spec[url]['file'])
                wantfile = "%s/%s" % (wantdir, spec[url]['expect'])

                util.copy_if_different(sourcefile, wantfile)
        if expected:
            self.assertEqualDirs(wantdir,
                                 "%s/%s" % (self.datadir,
                                            self.repoclass.alias),
                                 subset=True)
        else:
            # the test doesn't actually result in any downloaded file
            if hasattr(self.repo, 'expect') and self.repo.expect is False:
                pass
            else:
                self.fail('No files were marked as "expect" in specfile %s' %
                          specfile)
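As the comments above describe, setting the FERENDA_SET_TESTFILE environment variable switches this test into recording mode. A hedged sketch of enabling it programmatically; the value "2" mirrors the default of one search page plus one payload:

    import os
    os.environ["FERENDA_SET_TESTFILE"] = "2"  # allow two intercepted requests, then stop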
Example #23
    def download_test(self, specfile, basefile=None):
        """This test is run for each json file found in docroot/source."""

        # this function can run in normal test mode or in
        # FERENDA_SET_TESTFILE mode. In the latter, all the normal download
        # code, including net access, is run. Calls to requests.get
        # are intercepted and notes are made of which URLs are
        # requested, and if this results in files on disk. The end
        # result is a JSON file and a set of cached files, all placed under
        # "source/"
        def add_downloaded_files(filelist, spec, url):
            downloaddir = os.sep.join(
                [self.datadir, self.repoclass.alias, "downloaded"])
            for f in list(util.list_dirs(downloaddir)):
                if f.endswith(".etag"):
                    continue  # FIXME: this is ugly
                if f not in filelist:
                    # print("Fetching %s resulted in downloaded file %s" % (url, f))
                    filelist.append(f)
                    expect = "downloaded" + f.replace(downloaddir, "")
                    if os.sep != "/":
                        expect = expect.replace(os.sep, "/")
                    spec[url]['expect'] = expect
                    reldest = os.path.relpath(".." + os.sep + "downloaded",
                                              os.path.dirname(f))
                    dest = os.path.normpath(
                        os.path.join(os.path.dirname(specfile), reldest))
                    util.ensure_dir(dest)
                    shutil.copy2(f, dest)

        with codecs.open(specfile, encoding="utf-8") as fp:
            spec = json.load(fp)
        for k in list(spec.keys()):
            # NB: This exposes the encoded, possibly non-ascii, values
            # of the URL as byte strings. The encoding of these is
            # unknown (and we cannot generally assume UTF-8). Let's see
            # if this bites us.
            nk = unquote(k)
            if k != nk:
                spec[nk] = spec[k]
                del spec[k]

            # process the special '@settings' key (FIXME: didn't I already
            # implement this somewhere else?)
            #
            # a @settings like this:
            #     "@settings": {
            # 	"config": {"next_sfsnr": "2014:913"}
            #     },
            #
            # will have the effect of this:
            #
            # self.repo.config.next_sfsnr = "2014:913"
            if '@settings' in spec:
                for attribute in spec['@settings']:
                    if isinstance(spec['@settings'][attribute], dict):
                        thing = getattr(self.repo, attribute)
                        for key, value in spec['@settings'][attribute].items():
                            setattr(thing, key, value)
                    else:
                        setattr(self.repo, attribute,
                                spec['@settings'][attribute])

        if os.environ.get("FERENDA_SET_TESTFILE"):
            downloaddir = os.sep.join(
                [self.datadir, self.repoclass.alias, "downloaded"])
            state = {
                'downloaded': list(util.list_dirs(downloaddir)),
                'previous_url': None,
                'requests': 0
            }
            try:
                rc = int(os.environ.get("FERENDA_SET_TESTFILE"))
                state['total_requests'] = rc
            except (ValueError, TypeError):
                state['total_requests'] = 2  # search page, single payload

            def callback(req):
                # clean up after last callback
                add_downloaded_files(state['downloaded'], spec,
                                     state['previous_url'])
                if state['requests'] == state['total_requests']:
                    raise MaxDownloadsReached()
                # make a real requests call somehow
                responses.stop()
                # when testing this testing function
                # (testTestutil.RepoTester.test_download_setfile) we
                # still want to disable responses, but we don't want
                # to make an actual HTTP call. Detect if we are
                # running that test by examining the stack, and if so,
                # mock the requests.get call in a different way.
                frames = [
                    f for f in inspect.stack()
                    if f[3] == "test_download_setfile"
                ]
                if frames:
                    frame = frames[0][0]
                    resp = frame.f_locals['self']._myget(req.url)
                else:
                    resp = requests.get(req.url)
                responses.start()
                # create a filename. use .html as suffix unless we
                # should use something else
                contenttype = resp.headers["Content-type"]
                stem = os.path.splitext(specfile)[0]
                suffix = {
                    'application/pdf': 'pdf',
                    'application/json': 'json',
                    'text/plain': 'txt'
                }.get(contenttype, "html")
                outfile = "%s-%s.%s" % (stem, state['requests'], suffix)
                with open(outfile, "wb") as fp:
                    fp.write(resp.content)

                if not frames and os.environ.get("TRAVIS") != "true":
                    if suffix == "html":
                        print(
                            "requested %s, saved as %s. Edit if needed, then press enter"
                            % (req.url, outfile))
                        x = input()
                    else:
                        print("requested %s, saved %s" % (req.url, outfile))

                with open(outfile, "rb") as fp:
                    content = fp.read()
                spec[req.url] = {'file': os.path.basename(outfile)}
                if resp.encoding != 'utf-8':
                    spec[req.url]['encoding'] = resp.encoding

                state['requests'] += 1
                state['previous_url'] = req.url
                return (resp.status_code, resp.headers, content)
        else:

            def callback(req):
                headers = {'Content-type': 'text/html'}
                try:
                    # normalize req.url. req.url might be a (byte)str
                    # but keys in spec will be (and should be)
                    # unicode. Assume that req.url is all ascii
                    if isinstance(req.url, bytes):
                        url = req.url.decode()
                    else:
                        url = req.url
                    urlspec = spec[unquote(url)]
                    if isinstance(urlspec, str):
                        urlspec = {'file': urlspec}
                    url_location = os.path.join(os.path.dirname(specfile),
                                                urlspec['file'])
                    # load the .content property
                    with open(url_location, "rb") as fp:
                        content = fp.read()
                    return (200, headers, content)
                except KeyError:
                    return (404, headers, "Not found")

        responses.add_callback(responses.GET, re.compile("(.*)"), callback)
        # PERFORM THE TEST
        try:
            self.repo.download(basefile)
        except MaxDownloadsReached:
            pass

        if os.environ.get("FERENDA_SET_TESTFILE"):
            # process final file and save specfile
            add_downloaded_files(state['downloaded'], spec,
                                 state['previous_url'])
            with open(specfile, "w") as fp:
                j = json.dumps(spec, indent=4, separators=(', ', ': '))
                fp.write(j)

        # organize a temporary copy of files that we can compare our results to
        wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
        expected = False
        for url in spec:
            if url == "@settings":
                continue
            if "expect" in spec[url]:
                expected = True
                sourcefile = os.path.join(os.path.dirname(specfile),
                                          spec[url]['file'])
                wantfile = "%s/%s" % (wantdir, spec[url]['expect'])

                util.copy_if_different(sourcefile, wantfile)
        if expected:
            self.assertEqualDirs(wantdir,
                                 "%s/%s" %
                                 (self.datadir, self.repoclass.alias),
                                 subset=True)
        else:
            # the test doesn't actually result in any downloaded file
            if hasattr(self.repo, 'expect') and self.repo.expect is False:
                pass
            else:
                self.fail('No files were marked as "expect" in specfile %s' %
                          specfile)
Example #24
    def list_basefiles_for(self, action, basedir=None, force=True):
        """Get all available basefiles that can be used for the
        specified action.

        :param action: The action for which to get available
                       basefiles (``parse``, ``relate``, ``generate``
                       or ``news``)
        :type action: str
        :param basedir: The base directory in which to search for
                        available files. If not provided, defaults to
                        ``self.datadir``.
        :type basedir: str
        :returns: All available basefiles
        :rtype: generator
        """
        def prepend_index(suffixes):
            prepend = self.storage_policy == "dir"
            # If each document is stored in a separate directory
            # (storage_policy = "dir"), there are usually other
            # auxiliary files (attachments and whatnot) in that
            # directory as well. Make sure we only yield a single file
            # from each directory. By convention, the main file is
            # called index.html, index.pdf or whatever.
            return [os.sep + "index" + s if prepend else s for s in suffixes]

        def trim_documententry(basefile):
            # if the path (typically for the distilled or
            # parsed file) is a 0-size file, the following
            # steps should not be carried out. But since
            # they at some point might have done that
            # anyway, we're left with a bunch of stale
            # error reports in the entry files. As a
            # one-time-thing, try to blank out irrelevant
            # sections.
            entry = DocumentEntry(self.documententry_path(basefile))
            sections = {'parse': ['parse', 'relate', 'generate'],
                        'relate': ['relate', 'generate'],
                        'generate': ['generate']}.get(action, {})
            for section in sections:
                if section in entry.status:
                    del entry.status[section]
            entry.save()
        
        if not basedir:
            basedir = self.datadir
        directory = None
        if action == "parse":
            directory = os.path.sep.join((basedir, "downloaded"))
            suffixes = prepend_index(self.downloaded_suffixes)
        elif action == "relate":
            directory = os.path.sep.join((basedir, "distilled"))
            suffixes = [".rdf"]
        elif action == "generate":
            directory = os.path.sep.join((basedir, "parsed"))
            suffixes = prepend_index([".xhtml"])
        elif action == "news":
            directory = os.path.sep.join((basedir, "entries"))
            suffixes = [".json"]
        # FIXME: _postgenerate is a fake action, needed for
        # get_status. Maybe we can replace it with transformlinks now?
        elif action in ("_postgenerate", "transformlinks"):
            directory = os.path.sep.join((basedir, "generated"))
            suffixes = prepend_index([".html"])

        if not directory:
            raise ValueError("No directory calculated for action %s" % action)

        if not os.path.exists(directory):
            return

        # if we have information about how long each basefile took the
        # last time, use that to yield the most demanding basefiles
        # first. This improves throughput when processing files in
        # parallel. Note: .durations.json is only created by
        # devel.statusreport
        durations_path = self.path(".durations", "entries", ".json", storage_policy="file")
        durations = {}
        if os.path.exists(durations_path):
            with open(durations_path) as fp:
                d = json.load(fp)
                if action in d:
                    durations = d[action]
        yielded_paths = set()
        for basefile, duration in sorted(durations.items(), key=operator.itemgetter(1), reverse=True):
            if duration == -1 and not force:
                # Skip files that will raise DocumentRemovedError ?
                pass
            elif not force and not self.needed(basefile, action):
                # Skip files for which no action will be performed
                pass
            else:
                # make sure the underlying file really still exists
                path = None
                intermediate_path = False
                if action == "parse":
                    path = self.downloaded_path(basefile)
                    intermediate_path = os.path.exists(self.intermediate_path(basefile))
                elif action == "relate":
                    path = self.distilled_path(basefile)
                elif action == "generate":
                    path = self.parsed_path(basefile)
                if os.path.exists(path):
                    yielded_paths.add(path)
                    if os.path.getsize(path) > 0 or intermediate_path:
                        yield basefile
                    else:
                        trim_documententry(basefile)
        
        for x in util.list_dirs(directory, suffixes, reverse=True):
            if x in yielded_paths:
                continue
            if not os.path.exists(x) or x.endswith((".root.json", ".durations.json")):
                continue
            # get a pathfrag from full path
            # suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1
            suffixlen = 0
            for s in suffixes:
                if x.endswith(s):
                    suffixlen = len(s)
                    break
            else:
                raise ValueError("%s doesn't end with a valid suffix (%s)" % (x, ", ".join(suffixes)))
            pathfrag = x[len(directory) + 1:-suffixlen]
            basefile = self.pathfrag_to_basefile(pathfrag)
            # ignore empty files placed by download (which may have
            # done that in order to avoid trying to re-download
            # nonexistent resources) -- but not if there is a viable
            # intermediate file (dv.py creates empty files in download
            # but contentful files in intermediate, when splitting a
            # large doc over multiple basefiles).
            intermediate_path = False
            if action == "parse":
                intermediate_path = os.path.exists(self.intermediate_path(basefile))
            if os.path.getsize(x) > 0 or intermediate_path:
                yield basefile
            elif action in ("relate", "generate"):
                trim_documententry(basefile)
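
A note on the duration-ordered yielding above: it assumes a .durations.json file keyed first by action and then by basefile, holding the number of seconds the previous run took (the code comments suggest -1 marks a basefile that raised DocumentRemovedError). A minimal sketch of such a file, with illustrative paths, basefiles and timings (the real file is produced by devel.statusreport, not by code like this):

import json
import os

os.makedirs("data/base/entries", exist_ok=True)       # illustrative datadir layout
durations = {
    "parse": {
        "1998:204": 42.7,   # seconds spent last run; slowest basefiles are yielded first
        "1962:700": 3.1,
        "1999:175": -1,     # presumed marker for a basefile that raised DocumentRemovedError
    }
}
with open("data/base/entries/.durations.json", "w") as fp:
    json.dump(durations, fp)
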
Example #25
0
    def importarchive(self, archivedir, overwrite=False):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        def valid(f):
            size = os.path.getsize(f)
            if size == 0:
                return False
            with open(f, mode="rb") as fp:
                fp.seek(size - 20)
                end_bytes = fp.read()
            end = end_bytes.decode(errors="ignore")
            return '</html>' in end

        def find_version(f):
            # need to look at the file to find out its version
            encoding = self._sniff_encoding(f)
            raw = open(f, 'rb').read(8000)
            text = unescape(raw.decode(encoding, errors="replace"))
            reader = TextReader(string=text)
            updated_to = self._find_uppdaterad_tom(basefile, reader=reader)
            return updated_to

        current = archived = skipped = invalid = 0
        spares = {}
        # records the current version of every basefile for which we
        # have any archive file
        recent_versions = {}
        for f in util.list_dirs(archivedir, ".html"):
            if "downloaded/sfst" not in f:
                continue
            if os.path.getsize(f) == 0:
                continue
            for regex in self.templ:
                m = re.search(regex, f)
                if not m:
                    continue
                basefile = self.sanitize_basefile(
                    "%s:%s" % (m.group("byear"), m.group("bnum")))

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    expected_version = self.sanitize_basefile(
                        "%s:%s" % (m.group("vyear"), m.group("vnum")))
                elif "vfirst" in m.groupdict():
                    expected_version = basefile
                else:
                    # if neither vyear or vfirst is in the filename,
                    # this is the very first version we have saved. It
                    # might be the first version, or it could be the
                    # first version that we were able to download. We
                    # just go with something and don't worry if it
                    # turns out to be wrong.
                    expected_version = basefile

                if os.path.getsize(f) == 0:
                    # we can't get any useful info from this file, but
                    # we can use it to trigger a selection of a spare,
                    # if available
                    this_version = expected_version
                else:
                    this_version = find_version(f)
                    if this_version != expected_version:
                        self.log.warning(
                            "%s@%s: Expected %s to be version %s" %
                            (basefile, this_version, f, expected_version))
                    try:
                        sanitized_this_version = self.sanitize_basefile(
                            this_version)
                    except:
                        self.log.error(
                            "%s@%s: Couldn't sanitize version found in %s" %
                            (basefile, this_version, f))
                        break
                    if this_version != sanitized_this_version:
                        self.log.warning(
                            "%s@%s: Version in %s sanitizes to %s" %
                            (basefile, this_version, f,
                             sanitized_this_version))
                        this_version = sanitized_this_version

                if "vcheck" in m.groupdict():
                    # these checksum variants should be older variants
                    # of a version we already have -- but in case the
                    # non-checksum version is empty or corrupted, we
                    # ought to use the best available checksum version
                    if valid(f):
                        spare_version = find_version(f)
                        spares[(basefile, spare_version)] = f
                    break

                if basefile not in recent_versions:
                    mainline = self.store.downloaded_path(basefile)
                    if os.path.exists(mainline):
                        recent_versions[basefile] = find_version(mainline)
                    else:
                        self.log.warning(
                            "%s@%s: archive file %s has no corresponding file in mainline (expected %s)"
                            % (basefile, this_version, f, mainline))
                        current += 1
                        # but we'll create an archived version anyway, not one in mainline
                        recent_versions[basefile] = None
                if this_version == recent_versions[basefile]:
                    self.log.debug(
                        "%s@%s: file %s has same version as mainline" %
                        (basefile, this_version, f))
                    break
                if valid(f):
                    source = f
                elif (basefile, this_version) in spares:
                    source = spares[(basefile, this_version)]
                    self.log.warning(
                        "%s@%s: using spare %s instead of invalid file %s" %
                        (basefile, this_version, source, f))
                else:
                    self.log.error(
                        "%s@%s: file %s is invalid, and no spare is available"
                        % (basefile, this_version, f))
                    invalid += 1
                    break
                dest = self.store.downloaded_path(basefile,
                                                  version=this_version)
                if os.path.exists(dest) and not overwrite:
                    self.log.debug(
                        "%s@%s: Not extracting %s as %s already exists" %
                        (basefile, this_version, f, dest))
                    skipped += 1
                else:
                    self.log.info("%s@%s: extracting %s to %s" %
                                  (basefile, this_version, f, dest))
                    util.ensure_dir(dest)
                    shutil.copy2(f, dest)
                    archived += 1
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions (skipped %s files that already existed, and couldn't handle %s invalid versions)"
            % (current, archived, skipped, invalid))
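
A minimal usage sketch for the method above; the repository class name, constructor arguments and archive path are hypothetical and only illustrate the intended call:

repo = SFS(datadir="data")    # hypothetical docrepo exposing importarchive() and self.store
repo.importarchive("/backup/legacy-lagen.nu/data", overwrite=False)
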
Example #26
0
    def list_basefiles_for(self, action, basedir=None):
        """Get all available basefiles that can be used for the
        specified action.

        :param action: The action for which to get available
                       basefiles (``parse``, ``relate``, ``generate``
                       or ``news``)
        :type action: str
        :param basedir: The base directory in which to search for
                        available files. If not provided, defaults to
                        ``self.datadir``.
        :type basedir: str
        :returns: All available basefiles
        :rtype: generator
        """
        if not basedir:
            basedir = self.datadir
        directory = None
        if action == "parse":
            directory = os.path.sep.join((basedir, "downloaded"))
            if self.storage_policy == "dir":
                # If each document is stored in a separate directory,
                # there are usually other auxiliary files (attachments
                # and whatnot) in that directory as well. Make sure we
                # only yield a single file from each directory. By
                # convention, the main file is called index.html,
                # index.pdf or whatever.
                # print("storage_policy dir: %s" % self.storage_policy)
                suffix = "index" + self.downloaded_suffix
            else:
                # print("storage_policy file: %s" % self.storage_policy)
                suffix = self.downloaded_suffix
        elif action == "relate":
            directory = os.path.sep.join((basedir, "distilled"))
            suffix = ".rdf"
        elif action == "generate":
            directory = os.path.sep.join((basedir, "parsed"))
            if self.storage_policy == "dir":
                suffix = "index.xhtml"
            else:
                suffix = ".xhtml"
        elif action == "news":
            directory = os.path.sep.join((basedir, "entries"))
            suffix = ".json"

        # FIXME: fake action, needed for get_status. replace with
        # something more elegant
        elif action in ("_postgenerate"):
            directory = os.path.sep.join((basedir, "generated"))
            suffix = ".html"

        if not directory:
            raise ValueError("No directory calculated for action %s" % action)

        if not os.path.exists(directory):
            return

        for x in util.list_dirs(directory, suffix, reverse=True):
            # ignore empty files placed by download (which may
            # have done that in order to avoid trying to
            # re-download nonexistent resources)

            if os.path.exists(x) and os.path.getsize(x) > 0:
                # get a pathfrag from full path
                suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1
                x = x[len(directory) + 1:-suffixlen]
                yield self.pathfrag_to_basefile(x)
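
A minimal sketch of how the generator might be consumed; the store class name and data directory are hypothetical:

store = DocumentStore("data/mybase")    # hypothetical store with datadir and storage_policy configured
for basefile in store.list_basefiles_for("parse"):
    print("would parse", basefile)
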
Example #27
0
 def test_extractdir_repo(self):
     dest = self.tempdir + os.sep + "dest"
     os.mkdir(dest)
     self.repo.resourceloader.extractdir("xsl", dest)
     extracted = [x[len(dest) + 1:] for x in util.list_dirs(dest)]
     self.assertEqual(self.expected, set(extracted))
Example #28
0
 def test_extractdir_repo(self):
     dest = self.tempdir + os.sep + "dest"
     os.mkdir(dest)
     self.repo.resourceloader.extractdir("xsl", dest)
     extracted = [x[len(dest)+1:] for x in util.list_dirs(dest)]
     self.assertEqual(self.expected, set(extracted))
Example #29
0
    def list_versions(self, basefile, action=None):
        """Get all archived versions of a given basefile.

        :param basefile: The basefile to list archived versions for
        :type  basefile: str
        :param action: The type of file to look for (either
                       ``downloaded``, ``parsed`` or ``generated``). If
                       ``None``, look for all types.
        :type action: str
        :returns: All available versions for that basefile
        :rtype: generator
        """

        if action:
            if action == "relate":
                return
            assert action in ('downloaded', 'parsed',
                              'generated'), "Action %s invalid" % action
            actions = (action, )
        else:
            actions = ('downloaded', 'parsed', 'generated')

        basedir = self.datadir
        pathfrag = self.basefile_to_pathfrag(basefile)
        yielded_basefiles = []
        for action in actions:
            directory = os.sep.join(
                (basedir, "archive", action, pathfrag, ".versions"))
            if not os.path.exists(directory):
                continue
            for x in util.list_dirs(directory, reverse=False):
                if os.path.exists(x):
                    # /datadir/base/archive/downloaded/basefile/version.html
                    # => version.html
                    x = x[len(directory) + 1:]
                    if self.storage_policy == "dir":
                        # version/index.html => version
                        x = os.sep.join(x.split(os.sep)[:-1])
                    else:
                        # version.html => version
                        x = os.path.splitext(x)[0]
                    if os.sep in x:
                        # we didn't find an archived file for
                        # basefile, instead we found an archived file
                        # for another basefile that startswith our
                        # basefile (eg '123' and '123/a', and we found
                        # '123/a/4.html')

                        # FIXME: This doesn't work at all with version
                        # identifiers that map to os.sep (eg SFS.py,
                        # which might have a basefile 1980:100, which
                        # then has version 2007:145, stored at
                        # <datadir>/archive/downloaded/1980/100/2007/145.html. We
                        # might need to rethink filenaming here...
                        # continue
                        pass
                    # print("Found file %r %r" % (x, self.pathfrag_to_basefile(x)))
                    basefile = self.pathfrag_to_basefile(x)
                    if basefile not in yielded_basefiles:
                        yielded_basefiles.append(basefile)
                        yield basefile
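
Given the archive layout assumed above (<datadir>/archive/<action>/<pathfrag>/.versions/...), a short consumption sketch; the store setup and basefile are illustrative:

store = DocumentStore("data/sfs")    # hypothetical store
for version in store.list_versions("1980:100", action="downloaded"):
    print(version)
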
Example #30
0
    def list_basefiles_for(self, action, basedir=None, force=True):
        """Get all available basefiles that can be used for the
        specified action.

        :param action: The action for which to get available
                       basefiles (``parse``, ``relate``, ``generate``
                       or ``news``)
        :type action: str
        :param basedir: The base directory in which to search for
                        available files. If not provided, defaults to
                        ``self.datadir``.
        :type basedir: str
        :param force: If False, skip basefiles for which the action
                      does not appear to be needed (according to
                      ``needed()``). Defaults to True.
        :type force: bool
        :returns: All available basefiles
        :rtype: generator
        """
        def prepend_index(suffixes):
            prepend = self.storage_policy == "dir"
            # If each document is stored in a separate directory
            # (storage_policy = "dir"), there is usually other
            # auxillary files (attachments and whatnot) in that
            # directory as well. Make sure we only yield a single file
            # from each directory. By convention, the main file is
            # called index.html, index.pdf or whatever.
            return [os.sep + "index" + s if prepend else s for s in suffixes]

        def trim_documententry(basefile):
            # if the path (typically for the distilled or
            # parsed file) is a 0-size file, the following
            # steps should not be carried out. But since
            # they at some point might have done that
            # anyway, we're left with a bunch of stale
            # error reports in the entry files. As a
            # one-time-thing, try to blank out irrelevant
            # sections.
            entry = DocumentEntry(self.documententry_path(basefile))
            sections = {
                'parse': ['parse', 'relate', 'generate'],
                'relate': ['relate', 'generate'],
                'generate': ['generate']
            }.get(action, {})
            for section in sections:
                if section in entry.status:
                    del entry.status[section]
            entry.save()

        if not basedir:
            basedir = self.datadir
        directory = None
        if action == "parse":
            directory = os.path.sep.join((basedir, "downloaded"))
            suffixes = prepend_index(self.downloaded_suffixes)
        elif action == "relate":
            directory = os.path.sep.join((basedir, "distilled"))
            suffixes = [".rdf"]
        elif action == "generate":
            directory = os.path.sep.join((basedir, "parsed"))
            suffixes = prepend_index([".xhtml"])
        elif action == "news":
            directory = os.path.sep.join((basedir, "entries"))
            suffixes = [".json"]
        # FIXME: _postgenerate is a fake action, needed for
        # get_status. Maybe we can replace it with transformlinks now?
        elif action in ("_postgenerate", "transformlinks"):
            directory = os.path.sep.join((basedir, "generated"))
            suffixes = prepend_index([".html"])

        if not directory:
            raise ValueError("No directory calculated for action %s" % action)

        if not os.path.exists(directory):
            return

        # if we have information about how long each basefile took the
        # last time, use that to yield the most demanding basefiles
        # first. This improves throughput when processing files in
        # parallel. Note: .durations.json is only created by
        # devel.statusreport
        durations_path = self.path(".durations",
                                   "entries",
                                   ".json",
                                   storage_policy="file")
        durations = {}
        # print("%s: About to check durations at %s" % (datetime.now(), durations_path))
        if os.path.exists(durations_path):
            with open(durations_path) as fp:
                try:
                    d = json.load(fp)
                except JSONDecodeError as e:
                    # just skip this, it's not essential (we should warn about the corrupt JSON file though)
                    print("ERROR: %s is not a valid JSON file" %
                          durations_path)
                    d = {}
                if action in d:
                    durations = d[action]
        yielded_paths = set()
        # print("%s: Loaded %s durations" % (datetime.now(), len(durations)))
        for basefile, duration in sorted(durations.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True):
            # print("Handling %s %s" % (basefile, duration))
            path = None
            intermediate_path_exists = False
            if action == "parse":
                path = self.downloaded_path(basefile)
                intermediate_path_exists = os.path.exists(
                    self.intermediate_path(basefile))
            elif action == "relate":
                path = self.distilled_path(basefile)
            elif action == "generate":
                path = self.parsed_path(basefile)
            if duration == -1 and not force:
                # Skip files that will raise DocumentRemovedError ?
                yielded_paths.add(path)
            elif not force and not self.needed(basefile, action):
                # Skip files for which no action will be performed
                yielded_paths.add(path)
            else:
                if os.path.exists(path):
                    yielded_paths.add(path)
                    if os.path.getsize(path) > 0 or intermediate_path_exists:
                        yield basefile
                    else:
                        trim_documententry(basefile)
        # print("%s: Processing non-duration files" % datetime.now())
        for x in util.list_dirs(directory, suffixes, reverse=True):
            if x in yielded_paths:
                continue
            if not os.path.exists(x) or x.endswith(
                (".root.json", ".durations.json")):
                continue
            # get a pathfrag from full path
            # suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1
            suffixlen = 0
            for s in suffixes:
                if x.endswith(s):
                    suffixlen = len(s)
                    break
            else:
                raise ValueError("%s doesn't end with a valid suffix (%s)" %
                                 (x, ", ".join(suffixes)))
            pathfrag = x[len(directory) + 1:-suffixlen]
            basefile = self.pathfrag_to_basefile(pathfrag)
            # ignore empty files placed by download (which may have
            # done that in order to avoid trying to re-download
            # nonexistent resources) -- but not if there is a viable
            # intermediate file (dv.py creates empty files in download
            # but contentful files in intermediate, when splitting a
            # large doc over multiple basefiles).
            intermediate_path = False
            if action == "parse":
                intermediate_path = os.path.exists(
                    self.intermediate_path(basefile))
            if os.path.getsize(x) > 0 or intermediate_path:
                yield basefile
            elif action in ("relate", "generate"):
                trim_documententry(basefile)
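
To make the effect of prepend_index concrete, here is a small standalone sketch of the same logic with the storage policy passed in explicitly; the suffix values are illustrative:

import os

def prepend_index(suffixes, storage_policy):
    # mirrors the helper above: with per-document directories, only the
    # main index file should be yielded from each directory
    prepend = storage_policy == "dir"
    return [os.sep + "index" + s if prepend else s for s in suffixes]

print(prepend_index([".html", ".pdf"], "dir"))    # ['/index.html', '/index.pdf'] on POSIX
print(prepend_index([".html", ".pdf"], "file"))   # ['.html', '.pdf']
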
Example #31
0
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if "downloaded/sfst" not in f:
                continue
            if os.path.getsize(f) == 0:
                continue
            for regex in self.templ:
                m = re.search(regex, f)
                if not m:
                    continue
                
                if "vcheck" in m.groupdict():  # silently ignore these
                                             # (they should be older
                                             # versions of a version
                                             # we already have -- but
                                             # we ought to test this!)
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version
                raw = open(f, 'rb').read(8000)
                # if it uses html5 doctype, assume utf-8, otherwise assume latin-1
                encoding = "utf-8" if b'<!DOCTYPE html>' in raw else "latin-1" 
                text = unescape(raw.decode(encoding, errors="replace"))
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile,
                                                       reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                                              # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    break
                    # what was the actual POINT of this? SFS.download
                    # will have downloaded a copy of this exact
                    # version (the most recent version), regardless of
                    # whether it's expired or not.
                    
                    # version = None
                    # current += 1
                    # de = DocumentEntry()
                    # de.basefile = basefile
                    # de.id = self.canonical_uri(basefile, updated_to)
                    # # fudge timestamps best as we can
                    # de.orig_created = datetime.fromtimestamp(os.path.getctime(f))
                    # de.orig_updated = datetime.fromtimestamp(os.path.getmtime(f))
                    # de.orig_updated = datetime.now()
                    # de.orig_url = self.document_url_template % locals()
                    # de.published = datetime.now()
                    # de.url = self.generated_url(basefile)
                    # de.title = "SFS %s" % basefile
                    # de.save(self.store.documententry_path(basefile))

                if m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile, version=version)
                else:
                    dest = self.store.downloaded_path(basefile, version=version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info("Extracted %s current versions and %s archived versions"
                      % (current, archived))
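
The self.templ patterns that drive both importarchive variants are not shown in these examples. A purely hypothetical pair of patterns, illustrating the named groups the code relies on (one pattern with vyear/vnum for archived versions, one without for the current file); the filename layout is an assumption, not taken from the original source:

import re

templ = [
    r'(?P<type>sfst|sfsr)/(?P<byear>\d{4})/(?P<bnum>\d+)'
    r'-(?P<vyear>\d{4}|first)-(?P<vnum>\d+)\.html',
    r'(?P<type>sfst|sfsr)/(?P<byear>\d{4})/(?P<bnum>\d+)\.html',
]
f = "downloaded/sfst/1980/100-2007-145.html"    # illustrative filename
for regex in templ:
    m = re.search(regex, f)
    if m:
        break
print(m.group("type"), m.group("byear"), m.group("bnum"))    # sfst 1980 100
print("vyear" in m.groupdict(), m.group("vyear"))            # True 2007
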