def assertEqualDirs(self, want, got, suffix=None, subset=False, filterdir="entries"):
    """Assert that two directory trees contain identical files.

    :param want: The expected directory tree
    :type want: str
    :param got: The actual directory tree
    :type got: str
    :param suffix: If given, only check files ending in suffix (otherwise check all files)
    :type suffix: str
    :param subset: If True, only require that the files in want are a subset of the files in got (otherwise require that the sets are identical)
    :type subset: bool
    :param filterdir: If given, don't compare the parts of the tree that start with filterdir
    :type filterdir: str
    """
    wantfiles = [x[len(want) + 1:] for x in util.list_dirs(want, suffix)
                 if not x.startswith(want + os.sep + filterdir)]
    gotfiles = [x[len(got) + 1:] for x in util.list_dirs(got, suffix)
                if not x.startswith(got + os.sep + filterdir)]
    self.maxDiff = None
    if subset:
        self.assertTrue(set(wantfiles).issubset(set(gotfiles)))
    else:
        self.assertEqual(wantfiles, gotfiles)  # or assertIn?
    for f in wantfiles:
        if not filecmp.cmp(os.path.join(want, f),
                           os.path.join(got, f),
                           shallow=False):
            self.assertEqual(util.readfile(os.path.join(want, f), mode="rb"),
                             util.readfile(os.path.join(got, f), mode="rb"))
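# A minimal usage sketch (an assumption, not taken from the source): inside a
# test class that provides the assertEqualDirs helper above, a test could
# compare an expected fixture tree against generated output. The test name
# and the paths are hypothetical.
def test_generated_tree(self):
    want = "test/files/expected"                 # hypothetical fixture tree
    got = self.datadir + os.sep + "generated"    # hypothetical output tree
    # Only compare .html files, and tolerate extra files in `got`.
    self.assertEqualDirs(want, got, suffix=".html", subset=True)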
def list_attachments(self, basefile, action, version=None):
    """Get all attachments for a basefile in a specified state.

    :param action: The state (type of file) to look for (either ``downloaded``, ``parsed`` or ``generated``). If ``None``, look for all types.
    :type action: str
    :param basefile: The basefile to list attachments for
    :type basefile: str
    :param version: The version of the basefile to list attachments for. If None, list attachments for the current version.
    :type version: str
    :returns: All available attachments for the basefile
    :rtype: generator
    """
    basedir = self.datadir
    pathfrag = self.pathfrag_to_basefile(basefile)
    if version:
        v_pathfrag = self.pathfrag_to_basefile(version)
        directory = os.sep.join((basedir, "archive", action, pathfrag, v_pathfrag))
    else:
        directory = os.sep.join((basedir, action, pathfrag))
    # FIXME: Similar map exists in list_basefiles_for and in other
    # places throughout the code. Should subclasses be able to
    # control suffixes beyond the simple self.downloaded_suffix
    # mechanism?
    suffixmap = {'downloaded': self.downloaded_suffix,
                 'parsed': '.xhtml',
                 'generated': '.html'}
    mainfile = "index" + suffixmap[action]
    for x in util.list_dirs(directory, reverse=False):
        # /datadir/base/downloaded/basefile/attachment.txt => attachment.txt
        x = x[len(directory) + 1:]
        if x != mainfile:
            yield x
def list_basefiles_for(self, action, basedir=None):
    if not basedir:
        basedir = self.datadir
    if action == "parse":
        # Note: This pulls everything into memory before first
        # value is yielded. A more nifty variant is at
        # http://code.activestate.com/recipes/491285/
        d = os.path.sep.join((basedir, "downloaded"))
        for x in sorted(itertools.chain(util.list_dirs(d, ".doc"),
                                        util.list_dirs(d, ".docx"))):
            suffix = os.path.splitext(x)[1]
            pathfrag = x[len(d) + 1:-len(suffix)]
            yield self.pathfrag_to_basefile(pathfrag)
    else:
        for x in super(DVStore, self).list_basefiles_for(action, basedir):
            yield x
def list_basefiles_for(self, action, basedir=None, force=True):
    if action == "parse":
        for x in util.list_dirs(self.staticdir, self.downloaded_suffixes[0]):
            pathfrag = x[len(self.staticdir) + 1:-len(self.downloaded_suffixes[0])]
            yield self.pathfrag_to_basefile(pathfrag)
    else:
        for x in super(StaticStore, self).list_basefiles_for(action, basedir, force):
            yield x
def test_listdirs(self):
    util.writefile(self.p("foo.txt"), "Hello")
    util.writefile(self.p("bar.txt"), "Hello")
    util.writefile(self.p("foo/2.txt"), "Hello")
    util.writefile(self.p("foo/10.txt"), "Hello")
    util.writefile(self.datadir + "/foo/baz.text", "Hello")
    generator = util.list_dirs(self.datadir, ".txt")
    self.assertEqual(self.p("bar.txt"), next(generator))
    self.assertEqual([self.p("foo.txt"),
                      self.p("foo/2.txt"),
                      self.p("foo/10.txt")],
                     list(generator))
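# A rough sketch of the behaviour util.list_dirs appears to have, inferred
# from the test and the call sites in this section (this is an assumption,
# not the actual ferenda.util implementation): a generator that walks a
# directory recursively, optionally filters on one suffix or a list of
# suffixes, sorts names with a natural (numeric-aware) key so "2.txt"
# precedes "10.txt", and yields the files in a directory before descending
# into its subdirectories.
import os
import re

def list_dirs_sketch(d, suffix=None, reverse=False):
    def natural_key(name):
        # split into digit/non-digit runs so numeric parts sort numerically
        return [int(p) if p.isdigit() else p for p in re.split(r"(\d+)", name)]

    suffixes = [suffix] if isinstance(suffix, str) else (suffix or [])
    entries = sorted(os.listdir(d), key=natural_key, reverse=reverse)
    files = [e for e in entries if os.path.isfile(os.path.join(d, e))]
    dirs = [e for e in entries if os.path.isdir(os.path.join(d, e))]
    for f in files:
        if not suffixes or any(f.endswith(s) for s in suffixes):
            yield os.path.join(d, f)
    for sub in dirs:
        for f in list_dirs_sketch(os.path.join(d, sub), suffix, reverse):
            yield f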
def extractdir(self, resourcedir, target, suffixes=None): """Extract all file resources contained in the specified resource directory to the target directory. Searches all loadpaths and optionally the Resources API for any file contained within. This means the target dir may end up with eg. one file from a high-priority path and other files from the system dirs/resources. This in turns makes it easy to just override a single file in a larger set of resource files. Even if the resourcedir might contain resources in subdirectories (eg "source/sub/dir/resource.xml"), the extraction will be to the top-level target directory (eg "target/resource.xml"). """ if not suffixes: suffixes = [] extracted = set() for path in self.loadpath: if resourcedir and resourcedir != ".": path = path+os.sep+resourcedir if not os.path.exists(path): continue # for f in os.listdir(path): for f in util.list_dirs(path, suffixes): f = f[len(path)+1:] basef = os.path.basename(f) src = os.sep.join([path, f]) dest = os.sep.join([target, basef]) if dest not in extracted and os.path.isfile(src): util.ensure_dir(dest) shutil.copy2(src, dest) extracted.add(dest) if self.use_pkg_resources: self._check_module_path() path = self.resourceprefix if resourcedir: path = path + os.sep + resourcedir for f in pkg_resources.resource_listdir(self.modulename, path): src = path + os.sep + f dest = target dest += os.sep + f if (dest not in extracted and not pkg_resources.resource_isdir(self.modulename, self.resourceprefix + os.sep + f)): util.ensure_dir(dest) with open(dest, "wb") as fp: readfp = pkg_resources.resource_stream(self.modulename, src) fp.write(readfp.read()) readfp.close() extracted.add(dest)
def extractdir(self, resourcedir, target, suffixes=None): """Extract all file resources contained in the specified resource directory to the target directory. Searches all loadpaths and optionally the Resources API for any file contained within. This means the target dir may end up with eg. one file from a high-priority path and other files from the system dirs/resources. This in turns makes it easy to just override a single file in a larger set of resource files. Even if the resourcedir might contain resources in subdirectories (eg "source/sub/dir/resource.xml"), the extraction will be to the top-level target directory (eg "target/resource.xml"). """ if not suffixes: suffixes = [] extracted = set() for path in self.loadpath: if resourcedir and resourcedir != ".": path = path + os.sep + resourcedir if not os.path.exists(path): continue # for f in os.listdir(path): for f in util.list_dirs(path, suffixes): f = f[len(path) + 1:] basef = os.path.basename(f) src = os.sep.join([path, f]) dest = os.sep.join([target, basef]) if dest not in extracted and os.path.isfile(src): util.ensure_dir(dest) shutil.copy2(src, dest) extracted.add(dest) if self.use_pkg_resources: self._check_module_path() path = self.resourceprefix if resourcedir: path = path + os.sep + resourcedir for f in pkg_resources.resource_listdir(self.modulename, path): src = path + os.sep + f dest = target dest += os.sep + f if (dest not in extracted and not pkg_resources.resource_isdir( self.modulename, self.resourceprefix + os.sep + f)): util.ensure_dir(dest) with open(dest, "wb") as fp: readfp = pkg_resources.resource_stream( self.modulename, src) fp.write(readfp.read()) readfp.close() extracted.add(dest)
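# Hedged usage sketch: extracting the packaged "xsl" resources into a scratch
# directory, mirroring how the tests below exercise extractdir. The repo
# object is assumed to be an already-configured repository; the target path
# is hypothetical.
import tempfile
target = tempfile.mkdtemp()
repo.resourceloader.extractdir("xsl", target, suffixes=[".xsl"])
print(sorted(os.listdir(target)))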
def test_extractdir_newwd(self):
    dest = self.tempdir + os.sep + "dest"
    os.mkdir(dest)
    prevdir = os.getcwd()
    os.chdir(self.tempdir)
    if "FERENDA_HOME" not in os.environ:
        os.environ["FERENDA_HOME"] = prevdir
    try:
        self.repo.resourceloader.extractdir("xsl", dest)
        extracted = [x[len(dest) + 1:] for x in util.list_dirs(dest)]
        self.assertEqual(self.expected, set(extracted))
    finally:
        os.chdir(prevdir)
def assertEqualDirs(self, want, got, suffix=None, filterdir="entries"):
    """Assert that two directory trees contain identical files.

    :param want: The expected directory tree
    :type want: str
    :param got: The actual directory tree
    :type got: str
    :param suffix: If given, only check files ending in suffix (otherwise check all files)
    :type suffix: str
    :param filterdir: If given, don't compare the parts of the tree that start with filterdir
    :type filterdir: str
    """
    wantfiles = [x[len(want) + 1:] for x in util.list_dirs(want, suffix)
                 if not x.startswith(want + os.sep + filterdir)]
    gotfiles = [x[len(got) + 1:] for x in util.list_dirs(got, suffix)
                if not x.startswith(got + os.sep + filterdir)]
    self.maxDiff = None
    self.assertEqual(wantfiles, gotfiles)  # or assertIn?
    for f in gotfiles:
        self.assertTrue(filecmp.cmp(os.path.join(want, f),
                                    os.path.join(got, f),
                                    shallow=False))
def assertEqualDirs(self, want, got, suffix=None, subset=False,
                    filterdir="entries"):
    """Assert that two directory trees contain identical files.

    :param want: The expected directory tree
    :type want: str
    :param got: The actual directory tree
    :type got: str
    :param suffix: If given, only check files ending in suffix (otherwise check all files)
    :type suffix: str
    :param subset: If True, only require that the files in want are a subset of the files in got (otherwise require that the sets are identical)
    :type subset: bool
    :param filterdir: If given, don't compare the parts of the tree that start with filterdir
    :type filterdir: str
    """
    wantfiles = [
        x[len(want) + 1:] for x in util.list_dirs(want, suffix)
        if not x.startswith(want + os.sep + filterdir)
    ]
    gotfiles = [
        x[len(got) + 1:] for x in util.list_dirs(got, suffix)
        if not x.startswith(got + os.sep + filterdir)
    ]
    self.maxDiff = None
    if subset:
        self.assertTrue(set(wantfiles).issubset(set(gotfiles)))
    else:
        self.assertEqual(wantfiles, gotfiles)  # or assertIn?
    for f in wantfiles:
        if not filecmp.cmp(os.path.join(want, f),
                           os.path.join(got, f),
                           shallow=False):
            self.assertEqual(
                util.readfile(os.path.join(want, f), mode="rb"),
                util.readfile(os.path.join(got, f), mode="rb"))
def test_extractdir_newwd(self):
    dest = self.tempdir + os.sep + "dest"
    os.mkdir(dest)
    prevdir = os.getcwd()
    os.chdir(self.tempdir)
    if "FERENDA_HOME" not in os.environ:
        os.environ["FERENDA_HOME"] = prevdir
    try:
        self.repo.resourceloader.extractdir("xsl", dest)
        extracted = [x[len(dest)+1:] for x in util.list_dirs(dest)]
        self.assertEqual(self.expected, set(extracted))
    finally:
        os.chdir(prevdir)
def list_versions(self, basefile, action=None): """Get all archived versions of a given basefile. :param basefile: The basefile to list archived versions for :type basefile: str :param action: The type of file to look for (either ``downloaded``, ``parsed`` or ``generated``. If ``None``, look for all types. :type action: str :returns: All available versions for that basefile :rtype: generator """ if action: assert action in ( 'downloaded', 'parsed', 'generated'), "Action %s invalid" % action actions = (action,) else: actions = ('downloaded', 'parsed', 'generated') basedir = self.datadir pathfrag = self.basefile_to_pathfrag(basefile) yielded_basefiles = [] for action in actions: directory = os.sep.join((basedir, "archive", action, pathfrag)) if not os.path.exists(directory): continue for x in util.list_dirs(directory, reverse=False): if os.path.exists(x): # /datadir/base/archive/downloaded/basefile/version.html # => version.html x = x[len(directory) + 1:] if self.storage_policy == "dir": # version/index.html => version x = os.sep.join(x.split(os.sep)[:-1]) else: # version.html => version x = os.path.splitext(x)[0] if os.sep in x: # we didn't find an archived file for # basefile, instead we found an archived file # for another basefile that startswith our # basefile (eg '123' and '123/a', and we found # '123/a/4.html') continue # print("Found file %r %r" % (x, self.pathfrag_to_basefile(x))) basefile = self.pathfrag_to_basefile(x) if basefile not in yielded_basefiles: yielded_basefiles.append(basefile) yield basefile
def list_versions(self, basefile, action=None): """Get all archived versions of a given basefile. :param basefile: The basefile to list archived versions for :type basefile: str :param action: The type of file to look for (either ``downloaded``, ``parsed`` or ``generated``. If ``None``, look for all types. :type action: str :returns: All available versions for that basefile :rtype: generator """ if action: assert action in ('downloaded', 'parsed', 'generated'), "Action %s invalid" % action actions = (action, ) else: actions = ('downloaded', 'parsed', 'generated') basedir = self.datadir pathfrag = self.basefile_to_pathfrag(basefile) yielded_basefiles = [] for action in actions: directory = os.sep.join((basedir, "archive", action, pathfrag)) if not os.path.exists(directory): continue for x in util.list_dirs(directory, reverse=False): if os.path.exists(x): # /datadir/base/archive/downloaded/basefile/version.html # => version.html x = x[len(directory) + 1:] if self.storage_policy == "dir": # version/index.html => version x = os.sep.join(x.split(os.sep)[:-1]) else: # version.html => version x = os.path.splitext(x)[0] if os.sep in x: # we didn't find an archived file for # basefile, instead we found an archived file # for another basefile that startswith our # basefile (eg '123' and '123/a', and we found # '123/a/4.html') continue # print("Found file %r %r" % (x, self.pathfrag_to_basefile(x))) basefile = self.pathfrag_to_basefile(x) if basefile not in yielded_basefiles: yielded_basefiles.append(basefile) yield basefile
def load_files(path, graph=None):
    # loads all the n3 files found under path into a graph
    if graph is None:
        graph = rdflib.Graph()
    if os.path.isfile(path):
        return load_file(path, graph)
    elif os.path.isdir(path):
        print("loading all n3 files in %s" % path)
        for f in util.list_dirs(path, suffix=".n3"):
            # FIXME: ugly hack to avoid reading one particular n3 file
            if f.endswith("sources.n3"):
                continue
            load_file(f, graph)
        return graph
    else:
        print("ERROR: can't load %s" % path)
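# Hedged usage sketch: loading every .n3 file under a directory into a single
# rdflib graph and printing its size. The directory name is hypothetical, and
# load_file is the helper assumed by load_files above.
g = load_files("res/extra")
if g is not None:
    print("loaded %s triples" % len(g))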
def list_attachments(self, basefile, action, version=None): """Get all attachments for a basefile in a specified state :param action: The state (type of file) to look for (either ``downloaded``, ``parsed`` or ``generated``. If ``None``, look for all types. :type action: str :param basefile: The basefile to list attachments for :type basefile: str :param version: The version of the basefile to list attachments for. If None, list attachments for the current version. :type version: str :returns: All available attachments for the basefile :rtype: generator """ if self.storage_policy != "dir": raise errors.AttachmentPolicyError( "Can't list attachments if storage_policy != 'dir'") basedir = self.datadir # pathfrag = self.pathfrag_to_basefile(basefile) # that can't be right? pathfrag = self.basefile_to_pathfrag(basefile) if version: v_pathfrag = self.basefile_to_pathfrag(version) directory = os.sep.join((basedir, "archive", action, pathfrag, ".versions", v_pathfrag)) else: directory = os.sep.join((basedir, action, pathfrag)) # FIXME: Similar map exists in list_basefiles_for and in other # places throughout the code. Should subclasses be able to # control suffixes beyond the simple self.downloaded_suffix # mechanism? suffixmap = { 'downloaded': self.downloaded_suffixes, 'parsed': ['.xhtml'], 'generated': ['.html'] } mainfiles = ["index" + s for s in suffixmap[action]] for x in util.list_dirs(directory, reverse=False): # /datadir/base/downloaded/basefile/attachment.txt => attachment.txt x = x[len(directory) + 1:] if x not in mainfiles: if not [ suffix for suffix in self.invalid_suffixes if x.endswith(suffix) ]: yield x
def add_downloaded_files(filelist, spec, url):
    downloaddir = os.sep.join(
        [self.datadir, self.repoclass.alias, "downloaded"])
    for f in list(util.list_dirs(downloaddir)):
        if f.endswith(".etag"):
            continue  # FIXME: this is ugly
        if f not in filelist:
            # print("Fetching %s resulted in downloaded file %s" % (url, f))
            filelist.append(f)
            expect = "downloaded" + f.replace(downloaddir, "")
            if os.sep != "/":
                expect = expect.replace(os.sep, "/")
            spec[url]['expect'] = expect
            reldest = os.path.relpath(".." + os.sep + "downloaded",
                                      os.path.dirname(f))
            dest = os.path.normpath(
                os.path.join(os.path.dirname(specfile), reldest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
def list_attachments(self, basefile, action, version=None): """Get all attachments for a basefile in a specified state :param action: The state (type of file) to look for (either ``downloaded``, ``parsed`` or ``generated``. If ``None``, look for all types. :type action: str :param basefile: The basefile to list attachments for :type basefile: str :param version: The version of the basefile to list attachments for. If None, list attachments for the current version. :type version: str :returns: All available attachments for the basefile :rtype: generator """ if self.storage_policy != "dir": raise errors.AttachmentPolicyError( "Can't list attachments if storage_policy != 'dir'") basedir = self.datadir # pathfrag = self.pathfrag_to_basefile(basefile) # that can't be right? pathfrag = self.basefile_to_pathfrag(basefile) if version: v_pathfrag = self.basefile_to_pathfrag(version) directory = os.sep.join((basedir, "archive", action, pathfrag, v_pathfrag)) else: directory = os.sep.join((basedir, action, pathfrag)) # FIXME: Similar map exists in list_basefiles_for and in other # places throughout the code. Should subclasses be able to # control suffixes beyond the simple self.downloaded_suffix # mechanism? suffixmap = {'downloaded': self.downloaded_suffixes, 'parsed': ['.xhtml'], 'generated': ['.html']} mainfiles = ["index" + s for s in suffixmap[action]] for x in util.list_dirs(directory, reverse=False): # /datadir/base/downloaded/basefile/attachment.txt => attachment.txt x = x[len(directory) + 1:] if x not in mainfiles: if not [suffix for suffix in self.invalid_suffixes if x.endswith(suffix)]: yield x
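# Hedged usage sketch: iterating over the attachments of one document in its
# downloaded state. The datadir and basefile are hypothetical, and the store
# must use storage_policy == "dir" for list_attachments to work.
store = DocumentStore("data/example")
store.storage_policy = "dir"
for attachment in store.list_attachments("123/a", "downloaded"):
    print(attachment)   # e.g. "appendix.pdf" or "styles.css"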
def add_downloaded_files(filelist, spec, url):
    downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                               "downloaded"])
    for f in list(util.list_dirs(downloaddir)):
        if f.endswith(".etag"):
            continue  # FIXME: this is ugly
        if f not in filelist:
            # print("Fetching %s resulted in downloaded file %s" % (url, f))
            filelist.append(f)
            expect = "downloaded" + f.replace(downloaddir, "")
            if os.sep != "/":
                expect = expect.replace(os.sep, "/")
            spec[url]['expect'] = expect
            reldest = os.path.relpath(
                ".." + os.sep + "downloaded", os.path.dirname(f))
            dest = os.path.normpath(
                os.path.join(
                    os.path.dirname(specfile), reldest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
def importarchive(self, archivedir): """Imports downloaded data from an archive from legacy lagen.nu data. In particular, creates proper archive storage for older versions of each text. """ current = archived = 0 for f in util.list_dirs(archivedir, ".html"): if not f.startswith("downloaded/sfs"): # sfst or sfsr continue for regex in self.templ: m = re.match(regex, f) if not m: continue if "vcheck" in m.groupdict(): # silently ignore break basefile = "%s:%s" % (m.group("byear"), m.group("bnum")) # need to look at the file to find out its version # text = t.extractfile(f).read(4000).decode("latin-1") text = open(f).read(4000).decode("latin-1") reader = TextReader(string=text) updated_to = self._find_uppdaterad_tom(basefile, reader=reader) if "vyear" in m.groupdict(): # this file is marked as # an archival version archived += 1 version = updated_to if m.group("vyear") == "first": pass else: exp = "%s:%s" % (m.group("vyear"), m.group("vnum")) if version != exp: self.log.warning("%s: Expected %s, found %s" % (f, exp, version)) else: version = None current += 1 de = DocumentEntry() de.basefile = basefile de.id = self.canonical_uri(basefile, updated_to) # fudge timestamps best as we can de.orig_created = datetime.fromtimestamp( os.path.getctime(f)) de.orig_updated = datetime.fromtimestamp( os.path.getmtime(f)) de.orig_updated = datetime.now() de.orig_url = self.document_url_template % locals() de.published = datetime.now() de.url = self.generated_url(basefile) de.title = "SFS %s" % basefile # de.set_content() # de.set_link() de.save(self.store.documententry_path(basefile)) # this yields more reasonable basefiles, but they are not # backwards compatible -- skip them for now # basefile = basefile.replace("_", "").replace(".", "") if "type" in m.groupdict() and m.group("type") == "sfsr": dest = self.store.register_path(basefile) current -= 1 # to offset the previous increment else: dest = self.store.downloaded_path(basefile, version) self.log.debug("%s: extracting %s to %s" % (basefile, f, dest)) util.ensure_dir(dest) shutil.copy2(f, dest) break else: self.log.warning("Couldn't process %s" % f) self.log.info( "Extracted %s current versions and %s archived versions" % (current, archived))
def list_basefiles_for(self, action, basedir=None, force=True): """Get all available basefiles that can be used for the specified action. :param action: The action for which to get available basefiles (``parse``, ``relate``, ``generate`` or ``news``) :type action: str :param basedir: The base directory in which to search for available files. If not provided, defaults to ``self.datadir``. :type basedir: str :returns: All available basefiles :rtype: generator """ def prepend_index(suffixes): prepend = self.storage_policy == "dir" # If each document is stored in a separate directory # (storage_policy = "dir"), there is usually other # auxillary files (attachments and whatnot) in that # directory as well. Make sure we only yield a single file # from each directory. By convention, the main file is # called index.html, index.pdf or whatever. return [os.sep + "index" + s if prepend else s for s in suffixes] if not basedir: basedir = self.datadir directory = None if action == "parse": directory = os.path.sep.join((basedir, "downloaded")) suffixes = prepend_index(self.downloaded_suffixes) elif action == "relate": directory = os.path.sep.join((basedir, "distilled")) suffixes = [".rdf"] elif action == "generate": directory = os.path.sep.join((basedir, "parsed")) suffixes = prepend_index([".xhtml"]) elif action == "news": directory = os.path.sep.join((basedir, "entries")) suffixes = [".json"] # FIXME: fake action, needed for get_status. replace with # something more elegant elif action in ("_postgenerate"): directory = os.path.sep.join((basedir, "generated")) suffixes = [".html"] if not directory: raise ValueError("No directory calculated for action %s" % action) if not os.path.exists(directory): return # if we have information about how long each basefile took the # last time, use that to yield the most demanding basefiles # first. This improves throughput when processing files in # paralel durations_path = self.path(".durations", "entries", ".json", storage_policy="file") durations = {} if os.path.exists(durations_path): with open(durations_path) as fp: d = json.load(fp) if action in d: durations = d[action] yielded_paths = set() for basefile, duration in sorted(durations.items(), key=operator.itemgetter(1), reverse=True): if duration == -1 and not force: # Skip files that will raise DocumentRemovedError ? pass elif not force and not self.needed(basefile, action): # Skip files for which no action will be performed pass else: # make sure the underlying file really still exists path = None if action == "parse": path = self.downloaded_path(basefile) elif action == "relate": path = self.distilled_path(basefile) elif action == "generate": path = self.parsed_path(basefile) if os.path.exists(path): yielded_paths.add(path) yield basefile for x in util.list_dirs(directory, suffixes, reverse=True): # ignore empty files placed by download (which may # have done that in order to avoid trying to # re-download nonexistent resources) if x in yielded_paths: continue if os.path.exists(x) and os.path.getsize(x) > 0 and not x.endswith( (".root.json", ".durations.json")): # get a pathfrag from full path # suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1 suffixlen = 0 for s in suffixes: if x.endswith(s): suffixlen = len(s) break else: raise ValueError( "%s doesn't end with a valid suffix (%s)" % x, ", ".join(suffixes)) x = x[len(directory) + 1:-suffixlen] yield self.pathfrag_to_basefile(x)
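# Hedged usage sketch: driving a bulk processing step off the store, in the
# way the method above is meant to be consumed. The store object is assumed
# to be an already-configured DocumentStore (or subclass) with downloaded
# material on disk.
for basefile in store.list_basefiles_for("parse"):
    print("would parse", basefile)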
def download_test(self, specfile, basefile=None): """This test is run for each json file found in docroot/source.""" # this function can run in normal test mode or in # FERENDA_SET_TESTFILE mode. In the latter, all the normal download # code, including net access, is run. Calls to requests.get # are intercepted and notes are made of which URLs are # requested, and if this results in files on disk. The end # result is a JSON file and a set of cached files, all placed under # "source/" def add_downloaded_files(filelist, spec, url): downloaddir = os.sep.join([self.datadir, self.repoclass.alias, "downloaded"]) for f in list(util.list_dirs(downloaddir)): if f.endswith(".etag"): continue # FIXME: this is ugly if f not in filelist: # print("Fetching %s resulted in downloaded file %s" % (url, f)) filelist.append(f) expect = "downloaded" + f.replace(downloaddir, "") if os.sep != "/": expect = expect.replace(os.sep, "/") spec[url]['expect'] = expect reldest = os.path.relpath( ".." + os.sep + "downloaded", os.path.dirname(f)) dest = os.path.normpath( os.path.join( os.path.dirname(specfile), reldest)) util.ensure_dir(dest) shutil.copy2(f, dest) with codecs.open(specfile, encoding="utf-8") as fp: spec = json.load(fp) for k in list(spec.keys()): # NB: This exposes the encoded, possibly non-ascii, values # of the URL as byte strings. The encoding of these is # unknown (and we cannot generally assume UTF-8. Let's see # if this bites us. nk = unquote(k) if k != nk: spec[nk] = spec[k] del spec[k] # process the special '@settings' key (FIXME: didn't I already # implement this somewhere else?) # # a @settings like this: # "@settings": { # "config": {"next_sfsnr": "2014:913"} # }, # # will have the effect of this: # # self.repo.config.next_sfsnr = "2014:913" if '@settings' in spec: for attribute in spec['@settings']: if isinstance(spec['@settings'][attribute], dict): thing = getattr(self.repo, attribute) for key, value in spec['@settings'][attribute].items(): setattr(thing, key, value) else: setattr(self.repo, attribute, spec['@settings'][attribute]) if os.environ.get("FERENDA_SET_TESTFILE"): downloaddir = os.sep.join([self.datadir, self.repoclass.alias, "downloaded"]) state = {'downloaded': list(util.list_dirs(downloaddir)), 'previous_url': None, 'requests': 0} try: rc = int(os.environ.get("FERENDA_SET_TESTFILE")) state['total_requests'] = rc except (ValueError, TypeError): state['total_requests'] = 2 # search page, single payload def callback(req): # clean up after last callback add_downloaded_files(state['downloaded'], spec, state['previous_url']) if state['requests'] == state['total_requests']: raise MaxDownloadsReached() # make a real requests call somehow responses.stop() # when testing this testing function # (testTestutil.RepoTester.test_download_setfile) we # still want to disable responses, but we don't want # to make an actual HTTP call. Detect if we are # running that test by examining the stack, and if so, # mock the requests.get call in a different way. frames = [f for f in inspect.stack() if f[3] == "test_download_setfile"] if frames: frame = frames[0][0] resp = frame.f_locals['self']._myget(req.url) else: resp = requests.get(req.url) responses.start() # create a filename. 
use .html as suffix unless we # should use something else contenttype = resp.headers["Content-type"] stem = os.path.splitext(specfile)[0] suffix = {'application/pdf': 'pdf', 'application/json': 'json', 'text/plain': 'txt'}.get(contenttype, "html") outfile = "%s-%s.%s" % (stem, state['requests'], suffix) with open(outfile, "wb") as fp: fp.write(resp.content) if not frames and os.environ.get("TRAVIS") != "true": if suffix == "html": print( "requested %s, saved as %s. Edit if needed, then press enter" % (req.url, outfile)) x = input() else: print("requested %s, saved %s" % (req.url, outfile)) with open(outfile, "rb") as fp: content = fp.read() spec[req.url] = {'file': os.path.basename(outfile)} if resp.encoding != 'utf-8': spec[req.url]['encoding'] = resp.encoding state['requests'] += 1 state['previous_url'] = req.url return (resp.status_code, resp.headers, content) else: def callback(req): headers = {'Content-type': 'text/html'} try: # normalize req.url. req.url might be a (byte)str # but keys in spec will be (and should be) # unicode. Assume that req.url is all ascii if isinstance(req.url, bytes): url = req.url.decode() else: url = req.url urlspec = spec[unquote(url)] if isinstance(urlspec, str): urlspec = {'file': urlspec} url_location = os.path.join(os.path.dirname(specfile), urlspec['file']) # load the .content property with open(url_location, "rb") as fp: content = fp.read() return (200, headers, content) except KeyError: return (404, headers, "Not found") responses.add_callback(responses.GET, re.compile("(.*)"), callback) # PERFORM THE TEST try: self.repo.download(basefile) except MaxDownloadsReached: pass if os.environ.get("FERENDA_SET_TESTFILE"): # process final file and save specfile add_downloaded_files(state['downloaded'], spec, state['previous_url']) with open(specfile, "w") as fp: j = json.dumps(spec, indent=4, separators=(', ', ': ')) fp.write(j) # organize a temporary copy of files that we can compare our results to wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias) expected = False for url in spec: if url == "@settings": continue if "expect" in spec[url]: expected = True sourcefile = os.path.join(os.path.dirname(specfile), spec[url]['file']) wantfile = "%s/%s" % (wantdir, spec[url]['expect']) util.copy_if_different(sourcefile, wantfile) if expected: self.assertEqualDirs(wantdir, "%s/%s" % (self.datadir, self.repoclass.alias), subset=True) else: # the test doesn't actually result in any downloaded file if hasattr(self.repo, 'expect') and self.repo.expect is False: pass else: self.fail('No files were marked as "expect" in specfile %s' % specfile)
def download_test(self, specfile, basefile=None): """This test is run for each json file found in docroot/source.""" # this function can run in normal test mode or in # FERENDA_SET_TESTFILE mode. In the latter, all the normal download # code, including net access, is run. Calls to requests.get # are intercepted and notes are made of which URLs are # requested, and if this results in files on disk. The end # result is a JSON file and a set of cached files, all placed under # "source/" def add_downloaded_files(filelist, spec, url): downloaddir = os.sep.join( [self.datadir, self.repoclass.alias, "downloaded"]) for f in list(util.list_dirs(downloaddir)): if f.endswith(".etag"): continue # FIXME: this is ugly if f not in filelist: # print("Fetching %s resulted in downloaded file %s" % (url, f)) filelist.append(f) expect = "downloaded" + f.replace(downloaddir, "") if os.sep != "/": expect = expect.replace(os.sep, "/") spec[url]['expect'] = expect reldest = os.path.relpath(".." + os.sep + "downloaded", os.path.dirname(f)) dest = os.path.normpath( os.path.join(os.path.dirname(specfile), reldest)) util.ensure_dir(dest) shutil.copy2(f, dest) with codecs.open(specfile, encoding="utf-8") as fp: spec = json.load(fp) for k in list(spec.keys()): # NB: This exposes the encoded, possibly non-ascii, values # of the URL as byte strings. The encoding of these is # unknown (and we cannot generally assume UTF-8. Let's see # if this bites us. nk = unquote(k) if k != nk: spec[nk] = spec[k] del spec[k] # process the special '@settings' key (FIXME: didn't I already # implement this somewhere else?) # # a @settings like this: # "@settings": { # "config": {"next_sfsnr": "2014:913"} # }, # # will have the effect of this: # # self.repo.config.next_sfsnr = "2014:913" if '@settings' in spec: for attribute in spec['@settings']: if isinstance(spec['@settings'][attribute], dict): thing = getattr(self.repo, attribute) for key, value in spec['@settings'][attribute].items(): setattr(thing, key, value) else: setattr(self.repo, attribute, spec['@settings'][attribute]) if os.environ.get("FERENDA_SET_TESTFILE"): downloaddir = os.sep.join( [self.datadir, self.repoclass.alias, "downloaded"]) state = { 'downloaded': list(util.list_dirs(downloaddir)), 'previous_url': None, 'requests': 0 } try: rc = int(os.environ.get("FERENDA_SET_TESTFILE")) state['total_requests'] = rc except (ValueError, TypeError): state['total_requests'] = 2 # search page, single payload def callback(req): # clean up after last callback add_downloaded_files(state['downloaded'], spec, state['previous_url']) if state['requests'] == state['total_requests']: raise MaxDownloadsReached() # make a real requests call somehow responses.stop() # when testing this testing function # (testTestutil.RepoTester.test_download_setfile) we # still want to disable responses, but we don't want # to make an actual HTTP call. Detect if we are # running that test by examining the stack, and if so, # mock the requests.get call in a different way. frames = [ f for f in inspect.stack() if f[3] == "test_download_setfile" ] if frames: frame = frames[0][0] resp = frame.f_locals['self']._myget(req.url) else: resp = requests.get(req.url) responses.start() # create a filename. 
use .html as suffix unless we # should use something else contenttype = resp.headers["Content-type"] stem = os.path.splitext(specfile)[0] suffix = { 'application/pdf': 'pdf', 'application/json': 'json', 'text/plain': 'txt' }.get(contenttype, "html") outfile = "%s-%s.%s" % (stem, state['requests'], suffix) with open(outfile, "wb") as fp: fp.write(resp.content) if not frames and os.environ.get("TRAVIS") != "true": if suffix == "html": print( "requested %s, saved as %s. Edit if needed, then press enter" % (req.url, outfile)) x = input() else: print("requested %s, saved %s" % (req.url, outfile)) with open(outfile, "rb") as fp: content = fp.read() spec[req.url] = {'file': os.path.basename(outfile)} if resp.encoding != 'utf-8': spec[req.url]['encoding'] = resp.encoding state['requests'] += 1 state['previous_url'] = req.url return (resp.status_code, resp.headers, content) else: def callback(req): headers = {'Content-type': 'text/html'} try: # normalize req.url. req.url might be a (byte)str # but keys in spec will be (and should be) # unicode. Assume that req.url is all ascii if isinstance(req.url, bytes): url = req.url.decode() else: url = req.url urlspec = spec[unquote(url)] if isinstance(urlspec, str): urlspec = {'file': urlspec} url_location = os.path.join(os.path.dirname(specfile), urlspec['file']) # load the .content property with open(url_location, "rb") as fp: content = fp.read() return (200, headers, content) except KeyError: return (404, headers, "Not found") responses.add_callback(responses.GET, re.compile("(.*)"), callback) # PERFORM THE TEST try: self.repo.download(basefile) except MaxDownloadsReached: pass if os.environ.get("FERENDA_SET_TESTFILE"): # process final file and save specfile add_downloaded_files(state['downloaded'], spec, state['previous_url']) with open(specfile, "w") as fp: j = json.dumps(spec, indent=4, separators=(', ', ': ')) fp.write(j) # organize a temporary copy of files that we can compare our results to wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias) expected = False for url in spec: if url == "@settings": continue if "expect" in spec[url]: expected = True sourcefile = os.path.join(os.path.dirname(specfile), spec[url]['file']) wantfile = "%s/%s" % (wantdir, spec[url]['expect']) util.copy_if_different(sourcefile, wantfile) if expected: self.assertEqualDirs(wantdir, "%s/%s" % (self.datadir, self.repoclass.alias), subset=True) else: # the test doesn't actually result in any downloaded file if hasattr(self.repo, 'expect') and self.repo.expect is False: pass else: self.fail('No files were marked as "expect" in specfile %s' % specfile)
def list_basefiles_for(self, action, basedir=None, force=True): """Get all available basefiles that can be used for the specified action. :param action: The action for which to get available basefiles (``parse``, ``relate``, ``generate`` or ``news``) :type action: str :param basedir: The base directory in which to search for available files. If not provided, defaults to ``self.datadir``. :type basedir: str :returns: All available basefiles :rtype: generator """ def prepend_index(suffixes): prepend = self.storage_policy == "dir" # If each document is stored in a separate directory # (storage_policy = "dir"), there is usually other # auxillary files (attachments and whatnot) in that # directory as well. Make sure we only yield a single file # from each directory. By convention, the main file is # called index.html, index.pdf or whatever. return [os.sep + "index" + s if prepend else s for s in suffixes] def trim_documententry(basefile): # if the path (typically for the distilled or # parsed file) is a 0-size file, the following # steps should not be carried out. But since # they at some point might have done that # anyway, we're left with a bunch of stale # error reports in the entry files. As a # one-time-thing, try to blank out irrelevant # sections. entry = DocumentEntry(self.documententry_path(basefile)) sections = {'parse': ['parse', 'relate', 'generate'], 'relate': ['relate', 'generate'], 'generate': ['generate']}.get(action, {}) for section in sections: if section in entry.status: del entry.status[section] entry.save() if not basedir: basedir = self.datadir directory = None if action == "parse": directory = os.path.sep.join((basedir, "downloaded")) suffixes = prepend_index(self.downloaded_suffixes) elif action == "relate": directory = os.path.sep.join((basedir, "distilled")) suffixes = [".rdf"] elif action == "generate": directory = os.path.sep.join((basedir, "parsed")) suffixes = prepend_index([".xhtml"]) elif action == "news": directory = os.path.sep.join((basedir, "entries")) suffixes = [".json"] # FIXME: _postgenerate is a fake action, needed for # get_status. Maybe we can replace it with transformlinks now? elif action in ("_postgenerate", "transformlinks"): directory = os.path.sep.join((basedir, "generated")) suffixes = prepend_index([".html"]) if not directory: raise ValueError("No directory calculated for action %s" % action) if not os.path.exists(directory): return # if we have information about how long each basefile took the # last time, use that to yield the most demanding basefiles # first. This improves throughput when processing files in # paralell. Note: .durations.json is only created by # devel.statusreport durations_path = self.path(".durations", "entries", ".json", storage_policy="file") durations = {} if os.path.exists(durations_path): with open(durations_path) as fp: d = json.load(fp) if action in d: durations = d[action] yielded_paths = set() for basefile, duration in sorted(durations.items(), key=operator.itemgetter(1), reverse=True): if duration == -1 and not force: # Skip files that will raise DocumentRemovedError ? 
pass elif not force and not self.needed(basefile, action): # Skip files for which no action will be performed pass else: # make sure the underlying file really still exists path = None intermediate_path = False if action == "parse": path = self.downloaded_path(basefile) intermediate_path = os.path.exists(self.intermediate_path(basefile)) elif action == "relate": path = self.distilled_path(basefile) elif action == "generate": path = self.parsed_path(basefile) if os.path.exists(path): yielded_paths.add(path) if os.path.getsize(path) > 0 or intermediate_path: yield basefile else: trim_documententry(basefile) for x in util.list_dirs(directory, suffixes, reverse=True): if x in yielded_paths: continue if not os.path.exists(x) or x.endswith((".root.json", ".durations.json")): continue # get a pathfrag from full path # suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1 suffixlen = 0 for s in suffixes: if x.endswith(s): suffixlen = len(s) break else: raise ValueError("%s doesn't end with a valid suffix (%s)" % x, ", ".join(suffixes)) pathfrag = x[len(directory) + 1:-suffixlen] basefile = self.pathfrag_to_basefile(pathfrag) # ignore empty files placed by download (which may have # done that in order to avoid trying to re-download # nonexistent resources) -- but not if there is a viable # intermediate file (dv.py creates empty files in download # but contentful files in intermediate, when splitting a # large doc over multiple basefiles). intermediate_path = False if action == "parse": intermediate_path = os.path.exists(self.intermediate_path(basefile)) if os.path.getsize(x) > 0 or intermediate_path: yield basefile elif action in ("relate", "generate"): trim_documententry(basefile)
def importarchive(self, archivedir, overwrite=False): """Imports downloaded data from an archive from legacy lagen.nu data. In particular, creates proper archive storage for older versions of each text. """ def valid(f): size = os.path.getsize(f) if size == 0: return False with open(f, mode="rb") as fp: fp.seek(size - 20) end_bytes = fp.read() end = end_bytes.decode(errors="ignore") return '</html>' in end def find_version(f): # need to look at the file to find out its version encoding = self._sniff_encoding(f) raw = open(f, 'rb').read(8000) text = unescape(raw.decode(encoding, errors="replace")) reader = TextReader(string=text) updated_to = self._find_uppdaterad_tom(basefile, reader=reader) return updated_to current = archived = skipped = invalid = 0 spares = {} recent_versions = {} # records the current version of every # basefile for which we have any archive # file for f in util.list_dirs(archivedir, ".html"): if "downloaded/sfst" not in f: continue if os.path.getsize(f) == 0: continue for regex in self.templ: m = re.search(regex, f) if not m: continue basefile = self.sanitize_basefile( "%s:%s" % (m.group("byear"), m.group("bnum"))) if "vyear" in m.groupdict(): # this file is marked as # an archival version expected_version = self.sanitize_basefile( "%s:%s" % (m.group("vyear"), m.group("vnum"))) elif "vfirst" in m.groupdict(): expected_version = basefile else: # if neither vyear or vfirst is in the filename, # this is the very first version we have saved. It # might be the first version, or it could be the # first version that we were able to download. We # just go with something and don't worry if it # turns out to be wrong. expected_version = basefile if os.path.getsize(f) == 0: # we can't get any useful info from this file, but # we can use it to trigger a selection of a spare, # if available this_version = expected_version else: this_version = find_version(f) if this_version != expected_version: self.log.warning( "%s@%s: Expected %s to be version %s" % (basefile, this_version, f, expected_version)) try: sanitized_this_version = self.sanitize_basefile( this_version) except: self.log.error( "%s@%s: Couldn't sanitize version found in %s" % (basefile, this_version, f)) break if this_version != sanitized_this_version: self.log.warning( "%s@%s: Version in %s sanitizes to %s" % (basefile, this_version, f, sanitized_this_version)) this_version = sanitized_this_version if "vcheck" in m.groupdict(): # these checksum variants should be older variants # of a version we already have -- but in case the # non-checksum version is empty or corrupted, we # ought to use the best available checksum version if valid(f): spare_version = find_version(f) spares[(basefile, spare_version)] = f break if basefile not in recent_versions: mainline = self.store.downloaded_path(basefile) if os.path.exists(mainline): recent_versions[basefile] = find_version(mainline) else: self.log.warning( "%s@%s: archive file %s has no corresponding file in mainline (expected %s)" % (basefile, this_version, f, mainline)) current += 1 # but we'll create an archived version anyway, not one in mainline recent_versions[basefile] = None if this_version == recent_versions[basefile]: self.log.debug( "%s@%s: file %s has same version as mainline" % (basefile, this_version, f)) break if valid(f): source = f elif (basefile, this_version) in spares: source = spares[(basefile, this_version)] self.log.warning( "%s@%s: using spare %s instead of invalid file %s" % (basefile, this_version, f, source)) else: self.log.error( "%s@%s: file %s is 
invalid, and no spare is available" % (basefile, this_version, f)) invalid += 1 break dest = self.store.downloaded_path(basefile, version=this_version) if os.path.exists(dest) and not overwrite: self.log.debug( "%s@%s: Not extracting %s as %s already exists" % (basefile, this_version, f, dest)) skipped += 1 else: self.log.info("%s@%s: extracting %s to %s" % (basefile, this_version, f, dest)) util.ensure_dir(dest) shutil.copy2(f, dest) archived += 1 break else: self.log.warning("Couldn't process %s" % f) self.log.info( "Extracted %s current versions and %s archived versions (skipped %s files that already existed, and couldn't handle %s invalid versions)" % (current, archived, skipped, invalid))
def list_basefiles_for(self, action, basedir=None): """Get all available basefiles that can be used for the specified action. :param action: The action for which to get available basefiles (``parse``, ``relate``, ``generate`` or ``news``) :type action: str :param basedir: The base directory in which to search for available files. If not provided, defaults to ``self.datadir``. :type basedir: str :returns: All available basefiles :rtype: generator """ if not basedir: basedir = self.datadir directory = None if action == "parse": directory = os.path.sep.join((basedir, "downloaded")) if self.storage_policy == "dir": # If each document is stored in a separate directory, # there is usually other auxillary files (attachments # and whatnot) in that directory as well. Make sure we # only yield a single file from each directory. By # convention, the main file is called index.html, # index.pdf or whatever. # print("storage_policy dir: %s" % self.storage_policy) suffix = "index" + self.downloaded_suffix else: # print("storage_policy file: %s" % self.storage_policy) suffix = self.downloaded_suffix elif action == "relate": directory = os.path.sep.join((basedir, "distilled")) suffix = ".rdf" elif action == "generate": directory = os.path.sep.join((basedir, "parsed")) if self.storage_policy == "dir": suffix = "index.xhtml" else: suffix = ".xhtml" elif action == "news": directory = os.path.sep.join((basedir, "entries")) suffix = ".json" # FIXME: fake action, needed for get_status. replace with # something more elegant elif action in ("_postgenerate"): directory = os.path.sep.join((basedir, "generated")) suffix = ".html" if not directory: raise ValueError("No directory calculated for action %s" % action) if not os.path.exists(directory): return for x in util.list_dirs(directory, suffix, reverse=True): # ignore empty files placed by download (which may # have done that in order to avoid trying to # re-download nonexistent resources) if os.path.exists(x) and os.path.getsize(x) > 0: # get a pathfrag from full path suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1 x = x[len(directory) + 1:-suffixlen] yield self.pathfrag_to_basefile(x)
def test_extractdir_repo(self):
    dest = self.tempdir + os.sep + "dest"
    os.mkdir(dest)
    self.repo.resourceloader.extractdir("xsl", dest)
    extracted = [x[len(dest) + 1:] for x in util.list_dirs(dest)]
    self.assertEqual(self.expected, set(extracted))
def test_extractdir_repo(self):
    dest = self.tempdir + os.sep + "dest"
    os.mkdir(dest)
    self.repo.resourceloader.extractdir("xsl", dest)
    extracted = [x[len(dest)+1:] for x in util.list_dirs(dest)]
    self.assertEqual(self.expected, set(extracted))
def list_versions(self, basefile, action=None): """Get all archived versions of a given basefile. :param basefile: The basefile to list archived versions for :type basefile: str :param action: The type of file to look for (either ``downloaded``, ``parsed`` or ``generated``. If ``None``, look for all types. :type action: str :returns: All available versions for that basefile :rtype: generator """ if action: if action == "relate": raise StopIteration() assert action in ('downloaded', 'parsed', 'generated'), "Action %s invalid" % action actions = (action, ) else: actions = ('downloaded', 'parsed', 'generated') basedir = self.datadir pathfrag = self.basefile_to_pathfrag(basefile) yielded_basefiles = [] for action in actions: directory = os.sep.join( (basedir, "archive", action, pathfrag, ".versions")) if not os.path.exists(directory): continue for x in util.list_dirs(directory, reverse=False): if os.path.exists(x): # /datadir/base/archive/downloaded/basefile/version.html # => version.html x = x[len(directory) + 1:] if self.storage_policy == "dir": # version/index.html => version x = os.sep.join(x.split(os.sep)[:-1]) else: # version.html => version x = os.path.splitext(x)[0] if os.sep in x: # we didn't find an archived file for # basefile, instead we found an archived file # for another basefile that startswith our # basefile (eg '123' and '123/a', and we found # '123/a/4.html') # FIXME: This doesn't work at all with version # identifiers that map to os.sep (eg SFS.py, # which might have a basefile 1980:100, which # then has version 2007:145, stored at # <datadir>/archive/downloaded/1980/100/2007/145.html. We # might need to rethink filenaming here... # continue pass # print("Found file %r %r" % (x, self.pathfrag_to_basefile(x))) basefile = self.pathfrag_to_basefile(x) if basefile not in yielded_basefiles: yielded_basefiles.append(basefile) yield basefile
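# Hedged usage sketch: listing all archived versions of one document. The
# basefile is hypothetical; the store is assumed to be a configured
# DocumentStore (or subclass) with archived material on disk.
for version in store.list_versions("1980:100", action="downloaded"):
    print("archived version:", version)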
def list_basefiles_for(self, action, basedir=None, force=True): """Get all available basefiles that can be used for the specified action. :param action: The action for which to get available basefiles (``parse``, ``relate``, ``generate`` or ``news``) :type action: str :param basedir: The base directory in which to search for available files. If not provided, defaults to ``self.datadir``. :type basedir: str :returns: All available basefiles :rtype: generator """ def prepend_index(suffixes): prepend = self.storage_policy == "dir" # If each document is stored in a separate directory # (storage_policy = "dir"), there is usually other # auxillary files (attachments and whatnot) in that # directory as well. Make sure we only yield a single file # from each directory. By convention, the main file is # called index.html, index.pdf or whatever. return [os.sep + "index" + s if prepend else s for s in suffixes] def trim_documententry(basefile): # if the path (typically for the distilled or # parsed file) is a 0-size file, the following # steps should not be carried out. But since # they at some point might have done that # anyway, we're left with a bunch of stale # error reports in the entry files. As a # one-time-thing, try to blank out irrelevant # sections. entry = DocumentEntry(self.documententry_path(basefile)) sections = { 'parse': ['parse', 'relate', 'generate'], 'relate': ['relate', 'generate'], 'generate': ['generate'] }.get(action, {}) for section in sections: if section in entry.status: del entry.status[section] entry.save() if not basedir: basedir = self.datadir directory = None if action == "parse": directory = os.path.sep.join((basedir, "downloaded")) suffixes = prepend_index(self.downloaded_suffixes) elif action == "relate": directory = os.path.sep.join((basedir, "distilled")) suffixes = [".rdf"] elif action == "generate": directory = os.path.sep.join((basedir, "parsed")) suffixes = prepend_index([".xhtml"]) elif action == "news": directory = os.path.sep.join((basedir, "entries")) suffixes = [".json"] # FIXME: _postgenerate is a fake action, needed for # get_status. Maybe we can replace it with transformlinks now? elif action in ("_postgenerate", "transformlinks"): directory = os.path.sep.join((basedir, "generated")) suffixes = prepend_index([".html"]) if not directory: raise ValueError("No directory calculated for action %s" % action) if not os.path.exists(directory): return # if we have information about how long each basefile took the # last time, use that to yield the most demanding basefiles # first. This improves throughput when processing files in # paralell. 
Note: .durations.json is only created by # devel.statusreport durations_path = self.path(".durations", "entries", ".json", storage_policy="file") durations = {} # print("%s: About to check durations at %s" % (datetime.now(), durations_path)) if os.path.exists(durations_path): with open(durations_path) as fp: try: d = json.load(fp) except JSONDecodeError as e: # just skip this, it's not essential (we should warn about the corrupt JSON file though) print("ERROR: %s is not a valid JSON file" % durations_path) d = {} if action in d: durations = d[action] yielded_paths = set() # print("%s: Loaded %s durations" % (datetime.now(), len(durations))) for basefile, duration in sorted(durations.items(), key=operator.itemgetter(1), reverse=True): # print("Handling %s %s" % (basefile, duration)) path = None intermediate_path_exists = False if action == "parse": path = self.downloaded_path(basefile) intermediate_path_exists = os.path.exists( self.intermediate_path(basefile)) elif action == "relate": path = self.distilled_path(basefile) elif action == "generate": path = self.parsed_path(basefile) if duration == -1 and not force: # Skip files that will raise DocumentRemovedError ? yielded_paths.add(path) elif not force and not self.needed(basefile, action): # Skip files for which no action will be performed yielded_paths.add(path) else: if os.path.exists(path): yielded_paths.add(path) if os.path.getsize(path) > 0 or intermediate_path_exists: yield basefile else: trim_documententry(basefile) # print("%s: Processing non-duration files" % datetime.now()) for x in util.list_dirs(directory, suffixes, reverse=True): if x in yielded_paths: continue if not os.path.exists(x) or x.endswith( (".root.json", ".durations.json")): continue # get a pathfrag from full path # suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1 suffixlen = 0 for s in suffixes: if x.endswith(s): suffixlen = len(s) break else: raise ValueError("%s doesn't end with a valid suffix (%s)" % x, ", ".join(suffixes)) pathfrag = x[len(directory) + 1:-suffixlen] basefile = self.pathfrag_to_basefile(pathfrag) # ignore empty files placed by download (which may have # done that in order to avoid trying to re-download # nonexistent resources) -- but not if there is a viable # intermediate file (dv.py creates empty files in download # but contentful files in intermediate, when splitting a # large doc over multiple basefiles). intermediate_path = False if action == "parse": intermediate_path = os.path.exists( self.intermediate_path(basefile)) if os.path.getsize(x) > 0 or intermediate_path: yield basefile elif action in ("relate", "generate"): trim_documententry(basefile)
def importarchive(self, archivedir): """Imports downloaded data from an archive from legacy lagen.nu data. In particular, creates proper archive storage for older versions of each text. """ current = archived = 0 for f in util.list_dirs(archivedir, ".html"): if "downloaded/sfst" not in f: continue if os.path.getsize(f) == 0: continue for regex in self.templ: m = re.search(regex, f) if not m: continue if "vcheck" in m.groupdict(): # silently ignore these # (they should be older # versions of a version # we already have -- but # we ought to test this!) break basefile = "%s:%s" % (m.group("byear"), m.group("bnum")) # need to look at the file to find out its version raw = open(f, 'rb').read(8000) # if it uses html5 doctype, assume utf-8, otherwise assume latin-1 encoding = "utf-8" if b'<!DOCTYPE html>' in raw else "latin-1" text = unescape(raw.decode(encoding, errors="replace")) reader = TextReader(string=text) updated_to = self._find_uppdaterad_tom(basefile, reader=reader) if "vyear" in m.groupdict(): # this file is marked as # an archival version archived += 1 version = updated_to if m.group("vyear") == "first": pass else: exp = "%s:%s" % (m.group("vyear"), m.group("vnum")) if version != exp: self.log.warning("%s: Expected %s, found %s" % (f, exp, version)) else: break # what was the actual POINT of this? SFS.download # will have downloaded a copy of this exact # version (the most recent version), regardless of # whether it's expired or not. # version = None # current += 1 # de = DocumentEntry() # de.basefile = basefile # de.id = self.canonical_uri(basefile, updated_to) # # fudge timestamps best as we can # de.orig_created = datetime.fromtimestamp(os.path.getctime(f)) # de.orig_updated = datetime.fromtimestamp(os.path.getmtime(f)) # de.orig_updated = datetime.now() # de.orig_url = self.document_url_template % locals() # de.published = datetime.now() # de.url = self.generated_url(basefile) # de.title = "SFS %s" % basefile # de.save(self.store.documententry_path(basefile)) if m.group("type") == "sfsr": dest = self.store.register_path(basefile, version=version) else: dest = self.store.downloaded_path(basefile, version=version) self.log.debug("%s: extracting %s to %s" % (basefile, f, dest)) util.ensure_dir(dest) shutil.copy2(f, dest) break else: self.log.warning("Couldn't process %s" % f) self.log.info("Extracted %s current versions and %s archived versions" % (current, archived))