def download(self):
    """Split the static news file (static/sitenews.txt) into one
    <datadir>/sitenews/<timestamp>.txt file per news item.

    A new item starts at each line matching ``re_news_subjectline``;
    the captured group must be a "%Y-%m-%d %H:%M:%S" timestamp, which
    (as a unix timestamp) becomes the item's basefile.
    """
    ofp = temppath = path = basefile = None
    with codecs.open(self.resourceloader.filename(self.config.newsfile),
                     encoding="utf-8") as fp:
        for line in fp:
            m = self.re_news_subjectline(line)
            if m:
                # finish the previous item (if any) before starting a new one
                if ofp:
                    ofp.close()
                    if util.replace_if_different(temppath, path):
                        self.log.info("%s: creating news item" % basefile)
                d = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                basefile = str(int(d.timestamp()))
                path = self.store.downloaded_path(basefile)
                fileno, temppath = tempfile.mkstemp(text=True)
                util.ensure_dir(path)
                ofp = os.fdopen(fileno, "w")
            # guard: skip any preamble lines before the first subject
            # line (the original crashed with AttributeError on None here)
            if ofp:
                ofp.write(line)
    # guard: a file without any subject lines leaves ofp as None
    if ofp:
        ofp.close()
        if util.replace_if_different(temppath, path):
            self.log.info("%s: download OK (creating news item)" % basefile)
def test_doctype(self):
    # An XSLT stylesheet declaring doctype-system="about:legacy-compat"
    # should produce output starting with the legacy-compat DOCTYPE.
    base = self.datadir + os.sep
    stylesheet = base + "teststyle-doctype.xslt"
    util.ensure_dir(stylesheet)
    with open(stylesheet, "w") as fp:
        fp.write("""<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="html"
              doctype-system="about:legacy-compat"
              omit-xml-declaration="yes"
              encoding='utf-8'
              indent="yes"/>
  <xsl:template match="/">
    <html>
      <head>
        <title><xsl:value-of select="/doc/title"/></title>
      </head>
      <body>
        <h1>hello world</h1>
      </body>
    </html>
  </xsl:template>
</xsl:stylesheet>
""")
    with open(base + "infile.xml", "w") as fp:
        fp.write("""<doc><title>Document title</title></doc>""")
    transformer = Transformer("XSLT", stylesheet, "xsl", None, "")
    transformer.transform_file(base + "infile.xml", base + "outfile.xml")
    self.assertTrue(
        util.readfile(base + "outfile.xml").startswith(
            '<!DOCTYPE html SYSTEM "about:legacy-compat">'))
def put_files_in_place(self):
    """Create one DocumentRepository and three basic documents (at
    parsed and distilled), then relate them.

    Each document has a dcterms:title, a dcterms:issued and a
    dcterms:publisher (which has a URI):

    basefile  dcterms:title  dcterms:issued  dcterms:publisher
    123/a     "Example"      2014-01-04      <http://example.org/publisher/A>
    123/b     "Example 2"    2013-09-23      <http://example.org/publisher/B>
    123/c     "Of needles"   2014-05-06      <http://example.org/publisher/B>
    """
    self.repo = None
    self.repos = [DocumentRepository(datadir=self.datadir,
                                     storetype=self.storetype,
                                     storelocation=self.storelocation,
                                     storerepository=self.storerepository,
                                     indextype=self.indextype,
                                     indexlocation=self.indexlocation)]
    repo = self.repos[0]
    for letter in "abc":
        basefile = "123/%s" % letter
        self.ttl_to_rdf_xml("test/files/base/distilled/%s.ttl" % basefile,
                            repo.store.distilled_path(basefile),
                            repo.store)
        util.ensure_dir(repo.store.parsed_path(basefile))
        shutil.copy2("test/files/base/parsed/%s.xhtml" % basefile,
                     repo.store.parsed_path(basefile))
        repo.relate(basefile)
    # prepare a base.ttl (or var-common.js) that maps
    # <http://example.org/publisher/B> to "Publishing house B"
    repo.rdf_type = repo.ns['bibo'].Standard
def create_external_resources(self, doc):
    """Create the index.css file (fontspecs and page backgrounds) for a
    parsed PDF document and copy page background images into place.

    :param doc: the document being parsed; doc.body must contain
                PDFReader objects
    :returns: list of paths to all created/copied resources
    """
    resources = []
    cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
    resources.append(cssfile)
    util.ensure_dir(cssfile)
    with open(cssfile, "w") as fp:
        # 1. CSS header with fontspecs
        for pdf in doc.body:
            assert isinstance(pdf, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(pdf)
            for spec in list(pdf.fontspec.values()):
                fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                         (spec['id'], spec['size'], spec['family'],
                          spec['color']))
        # 2. Copy all created png files to their correct locations.
        # (The original also computed an unused 'pdfbase' local from a
        # stale loop variable between the two loops; removed.)
        totcnt = 0
        for pdf in doc.body:
            cnt = 0
            for page in pdf:
                totcnt += 1
                cnt += 1
                if page.background:
                    src = self.store.intermediate_path(
                        doc.basefile,
                        attachment=os.path.basename(page.background))
                    dest = self.store.parsed_path(
                        doc.basefile,
                        attachment=os.path.basename(page.background))
                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))
                    resources.append(dest)
                    # NOTE(review): #page ids use the per-PDF counter
                    # (cnt), so pages of multiple PDFs can collide --
                    # confirm whether totcnt was intended
                    fp.write("#page%03d { background: url('%s');}\n" %
                             (cnt, os.path.basename(dest)))
    return resources
def native_to_file(self, nativedata, outfile):
    """Serialize the etree *nativedata* to *outfile* as UTF-8 XML."""
    serialized = etree.tostring(nativedata,
                                pretty_print=self.format,
                                encoding="utf-8")
    util.ensure_dir(outfile)
    with open(outfile, "wb") as fp:
        fp.write(serialized)
def test_parse(self):
    util.ensure_dir(self.repo.store.downloaded_path("sample"))
    shutil.copy2("test/files/pdfreader/sample.pdf",
                 self.repo.store.downloaded_path("sample"))
    try:
        self.repo.required_predicates = []
        self.repo.parse("sample")
    except errors.ExternalCommandError:
        # for systems that don't have pdftohtml, we copy the expected
        # intermediate files, so that we can test the rest of the logic
        targetdir = os.path.dirname(
            self.repo.store.intermediate_path("sample"))
        if os.path.exists(targetdir):
            shutil.rmtree(targetdir)
        shutil.copytree("test/files/pdfreader/intermediate", targetdir)
        # make really sure the xml file has a newer timestamp than the PDF
        from time import sleep
        sleep(0.01)
        os.utime(targetdir + "/index.xml", None)
        try:
            self.repo.parse("sample")
        except errors.ExternalCommandError as e:
            print("ExternalCommandError on rerun.\n targetdir: %s\n %s exists: %s\n message: %s" % (targetdir, targetdir + "/index.xml", os.path.exists(targetdir + "/index.xml"), e))
    datadir = self.repo.store.datadir
    # the source pdf must NOT be left in intermediate; everything else
    # should be in place
    self.assertTrue(os.path.exists(datadir + '/intermediate/sample/index001.png'))
    self.assertFalse(os.path.exists(datadir + '/intermediate/sample/index.pdf'))
    self.assertTrue(os.path.exists(datadir + '/intermediate/sample/index.xml'))
    self.assertTrue(os.path.exists(datadir + '/parsed/sample/index001.png'))
    self.assertTrue(os.path.exists(datadir + '/parsed/sample/index.css'))
    self.assertTrue(os.path.exists(datadir + '/parsed/sample/index.xhtml'))
def graph_to_image(self, graph, imageformat, filename):
    """Render the RDF *graph* to *filename* in *imageformat* using
    pydot/graphviz."""
    import pydot
    import rdflib
    dot = pydot.Dot()
    # code from rdflib.util.graph_to_dot, but adjusted to handle unicode
    nodes = {}
    for subj, obj in graph.subject_objects():
        for term in (subj, obj):
            if term in nodes:
                continue
            if isinstance(term, rdflib.BNode):
                nodes[term] = repr(term)[7:]
            elif isinstance(term, rdflib.Literal):
                nodes[term] = repr(term)[16:-1]
            elif isinstance(term, rdflib.URIRef):
                nodes[term] = repr(term)[22:-2]
    for subj, pred, obj in graph.triples((None, None, None)):
        dot.add_edge(pydot.Edge(nodes[subj], nodes[obj],
                                label=repr(pred)[22:-2]))
    self.log.debug("Writing %s format to %s" % (imageformat, filename))
    util.ensure_dir(filename)
    dot.write(path=filename, prog="dot", format=imageformat)
    self.log.debug("Wrote %s" % filename)
def word_to_docbook(self, indoc, outdoc):
    """Convert a old Word document (.doc) to a pseudo-docbook file through
    antiword.

    :param indoc: path to the source .doc file
    :param outdoc: path where the docbook XML is written
    :raises errors.ExternalCommandError: if antiword fails
    """
    import tempfile
    # mkstemp instead of the race-prone, deprecated mktemp; antiword
    # writes to the path via shell redirection, so close our fd
    tmpfd, tmpfile = tempfile.mkstemp()
    os.close(tmpfd)
    indoc = os.path.normpath(indoc)
    wrapper = textwrap.TextWrapper(break_long_words=False, width=72)
    util.ensure_dir(outdoc)
    if " " in indoc:
        indoc = '"%s"' % indoc
    cmd = "antiword -x db %s > %s" % (indoc, tmpfile)
    self.log.debug("Executing %s" % cmd)
    (ret, stdout, stderr) = util.runcmd(cmd)
    if ret != 0:
        self.log.error("Docbook conversion failed: %s" % stderr)
        raise errors.ExternalCommandError(
            "Docbook conversion failed: %s" % stderr.strip())
    tree = ET.parse(tmpfile)
    # .iter(): getiterator() was deprecated and removed in Python 3.9
    for element in tree.iter():
        if element.text and element.text.strip() != "":
            # re-wrap each paragraph to 72 columns
            replacement = ""
            for p in element.text.split("\n"):
                if p:
                    replacement += wrapper.fill(p) + "\n\n"
            element.text = replacement.strip()
    tree.write(outdoc, encoding="utf-8")
    os.unlink(tmpfile)
def archive(self, basefile, version):
    """Moves the current version of a document to an archive.

    All files related to the document are moved (downloaded, parsed,
    generated files and any existing attachment files).

    :param basefile: The basefile of the document to archive
    :type basefile: str
    :param version: The version id to archive under
    :type version: str
    """
    # FIXME: what about intermediate? Ignore them as they should be
    # able to be regenerated at any time?
    attachment_meths = (self.downloaded_path, self.parsed_path,
                        self.generated_path)
    for meth in (self.downloaded_path, self.documententry_path,
                 self.parsed_path, self.serialized_path,
                 self.distilled_path, self.annotation_path,
                 self.generated_path):
        src = meth(basefile)
        dest = meth(basefile, version)
        if self.storage_policy == "dir" and meth in attachment_meths:
            # move the whole attachment directory, not just one file
            src = os.path.dirname(src)
            dest = os.path.dirname(dest)
        if not os.path.exists(src):
            continue
        if os.path.exists(dest):
            raise errors.ArchivingError(
                "Archive destination %s for basefile %s version %s already exists!" % (dest, basefile, version))
        util.ensure_dir(dest)
        shutil.move(src, dest)
def make_api_files(self):
    """Create the API support files under resourcedir:

    - api/context.json (aliased to /json-ld/context.json if legacyapi)
    - api/terms.json   (aliased to /var/terms.json if legacyapi)
    - api/common.json  (aliased to /var/common.json if legacyapi)

    :returns: URL paths of the created files (the legacyapi ui app is
              copied but not included)
    """
    files = []
    context = os.sep.join([self.resourcedir, "api", "context.json"])
    if self.config.legacyapi:
        self.log.info("Creating API files for legacyapi")
        contextpath = "/json-ld/context.json"
        termspath = "/var/terms"
        commonpath = "/var/common"
    else:
        # FIXME: create correct URL path
        contextpath = "/rsrc/api/context.json"
        termspath = "/rsrc/api/terms.json"
        commonpath = "/rsrc/api/common.json"
    util.ensure_dir(context)
    contextdict = self._get_json_context()
    with open(context, "w") as fp:
        json.dump({"@context": contextdict}, fp,
                  separators=(', ', ': '), indent=4, sort_keys=True)
    files.append(self._filepath_to_urlpath(context, 2))
    common = os.sep.join([self.resourcedir, "api", "common.json"])
    terms = os.sep.join([self.resourcedir, "api", "terms.json"])
    for (filename, func, urlpath) in ((common, self._get_common_graph, commonpath),
                                      (terms, self._get_term_graph, termspath)):
        graph = func(self.config.url + urlpath[1:])
        data = json.loads(
            graph.serialize(format="json-ld", context=contextdict,
                            indent=4).decode("utf-8"))
        # data might not contain a @context (if contextdict == {}, ie
        # no repos are given)
        if '@context' in data:
            data['@context'] = contextpath
        if self.config.legacyapi:
            data = self._convert_legacy_jsonld(data,
                                               self.config.url + urlpath[1:])
        with open(filename, "w") as fp:
            json.dump(data, fp, indent=4, separators=(', ', ': '),
                      sort_keys=True)
        files.append(self._filepath_to_urlpath(filename, 2))
    if self.config.legacyapi:
        # copy ui explorer app to <url>/rsrc/ui/ -- this does not get
        # included in files
        self.resourceloader.extractdir(
            "ui", os.sep.join([self.resourcedir, "ui"]))
    return files
def save(self, path=None):
    """Saves the state of the documententry to a JSON file at *path*.

    If *path* is not provided, uses the path that the object was
    initialized with.
    """
    path = path or self._path  # _path better be there
    # Build a plain dict of all public attributes with an explicit
    # loop: a dict() call over __dict__.items() would, under
    # python-future, yield a newdict whose .keys() isn't a sortable
    # list and thus breaks json.dump(sort_keys=True).
    state = {}
    for (key, value) in self.__dict__.items():
        if not key.startswith("_"):
            state[key] = value
    if isinstance(self.summary, Literal) and self.summary.datatype == RDF.XMLLiteral:
        state["summary_type"] = "html"
    util.ensure_dir(path)
    with open(path, "w") as fp:
        fp.write(json.dumps(state,
                            default=util.json_default_date,
                            indent=2,
                            separators=(', ', ': '),
                            sort_keys=True))
def save(self, path=None):
    """Saves the state of the documententry to a JSON file at *path*.

    If *path* is not provided, uses the path that the object was
    initialized with.
    """
    if not path:
        path = self._path  # better be there
    # A concise dict((k, v) for ...) would yield a
    # future.types.newdict.newdict whose .keys() returns a
    # dictionary-keyiterator, which json.dump(sort_keys=True) cannot
    # sort -- so build a standard dict with an explicit loop.
    d = {}
    for (k, v) in self.__dict__.items():
        if k[0] != "_":
            d[k] = v
    # remember that the summary was XML so load() can restore it
    if isinstance(self.summary, Literal) and self.summary.datatype == RDF.XMLLiteral:
        d["summary_type"] = "html"
    util.ensure_dir(path)
    with open(path, "w") as outfp:
        outfp.write(json.dumps(d,
                               default=util.json_default_date,
                               indent=2,
                               separators=(', ', ': '),
                               sort_keys=True))
def test_parse(self):
    sample = self.repo.store.downloaded_path("sample")
    util.ensure_dir(sample)
    shutil.copy2("test/files/pdfreader/sample.pdf", sample)
    try:
        self.repo.parse("sample")
    except errors.ExternalCommandError:
        # for systems that don't have pdftohtml, we copy the expected
        # intermediate files, so that we can test the rest of the logic
        workdir = os.path.dirname(
            self.repo.store.intermediate_path("sample"))
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        shutil.copytree("test/files/pdfreader/intermediate", workdir)
        self.repo.parse("sample")
    datadir = self.repo.store.datadir
    for relpath in ('/intermediate/sample/index001.png',
                    '/intermediate/sample/index.pdf',
                    '/intermediate/sample/index.xml',
                    '/parsed/sample/index001.png',
                    '/parsed/sample/index.css',
                    '/parsed/sample/index.xhtml'):
        self.assertTrue(os.path.exists(datadir + relpath))
def setUp(self):
    super(TOC, self).setUp()
    # copy the canned resources.xml into this test's datadir
    resources = os.sep.join([self.datadir, "rsrc", "resources.xml"])
    util.ensure_dir(resources)
    src = "%s/files/base/rsrc/resources.xml" % os.path.dirname(__file__)
    shutil.copy2(src, resources)
def download_ftp(self, dirname, recurse, user, password, connection=None):
    """Download court decision files over FTP from *dirname*,
    optionally recursing into subdirectories.

    :param dirname: remote directory to list (relative to FTP root)
    :param recurse: whether to descend into subdirectories
    :param user: FTP username
    :param password: FTP password
    :param connection: an already-established FTP connection to reuse
                       (a new one to ftp.dom.se is made if None)
    """
    self.log.debug('Listing contents of %s' % dirname)
    lines = []
    if not connection:
        connection = FTP('ftp.dom.se')
        connection.login(user, password)
    connection.cwd(dirname)
    connection.retrlines('LIST', lines.append)
    for line in lines:
        parts = line.split()
        filename = parts[-1].strip()
        if line.startswith('d') and recurse:
            self.download(filename, recurse)
        elif line.startswith('-'):
            basefile = os.path.splitext(filename)[0]
            if dirname:
                basefile = dirname + "/" + basefile
            localpath = self.store.downloaded_path(basefile)
            if os.path.exists(localpath) and not self.config.force:
                pass  # we already got this
            else:
                util.ensure_dir(localpath)
                self.log.debug('Fetching %s to %s' % (filename, localpath))
                # open the target in a with-block so it is closed (and
                # flushed) before process_zipfile reads it -- the
                # original leaked the file object
                with open(localpath, 'wb') as fp:
                    connection.retrbinary('RETR %s' % filename, fp.write)
                self.process_zipfile(localpath)
    connection.cwd('/')
def graph_to_image(self, graph, imageformat, filename):
    """Write *graph* as a graphviz-rendered image of *imageformat* to
    *filename*."""
    import pydot
    import rdflib
    dot = pydot.Dot()
    # adapted from rdflib.util.graph_to_dot, adjusted to handle unicode
    node_labels = {}
    for s, o in graph.subject_objects():
        for term in (s, o):
            if term not in node_labels:
                if isinstance(term, rdflib.BNode):
                    node_labels[term] = repr(term)[7:]
                elif isinstance(term, rdflib.Literal):
                    node_labels[term] = repr(term)[16:-1]
                elif isinstance(term, rdflib.URIRef):
                    node_labels[term] = repr(term)[22:-2]
    for s, p, o in graph.triples((None, None, None)):
        edge = pydot.Edge(node_labels[s], node_labels[o],
                          label=repr(p)[22:-2])
        dot.add_edge(edge)
    self.log.debug("Writing %s format to %s" % (imageformat, filename))
    util.ensure_dir(filename)
    dot.write(path=filename, prog="dot", format=imageformat)
    self.log.debug("Wrote %s" % filename)
def metrics(self, metricspath=None, plotpath=None, startpage=0,
            pagecount=None, force=False):
    """Calculate metrics for the document. For multi-segment documents,
    computes metrics per segment and returns those of the first 'main'
    segment, with all non-main pages listed under 'excludedpages'.

    Also writes the metrics.json cache (to *metricspath*, if given),
    since the per-segment super() calls don't.
    """
    docsegments = self.documents
    if len(docsegments) == 1:
        return super(PropAnalyzer, self).metrics(metricspath, plotpath,
                                                 startpage, pagecount,
                                                 force)
    r = []
    exclude = []
    mainidx = None
    # loop variables renamed so they no longer clobber the
    # startpage/pagecount parameters (latent bug in the original)
    for idx, (seg_start, seg_count, tag) in enumerate(docsegments):
        r.append(super(PropAnalyzer, self).metrics(startpage=seg_start,
                                                   pagecount=seg_count))
        if tag != 'main':
            exclude.extend(list(range(seg_start, seg_start + seg_count)))
        elif mainidx is None:
            mainidx = idx
    r[mainidx]['excludedpages'] = exclude
    # since we don't pass metricspath to super().metrics, that func
    # does not create a metrics.json cache file. So we generate that
    # now (using the same data as we return). Guard against
    # metricspath being None (its default): open(None) would raise.
    if metricspath:
        util.ensure_dir(metricspath)
        with open(metricspath, "w") as fp:
            fp.write(json.dumps(r[mainidx], indent=4,
                                separators=(', ', ': '), sort_keys=True))
    return r[mainidx]
def archive(self, basefile, version):
    """Moves the current version of a document to an archive.

    All files related to the document are moved (downloaded, parsed,
    generated files and any existing attachment files).

    :param basefile: The basefile of the document to archive
    :type basefile: str
    :param version: The version id to archive under
    :type version: str
    """
    for meth in (self.downloaded_path, self.documententry_path,
                 self.parsed_path, self.serialized_path,
                 self.distilled_path, self.annotation_path,
                 self.generated_path):
        # FIXME: what about intermediate? Ignore them as they
        # should be able to be regenerated at any time?
        src, dest = meth(basefile), meth(basefile, version)
        if self.storage_policy == "dir" and meth in (self.downloaded_path,
                                                     self.parsed_path,
                                                     self.generated_path):
            # with dir storage, archive the containing directory so
            # attachments follow along
            src, dest = os.path.dirname(src), os.path.dirname(dest)
        if not os.path.exists(src):
            continue
        if os.path.exists(dest):
            raise errors.ArchivingError(
                "Archive destination %s for basefile %s version %s already exists!" % (dest, basefile, version))
        util.ensure_dir(dest)
        shutil.move(src, dest)
def open(self, basefile, maindir, suffix, mode="r",
         version=None, attachment=None):
    """
    Context manager that opens files for reading or writing. The
    parameters are the same as for :meth:`~ferenda.DocumentStore.path`,
    and the note is applicable here as well -- use
    :meth:`~ferenda.DocumentStore.open_downloaded`,
    :meth:`~ferenda.DocumentStore.open_parsed` et al if possible.

    Example:

    >>> store = DocumentStore(datadir="/tmp/base")
    >>> with store.open('123/a', 'parsed', '.xhtml', mode="w") as fp:
    ...     res = fp.write("hello world")
    >>> os.path.exists("/tmp/base/parsed/123/a.xhtml")
    True

    """
    filename = self.path(basefile, maindir, suffix, version, attachment)
    # All access goes through a temp file: writes only replace the real
    # file if the contents actually changed (preserving its mtime
    # otherwise). NOTE(review): this happens even for mode="r" --
    # presumably callers only use read modes via the open_* wrappers;
    # confirm.
    fp = NamedTemporaryFile(mode, delete=False)
    # remember the intended final destination on the fp object
    fp.realname = filename
    try:
        yield fp
    finally:
        tempname = fp.name
        fp.close()
        # replace the target only when the temp file's contents differ
        # (or the target doesn't exist yet); otherwise discard the temp
        if not os.path.exists(filename) or not filecmp.cmp(tempname, filename):
            util.ensure_dir(filename)
            shutil.move(tempname, filename)
        else:
            os.unlink(tempname)
def close(self, *args, **kwargs):
    # On close of a file opened for writing: move the temp file over
    # the real destination, but only when the contents differ, so the
    # destination keeps its mtime for unchanged writes.
    if "w" in self.mode:
        tempname = util.name_from_fp(self.fp)
        ret = self.fp.close()
        if not os.path.exists(self.filename) or not filecmp.cmp(
                tempname, self.filename):
            util.ensure_dir(self.filename)
            shutil.move(tempname, self.filename)
            # since _open uses NamedTemporaryFile, which creates
            # files only readable by the creating user, we need to
            # set more liberal permissions. FIXME: This should
            # respect os.umask()
            os.chmod(
                self.filename,
                stat.S_IRUSR | stat.S_IWUSR |
                stat.S_IRGRP | stat.S_IWGRP |
                stat.S_IROTH)
        else:
            # unchanged contents: discard the temp file
            os.unlink(tempname)
        return ret
    else:
        # This is needed sometimes since
        # Bzip2File/LZMAFile/GzipFile doesn't close the open file
        # objects that they wrap
        if hasattr(self.fp, '_fp'):
            # for Bzip2File/LZMAFile with IOBufferedReader
            self.fp._fp.close()
        if hasattr(self.fp, 'fileobj'):
            # for GzipFile in the same situation
            self.fp.fileobj.close()
        return self.fp.close()
def native_to_file(self, nativedata, outfile, doctype=None):
    """Serialize the etree *nativedata* to *outfile* as UTF-8 XML,
    optionally emitting *doctype*."""
    kwargs = {}
    if doctype:
        kwargs['doctype'] = doctype
    serialized = etree.tostring(nativedata, pretty_print=self.format,
                                encoding="utf-8", **kwargs)
    util.ensure_dir(outfile)
    with open(outfile, "wb") as fp:
        fp.write(serialized)
def makeimage(basename, label):
    # Return the path of a small grey label PNG, rendering it with
    # ImageMagick on first use (cached on disk afterwards).
    filename = "res/img/sfs/%s.png" % basename
    if os.path.exists(filename):
        return filename
    util.ensure_dir(filename)
    self.log.info("Creating img %s with label %s" % (filename, label))
    # NOTE(review): font/label are interpolated straight into a shell
    # command -- confirm they can never contain shell metacharacters
    cmd = 'convert -background transparent -fill Grey -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s' % (font, label, filename)
    util.runcmd(cmd)
    return filename
def _process_file(self, filename, buf, destdir, origin=""):
    """
    Helper function to concatenate or copy CSS/JS (optionally
    processing them with e.g. Scss) or other files to correct place
    under the web root directory.

    :param filename: The name (relative to the ferenda package) of the file
    :param buf: A buffer into which the contents of the file is written
                (if combineresources == True)
    :param destdir: The directory into which the file will be copied
                    (unless combineresources == True)
    :param origin: The source of the configuration that specifies this file
    :returns: The URL path of the resulting file, relative to the web
              root (or None if combineresources == True)
    :rtype: str
    """
    if filename.startswith(("http://", "https://")):
        if self.config.combineresources:
            raise errors.ConfigurationError(
                "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename)
        self.log.debug("Using external url %s" % filename)
        return filename
    try:
        srcfp = self.resourceloader.openfp(filename, binary=True)
    except errors.ResourceNotFound:
        self.log.warning("file %(filename)s (specified in %(origin)s)"
                         " doesn't exist" % locals())
        return None
    (base, ext) = os.path.splitext(filename)
    if self.config.combineresources:
        self.log.debug("combining %s into buffer" % filename)
        buf.write(srcfp.read())
        srcfp.close()
        return None
    # FIXME: don't copy (at least not log) if the outfile
    # already exists.
    outfile = destdir + os.sep + os.path.basename(filename)
    linktarget = None
    if os.path.islink(outfile):
        linktarget = os.path.relpath(
            os.path.join(os.path.dirname(outfile), os.readlink(outfile)))
    if linktarget == util.name_from_fp(srcfp):
        self.log.warning(
            "%s is a symlink to source file %s, won't overwrite" %
            (outfile, util.name_from_fp(srcfp)))
    else:
        util.ensure_dir(outfile)
        with open(outfile, "wb") as outfp:
            outfp.write(srcfp.read())
    srcfp.close()
    return self._filepath_to_urlpath(outfile, 2)
def makeimage(basename, label):
    # Lazily render (and cache on disk) a small gray label image via
    # ImageMagick's convert, returning its path.
    imgpath = "res/img/sfs/%s.png" % basename
    if os.path.exists(imgpath):
        return imgpath
    util.ensure_dir(imgpath)
    self.log.info("Creating img %s with label %s" % (imgpath, label))
    cmd = 'convert -background transparent -fill gray50 -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s' % (
        font, label, imgpath)
    util.runcmd(cmd)
    return imgpath
def download(self, basefile=None):
    """Collect all "term sets" (used dcterms:subject objects, wiki
    pages describing legal concepts, swedish wikipedia pages...) and
    write one YAML file per sanitized term, but only when the term's
    data changed."""
    terms = defaultdict(dict)
    # 1) Query the triplestore for all dcterms:subject triples (is this
    # semantically sensible for a "download" action -- the content
    # isn't really external?) -- term set "subjects" (these come
    # from both court cases and legal definitions in law text)
    sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?uri ?subject ?label
        WHERE {
            {?uri dcterms:subject ?subject . }
            OPTIONAL {?subject rdfs:label ?label . }
        }
    """
    store = TripleStore.connect(self.config.storetype,
                                self.config.storelocation,
                                self.config.storerepository)
    results = store.select(sq, "python")
    for row in results:
        if 'label' in row:
            label = row['label']
        else:
            label = self.basefile_from_uri(row['subject'])
        if label is None:
            self.log.warning("could not determine keyword from %s" % row['subject'])
            continue
        sanitized = self.sanitize_term(label)
        if sanitized:
            if sanitized not in terms:
                terms[sanitized]['subjects'] = []
            terms[sanitized]['subjects'].append(row['uri'])
    self.log.debug("Retrieved %s subject terms from triplestore" % len(terms))
    for termset_func in self.termset_funcs:
        termset_func(terms)
    for term in terms:
        term = self.sanitize_term(term)
        if not term:
            continue
        oldterms = ""
        termpath = self.store.downloaded_path(term)
        if os.path.exists(termpath):
            # safe_load: plain yaml.load without a Loader is deprecated
            # and unsafe; the file was written by yaml.dump with
            # standard tags only, so safe_load suffices
            oldterms = yaml.safe_load(util.readfile(termpath))
        if terms[term] != oldterms:
            util.ensure_dir(termpath)
            util.writefile(termpath, yaml.dump(terms[term],
                                               default_flow_style=False))
            self.log.info("%s: in %s termsets" % (term, len(terms[term])))
        else:
            self.log.debug("%s: skipped" % term)
def test_feed_param(self):
    # A specific feed can be selected with the title query parameter
    feed_content = b"<!-- specific feed goes here -->"
    feedpath = self.repo.store.resourcepath("feed/a.atom")
    util.ensure_dir(feedpath)
    with open(feedpath, "wb") as fp:
        fp.write(feed_content)
    self.env["PATH_INFO"] = "/dataset/base/feed.atom?title=a"
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK",
                        {'Content-Type': 'application/atom+xml'},
                        feed_content,
                        status, headers, content)
def test_load(self):
    # Round-trip: write the canned JSON to the entry path, load it,
    # and verify the parsed fields.
    entry_path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(entry_path)
    with open(entry_path, "w") as fp:
        fp.write(self.basic_json)
    entry = DocumentEntry(path=entry_path)
    self.assertEqual(datetime(2013, 3, 27, 20, 46, 37), entry.orig_checked)
    self.assertIsNone(entry.orig_updated)
    self.assertEqual('http://source.example.org/doc/123/a', entry.orig_url)
    self.assertEqual('http://example.org/123/a', entry.id)
    self.assertEqual('<DocumentEntry id=http://example.org/123/a>',
                     repr(entry))
def native_to_file(self, nativedata, outfile, doctype=None):
    """Write the etree *nativedata* to *outfile* (UTF-8 XML); pass
    *doctype* through to the serializer when given."""
    extra = {'doctype': doctype} if doctype else {}
    util.ensure_dir(outfile)
    with open(outfile, "wb") as fp:
        fp.write(etree.tostring(nativedata,
                                pretty_print=self.format,
                                encoding="utf-8",
                                **extra))
def test_load_status(self):
    entry_path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(entry_path)
    with open(entry_path, "w") as fp:
        fp.write(self.status_json)
    entry = DocumentEntry(path=entry_path)
    # every per-action 'date' value is parsed into a datetime...
    for action, minute in (("download", 15), ("parse", 16),
                           ("relate", 17), ("generate", 18)):
        self.assertEqual(datetime(2018, 8, 14, 18, minute, 00),
                         entry.status[action]['date'])
    # ...but only keys named exactly 'date' get converted
    self.assertEqual("2018-08-14T18:18:00",
                     entry.status['generate']['not_a_date'])
def extractdir(self, resourcedir, target, suffixes=None):
    """Extract all file resources contained in the specified
    resource directory to the target directory.

    Searches all loadpaths and optionally the Resources API for
    any file contained within. This means the target dir may end
    up with eg. one file from a high-priority path and other files
    from the system dirs/resources. This in turns makes it easy to
    just override a single file in a larger set of resource files.

    Even if the resourcedir might contain resources in
    subdirectories (eg "source/sub/dir/resource.xml"), the
    extraction will be to the top-level target directory (eg
    "target/resource.xml").
    """
    suffixes = suffixes or []
    extracted = set()
    for searchpath in self.loadpath:
        if resourcedir and resourcedir != ".":
            searchpath = searchpath + os.sep + resourcedir
        if not os.path.exists(searchpath):
            continue
        for relname in util.list_dirs(searchpath, suffixes):
            relname = relname[len(searchpath) + 1:]
            src = os.sep.join([searchpath, relname])
            dest = os.sep.join([target, os.path.basename(relname)])
            if dest not in extracted and os.path.isfile(src):
                util.ensure_dir(dest)
                shutil.copy2(src, dest)
                extracted.add(dest)
    if self.use_pkg_resources:
        self._check_module_path()
        pkgpath = self.resourceprefix
        if resourcedir:
            pkgpath = pkgpath + os.sep + resourcedir
        for fname in pkg_resources.resource_listdir(self.modulename, pkgpath):
            src = pkgpath + os.sep + fname
            dest = target + os.sep + fname
            # NOTE(review): the isdir check is against resourceprefix,
            # not pkgpath (which includes resourcedir) -- looks
            # inconsistent with src; confirm intent
            if (dest not in extracted and
                    not pkg_resources.resource_isdir(
                        self.modulename,
                        self.resourceprefix + os.sep + fname)):
                util.ensure_dir(dest)
                with open(dest, "wb") as fp:
                    readfp = pkg_resources.resource_stream(self.modulename, src)
                    fp.write(readfp.read())
                    readfp.close()
                extracted.add(dest)
def test_load(self):
    # Write known-good JSON, then check that DocumentEntry parses
    # dates, leaves absent fields as None, and has a sane repr.
    path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(path)
    with open(path, "w") as fp:
        fp.write(self.basic_json)
    d = DocumentEntry(path=path)
    self.assertEqual(d.orig_checked, datetime(2013, 3, 27, 20, 46, 37))
    self.assertIsNone(d.orig_updated)
    self.assertEqual(d.orig_url, 'http://source.example.org/doc/123/a')
    self.assertEqual(d.id, 'http://example.org/123/a')
    self.assertEqual('<DocumentEntry id=http://example.org/123/a>', repr(d))
def extractdir(self, resourcedir, target, suffixes=None):
    """Extract all file resources contained in the specified
    resource directory to the target directory.

    Searches all loadpaths and optionally the Resources API for
    any file contained within. This means the target dir may end
    up with eg. one file from a high-priority path and other
    files from the system dirs/resources. This in turns makes it
    easy to just override a single file in a larger set of
    resource files.

    Even if the resourcedir might contain resources in
    subdirectories (eg "source/sub/dir/resource.xml"), the
    extraction will be to the top-level target directory (eg
    "target/resource.xml").
    """
    if not suffixes:
        suffixes = []
    extracted = set()
    # first pass: the filesystem loadpath (highest priority first)
    for path in self.loadpath:
        if resourcedir and resourcedir != ".":
            path = path + os.sep + resourcedir
        if not os.path.exists(path):
            continue
        for f in util.list_dirs(path, suffixes):
            f = f[len(path) + 1:]
            src = os.sep.join([path, f])
            dest = os.sep.join([target, os.path.basename(f)])
            if dest in extracted or not os.path.isfile(src):
                continue
            util.ensure_dir(dest)
            shutil.copy2(src, dest)
            extracted.add(dest)
    # second pass: packaged resources, for anything not already copied
    if self.use_pkg_resources:
        self._check_module_path()
        path = self.resourceprefix
        if resourcedir:
            path = path + os.sep + resourcedir
        for f in pkg_resources.resource_listdir(self.modulename, path):
            src = path + os.sep + f
            dest = target + os.sep + f
            if dest in extracted:
                continue
            # NOTE(review): isdir is checked against resourceprefix
            # rather than path -- verify this is intended
            if pkg_resources.resource_isdir(
                    self.modulename, self.resourceprefix + os.sep + f):
                continue
            util.ensure_dir(dest)
            with open(dest, "wb") as fp:
                readfp = pkg_resources.resource_stream(self.modulename, src)
                fp.write(readfp.read())
                readfp.close()
            extracted.add(dest)
def wrapper(self, doc):
    # Run the wrapped parse function, serialize the resulting document
    # to XHTML+RDFa and RDF/XML, then verify that every metadata triple
    # survived the serialization round-trip.
    ret = f(self, doc)
    updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile))
    if updated:
        self.log.debug("%s: Created %s" % (doc.basefile,
                                           self.store.parsed_path(doc.basefile)))
    # css file + background images + png renderings of text
    self.create_external_resources(doc)
    # Validate that all triples specified in doc.meta and any
    # .meta property on any body object is present in the
    # XHTML+RDFa file.
    distilled_graph = Graph()
    with codecs.open(self.store.parsed_path(doc.basefile),
                     encoding="utf-8") as fp:  # unicode
        distilled_graph.parse(data=fp.read(), format="rdfa",
                              publicID=doc.uri)
    # The act of parsing from RDFa binds a lot of namespaces
    # in the graph in an unneccesary manner. Particularly it
    # binds both 'dc' and 'dcterms' to
    # 'http://purl.org/dc/terms/', which makes serialization
    # less than predictable. Blow these prefixes away.
    distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
    distilled_graph.bind(
        "dcterms",
        URIRef("http://example.org/this-prefix-should-not-be-used"))
    util.ensure_dir(self.store.distilled_path(doc.basefile))
    with open(self.store.distilled_path(doc.basefile),
              "wb") as distilled_file:
        distilled_graph.serialize(distilled_file, format="pretty-xml")
    self.log.debug('%s: %s triples extracted to %s',
                   doc.basefile, len(distilled_graph),
                   self.store.distilled_path(doc.basefile))
    # fold graphs attached to body objects into doc.meta, then remove
    # everything found in the serialized file; whatever remains was
    # lost during serialization
    for g in iterate_graphs(doc.body):
        doc.meta += g
    for triple in distilled_graph:
        doc.meta.remove(triple)
    if doc.meta:
        self.log.warning("%s: %d triple(s) from the original metadata was "
                         "not found in the serialized XHTML file:\n%s",
                         doc.basefile, len(doc.meta),
                         doc.meta.serialize(format="nt").decode('utf-8').strip())
    return ret
def _process_file(self, filename, buf, destdir, origin=""):
    """
    Helper function to concatenate or copy CSS/JS (optionally
    processing them with e.g. Scss) or other files to correct place
    under the web root directory.

    :param filename: The name (relative to the ferenda package) of the file
    :param buf: A buffer into which the contents of the file is written
                (if combineresources == True)
    :param destdir: The directory into which the file will be copied
                    (unless combineresources == True)
    :param origin: The source of the configuration that specifies this file
    :returns: The URL path of the resulting file, relative to the web
              root (or None if combineresources == True)
    :rtype: str
    """
    # external URLs are passed through untouched (and cannot be combined)
    if filename.startswith("http://") or filename.startswith("https://"):
        if self.config.combineresources:
            raise errors.ConfigurationError(
                "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename)
        self.log.debug("Using external url %s" % filename)
        return filename
    try:
        fp = self.resourceloader.openfp(filename, binary=True)
    except errors.ResourceNotFound:
        self.log.warning("file %(filename)s (specified in %(origin)s)"
                         " doesn't exist" % locals())
        return None
    (base, ext) = os.path.splitext(filename)
    if self.config.combineresources:
        self.log.debug("combining %s into buffer" % filename)
        contents = fp.read()
        buf.write(contents)
        fp.close()
        return None
    # FIXME: don't copy (at least not log) if the outfile
    # already exists.
    outfile = destdir + os.sep + os.path.basename(filename)
    is_source_link = (
        os.path.islink(outfile) and
        os.path.relpath(os.path.join(os.path.dirname(outfile),
                                     os.readlink(outfile))) ==
        util.name_from_fp(fp))
    if is_source_link:
        self.log.warning("%s is a symlink to source file %s, won't overwrite" %
                         (outfile, util.name_from_fp(fp)))
    else:
        util.ensure_dir(outfile)
        with open(outfile, "wb") as fp2:
            fp2.write(fp.read())
    fp.close()
    return self._filepath_to_urlpath(outfile, 2)
def plot(self, filename, margincounters, stylecounters, metrics):
    """Render histograms of the collected margin counters plus a style
    plot, laid out on a grid, and save the figure to *filename*.

    :param filename: path of the image file to write (via ``plt.savefig``)
    :param margincounters: mapping of counter name to counted values; must
                           contain ``pagewidth``/``pageheight`` entries,
                           which are consumed (deleted) by this method
    :param stylecounters: style counters, passed on to ``plot_styles``
    :param metrics: metrics passed on to the subplot helpers
    :raises ImportError: if matplotlib is not installed
    :raises ValueError: if the number of margin counters (after removing
                        the page size counters) is not 4 or 6
    """
    try:
        import matplotlib
        # non-interactive backend: we only write image files
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError("You need matplotlib installed")
    # plt.style.use('ggplot')  # looks good but makes histograms unreadable
    matplotlib.rcParams.update({'font.size': 8})
    # width, height in inches
    # NOTE: len() here still includes the pagewidth/pageheight entries,
    # which are deleted just below
    plt.figure(figsize=((len(margincounters)) * 2, 7))
    # if 6 counters:
    # +0,0--+ +0,1--+ +0,2--+ +0,3--+
    # | LM  | | LEM | | RM  | | REM |
    # +-----+ +-----+ +-----+ +-----+
    # +1,0--+ +1,1--+ +1,2 colspan=2+
    # | TM  | | BM  | |   Styles   |
    # +-----+ +-----+ +-------------+
    #
    # if 4 counters:
    # +0,0--+ +0,1--+ +0,2--+
    # | LM  | | RM  | | TM  |
    # +-----+ +-----+ +-----+
    # +1,0--+ +1,1 colspan=2+
    # | BM  | |   Styles   |
    # +-----+ +-------------+

    # disregard the pageheight/pagewidth counters
    pagewidth = max(margincounters['pagewidth'])
    del margincounters['pagewidth']
    pageheight = max(margincounters['pageheight'])
    del margincounters['pageheight']
    if len(margincounters) == 4:
        coords = ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1))
        grid = (2, 3)
    elif len(margincounters) == 6:
        coords = ((0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2))
        grid = (2, 4)
    else:
        # FIXME: make this dynamic
        raise ValueError("Can't layout other # of counters than 4 or 6")
    # the last coordinate is reserved for the double-width style plot
    marginplots = [plt.subplot2grid(grid, pos) for pos in coords[:-1]]
    self.plot_margins(marginplots, margincounters, metrics, pagewidth,
                      pageheight)
    styleplot = plt.subplot2grid(grid, coords[-1], colspan=2)
    self.plot_styles(styleplot, stylecounters, metrics, plt)
    util.ensure_dir(filename)
    plt.savefig(filename, dpi=150)
    self.log.debug("wrote %s" % filename)
def make_api_files(self):
    """Create the JSON(-LD) support files (context, terms and common
    data) used by the REST API under ``<resourcedir>/api/``.

    :returns: URL paths (relative to the web root) of the created files
              (the legacyapi ui directory is extracted but not included)
    :rtype: list
    """
    # this should create the following files under resourcedir
    #   api/context.json (aliased to /json-ld/context.json if legacyapi)
    #   api/terms.json (aliased to /var/terms.json if legacyapi)
    #   api/common.json (aliased to /var/common.json if legacyapi)
    # MAYBE api/ui/ - copied from ferenda/res/ui
    files = []
    context = os.sep.join([self.resourcedir, "api", "context.json"])
    if self.config.legacyapi:
        self.log.info("Creating API files for legacyapi")
        contextpath = "/json-ld/context.json"
        termspath = "/var/terms"
        commonpath = "/var/common"
    else:
        # FIXME: create correct URL path
        contextpath = "/rsrc/api/context.json"
        termspath = "/rsrc/api/terms.json"
        commonpath = "/rsrc/api/common.json"
    util.ensure_dir(context)
    with open(context, "w") as fp:
        contextdict = self._get_json_context()
        s = json.dumps({"@context": contextdict}, separators=(', ', ': '),
                       indent=4, sort_keys=True)
        fp.write(s)
    files.append(self._filepath_to_urlpath(context, 2))
    common = os.sep.join([self.resourcedir, "api", "common.json"])
    terms = os.sep.join([self.resourcedir, "api", "terms.json"])
    # serialize the term and common graphs as JSON-LD, pointing their
    # @context at the context file created above
    for (filename, func, urlpath) in ((common, self._get_common_graph, commonpath),
                                      (terms, self._get_term_graph, termspath)):
        g = func(self.config.url + urlpath[1:])
        d = json.loads(g.serialize(format="json-ld", context=contextdict,
                                   indent=4).decode("utf-8"))
        # d might not contain a @context (if contextdict == {}, ie
        # no repos are given)
        if '@context' in d:
            d['@context'] = contextpath
        if self.config.legacyapi:
            d = self._convert_legacy_jsonld(d, self.config.url + urlpath[1:])
        with open(filename, "w") as fp:
            s = json.dumps(d, indent=4, separators=(', ', ': '),
                           sort_keys=True)
            fp.write(s)
        files.append(self._filepath_to_urlpath(filename, 2))
    if self.config.legacyapi:
        # copy ui explorer app to <url>/rsrc/ui/ -- this does not get
        # included in files
        targetdir = os.sep.join([self.resourcedir, "ui"])
        self.resourceloader.extractdir("ui", targetdir)
    return files
def plot(self, filename, margincounters, stylecounters, metrics):
    """Draw one histogram per margin counter plus a double-width style
    plot on a grid layout, then save the figure to *filename*."""
    try:
        import matplotlib
        matplotlib.use('Agg')  # file output only, no display needed
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError("You need matplotlib installed")
    matplotlib.rcParams.update({'font.size': 8})
    # Figure size (inches) scales with the counter count; note that the
    # page-size counters are still present at this point.
    plt.figure(figsize=(len(margincounters) * 2, 7))
    # The page dimensions are not plotted; pull them out of the mapping,
    # keeping only their maxima for the margin subplots.
    pagewidth = max(margincounters.pop('pagewidth'))
    pageheight = max(margincounters.pop('pageheight'))
    # Grid layouts keyed by remaining counter count. The final position
    # is reserved for the double-width style plot:
    #
    # 6 counters:               4 counters:
    # | LM | LEM | RM | REM |   | LM | RM | TM |
    # | TM | BM  | Styles   |   | BM | Styles   |
    layouts = {
        4: ((2, 3), ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1))),
        6: ((2, 4), ((0, 0), (0, 1), (0, 2), (0, 3),
                     (1, 0), (1, 1), (1, 2))),
    }
    if len(margincounters) not in layouts:
        # FIXME: make this dynamic
        raise ValueError("Can't layout other # of counters than 4 or 6")
    grid, positions = layouts[len(margincounters)]
    marginaxes = [plt.subplot2grid(grid, pos) for pos in positions[:-1]]
    self.plot_margins(marginaxes, margincounters, metrics,
                      pagewidth, pageheight)
    styleaxis = plt.subplot2grid(grid, positions[-1], colspan=2)
    self.plot_styles(styleaxis, stylecounters, metrics, plt)
    util.ensure_dir(filename)
    plt.savefig(filename, dpi=150)
    self.log.debug("wrote %s" % filename)
def test_dataset_param(self):
    # Requesting /dataset/base?title=a should serve the pregenerated
    # TOC page verbatim as HTML.
    util.ensure_dir(self.repo.store.generated_path("123/a"))
    body = b"<!-- specific toc page goes here -->"
    with open(self.repo.store.resourcepath("toc/title/a.html"), "wb") as outfp:
        outfp.write(body)
    self.env["PATH_INFO"] = "/dataset/base?title=a"
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK",
                        {'Content-Type': 'text/html; charset=utf-8'},
                        body,
                        status, headers, content)
def test_attachment_param(self):
    # With "dir" storage policy, ?attachment=index.css should return the
    # attachment bytes with a text/css content type.
    self.repo.store.storage_policy = "dir"
    util.ensure_dir(self.repo.store.generated_path("123/a"))
    payload = b"/* css data goes here */"
    attachmentpath = self.repo.store.generated_path("123/a",
                                                    attachment="index.css")
    with open(attachmentpath, "wb") as outfp:
        outfp.write(payload)
    self.env["PATH_INFO"] = "/res/base/123/a?attachment=index.css"
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK", {'Content-Type': 'text/css'}, payload,
                        status, headers, content)
def close(self, *args, **kwargs):
    """Close the wrapped file. For write modes, move the temp file into
    place only when its contents differ from the existing target file;
    otherwise discard it, leaving the target untouched."""
    if "w" not in self.mode:
        # read-only: nothing to move into place
        return self.fp.close()
    scratch = util.name_from_fp(self.fp)
    result = self.fp.close()
    unchanged = (os.path.exists(self.filename) and
                 filecmp.cmp(scratch, self.filename))
    if unchanged:
        os.unlink(scratch)
    else:
        util.ensure_dir(self.filename)
        shutil.move(scratch, self.filename)
    return result
def test_feed_param(self):
    # /dataset/base/feed.atom?title=a should serve the pregenerated feed
    # with an Atom content type.
    feeddata = b"<!-- specific feed goes here -->"
    feedpath = self.repo.store.resourcepath("feed/a.atom")
    util.ensure_dir(feedpath)
    with open(feedpath, "wb") as outfp:
        outfp.write(feeddata)
    self.env["PATH_INFO"] = "/dataset/base/feed.atom?title=a"
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK",
                        {'Content-Type': 'application/atom+xml'},
                        feeddata,
                        status, headers, content)
def test_dataset_param(self):
    """The builder-based request for /dataset/base?title=a must return
    the pregenerated TOC page as HTML."""
    util.ensure_dir(self.repo.store.generated_path("123/a"))
    tocbytes = b"<!-- specific toc page goes here -->"
    with open(self.repo.store.resourcepath("toc/title/a.html"), "wb") as fh:
        fh.write(tocbytes)
    self.builder.path = "/dataset/base"
    self.builder.query_string = "title=a"
    status, headers, content = self.call_wsgi()
    expected = ("200 OK",
                {'Content-Type': 'text/html; charset=utf-8'},
                tocbytes)
    self.assertResponse(expected[0], expected[1], expected[2],
                        status, headers, content)
def test_attachment_param(self):
    """Request an attachment of a dir-stored document and verify it is
    returned verbatim as CSS."""
    self.repo.store.storage_policy = "dir"
    util.ensure_dir(self.repo.store.generated_path("123/a"))
    cssbytes = b"/* css data goes here */"
    with open(self.repo.store.generated_path("123/a",
                                             attachment="index.css"),
              "wb") as fh:
        fh.write(cssbytes)
    self.env["PATH_INFO"] = "/res/base/123/a?attachment=index.css"
    status, headers, content = self.call_wsgi(self.env)
    expected = ("200 OK", {'Content-Type': 'text/css'}, cssbytes)
    self.assertResponse(expected[0], expected[1], expected[2],
                        status, headers, content)
def GenerateMap(self, basefile):
    """Append a mapping from the document's ``xml:base`` URI fragment to
    *basefile* in the generated ``uri.map`` file.

    Reads the head of the intermediate XML file for *basefile* and looks
    for an ``xml:base`` attribute via ``self.re_xmlbase``; if none is
    found, a warning is logged and nothing is written.

    :param basefile: the basefile whose mapping should be recorded
    """
    infile = os.path.relpath(self._xmlFileName(basefile))
    # FIX: close the input handle deterministically (the old code leaked
    # the codecs.open() file object); 1K is enough to find xml:base
    with codecs.open(infile, encoding='utf-8') as fp:
        head = fp.read(1024)
    m = self.re_xmlbase(head)
    if m:
        mapfile = self.store.path('generated', 'uri.map', '.new')
        util.ensure_dir(mapfile)
        # append this mapping -- the map file is built incrementally,
        # one line per basefile; the with-block guarantees the handle is
        # closed even if the write fails
        with codecs.open(mapfile, 'a', encoding='iso-8859-1') as f:
            f.write("%s\t%s\n" % (m.group(1), basefile))
        self.log.info("%s ok" % basefile)
    else:
        self.log.warning("could not find xml:base in %s" % infile)
def test_load_status(self):
    # Write the status fixture to disk, load it as a DocumentEntry, and
    # check that each action's timestamp was deserialized to a datetime.
    entrypath = self.repo.store.documententry_path("123/a")
    util.ensure_dir(entrypath)
    with open(entrypath, "w") as outfp:
        outfp.write(self.status_json)
    entry = DocumentEntry(path=entrypath)
    for minute, action in ((15, 'download'), (16, 'parse'),
                           (17, 'relate'), (18, 'generate')):
        self.assertEqual(datetime(2018, 8, 14, 18, minute, 0),
                         entry.status[action]['date'])
    # a value under a non-date key must remain a plain string
    self.assertEqual("2018-08-14T18:18:00",
                     entry.status['generate']['not_a_date'])
def write_doc(basefile, page_el):
    # Serialize page_el into the downloaded path for basefile, writing
    # only when the file is missing or its content has changed, then
    # mark the basefile as handled.
    dest = self.store.downloaded_path(basefile)
    serialized = etree.tostring(page_el, encoding="utf-8")
    unchanged = (os.path.exists(dest) and
                 util.readfile(dest, "rb") == serialized)
    if not unchanged:
        util.ensure_dir(dest)
        with open(dest, "wb") as outfp:
            outfp.write(serialized)
        self.log.info("%s: extracting from XML dump" % basefile)
    if basefile in basefiles:
        basefiles.remove(basefile)
def test_modify(self):
    # Load a basic entry from disk, mutate a couple of fields, attach an
    # inline xhtml fragment, save, and compare the resulting JSON with
    # the expected "modified" fixture.
    entrypath = self.repo.store.documententry_path("123/a")
    util.ensure_dir(entrypath)
    with open(entrypath, "w") as outfp:
        outfp.write(self.basic_json)
    entry = DocumentEntry(path=entrypath)
    entry.orig_updated = datetime(2013, 3, 27, 20, 59, 42, 325067)
    entry.id = "http://example.org/123/a"
    # do this in setUp?
    fragmentpath = self.datadir + "/xhtml"
    with open(fragmentpath, "w") as outfp:
        outfp.write("<div>xhtml fragment</div>")
    entry.set_content(fragmentpath, "http://example.org/test",
                      mimetype="xhtml", inline=True)
    entry.save()
    self.assertEqual(self.d2u(util.readfile(entrypath)), self.modified_json)
def writegraph(graph, dest, operation="transformed"):
    """Serialize *graph* as turtle to *dest*, prefixed by a timestamped
    header comment, skipping the write when the triples are unchanged.

    The comparison ignores the header (everything up to the first blank
    line), so re-running with identical data does not touch the file.

    :param graph: the rdflib Graph to serialize
    :param dest: destination file path
    :param operation: word used in the header comment (e.g. "transformed")
    """
    util.ensure_dir(dest)
    olddata = ""
    if os.path.exists(dest):
        # FIX: tolerate existing files that lack the blank-line header
        # separator -- the old unconditional [1] raised IndexError and
        # treated such files as unreadable instead of just stale
        parts = util.readfile(dest).split("\n\n", 1)
        if len(parts) > 1:
            olddata = parts[1]
    newdata = graph.serialize(format="turtle").decode("utf-8")
    if newdata != olddata:
        with open(dest, "w") as fp:
            header = "# Automatically %s from sources at %s\n\n" % (
                operation, datetime.now().isoformat())
            fp.write(header)
            fp.write(newdata)
        print("Wrote %s triples to %s" % (len(graph), dest))
    else:
        print("%s is unchanged" % dest)
def download(self, basefile=None, url=None):
    """Download documents: either one named *basefile* or every document
    published since the last download.

    :param basefile: If given, download only this document
    :param url: Page URL for *basefile*; if omitted, the URL recorded in
                the existing document entry is used
    :raises DownloadError: if a single basefile is requested but no page
                           URL can be determined
    """
    if basefile:
        if not url:
            # fall back on the URL recorded at the previous download
            entry = DocumentEntry(self.store.documententry_path(basefile))
            url = entry.orig_url
        if url:
            return self.download_single(basefile, url)
        else:
            raise DownloadError(
                "%s doesn't support downloading single basefiles w/o page URL" %
                self.__class__.__name__)
    # query parameters for the site's document search endpoint
    params = {
        'filterType': 'Taxonomy',
        'filterByType': 'FilterablePageBase',
        'preFilteredCategories': '1324',
        'rootPageReference': '0',
        'filteredContentCategories': self.document_type
    }
    if 'lastdownload' in self.config and not self.config.refresh:
        # incremental mode: only ask for documents published since the
        # previous run
        params['fromDate'] = self.config.lastdownload.strftime("%Y-%m-%d")
    # temporary test -- useful when troubleshooting behaviour related to malformed entries in the search result list
    # params['fromDate'] = "2009-05-13"
    # params['toDate'] = "2009-05-20"
    self.log.debug("Loading documents starting from %s" %
                   params.get('fromDate', "the beginning"))
    try:
        for basefile, url in self.download_get_basefiles(params):
            try:
                # sleep(0.5)  # regeringen.se has a tendency to throw 400 errors, maybe because we're too quick?
                self.download_single(basefile, url)
            except requests.exceptions.HTTPError as e:
                if self.download_accept_404 and e.response.status_code == 404:
                    # tolerated: log and move on to the next document
                    self.log.error("%s: %s %s" % (basefile, url, e))
                    # NOTE(review): ret is assigned but never read or
                    # returned -- looks like a leftover; confirm intent
                    ret = False
                else:
                    raise e
    finally:
        # always persist the url -> identifier map, even if the download
        # loop above was interrupted
        urlmap_path = self.store.path("urls", "downloaded", ".map",
                                      storage_policy="file")
        util.ensure_dir(urlmap_path)
        with codecs.open(urlmap_path, "w", encoding="utf-8") as fp:
            for url, identifier in self.urlmap.items():
                fp.write("%s\t%s\n" % (url, identifier))
def test_modify(self):
    """Round-trip a DocumentEntry: read the basic fixture, modify fields,
    attach inline content, save, and compare with the modified fixture."""
    path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(path)
    with open(path, "w") as fh:
        fh.write(self.basic_json)
    doc = DocumentEntry(path=path)
    doc.orig_updated = datetime(2013, 3, 27, 20, 59, 42, 325067)
    doc.id = "http://example.org/123/a"
    # do this in setUp?
    xhtmlfile = "%s/xhtml" % self.datadir
    with open(xhtmlfile, "w") as fh:
        fh.write("<div>xhtml fragment</div>")
    doc.set_content(xhtmlfile, "http://example.org/test",
                    mimetype="xhtml", inline=True)
    doc.save()
    got = self.d2u(util.readfile(path))
    self.assertEqual(got, self.modified_json)
def word_to_ooxml(self, indoc, outdoc):
    """Extracts the raw OOXML file from a modern Word document (.docx).

    :param indoc: Path to the .docx (zip) archive to read
    :param outdoc: Path where the extracted word/document.xml is written;
                   its atime/mtime are set to the archive member's
                   recorded timestamp
    :raises AssertionError: if the archive contains no word/document.xml
    """
    name = "word/document.xml"
    # FIX: use the ZipFile as a context manager so the archive handle is
    # always closed (the old code leaked it, also when the assert fired)
    with zipfile.ZipFile(indoc, "r") as zipf:
        assert name in zipf.namelist(), "No %s in zipfile %s" % (name, indoc)
        data = zipf.read(name)
        zi = zipf.getinfo(name)
    util.ensure_dir(outdoc)
    with open(outdoc, "wb") as fp:
        fp.write(data)
    # FIXME: We need to reimplement this old function (which ran
    # tidy on the outfile) with an internal lxml based thingy
    # util.indent_xml_file(outdoc)
    # stamp the output file with the archive member's modification time
    dt = datetime(*zi.date_time)
    ts = mktime(dt.timetuple())
    os.utime(outdoc, (ts, ts))
def add_downloaded_files(filelist, spec, url):
    # Scan the download directory for files that appeared as a result of
    # fetching *url*: register them in filelist, record the expected
    # relative path in spec[url], and copy them next to the spec file.
    downloaddir = os.sep.join(
        [self.datadir, self.repoclass.alias, "downloaded"])
    for f in list(util.list_dirs(downloaddir)):
        if f.endswith(".etag"):
            continue  # FIXME: this is ugly
        if f not in filelist:
            # print("Fetching %s resulted in downloaded file %s" % (url, f))
            filelist.append(f)
            # expected path relative to the download dir, normalized to
            # forward slashes on non-POSIX platforms
            expect = "downloaded" + f.replace(downloaddir, "")
            if os.sep != "/":
                expect = expect.replace(os.sep, "/")
            spec[url]['expect'] = expect
            reldest = os.path.relpath(".." + os.sep + "downloaded",
                                      os.path.dirname(f))
            dest = os.path.normpath(
                os.path.join(os.path.dirname(specfile), reldest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
def download(self):
    """Split the sitenews source file (one text file containing several
    dated news items) into one file per item in the download store.

    Each item starts with a subject line matching
    ``self.re_news_subjectline``, whose timestamp (as a unix epoch
    string) becomes the item's basefile; every subsequent line is
    appended to that item's file.
    """
    # do something with static/sitenews.txt --> split into
    # <datadir>/sitenews/<timestamp>.txt
    ofp = None
    with codecs.open(self.resourceloader.filename(self.config.newsfile),
                     encoding="utf-8") as fp:
        for line in fp:
            m = self.re_news_subjectline(line)
            if m:
                # a new item starts here: finish the previous one
                if ofp:
                    ofp.close()
                d = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                basefile = str(int(d.timestamp()))
                path = self.store.downloaded_path(basefile)
                self.log.info("%s: creating news item" % basefile)
                util.ensure_dir(path)
                ofp = codecs.open(path, "w", encoding="utf-8")
            # FIX: skip any lines occurring before the first subject
            # line (previously an AttributeError on ofp == None)
            if ofp:
                ofp.write(line)
    # FIX: only close if at least one item was found -- the old
    # unconditional ofp.close() crashed on an itemless newsfile
    if ofp:
        ofp.close()
def put_files_in_place(self):
    """Set up three test repos and, for each, copy pre-parsed fixture
    documents into place, distill their RDFa metadata, and index
    everything into the triplestore/fulltext index."""
    self.repos = []
    for repoclass in DocRepo1, DocRepo2, DocRepo3:
        repo = repoclass(datadir=self.datadir,
                         storetype=self.storetype,
                         storelocation=self.storelocation,
                         storerepository=self.storerepository,
                         indextype=self.indextype,
                         indexlocation=self.indexlocation)
        self.repos.append(repo)

    # NOTE: calling repo.relate(basefile, self.repos) will reorder
    # self.repos in MRU order. This is for efficency, but might
    # cause a change in the list we iterate over. So by wrapping
    # in list(), we create a temporary list that won't be
    # reordered.
    for repo in list(self.repos):
        for basefile in "a", "b", "c", "d":
            util.ensure_dir(repo.store.parsed_path(basefile))
            # Put files in place: parsed
            parsed_path = "test/files/testrepos/%s/parsed/%s.xhtml" % (repo.alias, basefile)
            shutil.copy2(parsed_path, repo.store.parsed_path(basefile))
            # FIXME: This distilling code is copied from
            # decorators.render -- should perhaps move to a
            # DocumentRepository method like render_xhtml
            distilled_graph = Graph()
            with codecs.open(repo.store.parsed_path(basefile),
                             encoding="utf-8") as fp:  # unicode
                distilled_graph.parse(data=fp.read(), format="rdfa",
                                      publicID=repo.canonical_uri(basefile))
            # avoid unpredictable serialization caused by the RDFa
            # parser binding both dc and dcterms to the same namespace
            distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
            distilled_graph.bind("dcterms", URIRef("http://example.org/this-prefix-should-not-be-used"))
            util.ensure_dir(repo.store.distilled_path(basefile))
            with open(repo.store.distilled_path(basefile),
                      "wb") as distilled_file:
                distilled_graph.serialize(distilled_file, format="pretty-xml")
            # print("#======= %s/%s ========" % (repo.alias, basefile))
            # print(distilled_graph.serialize(format="turtle").decode())
            # finally index all the data into the triplestore/fulltextindex
            repo.relate(basefile, self.repos)