def test_facet_query(self):
    results1 = json.load(open("test/files/datasets/results1.json"))
    results2 = json.load(open("test/files/datasets/results2.json"))
    self.loader.add_serialized(
        util.readfile("test/files/datasets/books.ttl"),
        format="turtle", context="http://example.org/ctx/base")
    self.loader.add_serialized(
        util.readfile("test/files/datasets/articles.ttl"),
        format="turtle", context="http://example.org/ctx/other")
    # Since the query is partially constructed by DocumentRepository, we
    # need to run that code.
    import rdflib
    from ferenda import DocumentRepository
    repo = DocumentRepository()
    repo.config.storetype = self.storetype
    repo.rdf_type = rdflib.URIRef("http://purl.org/ontology/bibo/Book")

    # test 1
    sq = repo.facet_query("http://example.org/ctx/base")
    got = self.store.select(sq, format="python")
    self.assertEqual(len(got), len(results1))
    for row in results1:
        self.assertIn(row, got)

    # test 2
    sq = repo.facet_query("http://example.org/ctx/other")
    got = self.store.select(sq, format="python")
    self.assertEqual(len(got), len(results2))
    for row in results2:
        self.assertIn(row, got)

    if self.storetype == "SLEEPYCAT":
        self.store.graph.close()

def test_parse(self):
    self.repo.download()
    # both A and B can handle this, but B should win
    self.assertTrue(self.repo.parse("1"))
    self.assertEqual("basefile 1, parsed by b",
                     util.readfile(self.repo.store.parsed_path("1")))
    self.assertEqual("basefile 1, metadata from b",
                     util.readfile(self.repo.store.distilled_path("1")))
    self.assertEqual(["attach.txt"],
                     list(self.repo.store.list_attachments("1", "parsed")))
    with self.assertRaises(errors.ParseError):
        self.assertFalse(self.repo.parse("2"))  # none can handle this
    self.assertTrue(self.repo.parse("3"))  # only A can handle this
    self.assertEqual("basefile 3, parsed by a",
                     util.readfile(self.repo.store.parsed_path("3")))
    self.assertEqual("basefile 3, metadata from a",
                     util.readfile(self.repo.store.distilled_path("3")))
    # this repo supports attachments, but underlying repo A did not
    self.assertEqual([],
                     list(self.repo.store.list_attachments("3", "parsed")))
    # in this case, all files should be up-to-date, so no copying
    # should occur (triggering the "Attachments are (likely)
    # up-to-date" branch)
    self.assertTrue(self.repo.parse("1"))
    # and finally, list_basefiles_for("generate") should delegate
    # to DocumentStore.list_basefiles_for
    self.assertEqual(set(["1", "3"]),
                     set(self.repo.store.list_basefiles_for("generate")))

def test_open(self):
    wanted_filename = self.store.path("basefile", "maindir", ".suffix")
    with self.store.open("basefile", "maindir", ".suffix", "w") as fp:
        self.assertNotEqual(fp.name, wanted_filename)
        self.assertEqual(fp.realname, wanted_filename)
        fp.write("This is the data")
    self.assertEqual(util.readfile(wanted_filename), "This is the data")
    mtime = os.stat(wanted_filename).st_mtime

    # make sure that the open method also can be used for reading
    with self.store.open("basefile", "maindir", ".suffix") as fp:
        self.assertEqual("This is the data", fp.read())

    # make sure writing identical content does not actually write
    # a new file
    time.sleep(.1)  # just to get a different mtime
    with self.store.open("basefile", "maindir", ".suffix", "w") as fp:
        fp.write("This is the data")
    self.assertEqual(os.stat(wanted_filename).st_mtime, mtime)

    # make sure normal (non-context-manager) use works as well
    fp = self.store.open("basefile", "maindir", ".suffix", "w")
    fp.write("This is the new data")
    fp.close()
    self.assertEqual(util.readfile(wanted_filename), "This is the new data")

def assertEqualDirs(self, want, got, suffix=None, subset=False,
                    filterdir="entries"):
    """Assert that two directory trees contain identical files.

    :param want: The expected directory tree
    :type want: str
    :param got: The actual directory tree
    :type got: str
    :param suffix: If given, only check files ending in suffix
                   (otherwise check all files)
    :type suffix: str
    :param subset: If True, only require that the files in want are a
                   subset of the files in got (otherwise require that
                   the sets are identical)
    :type subset: bool
    :param filterdir: If given, don't compare the parts of the tree
                      that start with filterdir
    :type filterdir: str
    """
    wantfiles = [x[len(want) + 1:] for x in util.list_dirs(want, suffix)
                 if not x.startswith(want + os.sep + filterdir)]
    gotfiles = [x[len(got) + 1:] for x in util.list_dirs(got, suffix)
                if not x.startswith(got + os.sep + filterdir)]
    self.maxDiff = None
    if subset:
        self.assertTrue(set(wantfiles).issubset(set(gotfiles)))
    else:
        self.assertEqual(wantfiles, gotfiles)  # or assertIn?
    for f in wantfiles:
        if not filecmp.cmp(os.path.join(want, f),
                           os.path.join(got, f),
                           shallow=False):
            self.assertEqual(util.readfile(os.path.join(want, f), mode="rb"),
                             util.readfile(os.path.join(got, f), mode="rb"))

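The helper above is normally invoked from individual test methods. Below is a minimal usage sketch; the test class, the datadir attribute and the fixture paths are illustrative assumptions (not taken from the test suite), and the helper is assumed to be provided by a mixin such as ferenda.testutil.FerendaTestCase.

import unittest

from ferenda.testutil import FerendaTestCase  # assumed location of the helper


class ExampleDirCompare(unittest.TestCase, FerendaTestCase):
    datadir = "/tmp/example-data"  # hypothetical fixture location

    def test_generated_tree(self):
        # compare only .xhtml files, and accept extra files in the
        # generated tree beyond the expected ones (subset=True)
        self.assertEqualDirs("test/files/expected/example",
                             self.datadir + "/example/generated",
                             suffix=".xhtml",
                             subset=True)
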
def test_replace_if_different(self):
    # test 1: dst does not exist
    util.writefile(self.fname, "Hello")
    self.assertTrue(util.replace_if_different(self.fname, self.fname2))
    self.assertFalse(os.path.exists(self.fname))
    self.assertTrue(os.path.exists(self.fname2))

    # test 2: dst exists, but is different (gets overwritten)
    util.writefile(self.fname, "Hello (different)")
    self.assertTrue(util.replace_if_different(self.fname, self.fname2))
    self.assertFalse(os.path.exists(self.fname))
    self.assertEqual("Hello (different)", util.readfile(self.fname2))

    # test 3: src and dst are identical (src gets removed)
    util.writefile(self.fname, "Hello (different)")
    self.assertFalse(util.replace_if_different(self.fname, self.fname2))
    self.assertFalse(os.path.exists(self.fname))

    # test 4: dst exists, is different, gets archived
    newfile = self.dname + "/new.txt"
    archivefile = self.dname + "/archive.txt"
    util.writefile(newfile, "Hello (archiving)")
    self.assertTrue(util.replace_if_different(newfile, self.fname2,
                                              archivefile))
    self.assertFalse(os.path.exists(newfile))
    self.assertEqual("Hello (archiving)", util.readfile(self.fname2))
    self.assertEqual("Hello (different)", util.readfile(archivefile))

def test_get_serialized_file(self):
    want = tempfile.mktemp(suffix=".nt")
    util.writefile(want, util.readfile("test/files/datasets/dataset.nt"))
    got = tempfile.mktemp(suffix=".nt")
    self.loader.add_serialized(
        util.readfile("test/files/datasets/dataset.nt"), format="nt")
    del self.loader
    self.store.get_serialized_file(got, format="nt")
    self.assertEqualGraphs(want, got)

def parse_test(self, downloaded_file, xhtml_file, docroot):
    """This test is run once for each basefile found in
    docroot/downloaded. It performs a full parse, and verifies that
    the resulting XHTML document is equal to the XHTML file placed in
    docroot/parsed/.

    """
    basefile = self.filename_to_basefile(downloaded_file)

    def runtest():
        if "FERENDA_LOG_TEST" in os.environ:
            loglevel = {"DEBUG": logging.DEBUG,
                        "INFO": logging.INFO,
                        "WARNING": logging.WARNING,
                        "ERROR": logging.ERROR,
                        "CRITICAL": logging.CRITICAL}.get(
                            os.environ["FERENDA_LOG_TEST"], logging.INFO)
            logformat = "%(asctime)s %(name)s %(levelname)s %(message)s"
            datefmt = "%H:%M:%S"
            handler = logging.StreamHandler()
            handler.setLevel(loglevel)
            handler.setFormatter(
                logging.Formatter(logformat, datefmt=datefmt))
            logger = logging.getLogger()
            logger.setLevel(loglevel)
            # shut some non-core loggers up
            for logname in ['requests.packages.urllib3.connectionpool',
                            'rdflib.plugins.sleepycat',
                            'rdflib.plugins.parsers.pyRdfa',
                            'ferenda.thirdparty.patch']:
                log = logging.getLogger(logname)
                log.propagate = False
            logger.addHandler(handler)
        self.repo.parse(basefile)
        if "FERENDA_LOG_TEST" in os.environ:
            logger.removeHandler(handler)

    if "FERENDA_PROFILE_TEST" in os.environ:
        print("Profiling test")
        import cProfile
        cProfile.runctx("runtest()", globals(), locals(), sort="cumtime")
    else:
        runtest()
    print = builtins.print
    if 'FERENDA_SET_TESTFILE' in os.environ:
        print("Overwriting '%s' with result of parse ('%s')" %
              (xhtml_file, basefile))
        util.robust_rename(xhtml_file, xhtml_file + "~")
        shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
        return
    self.assertEqualXML(util.readfile(xhtml_file),
                        util.readfile(self.repo.store.parsed_path(basefile)),
                        tidy_xhtml=True)

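The test above is driven by three environment switches (FERENDA_LOG_TEST, FERENDA_PROFILE_TEST and FERENDA_SET_TESTFILE). The following is a hedged sketch of how they might be set when running the suite programmatically; the module name is hypothetical and this driver is not part of the code above.

import os
import unittest

# Regenerate the expected XHTML fixtures from the current parse output,
# with verbose repo logging, instead of asserting equality against them.
os.environ["FERENDA_SET_TESTFILE"] = "true"   # checked by parse_test above
os.environ["FERENDA_LOG_TEST"] = "DEBUG"      # enables the logging handler above
try:
    unittest.main(module="integrationParseTests", exit=False)  # hypothetical module
finally:
    os.environ.pop("FERENDA_SET_TESTFILE", None)
    os.environ.pop("FERENDA_LOG_TEST", None)
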
def test_download(self):
    self.repo.download()
    self.assertEqual("basefile 1, repo a",
                     util.readfile(self.datadir + "/a/downloaded/1.html"))
    self.assertEqual("basefile 1, repo b",
                     util.readfile(self.datadir + "/b/downloaded/1/index.html"))
    self.assertEqual("basefile 2, repo b",
                     util.readfile(self.datadir + "/b/downloaded/2/index.html"))
    self.assertEqual("basefile 3, repo a",
                     util.readfile(self.datadir + "/a/downloaded/3.html"))

def test_sesame_get_serialized(self, mock_get):
    store = TripleStore.connect("SESAME", "", "")
    want = util.readfile("test/files/triplestore/combinedgraph.nt", "rb")
    got = store.get_serialized()
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 1)

    want = util.readfile("test/files/triplestore/namedgraph.nt", "rb")
    got = store.get_serialized(context="namedgraph")  # results in single get
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 2)

def _loadgraph(filename):
    g = rdflib.Graph()
    # we must read the data ourselves; providing a non-ascii
    # filename to Graph.parse fails deep in rdflib internals
    format = guess_format(filename)
    if format == "nt":
        data = util.readfile(filename, "r", encoding="utf-8")
    else:
        data = util.readfile(filename, "rb")
    g.parse(data=data, format=format)
    return g

def test_construct_annotations_rfc(self):
    # print("Not loading, re-using data")
    self.loader.add_serialized(
        util.readfile("test/files/datasets/rfc.nt"),
        format="nt",
        context="http://localhost:8000/dataset/rfc")
    uri = "http://localhost:8000/res/rfc/7066"
    sq = util.readfile(
        "ferenda/sources/tech/res/sparql/rfc-annotations.rq") % {'uri': uri}
    got = self.store.construct(sq)
    want = Graph()
    want.parse(data=util.readfile("test/files/datasets/annotations-rfc.nt"),
               format="nt")
    self.assertEqualGraphs(want, got, exact=True)

def test_construct_annotations_rfc(self):
    # print("Not loading, re-using data")
    self.loader.add_serialized(
        util.readfile("test/files/datasets/rfc.nt"),
        format="nt",
        context="http://localhost:8000/dataset/rfc")
    uri = "http://localhost:8000/res/rfc/7066"
    sq = util.readfile("ferenda/res/sparql/rfc-annotations.rq") % {'uri': uri}
    got = self.store.construct(sq)
    want = Graph()
    want.parse(data=util.readfile("test/files/datasets/annotations-rfc.nt"),
               format="nt")
    self.assertEqualGraphs(want, got, exact=True)

def makeresponse(*args, **kwargs):
    if len(returned) > len(responses):
        raise IndexError("Ran out of canned responses after %s calls" %
                         len(returned))
    resp = Mock()
    resp.status_code = responses[len(returned)][0]
    responsefile = responses[len(returned)][1]
    if responsefile:
        responsefile = "test/files/triplestore/" + responsefile
        resp.content = util.readfile(responsefile, "rb")
        resp.text = util.readfile(responsefile)
        if responsefile.endswith(".json"):
            data = json.loads(util.readfile(responsefile))
            resp.json = Mock(return_value=data)
    returned.append(True)
    return resp

def test_construct(self):
    self.loader.add_serialized(
        util.readfile("test/files/datasets/addressbook.ttl"),
        format="turtle")
    del self.loader
    sq = """PREFIX ab: <http://learningsparql.com/ns/addressbook#>
            PREFIX d: <http://learningsparql.com/ns/data#>

            CONSTRUCT { ?person ?p ?o . }
            WHERE {
                ?person ab:firstName "Craig" ;
                        ab:lastName "Ellis" ;
                        ?p ?o .
            }"""
    want = Graph()
    want.parse(data="""
        @prefix d:<http://learningsparql.com/ns/data#> .
        @prefix ab:<http://learningsparql.com/ns/addressbook#> .

        d:i8301 ab:email "*****@*****.**",
                         "*****@*****.**" ;
                ab:firstName "Craig" ;
                ab:lastName "Ellis" .
        """, format="turtle")
    if self.store.__class__ == FusekiStore:
        got = self.store.construct(sq, uniongraph=False)
    else:
        got = self.store.construct(sq)
    # self.assertTrue(isomorphic(want,got))
    self.assertEqualGraphs(want, got, exact=True)
    if self.store.__class__ == SleepycatStore:
        self.store.graph.close()

def __init__(self, repos, inifile=None, **kwargs):
    super(WSGIApp, self).__init__(repos, inifile, **kwargs)
    sfsrepo = [repo for repo in repos if repo.alias == "sfs"][0]
    self.parser = SwedishCitationParser(
        LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                 LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT),
        sfsrepo.minter,
        sfsrepo.commondata,
        allow_relative=True)
    graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"),
                          format="turtle")
    self.lagforkortningar = [str(o) for s, o in
                             graph.subject_objects(DCTERMS.alternate)]
    self.paragraflag = []
    for s, o in graph.subject_objects(DCTERMS.alternate):
        basefile = sfsrepo.basefile_from_uri(str(s))
        distilledpath = sfsrepo.store.distilled_path(basefile)
        firstpara_uri = str(s) + "#P1"
        needle = '<rpubl:Paragraf rdf:about="%s">' % firstpara_uri
        if (os.path.exists(distilledpath) and
                needle in util.readfile(distilledpath)):
            self.paragraflag.append(str(o).lower())
    self.lagnamn = [str(o) for s, o in graph.subject_objects(RDFS.label)]
    self.lagforkortningar_regex = "|".join(
        sorted(self.lagforkortningar, key=len, reverse=True))

def download_single(self, basefile, url):
    updated = False
    created = False
    filename = self.store.downloaded_path(basefile)
    created = not os.path.exists(filename)
    # util.print_open_fds()
    if self.download_if_needed(url, basefile):
        if created:
            self.log.info("%s: downloaded from %s" % (basefile, url))
        else:
            self.log.info("%s: downloaded new version from %s" %
                          (basefile, url))
        updated = True
    else:
        self.log.debug("%s: exists and is unchanged" % basefile)
    soup = BeautifulSoup(util.readfile(filename), "lxml")
    for pdflink in soup.find_all("a", href=re.compile(r"\.pdf$")):
        slug = "-".join(pdflink["href"].rsplit("/")[-2:])
        attachment_path = self.store.downloaded_path(basefile,
                                                     attachment=slug)
        self.download_if_needed(urljoin(url, pdflink["href"]),
                                basefile, filename=attachment_path)
    vm = soup.find("a", text="Visa Varumärke")
    if vm:
        attachment_path = self.store.downloaded_path(
            basefile, attachment="varumarke.jpg")
        attachment_url = re.search("http[^'\"]*", vm["href"]).group(0)
        self.download_if_needed(attachment_url, basefile,
                                filename=attachment_path)
    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated:
        entry.orig_updated = now
    entry.orig_checked = now
    entry.save()

def test_fuseki_get_serialized(self, mock_get):
    store = TripleStore.connect("FUSEKI", "", "", curl=False)
    # test 1: a namedgraph (cases with no context are already run by
    # test_fuseki_get_serialized_file)
    want = util.readfile("test/files/triplestore/namedgraph.nt", "rb")
    got = store.get_serialized(context="namedgraph")  # results in single get
    self.assertEqual(want, got)

def test_add_serialized(self):
    # test adding to default graph
    self.assertEqual(0, self.store.triple_count())
    self.store.add_serialized(
        util.readfile("test/files/datasets/dataset.nt"),
        format="nt")
    self.assertEqual(7, self.store.triple_count())

def parse(self, doc):
    source = util.readfile(self.store.downloaded_path(doc.basefile))
    html = publish_string(source, writer_name="html")
    soup = BeautifulSoup(html, "lxml")
    docinfo = soup.find("table", "docinfo")
    docuri = URIRef(doc.uri)
    if docinfo:
        # this is where our custom metadata goes
        for row in docinfo.find_all("tr", "field"):
            key, val = row.th.text.strip(), row.td.text.strip()
            if key == 'footer-order:':
                doc.meta.add((docuri, OLO['index'], Literal(int(val))))
            else:
                self.log.warning("%s: Unknown metadata directive %s (%s)" %
                                 (doc.basefile, key, val))
        # we don't need these in the final result
        docinfo.decompose()
    soup.find("h1", "title").decompose()
    doc.body = elements_from_soup(soup.body)
    doc.meta.add((docuri, DCTERMS.title, Literal(soup.title.text, doc.lang)))
    doc.meta.add((docuri, PROV.wasGeneratedBy,
                  Literal(self.qualified_class_name())))
    doc.meta.add((docuri, RDF.type, self.rdf_type))
    self.parse_entry_update(doc)
    return True

def test_download_setfile(self):
    # create an empty.json
    os.mkdir(self.datadir + "/source")
    with open(self.datadir + "/source/empty.json", "w") as fp:
        s = json.dumps({'@settings': {'config': {'refresh': True}}},
                       separators=(', ', ': '))
        fp.write(s)
    os.environ["FERENDA_SET_TESTFILE"] = "true"
    self._runtest()
    del os.environ["FERENDA_SET_TESTFILE"]
    # make sure downloaded files have been placed where they
    # should + empty.json has correct content.
    self.assertTrue(os.path.exists(self.datadir + "/source/empty-0.html"))
    self.assertEqual("<p>This is doc A</p>",
                     util.readfile(self.datadir + "/source/empty-1.html"))
    with open(self.datadir + "/source/empty.json") as fp:
        gotjson = json.load(fp)
    wantjson = copy(self.basicjson)
    wantjson['http://example.org/']['file'] = "empty-0.html"
    wantjson['http://example.org/doc/a_.html']['file'] = "empty-1.html"
    self.assertEqual(wantjson, gotjson)

def parse(self, doc):
    head, body = util.readfile(
        self.store.downloaded_path(doc.basefile)).split("\n\n", 1)
    datestr, timestr, title = head.split(" ", 2)
    published = datetime.strptime("%s %s" % (datestr, timestr),
                                  "%Y-%m-%d %H:%M:%S")
    doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
    doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
    doc.meta.add((URIRef(doc.uri), DCTERMS.title,
                  Literal(title, lang=doc.lang)))
    soup = bs4.BeautifulSoup("<div class='sitenews-item'>" + body + "</div>",
                             "lxml")
    doc.body = elements_from_soup(soup.body)
    # move timestamp into dcterms:issued, title into dcterms:title
    # parse body with elements_from_soup
    # set first real para as dcterms:abstract (XMLLiteral)
    doc.body[0][0] = Div([doc.body[0][0]],
                         datatype="rdf:XMLLiteral",
                         property="dcterms:abstract")
    # but we need to add it to doc.meta RIGHT AWAY because of reasons...
    doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                  Literal(body.split("\n\n")[0],
                          datatype=RDF.XMLLiteral)))
    self.parse_entry_update(doc)  # need to set published and possibly updated
    entry = DocumentEntry(self.store.documententry_path(doc.basefile))
    entry.published = published
    entry.save()
    return True

def parametric_test(self, filename):
    # these options adjust the constructed URIs. By default, the
    # official rpubl URIs are minted.
    #
    # self.repo.config.localizeuri = True
    # self.repo.config.url = "http://example.org/"
    # self.repo.config.urlpath = ''

    # a few of the subclasses have specialized rules. Make sure we
    # instantiate the correct class.
    repo = os.path.basename(filename).split("-")[0]
    basefile = os.path.splitext(os.path.basename(filename))[0].replace(
        "-", "/", 1).replace("-", ":")
    repoclass = self.aliases[repo]
    self.repo = repoclass(datadir=self.datadir,
                          storelocation=self.datadir + "/ferenda.sqlite",
                          indexlocation=self.datadir + "/whoosh")
    doc = self.repo.make_document(basefile)
    text = self.repo.sanitize_text(util.readfile(filename), basefile)
    reader = TextReader(string=text, encoding='utf-8')
    self.repo.parse_metadata_from_textreader(reader, doc)
    wantfile = filename.replace(".txt", ".n3")
    if os.path.exists(wantfile):
        self.assertEqualGraphs(wantfile, doc.meta, exact=False)
    else:
        self.fail("Expected a %s with the following content:\n\n%s" %
                  (wantfile, doc.meta.serialize(format="n3").decode("utf-8")))

def test_transform_html(self):
    base = self.datadir + os.sep
    with open(base + "teststyle.xslt", "w") as fp:
        fp.write("""<xsl:stylesheet version="1.0"
      xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:param name="value"/>
  <xsl:param name="file"/>
  <xsl:variable name="content" select="document($file)/root/*"/>
  <xsl:template match="/">
    <output>
      <paramvalue><xsl:value-of select="$value"/></paramvalue>
      <paramfile><xsl:copy-of select="$content"/></paramfile>
      <infile><xsl:value-of select="/doc/title"/></infile>
    </output>
  </xsl:template>
</xsl:stylesheet>
""")
    with open(base + "paramfile.xml", "w") as fp:
        fp.write("""<root><node key='value'><subnode>textnode</subnode></node></root>""")
    with open(base + "infile.xml", "w") as fp:
        fp.write("""<doc><title>Document title</title></doc>""")
    t = Transformer("XSLT", base + "teststyle.xslt", ["res/xsl"], "")
    t.transform_file(base + "infile.xml", base + "outfile.xml",
                     {'value': 'blahonga',
                      'file': base + 'paramfile.xml'})
    self.assertEqualXML(util.readfile(base + "outfile.xml"), """
<output>
  <paramvalue>blahonga</paramvalue>
  <paramfile><node key='value'><subnode>textnode</subnode></node></paramfile>
  <infile>Document title</infile>
</output>""")

def parametric_test(self, filename):
    # these options adjust the constructed URIs. By default, the
    # official rpubl URIs are minted.
    #
    # self.repo.config.localizeuri = True
    # self.repo.config.url = "http://example.org/"
    # self.repo.config.urlpath = ''

    # a few of the subclasses have specialized rules. Make sure we
    # instantiate the correct class.
    repo, basefile = self.parse_filename(filename)
    doc = repo.make_document(basefile)
    text = repo.sanitize_text(util.readfile(filename), basefile)
    reader = TextReader(string=text, encoding='utf-8')
    props = repo.extract_metadata(reader, basefile)
    props = repo.sanitize_metadata(props, basefile)
    resource = repo.polish_metadata(props, basefile)
    repo.infer_metadata(resource, basefile)
    wantfile = filename.replace(".txt", ".n3")
    if os.path.exists(wantfile):
        self.assertEqualGraphs(wantfile, resource.graph, exact=False)
    else:
        self.fail("Expected a %s with the following content:\n\n%s" %
                  (wantfile, doc.meta.serialize(format="n3").decode("utf-8")))

def extract_metadata(self, rawhead, basefile):
    metadata = util.readfile(self.store.downloaded_path(
        basefile, attachment="index.rdf"))
    # For some reason these RDF files might use canonical
    # decomposition form (NFD) which is less optimal. Fix this.
    metadata = unicodedata.normalize("NFC", metadata)
    sourcegraph = Graph().parse(data=metadata)
    rooturi = sourcegraph.value(predicate=RDF.type, object=BIBO.Book)
    if rooturi is None:
        # then just try to identify the main uri and use that
        subjects = set(sourcegraph.subjects())
        if len(subjects) == 1:
            rooturi = next(iter(subjects))
    title = sourcegraph.value(subject=rooturi, predicate=DC.title)
    issued = sourcegraph.value(subject=rooturi, predicate=DC.date)
    if isinstance(issued, str):
        # sometimes dc:date is weird like "1976[1974]" (SOU 1974:42)
        if len(issued) != 4:
            self.log.warning("expected issued date as single 4-digit year, "
                             "got %s" % issued)
            # fall back on an approximation based on the basefile
            issued = basefile.split(":")[0]
        issued = Literal(util.gYear(int(issued)), datatype=XSD.gYear)
    attribs = self.metadata_from_basefile(basefile)
    attribs["dcterms:title"] = title
    if issued:
        attribs["dcterms:issued"] = issued
    return attribs

def parse_test(self, downloaded_file, xhtml_file, docroot):
    # patch method so we control where the downloaded doc is
    # loaded from.
    basefile = self.filename_to_basefile(downloaded_file)
    # with patch('ferenda.DocumentStore.downloaded_path',
    #            return_value=downloaded_file):
    with patch.object(self.repo.documentstore_class, 'downloaded_path',
                      return_value=downloaded_file):
        self.repo.parse(basefile)
    if 'FERENDA_SET_TESTFILES' in os.environ:
        print("Overwriting %r with result of parse (%r)" %
              (xhtml_file, basefile))
        util.robust_rename(xhtml_file, xhtml_file + "~")
        shutil.copy2(self.repo.store.parsed_path(basefile), xhtml_file)
        return
    self.assertEqualXML(util.readfile(xhtml_file),
                        util.readfile(self.repo.store.parsed_path(basefile)))

def test_doctype(self):
    base = self.datadir + os.sep
    util.ensure_dir(base + "teststyle-doctype.xslt")
    with open(base + "teststyle-doctype.xslt", "w") as fp:
        fp.write("""<xsl:stylesheet version="1.0"
      xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="html"
              doctype-system="about:legacy-compat"
              omit-xml-declaration="yes"
              encoding='utf-8'
              indent="yes"/>
  <xsl:template match="/">
    <html>
      <head>
        <title><xsl:value-of select="/doc/title"/></title>
      </head>
      <body>
        <h1>hello world</h1>
      </body>
    </html>
  </xsl:template>
</xsl:stylesheet>
""")
    with open(base + "infile.xml", "w") as fp:
        fp.write("""<doc><title>Document title</title></doc>""")
    t = Transformer("XSLT", base + "teststyle-doctype.xslt", "xsl", None, "")
    t.transform_file(base + "infile.xml", base + "outfile.xml")
    self.assertTrue(util.readfile(base + "outfile.xml").startswith(
        '<!DOCTYPE html SYSTEM "about:legacy-compat">'))

def download_is_different(self, existing, new):
    if existing.endswith(".html"):
        # load both existing and new into BeautifulSoup objects, then
        # compare the search-results-content divs
        existing_soup = BeautifulSoup(
            util.readfile(existing, encoding=self.source_encoding), "lxml")
        new_soup = BeautifulSoup(
            util.readfile(new, encoding=self.source_encoding), "lxml")
        existing = existing_soup.find("div", "search-results-content")
        new = new_soup.find("div", "search-results-content")
        try:
            return existing != new
        except RuntimeError:
            # can happen with at least v4.4.1 of beautifulsoup
            return True
    else:
        return super(Trips, self).download_is_different(existing, new)

def _loadgraph(filename):
    g = rdflib.Graph()
    # we must read the data ourselves; providing a non-ascii
    # filename to Graph.parse fails deep in rdflib internals
    g.parse(data=util.readfile(filename, "rb"),
            format=guess_format(filename))
    return g

def distill_test(self, downloaded_file, rdf_file, docroot):
    try:
        prefixlen = len(docroot + "/downloaded/")
        if self.repo.storage_policy == "dir":
            suffixlen = len(downloaded_file.split(os.sep)[-1]) + 1
        else:
            suffixlen = len(os.path.splitext(downloaded_file)[1])
        pathfrag = downloaded_file[prefixlen:-suffixlen]
        basefile = self.repo.store.pathfrag_to_basefile(pathfrag)
    except:
        basefile = self.filename_to_basefile(downloaded_file)
    with patch.object(self.repo.documentstore_class, 'downloaded_path',
                      return_value=downloaded_file):
        # self.repo.config.fsmdebug = True
        self.repo.parse(basefile)
    if 'FERENDA_SET_TESTFILES' in os.environ:
        print("Overwriting %r with result of parse (%r)" %
              (rdf_file, basefile))
        g = rdflib.Graph()
        g.parse(data=util.readfile(self.repo.store.distilled_path(basefile)))
        util.robust_rename(rdf_file, rdf_file + "~")
        with open(rdf_file, "wb") as fp:
            fp.write(g.serialize(format="turtle"))
        return
    self.assertEqualGraphs(rdf_file,
                           self.repo.store.distilled_path(basefile),
                           exact=False)

def test_rdf(self):
    # basic test 3: accept: application/rdf+xml -> RDF statements (in XML)
    self.env['HTTP_ACCEPT'] = 'application/rdf+xml'
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK",
                        {'Content-Type': 'application/rdf+xml'},
                        util.readfile(self.repo.store.distilled_path("123/a"),
                                      "rb"),
                        status, headers, content)

def ttl_to_rdf_xml(self, inpath, outpath, store=None):
    if not store:
        store = self.repo.store
    g = Graph()
    g.parse(data=util.readfile(inpath, encoding="utf-8"), format="turtle")
    with _open(outpath, "wb") as fp:
        fp.write(g.serialize(format="pretty-xml"))
    return g

def _test_pyfile(self, pyfile, want=True, comparator=None):
    pycode = compile(util.readfile(pyfile), pyfile, "exec")
    result = six.exec_(pycode, globals(), locals())
    # the exec:ed code is expected to set return_value
    got = locals()["return_value"]
    if not comparator:
        comparator = self.assertEqual
    comparator(want, got)

def test_add_serialized_named_graph(self):
    self.test_add_serialized()  # set up environment for this case
    self.store.add_serialized(
        util.readfile("test/files/datasets/dataset2.nt"),
        format="nt",
        context="http://example.org/ctx1")
    self.assertEqual(3, self.store.triple_count(
        context="http://example.org/ctx1"))
    self.assertEqual(10, self.store.triple_count())

def test_xhtml(self):
    # basic test 2: accept: application/xhtml+xml -> parsed file
    self.env['HTTP_ACCEPT'] = 'application/xhtml+xml'
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK",
                        {'Content-Type': 'application/xhtml+xml'},
                        util.readfile(self.repo.store.parsed_path("123/a"),
                                      "rb"),
                        status, headers, content)

def transform(self, indata, config=None, parameters={}):
    strparams = {}
    if config:
        # paths to be used with the document() function
        # must use unix path separators
        if os.sep == "\\":
            config = config.replace(os.sep, "/")
        # print("Transform: Using config %s. Contents:" % config)
        # print(util.readfile(config))
        config_fullpath = os.path.abspath(config)
        strparams['configurationfile'] = XSLT.strparam(config_fullpath)
    removefiles = []
    for key, value in parameters.items():
        if key.endswith("file") and value:
            if all(ord(c) < 128 and c != " " for c in value):
                # If the file name contains ONLY ascii chars and no
                # spaces, we can use it directly. However, we need to
                # relativize the path of the file relative to the XSL
                # file we'll be using. The mechanism could be clearer...
                value = os.path.relpath(value, self.templdir)
            else:
                # If the filename contains non-ascii characters or
                # spaces, any attempt to eg
                # "document($annotationfile)" in the XSLT document
                # will silently fail. Seriously, f**k lxml's error
                # handling. In this case, copy it to a temp file (in
                # the temporary templdir, with an ascii filename) and
                # use that.
                contents = util.readfile(value)
                value = os.path.basename(value)
                value = "".join(c for c in value if ord(c) < 128 and c != " ")
                removefiles.append(self.templdir + os.sep + value)
                util.writefile(self.templdir + os.sep + value, contents)
            if os.sep == "\\":
                value = value.replace(os.sep, "/")
        strparams[key] = XSLT.strparam(value)
    try:
        return self._transformer(indata, **strparams)
    except etree.XSLTApplyError as e:
        # the exception will only contain the last error. Errors
        # emanating from the xhtml file will not have file/line
        # number information. Errors emanating from the xslt file
        # do have file/line number info, and are probably more
        # useful to deal with.
        for error in self._transformer.error_log:
            if error.line:
                log.error("%s: %s (line %s)" %
                          (error.filename, error.message, error.line))
        raise errors.TransformError(str(e))
    finally:
        for f in removefiles:
            util.robust_remove(f)
    # FIXME: This can never be reached; if _transformer() does not
    # raise an error, the above returns immediately.
    if len(self._transformer.error_log) > 0:
        raise errors.TransformError(str(self._transformer.error_log))

def _checksum(self, filename):
    """The MD5 checksum for the given file."""
    import hashlib
    c = hashlib.md5()
    try:
        c.update(util.readfile(filename, encoding=self.source_encoding))
    except:
        self.log.warning("Could not extract plaintext from %s" % filename)
    return c.hexdigest()

def test_save(self):
    path = self.repo.store.documententry_path("123/a")
    d = DocumentEntry()
    d.orig_checked = datetime(2013, 3, 27, 20, 46, 37)
    d.orig_url = 'http://source.example.org/doc/123/a'
    d.save(path=path)

    self.maxDiff = None
    self.assertEqual(self.d2u(util.readfile(path)), self.basic_json)

def test_basic(self):
    # basic test 1: accept: text/html -> generated file
    # Note that our Accept header has a more complicated value,
    # typical of a real-life browser
    status, headers, content = self.call_wsgi(self.env)
    self.assertResponse("200 OK",
                        {'Content-Type': 'text/html; charset=utf-8'},
                        util.readfile(self.repo.store.generated_path("123/a"),
                                      "rb"),
                        status, headers, content)

def test_longdesc(self):
    # test 2: Same, but with a multi-line desc
    dconf = self.globalconf.base
    dconf.download_text = b"This is a file.\nIt has been downloaded.\n"
    repo = MockRepo(datadir=self.datadir)
    with repo.store.open_downloaded(self.basefile, "wb") as fp:
        fp.write(b"This is a file.\nIt has been patched.\n")
    longdesc = "A longer comment\nspanning\nseveral lines"
    patchpath = self.d.mkpatch("base", self.basefile, longdesc)
    self.assertTrue(patchpath)
    patchcontent = util.readfile(patchpath)
    desccontent = util.readfile(patchpath.replace(".patch", ".desc"))
    self.assertEqual(longdesc, desccontent)
    self.assertFalse("A longer comment" in patchcontent)
    self.assertIn("@@ -1,2 +1,2 @@", patchcontent)
    self.assertIn("-It has been downloaded.", patchcontent)
    self.assertIn("+It has been patched.", patchcontent)