def downloaded_to_intermediate(self, basefile):
    # Check to see if this might not be a proper SFS at all
    # (from time to time, other agencies publish their stuff
    # in SFS - this seems to be handled by giving those
    # documents an SFS number of the form "N1992:31"). Filter
    # these out.
    if basefile.startswith('N'):
        raise IckeSFS("%s is not a regular SFS" % basefile)
    filename = self.store.downloaded_path(basefile)
    try:
        t = TextReader(filename, encoding=self.source_encoding)
    except IOError:
        self.log.warning("%s: Fulltext is missing" % basefile)
        # FIXME: This code needs to be rewritten
        baseuri = self.canonical_uri(basefile)
        if baseuri in registry:
            title = registry[baseuri].value(URIRef(baseuri),
                                            self.ns['dcterms'].title)
            desc.value(self.ns['dcterms'].title, title)
        desc.rel(self.ns['dcterms'].publisher,
                 self.lookup_resource("Regeringskansliet"))
        desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
        doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')])
    # Check to see if the Författning has been revoked (using
    # plain fast string searching, no fancy HTML parsing and
    # traversing)
    if not self.config.keepexpired:
        try:
            t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
            datestr = t.readto('</i></b>')
            if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                self.log.debug('%s: Expired' % basefile)
                raise UpphavdForfattning(
                    "%s is an expired SFS" % basefile,
                    dummyfile=self.store.parsed_path(basefile))
            t.seek(0)
        except IOError:
            t.seek(0)
    t.cuepast('<pre>')
    # remove ä et al
    try:
        # this is the preferred way from py34 onwards. FIXME: Move
        # this to ferenda.compat
        import html
        txt = html.unescape(t.readto('</pre>'))
    except ImportError:
        # this is the old way.
        hp = HTMLParser()
        txt = hp.unescape(t.readto('</pre>'))
    if '\r\n' not in txt:
        txt = txt.replace('\n', '\r\n')
    re_tags = re.compile(r"</?\w{1,3}>")
    txt = re_tags.sub('', txt)
    # an ending CRLF aids in producing better diffs
    txt += "\r\n"
    util.writefile(self.store.intermediate_path(basefile), txt,
                   encoding=self.source_encoding)
    return codecs.open(self.store.intermediate_path(basefile),
                       encoding=self.source_encoding)
def test_depth(self):
    xsltfile = self.datadir + os.sep + "notused.xslt"
    util.writefile(xsltfile, '<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"/>')
    t = Transformer("XSLT", xsltfile, "xsl", None, "data")
    self.assertEqual(0, t._depth("data", "data/index.html"))
    self.assertEqual(1, t._depth("data/repo", "data/index.html"))
    self.assertEqual(3, t._depth("data/repo/toc/title", "data/index.html"))
def test_depth(self):
    xsltfile = self.datadir + os.sep + "notused.xslt"
    util.writefile(xsltfile, '<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"/>')
    t = Transformer("XSLT", xsltfile, ["res/xsl"], "data")
    self.assertEqual(0, t._depth("data", "data/index.html"))
    self.assertEqual(1, t._depth("data/repo", "data/index.html"))
    self.assertEqual(3, t._depth("data/repo/toc/title", "data/index.html"))
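# The two test variants above pin down the same contract for
# Transformer._depth across two constructor signatures. A hypothetical
# reimplementation of that contract (not ferenda's actual code): the
# depth is the number of directory levels from the directory holding
# outfile down to destdir.
import os.path

def depth(destdir, outfile):
    relpath = os.path.relpath(destdir, os.path.dirname(outfile))
    return 0 if relpath == "." else len(relpath.split(os.sep))

assert depth("data", "data/index.html") == 0
assert depth("data/repo", "data/index.html") == 1
assert depth("data/repo/toc/title", "data/index.html") == 3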
def _make_files(self, option, filedir, combinefile=None, combinefunc=None):
    urls = []
    buf = BytesIO()
    processed = set()
    # eg. self.config.cssfiles
    if getattr(self.config, option):
        # it's possible to set eg cssfiles=None when creating the
        # Resources object
        for f in getattr(self.config, option):
            urls.append(self._process_file(f, buf, filedir, "ferenda.ini"))
            processed.add(f)
    for repo in self.repos:
        # FIXME: create a more generic way of optionally
        # signalling to a repo that "Hey, now it's time to create
        # your resources if you can"
        if repo.__class__.__name__ == "SFS" and option == "imgfiles":
            self.log.info("calling into SFS._makeimages()")
            LayeredConfig.set(repo.config, 'imgfiles', repo._makeimages())
        for f in getattr(repo.config, option):
            if f in processed:
                continue
            urls.append(self._process_file(f, buf, filedir, repo.alias))
            processed.add(f)
    urls = list(filter(None, urls))
    if combinefile:
        txt = buf.getvalue().decode('utf-8')
        util.writefile(combinefile, combinefunc(txt))
        return [self._filepath_to_urlpath(combinefile, 2)]
    else:
        return urls
def make_resources_xml(self, cssfiles, jsfiles):
    E = ElementMaker()  # namespace=None, nsmap={None: ...}
    root = E.configuration(
        E.sitename(self.config.sitename),
        E.sitedescription(self.config.sitedescription),
        E.url(self.config.url),
        E.tabs(*self._links('tabs')),
        E.footerlinks(*self._links('footer')),
        E.stylesheets(*self._li_wrap(cssfiles, 'link', 'href', rel="stylesheet")),
        E.javascripts(*self._li_wrap(jsfiles, 'script', 'src', text=" "))
    )
    if not self.config.staticsite:
        root.append(
            E.search(
                E.endpoint(self.config.searchendpoint)
            )
        )
    outfile = self.resourcedir + os.sep + "resources.xml"
    util.writefile(
        outfile,
        etree.tostring(root, encoding="utf-8",
                       pretty_print=True).decode("utf-8"))
    self.log.info("Wrote %s" % outfile)
    return [self._filepath_to_urlpath(outfile, 1)]
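# A minimal standalone sketch of the lxml ElementMaker pattern used in
# make_resources_xml above; the element names mirror the snippet, the
# values are made up.
from lxml import etree
from lxml.builder import ElementMaker

E = ElementMaker()
root = E.configuration(
    E.sitename("Example site"),
    E.stylesheets(E.link(rel="stylesheet", href="rsrc/css/combined.css")),
    E.javascripts(E.script("", src="rsrc/js/combined.js")),
)
print(etree.tostring(root, pretty_print=True).decode("utf-8"))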
def download_single(self, basefile, url=None):
    if url is None:
        result = self.query_webservice("DN = %s" % basefile, page=1)
        result.raise_for_status()
        tree = etree.parse(BytesIO(result.content))
        results = tree.findall(".//{http://eur-lex.europa.eu/search}result")
        assert len(results) == 1
        result = results[0]
        cellarid = result.find(".//{http://eur-lex.europa.eu/search}reference").text
        cellarid = re.split("[:_]", cellarid)[2]
        celex = result.find(".//{http://eur-lex.europa.eu/search}ID_CELEX")[0].text
        match = self.celexfilter(celex)
        assert match
        celex = match.group(1)
        assert celex == basefile
        lang, filetype, mimetype, url = self.find_manifestation(cellarid, celex)
        # FIXME: This is an ugly way of making sure the downloaded
        # file gets the right suffix (due to
        # DocumentStore.downloaded_path choosing a filename from among
        # several possible suffixes based on what file already exists)
        downloaded_path = self.store.path(basefile, 'downloaded', '.' + filetype)
        if not os.path.exists(downloaded_path):
            util.writefile(downloaded_path, "")
    return super(EURLex, self).download_single(basefile, url)
def test_write_atom_inline(self):
    self.repo.faceted_data = Mock(return_value=self.faceted_data)
    for basefile in range(25):
        de = DocumentEntry(
            self.repo.store.documententry_path(str(basefile)))
        util.writefile(self.repo.store.parsed_path(str(basefile)),
                       "<html><p>Document #%s</p></html>" % basefile)
        de.set_content(self.repo.store.parsed_path(str(basefile)),
                       self.repo.canonical_uri(str(basefile)),
                       inline=True)
        de.save()
    unsorted_entries = self.repo.news_facet_entries()
    entries = sorted(list(unsorted_entries),
                     key=itemgetter('updated'), reverse=True)
    self.repo.news_write_atom(entries, 'New and updated documents',
                              'main', archivesize=6)
    tree = etree.parse('%s/base/feed/main.atom' % self.datadir)
    NS = "{http://www.w3.org/2005/Atom}"
    content = tree.find(".//" + NS + "content")
    self.assertIsNotNone(content)
    self.assertIsNone(content.get("src"))
    self.assertIsNone(content.get("hash"))
    self.assertEqual(content.get("type"), "xhtml")
    self.assertEqualXML(
        etree.tostring(content[0]),
        '<html xmlns="http://www.w3.org/2005/Atom" xmlns:le="http://purl.org/atompub/link-extensions/1.0"><p>Document #24</p></html>'
    )
def download_single(self, basefile, url=None):
    if not url:
        entry = DocumentEntry(self.store.documententry_path(basefile))
        url = entry.orig_url
    xml_downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
    if self.get_parse_options(basefile) == "metadataonly":
        # in these cases, to save space, get the smaller XML OCR
        # data, not the actual scanned images-in-PDF
        url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
        # make store.downloaded_path return .xml suffixes (and set
        # the timestamp to the beginning of the epoch so that the
        # resulting if-modified-since header doesn't contain the
        # current date/time)
        if not os.path.exists(xml_downloaded_path):
            util.writefile(xml_downloaded_path, "")
            os.utime(xml_downloaded_path, (0, 0))
    else:
        # if parse options have changed from metadataonly to
        # default, there will be an xml file lying about which will
        # make downloaded_path return its name. Remove it so that
        # we don't end up with pdf files that have a .xml
        # extension.
        if os.path.exists(xml_downloaded_path):
            os.unlink(xml_downloaded_path)
    return super(PropKB, self).download_single(basefile, url)
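# Sketch of the placeholder trick used above, stdlib only: write an
# empty file and reset its mtime to the epoch, so that a later
# conditional GET sends If-Modified-Since: 1970-01-01 rather than the
# current date/time.
import os

def touch_placeholder(path):
    with open(path, "w") as fp:
        fp.write("")
    os.utime(path, (0, 0))  # atime, mtime = epoch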
def test_run_makeresources(self):
    # 1. setup test_run_enable
    # 2. run('all', 'makeresources')
    # 3. verify that all css/js files specified by default and in
    #    Testrepo get copied (remove rsrc)
    # 4. run('all', 'makeresources', '--combine')
    # 5. verify that a single css and js file is created
    self._enable_repos()
    s = os.sep
    want = {'css': [s.join(['rsrc', 'css', 'test.css']),
                    s.join(['rsrc', 'css', 'other.css'])],
            'js': [s.join(['rsrc', 'js', 'test.js'])],
            'xml': [s.join(['rsrc', 'resources.xml'])]
            }
    got = manager.run(['all', 'makeresources'])
    self.assertEqual(want, got)
    # 6. alter the ferenda.ini so that it doesn't specify any css/js files
    util.writefile("ferenda.ini", """[__root__]
loglevel=WARNING
datadir = %s
url = http://localhost:8000
searchendpoint = /search/
apiendpoint = /api/
""" % self.tempdir)
    want = {'css': [],
            'js': [],
            'xml': [s.join(['rsrc', 'resources.xml'])]
            }
    got = manager.run(['all', 'makeresources'])
    self.assertEqual(want, got)
def _make_files(self, option, filedir, combinefile=None, combinefunc=None):
    urls = []
    buf = BytesIO()
    processed = set()
    # eg. self.config.cssfiles
    if getattr(self.config, option):
        # it's possible to set eg cssfiles=None when creating the
        # Resources object
        for f in getattr(self.config, option):
            urls.append(self._process_file(f, buf, filedir, "ferenda.ini"))
            processed.add(f)
    for repo in self.repos:
        # FIXME: create a more generic way of optionally
        # signalling to a repo that "Hey, now it's time to create
        # your resources if you can"
        if repo.__class__.__name__ == "SFS" and option == "imgfiles":
            self.log.info("calling into SFS._makeimages()")
            LayeredConfig.set(repo.config, 'imgfiles', repo._makeimages())
        if hasattr(repo.config, option):
            for f in getattr(repo.config, option):
                if f in processed:
                    continue
                urls.append(self._process_file(f, buf, filedir, repo.alias))
                processed.add(f)
    urls = list(filter(None, urls))
    if combinefile:
        txt = buf.getvalue().decode('utf-8')
        util.writefile(combinefile, combinefunc(txt))
        return [self._filepath_to_urlpath(combinefile, 2)]
    else:
        return urls
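# A simplified sketch (not Resources' actual API) of the combine branch
# above: every source file is appended to one shared buffer, which is
# then post-processed (eg minified) and written out once.
from io import BytesIO

def combine(sources, combinefile, combinefunc=lambda txt: txt):
    buf = BytesIO()
    for src in sources:
        with open(src, "rb") as fp:
            buf.write(fp.read())
    with open(combinefile, "w", encoding="utf-8") as fp:
        fp.write(combinefunc(buf.getvalue().decode("utf-8")))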
def setUp(self):
    self.maxDiff = None
    self.tempdir = tempfile.mkdtemp()
    # FIXME: this creates (and tearDown deletes) a file in
    # cwd. Should be placed in self.tempdir, but tests need to be
    # adjusted to find it there.
    # NB: The section keys are different from the specified
    # classes' alias properties. This is intended.
    staticmockclass.resourcebase = self.tempdir
    util.writefile("ferenda.ini", """[__root__]
datadir = %s
loglevel = CRITICAL
[test]
class=testManager.staticmockclass
[test2]
class=testManager.staticmockclass2
""" % self.tempdir)
    util.writefile(self.tempdir + "/test.js", "// test.js code goes here")
    util.writefile(self.tempdir + "/test.css", "/* test.css code goes here */")
    util.writefile(self.tempdir + "/test.png", "\x89\x50\x4e\x47\x0d\x0a\x1a\x0a PNG data goes here")
    util.writefile(self.tempdir + "/transformed.scss", "a { color: red + green; }")
def downloaded_to_intermediate(self, basefile, attachment=None):
    # Check to see if this might not be a proper SFS at all
    # (from time to time, other agencies publish their stuff
    # in SFS - this seems to be handled by giving those
    # documents an SFS number of the form "N1992:31"). Filter
    # these out.
    if basefile.startswith('N'):
        raise IckeSFS("%s is not a regular SFS" % basefile)
    filename = self.store.downloaded_path(basefile)
    try:
        t = TextReader(filename, encoding=self.source_encoding)
    except IOError:
        self.log.warning("%s: Fulltext is missing" % basefile)
        # FIXME: This code needs to be rewritten
        baseuri = self.canonical_uri(basefile)
        if baseuri in registry:
            title = registry[baseuri].value(URIRef(baseuri),
                                            self.ns['dcterms'].title)
            desc.value(self.ns['dcterms'].title, title)
        desc.rel(self.ns['dcterms'].publisher,
                 self.lookup_resource("Regeringskansliet"))
        desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
        doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')])
    # Check to see if the Författning has been revoked (using
    # plain fast string searching, no fancy HTML parsing and
    # traversing)
    if not self.config.keepexpired:
        try:
            t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
            datestr = t.readto('</i></b>')
            if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                self.log.debug('%s: Expired' % basefile)
                raise UpphavdForfattning(
                    "%s is an expired SFS" % basefile,
                    dummyfile=self.store.parsed_path(basefile))
            t.seek(0)
        except IOError:
            t.seek(0)
    t.cuepast('<pre>')
    # remove ä et al
    try:
        # this is the preferred way from py34 onwards. FIXME: Move
        # this to ferenda.compat
        import html
        txt = html.unescape(t.readto('</pre>'))
    except ImportError:
        # this is the old way.
        hp = HTMLParser()
        txt = hp.unescape(t.readto('</pre>'))
    if '\r\n' not in txt:
        txt = txt.replace('\n', '\r\n')
    re_tags = re.compile(r"</?\w{1,3}>")
    txt = re_tags.sub('', txt)
    # an ending CRLF aids in producing better diffs
    txt += "\r\n"
    util.writefile(self.store.intermediate_path(basefile), txt,
                   encoding=self.source_encoding)
    return codecs.open(self.store.intermediate_path(basefile),
                       encoding=self.source_encoding)
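# A minimal sketch of the py2/py3 compat pattern both versions of
# downloaded_to_intermediate rely on: prefer html.unescape() (stdlib
# from Python 3.4), fall back to HTMLParser's unescape() on older
# interpreters.
try:
    from html import unescape  # py34 onwards
except ImportError:
    from HTMLParser import HTMLParser  # py2 fallback
    unescape = HTMLParser().unescape

print(unescape("&auml; &amp; &ouml;"))  # -> "ä & ö"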
def parse(self, doc):
    doc.uri = self.canonical_uri(doc.basefile)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    self.infer_triples(d, doc.basefile)
    # prefer PDF or Word files over the plaintext-containing HTML files
    # FIXME: PDF or Word files are now stored as attachments
    pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')
    wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                 self.generic_path(doc.basefile, 'downloaded', '.docx'),
                 self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                 self.generic_path(doc.basefile, 'downloaded', '.rtf'))
    wordfile = None
    for f in wordfiles:
        if os.path.exists(f):
            wordfile = f
    # if we lack a .pdf file, use Open/LibreOffice to convert any
    # .wpd or .doc file to .pdf first
    if wordfile and not os.path.exists(pdffile):
        intermediate_pdf = self.generic_path(
            doc.basefile, "intermediate", ".pdf")
        if not os.path.exists(intermediate_pdf):
            cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                self.config.get('soffice', 'soffice'),
                os.path.dirname(intermediate_pdf),
                wordfile)
            self.log.debug("%s: Converting to PDF: %s" % (doc.basefile, cmdline))
            (ret, stdout, stderr) = util.runcmd(cmdline, require_success=True)
        pdffile = intermediate_pdf
    if os.path.exists(pdffile):
        self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
        intermediate_dir = os.path.dirname(
            self.generic_path(doc.basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
    else:
        downloaded_path = self.downloaded_path(doc.basefile)
        intermediate_path = self.generic_path(
            doc.basefile, 'intermediate', '.txt')
        self.log.debug("%s: Using %s (%s)" %
                       (doc.basefile, downloaded_path, intermediate_path))
        if not os.path.exists(intermediate_path):
            html = codecs.open(downloaded_path, encoding="iso-8859-1").read()
            util.writefile(intermediate_path,
                           util.extract_text(html, '<pre>', '</pre>'),
                           encoding="utf-8")
        textreader = TextReader(intermediate_path, encoding="utf-8")
        self.parse_from_textreader(textreader, doc)
def transform(self, indata, config=None, parameters={}):
    strparams = {}
    if config:
        # paths to be used with the document() function
        # must use unix path separators
        if os.sep == "\\":
            config = config.replace(os.sep, "/")
        # print("Transform: Using config %s. Contents:" % config)
        # print(util.readfile(config))
        config_fullpath = os.path.abspath(config)
        strparams['configurationfile'] = XSLT.strparam(config_fullpath)
    removefiles = []
    for key, value in parameters.items():
        if key.endswith("file") and value:
            if all(ord(c) < 128 and c != " " for c in value):
                # If the file name contains ONLY ascii chars and
                # no spaces, we can use it directly. However, we
                # need to relativize the path of the file relative
                # to the XSL file we'll be using. The mechanism
                # could be clearer...
                value = os.path.relpath(value, self.templdir)
            else:
                # If the filename contains non-ascii characters or
                # spaces, any attempt to eg
                # "document($annotationfile)" in the XSLT document
                # will silently fail. Seriously, f**k lxml's error
                # handling. In this case, copy it to a temp file
                # (in the temporary templdir, with an ascii
                # filename) and use that.
                contents = util.readfile(value)
                value = os.path.basename(value)
                value = "".join(c for c in value if ord(c) < 128 and c != " ")
                removefiles.append(self.templdir + os.sep + value)
                util.writefile(self.templdir + os.sep + value, contents)
            if os.sep == "\\":
                value = value.replace(os.sep, "/")
            strparams[key] = XSLT.strparam(value)
    try:
        return self._transformer(indata, **strparams)
    except etree.XSLTApplyError as e:
        # the exception will only contain the last error. Errors
        # emanating from the xhtml file will not have file/line
        # number information. Errors emanating from the xslt file
        # do have file/line number info, and are probably more
        # useful to deal with.
        for error in self._transformer.error_log:
            if error.line:
                log.error("%s: %s (line %s)" %
                          (error.filename, error.message, error.line))
        raise errors.TransformError(str(e))
    finally:
        for f in removefiles:
            util.robust_remove(f)
    # FIXME: This can never be reached; if _transformer() does not
    # raise an error, the above returns immediately.
    if len(self._transformer.error_log) > 0:
        raise errors.TransformError(str(self._transformer.error_log))
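# Sketch of the filename-sanitizing rule in transform() above: keep
# only ASCII, non-space characters of the basename so the name is safe
# to pass as an XSLT string parameter.
import os.path

def ascii_name(filename):
    base = os.path.basename(filename)
    return "".join(c for c in base if ord(c) < 128 and c != " ")

assert ascii_name("/tmp/ann oté.xml") == "annot.xml"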
def parse(self, basefile):
    if basefile in ("1", "3"):
        util.writefile(self.store.parsed_path(basefile),
                       "basefile %s, parsed by a" % basefile)
        util.writefile(self.store.distilled_path(basefile),
                       "basefile %s, metadata from a" % basefile)
        return True
    else:
        return False  # we don't even have this basefile
def test_get_serialized_file(self):
    want = tempfile.mktemp(suffix=".nt")
    util.writefile(want, util.readfile("test/files/datasets/dataset.nt"))
    got = tempfile.mktemp(suffix=".nt")
    self.loader.add_serialized(
        util.readfile("test/files/datasets/dataset.nt"), format="nt")
    del self.loader
    self.store.get_serialized_file(got, format="nt")
    self.assertEqualGraphs(want, got)
def test_combining(self):
    # Test 2: combining, resources specified by global config
    # (maybe we should use smaller CSS+JS files? Test takes 2+ seconds...)
    s = os.sep
    want = {'css': [s.join(['rsrc', 'css', 'combined.css'])],
            'js': [s.join(['rsrc', 'js', 'combined.js'])],
            'img': [],
            'xml': [s.join(['rsrc', 'resources.xml'])]
            }
    testcss = ["css/ferenda.css", "res/css/fake1.css", "res/css/fake2.css"]
    testjs = ["js/ferenda.js", "res/js/fake1.js", "res/js/fake2.js"]
    resources = Resources([staticmockclass(), staticmockclass2()],
                          self.tempdir + os.sep + 'rsrc',
                          combineresources=True,
                          cssfiles=testcss,
                          jsfiles=testjs,
                          sitename="Blahonga",
                          sitedescription="A non-default value")
    rl = resources.resourceloader
    testcssfiles = []
    testjsfiles = []
    for cssfile in testcss:
        try:
            testcssfiles.append(rl.filename(cssfile))
        except errors.ResourceNotFound:
            util.writefile(cssfile,
                           ("/* this is a faked css file: %s */" % cssfile) * 1000)
            testcssfiles.append(cssfile)
    for jsfile in testjs:
        try:
            testjsfiles.append(rl.filename(jsfile))
        except errors.ResourceNotFound:
            util.writefile(jsfile,
                           ("/* this is a faked js file: %s */" % jsfile) * 1000)
            testjsfiles.append(jsfile)
    got = resources.make(api=False)
    self.assertEqual(want, got)
    tree = ET.parse(self.tempdir + '/' + got['xml'][0])
    stylesheets = tree.find("stylesheets").getchildren()
    self.assertEqual(len(stylesheets), 1)
    self.assertEqual(stylesheets[0].attrib['href'], 'rsrc/css/combined.css')
    javascripts = tree.find("javascripts").getchildren()
    self.assertEqual(len(javascripts), 1)
    self.assertEqual(javascripts[0].attrib['src'], 'rsrc/js/combined.js')
    self.assertEqual(tree.find("sitename").text, "Blahonga")
    self.assertEqual(tree.find("sitedescription").text, "A non-default value")
    self.assertTrue(os.path.exists(self.tempdir + '/rsrc/css/combined.css'))
    self.assertTrue(os.path.exists(self.tempdir + '/rsrc/js/combined.js'))
    # check that the combining/minifying indeed saved us some space
    self.assertLess(os.path.getsize(self.tempdir + '/rsrc/css/combined.css'),
                    sum([os.path.getsize(x) for x in testcssfiles]))
    self.assertLess(os.path.getsize(self.tempdir + '/rsrc/js/combined.js'),
                    sum([os.path.getsize(x) for x in testjsfiles]))
def parse(self, doc):
    # create an intermediate file before we know the correct
    # path for it. Later steps should move this file to the
    # correct place.
    util.writefile(self.store.intermediate_path(doc.basefile), "dummy")
    doc.meta.add((rdflib.URIRef(doc.uri),
                  DCTERMS.title,
                  rdflib.Literal("Hello World", lang="en")))
    doc.body = Body([H1(["Hello world"])])
    doc.basefile = doc.basefile.replace("a/", "b/")
    return True
def test_republishsource(self):
    self.repo.config.republishsource = True
    for basefile in range(25):
        util.writefile(self.repo.store.downloaded_path(str(basefile)),
                       "Source content")
    entries = sorted(list(self.repo.news_entries()),
                     key=attrgetter('updated'), reverse=True)
    self.assertEqual(entries[0].content['src'],
                     self.repo.downloaded_url("24"))
def test_list_basefiles_file(self):
    files = ["downloaded/123/a.html", "downloaded/123/b.html",
             "downloaded/124/a.html", "downloaded/124/b.html"]
    basefiles = ["124/b", "124/a", "123/b", "123/a"]
    for f in files:
        util.writefile(self.p(f), "Nonempty")
    self.assertEqual(list(self.store.list_basefiles_for("parse")),
                     basefiles)
def test_list_basefiles_postgenerate_file(self):
    files = ["generated/123/a.html", "generated/123/b.html",
             "generated/124/a.html", "generated/124/b.html"]
    basefiles = ["124/b", "124/a", "123/b", "123/a"]
    for f in files:
        util.writefile(self.p(f), "nonempty")
    self.assertEqual(list(self.store.list_basefiles_for("_postgenerate")),
                     basefiles)
def test_ifneeded_relate(self):
    @ifneeded("relate")
    def testfunc(repo, basefile, needed):
        repo.called = True
        repo.needed = needed

    try:
        datadir = tempfile.mkdtemp()
        mockbasefile = "1234"
        mockrepo = Mock()
        mockrepo.store = DocumentStore(datadir=datadir)
        mockrepo.called = False
        mockrepo.config.force = False
        # create some docentry file in a good place
        de = DocumentEntry(mockrepo.store.documententry_path("1234"))
        now = datetime.datetime.now()
        de.indexed_ts = now + datetime.timedelta(seconds=3600)
        de.indexed_ft = now + datetime.timedelta(seconds=-3600)
        de.indexed_dep = now + datetime.timedelta(seconds=-3600)
        de.save()

        # test 1: Outfile is newer - the ifneeded decorator should
        # make sure the actual testfunc code is never reached.
        # NOTE: the "relate" branch of DocumentStore.needed
        # doesn't use outfile_is_newer, so we can't patch that; we
        # have to create actual files
        parsedpath = mockrepo.store.parsed_path("1234")
        util.writefile(parsedpath, "dummy")
        os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
        testfunc(mockrepo, mockbasefile)
        self.assertFalse(mockrepo.called)
        mockrepo.called = False

        # test 2: Outfile is older than the information in the
        # documententry file
        os.utime(parsedpath, (now.timestamp(), now.timestamp()))
        testfunc(mockrepo, mockbasefile)
        self.assertTrue(mockrepo.called)
        self.assertTrue(mockrepo.needed)
        self.assertFalse(mockrepo.needed.triples)
        self.assertFalse(mockrepo.needed.dependencies)
        self.assertTrue(mockrepo.needed.fulltext)
        mockrepo.called = False

        # test 3: Outfile is newer, but the global force option was set
        os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
        mockrepo.config.force = True
        testfunc(mockrepo, mockbasefile)
        self.assertTrue(mockrepo.called)
        mockrepo.config.force = None
        mockrepo.called = False
    finally:
        if os.path.exists(datadir):
            shutil.rmtree(datadir)
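# A hypothetical sketch of the @ifneeded decorator the test above
# exercises (ferenda's real implementation differs in detail): ask the
# store whether the action is needed for this basefile, and skip the
# wrapped function unless it is or config.force is set. The names
# store.needed and config.force follow the test's mocks.
import functools

def ifneeded(action):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(repo, basefile):
            needed = repo.store.needed(basefile, action)
            if not needed and not repo.config.force:
                return  # outputs are up to date; do nothing
            return func(repo, basefile, needed)
        return wrapper
    return decorator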
def download(self, basefile=None):
    # Get all "term sets" (used dcterms:subject objects, wiki pages
    # describing legal concepts, Swedish Wikipedia pages...)
    terms = defaultdict(dict)

    # 1) Query the triplestore for all dcterms:subject triples (is this
    # semantically sensible for a "download" action -- the content
    # isn't really external?) -- term set "subjects" (these come
    # from both court cases and legal definitions in law text)
    sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?uri ?subject ?label
        WHERE {
            { ?uri dcterms:subject ?subject . }
            OPTIONAL { ?subject rdfs:label ?label . }
        }
    """
    store = TripleStore.connect(self.config.storetype,
                                self.config.storelocation,
                                self.config.storerepository)
    results = store.select(sq, "python")
    for row in results:
        if 'label' in row:
            label = row['label']
        else:
            label = self.basefile_from_uri(row['subject'])
        if label is None:
            self.log.warning("could not determine keyword from %s" %
                             row['subject'])
            continue
        sanitized = self.sanitize_term(label)
        if sanitized:
            if sanitized not in terms:
                terms[sanitized]['subjects'] = []
            terms[sanitized]['subjects'].append(row['uri'])
    self.log.debug("Retrieved %s subject terms from triplestore" % len(terms))

    for termset_func in self.termset_funcs:
        termset_func(terms)

    for term in terms:
        term = self.sanitize_term(term)
        if not term:
            continue
        oldterms = ""
        termpath = self.store.downloaded_path(term)
        if os.path.exists(termpath):
            oldterms = yaml.load(util.readfile(termpath))
        if terms[term] != oldterms:
            util.ensure_dir(termpath)
            util.writefile(termpath,
                           yaml.dump(terms[term], default_flow_style=False))
            self.log.info("%s: in %s termsets" % (term, len(terms[term])))
        else:
            self.log.debug("%s: skipped" % term)
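# Minimal sketch of the write-only-if-changed pattern in download()
# above: rewrite the YAML file only when the data actually differs, so
# file timestamps (and any downstream staleness checks) stay
# meaningful.
import os
import yaml

def write_if_changed(path, data):
    if os.path.exists(path):
        with open(path) as fp:
            if yaml.safe_load(fp) == data:
                return False  # identical: leave the file untouched
    with open(path, "w") as fp:
        yaml.safe_dump(data, fp, default_flow_style=False)
    return True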
def test_listdirs(self):
    util.writefile(self.p("foo.txt"), "Hello")
    util.writefile(self.p("bar.txt"), "Hello")
    util.writefile(self.p("foo/2.txt"), "Hello")
    util.writefile(self.p("foo/10.txt"), "Hello")
    util.writefile(self.datadir + "/foo/baz.text", "Hello")
    generator = util.list_dirs(self.datadir, ".txt")
    self.assertEqual(self.p("bar.txt"), next(generator))
    self.assertEqual([self.p("foo.txt"),
                      self.p("foo/2.txt"),
                      self.p("foo/10.txt")], list(generator))
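# The expected order above (2.txt before 10.txt) implies that
# util.list_dirs sorts numerically rather than lexicographically; a
# minimal sketch of such a natural-sort key:
import re

def natural_key(name):
    return [int(part) if part.isdigit() else part
            for part in re.split(r"(\d+)", name)]

assert sorted(["10.txt", "2.txt"], key=natural_key) == ["2.txt", "10.txt"]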
def test_list_versions_file(self):
    files = ["archive/downloaded/123/a/1.html",
             "archive/downloaded/123/a/2.html",
             "archive/downloaded/123/a/2bis.html",
             "archive/downloaded/123/a/10.html"]
    versions = ["1", "2", "2bis", "10"]
    for f in files:
        util.writefile(self.p(f), "nonempty")
    # list_versions(basefile, action)
    self.assertEqual(list(self.store.list_versions("123/a", "downloaded")),
                     versions)
def test_list_versions_file(self):
    files = ["archive/downloaded/123/a/.versions/1.html",
             "archive/downloaded/123/a/.versions/2.html",
             "archive/downloaded/123/a/.versions/2bis.html",
             "archive/downloaded/123/a/.versions/10.html"]
    versions = ["1", "2", "2bis", "10"]
    for f in files:
        util.writefile(self.p(f), "nonempty")
    # list_versions(basefile, action)
    self.assertEqual(list(self.store.list_versions("123/a", "downloaded")),
                     versions)
def test_list_basefiles_generate_dir(self):
    files = ["parsed/123/a/index.xhtml", "parsed/123/b/index.xhtml",
             "parsed/124/a/index.xhtml", "parsed/124/b/index.xhtml"]
    basefiles = ["124/b", "124/a", "123/b", "123/a"]
    self.store.storage_policy = "dir"
    for f in files:
        util.writefile(self.p(f), "nonempty")
    self.assertEqual(list(self.store.list_basefiles_for("generate")),
                     basefiles)
def test_list_versions_dir(self):
    files = ["archive/downloaded/123/a/1/index.html",
             "archive/downloaded/123/a/2/index.html",
             "archive/downloaded/123/a/2bis/index.html",
             "archive/downloaded/123/a/10/index.html"]
    basefiles = ['123/a']
    versions = ["1", "2", "2bis", "10"]
    for f in files:
        util.writefile(self.p(f), "nonempty")
    self.store.storage_policy = "dir"
    self.assertEqual(list(self.store.list_versions("123/a", "downloaded")),
                     versions)
def test_download(self):
    # create a basic.json + 1-2 resources
    os.mkdir(self.datadir + "/source")
    with open(self.datadir + "/source/basic.json", "w") as fp:
        s = json.dumps(self.basicjson, separators=(', ', ': '))
        fp.write(s)
    util.writefile(self.datadir + "/source/index.html",
                   "<p><a href='doc/a_.html'>ID: a</a></p>")
    util.writefile(self.datadir + "/source/a_.html",
                   "<p>This is doc A</p>")
    self._runtest()
def test_distill_setfile(self):
    os.mkdir(self.datadir + "/downloaded")
    util.writefile(self.datadir + "/downloaded/a.html",
                   "<p>This is doc A</p>")
    util.writefile(self.datadir + "/distilled/a.ttl", "")
    os.environ["FERENDA_SET_TESTFILE"] = "1"
    with patch("builtins.print") as printmock:
        self._runtest()
    del os.environ["FERENDA_SET_TESTFILE"]
    self.assertEqual(self.expected_ttl,
                     util.readfile(self.datadir + "/distilled/a.ttl"))
def test_list_versions_dir(self):
    files = ["archive/downloaded/123/a/.versions/1/index.html",
             "archive/downloaded/123/a/.versions/2/index.html",
             "archive/downloaded/123/a/.versions/2bis/index.html",
             "archive/downloaded/123/a/.versions/10/index.html"]
    basefiles = ['123/a']
    versions = ["1", "2", "2bis", "10"]
    for f in files:
        util.writefile(self.p(f), "nonempty")
    self.store.storage_policy = "dir"
    self.assertEqual(list(self.store.list_versions("123/a", "downloaded")),
                     versions)
def test_list_attachments(self):
    self.store.storage_policy = "dir"  # attachments require this
    files = ["downloaded/123/a/index.html",
             "downloaded/123/a/attachment.html",
             "downloaded/123/a/appendix.pdf",
             "downloaded/123/a/other.txt"]
    basefiles = ['123/a']
    attachments = ['appendix.pdf', 'attachment.html', 'other.txt']
    for f in files:
        util.writefile(self.p(f), "nonempty")
    # list_attachments(basefile, action, version=None)
    self.assertEqual(list(self.store.list_attachments("123/a", "downloaded")),
                     attachments)
def test_list_basefiles_parse_dir(self):
    files = ["downloaded/123/a/index.html", "downloaded/123/b/index.html",
             "downloaded/124/a/index.html", "downloaded/124/b/index.html"]
    basefiles = ["124/b", "124/a", "123/b", "123/a"]
    self.store.storage_policy = "dir"
    for f in files:
        p = self.p(f)
        util.writefile(p, "nonempty")
    self.assertEqual(list(self.store.list_basefiles_for("parse")),
                     basefiles)
def test_list_attachments(self):
    files = [
        "downloaded/123/a/index.html",
        "downloaded/123/a/attachment.html",
        "downloaded/123/a/appendix.pdf",
        "downloaded/123/a/other.txt",
    ]
    basefiles = ["123/a"]
    attachments = ["appendix.pdf", "attachment.html", "other.txt"]
    for f in files:
        util.writefile(self.p(f), "nonempty")
    # list_attachments(basefile, action, version=None)
    self.assertEqual(list(self.store.list_attachments("123/a", "downloaded")),
                     attachments)
def test_run_makeresources(self):
    # 1. setup test_run_enable
    # 2. run('all', 'makeresources')
    # 3. verify that all css/js files specified by default and in
    #    Testrepo get copied (remove rsrc)
    # 4. run('all', 'makeresources', '--combine')
    # 5. verify that a single css and js file is created
    self._enable_repos()
    s = os.sep
    want = {
        'css': [
            s.join(['rsrc', 'css', 'test.css']),
            s.join(['rsrc', 'css', 'other.css'])
        ],
        'js': [s.join(['rsrc', 'js', 'test.js'])],
        'img': [s.join(['rsrc', 'img', 'test.png'])],
        'json': [
            s.join(['rsrc', 'api', 'context.json']),
            s.join(['rsrc', 'api', 'common.json']),
            s.join(['rsrc', 'api', 'terms.json'])
        ],
        'xml': [s.join(['rsrc', 'resources.xml'])]
    }
    got = manager.run(['all', 'makeresources'])
    self.assertEqual(want, got)
    # 6. alter the ferenda.ini so that it doesn't specify any css/js files
    util.writefile(
        "ferenda.ini",
        """[__root__]
loglevel=WARNING
datadir = %s
url = http://localhost:8000/
searchendpoint = /search/
apiendpoint = /api/
cssfiles = []
jsfiles = []
imgfiles = []
""" % self.tempdir)
    want = {
        'css': [],
        'js': [],
        'img': [],
        'json': [
            s.join(['rsrc', 'api', 'context.json']),
            s.join(['rsrc', 'api', 'common.json']),
            s.join(['rsrc', 'api', 'terms.json'])
        ],
        'xml': [s.join(['rsrc', 'resources.xml'])]
    }
    got = manager.run(['all', 'makeresources'])
    self.assertEqual(want, got)
def setUp(self):
    super(EqualDirs, self).setUp()
    self.datadir = tempfile.mkdtemp()
    util.writefile(self.datadir + "/want/one.txt", "Contents of one")
    util.writefile(self.datadir + "/got/one.txt", "Contents of one")
    util.writefile(self.datadir + "/want/sub/two.text", "Contents of two")
    util.writefile(self.datadir + "/got/sub/two.text", "Contents of two")
def test_replace_if_different(self):
    # test 1: dst does not exist
    util.writefile(self.fname, "Hello")
    self.assertTrue(util.replace_if_different(self.fname, self.fname2))
    self.assertFalse(os.path.exists(self.fname))
    self.assertTrue(os.path.exists(self.fname2))

    # test 2: dst exists, but is different (gets overwritten)
    util.writefile(self.fname, "Hello (different)")
    self.assertTrue(util.replace_if_different(self.fname, self.fname2))
    self.assertFalse(os.path.exists(self.fname))
    self.assertEqual("Hello (different)", util.readfile(self.fname2))

    # test 3: src and dst are identical (src gets removed)
    util.writefile(self.fname, "Hello (different)")
    self.assertFalse(util.replace_if_different(self.fname, self.fname2))
    self.assertFalse(os.path.exists(self.fname))

    # test 4: dst exists, is different, gets archived
    newfile = self.dname + "/new.txt"
    archivefile = self.dname + "/archive.txt"
    util.writefile(newfile, "Hello (archiving)")
    self.assertTrue(util.replace_if_different(newfile, self.fname2, archivefile))
    self.assertFalse(os.path.exists(newfile))
    self.assertEqual("Hello (archiving)", util.readfile(self.fname2))
    self.assertEqual("Hello (different)", util.readfile(archivefile))
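# A hypothetical reimplementation of the contract this test verifies
# (not ferenda's actual util.replace_if_different): move src over dst
# only when their contents differ, optionally archiving the old dst;
# src is always consumed; the return value says whether dst changed.
import filecmp
import os
import shutil

def replace_if_different(src, dst, archivefile=None):
    if os.path.exists(dst) and filecmp.cmp(src, dst, shallow=False):
        os.unlink(src)  # identical: just drop src
        return False
    if os.path.exists(dst) and archivefile:
        shutil.move(dst, archivefile)  # keep the old version around
    shutil.move(src, dst)
    return True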
def test_list_invalid_attachments(self):
    # test that files with an invalid suffix (in
    # store.invalid_suffixes) are not listed
    self.store.storage_policy = "dir"  # attachments require this
    files = ["downloaded/123/a/index.html",
             "downloaded/123/a/index.invalid",
             "downloaded/123/a/other.invalid",
             "downloaded/123/a/other.txt"]
    basefiles = ['123/a']
    attachments = ['other.txt']
    for f in files:
        util.writefile(self.p(f), "nonempty")
    # list_attachments(basefile, action, version=None)
    self.assertEqual(list(self.store.list_attachments("123/a", "downloaded")),
                     attachments)
def test_parse_setfile(self):
    os.mkdir(self.datadir + "/downloaded")
    util.writefile(self.datadir + "/downloaded/a.html",
                   "<p>This is doc A</p>")
    util.writefile(self.datadir + "/parsed/a.xhtml", "")
    os.environ["FERENDA_SET_TESTFILE"] = "1"
    with patch("builtins.print") as printmock:
        self._runtest()
    output = printmock.mock_calls[0][1][0]
    output = re.sub("'[^']*'", "''", output, 1)
    self.assertEqual("Overwriting '' with result of parse ('a')", output)
    del os.environ["FERENDA_SET_TESTFILE"]
    self.assertEqualXML(self.expected_xhtml,
                        util.readfile(self.datadir + "/parsed/a.xhtml"))