def download(self, basefile=None):
    if basefile:
        return self.download_single(basefile)
    if not self.config.mediawikidump:
        raise ConfigurationError("config.mediawikidump not set")
    resp = requests.get(self.config.mediawikidump)
    xml = etree.parse(BytesIO(resp.content))
    MW_NS = "{%s}" % xml.getroot().nsmap[None]
    wikinamespaces = []
    for ns_el in xml.findall("//" + MW_NS + "namespace"):
        wikinamespaces.append(ns_el.text)
    # Get list of currently downloaded pages - if any of those
    # does not appear in the XML dump, remove them afterwards
    basefiles = list(self.store.list_basefiles_for("parse"))
    for page_el in xml.findall(MW_NS + "page"):
        basefile = page_el.find(MW_NS + "title").text
        if basefile == "Huvudsida":
            continue
        if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
            (namespace, localtitle) = basefile.split(":", 1)
            if namespace not in self.config.mediawikinamespaces:
                continue
        # etree.tostring with an encoding argument returns bytes, so
        # the file must be opened in binary mode
        with self.store.open_downloaded(basefile, "wb") as fp:
            fp.write(etree.tostring(page_el, encoding="utf-8"))
        if basefile in basefiles:
            del basefiles[basefiles.index(basefile)]
    for b in basefiles:
        self.log.debug("Removing stale %s" % b)
        util.robust_remove(self.store.downloaded_path(b))

def wrapper(self, doc):
    try:
        return f(self, doc)
    except DocumentRemovedError as e:
        self.log.info("Document has been removed (%s)", e)
        util.robust_remove(self.parsed_path(doc.basefile))
        return False
    except ParseError as e:
        self.log.error("ParseError %s", e)
        # FIXME: we'd like to use the shorter "if
        # 'fatalexceptions' in self.config" but a Mock we're
        # using in testDecorators.Decorators.test_handleerror does
        # not emulate this way of using the LayeredConfig
        # object. Until we rewrite the testcase better, this is
        # what we have to do.
        if (hasattr(self.config, 'fatalexceptions') and
                self.config.fatalexceptions):
            raise
        else:
            return False
    except Exception:
        self.log.exception("parse failed")
        # FIXME: see above
        if (hasattr(self.config, 'fatalexceptions') and
                self.config.fatalexceptions):
            raise
        else:
            return False

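# This wrapper is the inner function of an error-handling decorator (the
# FIXME above names testDecorators.Decorators.test_handleerror). A
# minimal sketch of how such a decorator is wired up and applied --
# assuming the name "handleerror"; the real ferenda.decorators
# implementation may differ:
import functools

def handleerror(f):
    @functools.wraps(f)
    def wrapper(self, doc):
        try:
            return f(self, doc)
        except Exception:
            self.log.exception("parse failed")
            if getattr(self.config, 'fatalexceptions', False):
                raise
            return False
    return wrapper

# Applied to a repo's parse method:
#
# class MyRepo(DocumentRepository):
#     @handleerror
#     def parse(self, doc):
#         ...
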
def GenerateMapAll(self):
    mapfile = os.path.sep.join(
        [self.baseDir, 'dv', 'generated', 'uri.map'])
    util.robust_remove(mapfile + ".new")
    parsed_dir = os.path.sep.join([self.baseDir, 'dv', 'parsed'])
    self._do_for_all(parsed_dir, '.xht2', self.GenerateMap)
    util.robust_rename(mapfile + ".new", mapfile)

def test_loadgraphs(self):
    with open("graph_a.ttl", "w") as fp:
        fp.write(self.graph_a)
    with open("graph_a.nt", "w") as fp:
        fp.write(self.graph_a_nt)
    self.tester.assertEqualGraphs("graph_a.ttl", "graph_a.nt")
    util.robust_remove("graph_a.ttl")
    util.robust_remove("graph_a.nt")

def transform(self, indata, config=None, parameters={}):
    strparams = {}
    if config:
        # paths to be used with the document() function
        # must use unix path separators
        if os.sep == "\\":
            config = config.replace(os.sep, "/")
        # print("Transform: Using config %s. Contents:" % config)
        # print(util.readfile(config))
        config_fullpath = os.path.abspath(config)
        strparams['configurationfile'] = XSLT.strparam(config_fullpath)
    removefiles = []
    for key, value in parameters.items():
        if key.endswith("file") and value:
            if all(ord(c) < 128 and c != " " for c in value):
                # If the file name contains ONLY ascii chars and
                # no spaces, we can use it directly. However, we
                # need to relativize the path of the file relative
                # to the XSL file we'll be using. The mechanism
                # could be clearer...
                value = os.path.relpath(value, self.templdir)
            else:
                # If the filename contains non-ascii characters or
                # spaces, any attempt to eg
                # "document($annotationfile)" in the XSLT document
                # will silently fail. Seriously, f**k lxml's error
                # handling. In this case, copy it to a temp file
                # (in the temporary templdir, with ascii filename)
                # and use that.
                contents = util.readfile(value)
                value = os.path.basename(value)
                value = "".join(c for c in value if ord(c) < 128 and c != " ")
                removefiles.append(self.templdir + os.sep + value)
                util.writefile(self.templdir + os.sep + value, contents)
            if os.sep == "\\":
                value = value.replace(os.sep, "/")
            strparams[key] = XSLT.strparam(value)
    try:
        return self._transformer(indata, **strparams)
    except etree.XSLTApplyError as e:
        # the exception will only contain the last error. Errors
        # emanating from the xhtml file will not have file/line
        # number information. Errors emanating from the xslt file
        # do have file/line number info, and are probably more
        # useful to deal with.
        for error in self._transformer.error_log:
            if error.line:
                log.error("%s: %s (line %s)" %
                          (error.filename, error.message, error.line))
        raise errors.TransformError(str(e))
    finally:
        for f in removefiles:
            util.robust_remove(f)
    # FIXME: This can never be reached; if _transformer() does not
    # raise an error, the try block above returns immediately.
    if len(self._transformer.error_log) > 0:
        raise errors.TransformError(str(self._transformer.error_log))

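# transform() wraps every parameter value in XSLT.strparam before
# handing it to the compiled stylesheet. This is lxml's documented way
# of passing string parameters safely: without it, values containing
# quote characters would need manual escaping. A quick illustration
# (the stylesheet and parameter names are hypothetical):
from lxml import etree

safe = etree.XSLT.strparam('a value with "quotes" and spaces')
# transformer = etree.XSLT(etree.parse("style.xsl"))
# result = transformer(doc, annotationfile=safe)
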
def textreader_from_basefile(self, basefile, encoding):
    infile = self.store.downloaded_path(basefile)
    tmpfile = self.store.path(basefile, "intermediate", ".pdf")
    outfile = self.store.path(basefile, "intermediate", ".txt")
    util.copy_if_different(infile, tmpfile)
    # pdftotext writes its output next to the input file, with the
    # extension changed to .txt -- i.e. exactly at outfile
    util.runcmd("pdftotext %s" % tmpfile, require_success=True)
    util.robust_remove(tmpfile)
    return TextReader(outfile, encoding=encoding, linesep=TextReader.UNIX)

def tearDown(self):
    manager.config_loaded = False
    manager.shutdown_logger()
    if self.orig_cwd:
        os.chdir(self.orig_cwd)
        shutil.rmtree(self.tempdir)
        sys.path.remove(self.tempdir)
    else:
        # all tests took place in the project directory, so we
        # have to clean some crap out.
        for crap in ("ferenda.ini", "example.py", "index.html",
                     "index.xhtml", "other.css", "rsrc", "data",
                     "dummyfile.txt", "test.css", "test.js", "test.png"):
            util.robust_remove(crap)

def test_drawboxes(self):
    pypdfmock = MagicMock()
    canvasmock = MagicMock()
    mocks = {'PyPDF2': pypdfmock,
             'reportlab': MagicMock(),
             'reportlab.pdfgen': MagicMock(),
             'reportlab.pdfgen.canvas': canvasmock}
    with patch.dict('sys.modules', mocks):
        metrics = self.analyzer.metrics()
        pdfpath = "test/files/pdfanalyze/lipsum.debug.pdf"
        self.analyzer.drawboxes(pdfpath, metrics=metrics)
    self.assertTrue(canvasmock.Canvas.called)
    self.assertTrue(pypdfmock.PdfFileReader.called)
    self.assertTrue(pypdfmock.PdfFileWriter.called)
    util.robust_remove(pdfpath)

def download(self, basefile=None):
    if basefile:
        return self.download_single(basefile)
    if self.config.mediawikidump:
        resp = requests.get(self.config.mediawikidump)
        xmldumppath = self.store.path('dump', 'downloaded', '.xml')
        with self.store._open(xmldumppath, mode="wb") as fp:
            fp.write(resp.content)
        xml = etree.parse(xmldumppath)
    else:
        raise ConfigurationError("config.mediawikidump not set")
    MW_NS = "{%s}" % xml.getroot().nsmap[None]
    wikinamespaces = []
    for ns_el in xml.findall("//" + MW_NS + "namespace"):
        wikinamespaces.append(ns_el.text)
    # Get list of existing basefiles - if any of those
    # does not appear in the XML dump, remove them afterwards
    basefiles = list(self.store.list_basefiles_for("parse"))
    for page_el in xml.findall(MW_NS + "page"):
        basefile = page_el.find(MW_NS + "title").text
        if basefile == "Huvudsida":
            continue
        if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
            (namespace, localtitle) = basefile.split(":", 1)
            if namespace not in self.config.mediawikinamespaces:
                continue
        self.log.info("%s: extracting from XML dump" % basefile)
        # etree.tostring with an encoding argument returns bytes, so
        # the file must be opened in binary mode
        with self.store.open_downloaded(basefile, "wb") as fp:
            fp.write(etree.tostring(page_el, encoding="utf-8"))
        if basefile in basefiles:
            del basefiles[basefiles.index(basefile)]
    for b in basefiles:
        self.log.debug("%s: removing stale document" % b)
        util.robust_remove(self.store.downloaded_path(b))

def archive(self, basefile, version, overwrite=False, copy=False):
    """Moves the current version of a document to an archive. All
    files related to the document are moved (downloaded, parsed,
    generated files and any existing attachment files).

    :param basefile: The basefile of the document to archive
    :type basefile: str
    :param version: The version id to archive under
    :type version: str
    :param overwrite: Whether to overwrite an existing archived version
    :type overwrite: bool
    :param copy: Whether to copy the files instead of moving them
    :type copy: bool
    :raises errors.ArchivingError: If the archive destination already
        exists and overwrite is False
    """
    for meth in (self.downloaded_path, self.documententry_path,
                 self.parsed_path, self.serialized_path,
                 self.distilled_path, self.annotation_path,
                 self.generated_path):
        # FIXME: what about intermediate? Ignore them as they
        # should be able to be regenerated at any time?
        src = meth(basefile)
        dest = meth(basefile, version)
        if self.storage_policy == "dir" and meth in (self.downloaded_path,
                                                     self.parsed_path,
                                                     self.generated_path):
            src = os.path.dirname(src)
            dest = os.path.dirname(dest)
        if not os.path.exists(src):
            continue
        if os.path.exists(dest):
            if overwrite:
                util.robust_remove(dest)
            else:
                raise errors.ArchivingError(
                    "Archive destination %s for basefile %s version %s "
                    "already exists!" % (dest, basefile, version))
        # self.log.debug("Archiving %s to %s" % (src, dest))
        util.ensure_dir(dest)
        if copy:
            shutil.copy2(src, dest)
        else:
            shutil.move(src, dest)

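# A hedged usage sketch for archive(): "store", the basefile and the
# version id are illustrative, not from the original code. Note that the
# ArchivingError path is only hit when current files exist again (e.g.
# after a re-download) and the same version id is reused:
store.archive("doc1", "1.0")       # move the current files into the archive
# ... the document is downloaded and parsed again ...
try:
    store.archive("doc1", "1.0")   # same version id: destination exists
except errors.ArchivingError:
    store.archive("doc1", "1.0", overwrite=True)  # replace the archived copy
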
def test_margins(self):
    jsonpath = "test/files/pdfanalyze/lipsum.metrics.json"
    try:
        self.assertFalse(os.path.exists(jsonpath))
        metrics = self.analyzer.metrics(jsonpath, startpage=1)
        self.assertEqual({'default': {'family': 'Comic Sans MS', 'size': 14},
                          'bottommargin': 1149,
                          'h1': {'family': 'Cambria,Bold', 'size': 19},
                          'h2': {'family': 'Cambria,Bold', 'size': 17},
                          'h3': {'family': 'Cambria,Bold', 'size': 14},
                          'topmargin': 53,
                          'leftmargin': 135,
                          'leftmargin_even': 108,
                          'pageheight': 1262,
                          'pagewidth': 892,
                          'rightmargin': 780,
                          'rightmargin_even': 760,
                          'scanned_source': False},
                         metrics)
        self.assertTrue(os.path.exists(jsonpath))
    finally:
        util.robust_remove(jsonpath)

def wrapper(self, doc): try: return f(self, doc) except DocumentRemovedError as e: self.log.info( "%s: Document has been removed (%s)", doc.basefile, e) util.robust_remove(self.parsed_path(doc.basefile)) return False except KeyboardInterrupt: raise except ParseError as e: self.log.error("%s: ParseError %s", doc.basefile, e) if (hasattr(self.config, 'fatalexceptions') and self.config.fatalexceptions): raise else: return False except: self.log.exception("parse of %s failed", doc.basefile) if (hasattr(self.config, 'fatalexceptions') and self.config.fatalexceptions): raise else: return False
def test_robust_remove(self):
    util.writefile(self.fname, "Hello")
    util.robust_remove(self.fname)
    # removing an already-removed file must not raise
    util.robust_remove(self.fname)

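# The double call above is the point of the test: robust_remove must be
# a no-op when the file is already gone. A minimal sketch of such a
# helper -- the actual ferenda.util implementation may differ, e.g. in
# how it treats permission errors:
import os

def robust_remove(filename):
    """Remove a file, silently ignoring the case where it doesn't exist."""
    try:
        os.unlink(filename)
    except FileNotFoundError:
        pass
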
def query_webservice(self, query, page):
    # this is the only soap template we'll need, so we include it
    # verbatim to avoid having a dependency on a soap module like
    # zeep.
    endpoint = 'https://eur-lex.europa.eu/EURLexWebService'
    envelope = """<soap-env:Envelope xmlns:soap-env="http://www.w3.org/2003/05/soap-envelope">
  <soap-env:Header>
    <wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd">
      <wsse:UsernameToken>
        <wsse:Username>%s</wsse:Username>
        <wsse:Password Type="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-username-token-profile-1.0#PasswordText">%s</wsse:Password>
      </wsse:UsernameToken>
    </wsse:Security>
  </soap-env:Header>
  <soap-env:Body>
    <sear:searchRequest xmlns:sear="http://eur-lex.europa.eu/search">
      <sear:expertQuery>%s</sear:expertQuery>
      <sear:page>%s</sear:page>
      <sear:pageSize>%s</sear:pageSize>
      <sear:searchLanguage>%s</sear:searchLanguage>
    </sear:searchRequest>
  </soap-env:Body>
</soap-env:Envelope>
""" % (self.config.username, self.config.password,
       escape(query, quote=False), page, self.pagesize, self.lang)
    headers = {'Content-Type': 'application/soap+xml; charset=utf-8; '
                               'action="https://eur-lex.europa.eu/EURLexWebService/doQuery"',
               'SOAPAction': 'https://eur-lex.europa.eu/EURLexWebService/doQuery'}
    if self.config.curl:
        # dump the envelope to a tempfile
        headerstr = ""
        for k, v in headers.items():
            assert "'" not in v  # if it is, we need to work on escaping it
            headerstr += " --header '%s: %s'" % (k, v)
        with tempfile.NamedTemporaryFile() as fp:
            fp.write(envelope.encode("utf-8"))
            fp.flush()
            envelopename = fp.name
            headerfiledesc, headerfilename = tempfile.mkstemp()
            cmd = ('curl -L -X POST -D %(headerfilename)s '
                   '--data-binary "@%(envelopename)s" '
                   '%(headerstr)s %(endpoint)s' % locals())
            (ret, stdout, stderr) = util.runcmd(cmd)
        headerfp = os.fdopen(headerfiledesc)
        header = headerfp.read()
        headerfp.close()
        util.robust_remove(headerfilename)
        status, headers = header.split('\n', 1)
        prot, code, msg = status.split(" ", 2)
        headers = dict(email.message_from_string(headers).items())
        res = FakeResponse(int(code), stdout, headers)
    else:
        res = util.robust_fetch(self.session.post, endpoint,
                                self.log, raise_for_status=False,
                                data=envelope, headers=headers,
                                timeout=10)
    if res.status_code == 500:
        tree = etree.parse(BytesIO(res.content))
        statuscode = tree.find(
            ".//{http://www.w3.org/2003/05/soap-envelope}Subcode")[0].text
        statusmsg = tree.find(
            ".//{http://www.w3.org/2003/05/soap-envelope}Text").text
        raise errors.DownloadError("%s: %s" % (statuscode, statusmsg))
    elif res.status_code == 301:
        # the call to robust_fetch or curl should have followed
        # the redirect, but at this point we'll just have to
        # report the error
        raise errors.DownloadError("%s: was redirected to %s" %
                                   (endpoint, res.headers['Location']))
    return res

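# FakeResponse is referenced above but not defined in this snippet.
# Judging from how the result is consumed (status_code, content and
# headers, mirroring requests.Response), a minimal stand-in could be a
# namedtuple -- an assumption; the real class may carry more behavior:
from collections import namedtuple

FakeResponse = namedtuple('FakeResponse', ['status_code', 'content', 'headers'])
# res = FakeResponse(200, b'<soap-env:Envelope>...</soap-env:Envelope>', {})
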
    removed.append((inst, basefile))
fp.write('}\n')
for (inst, basefile) in removed:
    downloaded_path = inst.store.downloaded_path(basefile)
    storage_policy = inst.store.storage_policy
    if not os.path.exists(downloaded_path):
        # maybe the reason is that this is a compositerepo?
        # FIXME: maybe CompositeStore.downloaded_path and
        # friends should do this transparently?
        if hasattr(inst, 'get_preferred_instances'):
            subinsts = list(inst.get_preferred_instances(basefile))
            if not subinsts:
                print("%s %s: WARNING: no subinst handles this basefile" %
                      (inst.alias, basefile))
                continue
            subinst = subinsts[0]
            downloaded_path = subinst.store.downloaded_path(basefile)
            storage_policy = subinst.store.storage_policy
    assert os.path.exists(downloaded_path)
    print("%s %s: removing %s" % (inst.alias, basefile, downloaded_path))
    count["removed"] += 1
    if storage_policy == "dir":
        shutil.rmtree(os.path.dirname(downloaded_path))
        # print("shutil.rmtree(%s)" % os.path.dirname(downloaded_path))
    else:
        util.robust_remove(downloaded_path)
        # print("util.robust_remove(%s)" % downloaded_path)
print("%(unreferenced)s unreferenced docs, %(metadataonly)s set to "
      "'metadataonly', %(removed)s removed" % count)

def tearDown(self):
    util.robust_remove("test/files/pdfanalyze/lipsum.metrics.json")
    util.robust_remove("test/files/pdfanalyze/lipsum.plot.png")
    util.robust_remove("test/files/pdfanalyze/lipsum.debug.pdf")

def download_single(self, basefile, url=None):
    if url is None:
        url = self.remote_url(basefile)
        if not url:  # remote_url failed
            return
    updated = created = False
    checked = True
    mainattachment = None
    if url in self.urlmap:
        attachment = self.urlmap[url]
    else:
        attachment = self.sniff_attachment(url)
    if attachment:
        self.urlmap[url] = attachment
        attachment += ".html"
    else:
        self.urlmap[url] = ''
        attachment = "index.html"
    downloaded_path = self.store.downloaded_path(basefile,
                                                 attachment=attachment)
    created = not os.path.exists(downloaded_path)
    if self.download_if_needed(url, basefile, filename=downloaded_path):
        text = util.readfile(downloaded_path)
        if "<div>Inga tr\xe4ffar</div>" in text:
            self.log.warning(
                "%s: Could not find this prop at %s, might be a bug" %
                (basefile, url))
            util.robust_remove(downloaded_path)
            return False
        if created:
            self.log.info("%s: downloaded from %s" % (basefile, url))
        else:
            self.log.info("%s: downloaded new version from %s" %
                          (basefile, url))
        updated = True
    else:
        self.log.debug("%s: exists and is unchanged" % basefile)
        text = util.readfile(downloaded_path)
    soup = BeautifulSoup(text, "lxml")
    del text
    attachment = self.find_attachment(soup)
    extraurls = []
    results = soup.find("div", "search-results-content")
    a = results.find("a", string="Hämta Pdf")
    if a:
        extraurls.append(a.get("href"))
    a = results.find("a", string="Hämta Doc")
    if a:
        extraurls.append(a.get("href"))
    # parse downloaded html/text page and find out extraurls
    for url in extraurls:
        if url.endswith('get=doc'):
            # NOTE: We cannot be sure that this is actually a Word
            # (CDF) file. For older files it might be a WordPerfect
            # (.wpd) or RTF file, for newer it might be a .docx. We
            # cannot be sure until we've downloaded it, so we
            # quickly read the first 4 bytes.
            r = requests.get(url, stream=True)
            sig = r.raw.read(4)
            if sig == b'\xffWPC':
                doctype = ".wpd"
            elif sig == b'\xd0\xcf\x11\xe0':
                doctype = ".doc"
            elif sig == b'PK\x03\x04':
                doctype = ".docx"
            elif sig == b'{\\rt':
                doctype = ".rtf"
            else:
                self.log.error(
                    "%s: Attached file has signature %r -- don't know "
                    "what type this is" % (basefile, sig))
                continue
        elif url.endswith('get=pdf'):
            doctype = ".pdf"
        else:
            self.log.warning("Unknown doc type %s" % url.split("get=")[-1])
            doctype = None
        if doctype:
            if attachment:
                filename = self.store.downloaded_path(
                    basefile, attachment=attachment + doctype)
            else:
                filename = self.store.downloaded_path(
                    basefile, attachment="index" + doctype)
            self.log.debug("%s: downloading attachment %s" %
                           (basefile, filename))
            self.download_if_needed(url, basefile, filename=filename)
    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated:
        entry.orig_updated = now
    if checked:
        entry.orig_checked = now
    entry.save()
    return updated

def download(self, basefile=None):
    def write_doc(basefile, page_el):
        writefile = False
        p = self.store.downloaded_path(basefile)
        newcontent = etree.tostring(page_el, encoding="utf-8")
        if not os.path.exists(p):
            writefile = True
        else:
            oldcontent = util.readfile(p, "rb")
            if newcontent != oldcontent:
                writefile = True
        if writefile:
            util.ensure_dir(p)
            with open(p, "wb") as fp:
                fp.write(newcontent)
            self.log.info("%s: extracting from XML dump" % basefile)
        if basefile in basefiles:
            del basefiles[basefiles.index(basefile)]

    if basefile:
        return self.download_single(basefile)
    if self.config.mediawikidump:
        xmldumppath = self.store.path('dump', 'downloaded', '.xml')
        resp = requests.get(self.config.mediawikidump)
        self.log.info("Loaded XML dump from %s" % self.config.mediawikidump)
        from ferenda.documentstore import _open
        with _open(xmldumppath, mode="wb") as fp:
            fp.write(resp.content)
        xml = etree.parse(xmldumppath)
    else:
        raise ConfigurationError("config.mediawikidump not set")
    MW_NS = "{%s}" % xml.getroot().nsmap[None]
    wikinamespaces = []
    for ns_el in xml.findall("//" + MW_NS + "namespace"):
        wikinamespaces.append(ns_el.text)
    # Get list of existing basefiles - if any of those
    # does not appear in the XML dump, remove them afterwards
    basefiles = list(self.store.list_basefiles_for("parse"))
    total = written = 0
    deferred = {}
    for page_el in xml.findall(MW_NS + "page"):
        basefile = page_el.find(MW_NS + "title").text
        if basefile == "Huvudsida":  # FIXME: generalize/make configurable
            continue
        if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
            (namespace, localtitle) = basefile.split(":", 1)
            if namespace not in self.config.mediawikinamespaces:
                continue
            # defer writing of this one, so that it overwrites any
            # similarly named pages from the main namespace. This
            # is so that Category pages about $TOPIC take
            # precedence over ordinary pages about $TOPIC
            deferred[localtitle] = page_el
        else:
            write_doc(basefile, page_el)
    for basefile, page_el in deferred.items():
        write_doc(basefile, page_el)
    if 'dump' in basefiles:  # never remove the dump file itself
        del basefiles[basefiles.index('dump')]
    for b in basefiles:
        self.log.info("%s: removing stale document" % b)
        util.robust_remove(self.store.downloaded_path(b))

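# For orientation, the download variants above all walk the standard
# MediaWiki XML export format. A minimal, illustrative dump follows --
# the namespace URI version and the page contents are assumptions based
# on the standard export schema, not taken from the original code:
from io import BytesIO
from lxml import etree

dump = b"""<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
  <siteinfo>
    <namespaces>
      <namespace key="14">Kategori</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>Kategori:Avtalsratt</title>
    <revision><text>wiki markup goes here</text></revision>
  </page>
</mediawiki>"""

xml = etree.parse(BytesIO(dump))
MW_NS = "{%s}" % xml.getroot().nsmap[None]   # same trick as above
print([el.text for el in xml.findall("//" + MW_NS + "namespace")])
# ['Kategori']
print(xml.findall(MW_NS + "page")[0].find(MW_NS + "title").text)
# Kategori:Avtalsratt
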
def test_fsmparse(self):
    try:
        # 1. write a new python module containing a class with a
        #    staticmethod
        with open("testparser.py", "w") as fp:
            fp.write("""
from six import text_type as str
from ferenda.elements import Body, Paragraph


class Testobject(object):
    @staticmethod
    def get_parser():
        return Parser()


class Parser(object):
    def parse(self, source):
        res = Body()
        for chunk in source:
            res.append(Paragraph([str(len(chunk.strip()))]))
        return res
""")
        import imp
        fp, pathname, desc = imp.find_module("testparser")
        imp.load_module("testparser", fp, pathname, desc)

        # 2. write a textfile with two paragraphs
        with open("testparseinput.txt", "w") as fp:
            fp.write("""This is one paragraph.

And another.
""")
        # 3. patch print and call fsmparse
        d = Devel()
        printmock = MagicMock()
        with patch('builtins.print', printmock):
            # 3.1 fsmparse dynamically imports the module and calls the
            #     method
            # 3.2 fsmparse asserts that the method returned a callable
            # 3.3 fsmparse calls it with an iterable of text chunks from
            #     the textfile
            # 3.4 fsmparse receives an Element structure and prints a
            #     serialized version
            d.fsmparse("testparser.Testobject.get_parser",
                       "testparseinput.txt")
        self.assertTrue(printmock.called)
        # 4. check that the expected thing was printed
        want = """
<Body>
  <Paragraph>
    <str>22</str>
  </Paragraph>
  <Paragraph>
    <str>12</str>
  </Paragraph>
</Body>
""".strip() + "\n"
        printmock.assert_has_calls([call(want)])
    finally:
        util.robust_remove("testparser.py")
        util.robust_remove("testparser.pyc")
        util.robust_remove("testparseinput.txt")
        if os.path.exists("__pycache__") and os.path.isdir("__pycache__"):
            shutil.rmtree("__pycache__")