def to_html(cls, kb_entry):
    """Render the wiki markup stored in *kb_entry* as serialized XHTML.

    The entry body (with carriage returns removed) is parsed with
    ``parseString`` using ``kb_entry.subject`` as the title and
    ``cls.NOCDB(kb_entry)`` as the wiki database, preprocessed, written
    through an ``MWXHTMLWriter`` and the writer's ``xmlbody`` element is
    serialized with ``ET.tostring``.

    Returns whatever ``ET.tostring`` yields for that element.
    """
    # Strip carriage returns so the parser only sees "\n" line endings.
    source = kb_entry.body.replace("\r", "")
    tree = parseString(
        title=kb_entry.subject,
        raw=source,
        wikidb=cls.NOCDB(kb_entry),
    )
    preprocess(tree)

    writer = MWXHTMLWriter()
    writer.writeBook(tree)
    return ET.tostring(writer.xmlbody)
def getXHTML(wikitext, title, language):
    """Convert *wikitext* to an XHTML string.

    The text is parsed with ``parseString`` against a ``DummyDB`` whose
    ``normalize_and_get_page`` attribute is replaced by ``noop``.  When the
    parser returns a falsy result, ``None`` is returned.  Otherwise the tree
    is preprocessed, passed through ``removeLangLinks`` and written with an
    ``MWXHTMLWriter``; the writer's ``asstring()`` result is returned.
    """
    wiki_db = DummyDB()
    wiki_db.normalize_and_get_page = noop

    tree = parseString(title=title, raw=wikitext, wikidb=wiki_db, lang=language)
    if not tree:
        return None

    preprocess(tree)
    removeLangLinks(tree)

    writer = MWXHTMLWriter()
    writer.writeBook(tree)
    return writer.asstring()
def to_html(cls, kb_entry):
    """Render the wiki markup stored in *kb_entry* as serialized XHTML.

    mwlib and ElementTree are imported inside the function, so they are
    only required when a conversion is actually performed.

    The entry body (with carriage returns removed) is parsed with
    ``parseString`` using ``kb_entry.subject`` as the title and
    ``cls.NOCDB(kb_entry)`` as the wiki database, preprocessed, written
    through an ``MWXHTMLWriter`` and the writer's ``xmlbody`` element is
    serialized with ``ET.tostring``.

    Returns whatever ``ET.tostring`` yields for that element.

    Raises ImportError if mwlib, or both ElementTree variants, are missing.
    """
    from mwlib.uparser import parseString
    from mwlib.xhtmlwriter import MWXHTMLWriter, preprocess
    try:
        import xml.etree.ElementTree as ET
    except ImportError:
        # Only a missing module should trigger the fallback; a bare except
        # would also hide KeyboardInterrupt and unrelated errors.
        from elementtree import ElementTree as ET

    # Strip carriage returns so the parser only sees "\n" line endings.
    r = kb_entry.body.replace("\r", "")
    parsed = parseString(title=kb_entry.subject, raw=r,
                         wikidb=cls.NOCDB(kb_entry))
    preprocess(parsed)
    xhtml = MWXHTMLWriter()
    xhtml.writeBook(parsed)
    return ET.tostring(xhtml.xmlbody)
def _bulgarian_wikitext(text, section_start, section_end, single_entry):
    """Return the Bulgarian-related lines of *text*, each newline-terminated.

    A line is kept when it lies inside a section opened by a line matching
    *section_start* and not yet closed by a line matching *section_end*, or
    when the line itself matches *single_entry*.  Returns an empty string
    when no line qualifies.
    """
    kept = []
    in_section = False
    for row in text.split('\n'):
        # The start pattern is tested first: the generic end pattern would
        # otherwise also match the "==Bulgarian==" heading itself.
        if section_start.search(row):
            in_section = True
        elif section_end.search(row):
            in_section = False
        if in_section or single_entry.search(row):
            kept.append(row + '\n')
    return ''.join(kept)


def _page_title_and_text(page_xml):
    """Return ``(title, text)`` of a ``<page>`` document, or None if its
    first ``<text>`` element has no child nodes."""
    root = xml.dom.minidom.parseString(page_xml)
    text_node = root.getElementsByTagName("text")[0]
    if not text_node.childNodes:
        return None
    title = root.getElementsByTagName("title")[0].firstChild.data
    return title, text_node.firstChild.data


def parseENwikt(debug=False):
    """Extract Bulgarian entries from the English Wiktionary dump.

    Calls ``wiktionaryGet.getWiktionaries(['en'])`` (presumably to fetch the
    dump -- confirm against that module), then scans
    ``enwiktionary-latest-pages-meta-current.xml.bz2`` line by line.  Every
    ``<page>`` element containing a line that matches ``[bB]ulgarian`` is
    parsed; the lines of its ``==Bulgarian==`` section plus any
    ``* Bulgarian`` lines are rendered to HTML through ``MWXHTMLWriter``.

    Pages whose title contains a character in U+0400-U+052F are stored in
    ``bg_en``; all others in ``en_bg``.  Both dicts map title to HTML with
    newlines removed and are pickled as the tuple ``(bg_en, en_bg)`` into
    ``enWiktBG.pickle.bz2`` using ``pickle.HIGHEST_PROTOCOL``.

    Args:
        debug: when true, print every extracted wikitext and open an
            IPython shell before the entry is stored.  Defaults to False,
            which matches the previously hard-coded behaviour.

    Returns:
        None.  The result is written to ``enWiktBG.pickle.bz2``.
    """
    wiktionaryGet.getWiktionaries(['en'])

    ipshell = None
    if debug:
        # Prefer IPython.Shell.IPShellEmbed; fall back to IPython.embed when
        # that module cannot be imported.
        try:
            from IPython.Shell import IPShellEmbed
            ipshell = IPShellEmbed()
        except ImportError:
            from IPython import embed
            ipshell = embed

    cyrlRE = re.compile(u'[\u0400-\u04FF\u0500-\u052F]', re.UNICODE)
    bulRE = re.compile(r"[bB]ulgarian", re.UNICODE)
    bulgarianSingle = re.compile(r"\* [bB]ulgarian", re.UNICODE)
    bulgarianSectionStart = re.compile(r"^==Bulgarian==$", re.UNICODE)
    bulgarianSectionEnd = re.compile(r"^==[A-Za-z-]+==$", re.UNICODE)

    bg_en = {}
    en_bg = {}
    writer = MWXHTMLWriter()

    keep = False   # current page mentions Bulgarian somewhere
    read = False   # currently between <page> and </page>
    article = []   # raw lines of the page being collected

    # ``with`` guarantees the dump is closed even if parsing fails.
    with bz2.BZ2File("enwiktionary-latest-pages-meta-current.xml.bz2") as fh:
        for line in fh:
            # Compare ignoring surrounding whitespace so the match does not
            # depend on the exact indentation used in the dump.
            tag = line.strip()

            if tag == "</page>":
                read = False
                if not keep:
                    continue
                keep = False
                article.append(line)

                page = _page_title_and_text(''.join(article))
                if page is None:
                    continue
                title, text = page

                wikitext = _bulgarian_wikitext(
                    text, bulgarianSectionStart, bulgarianSectionEnd,
                    bulgarianSingle)
                # Truthiness, not ``is not ""``: identity comparison against
                # a literal only works by accident of string interning.
                if not wikitext:
                    continue

                parsed = parseString(title, wikitext)
                if cyrlRE.search(title):
                    target, label = bg_en, "bg_en"
                else:
                    target, label = en_bg, "en_bg"
                if debug:
                    print(label + " = " + wikitext.encode('utf-8'))
                    ipshell()
                html = ET.tostring(writer.write(parsed), encoding="utf-8",
                                   method="html")
                target[title] = html.replace('\n', '')
                continue

            if tag == "<page>":
                article = ["<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"]
                read = True

            if read:
                if bulRE.search(line):
                    keep = True
                article.append(line)

    with bz2.BZ2File("enWiktBG.pickle.bz2", 'wb') as enWiktBG:
        pickle.dump((bg_en, en_bg), enWiktBG, pickle.HIGHEST_PROTOCOL)