示例#1
0
 def to_html(cls, kb_entry):
     """Render a knowledge-base entry's wiki markup and return it serialized.

     The entry's ``body`` is parsed with ``parseString`` using the entry's
     ``subject`` as the title and ``cls.NOCDB(kb_entry)`` as the wiki
     database, run through ``preprocess``, and written with
     ``MWXHTMLWriter``. The writer's ``xmlbody`` element is serialized with
     ``ET.tostring`` and returned.
     """
     # Carriage returns are removed before the markup reaches the parser.
     wikitext = kb_entry.body.replace("\r", "")
     tree = parseString(
         title=kb_entry.subject,
         raw=wikitext,
         wikidb=cls.NOCDB(kb_entry),
     )
     preprocess(tree)

     writer = MWXHTMLWriter()
     writer.writeBook(tree)
     return ET.tostring(writer.xmlbody)
示例#2
0
def getXHTML(wikitext, title, language):
    """Convert wiki markup to an XHTML string, or return None on empty parse.

    The markup is parsed against a ``DummyDB`` whose
    ``normalize_and_get_page`` attribute is replaced by ``noop``. If the
    parser returns a falsy result, ``None`` is returned. Otherwise the tree
    is passed through ``preprocess`` and ``removeLangLinks`` and written with
    ``MWXHTMLWriter``; the writer's ``asstring()`` result is returned.
    """
    wikidb = DummyDB()
    wikidb.normalize_and_get_page = noop

    tree = parseString(title=title, raw=wikitext, wikidb=wikidb, lang=language)
    if tree:
        preprocess(tree)
        removeLangLinks(tree)
        writer = MWXHTMLWriter()
        writer.writeBook(tree)
        return writer.asstring()
    return None
示例#3
0
 def to_html(cls, kb_entry):
     """Render a knowledge-base entry's wiki markup and return it serialized.

     The entry's ``body`` is parsed with mwlib's ``parseString`` using the
     entry's ``subject`` as the title and ``cls.NOCDB(kb_entry)`` as the wiki
     database, run through ``preprocess``, and written with
     ``MWXHTMLWriter``. The writer's ``xmlbody`` element is serialized with
     ``ET.tostring`` and returned.

     Raises:
         ImportError: if mwlib or both ElementTree variants are unavailable.
     """
     # Dependencies are imported at call time, inside the function.
     from mwlib.uparser import parseString
     from mwlib.xhtmlwriter import MWXHTMLWriter, preprocess
     try:
         import xml.etree.ElementTree as ET
     except ImportError:
         # Fall back to the standalone elementtree package. Only a missing
         # module triggers the fallback; other errors propagate.
         from elementtree import ElementTree as ET

     # Carriage returns are removed before the markup reaches the parser.
     r = kb_entry.body.replace("\r", "")
     parsed = parseString(title=kb_entry.subject,
                          raw=r,
                          wikidb=cls.NOCDB(kb_entry))
     preprocess(parsed)
     xhtml = MWXHTMLWriter()
     xhtml.writeBook(parsed)
     return ET.tostring(xhtml.xmlbody)
示例#4
0
def parseENwikt(debug=False):
    """Extract Bulgarian entries from the English Wiktionary dump and pickle them.

    Calls ``wiktionaryGet.getWiktionaries(['en'])``, then scans
    ``enwiktionary-latest-pages-meta-current.xml.bz2`` line by line. Every
    ``<page>`` element containing a line that matches ``[bB]ulgarian`` is
    parsed as XML. From its wikitext, two kinds of lines are kept:

    * lines inside a ``==Bulgarian==`` section (up to the next ``==...==``
      header), and
    * any line matching ``* [bB]ulgarian``.

    The kept wikitext is parsed with ``parseString``, rendered with
    ``MWXHTMLWriter.write`` and serialized to HTML with newlines removed.
    Pages whose title contains a Cyrillic character are stored in ``bg_en``;
    all others in ``en_bg``. The tuple ``(bg_en, en_bg)`` is finally pickled
    into ``enWiktBG.pickle.bz2``.

    NOTE(review): the string handling (comparing lines read from the bz2
    file against ``str`` literals, splitting ``ET.tostring`` output on
    ``'\\n'``) assumes Python 2 semantics.

    Args:
        debug: when true, print each extracted wikitext and call an IPython
            embedded shell before the page is rendered. Defaults to False,
            which matches the previous hard-coded behaviour.
    """
    wiktionaryGet.getWiktionaries(['en'])

    bg_en = {}
    en_bg = {}

    ipshell = None
    if debug:
        try:
            from IPython.Shell import IPShellEmbed
            ipshell = IPShellEmbed()
        except ImportError:
            # IPython releases without the IPython.Shell module.
            from IPython import embed
            ipshell = embed

    # u'' (not ur'') keeps the \u escapes working on Python 2 while remaining
    # valid syntax on Python 3; the resulting pattern is unchanged.
    cyrlRE = re.compile(u'[\u0400-\u04FF\u0500-\u052F]', re.UNICODE)
    bulRE = re.compile("[bB]ulgarian", re.UNICODE)
    bulgarianSingle = re.compile(r"\* [bB]ulgarian", re.UNICODE)
    bulgarianSectionStart = re.compile("^==Bulgarian==$", re.UNICODE)
    bulgarianSectionEnd = re.compile("^==[A-Za-z-]+==$", re.UNICODE)

    w = MWXHTMLWriter()

    keep = False   # current page contains a line matching bulRE
    read = False   # currently between <page> and </page>
    page_lines = []

    fh = bz2.BZ2File("enwiktionary-latest-pages-meta-current.xml.bz2")
    try:
        for line in fh:
            if line == "  <page>\n":
                # Start a fresh standalone XML document for this page.
                page_lines = ["<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"]
                read = True
            elif line == "  </page>\n":
                read = False
                if keep:
                    keep = False
                    page_lines.append(line)
                    root = xml.dom.minidom.parseString("".join(page_lines))
                    text_elem = root.getElementsByTagName("text")[0]
                    if len(text_elem.childNodes) > 0:
                        title = root.getElementsByTagName(
                            "title")[0].firstChild.data
                        text = text_elem.firstChild.data

                        kept = []
                        in_bulgarian = False
                        for wiki_line in text.split('\n'):
                            # The start test must come first: the section-end
                            # pattern also matches "==Bulgarian==".
                            if bulgarianSectionStart.search(wiki_line):
                                in_bulgarian = True
                            elif bulgarianSectionEnd.search(wiki_line):
                                in_bulgarian = False
                            if in_bulgarian or bulgarianSingle.search(wiki_line):
                                kept.append(wiki_line + '\n')
                        newText = ''.join(kept)

                        # Truthiness instead of `is not ""`: identity against
                        # a literal only works by interning accident.
                        if newText:
                            p = parseString(title, newText)
                            if cyrlRE.search(title):
                                label, target = "bg_en", bg_en
                            else:
                                label, target = "en_bg", en_bg
                            if debug:
                                print(label + " = " + newText.encode('utf-8'))
                                ipshell()
                            html = ET.tostring(w.write(p),
                                               encoding="utf-8",
                                               method="html")
                            target[title] = ''.join(html.split('\n'))
            if read:
                if bulRE.search(line):
                    keep = True
                page_lines.append(line)
    finally:
        fh.close()

    enWiktBG = bz2.BZ2File("enWiktBG.pickle.bz2", 'wb')
    try:
        pickle.dump((bg_en, en_bg), enWiktBG, pickle.HIGHEST_PROTOCOL)
    finally:
        enWiktBG.close()