Пример #1
0
def parseNews(page):
    """Yield ``(prefix, linked_page)`` pairs from the first bullet list on *page*.

    The page is rendered through the MediaWiki ``parse`` API action, the
    returned HTML is wrapped in a minimal ``<html><body>`` document and parsed
    with minidom. Only the first ``<ul>`` element of the rendered page is
    examined.

    For every ``<li>`` the generator yields:

    * ``prefix`` -- the text of the item's leading text node, or ``''`` when
      the item does not start with text. On ``en`` and ``fr`` sites the text
      is rewritten with the module-level ``date_rx`` pattern (into a wiki link
      and a ``{{date}}`` template respectively); on other sites it is yielded
      unchanged.
    * ``linked_page`` -- a ``pywikibot.Page`` built from the ``title``
      attribute of the first ``<a>`` inside the item.

    List items that contain no link, or whose first link has no ``title``
    attribute, are skipped instead of raising.
    """
    site = page.site
    rq = api.Request(site=site, action='parse', format='json', page=page.title())
    data = rq.submit()
    text = data['parse']['text']['*']

    # The fragment is encoded to UTF-8 bytes before it is handed to the parser.
    doc = minidom_parseString((u'<html><body>' + text + u'</body></html>').encode('utf-8'))

    ul = doc.getElementsByTagName('ul')
    if not ul:
        return

    for li in ul[0].getElementsByTagName('li'):
        links = li.getElementsByTagName('a')
        if not links:
            # Nothing to link to (plain-text or empty item): skip it rather
            # than fail with IndexError on links[0].
            continue
        title = links[0].getAttribute('title')
        if not title:
            # getAttribute() returns '' for a missing attribute; an empty
            # string is not a usable page title.
            continue

        # A non-empty `links` guarantees the item has at least one child,
        # so firstChild cannot be None here.
        first = li.firstChild
        if first.nodeType == Node.TEXT_NODE:
            prefix = first.nodeValue
            if site.lang == 'en':
                prefix = date_rx.sub(r'[[\2 \1]]', prefix)
            elif site.lang == 'fr':
                prefix = date_rx.sub(r'{{date|\1|\2|\3}}', prefix)
        else:
            prefix = ''
        yield prefix, pywikibot.Page(site, title)
Пример #2
0
def parseNews(page):
    """Yield ``(prefix, linked_page)`` pairs from the first bullet list on *page*.

    The page link is logged, then the page is rendered by posting a ``parse``
    request to ``/w/api.php``. The returned HTML is wrapped in a minimal
    ``<html><body>`` document and parsed with minidom. Only the first
    ``<ul>`` element of the rendered page is examined.

    For every ``<li>`` the generator yields:

    * ``prefix`` -- the text of the item's leading text node, or ``''`` when
      the item does not start with text. On ``en`` and ``fr`` sites the text
      is rewritten with the module-level ``date_rx`` pattern (into a wiki link
      and a ``{{date}}`` template respectively); on other sites it is yielded
      unchanged.
    * ``linked_page`` -- a ``wikipedia.Page`` built from the ``title``
      attribute of the first ``<a>`` inside the item.

    List items that contain no link, or whose first link has no ``title``
    attribute, are skipped instead of raising.
    """
    wikipedia.output(page.aslink())
    site = page.site()
    # Only the response body is needed; the response object itself is unused.
    _response, data = site.postForm('/w/api.php', {'action':'parse','format':'json','page':page.title()})
    text = simplejson.loads(data)['parse']['text']['*']

    # The fragment is encoded to UTF-8 bytes before it is handed to the parser.
    doc = minidom_parseString((u'<html><body>' + text + u'</body></html>').encode('utf-8'))

    ul = doc.getElementsByTagName('ul')
    if not ul:
        return

    for li in ul[0].getElementsByTagName('li'):
        links = li.getElementsByTagName('a')
        if not links:
            # Nothing to link to (plain-text or empty item): skip it rather
            # than fail with IndexError on links[0].
            continue
        title = links[0].getAttribute('title')
        if not title:
            # getAttribute() returns '' for a missing attribute; an empty
            # string is not a usable page title.
            continue

        # A non-empty `links` guarantees the item has at least one child,
        # so firstChild cannot be None here.
        first = li.firstChild
        if first.nodeType == Node.TEXT_NODE:
            prefix = first.nodeValue
            if site.lang == 'en':
                prefix = date_rx.sub(r'[[\2 \1]]', prefix)
            elif site.lang == 'fr':
                prefix = date_rx.sub(r'{{date|\1|\2|\3}}', prefix)
        else:
            prefix = ''
        yield prefix, wikipedia.Page(site, title)
Пример #3
0
def simplify(data, lang):
    """Simplify the markup fragment *data* for language *lang*.

    The fragment is wrapped in a single ``<body>`` element so that it forms a
    well-formed document, parsed, run through ``Simplifier(...).simplify()``
    (which works on the parsed document in place), and finally serialized
    with ``_serializeBody``, whose result is returned.
    """
    wrapped = '<body>' + data + '</body>'
    document = minidom_parseString(wrapped)
    simplifier = Simplifier(document, lang)
    simplifier.simplify()
    return _serializeBody(document)
Пример #4
0
def parseString(t):
    """Parse the markup fragment *t* and return its enclosing ``<body>`` element.

    The fragment is wrapped in ``<body>...</body>`` so that text or several
    sibling elements still form a document with a single root. The root
    element of the parsed document (the added ``<body>``) is returned.
    """
    wrapped = '<body>' + t + '</body>'
    document = minidom_parseString(wrapped)
    return document.documentElement
Пример #5
0
def simplify(data, lang):
    """Return the simplified, serialized form of the markup fragment *data*.

    *data* is enclosed in a ``<body>`` element to give the parser a single
    root, the resulting document is processed by a ``Simplifier`` created for
    *lang*, and the processed document is passed to ``_serializeBody`` to
    produce the return value.
    """
    markup = '<body>' + data + '</body>'
    tree = minidom_parseString(markup)
    worker = Simplifier(tree, lang)
    worker.simplify()
    return _serializeBody(tree)