def parseNews(page):
    """Yield ``(prefix, target_page)`` pairs for the news items on *page*.

    The page is rendered with an ``api.Request`` using ``action='parse'``
    and the returned HTML fragment is parsed as XML.  Only ``<li>``
    elements found under the first ``<ul>`` of the rendered output are
    considered; if there is no ``<ul>`` nothing is yielded.

    For each item:

    * *prefix* is the value of the item's leading text node, or ``''``
      when the item does not start with a text node.  When ``site.lang``
      is ``'en'`` or ``'fr'`` the prefix is rewritten with ``date_rx``.
    * *target_page* is a ``pywikibot.Page`` built from the ``title``
      attribute of the first ``<a>`` element inside the item.

    Items that contain no ``<a>`` element (including empty items) are
    skipped instead of raising ``IndexError`` / ``AttributeError``.
    """
    site = page.site
    request = api.Request(site=site, action='parse', format='json',
                          page=page.title())
    data = request.submit()
    text = data['parse']['text']['*']

    # The rendered text is a fragment: wrap it so the document has a single
    # root element, and hand the parser UTF-8 encoded bytes.
    doc = minidom_parseString(
        (u'<html><body>' + text + u'</body></html>').encode('utf-8'))

    lists = doc.getElementsByTagName('ul')
    if not lists:
        return

    for li in lists[0].getElementsByTagName('li'):
        links = li.getElementsByTagName('a')
        if not links:
            # No link means no page to point at (plain-text or empty item).
            continue

        # An item that contains a link always has at least one child node,
        # so firstChild cannot be None here.
        first = li.firstChild
        if first.nodeType == Node.TEXT_NODE:
            prefix = first.nodeValue
            if site.lang == 'en':
                prefix = date_rx.sub(r'[[\2 \1]]', prefix)
            elif site.lang == 'fr':
                prefix = date_rx.sub(r'{{date|\1|\2|\3}}', prefix)
        else:
            prefix = ''

        yield prefix, pywikibot.Page(site, links[0].getAttribute('title'))
def parseNews(page):
    """Yield ``(prefix, target_page)`` pairs for the news items on *page*.

    The page link is written with ``wikipedia.output``, then the page is
    rendered by posting ``action=parse`` to ``/w/api.php`` on the page's
    site.  The JSON response is decoded with ``simplejson`` and the
    rendered HTML fragment is parsed as XML.  Only ``<li>`` elements found
    under the first ``<ul>`` of the rendered output are considered; if
    there is no ``<ul>`` nothing is yielded.

    For each item:

    * *prefix* is the value of the item's leading text node, or ``''``
      when the item does not start with a text node.  When ``site.lang``
      is ``'en'`` or ``'fr'`` the prefix is rewritten with ``date_rx``.
    * *target_page* is a ``wikipedia.Page`` built from the ``title``
      attribute of the first ``<a>`` element inside the item.

    Items that contain no ``<a>`` element (including empty items) are
    skipped instead of raising ``IndexError`` / ``AttributeError``.
    """
    wikipedia.output(page.aslink())
    site = page.site()
    # postForm returns a (response, body) pair; only the body is needed.
    _response, data = site.postForm(
        '/w/api.php',
        {'action': 'parse', 'format': 'json', 'page': page.title()})
    text = simplejson.loads(data)['parse']['text']['*']

    # The rendered text is a fragment: wrap it so the document has a single
    # root element, and hand the parser UTF-8 encoded bytes.
    doc = minidom_parseString(
        (u'<html><body>' + text + u'</body></html>').encode('utf-8'))

    lists = doc.getElementsByTagName('ul')
    if not lists:
        return

    for li in lists[0].getElementsByTagName('li'):
        links = li.getElementsByTagName('a')
        if not links:
            # No link means no page to point at (plain-text or empty item).
            continue

        # An item that contains a link always has at least one child node,
        # so firstChild cannot be None here.
        first = li.firstChild
        if first.nodeType == Node.TEXT_NODE:
            prefix = first.nodeValue
            if site.lang == 'en':
                prefix = date_rx.sub(r'[[\2 \1]]', prefix)
            elif site.lang == 'fr':
                prefix = date_rx.sub(r'{{date|\1|\2|\3}}', prefix)
        else:
            prefix = ''

        yield prefix, wikipedia.Page(site, links[0].getAttribute('title'))
def simplify(data, lang):
    """Return the serialized body of *data* after running ``Simplifier``.

    *data* is wrapped in a ``<body>`` element and parsed into a DOM
    document.  A ``Simplifier`` built from that document and *lang* then
    has its ``simplify()`` method called, and the document is passed to
    ``_serializeBody`` to produce the return value.
    """
    markup = '<body>' + data + '</body>'
    tree = minidom_parseString(markup)
    simplifier = Simplifier(tree, lang)
    simplifier.simplify()
    return _serializeBody(tree)
def parseString(t):
    """Parse the markup fragment *t* and return its wrapping root element.

    The fragment is enclosed in a ``<body>`` element before parsing so the
    document has a single root; that ``<body>`` element (the document
    element of the parsed document) is what gets returned.
    """
    wrapped = '<body>' + t + '</body>'
    document = minidom_parseString(wrapped)
    return document.documentElement
def simplify(data, lang):
    """Return the serialized body of *data* after running ``Simplifier``.

    *data* is wrapped in a ``<body>`` element and parsed into a DOM
    document.  A ``Simplifier`` built from that document and *lang* then
    has its ``simplify()`` method called, and the document is passed to
    ``_serializeBody`` to produce the return value.
    """
    markup = '<body>' + data + '</body>'
    tree = minidom_parseString(markup)
    simplifier = Simplifier(tree, lang)
    simplifier.simplify()
    return _serializeBody(tree)