def _wikipedia_Page_getEditPage(self, get_redirect=False, throttle=True,
                                sysop=False, oldid=None,
                                nofollow_redirects=False):
    """Gets the source of a wiki page through WikiProxy.

    TODO: finish (use permalink things in
          localhost/~daniel/WikiSense/WikiProxy.php)
    """
    isWatched = False       # well, how are we going to check that?
    editRestriction = None  # toolserver.getEditRestriction etc.
    if not oldid:
        oldid = self.latestRevision()
    data = {
        'wiki': self.site().hostname(),
        'title': self.sectionFreeTitle(),
        'rev': oldid,
    }
    if wikipedia.verbose:
        wikipedia.output(
            u'Getting revision %(rev)i of page %(title)s from %(wiki)s' % data)
    path = 'http://localhost/~daniel/WikiSense/WikiProxy.php'
    f = urllib.urlopen(path, urllib.urlencode(data))
    if (throttle and
            not ('x-wikiproxy' in f.headers and
                 f.headers['x-wikiproxy'] == 'hit')):
        wikipedia.get_throttle()
    return (f.read(), False, None)
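# Usage sketch (an assumption, not shown in the source: in the compat framework
# this helper is typically bound onto wikipedia.Page so that getEditPage() is
# served through the local WikiProxy; the site and page title are illustrative).
def _example_getEditPage_via_proxy():
    wikipedia.Page.getEditPage = _wikipedia_Page_getEditPage
    site = wikipedia.getSite(code=u'en', fam=u'wikipedia')
    page = wikipedia.Page(site, u'Example')
    # The proxy-backed getter above returns (text, False, None).
    text, isWatched, editRestriction = page.getEditPage()
    return text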
def exportPage(self, page):
    response = None
    data = None
    wp = wikipedia.getSite(code=u'en', fam=u'wikipedia')
    address = wp.export_address()
    title = page.sectionFreeTitle().encode(wp.encoding())
    predata = {
        'action': 'submit',
        'pages': title,
        'offset': '1',
    }
    #if True is True:  # Future Loop marker
    while True:
        wikipedia.get_throttle()
        wikipedia.output('\03{lightpurple}>>\03{default} '
                         '\03{lightaqua}Exporting revisions.\03{default}')
        # Now make the actual request to the server
        now = time()
        if wp.hostname() in config.authenticate.keys():
            predata["Content-type"] = "application/x-www-form-urlencoded"
            predata["User-agent"] = wikipedia.useragent
            data = wp.urlEncode(predata)
            response = urllib2.urlopen(
                urllib2.Request(
                    wp.protocol() + '://' + wp.hostname() + address, data))
            data = response.read()
        else:
            response, data = wp.postForm(address, predata)
            data = data.encode(wp.encoding())
        wikipedia.get_throttle.setDelay(time() - now)

        doc = minidom.parseString(data)
        revs = doc.getElementsByTagName('revision')
        revCount = len(revs)
        if revCount > 0:
            lastRev = revs[-1].getElementsByTagName('timestamp')[0]
            timestamp = ''
            for nodes in lastRev.childNodes:
                if nodes.nodeType == Node.TEXT_NODE:
                    timestamp += nodes.data
            wikipedia.output('\03{lightpurple}>>\03{default} '
                             '\03{lightaqua}Got %s revisions up to %s.'
                             '\03{default}' % (revCount, timestamp))
            fileName = 'wpdumps/%s-%s.xml' % (
                title.replace('/', '-'), predata['offset'].replace(':', '-'))
            wikipedia.output('\03{lightpurple}>>\03{default} '
                             '\03{lightblue}Saving to %s.\03{default}'
                             % fileName)
            f = open(fileName, 'w')
            f.write(data)
            f.close()
            predata['offset'] = timestamp
        else:
            wikipedia.output('\03{lightpurple}>>\03{default} '
                             '\03{lightaqua}Returned no revisions, exporting '
                             'for this page is complete.\03{default}')
            break
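# Usage sketch (assumption: exportPage() is a method of this script's bot class
# and the wpdumps/ output directory already exists; the page is illustrative).
def _example_exportPage(bot):
    site = wikipedia.getSite(code=u'en', fam=u'wikipedia')
    page = wikipedia.Page(site, u'Example')
    # Walks Special:Export in chunks, writing each batch of revisions to
    # wpdumps/<title>-<offset>.xml until no more revisions are returned.
    bot.exportPage(page)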
def _oldParseCategory(self, purge=False, startFrom=None):
    """Yields all articles and subcategories that are in this category.

    Set purge to True to instruct MediaWiki not to serve a cached version.
    Set startFrom to a string which is the title of the page to start from.

    Yielded results are tuples in the form (tag, page) where tag is one of
    the constants ARTICLE and SUBCATEGORY, and page is the Page or Category
    object.

    Note that results of this method need not be unique.

    This should not be used outside of this module.
    """
    if self.site().versionnumber() < 4:
        Rtitle = re.compile('title\s?=\s?"([^"]*)"')
    elif self.site().versionnumber() < 8:
        # FIXME seems to parse all links
        Rtitle = re.compile('/\S*(?: title\s?=\s?)?"([^"]*)"')
    else:
        Rtitle = re.compile('<li>(?:<span.*?>)?<a href=".*?"\s?title\s?=\s?"'
                            '([^"]*)"\>\+?[^\<\+]')
    if self.site().versionnumber() < 8:
        Rsubcat = None
        Rimage = None
    else:
        Rsubcat = re.compile(
            'CategoryTreeLabelCategory"\s?href=".+?">(.+?)</a>')
        Rimage = re.compile(
            '<div class\s?=\s?"thumb"\sstyle="[^"]*">'
            '(?:<div style="[^"]*">)?<a href=".*?"'
            '(?:\sclass="image")?\stitle\s?=\s?"([^"]*)"')
    # regular expression matching the "(next 200)" link
    RLinkToNextPage = re.compile('&from=(.*?)" title="')

    if startFrom:
        currentPageOffset = urllib.quote(
            startFrom.encode(self.site().encoding()))
    else:
        currentPageOffset = None
    while True:
        path = self.site().get_address(self.urlname())
        if purge:
            path += '&action=purge'
        if currentPageOffset:
            path += '&from=' + currentPageOffset
            pywikibot.output('Getting [[%s]] starting at %s...'
                             % (self.title(),
                                pywikibot.url2link(currentPageOffset,
                                                   self.site(), self.site())))
        else:
            pywikibot.output('Getting [[%s]]...' % self.title())
        pywikibot.get_throttle()
        txt = self.site().getUrl(path)
        # index where subcategory listing begins
        if self.site().versionnumber() >= 9:
            # These IDs were introduced in 1.9
            if '<div id="mw-subcategories">' in txt:
                ibegin = txt.index('<div id="mw-subcategories">')
            elif '<div id="mw-pages">' in txt:
                ibegin = txt.index('<div id="mw-pages">')
            elif '<div id="mw-category-media">' in txt:
                ibegin = txt.index('<div id="mw-category-media">')
            else:
                # No pages
                return
        else:
            # does not work for cats without text
            ibegin = txt.index('<!-- start content -->')
            # TODO: This parses category text and may think they are
            #       pages in category! Check for versions before 1.9

        # index where article listing ends
        if '<div class="printfooter">' in txt:
            iend = txt.index('<div class="printfooter">')
        elif '<div class="catlinks">' in txt:
            iend = txt.index('<div class="catlinks">')
        else:
            iend = txt.index('<!-- end content -->')
        txt = txt[ibegin:iend]
        for title in Rtitle.findall(txt):
            if title == self.title():
                # This is only a link to "previous 200" or "next 200".
                # Ignore it.
                pass
            # For MediaWiki versions where subcats look like articles
            elif isCatTitle(title, self.site()):
                yield SUBCATEGORY, Category(self.site(), title)
            else:
                yield ARTICLE, pywikibot.Page(self.site(), title)
        if Rsubcat:
            # For MediaWiki versions where subcats look differently
            for titleWithoutNamespace in Rsubcat.findall(txt):
                title = 'Category:%s' % titleWithoutNamespace
                yield SUBCATEGORY, Category(self.site(), title)
        if Rimage:
            # For MediaWiki versions where images work through galleries
            for title in Rimage.findall(txt):
                # In some MediaWiki versions, the titles contain the
                # namespace, but they don't in other (newer) versions. Use
                # the ImagePage's defaultNamespace feature to get everything
                # correctly.
                yield ARTICLE, pywikibot.ImagePage(self.site(), title)
        # try to find a link to the next list page
        matchObj = RLinkToNextPage.search(txt)
        if matchObj:
            currentPageOffset = matchObj.group(1)
        else:
            break
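# Usage sketch (assumption: Category is the class these methods belong to and
# ARTICLE/SUBCATEGORY are the module-level tag constants named in the
# docstring; the category title is illustrative).
def _example_oldParseCategory(site):
    cat = Category(site, u'Category:Example')
    for tag, member in cat._oldParseCategory():
        if tag == SUBCATEGORY:
            pywikibot.output(u'Subcategory: %s' % member.title())
        else:
            pywikibot.output(u'Member: %s' % member.title())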
def _parseCategory(self, purge=False, startFrom=None, sortby=None,
                   sortdir=None, endsort=None):
    """Yields all articles and subcategories that are in this category by API.

    Set startFrom to a string which is the title of the page to start from.

    Yielded results are tuples in the form (tag, page) where tag is one of
    the constants ARTICLE and SUBCATEGORY, and page is the Page or Category
    object.

    Note that results of this method need not be unique.

    This should not be used outside of this module.
    """
    if not self.site().has_api() or self.site().versionnumber() < 11:
        for tag, page in self._oldParseCategory(purge, startFrom):
            yield tag, page
        return

    currentPageOffset = None
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': self.title(),
        'cmprop': ['title', 'ids', 'sortkey', 'timestamp'],
    }
    if self.site().versionnumber() > 16:
        params['cmprop'].append('sortkeyprefix')
    if sortby:
        params['cmsort'] = sortby
    if sortdir:
        params['cmdir'] = sortdir
    while True:
        if pywikibot.config.special_page_limit > 500:
            params['cmlimit'] = 500
        else:
            params['cmlimit'] = pywikibot.config.special_page_limit
        if currentPageOffset:
            params.update(currentPageOffset)
            pywikibot.output('Getting [[%s]] list from %s...'
                             % (self.title(),
                                '%s=%s' % currentPageOffset.popitem()))
        else:
            msg = 'Getting [[%s]] list' % self.title()
            # category sort keys are uppercase
            if startFrom:
                startFrom = startFrom.upper()
                params['cmstartsortkey'] = startFrom
                msg += ' starting at %s' % startFrom
            if endsort:
                endsort = endsort.upper()
                params['cmendsortkey'] = endsort
                msg += ' ending at %s' % endsort
            pywikibot.output(msg + u'...')
        pywikibot.get_throttle()
        data = query.GetData(params, self.site())
        if 'error' in data:
            raise RuntimeError('%s' % data['error'])
        count = 0
        for memb in data['query']['categorymembers']:
            count += 1
            # For MediaWiki versions where subcats look like articles
            if memb['ns'] == 14:
                if 'sortkeyprefix' in memb:
                    sortKeyPrefix = memb['sortkeyprefix']
                else:
                    sortKeyPrefix = None
                yield SUBCATEGORY, Category(self.site(), memb['title'],
                                            sortKey=memb['sortkey'],
                                            sortKeyPrefix=sortKeyPrefix)
            elif memb['ns'] == 6:
                yield ARTICLE, pywikibot.ImagePage(self.site(), memb['title'])
            else:
                page = pywikibot.Page(self.site(), memb['title'],
                                      defaultNamespace=memb['ns'])
                if 'sortkeyprefix' in memb:
                    page.sortkeyprefix = memb['sortkeyprefix']
                else:
                    page.sortkeyprefix = None
                yield ARTICLE, page
            if count >= params['cmlimit']:
                break
        # try to find a link to the next list page
        if 'query-continue' in data and count < params['cmlimit']:
            currentPageOffset = data['query-continue']['categorymembers']
        else:
            break
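# Usage sketch (assumption: cmsort/cmdir take MediaWiki API values such as
# 'sortkey' and 'desc'; the category title and sortkey range are illustrative).
# On pre-API wikis the method transparently falls back to _oldParseCategory().
def _example_parseCategory(site):
    cat = Category(site, u'Category:Example')
    for tag, member in cat._parseCategory(sortby='sortkey', sortdir='desc',
                                          startFrom=u'M', endsort=u'P'):
        if tag == ARTICLE:
            pywikibot.output(member.title())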
def refresh(site, sysop=False, witheditsonly=True):
    #if not site.has_api() or site.versionnumber() < 10:
    #    _refreshOld(site)

    # get botlist special page's URL
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)

    params = {
        'action': 'query',
        'list': 'allusers',
        'augroup': 'bot',
    }
    if witheditsonly:
        params['auwitheditsonly'] = ''

    pywikibot.output(u'Retrieving bot user list for %s via API.' % repr(site))
    botlist = []
    while True:
        pywikibot.get_throttle()
        data = pywikibot.query.GetData(params, site, sysop=sysop)
        if 'error' in data:
            raise RuntimeError('ERROR: %s' % data)
        botlist.extend([w['name'] for w in data['query']['allusers']])

        if 'query-continue' in data:
            params.update(data['query-continue']['allusers'])
        else:
            break

    pywikibot.output(u'Retrieving global bot user list for %s.' % repr(site))
    m1 = True
    offset = ''
    if site.live_version()[1] >= 18:
        PATTERN = u'<li><a.*?>(.*?)</.*?> *\((.*?),\s(.*?)\)(?:.*?)</li>'
    elif site.live_version()[1] == 17:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)(?:.*?)</li>'
    else:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)</li>'
    while m1:
        pywikibot.get_throttle()
        text = site.getUrl(
            site.globalusers_address(offset=urllib.quote(offset),
                                     group='Global_bot'))
        m1 = re.findall(u'<li>.*?</li>', text)
        for item in m1:
            m2 = re.search(PATTERN, item)
            (bot, flag_local, flag_global) = m2.groups()
            flag_local = (flag_local[:2] == u'<a')
            flag_global = True  # since group='Global_bot'

            if bot not in botlist:
                botlist.append(bot)
        #print len(botlist)
        offset = bot.encode(site.encoding())

    # Save the botlist to disk
    # The file is stored in the botlists subdir. Create if necessary.
    if sysop:
        f = open(pywikibot.config.datafilepath(
            'botlists',
            'botlist-%s-%s-sysop.dat' % (site.family.name, site.lang)), 'w')
    else:
        f = open(pywikibot.config.datafilepath(
            'botlists',
            'botlist-%s-%s.dat' % (site.family.name, site.lang)), 'w')
    pickle.dump(botlist, f)
    f.close()
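# Usage sketch (assumption: site is a Site object, e.g. from
# pywikibot.getSite()). refresh() pickles the collected bot names; the same
# datafilepath() call used above locates the file again when loading it back.
def _example_refresh_and_load(site):
    refresh(site)
    f = open(pywikibot.config.datafilepath(
        'botlists',
        'botlist-%s-%s.dat' % (site.family.name, site.lang)), 'r')
    try:
        botlist = pickle.load(f)
    finally:
        f.close()
    return botlist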
def _parseCategory(self, purge=False, startFrom=None, sortby=None,
                   sortdir=None):
    """Yields all articles and subcategories that are in this category by API.

    Set startFrom to a string which is the title of the page to start from.

    Yielded results are tuples in the form (tag, page) where tag is one of
    the constants ARTICLE and SUBCATEGORY, and page is the Page or Category
    object.

    Note that results of this method need not be unique.

    This should not be used outside of this module.
    """
    if not self.site().has_api() or self.site().versionnumber() < 11:
        for tag, page in self._oldParseCategory(purge, startFrom):
            yield tag, page
        return

    currentPageOffset = None
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': self.title(),
        'cmprop': ['title', 'ids', 'sortkey', 'timestamp'],
    }
    if sortby:
        params['cmsort'] = sortby
    if sortdir:
        params['cmdir'] = sortdir
    while True:
        if wikipedia.config.special_page_limit > 500:
            params['cmlimit'] = 500
        else:
            params['cmlimit'] = wikipedia.config.special_page_limit
        if currentPageOffset:
            params.update(currentPageOffset)
            wikipedia.output('Getting [[%s]] list from %s...'
                             % (self.title(),
                                '%s=%s' % currentPageOffset.popitem()))
        elif startFrom:
            startFrom = startFrom.upper()  # category sort keys are uppercase
            params['cmstartsortkey'] = startFrom
            wikipedia.output('Getting [[%s]] list starting at %s...'
                             % (self.title(), startFrom))
        else:
            wikipedia.output('Getting [[%s]]...' % self.title())
        wikipedia.get_throttle()
        data = query.GetData(params, self.site())
        if 'error' in data:
            raise RuntimeError('%s' % data['error'])
        count = 0
        for memb in data['query']['categorymembers']:
            count += 1
            # For MediaWiki versions where subcats look like articles
            if memb['ns'] == 14:
                yield SUBCATEGORY, Category(self.site(), memb['title'],
                                            sortKey=memb['sortkey'])
            elif memb['ns'] == 6:
                yield ARTICLE, wikipedia.ImagePage(self.site(), memb['title'])
            else:
                yield ARTICLE, wikipedia.Page(self.site(), memb['title'],
                                              defaultNamespace=memb['ns'])
            if count >= params['cmlimit']:
                break
        # try to find a link to the next list page
        if 'query-continue' in data and count < params['cmlimit']:
            currentPageOffset = data['query-continue']['categorymembers']
        else:
            break
def _parseCategory(self, purge=False, startFrom=None):
    """Yields all articles and subcategories that are in this category by API.

    Set startFrom to a string which is the title of the page to start from.

    Yielded results are tuples in the form (tag, page) where tag is one of
    the constants ARTICLE and SUBCATEGORY, and page is the Page or Category
    object.

    Note that results of this method need not be unique.

    This should not be used outside of this module.
    """
    if not self.site().has_api() or self.site().versionnumber() < 11:
        for tag, page in self._oldParseCategory(purge, startFrom):
            yield tag, page
        return

    currentPageOffset = None
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': self.title(),
        'cmprop': ['title', 'ids', 'sortkey', 'timestamp'],
    }
    while True:
        if wikipedia.config.special_page_limit > 500:
            params['cmlimit'] = 500
        else:
            params['cmlimit'] = wikipedia.config.special_page_limit
        if currentPageOffset:
            params['cmcontinue'] = currentPageOffset
            # cmcontinue's last character is '|'
            wikipedia.output('Getting [[%s]] list from %s...'
                             % (self.title(), currentPageOffset[:-1]))
        elif startFrom:
            params['cmstartsortkey'] = startFrom
            wikipedia.output('Getting [[%s]] list starting at %s...'
                             % (self.title(), startFrom))
        else:
            wikipedia.output('Getting [[%s]]...' % self.title())
        wikipedia.get_throttle()
        data = query.GetData(params, self.site())
        if 'error' in data:
            raise RuntimeError('%s' % data['error'])
        count = 0
        for memb in data['query']['categorymembers']:
            count += 1
            # For MediaWiki versions where subcats look like articles
            if memb['ns'] == 14:
                yield SUBCATEGORY, Category(self.site(), memb['title'],
                                            sortKey=memb['sortkey'])
            elif memb['ns'] == 6:
                yield ARTICLE, wikipedia.ImagePage(self.site(), memb['title'])
            else:
                yield ARTICLE, wikipedia.Page(self.site(), memb['title'],
                                              defaultNamespace=memb['ns'])
            if count >= params['cmlimit']:
                break
        # try to find a link to the next list page
        if 'query-continue' in data and count < params['cmlimit']:
            currentPageOffset = (
                data['query-continue']['categorymembers']['cmcontinue'])
        else:
            break