def workon(page):
    mysite = wikipedia.getSite()
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        wikipedia.output(u'%s is a redirect page. Skipping' % page.aslink())
        return
    except wikipedia.NoPage:
        wikipedia.output(u'%s does not exist. Skipping' % page.aslink())
        return
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        wikipedia.getall(mysite, links)
    else:
        wikipedia.output('Nothing left to do.')
        return
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (wikipedia.Error, wikipedia.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = wikipedia.translate(mysite, msg)
        wikipedia.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (wikipedia.Error):
            wikipedia.output('Error: unable to put %s' % page.aslink())
def oneQuery(self):
    """Perform one step in the solution process"""
    # First find the best language to work on
    code = self.selectQueryCode()
    if code == None:
        print "NOTE: Nothing left to do"
        return False
    # Now assemble a reasonable list of pages to get
    group = []
    plgroup = []
    for subj in self.subjects:
        # Promise the subject that we will work on the code language.
        # We will get a list of pages we can do.
        x = subj.willWorkOn(code)
        if x:
            plgroup.extend(x)
            group.append(subj)
            if len(plgroup) >= globalvar.maxquerysize:
                break
    if len(plgroup) == 0:
        print "NOTE: Nothing left to do 2"
        return False
    # Get the content of the assembled list in one blow
    try:
        wikipedia.getall(code, plgroup)
    except wikipedia.SaxError:
        # Ignore this error, and get the pages the traditional way.
        pass
    # Tell all of the subjects that the promised work is done
    for subj in group:
        subj.workDone(self)
    return True
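# Hedged sketch, not part of interwiki.py: what "the traditional way" in the
# comment above amounts to. If the bulk wikipedia.getall() fetch fails, each
# page is simply fetched on demand with its own request via page.get().
# The helper name fetchOneByOne is illustrative only.
def fetchOneByOne(pages):
    for page in pages:
        try:
            page.get()
        except wikipedia.Error:
            pass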
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping'
                         % page.title(asLink=True))
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping'
                         % page.title(asLink=True))
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        pywikibot.getall(mysite, links)
    else:
        pywikibot.output('Nothing left to do.')
        return
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = i18n.twtranslate(mysite, 'fixing_redirects-fixing')
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.output('Error: unable to put %s'
                             % page.title(asLink=True))
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping'
                         % page.title(asLink=True))
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping'
                         % page.title(asLink=True))
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        pywikibot.getall(mysite, links)
    else:
        pywikibot.output('Nothing left to do.')
        return
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = pywikibot.translate(mysite, msg)
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.output('Error: unable to put %s'
                             % page.title(asLink=True))
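# Heavily simplified illustration, NOT the script's actual treat(): the real
# helper (defined elsewhere in fixing_redirects.py) rewrites wikilinks in
# 'text' so that links pointing at the redirect 'linkedPage' point at
# 'targetPage' instead, keeping the visible label intact. The name
# treat_sketch is hypothetical.
def treat_sketch(text, linkedPage, targetPage):
    old = '[[%s]]' % linkedPage.title()
    new = '[[%s|%s]]' % (targetPage.title(), linkedPage.title())
    return text.replace(old, new)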
def main():
    global mysite, linktrail, page
    start = []
    for arg in wikipedia.handleArgs():
        start.append(arg)
    if start:
        start = " ".join(start)
    else:
        start = "!"
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()
    try:
        generator = pagegenerators.CategorizedPageGenerator(
            mysite.disambcategory(), start=start)
    except wikipedia.NoPage:
        print "The bot does not know the disambiguation category for your wiki."
        raise
    # only work on articles
    generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
    generator = pagegenerators.PreloadingGenerator(generator)
    pagestodo = []
    pagestoload = []
    for page in generator:
        if page.isRedirectPage():
            continue
        linked = page.linkedPages()
        pagestodo.append((page, linked))
        pagestoload += linked
        if len(pagestoload) > 49:
            wikipedia.getall(mysite, pagestoload)
            for page, links in pagestodo:
                workon(page, links)
            pagestoload = []
            pagestodo = []
def testSite(site):
    try:
        wikipedia.getall(site, [wikipedia.Page(site, 'Any page name')])
    except KeyboardInterrupt:
        raise
    except wikipedia.NoSuchSite:
        wikipedia.output(u'No such language %s' % site.lang)
    except:
        wikipedia.output(u'Error processing language %s' % site.lang)
        wikipedia.output(u''.join(traceback.format_exception(*sys.exc_info())))
def testSite(site):
    try:
        pywikibot.getall(site, [pywikibot.Page(site, 'Any page name')])
    except KeyboardInterrupt:
        raise
    except pywikibot.NoSuchSite:
        pywikibot.output(u'No such language %s' % site.lang)
    except:
        pywikibot.output(u'Error processing language %s' % site.lang)
        pywikibot.output(u''.join(traceback.format_exception(*sys.exc_info())))
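# Hedged usage sketch, not part of the original script: probe every language
# of the current site's family and let testSite() report per-site failures.
# Assumes the compat pywikibot API; the function name and loop details are
# illustrative only.
def testAllLanguages():
    fam = pywikibot.getSite().family
    for lang in fam.langs.keys():
        testSite(pywikibot.getSite(lang, fam))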
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping' % page)
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping' % page)
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links) > 0:
        pywikibot.getall(mysite, links)
    else:
        pywikibot.output('Nothing left to do.')
        return
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except pywikibot.NoPage:
            gen = mysite.logpages(number=1, mode='move', title=page2.title(),
                                  dump=True)
            try:
                lastmove = gen.next()['move']
            except StopIteration:
                continue
            target = pywikibot.Page(mysite, lastmove['new_title'])
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        # no fix to user namespaces
        if target.namespace() in [0, 1] and not page2.namespace() in [0, 1]:
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = i18n.twtranslate(mysite, 'fixing_redirects-fixing')
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.output('Error: unable to put %s' % page)
def workon(page):
    mysite = pywikibot.getSite()
    try:
        text = page.get()
    except pywikibot.IsRedirectPage:
        pywikibot.output(u'%s is a redirect page. Skipping' % page)
        return
    except pywikibot.NoPage:
        pywikibot.output(u'%s does not exist. Skipping' % page)
        return
    pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    if len(links):
        pywikibot.getall(mysite, links)
    else:
        pywikibot.output('Nothing left to do.')
        return
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except pywikibot.NoPage:
            try:
                target = page2.getMovedTarget()
            except pywikibot.NoPage:
                continue
        except (pywikibot.Error, pywikibot.SectionError):
            continue
        # no fix to user namespaces
        if target.namespace() in [0, 1] and not page2.namespace() in [0, 1]:
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = i18n.twtranslate(mysite, 'fixing_redirects-fixing')
        pywikibot.showDiff(page.get(), text)
        try:
            page.put(text, comment)
        except (pywikibot.Error):
            pywikibot.error('unable to put %s' % page)
def preload(self, page_list, retry=False):
    try:
        while len(page_list) > 0:
            # It might be that the pages are on different sites,
            # e.g. because the -interwiki parameter was used.
            # Query the sites one by one.
            site = page_list[0].site()
            pagesThisSite = [page for page in page_list
                             if page.site() == site]
            page_list = [page for page in page_list
                         if page.site() != site]
            pywikibot.getall(site, pagesThisSite)
            for page in pagesThisSite:
                yield page
    except IndexError:
        # Can happen if the pages list is empty. Don't care.
        pass
    except pywikibot.SaxError:
        if not retry:
            # Retry once.
            self.preload(page_list, retry=True)
        # Ignore this error, and get the pages the traditional way later.
        pass
def workon(page):
    mysite = wikipedia.getSite()
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        return
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    links = page.linkedPages()
    wikipedia.getall(mysite, links)
    for page2 in links:
        try:
            target = page2.getRedirectTarget()
        except (wikipedia.Error, wikipedia.SectionError):
            continue
        text = treat(text, page2, target)
    if text != page.get():
        comment = wikipedia.translate(mysite, msg)
        try:
            page.put(text, comment)
        except (wikipedia.Error):
            wikipedia.output('Error : unable to put %s' % page.aslink())
def preload(self, page_list, retry=False):
    try:
        while len(page_list) > 0:
            # It might be that the pages are on different sites,
            # e.g. because the -interwiki parameter was used.
            # Query the sites one by one.
            site = page_list[0].site()
            pagesThisSite = [page for page in page_list
                             if page.site() == site]
            page_list = [page for page in page_list
                         if page.site() != site]
            wikipedia.getall(site, pagesThisSite)
            for page in pagesThisSite:
                yield page
    except IndexError:
        # Can happen if the pages list is empty. Don't care.
        pass
    except wikipedia.SaxError:
        if not retry:
            # Retry once.
            self.preload(page_list, retry=True)
        # Ignore this error, and get the pages the traditional way later.
        pass
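# Hypothetical usage sketch, not from the original module: wrap any page
# generator in PreloadingGenerator so pages arrive with their text already
# fetched in batches via getall(). The generator choice below is illustrative.
gen = pagegenerators.AllpagesPageGenerator(start='!', namespace=0)
for page in pagegenerators.PreloadingGenerator(gen):
    text = page.get()  # served from the preloaded cache, no extra request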
def main():
    automatic = False
    namespaces = []
    msg = {
        'ar': u'إزالة الوصلات إلى موقع سبام %s',
        'de': u'Entferne in Spam-Blacklist eingetragenen Weblink auf %s',
        'en': u'Removing links to spamming site %s',
        'es': u'Removiendo enlaces a sitio publicitario %s',
        'fa': u'حذف پیوند به وبگاه هرزنگاری %s',
        'he': u'מסיר קישורים לאתר ספאם %s',
        'fr': u'Suppression du lien blacklisté %s',
        'it': u'Rimuovo link contenuto nella Spam-Blacklist %s',
        'ja': u'ロボットによる: 迷惑リンク削除 %s',
        'nl': u'Links naar gespamde site: %s verwijderd',
        'pt': u'Removendo links de spam do site %s',
        'ta': u'எரிதமாக இணைக்கப்பட்ட %s இணையத்தளம் நீக்கப்பட்டது',
        'vi': u'xóa các liên kết đến website spam %s',
        'zh': u'機器人: 移除廣告黑名單連結 %s',
    }
    spamSite = ''
    for arg in pywikibot.handleArgs():
        if arg.startswith("-automatic"):
            automatic = True
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[len('-namespace:'):]))
            except ValueError:
                namespaces.append(arg[len('-namespace:'):])
        else:
            spamSite = arg
    if not automatic:
        pywikibot.put_throttle.setDelay(1)
    if not spamSite:
        pywikibot.showHelp('spamremove')
        pywikibot.output(u"No spam site specified.")
        sys.exit()
    mysite = pywikibot.getSite()
    pages = list(set(mysite.linksearch(spamSite)))
    if namespaces:
        pages = list(set(pagegenerators.NamespaceFilterPageGenerator(
            pages, namespaces)))
    if len(pages) == 0:
        pywikibot.output('No page found.')
    else:
        pywikibot.getall(mysite, pages)
        for p in pages:
            text = p.get()
            if spamSite not in text:
                continue
            # Show the title of the page we're working on.
            # Highlight the title in purple.
            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                             % p.title())
            lines = text.split('\n')
            newpage = []
            lastok = ""
            for line in lines:
                if spamSite in line:
                    if lastok:
                        pywikibot.output(lastok)
                    pywikibot.output('\03{lightred}%s\03{default}' % line)
                    lastok = None
                else:
                    newpage.append(line)
                    if line.strip():
                        if lastok is None:
                            pywikibot.output(line)
                        lastok = line
            if automatic:
                answer = "y"
            else:
                answer = pywikibot.inputChoice(u'\nDelete the red lines?',
                                               ['yes', 'no', 'edit'],
                                               ['y', 'N', 'e'], 'n')
            if answer == "n":
                continue
            elif answer == "e":
                editor = editarticle.TextEditor()
                newtext = editor.edit(text, highlight=spamSite,
                                      jumpIndex=text.find(spamSite))
            else:
                newtext = "\n".join(newpage)
            if newtext != text:
                p.put(newtext, pywikibot.translate(mysite, msg) % spamSite)
        exclude(line, real_exclude=False)
        pl = pywikibot.Page(mysite, line)
        checked[pl] = pl
    f.close()
    excludefile = codecs.open(filename, 'a', encoding=mysite.encoding())
except IOError:
    # File does not exist
    excludefile = codecs.open(filename, 'w', encoding=mysite.encoding())
try:
    parentcats = workingcat.categories()
except pywikibot.Error:
    parentcats = []
# Do not include articles already in subcats; only checking direct subcats
subcatlist = workingcat.subcategoriesList()
if subcatlist:
    pywikibot.getall(mysite, subcatlist)
    for cat in subcatlist:
        list = cat.articlesList()
        for page in list:
            exclude(page.title(), real_exclude=False)
            checked[page] = page
list = workingcat.articlesList()
if list:
    for pl in list:
        checked[pl] = pl
    pywikibot.getall(mysite, list)
    for pl in list:
        include(pl)
else:
    pywikibot.output(
        u"Category %s does not exist or is empty. Which page to start with?"
        % workingcatname)
        exclude(line, real_exclude=False)
        pl = wikipedia.Page(mysite, line)
        checked[pl] = pl
    f.close()
    excludefile = codecs.open(filename, 'a', encoding=mysite.encoding())
except IOError:
    # File does not exist
    excludefile = codecs.open(filename, 'w', encoding=mysite.encoding())
try:
    parentcats = workingcat.categories()
except wikipedia.Error:
    parentcats = []
# Do not include articles already in subcats; only checking direct subcats
subcatlist = workingcat.subcategoriesList()
if subcatlist:
    wikipedia.getall(mysite, subcatlist)
    for cat in subcatlist:
        list = cat.articlesList()
        for page in list:
            exclude(page.title(), real_exclude=False)
            checked[page] = page
list = workingcat.articlesList()
if list:
    for pl in list:
        checked[pl] = pl
    wikipedia.getall(mysite, list)
    for pl in list:
        include(pl)
else:
    wikipedia.output(
        u"Category %s does not exist or is empty. Which page to start with?"
        % workingcatname)
    answer = wikipedia.input(u"(Default is [[%s]]):" % workingcatname)
    else:
        start = "!"
    mysite = wikipedia.getSite()
    linktrail = mysite.linktrail()
    try:
        generator = pagegenerators.CategorizedPageGenerator(
            mysite.disambcategory(), start=start)
    except wikipedia.NoPage:
        print "The bot does not know the disambiguation category for your wiki."
        raise
    # only work on articles
    generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
    generator = pagegenerators.PreloadingGenerator(generator)
    pagestodo = []
    pagestoload = []
    for page in generator:
        if page.isRedirectPage():
            continue
        linked = page.linkedPages()
        pagestodo.append((page, linked))
        pagestoload += linked
        if len(pagestoload) > 49:
            wikipedia.getall(mysite, pagestoload)
            for page, links in pagestodo:
                workon(page, links)
            pagestoload = []
            pagestodo = []
finally:
    wikipedia.stopme()
            i += 1
            if i == 480:
                break
        start = todo[len(todo) - 1].title() + '_0'
    # todo is a list of pages to do, donow are the pages we will be doing in
    # this run.
    if len(todo) > 60:
        # Take the first 60.
        donow = todo[0:60]
        todo = todo[60:]
    else:
        donow = todo
        # If there was more to do, the 'if len(todo)<61' part would have
        # extended todo beyond this size.
        cont = False
    try:
        wikipedia.getall(mysite, donow)
    except wikipedia.SaxError:
        # Ignore this error, and get the pages the traditional way.
        pass
    checked += len(donow)
    for pl in donow:
        R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')
        try:
            for url in R.findall(pl.get()):
                url = wikipedia.unicode2html(url, 'ascii')
                try:
                    error = URLerrorFinder().open(url)
                except IOError:
                    error = -1
                if error in allowederrorcodes:
                    working += 1
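# Illustrative check, not in the original script, of what the URL regex above
# matches: the final character class deliberately keeps trailing punctuation
# such as '.' or ',' out of the captured link.
import re
R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')
print R.findall(u'See http://example.org/page, or http://example.org/x.')
# -> [u'http://example.org/page', u'http://example.org/x']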
def treat(self, page):
    """
    Find links, check whether they exist on Wikibooks, then check whether
    they exist on Wikipedia; if not, remove the link entirely. Also remove
    all "citation needed" tags.
    """
    text = self.load(page)
    newText = text
    linksFoundInPage = []
    wikibooksPages = []
    wikipediaPages = []
    linksOnWikipedia = []
    redlinks = []

    def linkName(link):
        link = link.strip('[')
        link = link.strip(']')
        if link.find("|") != -1:
            return link[link.find("|") + 1:]
        else:
            return None

    def linkURL(link):
        link = link.strip('[')
        link = link.strip(']')
        if link.find("|") != -1:
            return link[:link.find("|")]
        else:
            return link

    # Matches text between "[[" and "]]"
    linkRegex = re.compile("\[\[.*?]]")
    linksFoundInPage = linkRegex.findall(text)

    # Remove items that aren't links
    cleanLinks = []
    for link in linksFoundInPage:
        if link.find("#") != -1:
            continue
        elif link.find("Image:") != -1:
            continue
        elif link.find("File:") != -1:
            continue
        else:
            cleanLinks.append(link)
    linksFoundInPage = cleanLinks

    pregen = pagegenerators.PreloadingGenerator(self.generator)

    # Download wikibooksPages
    for link in linksFoundInPage:
        wikibooksPages.append(pywikibot.Page(page.site(), linkURL(link)))
    pywikibot.getall(page.site(), wikibooksPages)

    # Download wikipediaPages
    wikipediaSite = pywikibot.getSite(page.site().language(), 'wikipedia')
    for link in linksFoundInPage:
        wikipediaPages.append(pywikibot.Page(wikipediaSite, linkURL(link)))
    pywikibot.getall(wikipediaSite, wikipediaPages)

    # sort links, sending to linksOnWikibooks, linksOnWikipedia, or redlinks
    i = 0
    for link in linksFoundInPage:
        if wikibooksPages[i].exists():
            print "Page \"" + wikibooksPages[i].title() + "\" exists on wikibooks."
            # no need to keep a list of links that exist on wikibooks
        else:
            # check on wikipedia
            if wikipediaPages[i].exists():
                print "Page \"" + wikipediaPages[i].title() + "\" exists on wikipedia."
                linksOnWikipedia.append(linksFoundInPage[i])
            else:
                print "Could not find page \"" + wikibooksPages[i].title() + "\" removing."
                redlinks.append(linksFoundInPage[i])
        i += 1

    # remove redlinks, and change wikipedia links to use w:
    for link in linksOnWikipedia:
        if linkName(link) == None:
            print linkURL(link)
            newLink = "[[w:" + linkURL(link) + "|" + linkURL(link) + "]]"
            newText = newText.replace(link, newLink)
        else:
            newText = newText.replace(
                link, "[[w:" + linkURL(link) + "|" + linkName(link) + "]]")
            print "-" + linkName(link)
    for link in redlinks:
        if linkName(link) == None:
            newText = newText.replace(link, linkURL(link))
        else:
            newText = newText.replace(link, linkName(link))
    text = newText

    # Finished
    if not self.save(text, page, self.summary):
        pywikibot.output(u'Page %s not saved.' % page.title(asLink=True))