def crawlerAll(start):
    gen = pagegenerators.AllpagesPageGenerator(start, namespace=0,
                                               includeredirects=False)
    # Preload page texts in batches of 100 to cut down on API round-trips.
    for page in pagegenerators.PreloadingGenerator(gen, 100):
        #print(page.title().encode(config.console_encoding, 'replace'))
        modification(page.title())
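# The crawler above assumes a modification() callback defined elsewhere in
# the script. A minimal hypothetical stub, only to make the sketch
# self-contained; the real script would apply its edits here instead:
def modification(title):
    page = wikipedia.Page(wikipedia.getSite(), title)
    if page.exists() and not page.isRedirectPage():
        # Placeholder behaviour: just report the page size.
        wikipedia.output(u'%s: %d bytes' % (title, len(page.get())))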
def main():
    summary_commandline, gen, template = None, None, None
    namespaces, PageTitles, exceptions = [], [], []
    encat = ''
    autoText, autoTitle = False, False
    recentcat, newcat = False, False
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        if arg == '-autotitle':
            autoTitle = True
        elif arg == '-autotext':
            autoText = True
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-start'):
            firstPageTitle = arg[7:]
            if not firstPageTitle:
                firstPageTitle = wikipedia.input(
                    u'At which page do you want to start?')
            # fasite (the target Site object) is defined elsewhere in the
            # original script.
            firstPageTitle = wikipedia.Page(
                fasite, firstPageTitle).title(withNamespace=False)
            gen = pagegenerators.AllpagesPageGenerator(
                firstPageTitle, 0, includeredirects=True)
        elif arg.startswith('-template:'):
            template = arg[10:]
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
            summary_commandline = True
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
    if not gen:
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
        preloadingGen = pagegenerators.NamespaceFilterPageGenerator(
            gen, namespaces)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    _cache, last_timestamp = get_cache()
    add_text(preloadingGen)
    # Coarse "days since 2000" stamp (30-day months, 365-day years); it is
    # only compared against itself for cache expiry, so the approximation
    # is good enough.
    now = str(datetime.now())
    todaynum = (int(now.split('-')[2].split(' ')[0]) +
                int(now.split('-')[1]) * 30 +
                (int(now.split('-')[0]) - 2000) * 365)
    if last_timestamp + 3 < todaynum:
        put_cache(_cache, todaynum)
    else:
        put_cache({}, 0)
def main():
    start = '!'
    featured = False
    namespace = None
    gen = None
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        if arg == '-featured':
            featured = True
        elif arg.startswith('-namespace'):
            if len(arg) == 10:
                namespace = int(wikipedia.input(
                    u'Which namespace should be processed?'))
            else:
                namespace = int(arg[11:])
        else:
            genFactory.handleArg(arg)
    gen = genFactory.getCombinedGenerator()
    mysite = wikipedia.getSite()
    if mysite.sitename() == 'wikipedia:nl':
        wikipedia.output(
            u'\03{lightred}There is consensus on the Dutch Wikipedia that '
            u'bots should not be used to fix redirects.\03{default}')
        sys.exit()
    linktrail = mysite.linktrail()
    if featured:
        featuredList = wikipedia.translate(mysite, featured_articles)
        ref = wikipedia.Page(wikipedia.getSite(), featuredList)
        gen = pagegenerators.ReferringPageGenerator(ref)
        generator = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
        for page in generator:
            workon(page)
    elif namespace is not None:
        for page in pagegenerators.AllpagesPageGenerator(
                start=start, namespace=namespace, includeredirects=False):
            workon(page)
    elif gen:
        for page in pagegenerators.PreloadingGenerator(gen):
            workon(page)
    else:
        wikipedia.showHelp('fixing_redirects')
def main():
    skip = u''
    if len(sys.argv) > 1:
        site = wikipedia.Site(sys.argv[1], sys.argv[1])
    else:
        print 'python script.py wikifamily [skiptopage]'
        sys.exit()
    if len(sys.argv) > 2:
        skip = sys.argv[2]
    gen = pagegenerators.AllpagesPageGenerator(start=skip, namespace=0,
                                               site=site)
    pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250)
    alltitles = []
    for page in pre:
        # Do not filter with .isRedirectPage() here, or the existence check
        # below will never see current redirects before creating new ones.
        if not page.exists():
            continue
        alltitles.append(page.title())
        print page.title()
    for wtitle in alltitles:
        if len(wtitle) > 1:
            wtitle_ = wtitle[0] + wtitle[1:].lower()
            redirects = set()
            for t in [wtitle, wtitle_]:
                redirects.add(t)
                redirects.add(remove1(t))
                redirects.add(remove2(t))
                redirects.add(removeaccute(t))
                redirects.add(remove1(remove2(t)))
                redirects.add(remove1(removeaccute(t)))
                redirects.add(remove2(removeaccute(t)))
                redirects.add(remove1(remove2(removeaccute(t))))
            print redirects
            for redirect in redirects:
                redirect = redirect.strip()
                if redirect and redirect != wtitle and redirect not in alltitles:
                    red = wikipedia.Page(site, redirect)
                    if not red.exists():
                        output = u"#REDIRECT [[%s]]" % (wtitle)
                        msg = u"BOT - Creating redirect to [[%s]]" % (wtitle)
                        red.put(output, msg)
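# remove1/remove2/removeaccute are not shown in this excerpt. A hypothetical
# removeaccute, assuming its job is to strip accents when generating
# redirect candidates:
import unicodedata

def removeaccute(s):
    # Drop combining marks: u'Canción' -> u'Cancion'
    return u''.join(c for c in unicodedata.normalize('NFKD', s)
                    if not unicodedata.combining(c))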
def generator(self):
    # Choose which generator to use according to options.
    pagegen = None
    if self.__workonnew:
        if not self.__number:
            self.__number = config.special_page_limit
        pagegen = pagegenerators.NewpagesPageGenerator(number=self.__number)
    elif self.__refpagetitle:
        refpage = wikipedia.Page(wikipedia.getSite(), self.__refpagetitle)
        pagegen = pagegenerators.ReferringPageGenerator(refpage)
    elif self.__linkpagetitle:
        linkpage = wikipedia.Page(wikipedia.getSite(), self.__linkpagetitle)
        pagegen = pagegenerators.LinkedPageGenerator(linkpage)
    elif self.__catname:
        cat = catlib.Category(wikipedia.getSite(),
                              'Category:%s' % self.__catname)
        if self.__start:
            pagegen = pagegenerators.CategorizedPageGenerator(
                cat, recurse=self.__catrecurse, start=self.__start)
        else:
            pagegen = pagegenerators.CategorizedPageGenerator(
                cat, recurse=self.__catrecurse)
    elif self.__textfile:
        pagegen = pagegenerators.TextfilePageGenerator(self.__textfile)
    else:
        if not self.__start:
            self.__start = '!'
        namespace = wikipedia.Page(wikipedia.getSite(),
                                   self.__start).namespace()
        start = wikipedia.Page(wikipedia.getSite(),
                               self.__start).titleWithoutNamespace()
        pagegen = pagegenerators.AllpagesPageGenerator(start, namespace)
    return pagegen
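# A sketch of how generator()'s result is typically consumed in these
# compat-era bots; `bot` (an instance of the class above) and its treat()
# method are hypothetical names, not part of the original excerpt. Wrapping
# the generator in PreloadingGenerator fetches page texts in batches.
pagegen = bot.generator()
for page in pagegenerators.PreloadingGenerator(pagegen, pageNumber=60):
    bot.treat(page)  # per-page work would happen here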
if __name__ == "__main__": singlepage = [] gen = None start = None try: action = None for arg in wikipedia.handleArgs(): if arg == ('pages'): action = 'pages' elif arg == ('categories'): action = 'categories' elif arg.startswith('-start:'): start = wikipedia.Page(wikipedia.getSite(), arg[7:]) gen = pagegenerators.AllpagesPageGenerator( start.titleWithoutNamespace(), namespace=start.namespace(), includeredirects=False) elif arg.startswith('-cat:'): cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % arg[5:]) gen = pagegenerators.CategorizedPageGenerator(cat) elif arg.startswith('-ref:'): ref = wikipedia.Page(wikipedia.getSite(), arg[5:]) gen = pagegenerators.ReferringPageGenerator(ref) elif arg.startswith('-link:'): link = wikipedia.Page(wikipedia.getSite(), arg[6:]) gen = pagegenerators.LinkedPageGenerator(link) elif arg.startswith('-page:'): singlepage = wikipedia.Page(wikipedia.getSite(), arg[6:]) gen = iter([singlepage]) #else:
print "Warning! There is no wordlist for your language!" else: print "Wordlist successfully loaded." # This is a purely interactive bot, we therefore do not want to put-throttle pywikibot.put_throttle.setDelay(1) except: pywikibot.stopme() raise try: if newpages: for (page, date, length, loggedIn, user, comment) in pywikibot.getSite().newpages(1000): checkPage(page, checknames, knownonly) elif start: for page in pagegenerators.PreloadingGenerator( pagegenerators.AllpagesPageGenerator(start=start, includeredirects=False)): checkPage(page, checknames, knownonly) if longpages: for (page, length) in pywikibot.getSite().longpages(500): checkPage(page, checknames, knownonly) else: title = ' '.join(title) while title != '': try: page = pywikibot.Page(mysite, title) text = page.get() except pywikibot.NoPage: print "Page does not exist." except pywikibot.IsRedirectPage:
    opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
except getopt.error, msg:
    print msg
    print "for help use --help"
    sys.exit(2)
for o, a in opts:
    if o in ("-h", "--help"):
        print main.__doc__
        sys.exit(0)

ratelimit = 15
commonssite = wikipedia.Site('commons', 'commons')
st = u"!"
if len(sys.argv) >= 2:
    st = sys.argv[1]
gen = pagegenerators.AllpagesPageGenerator(start=st, namespace=6,
                                           includeredirects=False,
                                           site=commonssite)
pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250, lookahead=250)

inicio = ur"(?im)^(?P<inicio> *\| *Date *\= *)"
# Strip trailing periods that would otherwise break the date conversion.
# Do not move the space into [ \.]* at the start, see
# http://commons.wikimedia.org/w/index.php?title=File:18crown6.2.png&diff=prev&oldid=39395458
fin = ur"\.*(?P<fin> *((at|a las|,)? *\d\d:\d\d(:\d\d)?)?[ \.]*[\n\r\|])"
# Spanish dates: dd month yyyy
separador_es = [ur" *de?l? *", ur" *[\-\/\,\. ]? *"]  # careful: no capture groups here
month2number_es = {
    u"enero": u"01", u"ene": u"01",
    u"febrero": u"02", u"feb": u"02",
    u"marzo": u"03", u"mar": u"03",
    u"abril": u"04", u"abr": u"04",
    u"mayo": u"05", u"may": u"05",
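# Hypothetical helper (not part of the original script) showing how the
# month2number_es mapping turns the pieces matched by the date regexes into
# an ISO date:
def fecha_es_a_iso(dia, mes, anio):
    return u'%s-%s-%02d' % (anio, month2number_es[mes.lower()], int(dia))

print fecha_es_a_iso(u'12', u'mayo', u'2011')  # 2011-05-12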
def main():
    skip = u''
    if len(sys.argv) > 1:
        site = wikipedia.Site(sys.argv[1], sys.argv[1])
    else:
        print 'python script.py wikifamily [skiptopage]'
        sys.exit()
    if len(sys.argv) > 2:
        skip = sys.argv[2]
    gen = pagegenerators.AllpagesPageGenerator(start=skip, namespace=0,
                                               site=site)
    pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250)
    alltitles = []
    for page in pre:
        # Do not filter with .isRedirectPage() here, or the existence check
        # below will never see current redirects before creating new ones.
        if not page.exists():
            continue
        alltitles.append(page.title())
        print page.title()
    for wtitle in alltitles:
        if len(wtitle) > 1:
            wtitle_ = wtitle[0] + wtitle[1:].lower()
            redirects = set()
            for t in [wtitle, wtitle_]:
                redirects.add(t)
                redirects.add(remove1(t))
                redirects.add(remove2(t))
                redirects.add(removeaccute(t))
                redirects.add(remove1(remove2(t)))
                redirects.add(remove1(removeaccute(t)))
                redirects.add(remove2(removeaccute(t)))
                redirects.add(remove1(remove2(removeaccute(t))))
            # redirects for "Lista de ..." titles
            if wtitle.startswith('Lista de ') and len(wtitle) > 10:
                listade = wtitle[9:]
                listade = listade[0].upper() + listade[1:]
                redirects.add(listade)
            # redirects for "Lista de acampadas/asambleas/... de/del/de la
            # Madrid/provincia de Madrid"
            if sys.argv[1].lower() == '15mpedia':
                for colectivo in [u'acampadas', u'asambleas',
                                  u'bancos de tiempo', u'centros sociales',
                                  u'comedores sociales']:
                    # !!! do not include asociaciones, comisiones,
                    # manifestaciones or plataformas: what follows "de "
                    # may be a topic rather than a place
                    if wtitle.startswith('Lista de %s de ' % colectivo):
                        redirects.add(re.sub(ur"Lista de %s de " % colectivo,
                                             ur"Lista de %s en " % colectivo,
                                             wtitle))
                    elif wtitle.startswith('Lista de %s del ' % colectivo):
                        redirects.add(re.sub(ur"Lista de %s del " % colectivo,
                                             ur"Lista de %s en el " % colectivo,
                                             wtitle))
                    elif wtitle.startswith('Lista de %s de la ' % colectivo):
                        redirects.add(re.sub(ur"Lista de %s de la " % colectivo,
                                             ur"Lista de %s en la " % colectivo,
                                             wtitle))
                if wtitle.startswith('Lista de comedores sociales ') and len(wtitle) > 30:
                    redirects.add(re.sub(ur"Lista de comedores sociales ",
                                         ur"Lista de comedores ", wtitle))
            print redirects
            for redirect in redirects:
                redirect = redirect.strip()
                if redirect and redirect != wtitle and redirect not in alltitles:
                    red = wikipedia.Page(site, redirect)
                    if not red.exists():
                        output = u"#REDIRECT [[%s]]" % (wtitle)
                        msg = u"BOT - Creating redirect to [[%s]]" % (wtitle)
                        red.put(output, msg)
langorig = 'en'
st = 'A'
langdest = 'es'
if len(sys.argv) >= 2:
    langdest = sys.argv[1]
if len(sys.argv) >= 3:
    st = sys.argv[2]

redirects = tareas.getRedirectsAndTargets(langorig, targetStartsWith=st)
localpages = tareas.getPageTitle(langdest, redirects=True)
wikipediadestino = wikipedia.Site(langdest, 'wikipedia')
gen = pagegenerators.AllpagesPageGenerator(start=st, namespace=0,
                                           includeredirects=False,
                                           site=wikipediadestino)
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=100,
                                                   lookahead=100)
for page in preloadingGen:
    # Skip redirects and disambiguation pages.
    if page.exists() and (page.isRedirectPage() or page.isDisambig()):
        pass
    else:
        wtitle = page.title()
        # When the crawl reaches a new initial letter, refresh the
        # redirect list for that letter.
        if wtitle[0] != st[0]:
            st = wtitle[0]
            redirects = tareas.getRedirectsAndTargets(langorig,
                                                      targetStartsWith=st[0])
# -*- coding: utf-8 -*-
# This script replaces a given text with another text in all wiki pages.
# You just need two configuration settings:
oldtext = u"some text here"  # the old text you want to replace
newtext = u"a nice new text here!"  # the new text that will replace oldtext

import pywikibot as pwb
import pagegenerators as pg


def replace(page, antiga, nova):
    # Skip pages that do not contain the old text.
    if antiga not in page.text:
        return
    page.text = page.text.replace(antiga, nova)
    page.save("Bot: Replacing -%s; +%s." % (antiga, nova))


if __name__ == '__main__':
    site = pwb.Site()
    # Iterate lazily; wrapping the generator in list() would load every
    # page into memory up front.
    for page in pg.AllpagesPageGenerator(site=site):
        replace(page, oldtext, newtext)
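# A more cautious variant (a sketch, not the original author's code):
# preview the diff before saving, so a bad oldtext/newtext pair is caught
# early. Uses pywikibot's showDiff helper.
def replace_with_preview(page, antiga, nova):
    if antiga not in page.text:
        return
    nou = page.text.replace(antiga, nova)
    pwb.showDiff(page.text, nou)  # print the diff to the console
    page.text = nou
    page.save("Bot: Replacing -%s; +%s." % (antiga, nova))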
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import re

import wikipedia
import pagegenerators

skip = u'!'
site = wikipedia.Site('wikipapers', 'wikipapers')
gen = pagegenerators.AllpagesPageGenerator(start=skip, namespace=0,
                                           includeredirects=False, site=site)
pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250)
msg = u"BOT - Creating talk page"
output = u"<noinclude>{{talk}}</noinclude>"
for page in pre:
    if page.exists() and not page.isRedirectPage():
        if re.search(ur"(?im)\{\{\s*Infobox Publication", page.get()):
            talk = page.toggleTalkPage()
            if not talk.exists():
                talk.put(output, msg)
    # leftover markup to strip
    ur'(?im)^ *\| *\}\}': ur'}}',
    # html
    ur'(?i)< *br */ *>': ur'<br />',
}

st = 'A'
if len(sys.argv) >= 2:
    st = sys.argv[1]
gen = pagegenerators.AllpagesPageGenerator(start=st, namespace=0,
                                           includeredirects=False,
                                           site=wikipedia.Site('es', 'wikipedia'))
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=33,
                                                   lookahead=33)
for page in preloadingGen:
    if page.isRedirectPage() or page.isDisambig():
        pass
    else:
        wtitle = page.title()
        wtext = newtext = page.get()
        newtext = justificarParametros(newtext, page.templatesWithParams())
        """
def main():
    limitdays = 700  # oldest allowed ref link
    # Case 1: bare URL only, no title. Example of the kind of reference this
    # script produces once archived:
    # <ref>{{cite web|title=CFL.ca <!-- BOT GENERATED TITLE -->|url=http://www.cfl.ca/standings/1985/reg|work=|archiveurl=http://www.webcitation.org/5gbBs41sC|archivedate=2009-05-07|deadurl=no|accessdate=2009-03-28}}</ref>
    r_case1 = r'(?im)(?P<ref><\s*ref[^<>]*>\s*\[*\s*(?P<url>[^<>\[\]\s]+)\s*[^<>]*\s*\]*\s*<\s*/\s*ref\s*>)'
    r_case1 = re.compile(r_case1)
    r_case2 = r'(?im)(?P<ref><ref[^<>]*>\s*\{\{\s*cite web(?P<param>\s*\|\s*(?!archiveurl|archivedate)(?P<paramname>url|title|first|last|author|authorlink|coauthors|date|month|year|work|publisher|location|page|pages|at|language|trans_title|format|doi|accessdate|quote|ref|separator|postscript)\s*=\s*(?P<paramvalue>[^<>\|]*))*\s*\}\}\s*</ref>)'
    r_case2 = re.compile(r_case2)

    start = '!'
    namespace = 0
    email = ''
    if len(sys.argv) > 1:
        start = sys.argv[1]
    if len(sys.argv) > 2:
        email = sys.argv[2]

    gen = pagegenerators.AllpagesPageGenerator(start, namespace,
                                               includeredirects=False)
    preload = pagegenerators.PreloadingGenerator(gen)
    for page in preload:
        if not page.exists() or \
           page.isRedirectPage() or \
           page.isDisambig():
            print 'This page is a redirect or disambig, or it does not exist. Skipping...'
            continue
        wtitle = page.title()
        print '=' * 3, wtitle, '=' * 3
        wtext = newtext = page.get()
        if not allowbots(text=wtext, user='******'):
            print 'Skipping per bot exclusion compliance'
            continue
        references = r_case1.finditer(wtext)
        if references:
            # Fetch metadata only, to check the history length cheaply.
            history = page.getVersionHistory(getAll=False, reverseOrder=True,
                                             revCount=500)
            if len(history) >= 500:
                print 'History too long, skipping...'
                continue
            # Now load the history with content.
            history = page.fullVersionHistory(getAll=False, reverseOrder=True,
                                              revCount=500)
            for reference in references:
                ref = reference.group('ref')
                url = reference.group('url')
                if not isURL(url=url):
                    print 'This is not a URL', url
                    continue
                if re.search(r'(archive\.org|webcitation\.org)', url):
                    print 'URL is already an archived URL, skipping...', url
                    continue
                urltitle = getURLTitle(url=url)
                deadurl = isURLDead(url=url)
                archiveurl = ''
                archivedate = ''
                accessdate = getDateURLFirstTimeInArticle(history=history,
                                                          url=url)
                if not accessdate:
                    print 'Unknown date when URL (%s) first appeared in article, skipping...' % (url)
                    continue
                if (datetime.datetime.now() - accessdate).days > limitdays:
                    print 'This URL (%s) was added long ago: %d days. Skipping...' % (
                        url, (datetime.datetime.now() - accessdate).days)
                    continue
                if deadurl:
                    print 'URL is dead (%s), cannot archive it, searching for an archived copy...' % (url)
                    archiveurl, archivedate = recentArchived(url=url)
                    if archiveurl and archivedate:
                        print 'There is an archived copy (%s, %s), YAY!' % (
                            archiveurl, archivedate)
                    else:
                        print 'No archived copy available in WebCite, skipping this reference...'
                        continue
                else:
                    archiveurl, archivedate = archiveURL(url=url, email=email)
                    if not archiveurl or not archivedate:
                        print 'Error, no archiveurl or archivedate retrieved for %s' % (url)
                        continue
                r_sub1 = '%s - {{WebCite|url=%s|date=%s}}</ref>' % (
                    ref.split('</ref>')[0], archiveurl,
                    archivedate.strftime('%Y-%m-%d'))
                newtext = string.replace(newtext, ref, r_sub1, 1)
        if newtext != wtext:
            wikipedia.showDiff(wtext, newtext)
            summary = 'BOT - Adding link to [[WebCite]] archive for recently added reference(s)'
            page.put(newtext, summary)
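# isURL, isURLDead, getURLTitle, getDateURLFirstTimeInArticle, archiveURL and
# recentArchived are helpers defined elsewhere in the original script. A
# minimal hypothetical isURL, purely so the control flow above can be read
# in isolation:
def isURL(url):
    # Accept http(s) URLs containing no whitespace or angle brackets.
    return bool(re.match(r'https?://[^\s<>]+$', url))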
import wikipedia, pagegenerators
import re, random, time, sys, datetime
import cosmetic_changes

days = [
    u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J', u'K', u'L',
    u'M', u'N', u'Ñ', u'O', u'P', u'Q', u'R', u'S', u'T', u'U', u'V', u'W',
    u'X', u'Y', u'Z', u'Á', u'É', u'Í', u'Ó'
]

wiki = wikipedia.Site("en", "wikipedia")
# Rotate the starting letter with the day of the month.
day = datetime.datetime.now().day
day = day % len(days)

if len(sys.argv) == 2:
    start = sys.argv[1]
else:
    start = days[day]
gen = pagegenerators.AllpagesPageGenerator(start, namespace=0,
                                           includeredirects=True, site=wiki)
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=200)
for page in preloadingGen:
    if page.exists() and page.isRedirectPage():
        wikipedia.output(u"Analyzing: [[%s]]" % page.title())
        wtext = page.get(get_redirect=True)
        wtitle = page.title()
        # breakpoint
            f.close()
        except IOError:
            print "Warning! There is no wordlist for your language!"
        else:
            print "Wordlist successfully loaded."
        # This is a purely interactive bot, so we do not want a put-throttle.
        pywikibot.put_throttle.setDelay(1)
    except:
        pywikibot.stopme()
        raise
    try:
        if newpages:
            for (page, date, length, loggedIn, user,
                 comment) in pywikibot.getSite().newpages(1000):
                checkPage(page, checknames, knownonly)
        elif start:
            for page in pagegenerators.PreloadingGenerator(
                    pagegenerators.AllpagesPageGenerator(
                        start=start, includeredirects=False)):
                checkPage(page, checknames, knownonly)
        elif longpages:
            for (page, length) in pywikibot.getSite().longpages(500):
                checkPage(page, checknames, knownonly)
        else:
            title = ' '.join(title)
            while title != '':
                try:
                    page = pywikibot.Page(mysite, title)
                    text = page.get()
                except pywikibot.NoPage:
                    print "Page does not exist."
                except pywikibot.IsRedirectPage: