def crawlerAll(start):
    # Walk all articles (namespace 0, skipping redirects) from 'start' onwards.
    gen = pagegenerators.AllpagesPageGenerator(start,
                                               namespace=0,
                                               includeredirects=False)
    # Preload in batches of 100 to cut down on API round-trips.
    for page in pagegenerators.PreloadingGenerator(gen, 100):
        #print (page.title().encode(config.console_encoding, 'replace'))
        modification(page.title())
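The helper `modification` is defined elsewhere in the original script. A minimal sketch of how the crawler might be driven end to end, with a hypothetical stub standing in for the real helper:

# Hypothetical stub for illustration only; the real 'modification' edits pages.
def modification(title):
    print title

crawlerAll(u'!')  # '!' sorts before letters, so this walks namespace 0 from the top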
Example #2
def main():
    summary_commandline, gen, template = None, None, None
    namespaces, PageTitles, exceptions = [], [], []
    encat = ''
    autoText, autoTitle = False, False
    recentcat, newcat = False, False
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        if arg == '-autotitle':
            autoTitle = True
        elif arg == '-autotext':
            autoText = True
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])

        elif arg.startswith('-start'):
            firstPageTitle = arg[7:]
            if not firstPageTitle:
                firstPageTitle = wikipedia.input(
                    u'At which page do you want to start?')
            firstPageTitle = wikipedia.Page(
                fasite, firstPageTitle).title(withNamespace=False)
            gen = pagegenerators.AllpagesPageGenerator(firstPageTitle,
                                                       0,
                                                       includeredirects=True)
        elif arg.startswith('-template:'):
            template = arg[10:]
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
            summary_commandline = True
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
    if not gen:
        wikipedia.stopme()
        sys.exit()
    if namespaces:
        # Preload in batches of 60, then keep only the requested namespaces.
        gen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
        preloadingGen = pagegenerators.NamespaceFilterPageGenerator(
            gen, namespaces)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    _cache, last_timestamp = get_cache()
    add_text(preloadingGen)

    now = str(datetime.now())
    # Crude day counter (day + month*30 + years-since-2000*365); it is only
    # used to decide whether the cache is a few days stale, so exactness is
    # not needed.
    todaynum = int(now.split('-')[2].split(' ')[0]) + int(
        now.split('-')[1]) * 30 + (int(now.split('-')[0]) - 2000) * 365

    if last_timestamp + 3 < todaynum:
        put_cache(_cache, todaynum)
    else:
        put_cache({}, 0)
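The hand-rolled day count above only needs day-level accuracy to test cache staleness. If an exact count were wanted, a sketch using datetime.date.toordinal() (the name daynum_exact is an assumption) would be:

from datetime import date

def daynum_exact():
    # Exact days elapsed since 2000-01-01, replacing the 30/365 approximation.
    return date.today().toordinal() - date(2000, 1, 1).toordinal()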
Example #3
def main():
    start = '!'
    featured = False
    namespace = None
    gen = None

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()

    for arg in wikipedia.handleArgs():
        if arg == '-featured':
            featured = True
        elif arg.startswith('-namespace'):
            if len(arg) == 10:
                namespace = int(wikipedia.input(u'Which namespace should be processed?'))
            else:
                namespace = int(arg[11:])
        else:
            genFactory.handleArg(arg)

    gen = genFactory.getCombinedGenerator()

    mysite = wikipedia.getSite()
    if mysite.sitename() == 'wikipedia:nl':
        wikipedia.output(u'\03{lightred}There is consensus on the Dutch Wikipedia that bots should not be used to fix redirects.\03{default}')
        sys.exit()

    linktrail = mysite.linktrail()
    if featured:
        featuredList = wikipedia.translate(mysite, featured_articles)
        ref = wikipedia.Page(wikipedia.getSite(), featuredList)
        gen = pagegenerators.ReferringPageGenerator(ref)
        generator = pagegenerators.NamespaceFilterPageGenerator(gen, [0])
        for page in generator:
            workon(page)
    elif namespace is not None:
        for page in pagegenerators.AllpagesPageGenerator(start=start, namespace=namespace, includeredirects=False):
            workon(page)
    elif gen:
        for page in pagegenerators.PreloadingGenerator(gen):
            workon(page)
    else:
        wikipedia.showHelp('fixing_redirects')
Example #4
def main():
    skip = u''
    if len(sys.argv) > 1:
        site = wikipedia.Site(sys.argv[1], sys.argv[1])
    else:
        print 'Usage: python script.py wikifamily [skiptopage]'
        sys.exit()
    if len(sys.argv) > 2:
        skip = sys.argv[2]
    gen = pagegenerators.AllpagesPageGenerator(start=skip,
                                               namespace=0,
                                               site=site)
    pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250)
    alltitles = []
    for page in pre:
        # Do not use .isRedirectPage() here, or the red.exists() check below
        # (before creating) would never find existing redirects.
        if not page.exists():
            continue
        alltitles.append(page.title())
        print page.title()

    for wtitle in alltitles:
        if len(wtitle) > 1:
            wtitle_ = wtitle[0] + wtitle[1:].lower()
            redirects = set()
            for t in [wtitle, wtitle_]:
                redirects.add(t)
                redirects.add(remove1(t))
                redirects.add(remove2(t))
                redirects.add(removeaccute(t))
                redirects.add(remove1(remove2(t)))
                redirects.add(remove1(removeaccute(t)))
                redirects.add(remove2(removeaccute(t)))
                redirects.add(remove1(remove2(removeaccute(t))))

            print redirects
            for redirect in redirects:
                redirect = redirect.strip()
                if redirect and redirect != wtitle and redirect not in alltitles:
                    red = wikipedia.Page(site, redirect)
                    if not red.exists():
                        output = u"#REDIRECT [[%s]]" % (wtitle)
                        msg = u"BOT - Creating redirect to [[%s]]" % (wtitle)
                        red.put(output, msg)
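The helpers remove1, remove2 and removeaccute are not shown in this excerpt. Purely as an assumption about their intent, an accent-stripping helper in the spirit of removeaccute might look like:

import unicodedata

def removeaccute(s):
    # Hypothetical sketch: decompose to NFKD, then drop combining marks.
    nfkd = unicodedata.normalize('NFKD', s)
    return u''.join(c for c in nfkd if not unicodedata.combining(c))

print removeaccute(u'Canci\xf3n')  # u'Canción' -> u'Cancion'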
Example #5
    def generator(self):
        # Choose which generator to use according to options.
 
        pagegen = None
 
        if self.__workonnew:
            if not self.__number:
                self.__number = config.special_page_limit
            pagegen = pagegenerators.NewpagesPageGenerator(number=self.__number)
 
        elif self.__refpagetitle:
            refpage = wikipedia.Page(wikipedia.getSite(), self.__refpagetitle)
            pagegen = pagegenerators.ReferringPageGenerator(refpage)
 
        elif self.__linkpagetitle:
            linkpage = wikipedia.Page(wikipedia.getSite(), self.__linkpagetitle)
            pagegen = pagegenerators.LinkedPageGenerator(linkpage)
 
        elif self.__catname:
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % self.__catname)
 
            if self.__start:
                pagegen = pagegenerators.CategorizedPageGenerator(cat, recurse=self.__catrecurse, start=self.__start)
            else:
                pagegen = pagegenerators.CategorizedPageGenerator(cat, recurse=self.__catrecurse)
 
        elif self.__textfile:
            pagegen = pagegenerators.TextfilePageGenerator(self.__textfile)
 
        else:
            if not self.__start:
                self.__start = '!'
            namespace = wikipedia.Page(wikipedia.getSite(), self.__start).namespace()
            start = wikipedia.Page(wikipedia.getSite(), self.__start).titleWithoutNamespace()
 
            pagegen = pagegenerators.AllpagesPageGenerator(start, namespace)
 
        return pagegen
Example #6
if __name__ == "__main__":
    singlepage = []
    gen = None
    start = None
    try:
        action = None
        for arg in wikipedia.handleArgs():
            if arg == 'pages':
                action = 'pages'
            elif arg == 'categories':
                action = 'categories'
            elif arg.startswith('-start:'):
                start = wikipedia.Page(wikipedia.getSite(), arg[7:])
                gen = pagegenerators.AllpagesPageGenerator(
                    start.titleWithoutNamespace(),
                    namespace=start.namespace(),
                    includeredirects=False)
            elif arg.startswith('-cat:'):
                cat = catlib.Category(wikipedia.getSite(),
                                      'Category:%s' % arg[5:])
                gen = pagegenerators.CategorizedPageGenerator(cat)
            elif arg.startswith('-ref:'):
                ref = wikipedia.Page(wikipedia.getSite(), arg[5:])
                gen = pagegenerators.ReferringPageGenerator(ref)
            elif arg.startswith('-link:'):
                link = wikipedia.Page(wikipedia.getSite(), arg[6:])
                gen = pagegenerators.LinkedPageGenerator(link)
            elif arg.startswith('-page:'):
                singlepage = wikipedia.Page(wikipedia.getSite(), arg[6:])
                gen = iter([singlepage])
            #else:
Example #7
        print "Warning! There is no wordlist for your language!"
    else:
        print "Wordlist successfully loaded."
    # This is a purely interactive bot, we therefore do not want to put-throttle
    pywikibot.put_throttle.setDelay(1)
except:
    pywikibot.stopme()
    raise
try:
    if newpages:
        for (page, date, length, loggedIn, user,
             comment) in pywikibot.getSite().newpages(1000):
            checkPage(page, checknames, knownonly)
    elif start:
        for page in pagegenerators.PreloadingGenerator(
                pagegenerators.AllpagesPageGenerator(start=start,
                                                     includeredirects=False)):
            checkPage(page, checknames, knownonly)

    elif longpages:  # keep the modes exclusive; the else below is the interactive fallback
        for (page, length) in pywikibot.getSite().longpages(500):
            checkPage(page, checknames, knownonly)

    else:
        title = ' '.join(title)
        while title != '':
            try:
                page = pywikibot.Page(mysite, title)
                text = page.get()
            except pywikibot.NoPage:
                print "Page does not exist."
            except pywikibot.IsRedirectPage:
Example #8
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.error, msg:
        print msg
        print "for help use --help"
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            print main.__doc__
            sys.exit(0)

    ratelimit = 15
    commonssite = wikipedia.Site('commons', 'commons')
    st = u"!"
    if len(sys.argv) >= 2:
        st = sys.argv[1]
    gen = pagegenerators.AllpagesPageGenerator(start=st, namespace=6, includeredirects=False, site=commonssite)
    pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250, lookahead=250)

    inicio = ur"(?im)^(?P<inicio> *\| *Date *\= *)"
    # strip trailing dots, which break the date conversion;
    # do not add the space into [ \.]* at the start, see
    # http://commons.wikimedia.org/w/index.php?title=File:18crown6.2.png&diff=prev&oldid=39395458
    fin = ur"\.*(?P<fin> *((at|a las|,)? *\d\d:\d\d(:\d\d)?)?[ \.]*[\n\r\|])"

    #Spanish: dd month yyyy
    separador_es = [ur" *de?l? *", ur" *[\-\/\,\. ]? *"]  #careful: no capturing groups here
    month2number_es={
    u"enero":u"01", u"ene":u"01", 
    u"febrero":u"02", u"feb":u"02", 
    u"marzo":u"03", u"mar":u"03", 
    u"abril":u"04", u"abr":u"04", 
    u"mayo":u"05", u"may":u"05", 
Example #9
def main():
    skip = u''
    if len(sys.argv) > 1:
        site = wikipedia.Site(sys.argv[1], sys.argv[1])
    else:
        print 'Usage: python script.py wikifamily [skiptopage]'
        sys.exit()
    if len(sys.argv) > 2:
        skip = sys.argv[2]
    gen = pagegenerators.AllpagesPageGenerator(start=skip, namespace=0, site=site)
    pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250)
    alltitles = []
    for page in pre:
        # Do not use .isRedirectPage() here, or the red.exists() check below
        # (before creating) would never find existing redirects.
        if not page.exists():
            continue
        alltitles.append(page.title())
        print page.title()
        
    for wtitle in alltitles:
        if len(wtitle) > 1:
            wtitle_ = wtitle[0]+wtitle[1:].lower()
            redirects = set()
            for t in [wtitle, wtitle_]:
                redirects.add(t)
                redirects.add(remove1(t))
                redirects.add(remove2(t))
                redirects.add(removeaccute(t))
                redirects.add(remove1(remove2(t)))
                redirects.add(remove1(removeaccute(t)))
                redirects.add(remove2(removeaccute(t)))
                redirects.add(remove1(remove2(removeaccute(t))))
                
                #redirects for "Lista de ..." titles
                if wtitle.startswith('Lista de ') and len(wtitle) > 10:
                    listade = wtitle[9:]
                    listade = listade[0].upper() + listade[1:]
                    redirects.add(listade)
                
                #redirects for "Lista de acampadas/asambleas/... de/del/de la Madrid/provincia de Madrid"
                if sys.argv[1].lower() == '15mpedia':
                    for colectivo in [u'acampadas', u'asambleas', u'bancos de tiempo', u'centros sociales', u'comedores sociales']:
                        #!!!do not include asociaciones, comisiones, manifestaciones or plataformas, because after "de " a topic (not a place) may follow
                        if wtitle.startswith('Lista de %s de ' % colectivo):
                            redirects.add(re.sub(ur"Lista de %s de " % colectivo, ur"Lista de %s en " % colectivo, wtitle))
                        elif wtitle.startswith('Lista de %s del ' % colectivo):
                            redirects.add(re.sub(ur"Lista de %s del " % colectivo, ur"Lista de %s en el " % colectivo, wtitle))
                        elif wtitle.startswith('Lista de %s de la ' % colectivo):
                            redirects.add(re.sub(ur"Lista de %s de la " % colectivo, ur"Lista de %s en la " % colectivo, wtitle))

                    if wtitle.startswith('Lista de comedores sociales ') and len(wtitle) > 30:
                        redirects.add(re.sub(ur"Lista de comedores sociales ", ur"Lista de comedores ", wtitle))
            
            print redirects
            for redirect in redirects:
                redirect = redirect.strip()
                if redirect and redirect != wtitle and redirect not in alltitles:
                    red = wikipedia.Page(site, redirect)
                    if not red.exists():
                        output = u"#REDIRECT [[%s]]" % (wtitle)
                        msg = u"BOT - Creating redirect to [[%s]]" % (wtitle)
                        red.put(output, msg)
Example #10

langorig = 'en'
st = 'A'
langdest = 'es'
if len(sys.argv) >= 2:
    langdest = sys.argv[1]
if len(sys.argv) >= 3:
    st = sys.argv[2]

redirects = tareas.getRedirectsAndTargets(langorig, targetStartsWith=st)
localpages = tareas.getPageTitle(langdest, redirects=True)

wikipediadestino = wikipedia.Site(langdest, 'wikipedia')
gen = pagegenerators.AllpagesPageGenerator(start=st,
                                           namespace=0,
                                           includeredirects=False,
                                           site=wikipediadestino)
preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                                   pageNumber=100,
                                                   lookahead=100)

for page in preloadingGen:
    if page.exists() and (page.isRedirectPage() or page.isDisambig()):
        pass
    else:
        wtitle = page.title()

        if wtitle[0] != st[0]:
            st = wtitle[0]
            redirects = tareas.getRedirectsAndTargets(langorig,
                                                      targetStartsWith=st[0])
Example #11
# -*- coding: utf-8 -*-

# This script replaces a given text with another text in all wiki pages. You just need two configuration settings:

oldtext = u"some text here"  # Here you can put the old text you want to replace
newtext = u"a nice new text here!"  # Here you can put the new text that will replace oldtext

import pywikibot as pwb
import pagegenerators as pg


def replace(page, antiga, nova):
    # Replace every occurrence of 'antiga' with 'nova', then save the page.
    if antiga in page.text:
        page.text = page.text.replace(antiga, nova)
        page.save("Bot: Replacing -%s; +%s." % (antiga, nova))


if __name__ == '__main__':
    site = pwb.Site()
    for page in pg.AllpagesPageGenerator(site=site):
        replace(page, oldtext, newtext)
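The loop above fetches each page one request at a time; most other examples in this collection batch those fetches. A sketch of the same driver loop with PreloadingGenerator, assuming the compat-era pagegenerators API used throughout this collection:

if __name__ == '__main__':
    site = pwb.Site()
    gen = pg.AllpagesPageGenerator(site=site)
    # Preload page text in batches to reduce API round-trips.
    for page in pg.PreloadingGenerator(gen, pageNumber=100):
        replace(page, oldtext, newtext)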
Example #12
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re

import wikipedia
import pagegenerators

skip = u'!'
site = wikipedia.Site('wikipapers', 'wikipapers')
gen = pagegenerators.AllpagesPageGenerator(start=skip, namespace=0, includeredirects=False, site=site)
pre = pagegenerators.PreloadingGenerator(gen, pageNumber=250)
msg = u"BOT - Creating talk page"
output = u"<noinclude>{{talk}}</noinclude>"
for page in pre:
    if page.exists() and not page.isRedirectPage():
        if re.search(ur"(?im)\{\{\s*Infobox Publication", page.get()):
            talk = page.toggleTalkPage()
            if not talk.exists():
                talk.put(output, msg)
Example #13
    #leftover cruft to strip
    ur'(?im)^ *\| *\}\}':
    ur'}}',

    #html
    ur'(?i)< *br */ *>':
    ur'<br />',
}
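# The name of the dict holding this replacement table is truncated in the
# excerpt; a minimal sketch of how such a table is typically applied (an
# assumption, not the original code):
#
#     for pattern, repl in replacements.iteritems():
#         newtext = re.sub(pattern, repl, newtext)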

st = 'A'
if len(sys.argv) >= 2:
    st = sys.argv[1]

gen = pagegenerators.AllpagesPageGenerator(start=st,
                                           namespace=0,
                                           includeredirects=False,
                                           site=wikipedia.Site(
                                               'es', 'wikipedia'))
preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                                   pageNumber=33,
                                                   lookahead=33)

for page in preloadingGen:
    if page.isRedirectPage() or page.isDisambig():
        pass
    else:
        wtitle = page.title()
        wtext = newtext = page.get()

        newtext = justificarParametros(newtext, page.templatesWithParams())
        """
Example #14
def main():
    limitdays = 700  # oldest allowed ref link

    r_case1 = r'(?im)(?P<ref><\s*ref[^<>]*>\s*\[*\s*(?P<url>[^<>\[\]\s]+)\s*[^<>]*\s*\]*\s*<\s*/\s*ref\s*>)'  #only URL, no title
    #<ref>{{cite web|title=CFL.ca <!-- BOT GENERATED TITLE -->|url=http://www.cfl.ca/standings/1985/reg|work=|archiveurl=http://www.webcitation.org/5gbBs41sC|archivedate=2009-05-07|deadurl=no|accessdate=2009-03-28}}</ref>
    r_case1 = re.compile(r_case1)
    r_case2 = r'(?im)(?P<ref><ref[^<>]*>\s*\{\{\s*cite web(?P<param>\s*\|\s*(?!archiveurl|archivedate)(?P<paramname>url|title|first|last|author|authorlink|coauthors|date|month|year|work|publisher|location|page|pages|at|language|trans_title|format|doi|accessdate|quote|ref|separator|postscript)\s*=\s*(?P<paramvalue>[^<>\|]*))*\s*\}\}\s*</ref>)'
    r_case2 = re.compile(r_case2)

    start = '!'
    namespace = 0
    email = ''

    if len(sys.argv) > 1:
        start = sys.argv[1]
    if len(sys.argv) > 2:
        email = sys.argv[2]

    gen = pagegenerators.AllpagesPageGenerator(start,
                                               namespace,
                                               includeredirects=False)
    preload = pagegenerators.PreloadingGenerator(gen)

    for page in preload:
        if not page.exists() or \
           page.isRedirectPage() or \
           page.isDisambig():
            print 'This page is a redirect or disambig, or it does not exist. Skipping...'
            continue

        wtitle = page.title()
        print '=' * 3, wtitle, '=' * 3
        wtext = newtext = page.get()

        if not allowbots(text=wtext, user='******'):
            print 'Skipping per bot exclusion compliance'
            continue

        references = list(r_case1.finditer(wtext))  #materialize; a bare iterator is always truthy
        if references:
            history = page.getVersionHistory(getAll=False,
                                             reverseOrder=True,
                                             revCount=500)  #only metadata
            if len(history) >= 500:
                print 'History too long, skipping...'
                continue
            history = page.fullVersionHistory(
                getAll=False, reverseOrder=True,
                revCount=500)  #now, load history with content

            for reference in references:
                ref = reference.group('ref')
                url = reference.group('url')
                if not isURL(url=url):
                    print 'This is not a URL', url
                    continue
                if re.search(r'(archive\.org|webcitation\.org)', url):
                    print 'URL is already an archived URL, skipping...', url
                    continue

                urltitle = getURLTitle(url=url)
                deadurl = isURLDead(url=url)
                archiveurl = ''
                archivedate = ''

                accessdate = getDateURLFirstTimeInArticle(history=history,
                                                          url=url)
                if not accessdate:
                    print 'Unknown date for when URL (%s) first appeared in the article, skipping...' % (
                        url)
                    continue
                if (datetime.datetime.now() - accessdate).days > limitdays:
                    print 'This URL (%s) was added long ago: %d days. Skipping...' % (
                        url, (datetime.datetime.now() - accessdate).days)
                    continue

                if deadurl:
                    print 'URL is dead (%s), cannot archive it, searching for an archived copy...' % (
                        url)
                    archiveurl, archivedate = recentArchived(url=url)
                    if archiveurl and archivedate:
                        print 'There is an archived copy (%s, %s), YAY!' % (
                            archiveurl, archivedate)
                    else:
                        print 'No archived copy available in WebCite, skipping this reference...'
                        continue
                else:
                    archiveurl, archivedate = archiveURL(url=url, email=email)

                if not archiveurl or not archivedate:
                    print 'Error, no archiveurl or no archivedate retrieved for %s' % (
                        url)
                    continue

                r_sub1 = '%s - {{WebCite|url=%s|date=%s}}</ref>' % (
                    ref.split('</ref>')[0],
                    archiveurl,
                    archivedate.strftime('%Y-%m-%d'),
                )
                newtext = string.replace(newtext, ref, r_sub1, 1)

        if newtext != wtext:
            wikipedia.showDiff(wtext, newtext)
            summary = 'BOT - Adding link to [[WebCite]] archive for recently added reference(s)'
            page.put(newtext, summary)
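Helpers such as isURL, getURLTitle, isURLDead, recentArchived, archiveURL and allowbots are defined elsewhere in the original script. As a hypothetical stand-in only, a minimal isURL check could be built on Python 2's urlparse:

import urlparse

def isURL(url=''):
    # Hypothetical: accept only absolute http(s) URLs that name a host.
    parts = urlparse.urlparse(url)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)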
Example #15
import wikipedia, pagegenerators
import re, random, time, sys, datetime
import cosmetic_changes

# Starting letters; the code below picks one based on the day of the month,
# so successive daily runs start at different points of the alphabet.
days = [
    u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J', u'K', u'L',
    u'M', u'N', u'Ñ', u'O', u'P', u'Q', u'R', u'S', u'T', u'U', u'V', u'W',
    u'X', u'Y', u'Z', u'Á', u'É', u'Í', u'Ó'
]
wiki = wikipedia.Site("en", "wikipedia")
day = datetime.datetime.now().day
day = day % len(days)
if len(sys.argv) == 2:
    start = sys.argv[1]
    gen = pagegenerators.AllpagesPageGenerator(start,
                                               namespace=0,
                                               includeredirects=True,
                                               site=wiki)
else:
    start = days[day]
    gen = pagegenerators.AllpagesPageGenerator(start,
                                               namespace=0,
                                               includeredirects=True,
                                               site=wiki)
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=200)

for page in preloadingGen:
    if page.exists() and page.isRedirectPage():
        wikipedia.output(u"Analyzing: [[%s]]" % page.title())
        wtext = page.get(get_redirect=True)
        wtitle = page.title()
        #breakpoint
Example #16
        f.close()
    except IOError:
        print "Warning! There is no wordlist for your language!"
    else:
        print "Wordlist successfully loaded."
    # This is a purely interactive bot, we therefore do not want to put-throttle
    pywikibot.put_throttle.setDelay(1)
except:
    pywikibot.stopme()
    raise
try:
    if newpages:
        for (page, date, length, loggedIn, user,
             comment) in pywikibot.getSite().newpages(1000):
            checkPage(page, checknames, knownonly)
    elif start:
        for page in pagegenerators.PreloadingGenerator(
                pagegenerators.AllpagesPageGenerator(start=start,
                                                     includeredirects=False)):
            checkPage(page, checknames, knownonly)

    elif longpages:  # keep the modes exclusive; the else below is the interactive fallback
        for (page, length) in pywikibot.getSite().longpages(500):
            checkPage(page, checknames, knownonly)

    else:
        title = ' '.join(title)
        while title != '':
            try:
                page = pywikibot.Page(mysite, title)
                text = page.get()
            except pywikibot.NoPage:
                print "Page does not exist."
            except pywikibot.IsRedirectPage: