Example #1
def titlegrabber(lang="English"):
    import os
    import xmlreader
    totalfile = open(scriptdir + "\\all_titles.txt", "w")
    writefile = open(scriptdir + "\\en_titles.txt", "w")
    English = set()
    all = set()
    if "wikt.xml" not in os.listdir(scriptdir):
        dump = xmlreader.XmlDump(scriptdir + "\\wikt.bz2")
    else:
        dump = xmlreader.XmlDump(scriptdir + "\\wikt.xml")
    for d in dump.parse():
        all.add(d.title)
        if ":" in d.title: continue
        elif "==" + lang + "==" not in d.text: continue
        else: English.add(d.title)
        try:
            print d.title, len(English)
        except:
            pass
    for e in English:
        writefile.write(e.encode("utf-8") + "\r\n")
    for a in all:
        totalfile.write(a.encode("utf-8") + "\r\n")
    writefile.close()
    totalfile.close()
    return English
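
For orientation: every snippet in this collection builds on the same core pattern of constructing an XmlDump and iterating parse(). A minimal sketch, assuming a local Wiktionary dump (the file name below is a placeholder):

# Minimal sketch of the shared XmlDump pattern; the dump path is a placeholder.
import xmlreader

dump = xmlreader.XmlDump("enwiktionary-latest-pages-articles.xml.bz2")
for entry in dump.parse():
    # each parsed entry exposes at least .title and .text
    if ":" in entry.title:
        continue  # skip titles with a namespace prefix, as Example #1 does
    if "==English==" in entry.text:
        print entry.title
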
Example #2
 def __iter__(self):
     import xmlreader
     dump = xmlreader.XmlDump(self.xmlFilename)
     for entry in dump.parse():
         text = pywikibot.removeDisabledParts(entry.text)
         if self.refR.search(text) and not self.referencesR.search(text):
             yield pywikibot.Page(pywikibot.getSite(), entry.title)
Example #3
def dump_entries(dump=None, namespace=None, main_only=False, offline=False):
    """
        Returns an iterator over every entry in `dump`.
        If `dump` is not specified, `latest_dump()` is used.
    """
    if dump is None:
        dump = latest_dump(offline=offline)

    def do_main_only(namespaces):
        for entry in xmlreader.XmlDump(dump).parse():
            if not ":" in entry.title:
                yield entry
            elif not entry.title[:entry.title.find(":")] in namespaces:
                yield entry

    def do_namespace(namespace):
        for entry in xmlreader.XmlDump(dump).parse():
            if entry.title[:entry.title.find(":")] == namespace:
                yield entry

    if main_only == False and namespace != "":
        if namespace:
            return do_namespace(namespace)
        else:
            return xmlreader.XmlDump(dump).parse()
    else:
        return do_main_only(namespaces)
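
A hypothetical usage sketch for dump_entries() above; note that latest_dump() and the module-level namespaces collection consulted by the main_only branch live elsewhere in that project and are assumed here:

# Hypothetical calls to dump_entries(); latest_dump() and `namespaces`
# come from the surrounding project and are assumed to exist.
for entry in dump_entries(namespace="Template"):
    print entry.title   # only pages whose title carries the "Template:" prefix

for entry in dump_entries(dump="enwiki-latest-pages-articles.xml.bz2"):
    print entry.title   # every entry of an explicitly named dump
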
Example #4
    def __iter__(self):
        """
        Yield page objects until the entire XML dump has been read.
        """
        import xmlreader
        mysite = pywikibot.getSite()
        dump = xmlreader.XmlDump(self.xmlfilename)
        # regular expression to find the original template.
        # {{vfd}} does the same thing as {{Vfd}}, so both will be found.
        # The old syntax, {{msg:vfd}}, will also be found.
        # TODO: check site.nocapitalize()
        templatePatterns = []
        for template in self.templates:
            templatePattern = template.titleWithoutNamespace()
            if not pywikibot.getSite().nocapitalize:
                templatePattern = '[' + templatePattern[0].upper(
                ) + templatePattern[0].lower() + ']' + templatePattern[1:]
            templatePattern = re.sub(' ', '[_ ]', templatePattern)
            templatePatterns.append(templatePattern)
        templateRegex = re.compile(
            r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}' %
            '|'.join(templatePatterns))

        for entry in dump.parse():
            if templateRegex.search(entry.text):
                page = pywikibot.Page(mysite, entry.title)
                yield page
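
To make the regular expression built above concrete, here is a small standalone check that uses "Vfd" as a sample template name (the name is only an illustration):

# Standalone check of the template regex from Example #4, using "Vfd" as a
# sample template name.
import re

templatePattern = '[Vv]fd'
templateRegex = re.compile(
    r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}' % templatePattern)

for sample in ('{{vfd}}', '{{Vfd|some reason}}', '{{msg:vfd}}', '{{notvfd}}'):
    print sample, bool(templateRegex.search(sample))
# the first three samples match, the last one does not
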
Example #5
 def test_XmlDumpFirstRev(self):
     pages = [r for r in xmlreader.XmlDump("data/article-pear.xml").parse()]
     self.assertEquals(1, len(pages))
     self.assertEquals(u"Automated conversion", pages[0].comment)
     self.assertEquals(u"Pear", pages[0].title)
     self.assertEquals(u"24278", pages[0].id)
     self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
     self.assertTrue(not pages[0].isredirect)
Example #6
    def __init__(self, xmlFilename, xmlStart, namespaces):
        self.xmlStart = xmlStart
        self.namespaces = namespaces
        self.skipping = bool(xmlStart)
        self.site = pywikibot.getSite()

        dump = xmlreader.XmlDump(xmlFilename)
        self.parser = dump.parse()
Example #7
 def test_XmlDumpAllRevs(self):
     pages = [
         r for r in xmlreader.XmlDump("data/article-pear.xml",
                                      allrevisions=True).parse()
     ]
     self.assertEquals(4, len(pages))
     self.assertEquals(u"Automated conversion", pages[0].comment)
     self.assertEquals(u"Pear", pages[0].title)
     self.assertEquals(u"24278", pages[0].id)
     self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
     self.assertEquals(u"Quercusrobur", pages[1].username)
     self.assertEquals(u"Pear", pages[0].title)
Example #8
 def __iter__(self):
     import xmlreader
     mysite = pywikibot.getSite()
     dump = xmlreader.XmlDump(self.xmlFilename)
     for entry in dump.parse():
         if mysite.nocapitalize:
             title = re.escape(entry.title)
         else:
             title = '[%s%s]%s' % (re.escape(
                 entry.title[0].lower()), re.escape(
                     entry.title[0].upper()), re.escape(entry.title[1:]))
         selflinkR = re.compile(r'\[\[' + title + r'(\|[^\]]*)?\]\]')
         if selflinkR.search(entry.text):
             yield pywikibot.Page(mysite, entry.title)
             continue
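
A quick check of the case-tolerant self-link pattern built above, using "Pear" as a sample title:

# Quick check of the self-link pattern from Example #8 with a sample title.
import re

entry_title = u'Pear'
title = '[%s%s]%s' % (re.escape(entry_title[0].lower()),
                      re.escape(entry_title[0].upper()),
                      re.escape(entry_title[1:]))
selflinkR = re.compile(r'\[\[' + title + r'(\|[^\]]*)?\]\]')
print bool(selflinkR.search(u'A [[pear]] is a fruit.'))         # True
print bool(selflinkR.search(u'See [[Pear tree|pear trees]].'))  # False
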
Example #9
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        self.skipping = bool(xmlStart)

        self.excsInside = []
        if "inside-tags" in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            self.excsInside += self.exceptions['inside']
        import xmlreader
        self.site = pywikibot.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()
Example #10
def main(*args):
    genFactory = pagegenerators.GeneratorFactory()
    # If xmlfilename is None, references will be loaded from the live wiki.
    xmlfilename = None
    user = None
    skip = False
    timestamp = None
    # read command line parameters
    for arg in pywikibot.handleArgs(*args):
        xmlfilename = arg

    print xmlfilename

    importsite = "speedydeletion"

    outsite = pywikibot.getSite("en", importsite)
    outsite.forceLogin()

    dump = xmlreader.XmlDump(xmlfilename)
    count = 0
    for entry in dump.parse():
        #        print  file_store[entry.title]
        title = entry.title.encode("ascii", "ignore")

        m = re.search("Wikipedia:", entry.title)
        if m:
            pywikibot.output(u'skipping %s' % entry.title)
            continue
        if entry.title != "Main Page":
            try:
                if (file_store[title]):
                    count = count + 1


#                    pywikibot.output(u'was cached %s' % entry.title)
                else:
                    pywikibot.output(u'not exists %s' % entry.title)
            except KeyError:
                print sys.exc_type, ":", "%s is not in the list." % sys.exc_value
                pywikibot.output(u'key error %s' % entry.title)
                try:
                    outpage = pywikibot.Page(outsite, entry.title)
                    if outpage.exists():
                        pywikibot.output(u'there is an article %s' %
                                         entry.title)
                        file_store[title] = 1
                    else:
                        pywikibot.output(u'is not there  %s' % entry.title)
                        contents = entry.text
                        usernames = entry.username
                        contents = contents + "\n{{wikipedia-deleted|%s}}" % usernames
                        outpage.put(contents)
                    try:
                        file_store[title] = 1
                    except:
                        pywikibot.output(
                            u'could not save %s! to the list of article' %
                            entry.title)
                finally:
                    count = count + 1
            finally:
                count = count + 1
Example #11
# -*- coding: UTF-8 -*-
import wikipedia, xmlreader, codecs, re, json, sys

dump = xmlreader.XmlDump(
    "/data/project/dexbot/pywikipedia-git/wikidatawiki-20150603-pages-articles.xml.bz2"
)
langs = ['en', 'fa']
a = 0
db = {}
with codecs.open(
        '/data/project/dexbot/pywikipedia-git/snowball2_%s_%s.txt' %
    (langs[0], langs[1]), 'w', 'utf-8') as f:
    f.write('')


def sep(lang):
    if lang == 'ja':
        return u'・'
    if lang == 'zh':
        return u'·'
    return ' '


def _make_old_dict(_contents):
    """Convert the new dictionary to the old one for consistency."""
    if isinstance(_contents.get('claims', {}),
                  list) and not _contents.get('sitelinks'):
        return _contents
    old_dict = _contents
    new_dict = {
        'links': {},
Example #12
def main(*args):
    genFactory = pagegenerators.GeneratorFactory()
    # If xmlfilename is None, references will be loaded from the live wiki.
    xmlfilename = None
    user = None
    skip = False
    timestamp = None
    # read command line parameters
    for arg in pywikibot.handleArgs(*args):
        xmlfilename = arg

    print xmlfilename 

    importsite = "speedydeletion"

    outsite = pywikibot.getSite("en",importsite)
    outsite.forceLogin()

    mysite = pywikibot.getSite()
    dump = xmlreader.XmlDump(xmlfilename) #, allrevisions=True
    count = 0
    for entry in dump.parse():

        print entry.username
#        print entry.revisionid

        if entry.title != "Main Page" :
            page = pywikibot.Page(mysite, entry.title)


            try :
                if (file_store[entry.title] ) :
                    pywikibot.output(u'skipping at %s' % entry.title)
                    count = count +1                    
            except:
                try :
                    pywikibot.output(u'updating %s' % entry.title)
                    outpage = pywikibot.Page(outsite, entry.title)
                    contents = ""
                    try:
                        contents  = outpage.get()
                    except pywikibot.NoPage:
                        contents = ""
                    except pywikibot.IsRedirectPage:
                        print "skipping redirect"

                    if (not(contents)):
                        contents = entry.text
                        
                    usernames = entry.username
                    print "http://%s%s" % (outpage.site().hostname(), outpage.site().nice_get_address(outpage.title()))
                    
                    match = re.search(r'\{(wikipedia-deleted)',contents)

                    if (match == None) :

                        contents = contents +  "\n{{wikipedia-deleted|%s}}" % usernames
                        try :
                            status = outpage.put(contents, "adding the username %s" % usernames)
                        except pywikibot.exceptions.LockedPage:
                            print "locked, skipping"

                    else:
                        print match
                        print match.group(0)                        
                        pywikibot.output(u'skipping, already done %s' % entry.title)
                        
#                    except:
#                        print "hiccup"
#                    finally :
#                        count = count + 1

                    file_store[entry.title] = entry.title
                finally:
                    count = count + 1
            finally:
                count = count + 1
                print "done with %s %d" % (entry.title, count)
Example #13
#
# MIT license
#http://dumps.wikimedia.your.org/fawiki/
#
#fawiki-20140802-pages-meta-current.xml.bz2
#http://dumps.wikimedia.your.org/fawiki/20150325/fawiki-20150325-pages-meta-current.xml.bz2
import wikipedia, xmlreader, codecs, re
import os
bot_adress = "/data/project/rezabot/"
TheDay = '20150325'

urllinkmain = 'http://dumps.wikimedia.your.org/fawiki/%s/fawiki-%s-pages-meta-current.xml.bz2' % (
    TheDay, TheDay)
print urllinkmain
#os.system('wget '+urllinkmain +" "+bot_adress+"fawiki-"+TheDay+"-pages-meta-current.xml.bz2")
dump = xmlreader.XmlDump(bot_adress + "fawiki-" + TheDay +
                         "-pages-meta-current.xml.bz2")

pre, noinclude, includeonly, tags1, tags2 = u'\n', u'\n', u'\n', u'\n', u'\n'
for entry in dump.new_parse():
    if entry.ns == '0':
        text = entry.text.replace(u' /', u'/').replace(u'/ ', u'/').replace(
            u'< ', u'<').replace(u' >', u'>')

        if u'<noinclude>' in text or u'</noinclude>' in text:
            noinclude += u"#[[%s]]\n" % entry.title
        elif u'<includeonly>' in text or u'</includeonly>' in text:
            includeonly += u"#[[%s]]\n" % entry.title
        elif u'<pre>' in text or u'</pre>' in text:
            pre += u"#[[%s]]\n" % entry.title
        elif u'__NOGALLERY__' in text:
            tags1 += u"#[[%s]]\n" % entry.title
Example #14
 def __init__(self, api):
     self.api = api
     self.dump = xmlreader.XmlDump(DUMP)
Example #15
def main():
    """Missing articles"""
    xml = xmlreader.XmlDump(
        '%s%s' % (dumppath and '%s/' % dumppath or '', dumpfilename),
        allrevisions=False)
    c = 0
    bios = 0
    global skip
    if skip:
        print 'Skipping to...', skip
    for x in xml.parse():  #parsing the whole dump, one page a time
        c += 1
        if c % 10000 == 0:
            print 'Total pages analysed =', c, '| Bios =', bios
        if skip:
            if x.title == skip:
                skip = ''
            continue

        #filter out pages that are not useful
        if re.search(title_ex_r, x.title) or \
           re.search(red_r, x.text) or \
           re.search(dis_r, x.text) or \
           len(x.text.splitlines()) < 3 or len(x.text) < 1024*2:
            continue

        #if it has interwikis to the target language, skip it; the bio already exists there
        if re.search(iws_target_r, x.text):
            continue

        #the name must have at least two long words
        trozos = [
        ]  # do not assign from the for loop directly, or it stores True/False instead of the pieces
        [
            len(trozo) >= 3 and trozos.append(trozo)
            for trozo in x.title.split(' ')
        ]
        if not len(trozos) >= 2:
            continue
        #also add accent-free variants
        [(trozo != quitaracentos(trozo) and trozo not in trozos)
         and trozos.append(quitaracentos(trozo)) for trozo in trozos]

        #discard some bios
        if not re.search(birth_r, x.text) or not re.search(
                death_r, x.text):  #if it is a living person (BLP), skip
            continue
        #if we cannot extract a birth or death year, skip
        if not re.search(birth_r, x.text) and not re.search(
                death_r, x.text) and bdtemplate_r.has_key(
                    lang) and not re.search(bdtemplate_r[lang], x.text):
            continue

        print 'Analysing http://%s.wikipedia.org/wiki/%s' % (
            lang, re.sub(' ', '_', x.title))

        #look for images useful for the bio
        images = re.findall(ur"(?im)[\s\/\:\|\=]+([^\/\:\|\=]+\.jpe?g)[\s\|]",
                            x.text)
        image_cand = ''
        if images:
            for image in images:
                if len(re.findall(ur"(%s)" % ('|'.join(trozos)), image)) >= 1:
                    image_cand = image
                    break
        if image_cand:
            print 'We have image_cand'
        else:
            print 'No image_cand'
            #continue

        #description
        desc = re.findall(
            ur"(?im)^(\'{2,5}\s*.{,25}\s*%s[^\n\r]+)[\n\r]" %
            (x.title.split(' ')[0]), x.text)
        if not desc:
            print 'No description'
            continue
        else:
            print 'We have description'
            desc = desc[0]

        #birth and death dates
        birthdate = ''
        deathdate = ''
        #first try with birth/death categories
        m = birth_r.finditer(x.text)
        for i in m:
            birthdate = i.group('birthyear')
            break
        m = death_r.finditer(x.text)
        for i in m:
            deathdate = i.group('deathyear')
            break

        #second attempt uses bio first paragraph
        if not birthdate and not deathdate:
            m = dates_r[lang].finditer(desc)
            for i in m:
                """birthmonth = ''
                if i.group('birthday') and i.group('birthmonth'):
                    if monthstoen.has_key(quitaracentos(i.group('birthmonth').lower())):
                        birthmonth = monthstoen[i.group('birthmonth').lower()]
                deathmonth = ''
                if i.group('deathday') and i.group('deathmonth'):
                    if monthstoen.has_key(quitaracentos(i.group('deathmonth').lower())):
                        deathmonth = monthstoen[i.group('deathmonth').lower()]
                if birthmonth:
                    #continue #temp
                    birthdate = u'%s %s, %s' % (birthmonth, i.group('birthday'), i.group('birthyear'))
                else:
                    birthdate = u'%s' % (i.group('birthyear'))
                if deathmonth:
                    #continue #temp
                    deathdate = u'%s %s, %s' % (deathmonth, i.group('deathday'), i.group('deathyear'))
                else:
                    deathdate = u'%s' % (i.group('deathyear'))"""
                birthdate = i.group('birthyear')
                deathdate = i.group('deathyear')
                break

        #third case uses special templates
        #special cases for es: {{BD|XXXX|YYYY|DEFAULTSORT}}, or vi:, or others
        if not birthdate and not deathdate and bdtemplate_r.has_key(lang):
            m = bdtemplate_r[lang].finditer(x.text)
            for i in m:
                birthdate = u'%s' % (i.group('birthyear'))
                deathdate = u'%s' % (i.group('deathyear'))
                break

        if birthdate and deathdate:
            print 'We have birthdate and deathdate'
            if (int(deathdate[-4:]) -
                    int(birthdate[-4:])) < 20:  #weird, child prodigy?
                print 'But dates are weird', birthdate, deathdate
                continue  #skipping bio
        else:
            print 'No birthdate or deathdate'
        #end birth and death dates

        #defaultsort
        m = defaultsort_r.finditer(x.text)
        defaultsort = ''
        for d in m:
            defaultsort = d.group("defaultsort")
            break
        if not defaultsort and bdtemplate_r.has_key(lang):
            m = bdtemplate_r[lang].finditer(x.text)
            for i in m:
                defaultsort = u'%s' % (i.group('defaultsort'))
                break
        if not defaultsort:  #create myself
            defaultsort = u'%s, %s' % (' '.join(
                quitaracentos(x.title).split(' ')[1:]), quitaracentos(
                    x.title).split(' ')[0])

        #iws
        m = iws_r.finditer(x.text)
        iws = []
        for iw in m:
            if not iw.group('iwlang') in [targetlang, lang]:
                iws.append([iw.group('iwlang'), iw.group('iwtitle')])
        iws.append([lang, x.title])
        if len(iws) < minimumiws:
            print 'No minimum interwikis'
            continue  # we need this language plus at least one other wiki
        print 'We have %d interwikis' % len(iws)
        iws.sort()
        iws_plain = ''
        for iwlang, iwtitle in iws:
            iws_plain += u'[[%s:%s]]\n' % (iwlang, iwtitle)

        if desc and len(desc) < 2500 and birthdate and deathdate:
            #check if live version has interwiki or not
            sourcebio = wikipedia.Page(wikipedia.Site(lang, 'wikipedia'),
                                       x.title)
            if not sourcebio.exists():
                print 'Page does not exist'
                continue
            if sourcebio.isRedirectPage():
                print 'Page is redirect'
                continue
            if sourcebio.isDisambig():
                print 'Page is disambig'
                continue
            if len(re.findall(iws_target_r, sourcebio.get())) != 0:
                print 'Found iw to target lang in the current version of article'
                continue

            #cats: this is the most time-consuming step, so we leave it for this last if, just before generating the output
            m = cats_r.finditer(x.text)
            cats = []
            [
                translatecat(cat.group('catname'), lang)
                and translatecat(cat.group('catname'), lang) not in cats
                and cats.append(translatecat(cat.group('catname'), lang))
                for cat in m
            ]
            cats.sort()

            #nationality
            nationality = ''
            if cats:
                n = [cat.split(' ')[0] for cat in cats]
                for nn in n:
                    if nn in nationalitytonation.keys():
                        if nationality:
                            if nn != nationality:  #conflict, several nationalities for this bio, blank nationality and exit
                                nationality = ''
                                break
                        else:
                            nationality = nn
                    else:
                        if not nn.isdigit():
                            f = open('missingarticlesxml.output.errors', 'a')
                            f.write((u'missing nationality = %s\n' %
                                     (nn)).encode('utf-8'))
                            f.close()

            if nationality:
                print 'We have nationality'
            else:
                print 'No nationality found'
                continue

            #occupations (using the cats)
            occupations = []
            if nationality:
                for cat in cats:
                    t = cat.split(' ')
                    if (
                            t[0] == nationality
                            or t[0].split('-')[0] == nationality
                    ) and len(
                            t
                    ) == 2:  # [[Category:Spanish writers]] [[Category:Spanish-language writers]]
                        if t[1][-3:] == 'ies':
                            if not '%sy' % t[1].rstrip('ies') in occupations:
                                occupations.append(
                                    '%sy' % t[1].rstrip('ies')
                                )  #remove final ies and add y
                        elif t[1][-1] == 's':
                            if not t[1].rstrip('s') in occupations:
                                occupations.append(
                                    t[1].rstrip('s'))  #remove final s
                        elif t[1] == 'businesspeople':
                            if not 'businessman' in occupations:
                                occupations.append('businessman')

            if occupations:
                print 'We have occupation'
            else:
                print 'No occupations found'
                continue

            #the output for this bio
            output = u"""\n<br clear="all"/>\n==== [[%s]] ([[:%s:%s|%s]]) ====""" % (
                x.title, lang, x.title, lang)
            if image_cand:
                output += u"""\n[[File:%s|thumb|right|120px|%s]]""" % (
                    image_cand, x.title)
            output += u"""\n<small><nowiki>%s</nowiki></small>""" % (
                linkstoiws(desc, lang).strip())
            output += u"""\n<pre>"""
            output += u"""\n{{Expand %s|%s}}""" % (langisotolang[lang],
                                                   x.title)
            if image_cand:
                output += u"""\n[[File:%s|thumb|right|%s]]""" % (image_cand,
                                                                 x.title)
            output += u"""\n\'\'\'%s\'\'\' (%s–%s) was %s %s %s.""" % (
                x.title, birthdate, deathdate,
                nationality and nationalitytonation[nationality][0]
                in ['A', 'E', 'I', 'O', 'U'] and 'an' or 'a', nationality
                and '[[%s|%s]]' %
                (nationalitytonation[nationality], nationality),
                occupations and
                (len(occupations) > 1 and '%s and %s' %
                 (', '.join(occupations[:-1]), occupations[-1:][0])
                 or occupations[0]) or '...')
            output += u"""\n\n{{Persondata <!-- Metadata: see [[Wikipedia:Persondata]]. -->"""
            output += u"""\n| NAME              = %s """ % (defaultsort)
            output += u"""\n| ALTERNATIVE NAMES = """
            output += u"""\n| SHORT DESCRIPTION = """
            output += u"""\n| DATE OF BIRTH     = %s """ % (birthdate)
            output += u"""\n| PLACE OF BIRTH    = """
            output += u"""\n| DATE OF DEATH     = %s """ % (deathdate)
            output += u"""\n| PLACE OF DEATH    = """
            output += u"""\n}}"""
            output += u"""\n{{DEFAULTSORT:%s}}""" % (defaultsort)
            if cats:
                output += u"""\n"""
                for cat in cats:
                    if not cat in ['Men', 'Women', 'Fascists'
                                   ] and not cat.startswith('Death'):
                        output += u"""\n[[Category:%s]]""" % (cat)
            output += u"""\n\n%s""" % (iws_plain)
            output += u"""\n%s""" % (
                nationality and nationalitytonation[nationality]
                and '{{%s-bio-stub}}' %
                (nationalitytonation[nationality]) or '{{bio-stub}}')
            output += u"""\n</pre>"""

            #last replacements...
            output = re.sub(ur"{{United States-bio-stub}}",
                            ur"{{US-bio-stub}}", output)
            output = re.sub(ur"{{Czech Republic-bio-stub}}",
                            ur"{{Czech-bio-stub}}", output)
            #end last

            print '#' * 70
            print x.title, 'https://%s.wikipedia.org/wiki/%s' % (
                lang, x.title.replace(' ', '_'))
            print output
            bios += 1
            print 'Total pages analysed =', c, '| Bios =', bios
            f = open(
                'missingarticlesxml.output.%s.%02d.txt' % (lang, len(iws)),
                'a')
            f.write(output.encode('utf-8'))
            f.close()
Example #16
# -*- coding: utf-8 -*-
import codecs
import re

import xmlreader
dump = xmlreader.XmlDump("fawiki-20150325-pages-articles.xml.bz2")
a = 0
f = codecs.open("markup.txt", "w", "utf-8")
f.write("")
f.close()
rer = re.compile(ur'(<table|<pre>\s*?</pre>|<noinclude>\s*?</noinclude>|'
                 '<includeonly>\s*?</includeonly>|__NOGALLERY__|'
                 '__NOEDITSECTION__|__TOC__|__NOTOC__)')
for entry in dump.new_parse():
    if entry.ns in ['0', '14', '6', '4']:
        if rer.search(entry.text):
            a += 1
            print "found one: %d" % a
            f = codecs.open("markup.txt", "a", "utf-8")
            f.write(u"[[%s]]\n" % entry.title)
            f.close()
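
To see what the rer expression above actually catches, a tiny check on a few sample snippets (this sketch reuses the rer pattern compiled in Example #16):

# Tiny check of the markup regex compiled above.
for sample in (u'<table class="wikitable">', u'__NOTOC__',
               u'<pre>   </pre>', u'plain text'):
    print sample, bool(rer.search(sample))
# the first three samples match, the last one does not
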
Example #17
 def __init__(self, xmlfilename):
     import xmlreader
     self.xmldump = xmlreader.XmlDump(xmlfilename)
Example #18
 def do_main_only(namespaces):
     for entry in xmlreader.XmlDump(dump).parse():
         if not ":" in entry.title:
             yield entry
         elif not entry.title[:entry.title.find(":")] in namespaces:
             yield entry
Example #19
 def test_XmlDumpRedirect(self):
     pages = [
         r for r in xmlreader.XmlDump("data/article-pyrus.xml").parse()
     ]
     self.assertTrue(pages[0].isredirect)
Example #20
 def do_namespace(namespace):
     for entry in xmlreader.XmlDump(dump).parse():
         if entry.title[:entry.title.find(":")] == namespace:
             yield entry
Example #21
        return 5

def cleantitle(title):
    title = re.sub(ur"[&]", ur"-", title)
    return title

dumppath = ''
if len(sys.argv) == 2:
    dumpfilename = sys.argv[1]

#download commons dump
dumppath = '/mnt/user-store/emijrp'
dumpfilename = 'commonswiki-latest-pages-articles.xml.bz2'
os.system('wget -c http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2 -O %s/%s' % (dumppath, dumpfilename))

xml = xmlreader.XmlDump('%s%s' % (dumppath and '%s/' % dumppath or '', dumpfilename), allrevisions=False)
path = '/home/emijrp/public_html/commonsexplorer'
errors = 0
minpics = 1 #min pics to show for year
maximages = 100000 #max images to show in the sum all years
maxyear = 2000
minyear = 1850
c = 0
s = 0
coord_dec_r = re.compile(ur"(?im)(?P<all>{{\s*(Location dec|Object location dec)\s*\|\s*(?P<lat>[\d\.\-\+]+)\s*\|\s*(?P<lon>[\d\.\-\+]+)\s*\|?\s*[^\|\}]*\s*}})")
coord_r = re.compile(ur"(?im)(?P<all>{{\s*(Location|Object location)\s*\|\s*(?P<lat_d>[\d\.\-\+]+)\s*\|\s*(?P<lat_m>[\d\.\-\+]+)\s*\|\s*(?P<lat_s>[\d\.\-\+]+)\s*\|\s*(?P<lat>[NS])\s*\|\s*(?P<lon_d>[\d\.\-\+]+)\s*\|\s*(?P<lon_m>[\d\.\-\+]+)\s*\|\s*(?P<lon_s>[\d\.\-\+]+)\s*\|\s*(?P<lon>[EW])\s*\|?\s*[^\|\}]*\s*}})")
date_r = re.compile(ur"(?im)^\s*\|\s*Date\s*=\s*(?P<date>(\d{4}(-\d{2}-\d{2})?))\D")
description_r = re.compile(ur"(?im)\{\{\s*en\s*\|\s*(1\s*\=)?\s*(?P<description>[^\{\}]{10,300})\s*\}\}")
exclude_images_r = re.compile(ur"(?im)\b(maps?|mapa)\b")

images_by_year = {}
Example #22
File: redirect.py  Project: moleculea/ess
    def get_redirects_from_dump(self, alsoGetPageTitles=False):
        '''
        Load a local XML dump file, look at all pages which have the
        redirect flag set, and find out where they're pointing at. Return
        a dictionary where the redirect names are the keys and the redirect
        targets are the values.
        '''
        xmlFilename = self.xmlFilename
        redict = {}
        # open xml dump and read page titles out of it
        dump = xmlreader.XmlDump(xmlFilename)
        redirR = self.site.redirectRegex()
        readPagesCount = 0
        if alsoGetPageTitles:
            pageTitles = set()
        for entry in dump.parse():
            readPagesCount += 1
            # always print status message after 10000 pages
            if readPagesCount % 10000 == 0:
                pywikibot.output(u'%i pages read...' % readPagesCount)
            if len(self.namespaces) > 0:
                if pywikibot.Page(self.site, entry.title).namespace() \
                        not in self.namespaces:
                    continue
            if alsoGetPageTitles:
                pageTitles.add(entry.title.replace(' ', '_'))

            m = redirR.match(entry.text)
            if m:
                target = m.group(1)
                # There might be redirects to another wiki. Ignore these.
                for code in self.site.family.iwkeys:
                    if target.startswith('%s:' % code) \
                            or target.startswith(':%s:' % code):
                        if code == self.site.language():
                            # link to our wiki, but with the lang prefix
                            target = target[(len(code) + 1):]
                            if target.startswith(':'):
                                target = target[1:]
                        else:
                            pywikibot.output(
                                u'NOTE: Ignoring %s which is a redirect to %s:'
                                % (entry.title, code))
                            target = None
                            break
                # if the redirect does not link to another wiki
                if target:
                    source = entry.title.replace(' ', '_')
                    target = target.replace(' ', '_')
                    # remove leading and trailing whitespace
                    target = target.strip('_')
                    # capitalize the first letter
                    if not pywikibot.getSite().nocapitalize:
                        source = source[:1].upper() + source[1:]
                        target = target[:1].upper() + target[1:]
                    if '#' in target:
                        target = target[:target.index('#')].rstrip("_")
                    if '|' in target:
                        pywikibot.output(
                            u'HINT: %s is a redirect with a pipelink.' %
                            entry.title)
                        target = target[:target.index('|')].rstrip("_")
                    if target:  # in case preceding steps left nothing
                        redict[source] = target
        if alsoGetPageTitles:
            return redict, pageTitles
        else:
            return redict
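
The core of get_redirects_from_dump() is the site's redirect regex. A rough standalone sketch of that step, using the common "#REDIRECT [[Target]]" form as a simplified stand-in for site.redirectRegex():

# Rough standalone sketch of the redirect-target extraction in Example #22.
# The pattern below is a simplified stand-in for site.redirectRegex().
import re

redirR = re.compile(r'(?i)#REDIRECT\s*:?\s*\[\[(.+?)(?:\]|\|)')
sample = u'#REDIRECT [[Pyrus communis#Cultivation]]'
m = redirR.match(sample)
if m:
    target = m.group(1).replace(' ', '_')
    if '#' in target:
        target = target[:target.index('#')].rstrip('_')
    print target   # Pyrus_communis
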
Example #23
def main():
    """Localisation for dates (YYYY-MM-DD)"""

    month2number = {
        #English
        u"en": {
            u"january": u"01",
            u"jan": u"01",
            u"february": u"02",
            u"feb": u"02",
            u"march": u"03",
            u"mar": u"03",
            u"april": u"04",
            u"apr": u"04",
            u"may": u"05",
            u"june": u"06",
            u"jun": u"06",
            u"july": u"07",
            u"jul": u"07",
            u"august": u"08",
            u"aug": u"08",
            u"september": u"09",
            u"sep": u"09",
            u"sept": u"09",
            u"october": u"10",
            u"oct": u"10",
            u"november": u"11",
            u"nov": u"11",
            u"december": u"12",
            u"dec": u"12",
        },

        #Spanish
        u"es": {
            u"enero": u"01",
            u"ene": u"01",
            u"febrero": u"02",
            u"feb": u"02",
            u"marzo": u"03",
            u"mar": u"03",
            u"abril": u"04",
            u"abr": u"04",
            u"mayo": u"05",
            u"may": u"05",
            u"junio": u"06",
            u"jun": u"06",
            u"julio": u"07",
            u"jul": u"07",
            u"agosto": u"08",
            u"ago": u"08",
            u"agos": u"08",
            u"setiembre": u"09",
            u"septiembre": u"09",
            u"sep": u"09",
            u"sept": u"09",
            u"octubre": u"10",
            u"oct": u"10",
            u"noviembre": u"11",
            u"nov": u"11",
            u"diciembre": u"12",
            u"dic": u"12",
        },

        #French
        u"fr": {
            u"janvier": u"01",
            u"jan": u"01",
            u"février": u"02",
            u"fevrier": u"02",
            u"mars": u"03",
            u"avril": u"04",
            u"avr": u"04",
            u"mai": u"05",
            u"juin": u"06",
            u"juillet": u"07",
            u"août": u"08",
            u"aout": u"08",
            u"septembre": u"09",
            u"sept": u"09",
            u"sep": u"09",
            u"octobre": u"10",
            u"oct": u"10",
            u"novembre": u"11",
            u"nov": u"11",
            u"décembre": u"12",
            u"decembre": u"12",
            u"dec": u"12",
        },

        #German
        u"de": {
            u"januar": u"01",
            u"jan": u"01",
            u"februar": u"02",
            u"feb": u"02",
            u"märz": u"03",
            u"marz": u"03",
            u"mar": u"03",
            u"april": u"04",
            u"apr": u"04",
            u"mai": u"05",
            u"juni": u"06",
            u"juli": u"07",
            u"august": u"08",
            u"aug": u"08",
            u"september": u"09",
            u"sept": u"09",
            u"sep": u"09",
            u"oktober": u"10",
            u"okt": u"10",
            u"november": u"11",
            u"nov": u"11",
            u"dezember": u"12",
            u"dez": u"12",
        },

        #Italian
        u"it": {
            u"gennaio": u"01",
            u"gen": u"01",
            u"febbraio": u"02",
            u"feb": u"02",
            u"marzo": u"03",
            u"mar": u"03",
            u"aprile": u"04",
            u"apr": u"04",
            u"maggio": u"05",
            u"mag": u"05",
            u"giugno": u"06",
            u"luglio": u"07",
            u"agosto": u"08",
            u"ago": u"08",
            u"settembre": u"09",
            u"sett": u"09",
            u"set": u"09",
            u"ottobre": u"10",
            u"ott": u"10",
            u"novembre": u"11",
            u"nov": u"11",
            u"diciembre": u"12",
            u"dic": u"12",
        },

        #Nederlands
        u"nl": {
            u"januari": u"01",
            u"jan": u"01",
            u"februari": u"02",
            u"feb": u"02",
            u"maart": u"03",
            u"april": u"04",
            u"apr": u"04",
            u"mei": u"05",
            u"juni": u"06",
            u"juli": u"07",
            u"augustus": u"08",
            u"aug": u"08",
            u"september": u"09",
            u"sept": u"09",
            u"sep": u"09",
            u"oktober": u"10",
            u"okt": u"10",
            u"november": u"11",
            u"nov": u"11",
            u"december": u"12",
            u"dec": u"12",
        },

        #Polski
        u"pl": {
            u"styczeń": u"01",
            u"luty": u"02",
            u"marzec": u"03",
            u"kwiecień": u"04",
            u"maj": u"05",
            u"czerwiec": u"06",
            u"lipiec": u"07",
            u"sierpień": u"08",
            u"wrzesień": u"09",
            u"październik": u"10",
            u"listopad": u"11",
            u"grudzień": u"12",
        },

        #Portuguese
        u"pt": {
            u"janeiro": u"01",
            u"jan": u"01",
            u"fevereiro": u"02",
            u"fev": u"02",
            u"março": u"03",
            u"mar": u"03",
            u"abril": u"04",
            u"abr": u"04",
            u"maio": u"05",
            u"junho": u"06",
            u"julho": u"07",
            u"agosto": u"08",
            u"setembro": u"09",
            u"outubro": u"10",
            u"novembre": u"11",
            u"dezembro": u"12",
        },
    }

    #regexps
    spliter1 = ur'[\s\-\,\.\/\\]*'  #splitter for months given as words
    spliter2 = ur''  #TODO: splitter for dates with numeric months
    suffix1 = ur'[\s\.]*(st|nd|rd|th)?[\s\.]*'  # March 1st, ..., not mandatory
    regexp_r = {
        'en-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (suffix1, spliter1, '|'.join(
                month2number['en'].keys()), spliter1)),
        'en-monthddyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<month>%s)%s(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s%s(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(
                month2number['en'].keys()), spliter1, suffix1, spliter1)),
        'es-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])\s+de\s+(?P<month>%s)\s+de\s+(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['es'].keys()))),
        'fr-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])\s+(?P<month>%s)\s+(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['fr'].keys()))),
        'de-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['de'].keys()), spliter1)),
        'it-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['it'].keys()), spliter1)),
        'nl-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['nl'].keys()), spliter1)),
        'pl-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])%s(?P<month>%s)%s(?P<year>\d{4}))(?P<end>\s*))$"
            % (spliter1, '|'.join(month2number['pl'].keys()), spliter1)),
        'pt-ddmonthyyyy':
        re.compile(
            ur"(?im)^(?P<all>(?P<ini>\s*\|\s*Date\s*=\s*)(?P<date>(?P<day>[1-9]|1[0-9]|2[0-9]|3[0-1])\s+de\s+(?P<month>%s)\s+de\s+(?P<year>\d{4}))(?P<end>\s*))$"
            % ('|'.join(month2number['pt'].keys()))),
    }

    dumpfilename = ''
    modes = []
    skip = u''  #'File:Lagothrix lagotricha.jpg'
    if len(sys.argv) >= 2:
        dumpfilename = sys.argv[1]
    else:
        print 'python script.py dumpfilename [mode] [skipuntilthispage]'
        sys.exit()
    if len(sys.argv) >= 3:  #en1, fr1, etc, regexps
        if sys.argv[2] != 'all':
            modes = [sys.argv[2]]
    if not modes:
        modes = regexp_r.keys()
    if len(sys.argv) >= 4:
        skip = re.sub('_', ' ', sys.argv[3])

    xml = xmlreader.XmlDump(dumpfilename, allrevisions=False)
    c = 0
    if skip:
        print 'Skipping to...', skip
    for x in xml.parse():  #parsing the whole dump
        if not x.title.strip().startswith('File:'):
            continue
        c += 1
        if skip:
            if x.title.strip() != skip:
                continue
            else:
                skip = ''

        for mode in modes:
            m = re.findall(regexp_r[mode], x.text)  # check dump text
            if m:
                print c, 'Candidate found in dump: ', x.title

                page = wikipedia.Page(wikipedia.Site("commons", "commons"),
                                      x.title)
                if not page.exists() or page.isRedirectPage(
                ) or page.isDisambig():
                    print '  Page not found, deleted or redirect?'
                    continue  #next page in dump
                if not page.canBeEdited():
                    print '  Page cannot be edited, protected?'
                    continue  #next page in dump

                wtext = page.get()
                newtext = wtext

                if re.findall(regexp_r[mode], wtext):
                    m = re.finditer(
                        regexp_r[mode], wtext
                    )  # check live text to verify that the date is still in Commons page
                    for i in m:
                        print '  Commons page has a date to translate:', x.title

                        #text to remove
                        if mode in [
                                'en-ddmonthyyyy',
                                'en-monthddyyyy',
                                'es-ddmonthyyyy',
                                'fr-ddmonthyyyy',
                                'de-ddmonthyyyy',
                                'it-ddmonthyyyy',
                                'nl-ddmonthyyyy',
                                'pl-ddmonthyyyy',
                                'pt-ddmonthyyyy',
                        ]:
                            regexp_rep = i.group('all')
                        elif False:  #other modes...
                            pass

                        #text to insert
                        monthname = i.group('month').strip().lower()
                        if mode in [
                                'en-ddmonthyyyy',
                                'en-monthddyyyy',
                                'es-ddmonthyyyy',
                                'fr-ddmonthyyyy',
                                'de-ddmonthyyyy',
                                'it-ddmonthyyyy',
                                'nl-ddmonthyyyy',
                                'pl-ddmonthyyyy',
                                'pt-ddmonthyyyy',
                        ]:
                            regexp_sub = ur"%s%s-%s-%02d%s" % (
                                i.group('ini'), i.group('year'),
                                month2number[mode.split('-')[0]][monthname],
                                int(i.group('day')), i.group('end'))
                        elif False:  #other modes...
                            pass

                        newtext = newtext.replace(
                            regexp_rep, regexp_sub,
                            1)  #replace only the first occurrence
                        if wtext != newtext:  #submit only if difference appears
                            wikipedia.showDiff(wtext, newtext)
                            page.put(
                                newtext,
                                u"BOT - Changes to allow localization: %s → %s"
                                % (regexp_rep, regexp_sub))

                        break  #only one replacement and break
                else:
                    print '  Text in Commons page does not contain a date to be localised'
                break  #only one mode, then skip to the following page
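
A simplified, self-contained illustration of what the localisation above does to a single Date field (the regex here is a cut-down version of the real en-ddmonthyyyy expression):

# Simplified illustration of the date localisation performed in Example #23:
# rewrite "|Date=1 March 1999" as "|Date=1999-03-01".
import re

month2number_en = {u'march': u'03'}   # excerpt of the full month table above
date_r = re.compile(
    ur"(?im)^(?P<ini>\s*\|\s*Date\s*=\s*)(?P<day>\d{1,2})\s+(?P<month>march)\s+(?P<year>\d{4})\s*$")

sample = u'|Date=1 March 1999'
m = date_r.search(sample)
if m:
    localised = u'%s%s-%s-%02d' % (m.group('ini'), m.group('year'),
                                   month2number_en[m.group('month').lower()],
                                   int(m.group('day')))
    print sample.replace(m.group(0), localised, 1)
# expected output: |Date=1999-03-01
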
Example #24
def main(*args):

    print "ARGS:%s\n" % sys.argv

    genFactory = pagegenerators.GeneratorFactory()
    # If xmlfilename is None, references will be loaded from the live wiki.
    xmlfilename = None
    user = None
    skip = False
    timestamp = None
    # read command line parameters
    for arg in pywikibot.handleArgs(*args):
        xmlfilename = arg
    print xmlfilename
    insite = pywikibot.getSite("en", "wikipedia")
    importsite = "speedydeletion"
    outsite = pywikibot.getSite("en", importsite)
    outsite.forceLogin()

    try:
        print "try to open %s\n" % xmlfilename
        with open(xmlfilename) as f:
            pass
    except:
        print "cannot open %s\n" % xmlfilename
        exit(0)

    if sys.argv[1] == "--validate":
        tempfile = "%s.tmp" % xmlfilename
        status = subprocess.call("xmllint --recover  %s -o %s" %
                                 (xmlfilename, tempfile),
                                 shell=True)
        print "status %d\n" % status
    else:
        tempfile = xmlfilename

    dump = xmlreader.XmlDump(tempfile)
    count = 0

    for entry in dump.parse():
        #        print  file_store[entry.title]
        title = entry.title.encode("utf8", "ignore")

        if re.search("^User:", entry.title) or re.search("^Wikipedia:", entry.title):
            #            pywikibot.output(u'skipping %s' % entry.title)
            continue
#        if re.search("^User:", entry.title) or re.search("^User Talk:", entry.title):
#            pywikibot.output(u'skipping %s' % entry.title)
#            continue
        if re.search(".css$", entry.title):
            #            pywikibot.output(u'skipping %s' % entry.title)
            continue
        if re.search("^Main Page", entry.title):
            #            pywikibot.output(u'skipping %s' % entry.title)
            continue


#        pywikibot.output(u'Considering %s' % entry.title)
        title = title.replace(":", "_")
        title = title.replace("!", "_")
        title = title.replace("/", "_")
        title = title.replace("\\", "_")
        title = decode(title)
        try:
            if (len(title) < 1):
                pywikibot.output(u'empty title:%s' % entry.title)
                continue

            if (file_store[title]):
                count = count + 1
            else:
                pywikibot.output(u'not exists %s' % entry.title)
        except KeyError:
            try:
                outpage = pywikibot.Page(site=outsite,
                                         title=entry.title,
                                         insite=outsite)

                exists = False
                try:
                    exists = outpage.exists()
                except:
                    pywikibot.output(
                        u'key error exiting article %s transformed to %s' %
                        (entry.title, title))

                if exists:
                    #pywikibot.output(u'there is an article %s' % entry.title)
                    try:
                        file_store[title] = 1
                    except KeyError:
                        pywikibot.output(
                            u'key error saving article %s transformed to %s' %
                            (entry.title, title))

                else:
                    pywikibot.output(u'is not there, adding  %s' % entry.title)
                    contents = entry.text
                    usernames = entry.username
                    if re.search('Template:', title):
                        contents = contents + "<noinclude>{{wikipedia-template|%s}}</noinclude>" % usernames
                    else:
                        contents = contents + "\n{{wikipedia-deleted|%s}}" % usernames
                    outpage._site = outsite
                    try:
                        outpage.put(contents)
                    except:
                        pywikibot.output(u'cannot put article %s / %s' %
                                         (entry.title, title))
                try:
                    file_store[title] = 1
                except KeyboardInterrupt:
                    print "Bye"
                    sys.exit()

                except KeyError:
                    pywikibot.output(
                        u'could not save %s! to the list of article' %
                        entry.title)

            except KeyboardInterrupt:
                print "Bye"
                sys.exit()
            except KeyError:
                pywikibot.output(u'problem with  %s! ' % entry.title)

            finally:
                count = count + 1

        except KeyboardInterrupt:
            print "Bye"
            sys.exit()
        except KeyError:
            pywikibot.output(u'problem2 with  %s! ' % entry.title)

        finally:
            count = count + 1
Example #25
    createDB(conn=conn, cursor=cursor)

    limit = 1000
    c = 0
    c_page = 0
    t1 = time.time()
    tt = time.time()
    
    r_internal_links = re.compile(ur'(?i)(\[\[[^\|\]\r\n]+?(\|[^\|\]\r\n]*?)?\]\])') #should external links, images, categories and interwikis be discounted?
    r_external_links = re.compile(ur'(?i)\b(ftps?|git|gopher|https?|irc|mms|news|svn|telnet|worldwind)://')
    # http://en.wikipedia.org/wiki/Special:SiteMatrix
    r_interwikis = re.compile(ur'(?i)(\[\[([a-z]{2,3}|simple|classical)(\-([a-z]{2,3}){1,2}|tara)?\:[^\[\]]+?\]\])')
    r_sections = re.compile(ur'(?im)^(={1,6})[^=]+\1')
    r_templates = re.compile(ur'(?im)(^|[^\{])\{\{[^\{\}\|]+[\}\|]') # {{T1|...}} or {{T1}}
    
    xml = xmlreader.XmlDump(dumpfilename, innerxml=xmlfilename, allrevisions=True)
    errors = 0
    errors_page = 0
    page_id = -1 #impossible value
    page_title = ''
    page_editcount = 0
    page_creation_timestamp = ''
    page_last_timestamp = ''
    page_text = ''
    page_size = 0
    page_internal_links = 0
    page_external_links = 0
    page_interwikis = 0
    page_sections = 0
    page_templates = 0
    rev_prev_text_for_diff = ''
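
Example #25 breaks off before the per-revision loop, but the counting regexes it compiles would be applied roughly like this (sample text only; the counts feed the page_* variables initialised above):

# Illustrative use of the counting regexes from Example #25 on one sample
# revision text; the real per-revision loop is not shown above.
sample_text = (u'Intro [[Foo]] and [[Bar|bar]].\n'
               u'== Section ==\n'
               u'{{Infobox|x=1}}\n'
               u'See http://example.org\n'
               u'[[en:Foo]]')
page_internal_links = len(r_internal_links.findall(sample_text))
page_external_links = len(r_external_links.findall(sample_text))
page_interwikis = len(r_interwikis.findall(sample_text))
page_sections = len(r_sections.findall(sample_text))
page_templates = len(r_templates.findall(sample_text))
print page_internal_links, page_external_links, page_interwikis, \
      page_sections, page_templates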