Example #1
def refresh_messages(site=None):
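    """Fetch the wiki's 'all messages' special page and parse it into a
    message key -> message text dictionary.
    """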
    site = site or wikipedia.getSite()
    # get 'all messages' special page's path
    path = site.allmessages_address()
    print 'Retrieving MediaWiki messages for %s' % repr(site)
    wikipedia.put_throttle()  # It actually is a get, but a heavy one.
    allmessages = site.getUrl(path)

    print 'Parsing MediaWiki messages'
    soup = BeautifulSoup(allmessages,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # The MediaWiki namespace in URL-encoded format, as it can contain
    # non-ASCII characters and spaces.
    quotedMwNs = urllib.quote(
        site.namespace(8).replace(' ', '_').encode(site.encoding()))
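    # Candidate href prefixes for links into the MediaWiki: namespace
    # (script path with ?title=, script path with /, the "nice" article
    # path, and a bare /MediaWiki: prefix).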
    mw_url = site.path() + "?title=" + quotedMwNs + ":"
    altmw_url = site.path() + "/" + quotedMwNs + ":"
    nicemw_url = site.nice_get_address(quotedMwNs + ":")
    shortmw_url = "/" + quotedMwNs + ":"
    ismediawiki = lambda url: url and (url.startswith(mw_url)
                                       or url.startswith(altmw_url)
                                       or url.startswith(nicemw_url)
                                       or url.startswith(shortmw_url))
    # we will save the found key:value pairs here
    dictionary = {}

    try:
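        # Every anchor whose href points into the MediaWiki: namespace names a
        # message; its link text is the message key.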
        for keytag in soup('a', href=ismediawiki):
            # Key strings only contain ASCII characters, so we can save them as
            # strs
            key = str(keytag.find(text=True))
            keyrow = keytag.parent.parent
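            # The message text is either in the following 'new' row (when this
            # row is marked 'orig') or in the second cell of a 'def' row.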
            if keyrow['class'] == "orig":
                valrow = keyrow.findNextSibling('tr')
                assert valrow['class'] == "new"
                value = unicode(valrow.td.string).strip()
            elif keyrow['class'] == 'def':
                value = unicode(keyrow('td')[1].string).strip()
            else:
                raise AssertionError("Unknown tr class value: %s" %
                                     keyrow['class'])
            dictionary[key] = value
    except Exception, e:
        wikipedia.debugDump(
            'MediaWiki_Msg', site, u'%s: %s while processing URL: %s' %
            (repr(e), str(e), unicode(path)), allmessages)
        raise
Example #3
def refresh(site, sysop=False, witheditsonly=True):
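    """Fetch the local and global bot user lists for site and pickle the
    combined list into the 'botlists' data directory.
    """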
    #if not site.has_api() or site.versionnumber() < 10:
    #    _refreshOld(site)

    # get botlist special page's URL
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)

    params = {
        'action': 'query',
        'list': 'allusers',
        'augroup': 'bot',
    }
    if witheditsonly:
        params['auwitheditsonly'] = ''

    pywikibot.output(u'Retrieving bot user list for %s via API.' % repr(site))
    pywikibot.put_throttle()  # It actually is a get, but a heavy one.
    botlist = []
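    # Page through list=allusers (augroup=bot) until the API sends no more
    # continuation data.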
    while True:
        data = pywikibot.query.GetData(params, site, sysop=sysop)
        if 'error' in data:
            raise RuntimeError('ERROR: %s' % data)
        botlist.extend([w['name'] for w in data['query']['allusers']])

        if 'query-continue' in data:
            params['aufrom'] = data['query-continue']['allusers']['aufrom']
        else:
            break

    pywikibot.output(u'Retrieving global bot user list for %s.' % repr(site))
    pywikibot.put_throttle()  # It actually is a get, but a heavy one.
    m1 = True
    offset = ''
    if site.versionnumber() >= 17:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)(?:.*?)</li>'
    else:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)</li>'
    while m1:
        text = site.getUrl(
            site.globalusers_address(offset=offset, group='Global_bot'))

        m1 = re.findall(u'<li>.*?</li>', text)
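        # Each <li> entry describes one global bot; pull out its name and the
        # local/global flag markup.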
        for item in m1:
            m2 = re.search(PATTERN, item)
            (bot, flag_local, flag_global) = m2.groups()
            flag_local = (flag_local[:2] == u'<a')
            flag_global = True  # since group='Global_bot'

            if bot not in botlist:
                botlist.append(bot)

        #print len(botlist)
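        # Continue the next Special:GlobalUsers page from the last bot seen.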
        offset = bot.encode(site.encoding())

    # Save the botlist to disk
    # The file is stored in the botlists subdir. Create if necessary.
    if sysop:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s-sysop.dat' % (site.family.name, site.lang)),
            'w')
    else:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s.dat' % (site.family.name, site.lang)), 'w')
    pickle.dump(botlist, f)
    f.close()
Example #4
def refresh(site, sysop=False, witheditsonly=True):
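    """Fetch the local and global bot user lists for site and pickle the
    combined list into the 'botlists' data directory.
    """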
    #if not site.has_api() or site.versionnumber() < 10:
    #    _refreshOld(site)
    
    # get botlist special page's URL
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)
 
    params = {
        'action': 'query',
        'list': 'allusers',
        'augroup': 'bot',
    }
    if witheditsonly:
        params['auwitheditsonly'] = ''
 
    pywikibot.output(u'Retrieving bot user list for %s via API.' % repr(site))
    pywikibot.put_throttle() # It actually is a get, but a heavy one.
    botlist = []
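    # Page through list=allusers (augroup=bot) until the API sends no more
    # continuation data.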
    while True:
        data = pywikibot.query.GetData(params, site, sysop=sysop)
        if 'error' in data:
            raise RuntimeError('ERROR: %s' % data)
        botlist.extend([w['name'] for w in data['query']['allusers']])
 
        if 'query-continue' in data:
            params['aufrom'] = data['query-continue']['allusers']['aufrom']
        else:
            break

    pywikibot.output(u'Retrieving global bot user list for %s.' % repr(site))
    pywikibot.put_throttle() # It actually is a get, but a heavy one.
    m1 = True
    offset = ''
    while m1:
        text = site.getUrl(site.globalusers_address(offset=offset, group='Global_bot'))

        m1 = re.findall(u'<li>.*?</li>', text)
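        # Each <li> entry describes one global bot; pull out its name and the
        # local/global flag markup.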
        for item in m1:
            m2 = re.search(u'<li>(.*?)\((.*?),\s(.*?)\)</li>', item)
            (bot, flag_local, flag_global) = m2.groups()

            bot         = bot[:-2]
            flag_local  = (flag_local[:2] == u'<a')
            flag_global = True # since group='Global_bot'

            if bot not in botlist:
                botlist.append(bot)

        #print len(botlist)
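        # Continue the next Special:GlobalUsers page from the last bot seen.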
        offset = bot.encode(site.encoding())

    # Save the botlist to disk
    # The file is stored in the botlists subdir. Create if necessary.
    if sysop:
        f = open(pywikibot.config.datafilepath('botlists',
                 'botlist-%s-%s-sysop.dat' % (site.family.name, site.lang)), 'w')    
    else:
        f = open(pywikibot.config.datafilepath('botlists',
                 'botlist-%s-%s.dat' % (site.family.name, site.lang)), 'w')
    pickle.dump(botlist, f)
    f.close()