Example #1
def refresh_messages(site=None):
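    """Fetch and parse all MediaWiki system messages for the given site."""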
    site = site or wikipedia.getSite()
    # get 'all messages' special page's path
    path = site.allmessages_address()
    print 'Retrieving MediaWiki messages for %s' % repr(site)
    wikipedia.put_throttle()  # It actually is a get, but a heavy one.
    allmessages = site.getUrl(path)

    print 'Parsing MediaWiki messages'
    soup = BeautifulSoup(allmessages,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # The MediaWiki namespace in URL-encoded format, as it can contain
    # non-ASCII characters and spaces.
    quotedMwNs = urllib.quote(
        site.namespace(8).replace(' ', '_').encode(site.encoding()))
    mw_url = site.path() + "?title=" + quotedMwNs + ":"
    altmw_url = site.path() + "/" + quotedMwNs + ":"
    nicemw_url = site.nice_get_address(quotedMwNs + ":")
    shortmw_url = "/" + quotedMwNs + ":"
    ismediawiki = lambda url: url and (url.startswith(mw_url)
                                       or url.startswith(altmw_url)
                                       or url.startswith(nicemw_url)
                                       or url.startswith(shortmw_url))
    # we will save the found key:value pairs here
    dictionary = {}

    try:
        for keytag in soup('a', href=ismediawiki):
            # Key strings only contain ASCII characters, so we can save them as
            # strs
            key = str(keytag.find(text=True))
            keyrow = keytag.parent.parent
            if keyrow['class'] == "orig":
                valrow = keyrow.findNextSibling('tr')
                assert valrow['class'] == "new"
                value = unicode(valrow.td.string).strip()
            elif keyrow['class'] == 'def':
                value = unicode(keyrow('td')[1].string).strip()
            else:
                raise AssertionError("Unknown tr class value: %s" %
                                     keyrow['class'])
            dictionary[key] = value
    except Exception, e:
        wikipedia.debugDump(
            'MediaWiki_Msg', site, u'%s: %s while processing URL: %s' %
            (repr(e), str(e), unicode(path)), allmessages)
        raise
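
A minimal usage sketch, assuming the pywikibot-compat framework with a configured user-config.py (wikipedia.getSite is the same fallback call the function itself uses; the 'de' site is only an illustration):

    import wikipedia  # pywikibot-compat core module (assumed on the path)

    refresh_messages()                                      # default site from user-config.py
    refresh_messages(wikipedia.getSite('de', 'wikipedia'))  # an explicit site

The function continues in Example #7, where the parsed key:value pairs are pickled into the mediawiki-messages cache directory.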
Example #3
def GetData(params, site=None, verbose=False, useAPI=False, retryCount=5, encodeTitle=True):
    """Get data from the query api, and convert it into a data object
    """
    if site is None:
        site = wikipedia.getSite()

    for k, v in params.iteritems():
        if not IsString(v):
            params[k] = unicode(v)

    params['format'] = 'json'

    if not useAPI:
        params['noprofile'] = ''
    
    for k, v in params.iteritems():
        if type(v) == type(u''):
            params[k] = ToUtf8(v)

    # The titles param might be long; in that case, convert it to a POST request
    data = None
    titlecount = 0
    if 'titles' in params:
        titlecount = params['titles'].count('|')
        if encodeTitle:
            data = urllib.urlencode({'titles' : params['titles']})
            del params['titles']
    
    if useAPI:
        path = site.api_address() + urllib.urlencode(params.items())
    else:
        path = site.query_address() + urllib.urlencode(params.items())
    
    if verbose:
        if titlecount > 0:
            wikipedia.output(u"Requesting %d titles from %s:%s" % (titlecount, site.lang, path))
        else:
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
    
    lastError = None
    retry_idle_time = 5
    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            jsontext = site.getUrl( path, retry=True, data=data )

            # This will also work, but all unicode strings will need to be converted from \u notation
            # decodedObj = eval( jsontext )
            return simplejson.loads( jsontext )
            
        except ValueError, error:
            retryCount -= 1
            wikipedia.output(u"Error downloading data: %s" % error)
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
            wikipedia.debugDump('ApiGetDataParse', site, str(error) + '\n%s' % path, jsontext)
            lastError = error
            if retryCount >= 0:
                wikipedia.output(u"Retrying in %i seconds..." % retry_idle_time)
                time.sleep(retry_idle_time)
                # Next time wait longer, but not longer than five minutes
                retry_idle_time *= 2
                if retry_idle_time > 300:
                    retry_idle_time = 300
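
A hedged usage sketch for this variant (action=query with meta=siteinfo is a standard MediaWiki API request; the site, network setup, and simplejson availability are assumed):

    import wikipedia  # pywikibot-compat core module

    params = {
        'action': 'query',
        'meta': 'siteinfo',
    }
    result = GetData(params, site=wikipedia.getSite(), useAPI=True, verbose=True)
    # result is the dict decoded by simplejson.loads(),
    # e.g. result['query']['general'] for the siteinfo block
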
Example #4
def GetData(params, site=None, useAPI=True, retryCount=5, encodeTitle=True, sysop=False, back_response=False):
    """Get data from the query api, and convert it into a data object
    """
    if not site:
        site = wikipedia.getSite()
    data = {}
    titlecount = 0

    for k, v in params.iteritems():
        if k == u'file':
            data[k] = v
        elif type(v) == list:
            if k in [u'titles', u'pageids', u'revids', u'ususers'] and len(v) > 10:
                # The titles param might be long; in that case, convert it to a POST request
                titlecount = len(params[k])
                data[k] = unicode(ListToParam(v))
            else:
                params[k] = unicode(ListToParam(v))

        elif not isinstance(v,basestring):
            params[k] = unicode(v)
        elif type(v) == unicode:
            params[k] = ToUtf8(v)

    if 'format' not in params or params['format'] != u'json':
        params['format'] = u'json'

    if not useAPI:
        params['noprofile'] = ''

    if data:
        for k in data:
            del params[k]

    if wikipedia.verbose:  # dump params info.
        wikipedia.output(u"==== API action:%s ====" % params[u'action'])
        if data and 'file' not in data:
            wikipedia.output(u"%s: (%d items)" % (data.keys()[0], titlecount ) )

        for k, v in params.iteritems():
            if k not in ['action', 'format', 'file', 'xml', 'text']:
                if k == 'lgpassword' and wikipedia.verbose == 1:
                    v = u'XXXXX'
                elif not isinstance(v, unicode):
                    v = v.decode('utf-8')
                wikipedia.output(u"%s: %s" % (k, v) )
        wikipedia.output(u'-' * 16 )


    postAC = [
        'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect', 'parse',
        'block', 'unblock', 'move', 'emailuser','import', 'userrights', 'upload', 'patrol'
    ]
    if useAPI:
        if params['action'] in postAC:
            path = site.api_address()
            cont = ''
        else:
            path = site.api_address() + site.urlEncode(params.items())

    else:
        path = site.query_address() + site.urlEncode(params.items())

    if wikipedia.verbose:
        if titlecount > 1:
            wikipedia.output(u"Requesting %d %s from %s" % (titlecount, data.keys()[0], site))
        else:
            wikipedia.output(u"Requesting API query from %s" % site)

    lastError = None
    retry_idle_time = 1

    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            if params['action'] == 'upload' and ('file' in data):
                import upload
                res, jsontext = upload.post_multipart(site, path, params.items(),
                  (('file', params['filename'].encode(site.encoding()), data['file']),),
                  site.cookies(sysop=sysop)
                  )
            elif params['action'] in postAC:
                res, jsontext = site.postForm(path, params, sysop, site.cookies(sysop = sysop) )
            else:
                if back_response:
                    res, jsontext = site.getUrl( path, retry=True, data=data, sysop=sysop, back_response=True)
                else:
                    jsontext = site.getUrl( path, retry=True, sysop=sysop, data=data)

            # This will also work, but all unicode strings will need to be converted from \u notation
            # decodedObj = eval( jsontext )

            jsontext = json.loads( jsontext )

            if "error" in jsontext:
                errorDetails = jsontext["error"]
                if errorDetails["code"] == 'badtoken':
                    wikipedia.output('Received a bad login token error from the server.  Attempting to refresh.')
                    params['token'] = site.getToken(sysop = sysop, getagain = True)
                    continue

            if back_response:
                return res, jsontext
            else:
                return jsontext

        except ValueError, error:
            if "<title>Wiki does not exist</title>" in jsontext:
                raise wikipedia.NoSuchSite(u'Wiki %s does not exist yet' % site)

            if 'Wikimedia Error' in jsontext: #wikimedia server error
                raise wikipedia.ServerError

            retryCount -= 1
            wikipedia.output(u"Error downloading data: %s" % error)
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
            lastError = error
            if retryCount >= 0:
                wikipedia.output(u"Retrying in %i minutes..." % retry_idle_time)
                time.sleep(retry_idle_time*60)
                # Next time wait longer, but not longer than half an hour
                retry_idle_time *= 2
                if retry_idle_time > 30:
                    retry_idle_time = 30
            else:
                wikipedia.debugDump('ApiGetDataParse', site, str(error) + '\n%s\n%s' % (site.hostname(), path), jsontext)
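
For illustration, a hedged sketch of the POST routing in this variant ('purge' is listed in postAC above, so the request goes through site.postForm; the page titles are placeholders):

    params = {
        'action': 'purge',
        'titles': [u'Main Page', u'Sandbox'],  # short list: joined by ListToParam above
    }
    result = GetData(params, useAPI=True)
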
Example #5
def GetData(params,
            site=None,
            verbose=False,
            useAPI=False,
            retryCount=5,
            encodeTitle=True):
    """Get data from the query api, and convert it into a data object
    """
    if site is None:
        site = wikipedia.getSite()

    for k, v in params.iteritems():
        if not IsString(v):
            params[k] = unicode(v)

    params['format'] = 'json'

    if not useAPI:
        params['noprofile'] = ''

    for k, v in params.iteritems():
        if type(v) == type(u''):
            params[k] = ToUtf8(v)

    # The titles param might be long; in that case, convert it
    # to a POST request
    data = None
    titlecount = 0
    if 'titles' in params:
        titlecount = params['titles'].count('|')
        if encodeTitle:
            data = {'titles': params['titles']}
            del params['titles']

    postAC = [
        'edit',
        'login',
        'purge',
        'rollback',
        'delete',
        'undelete',
        'protect',
        'block',
        'unblock',
        'move',
        'emailuser',
        'import',
        'userrights',
    ]
    if useAPI:
        if params['action'] in postAC:
            path = site.api_address()
        else:
            path = site.api_address() + urllib.urlencode(params.items())

    else:
        path = site.query_address() + urllib.urlencode(params.items())

    if verbose:
        if titlecount > 0:
            wikipedia.output(u"Requesting %d titles from %s:%s" %
                             (titlecount, site.lang, path))
        else:
            wikipedia.output(u"Request %s:%s" % (site.lang, path))

    lastError = None
    retry_idle_time = 5

    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            if params['action'] in postAC:
                res, jsontext = site.postData(path,
                                              urllib.urlencode(params.items()),
                                              cookies=site.cookies())
            else:
                jsontext = site.getUrl(path, retry=True, data=data)

            # This will also work, but all unicode strings will need to be converted from \u notation
            # decodedObj = eval( jsontext )
            return simplejson.loads(jsontext)

        except ValueError, error:
            retryCount -= 1
            wikipedia.output(u"Error downloading data: %s" % error)
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
            wikipedia.debugDump('ApiGetDataParse', site,
                                str(error) + '\n%s' % path, jsontext)
            lastError = error
            if retryCount >= 0:
                wikipedia.output(u"Retrying in %i seconds..." %
                                 retry_idle_time)
                time.sleep(retry_idle_time)
                # Next time wait longer, but not longer than five minutes
                retry_idle_time *= 2
                if retry_idle_time > 300:
                    retry_idle_time = 300
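
A hedged sketch of the POST branch introduced in this variant ('login' is in postAC, so the request is sent via site.postData; lgname and lgpassword are the documented login parameters, and the values here are placeholders):

    params = {
        'action': 'login',
        'lgname': u'ExampleBot',    # placeholder account name
        'lgpassword': u'secret',    # placeholder password
    }
    result = GetData(params, useAPI=True)
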
Example #6
def GetData(params, site=None, useAPI=True, retryCount=config.maxretries,
            encodeTitle=True, sysop=False, back_response=False):
    """Get data from the query api, and convert it into a data object
    """
    if ('action' in params) and pywikibot.simulate and \
       (params['action'] in pywikibot.config.actions_to_block):
        pywikibot.output(
            u'\03{lightyellow}SIMULATION: %s action blocked.\03{default}'
            % params['action'])
        jsontext_dummy = {params['action']: {u'result': u''}}
        if back_response:
            import StringIO
            res_dummy = StringIO.StringIO()
            res_dummy.__dict__.update({u'code': 0, u'msg': u''})
            return res_dummy, jsontext_dummy
        else:
            return jsontext_dummy

    if not site:
        site = pywikibot.getSite()
    data = {}
    titlecount = 0

    for k, v in params.iteritems():
        if k == u'file':
            data[k] = v
        elif type(v) == list:
            if k in [u'titles', u'pageids', u'revids',
                     u'ususers'] and len(v) > 10:
                # The titles param might be long; in that case, convert it
                # to a POST request
                titlecount = len(params[k])
                data[k] = unicode(ListToParam(v))
            else:
                params[k] = unicode(ListToParam(v))

        elif not isinstance(v, basestring):
            params[k] = unicode(v)
        elif type(v) == unicode:
            params[k] = ToUtf8(v)

    if 'format' not in params or params['format'] != u'json':
        params['format'] = u'json'

    if 'action' in params and params['action'] == 'query' and not (
            'continue' in params or 'rawcontinue' in params):
        params['rawcontinue'] = ''

    if not useAPI:
        params['noprofile'] = ''

    if data:
        for k in data:
            del params[k]

    if pywikibot.verbose:  # dump params info.
        pywikibot.output(u"==== API action:%s ====" % params[u'action'])
        if data and 'file' not in data:
            pywikibot.output(u"%s: (%d items)" % (data.keys()[0], titlecount))

        for k, v in params.iteritems():
            if k not in ['action', 'format', 'file', 'xml', 'text']:
                if k == 'lgpassword' and pywikibot.verbose == 1:
                    v = u'XXXXX'
                elif not isinstance(v, unicode):
                    v = v.decode('utf-8')
                pywikibot.output(u"%s: %s" % (k, v))
        pywikibot.output(u'-' * 16)

    postAC = [
        'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect',
        'parse', 'block', 'unblock', 'move', 'emailuser', 'import',
        'userrights', 'upload', 'patrol', 'wbcreateclaim', 'wbeditentity',
        'wbremoveclaims'
    ]
    if site.versionnumber() >= 18:
        postAC.append('watch')
    if useAPI:
        if params['action'] in postAC or params['action'][:5] == 'wbset':
            path = site.api_address()
        else:
            path = site.api_address() + site.urlEncode(params.items())

    else:
        path = site.query_address() + site.urlEncode(params.items())

    if pywikibot.verbose:
        if titlecount > 1:
            pywikibot.output(u"Requesting %d %s from %s"
                             % (titlecount, data.keys()[0], site))
        else:
            pywikibot.output(u"Requesting API query from %s" % site)

    lastError = None
    retry_idle_time = 1

    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            if params['action'] == 'upload' and ('file' in data):
                import upload
                res, jsontext = upload.post_multipart(
                    site, path, params.items(),
                    (('file', params['filename'].encode(site.encoding()),
                      data['file']), ),
                    site.cookies(sysop=sysop))
            elif params['action'] in postAC or params['action'][:5] == 'wbset':
                res, jsontext = site.postForm(path, params, sysop,
                                              site.cookies(sysop=sysop))
            else:
                if back_response:
                    res, jsontext = site.getUrl(path, retry=True, data=data,
                                                sysop=sysop, back_response=True)
                else:
                    jsontext = site.getUrl(path, retry=True, sysop=sysop,
                                           data=data)

            # This will also work, but all unicode strings will need to be
            # converted from \u notation
##            decodedObj = eval(jsontext)

            jsontext = json.loads(jsontext)

            if "error" in jsontext:
                errorDetails = jsontext["error"]
                if errorDetails["code"] == 'badtoken':
                    pywikibot.output('Received a bad login token error from '
                                     'the server.  Attempting to refresh.')
                    params['token'] = site.getToken(sysop=sysop,
                                                    getagain=True)
                    continue

            if back_response:
                return res, jsontext
            else:
                return jsontext

        except ValueError, error:
            if "<title>Wiki does not exist</title>" in jsontext:
                raise pywikibot.NoSuchSite(u'Wiki %s does not exist yet' % site)

            if 'Wikimedia Error' in jsontext:  # wikimedia server error
                raise pywikibot.ServerError

            retryCount -= 1
            pywikibot.output(u"Error downloading data: %s" % error)
            pywikibot.output(u"Request %s:%s" % (site.lang, path))
            lastError = error
            if retryCount >= 0:
                pywikibot.output(u"Retrying in %i minutes..." % retry_idle_time)
                time.sleep(retry_idle_time * 60)
                # Next time wait longer, but not longer than half an hour
                retry_idle_time *= 2
                if retry_idle_time > 30:
                    retry_idle_time = 30
            else:
                pywikibot.debugDump('ApiGetDataParse', site,
                                    str(error) + '\n%s\n%s' % (site.hostname(),
                                                               path),
                                    jsontext)
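
A hedged sketch of the simulation guard added in this variant; the config knobs are set explicitly here instead of relying on their defaults:

    import pywikibot  # pywikibot-compat core module

    pywikibot.simulate = True                      # dry-run mode
    pywikibot.config.actions_to_block = ['edit']   # assumption: a plain list of blocked write actions

    dummy = GetData({'action': 'edit', 'title': u'Sandbox', 'text': u'test'})
    # No HTTP request is sent; GetData returns the dummy {'edit': {u'result': u''}}
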
Example #7
            elif keyrow['class'] == 'def':
                value = unicode(keyrow('td')[1].string).strip()
            else:
                raise AssertionError("Unknown tr class value: %s" %
                                     keyrow['class'])
            dictionary[key] = value
    except Exception, e:
        wikipedia.debugDump(
            'MediaWiki_Msg', site, u'%s: %s while processing URL: %s' %
            (repr(e), str(e), unicode(path)), allmessages)
        raise

    # Save the dictionary to disk
    # The file is stored in the mediawiki_messages subdir. Create if necessary.
    if dictionary == {}:
        wikipedia.debugDump('MediaWiki_Msg', site,
                            u'Error URL: ' + unicode(path), allmessages)
        sys.exit()
    else:
        f = open(
            makepath('mediawiki-messages/mediawiki-messages-%s-%s.dat' %
                     (site.family.name, site.lang)), 'w')
        pickle.dump(dictionary, f)
        f.close()
    print "Loaded %i values from %s" % (len(dictionary.keys()), site)
    #print dictionary['sitestatstext']


def refresh_all_messages():
    import dircache, time
    filenames = dircache.listdir('mediawiki-messages')
    message_filenameR = re.compile('mediawiki-messages-([a-z:]+)-([a-z:]+).dat')
    for filename in filenames:
        match = message_filenameR.match(filename)
        if match: