def refresh_messages(site=None):
    site = site or wikipedia.getSite()
    # get 'all messages' special page's path
    path = site.allmessages_address()
    print 'Retrieving MediaWiki messages for %s' % repr(site)
    wikipedia.put_throttle()  # It actually is a get, but a heavy one.
    allmessages = site.getUrl(path)

    print 'Parsing MediaWiki messages'
    soup = BeautifulSoup(allmessages,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # The MediaWiki namespace in URL-encoded format, as it can contain
    # non-ASCII characters and spaces.
    quotedMwNs = urllib.quote(
        site.namespace(8).replace(' ', '_').encode(site.encoding()))
    mw_url = site.path() + "?title=" + quotedMwNs + ":"
    altmw_url = site.path() + "/" + quotedMwNs + ":"
    nicemw_url = site.nice_get_address(quotedMwNs + ":")
    shortmw_url = "/" + quotedMwNs + ":"
    ismediawiki = lambda url: url and (url.startswith(mw_url) or
                                       url.startswith(altmw_url) or
                                       url.startswith(nicemw_url) or
                                       url.startswith(shortmw_url))
    # we will save the found key:value pairs here
    dictionary = {}

    try:
        for keytag in soup('a', href=ismediawiki):
            # Key strings only contain ASCII characters, so we can save them
            # as strs
            key = str(keytag.find(text=True))
            keyrow = keytag.parent.parent
            if keyrow['class'] == "orig":
                valrow = keyrow.findNextSibling('tr')
                assert valrow['class'] == "new"
                value = unicode(valrow.td.string).strip()
            elif keyrow['class'] == 'def':
                value = unicode(keyrow('td')[1].string).strip()
            else:
                raise AssertionError("Unknown tr class value: %s"
                                     % keyrow['class'])
            dictionary[key] = value
    except Exception, e:
        wikipedia.debugDump(
            'MediaWiki_Msg', site,
            u'%s: %s while processing URL: %s'
            % (repr(e), str(e), unicode(path)),
            allmessages)
        raise
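# Illustrative usage sketch: one way refresh_messages() might be driven for a
# single site, assuming the 'wikipedia' module used above is importable and
# configured. The helper name demo_refresh_messages and the example
# lang/family values are hypothetical, not part of the original module.
def demo_refresh_messages(lang='en', family='wikipedia'):
    """Fetch and cache all MediaWiki messages for one site (sketch only)."""
    site = wikipedia.getSite(lang, family)
    refresh_messages(site)
    # refresh_messages() pickles the key:value pairs to
    # mediawiki-messages/mediawiki-messages-<family>-<lang>.dat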
def GetData(params, site=None, verbose=False, useAPI=False, retryCount=5,
            encodeTitle=True):
    """Get data from the query api, and convert it into a data object
    """
    if site is None:
        site = wikipedia.getSite()

    for k, v in params.iteritems():
        if not IsString(v):
            params[k] = unicode(v)
    params['format'] = 'json'
    if not useAPI:
        params['noprofile'] = ''

    for k, v in params.iteritems():
        if type(v) == type(u''):
            params[k] = ToUtf8(v)

    # Titles param might be long, case convert it to post request
    data = None
    titlecount = 0
    if 'titles' in params:
        titlecount = params['titles'].count('|')
        if encodeTitle:
            data = urllib.urlencode({'titles': params['titles']})
            del params['titles']

    if useAPI:
        path = site.api_address() + urllib.urlencode(params.items())
    else:
        path = site.query_address() + urllib.urlencode(params.items())

    if verbose:
        if titlecount > 0:
            wikipedia.output(u"Requesting %d titles from %s:%s"
                             % (titlecount, site.lang, path))
        else:
            wikipedia.output(u"Request %s:%s" % (site.lang, path))

    lastError = None
    retry_idle_time = 5
    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            jsontext = site.getUrl(path, retry=True, data=data)

            # This will also work, but all unicode strings will need to be
            # converted from \u notation
            # decodedObj = eval( jsontext )
            return simplejson.loads(jsontext)

        except ValueError, error:
            retryCount -= 1
            wikipedia.output(u"Error downloading data: %s" % error)
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
            wikipedia.debugDump('ApiGetDataParse', site,
                                str(error) + '\n%s' % path, jsontext)
            lastError = error
            if retryCount >= 0:
                wikipedia.output(u"Retrying in %i seconds..."
                                 % retry_idle_time)
                time.sleep(retry_idle_time)
                # Next time wait longer, but not longer than half an hour
                retry_idle_time *= 2
                if retry_idle_time > 300:
                    retry_idle_time = 300
def GetData(params, site=None, useAPI=True, retryCount=5, encodeTitle=True,
            sysop=False, back_response=False):
    """Get data from the query api, and convert it into a data object
    """
    if not site:
        site = wikipedia.getSite()

    data = {}
    titlecount = 0
    for k, v in params.iteritems():
        if k == u'file':
            data[k] = v
        elif type(v) == list:
            if k in [u'titles', u'pageids', u'revids', u'ususers'] and \
               len(v) > 10:
                # Titles param might be long, case convert it to post request
                titlecount = len(params[k])
                data[k] = unicode(ListToParam(v))
            else:
                params[k] = unicode(ListToParam(v))
        elif not isinstance(v, basestring):
            params[k] = unicode(v)
        elif type(v) == unicode:
            params[k] = ToUtf8(v)

    if 'format' not in params or params['format'] != u'json':
        params['format'] = u'json'
    if not useAPI:
        params['noprofile'] = ''

    if data:
        for k in data:
            del params[k]

    if wikipedia.verbose:  # dump params info.
        wikipedia.output(u"==== API action:%s ====" % params[u'action'])
        if data and 'file' not in data:
            wikipedia.output(u"%s: (%d items)" % (data.keys()[0], titlecount))
        for k, v in params.iteritems():
            if k not in ['action', 'format', 'file', 'xml', 'text']:
                if k == 'lgpassword' and wikipedia.verbose == 1:
                    v = u'XXXXX'
                elif not isinstance(v, unicode):
                    v = v.decode('utf-8')
                wikipedia.output(u"%s: %s" % (k, v))
        wikipedia.output(u'-' * 16)

    postAC = [
        'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect',
        'parse', 'block', 'unblock', 'move', 'emailuser', 'import',
        'userrights', 'upload', 'patrol',
    ]
    if useAPI:
        if params['action'] in postAC:
            path = site.api_address()
            cont = ''
        else:
            path = site.api_address() + site.urlEncode(params.items())
    else:
        path = site.query_address() + site.urlEncode(params.items())

    if wikipedia.verbose:
        if titlecount > 1:
            wikipedia.output(u"Requesting %d %s from %s"
                             % (titlecount, data.keys()[0], site))
        else:
            wikipedia.output(u"Requesting API query from %s" % site)

    lastError = None
    retry_idle_time = 1
    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            if params['action'] == 'upload' and ('file' in data):
                import upload
                res, jsontext = upload.post_multipart(
                    site, path, params.items(),
                    (('file', params['filename'].encode(site.encoding()),
                      data['file']),),
                    site.cookies(sysop=sysop))
            elif params['action'] in postAC:
                res, jsontext = site.postForm(path, params, sysop,
                                              site.cookies(sysop=sysop))
            else:
                if back_response:
                    res, jsontext = site.getUrl(path, retry=True, data=data,
                                                sysop=sysop,
                                                back_response=True)
                else:
                    jsontext = site.getUrl(path, retry=True, sysop=sysop,
                                           data=data)

            # This will also work, but all unicode strings will need to be
            # converted from \u notation
            # decodedObj = eval( jsontext )
            jsontext = json.loads(jsontext)

            if "error" in jsontext:
                errorDetails = jsontext["error"]
                if errorDetails["code"] == 'badtoken':
                    wikipedia.output('Received a bad login token error from '
                                     'the server. Attempting to refresh.')
                    params['token'] = site.getToken(sysop=sysop, getagain=True)
                    continue

            if back_response:
                return res, jsontext
            else:
                return jsontext

        except ValueError, error:
            if "<title>Wiki does not exist</title>" in jsontext:
                raise wikipedia.NoSuchSite(u'Wiki %s does not exist yet'
                                           % site)
            if 'Wikimedia Error' in jsontext:  # wikimedia server error
                raise wikipedia.ServerError

            retryCount -= 1
            wikipedia.output(u"Error downloading data: %s" % error)
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
            lastError = error
            if retryCount >= 0:
                wikipedia.output(u"Retrying in %i minutes..."
                                 % retry_idle_time)
                time.sleep(retry_idle_time * 60)
                # Next time wait longer, but not longer than half an hour
                retry_idle_time *= 2
                if retry_idle_time > 30:
                    retry_idle_time = 30
            else:
                wikipedia.debugDump('ApiGetDataParse', site,
                                    str(error) + '\n%s\n%s'
                                    % (site.hostname(), path),
                                    jsontext)
def GetData(params, site=None, verbose=False, useAPI=False, retryCount=5,
            encodeTitle=True):
    """Get data from the query api, and convert it into a data object
    """
    if site is None:
        site = wikipedia.getSite()

    for k, v in params.iteritems():
        if not IsString(v):
            params[k] = unicode(v)
    params['format'] = 'json'
    if not useAPI:
        params['noprofile'] = ''

    for k, v in params.iteritems():
        if type(v) == type(u''):
            params[k] = ToUtf8(v)

    # Titles param might be long, case convert it to post request
    data = None
    titlecount = 0
    if 'titles' in params:
        titlecount = params['titles'].count('|')
        if encodeTitle:
            data = {'titles': params['titles']}
            del params['titles']

    postAC = [
        'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect',
        'block', 'unblock', 'move', 'emailuser', 'import', 'userrights',
    ]
    if useAPI:
        if params['action'] in postAC:
            path = site.api_address()
        else:
            path = site.api_address() + urllib.urlencode(params.items())
    else:
        path = site.query_address() + urllib.urlencode(params.items())

    if verbose:
        if titlecount > 0:
            wikipedia.output(u"Requesting %d titles from %s:%s"
                             % (titlecount, site.lang, path))
        else:
            wikipedia.output(u"Request %s:%s" % (site.lang, path))

    lastError = None
    retry_idle_time = 5
    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            if params['action'] in postAC:
                res, jsontext = site.postData(
                    path, urllib.urlencode(params.items()),
                    cookies=site.cookies())
            else:
                jsontext = site.getUrl(path, retry=True, data=data)

            # This will also work, but all unicode strings will need to be
            # converted from \u notation
            # decodedObj = eval( jsontext )
            return simplejson.loads(jsontext)

        except ValueError, error:
            retryCount -= 1
            wikipedia.output(u"Error downloading data: %s" % error)
            wikipedia.output(u"Request %s:%s" % (site.lang, path))
            wikipedia.debugDump('ApiGetDataParse', site,
                                str(error) + '\n%s' % path, jsontext)
            lastError = error
            if retryCount >= 0:
                wikipedia.output(u"Retrying in %i seconds..."
                                 % retry_idle_time)
                time.sleep(retry_idle_time)
                # Next time wait longer, but not longer than half an hour
                retry_idle_time *= 2
                if retry_idle_time > 300:
                    retry_idle_time = 300
def GetData(params, site=None, useAPI=True, retryCount=config.maxretries,
            encodeTitle=True, sysop=False, back_response=False):
    """Get data from the query api, and convert it into a data object
    """
    if ('action' in params) and pywikibot.simulate and \
       (params['action'] in pywikibot.config.actions_to_block):
        pywikibot.output(
            u'\03{lightyellow}SIMULATION: %s action blocked.\03{default}'
            % params['action'])
        jsontext_dummy = {params['action']: {u'result': u''}}
        if back_response:
            import StringIO
            res_dummy = StringIO.StringIO()
            res_dummy.__dict__.update({u'code': 0, u'msg': u''})
            return res_dummy, jsontext_dummy
        else:
            return jsontext_dummy

    if not site:
        site = pywikibot.getSite()

    data = {}
    titlecount = 0
    for k, v in params.iteritems():
        if k == u'file':
            data[k] = v
        elif type(v) == list:
            if k in [u'titles', u'pageids', u'revids', u'ususers'] and \
               len(v) > 10:
                # Titles param might be long, case convert it to post request
                titlecount = len(params[k])
                data[k] = unicode(ListToParam(v))
            else:
                params[k] = unicode(ListToParam(v))
        elif not isinstance(v, basestring):
            params[k] = unicode(v)
        elif type(v) == unicode:
            params[k] = ToUtf8(v)

    if 'format' not in params or params['format'] != u'json':
        params['format'] = u'json'
    if 'action' in params and params['action'] == 'query' and not (
            'continue' in params or 'rawcontinue' in params):
        params['rawcontinue'] = ''
    if not useAPI:
        params['noprofile'] = ''

    if data:
        for k in data:
            del params[k]

    if pywikibot.verbose:  # dump params info.
        pywikibot.output(u"==== API action:%s ====" % params[u'action'])
        if data and 'file' not in data:
            pywikibot.output(u"%s: (%d items)" % (data.keys()[0], titlecount))
        for k, v in params.iteritems():
            if k not in ['action', 'format', 'file', 'xml', 'text']:
                if k == 'lgpassword' and pywikibot.verbose == 1:
                    v = u'XXXXX'
                elif not isinstance(v, unicode):
                    v = v.decode('utf-8')
                pywikibot.output(u"%s: %s" % (k, v))
        pywikibot.output(u'-' * 16)

    postAC = [
        'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect',
        'parse', 'block', 'unblock', 'move', 'emailuser', 'import',
        'userrights', 'upload', 'patrol', 'wbcreateclaim', 'wbeditentity',
        'wbremoveclaims',
    ]
    if site.versionnumber() >= 18:
        postAC.append('watch')
    if useAPI:
        if params['action'] in postAC or params['action'][:5] == 'wbset':
            path = site.api_address()
        else:
            path = site.api_address() + site.urlEncode(params.items())
    else:
        path = site.query_address() + site.urlEncode(params.items())

    if pywikibot.verbose:
        if titlecount > 1:
            pywikibot.output(u"Requesting %d %s from %s"
                             % (titlecount, data.keys()[0], site))
        else:
            pywikibot.output(u"Requesting API query from %s" % site)

    lastError = None
    retry_idle_time = 1
    while retryCount >= 0:
        try:
            jsontext = "Nothing received"
            if params['action'] == 'upload' and ('file' in data):
                import upload
                res, jsontext = upload.post_multipart(
                    site, path, params.items(),
                    (('file', params['filename'].encode(site.encoding()),
                      data['file']),),
                    site.cookies(sysop=sysop))
            elif params['action'] in postAC or \
                    params['action'][:5] == 'wbset':
                res, jsontext = site.postForm(path, params, sysop,
                                              site.cookies(sysop=sysop))
            else:
                if back_response:
                    res, jsontext = site.getUrl(path, retry=True, data=data,
                                                sysop=sysop,
                                                back_response=True)
                else:
                    jsontext = site.getUrl(path, retry=True, sysop=sysop,
                                           data=data)

            # This will also work, but all unicode strings will need to be
            # converted from \u notation
            ## decodedObj = eval(jsontext)
            jsontext = json.loads(jsontext)

            if "error" in jsontext:
                errorDetails = jsontext["error"]
                if errorDetails["code"] == 'badtoken':
                    pywikibot.output('Received a bad login token error from '
                                     'the server. Attempting to refresh.')
                    params['token'] = site.getToken(sysop=sysop, getagain=True)
                    continue

            if back_response:
                return res, jsontext
            else:
                return jsontext

        except ValueError, error:
            if "<title>Wiki does not exist</title>" in jsontext:
                raise pywikibot.NoSuchSite(u'Wiki %s does not exist yet'
                                           % site)
            if 'Wikimedia Error' in jsontext:  # wikimedia server error
                raise pywikibot.ServerError

            retryCount -= 1
            pywikibot.output(u"Error downloading data: %s" % error)
            pywikibot.output(u"Request %s:%s" % (site.lang, path))
            lastError = error
            if retryCount >= 0:
                pywikibot.output(u"Retrying in %i minutes..."
                                 % retry_idle_time)
                time.sleep(retry_idle_time * 60)
                # Next time wait longer, but not longer than half an hour
                retry_idle_time *= 2
                if retry_idle_time > 30:
                    retry_idle_time = 30
            else:
                pywikibot.debugDump('ApiGetDataParse', site,
                                    str(error) + '\n%s\n%s'
                                    % (site.hostname(), path),
                                    jsontext)
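# Illustrative usage sketch: a minimal read-only query through GetData(),
# assuming the pywikibot (compat) environment used above is configured. The
# helper name demo_page_revisions and the chosen query parameters are
# hypothetical examples following ordinary MediaWiki API conventions.
def demo_page_revisions(title, site=None):
    """Fetch basic revision info for one page via GetData() (sketch only)."""
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'ids|timestamp|user',
        'titles': [title],  # list values are joined by ListToParam() above
    }
    return GetData(params, site=site)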
                valrow = keyrow.findNextSibling('tr')
                assert valrow['class'] == "new"
                value = unicode(valrow.td.string).strip()
            elif keyrow['class'] == 'def':
                value = unicode(keyrow('td')[1].string).strip()
            else:
                raise AssertionError("Unknown tr class value: %s"
                                     % keyrow['class'])
            dictionary[key] = value
    except Exception, e:
        wikipedia.debugDump(
            'MediaWiki_Msg', site,
            u'%s: %s while processing URL: %s'
            % (repr(e), str(e), unicode(path)),
            allmessages)
        raise

    # Save the dictionary to disk
    # The file is stored in the mediawiki_messages subdir. Create if necessary.
    if dictionary == {}:
        wikipedia.debugDump(
            'MediaWiki_Msg', site,
            u'Error URL: ' + unicode(path),
            allmessages)
        sys.exit()
    else:
        f = open(makepath('mediawiki-messages/mediawiki-messages-%s-%s.dat'
                          % (site.family.name, site.lang)), 'w')
        pickle.dump(dictionary, f)
        f.close()

    print "Loaded %i values from %s" % (len(dictionary.keys()), site)
    #print dictionary['sitestatstext']


def refresh_all_messages():
    import dircache, time
    filenames = dircache.listdir('mediawiki-messages')
    message_filenameR = re.compile(
        'mediawiki-messages-([a-z:]+)-([a-z:]+).dat')
    for filename in filenames:
        match = message_filenameR.match(filename)
        if match: