def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None):
    """OCR callback function.

    @param cmd_uri: URI of the OCR service to query (mandatory).
    @param parser_func: optional callable applied to the OCR text on
        success; defaults to the identity function.
    @param ocr_tool: which OCR backend the URI targets; must be one of
        self._OCR_METHODS.
    @return: tuple (error, text [error description in case of error]).
    """
    def identity(x):
        return x

    if not cmd_uri:
        raise ValueError('Parameter cmd_uri is mandatory.')

    if parser_func is None:
        parser_func = identity

    if not callable(parser_func):
        raise TypeError('Keyword parser_func must be callable.')

    if ocr_tool not in self._OCR_METHODS:
        raise TypeError(
            "ocr_tool must be in %s, not '%s'."
            % (self._OCR_METHODS, ocr_tool))

    # wrong link fail with Exceptions
    retry = 0
    while retry < 5:
        pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri),
                        _logger)
        try:
            response = http.fetch(cmd_uri)
        except requests.exceptions.ReadTimeout as e:
            # transient timeout: back off linearly and retry
            retry += 1
            pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e))
            pywikibot.warning('retrying in %s seconds ...' % (retry * 5))
            time.sleep(retry * 5)
        except Exception as e:
            pywikibot.error('"%s": %s' % (cmd_uri, e))
            return (True, e)
        else:
            pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text),
                            _logger)
            break
    else:
        # Bug fix: previously, when all 5 attempts timed out, execution
        # fell through with `response` unbound and json.loads() below
        # raised NameError. Report the failure to the caller instead.
        error_message = 'ReadTimeout: retries exhausted for %s' % cmd_uri
        pywikibot.error(error_message)
        return (True, error_message)

    data = json.loads(response.text)

    if ocr_tool == self._PHETOOLS:  # phetools
        assert 'error' in data, 'Error from phetools: %s' % data
        assert data['error'] in [0, 1, 2, 3], (
            'Error from phetools: %s' % data)
        error, _text = bool(data['error']), data['text']
    else:  # googleOCR
        if 'error' in data:
            error, _text = True, data['error']
        else:
            error, _text = False, data['text']

    if error:
        pywikibot.error('OCR query %s: %s' % (cmd_uri, _text))
        return (error, _text)
    else:
        return (error, parser_func(_text))
def output_country_report(rows, report_page, max_images=1000):
    """
    Output a gallery of images without id.

    @param rows: list of (image, id, template) or (image, ) tuples.
    @param report_page: pywikibot.Page where report will be outputted.
    @param max_images: the max number of images to report to a page.
        Defaults to 1000.
    """
    # FIXME create this page. Different name?
    central_page = ':c:Commons:Monuments database/Images without id'
    body = common.instruction_header(central_page)

    if not rows:
        body += common.done_message(central_page, 'images without id')
    else:
        # Render at most max_images gallery entries.
        entries = [format_gallery_row(*row) for row in rows[:max_images]]
        body += u'<gallery>\n{}\n</gallery>'.format('\n'.join(entries))

    total = len(rows)
    if total > max_images:
        # Leave a hidden note so readers know the gallery was capped.
        body += (
            u'\n<!-- Maximum number of images reached: {0}, '
            u'total of images without id: {1} -->'.format(
                max_images, total))
        summary = (
            u'Images without an id: {0} (gallery maximum reached), '
            u'total of images without id: {1}'.format(
                max_images, total))
    else:
        summary = u'Images without an id: {0}'.format(total)

    pywikibot.debug(body, _logger)
    common.save_to_wiki_or_local(report_page, summary, body,
                                 minorEdit=False)
def process_monument(params, source, countryconfig, conn, cursor,
                     source_page, header_defaults, unknown_fields):
    """Process a single instance of a monument row template.

    Builds a contents dict from the template parameters (pre-seeded with
    header defaults and the permalink source), then stores the monument
    via update_monument() when a primary key (or truncate mode) allows it.

    @param params: raw template parameters of one monument row.
    @param source: permalink to the page the data was harvested from.
    @param countryconfig: per-country harvest configuration dict.
    @param conn: database connection passed through to update_monument.
    @param cursor: database cursor passed through to update_monument.
    @param source_page: page object the template was found on.
    @param unknown_fields: dict of Counter objects, mutated in place to
        tally fields not present in the configured field list.
    @raises NoPrimkeyException: when no usable primary key is filled in.
    """
    title = source_page.title(True)

    # Get all the fields
    contents = {}
    # Add the source of information (permalink)
    contents['source'] = source
    # Seed every configured field: header default if available, else ''.
    for field in countryconfig.get('fields'):
        if field.get('source') in header_defaults:
            contents[field.get('source')] = header_defaults.get(
                field.get('source'))
        else:
            contents[field.get('source')] = ''
    contents['title'] = title

    for param in params:
        (field, value) = extract_elements_from_template_param(param)

        # Check first that field is not empty
        if field.strip():
            # Is it in the fields list?
            if field in contents:
                # Load it with Big f*****g escape hack. Stupid mysql lib
                # Do this somewhere else.replace("'", "\\'")
                contents[field] = value
            else:
                # FIXME: Include more information where it went wrong
                pywikibot.debug(
                    'Found unknown field on page {0} : ({1}: {2})'.format(
                        title, field, value), _logger)
                # Track how often each unknown field appears per page.
                if field not in unknown_fields:
                    unknown_fields[field] = Counter()
                unknown_fields[field][source_page] += 1
                # time.sleep(5)

    # If we truncate we don't have to check for primkey (it's a made up one)
    if countryconfig.get('truncate'):
        update_monument(
            contents, source, countryconfig, conn, cursor, source_page)
    # Check if the primkey is a tuple and if all parts are present
    elif isinstance(countryconfig.get('primkey'), tuple):
        all_keys = True
        for partkey in countryconfig.get('primkey'):
            if not contents.get(lookup_source_field(partkey, countryconfig)):
                all_keys = False
        if all_keys:
            update_monument(
                contents, source, countryconfig, conn, cursor, source_page)
    # Check if the primkey is filled. This only works for a single primkey,
    # not a tuple
    elif contents.get(lookup_source_field(countryconfig.get('primkey'),
                                          countryconfig)):
        update_monument(
            contents, source, countryconfig, conn, cursor, source_page)
    else:
        raise NoPrimkeyException
def add_mbid_claim_to_item(self, item, mbid):
    """
    Adds a claim with pid `pid` with value `mbid` to `item` and call
    `donefunc` with `mbid` to signal the completion.

    In simulation mode nothing is written. On a successful write the
    claim is sourced with the MusicBrainz and retrieved references and
    `donefunc` is called; a block aborts the whole process.

    :type pid: str
    :type mbid: str
    :type item: pywikibot.ItemPage
    """
    claim = wp.Claim(const.WIKIDATA_DATASITE, self.property_id)
    claim.setTarget(mbid)
    wp.debug(u"Adding property {pid}, value {mbid} to {title}".format
             (pid=self.property_id, mbid=mbid, title=item.title()),
             layer="")
    if wp.config.simulate:
        wp.output("Simulation, no property has been added")
        return
    try:
        # True => mark the edit as a bot edit
        item.addClaim(claim, True)
    except wp.UserBlocked as e:
        # A block is fatal: no further edits can succeed, so stop the bot.
        wp.error("I have been blocked")
        exit(1)
    except wp.Error as e:
        # Other API errors are logged and this item is skipped.
        wp.warning(e)
        return
    else:
        wp.debug("Adding the source Claim", layer="")
        claim.addSources([const.MUSICBRAINZ_CLAIM, const.RETRIEVED_CLAIM],
                         bot=True)
        self.donefunc(mbid)
def makeStatistics(totals):
    """Make statistics on the number of indexed images and put on Commons.

    @param totals: mapping of country code to a per-country result dict
        with keys totalImages, tracked_images, commonsTemplate and
        commonsTrackerCategory.
    """
    site = pywikibot.Site('commons', 'commons')
    page = pywikibot.Page(
        site, u'Commons:Monuments database/Indexed images/Statistics')

    title_column = [
        'country',
        ('images', 'total'),
        'tracked',
        ('template', 'tracker template'),
        ('cat', 'tracker category')
    ]
    table = StatisticsTable(title_column, ('images', 'tracked'))

    # .items() instead of the Python-2-only .iteritems(): works on both
    # Python 2 and Python 3.
    for (countrycode, countryresults) in sorted(totals.items()):
        table.add_row({
            'country': countrycode,
            'images': countryresults.get('totalImages'),
            'tracked': countryresults.get('tracked_images'),
            'template': u'{{tl|%s}}' % countryresults.get('commonsTemplate'),
            'cat': u'[[:Category:{cat}|{cat}]]'.format(
                cat=countryresults.get('commonsTrackerCategory'))
        })

    text = table.to_wikitext()
    comment = (
        u'Updating indexed image statistics. '
        u'Total indexed images: {}'.format(table.get_sum('tracked')))
    pywikibot.debug(text, _logger)
    common.save_to_wiki_or_local(page, comment, text)
def output_country_report(unused_images, report_page, max_images=1000):
    """
    Format and output the unused images data for a a single country.

    @param unused_images: the output of group_unused_images
    @param report_page: pywikibot.Page to which the report should be written
    @param max_images: the max number of images to report to a page.
        Defaults to 1000. Note that actual number of images may be slightly
        higher in order to ensure all candidates for a given monument id
        are presented.
    @return: dict with the counts of reported images, pages and ids.
    """
    # People can add a /header template for with more info
    central_page = ':c:Commons:Monuments database/Unused images'
    text = common.instruction_header(central_page)
    total_pages = 0
    total_ids = 0
    totalImages = 0
    if not unused_images:
        text += common.done_message(central_page, 'unused images')
    else:
        # .items() instead of the Python-2-only .iteritems(): works on
        # both Python 2 and Python 3.
        for source_page, value in unused_images.items():
            total_pages += 1
            if totalImages < max_images:
                text += u'=== {0} ===\n'.format(source_page)
                text += u'<gallery>\n'
                for monument_id, candidates in value.items():
                    total_ids += 1
                    if totalImages < max_images:
                        for candidate in candidates:
                            text += u'File:{0}|{1}\n'.format(
                                candidate, monument_id)
                    # Count all candidates, even past the render cap.
                    totalImages += len(candidates)
                text += u'</gallery>\n'
            else:
                # Over the cap: keep counting without rendering.
                for monument_id, candidates in value.items():
                    total_ids += 1
                    totalImages += len(candidates)

    if totalImages >= max_images:
        text += (
            u'<!-- Maximum number of images reached: {0}, '
            u'total of unused images: {1} -->\n'.format(
                max_images, totalImages))
        comment = (
            u'Images to be used in monument lists: '
            u'{0} (gallery maximum reached), '
            u'total of unused images: {1}'.format(
                max_images, totalImages))
    else:
        comment = u'Images to be used in monument lists: {0}'.format(
            totalImages)
    pywikibot.debug(text, _logger)
    common.save_to_wiki_or_local(report_page, comment, text,
                                 minorEdit=False)
    return {
        'images': totalImages,
        'pages': total_pages,
        'ids': total_ids
    }
def request(self, uri, method="GET", body=None, headers=None,
            max_redirects=None, connection_type=None):
    """Start an HTTP request.

    @param uri: The uri to retrieve
    @param method: (optional) The HTTP method to use. Default is 'GET'
    @param body: (optional) The request body. Default is no body.
    @param headers: (optional) Additional headers to send. Defaults
        include C{connection: keep-alive}, C{user-agent} and
        C{content-type}.
    @param max_redirects: (optional) The maximum number of redirects to
        use for this request. The class instance's max_redirects is
        default
    @param connection_type: (optional) see L{httplib2.Http.request}

    @return: (response, content) tuple
    """
    if max_redirects is None:
        max_redirects = self.max_redirects
    if headers is None:
        headers = {}
    # Prepare headers
    headers.pop('cookie', None)
    req = DummyRequest(uri, headers)
    self.cookiejar.lock.acquire()
    try:
        self.cookiejar.add_cookie_header(req)
    finally:
        self.cookiejar.lock.release()
    headers = req.headers

    # Wikimedia squids: add connection: keep-alive to request headers
    # unless overridden
    headers['connection'] = headers.pop('connection', 'keep-alive')

    # determine connection pool key and fetch connection
    (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(
        httplib2.iri2uri(uri))
    conn_key = scheme + ":" + authority

    connection = self.connection_pool.pop_connection(conn_key)
    if connection is not None:
        self.connections[conn_key] = connection

    # Redirect hack: we want to regulate redirects
    follow_redirects = self.follow_redirects
    self.follow_redirects = False
    pywikibot.debug(u"%r" % (
        (uri.replace("%7C", "|"), method, body, headers, max_redirects,
         connection_type),), _logger)
    try:
        (response, content) = httplib2.Http.request(
            self, uri, method, body, headers, max_redirects,
            connection_type)
    # Bug fix: 'except Exception, e' is Python-2-only syntax; the
    # 'as' form is valid on Python 2.6+ and Python 3.
    except Exception as e:  # what types?
        # return exception instance to be retrieved by the calling thread
        return e
    # NOTE(review): the visible code ends here without returning
    # (response, content) or restoring follow_redirects — presumably the
    # original continues; confirm against upstream threadedhttp.py.
def _disable_pywikibot_logging():
    """Raise the "pywiki" logger threshold so only WARNING+ reaches stderr."""
    # Importing pywikibot and issuing one debug call initialises its
    # logging interface first, so the level set below is not clobbered
    # by a later logging call.
    import pywikibot
    pywikibot.debug("Disabling routine logging", "logging")
    getLogger("pywiki").setLevel("WARNING")
def checkMultiplicity(self):
    """Count running processes for site and set process_multiplicity.

    Reads the shared throttle control file (one "pid timestamp site"
    line per process), counts other live processes on the same site,
    assigns this process a fresh pid if it has none yet, and rewrites
    the file with expired entries dropped and this process appended.
    """
    global pid
    mysite = self.mysite
    pywikibot.debug('Checking multiplicity: pid = {pid}'.format(pid=pid),
                    _logger)
    with self.lock:
        processes = []
        my_pid = pid or 1  # start at 1 if global pid not yet set
        count = 1
        # open throttle.log
        try:
            f = open(self.ctrlfilename, 'r')
        except IOError:
            # Missing file is fine on first run; once we have a pid the
            # file should exist, so re-raise in that case.
            if pid:
                raise
        else:
            now = time.time()
            for line in f.readlines():
                # parse line; format is "pid timestamp site"
                try:
                    line = line.split(' ')
                    this_pid = int(line[0])
                    ptime = int(line[1].split('.')[0])
                    this_site = line[2].rstrip()
                except (IndexError, ValueError):
                    # Sometimes the file gets corrupted ignore that line
                    continue
                if now - ptime > self.releasepid:
                    continue  # process has expired, drop from file
                # Another recent process on the same site counts toward
                # the multiplicity.
                if now - ptime <= self.dropdelay \
                   and this_site == mysite \
                   and this_pid != pid:
                    count += 1
                # Keep every entry that is not ours for the rewrite.
                if this_site != self.mysite or this_pid != pid:
                    processes.append({'pid': this_pid,
                                      'time': ptime,
                                      'site': this_site})
                # Track the highest pid seen so ours becomes unique.
                if not pid and this_pid >= my_pid:
                    my_pid = this_pid + 1  # next unused process id
            f.close()
        if not pid:
            pid = my_pid
        self.checktime = time.time()
        # Append (or re-append) our own entry with a fresh timestamp.
        processes.append({'pid': pid,
                          'time': self.checktime,
                          'site': mysite})
        processes.sort(key=lambda p: (p['pid'], p['site']))
        # Best effort rewrite: ignore IOError (e.g. read-only filesystem).
        with suppress(IOError), open(self.ctrlfilename, 'w') as f:
            for p in processes:
                f.write(FORMAT_LINE.format_map(p))
    self.process_multiplicity = count
    pywikibot.log(
        'Found {} {} processes running, including this one.'.format(
            count, mysite))
def __eq__(self, other):
    """Return True when *other* is a LogEntry with the same logid and site."""
    if isinstance(other, LogEntry):
        return self.logid() == other.logid() and self.site == other.site
    # Non-LogEntry objects never compare equal; note it in the debug log.
    pywikibot.debug("'{0}' cannot be compared with '{1}'"
                    .format(type(self).__name__, type(other).__name__),
                    _logger)
    return False
def p(n):
    """Load and return the data of Wikidata item *n* from enwiki's repo.

    @param n: entity id string (e.g. 'Q42').
    @return: the dict produced by ItemPage.get().
    """
    pywikibot.debug('going to load %s.' % n, _logger)
    site = pywikibot.Site('en', 'wikipedia')
    repo = site.data_repository()
    # Bug fix: the bare 'print ...' statement is Python-2-only syntax;
    # the parenthesized call behaves identically on Python 2 and 3.
    print('SD going to load %s' % n)
    return pywikibot.ItemPage(repo, n).get()
def make_statistics(statistics):
    """
    Output the overall results of the bot as a nice wikitable.

    @param statistics: list of per dataset statistic dicts where the allowed
        keys are: config, totals, report page and cmt.
    """
    site = pywikibot.Site('commons', 'commons')
    page = pywikibot.Page(
        site,
        u'Commons:Monuments database/Images without id/Statistics')

    title_column = OrderedDict([
        ('code', 'country'),
        ('lang', '[[:en:List of ISO 639-1 codes|lang]]'),
        ('total_with_id', 'Total monuments with suggested id'),
        ('total_without_id', 'Total monuments without suggested id'),
        # ('total_added', 'Total templates automatically added'),
        ('Report page', None),
        ('Commons template', None)
    ])
    # Columns whose values should be summed for the totals row.
    numeric = [key for key in title_column.keys()
               if key.startswith('total_')]
    table = StatisticsTable(title_column, numeric)

    for row in statistics:
        country_config = row.get('config')
        totals = row.get('totals', {})
        # Fall back to the free-text comment when no totals are present.
        total_with_id_or_cmt = row.get('cmt')
        commons_template = None
        report_page = None

        if totals:
            total_with_id_or_cmt = totals.get('with_id')

        if country_config.get('commonsTemplate'):
            commons_template = u'{{tl|%s}}' % (
                country_config.get('commonsTemplate'), )

        if row.get('report_page'):
            report_page = row.get('report_page').title(
                as_link=True, with_ns=False, insite=site)

        table.add_row({
            'code': country_config.get('country'),
            'lang': country_config.get('lang'),
            'total_with_id': total_with_id_or_cmt,
            'total_without_id': totals.get('without_id'),
            # 'total_added': totals.get('added'),
            'Report page': report_page,
            'Commons template': commons_template})

    text = table.to_wikitext()
    comment = (
        u'Updating images without id statistics. Total of {total_with_id} '
        u'images with suggested ids and {total_without_id} without.'.format(
            **table.get_sum()))
    pywikibot.debug(text, _logger)
    common.save_to_wiki_or_local(page, comment, text)
def makeStatistics(statistics):
    """Output the overall results of the bot as a nice wikitable.

    @param statistics: list of per-country dicts with keys such as
        config, code, lang, total_cats, cmt and report_page.
    """
    site = pywikibot.Site('commons', 'commons')
    page = pywikibot.Page(
        site,
        u'Commons:Monuments database/Missing commonscat links/Statistics')

    title_column = OrderedDict([
        ('code', 'country'),
        ('lang', None),
        ('total', None),
        ('report_page', 'page'),
        ('row template', None),
        ('Commons template', None)
    ])
    table = StatisticsTable(title_column, ('total', ))

    for row in statistics:
        countryconfig = row.get('config')
        total_cats_or_cmt = row.get('total_cats')
        row_template = None
        commons_template = None
        report_page = None

        # Fall back to the free-text comment when no count is present.
        if row.get('total_cats') is None:
            total_cats_or_cmt = row.get('cmt')

        # sparql harvests have no on-wiki row template to link to.
        if countryconfig.get('type') != 'sparql':
            row_template = common.get_template_link(
                row.get('lang'),
                countryconfig.get('project', u'wikipedia'),
                countryconfig.get('rowTemplate'),
                site)

        if countryconfig.get('commonsTemplate'):
            commons_template = u'{{tl|%s}}' % (
                countryconfig.get('commonsTemplate'), )

        if row.get('report_page'):
            report_page = row.get('report_page').title(
                as_link=True, with_ns=False, insite=site)

        table.add_row({
            'code': row.get('code'),
            'lang': row.get('lang'),
            'total': total_cats_or_cmt,
            'report_page': report_page,
            'row template': row_template,
            'Commons template': commons_template})

    text = table.to_wikitext()
    comment = (
        u'Updating missing commonscat links statistics. '
        u'Total missing links: {total_cats}'.format(
            total_cats=table.get_sum('total')))
    pywikibot.debug(text, _logger)
    common.save_to_wiki_or_local(page, comment, text)
def __init__(self, **kwargs):
    """Constructor.

    Pops the request options (site, mime/mime_params, throttle,
    max_retries, retry_wait) out of **kwargs, stores the remaining
    kwargs as API parameters, flags write actions, and adds the
    'assert=user' parameter / SSL wrapper where applicable.

    @raises ValueError: if 'action' is missing, or if both 'mime' and
        'mime_params' are given but disagree.
    """
    try:
        self.site = kwargs.pop("site")
    except KeyError:
        # Default to the configured site when none is given.
        self.site = pywikibot.Site()
    if 'mime_params' in kwargs:
        self.mime_params = kwargs.pop('mime_params')
        # mime may not be different from mime_params
        if 'mime' in kwargs and kwargs.pop('mime') != self.mime:
            raise ValueError('If mime_params is set, mime may not differ '
                             'from it.')
    else:
        self.mime = kwargs.pop('mime', False)
    self.throttle = kwargs.pop('throttle', True)
    self.max_retries = kwargs.pop("max_retries",
                                  pywikibot.config.max_retries)
    self.retry_wait = kwargs.pop("retry_wait",
                                 pywikibot.config.retry_wait)
    self.params = {}
    if "action" not in kwargs:
        raise ValueError("'action' specification missing from Request.")
    self.update(**kwargs)
    self._warning_handler = None
    # Actions that imply database updates on the server, used for various
    # things like throttling or skipping actions when we're in simulation
    # mode
    self.write = self.params["action"] in (
        "edit", "move", "rollback", "delete", "undelete",
        "protect", "block", "unblock", "watch", "patrol",
        "import", "userrights", "upload", "emailuser",
        "createaccount", "setnotificationtimestamp",
        "filerevert", "options", "purge", "revisiondelete",
        "wbeditentity", "wbsetlabel", "wbsetdescription",
        "wbsetaliases", "wblinktitles", "wbsetsitelink",
        "wbcreateclaim", "wbremoveclaims", "wbsetclaimvalue",
        "wbsetreference", "wbremovereferences"
    )
    # MediaWiki 1.23 allows assertion for any action,
    # whereas earlier WMF wikis and others used an extension which
    # could only allow assert for action=edit.
    #
    # When we can't easily check whether the extension is loaded,
    # to avoid cyclic recursion in the Pywikibot codebase, assume
    # that it is present, which will cause a API warning emitted
    # to the logging (console) if it is not present, but will not
    # otherwise be a problem.
    # This situation is only tripped when one of the first actions
    # on the site is a write action and the extension isn't installed.
    if ((self.write and LV(self.site.version()) >= LV("1.23")) or
            (self.params['action'] == 'edit' and
             self.site.has_extension('AssertEdit'))):
        pywikibot.debug(u"Adding user assertion", _logger)
        self.params["assert"] = "user"  # make sure user is logged in
    # Upgrade to an SSL-wrapped site when config requires HTTPS for all
    # requests or for logins on this project.
    if (self.site.protocol() == 'http' and (config.use_SSL_always or (
            self.params["action"] == "login" and config.use_SSL_onlogin))
            and self.site.family.name in config.available_ssl_project):
        self.site = EnableSSLSiteWrapper(self.site)
def fix_redirect(self, gid, old, new):
    """Update the MusicBrainz URL entity *gid* from *old* to *new*.

    :param gid str:
    :param old str:
    :param new str:
    """
    wp.debug("Fixing the redirect from %s to %s" % (old, new), layer="")
    edit_note = self.edit_note % (old, new)
    self.client.edit_url(gid, old, new, edit_note)
    # Record that an edit was made (rate limiting / bookkeeping).
    self._performed_edit()
def make_statistics(statistics):
    """Output the overall results for unknown fields as a nice wikitable.

    @param statistics: list of per-country dicts (possibly empty/None for
        sparql harvests) with keys config, total_fields, total_usages,
        total_pages and report_page.
    """
    site = pywikibot.Site('commons', 'commons')
    page = pywikibot.Page(
        site, 'Commons:Monuments database/Unknown fields/Statistics')

    title_column = OrderedDict([
        ('code', 'country'),
        ('lang', None),
        ('total_fields', 'Total fields'),
        ('total_usages', 'Total usage of fields'),
        ('total_pages', 'Total pages containing fields'),
        ('report_page', 'Report page'),
        ('row_template', 'Row template'),
        ('header_template', 'Header template')
    ])
    # Sum every 'total*' column in the totals row.
    table = StatisticsTable(
        title_column,
        list(filter(lambda col: col.startswith('total'), title_column)))

    for row in statistics:
        if not row:
            # sparql harvests don't generate statistics
            continue
        countryconfig = row.get('config')

        row_template = common.get_template_link(
            countryconfig.get('lang'),
            countryconfig.get('project', u'wikipedia'),
            countryconfig.get('rowTemplate'),
            site)
        header_template = common.get_template_link(
            countryconfig.get('lang'),
            countryconfig.get('project', u'wikipedia'),
            countryconfig.get('headerTemplate'),
            site)
        report_page = row.get('report_page').title(
            as_link=True, with_ns=False, insite=site)

        table.add_row({
            'code': countryconfig.get('country'),
            'lang': countryconfig.get('lang'),
            'total_fields': row.get('total_fields'),
            'total_usages': row.get('total_usages'),
            'total_pages': row.get('total_pages'),
            'report_page': report_page,
            'row_template': row_template,
            'header_template': header_template
        })

    text = table.to_wikitext()
    comment = (
        'Updating unknown fields statistics. Total of {total_fields} '
        'unknown fields used {total_usages} times on {total_pages} '
        'different '
        'pages.'.format(**table.get_sum()))
    pywikibot.debug(text, _logger)
    common.save_to_wiki_or_local(page, comment, text)
def __init__(self, maxnum=5):
    """Initialise an empty, thread-safe connection pool.

    @param maxnum: Maximum number of connections per identifier.
        The pool drops excessive connections added.
    """
    pywikibot.debug(u"Creating connection pool.", _logger)
    self.maxnum = maxnum
    self.connections = {}
    self.lock = threading.Lock()
def _createFromData(self, logdata):
    """
    Checks for logtype from data, and creates the correct LogEntry

    @param logdata: log entry data (a dict-like API result)
    @raises Error: when the entry has no 'type' key.
    """
    try:
        logtype = logdata['type']
        return LogEntryFactory._getEntryClass(logtype)(logdata)
    except KeyError:
        # Bug fix: logdata is a dict, so concatenating it to a str with
        # '+' raised TypeError and masked the real problem; format it
        # into the message instead.
        pywikibot.debug(u"API log entry received:\n{0}".format(logdata),
                        _logger)
        raise Error("Log entry has no 'type' key")
def process_result(self, result):
    """Process one (entity, url, wikipage, rel, link_type) DB result row.

    Resolves the wikipage to a Wikidata item and, unless the item
    already carries this bot's property with the entity's MBID, adds the
    claim. Redirects and gone pages are repaired/flagged when editing is
    allowed; all other failures are logged and the row is skipped.
    """
    entity_gid, url_gid, wikipage, rel_id, link_type_id = result
    wp.debug("» {wp} https://musicbrainz.org/{entitytype}/{gid}".format(
        entitytype=self._current_entity_type.replace("_", "-"),
        wp=wikipage,
        gid=entity_gid), layer="")
    try:
        itempage = get_wikidata_itempage_from_wikilink(wikipage)
    except wp.exceptions.SiteDefinitionError:
        wp.warning("{page} no supported family".format(page=wikipage))
        return
    except (wp.exceptions.InvalidTitleError) as e:
        wp.error(
            "Bad or invalid title received while processing {page}".format(
                page=wikipage))
        wp.exception(e, tb=True)
        return
    except SkipPage as e:
        wp.warning("{page} is being skipped because: {reason}".format(
            page=wikipage, reason=e))
        return
    except IsRedirectPage as e:
        wp.debug("{page} is a redirect".format(page=wikipage), layer="")
        # Point the MusicBrainz URL at the redirect target when allowed.
        if self.can_edit:
            self.fix_redirect(url_gid, e.old, e.new)
        return
    except ValueError as e:
        wp.output(e)
        return
    except PageGone as e:
        # The wiki page no longer exists; flag the relationship removed.
        if self.can_edit:
            self.end_removed(rel_id, link_type_id, entity_gid, url_gid,
                             self._current_entity_type, wikipage)
        return

    if itempage is None:
        wp.debug(
            u"There's no wikidata page for {mbid}".format(mbid=entity_gid),
            layer="")
        return

    # Skip (and mark done) items that already carry this property with
    # the same MBID value; property-id comparison is case-insensitive.
    if any((key.lower() == self.property_id.lower() and
            claim.target == entity_gid)
           for key, claims in itempage.claims.items()
           for claim in claims):
        wp.debug(
            u"{page} already has property {pid} with value {mbid}".format(
                page=wikipage,
                mbid=entity_gid,
                pid=self.property_id), layer="")
        self.donefunc(entity_gid)
        return

    wp.debug("{mbid} is not linked in Wikidata".format(mbid=entity_gid),
             layer="")
    self.add_mbid_claim_to_item(itempage, entity_gid)
def storecookiedata(self, data: str) -> None:
    """
    Store cookie data.

    @param data: The raw data as returned by getCookie()
    """
    # THIS IS OVERRIDDEN IN data/api.py
    filename = config.datafilepath('pywikibot.lwp')
    pywikibot.debug('Storing cookies to {}'.format(filename), _logger)
    with open(filename, 'w') as cookie_file:
        cookie_file.write(data)
def categorizeImage(
        countrycode, lang, commonsTemplateName, commonsCategoryBase,
        commonsCatTemplates, page, conn, cursor, harvest_type):
    """Try to replace the base tracker category on *page* with real ones.

    Looks up the monument id from the tracker template on the image
    page, fetches the matching monument from the database, derives
    better categories, and swaps them in.

    @return: False when the page is skipped for any reason; otherwise
        the result of replace_default_cat_with_new_categories_in_image
        (or None when no categories were found).
    """
    pywikibot.log(u'Working on: %s' % page.title())
    commonsTemplate = _get_commons_template(commonsTemplateName)

    currentcats = list(page.categories())
    if commonsCategoryBase not in currentcats:
        pywikibot.log(u'%s category not found at: %s. Someone probably already categorized it.' % (
            commonsCategoryBase, page.title()))
        return False
    # Placeholder images must keep the base category.
    if u'Wikipedia image placeholders for cultural heritage monuments' in currentcats:
        pywikibot.log(u'%s in %s is a placeholder, skipping it.' % (
            page.title(), commonsCategoryBase))
        return False

    templates = page.templates()
    if commonsTemplate not in templates:
        pywikibot.log(u'%s template not found at: %s' % (
            commonsTemplate, page.title()))
        return False

    try:
        monumentId = get_monument_id(page, commonsTemplate)
    except NoMonumentIdentifierFoundException:
        pywikibot.warning(u'Didn\'t find a valid monument identifier at: %s' % (
            page.title(),))
        return False

    monData = getMonData(countrycode, lang, monumentId, conn, cursor)
    if not monData:
        # Retry with the id coerced to int; ids are sometimes stored as
        # numbers in the database but harvested as strings.
        try:
            monumentId = int(monumentId)
            monData = getMonData(countrycode, lang, monumentId, conn,
                                 cursor)
        except ValueError:
            pywikibot.debug(
                u'Can\'t convert %s to an integer' % (monumentId,),
                _logger)
    if not monData:
        # Triage as log since there are plenty of valid reasons for this
        pywikibot.log(
            u'Monument with id %s not in monuments database' % (
                monumentId, ))
        return False

    (newcats, categorisation_method) = get_new_categories(
        monumentId, monData, lang, commonsCatTemplates, harvest_type)

    # See if one of the three options worked
    if newcats:
        comment = u'Adding categories based on [[Template:%s]] with identifier %s (method %s)' % (
            commonsTemplateName, monumentId, categorisation_method)
        return replace_default_cat_with_new_categories_in_image(
            page, commonsCategoryBase, newcats, comment, verbose=True)
    else:
        pywikibot.log(u'Categories not found for %s' % page.title())
def __missing__(self, key):
    """Debug when the key is missing.

    Raise HiddenKeyError for keys hidden by revision deletion, KeyError
    otherwise.
    """
    pywikibot.debug('API log entry received:\n' + repr(self), _logger)
    hidden_action = (key in ('ns', 'title', 'pageid', 'logpage', 'params',
                             'action')
                     and 'actionhidden' in self)
    hidden_comment = key == 'comment' and 'commenthidden' in self
    hidden_user = key == 'user' and 'userhidden' in self
    if hidden_action or hidden_comment or hidden_user:
        raise HiddenKeyError(
            "Log entry ({0}) has a hidden '{1}' key and you don't have "
            'permission to view it.'.format(self._type, key))
    raise KeyError("Log entry (%s) has no '%s' key" % (self._type, key))
def getversiondict():
    """Get version info for the package.

    @return:
        - tag (name for the repository),
        - rev (current revision identifier),
        - date (date of current revision),
        - hash (git hash for the current revision)
    @rtype: C{dict} of four C{str}
    """
    global cache
    if cache:
        return cache
    _program_dir = _get_program_dir()
    exceptions = {}

    # Try each version-detection backend in order; first success wins.
    for vcs_func in (getversion_git,
                     getversion_svn_setuptools,
                     getversion_svn,
                     getversion_nightly,
                     getversion_package):
        try:
            (tag, rev, date, hsh) = vcs_func(_program_dir)
        except Exception as e:
            exceptions[vcs_func] = e
        else:
            break
    else:
        # nothing worked; version unknown (but suppress exceptions)
        # the value is most likely '$Id' + '$', it means that
        # pywikibot was imported without using version control at all.
        tag, rev, date, hsh = (
            '', '-1 (unknown)', '0 (unknown)', '(unknown)')

    # git and svn can silently fail, as it may be a nightly.
    if getversion_package in exceptions:
        warn('Unable to detect version; exceptions raised:\n%r'
             % exceptions, UserWarning)
    elif exceptions:
        pywikibot.debug('version algorithm exceptions:\n%r'
                        % exceptions, _logger)

    # NOTE(review): basestring is Python-2-only; this block appears to
    # target Python 2 — confirm before running on Python 3.
    if isinstance(date, basestring):
        datestring = date
    elif isinstance(date, time.struct_time):
        datestring = time.strftime('%Y/%m/%d, %H:%M:%S', date)
    else:
        warn('Unable to detect package date', UserWarning)
        datestring = '-2 (unknown)'

    cache = dict(tag=tag, rev=rev, date=datestring, hsh=hsh)
    return cache
def getversiondict():
    """Get version info for the package.

    @return:
        - tag (name for the repository),
        - rev (current revision identifier),
        - date (date of current revision),
        - hash (git hash for the current revision)
    @rtype: C{dict} of four C{str}
    """
    global cache
    if cache:
        return cache
    _program_dir = _get_program_dir()
    exceptions = {}

    # Try each version-detection backend in order; first success wins.
    # (Note: this variant tries 'nightly' before 'svn'.)
    for vcs_func in (getversion_git,
                     getversion_svn_setuptools,
                     getversion_nightly,
                     getversion_svn,
                     getversion_package):
        try:
            (tag, rev, date, hsh) = vcs_func(_program_dir)
        except Exception as e:
            exceptions[vcs_func] = e
        else:
            break
    else:
        # nothing worked; version unknown (but suppress exceptions)
        # the value is most likely '$Id' + '$', it means that
        # pywikibot was imported without using version control at all.
        tag, rev, date, hsh = (
            '', '-1 (unknown)', '0 (unknown)', '(unknown)')

    # git and svn can silently fail, as it may be a nightly.
    if getversion_package in exceptions:
        warn('Unable to detect version; exceptions raised:\n%r'
             % exceptions, UserWarning)
    elif exceptions:
        pywikibot.debug('version algorithm exceptions:\n%r'
                        % exceptions, _logger)

    # NOTE(review): basestring is Python-2-only; this block appears to
    # target Python 2 — confirm before running on Python 3.
    if isinstance(date, basestring):
        datestring = date
    elif isinstance(date, time.struct_time):
        datestring = time.strftime('%Y/%m/%d, %H:%M:%S', date)
    else:
        warn('Unable to detect package date', UserWarning)
        datestring = '-2 (unknown)'

    cache = dict(tag=tag, rev=rev, date=datestring, hsh=hsh)
    return cache
def __missing__(self, key):
    """Debug when the key is missing.

    Raise HiddenKeyError for keys hidden by revision deletion, KeyError
    otherwise.
    """
    pywikibot.debug(u"API log entry received:\n" + repr(self), _logger)
    action_keys = ('ns', 'title', 'pageid', 'logpage', 'params', 'action')
    if ((key in action_keys and 'actionhidden' in self)
            or (key == 'comment' and 'commenthidden' in self)
            or (key == 'user' and 'userhidden' in self)):
        raise HiddenKeyError(
            "Log entry ({0}) has a hidden '{1}' key and you don't have "
            'permission to view it.'.format(self._type, key))
    raise KeyError("Log entry (%s) has no '%s' key" % (self._type, key))
def storecookiedata(self, data):
    """
    Store cookie data.

    The argument data is the raw data, as returned by getCookie().

    Returns nothing.
    """
    # THIS IS OVERRIDDEN IN data/api.py
    filename = config.datafilepath('pywikibot.lwp')
    pywikibot.debug('Storing cookies to %s' % filename, _logger)
    with open(filename, 'w') as cookie_file:
        cookie_file.write(data)
def __iter__(self):
    """Iterator.

    Yield JSON-decoded 'message' events that pass self.streamfilter,
    reconnecting transparently on connection errors, until self._total
    items have been yielded (or forever when no limit is set).
    """
    n = 0  # number of items yielded so far
    event = None
    ignore_first_empty_warning = True
    while self._total is None or n < self._total:
        if not hasattr(self, 'source'):
            self.source = EventSource(**self.sse_kwargs)
            # sseclient >= 0.0.18 is required for eventstreams (T184713)
            # we don't have a version string inside but the instance
            # variable 'chunk_size' was newly introduced with 0.0.18
            if not hasattr(self.source, 'chunk_size'):
                warning(
                    'You may not have the right sseclient version;\n'
                    'sseclient >= 0.0.18 is required for eventstreams.\n'
                    "Install it with 'pip install \"sseclient>=0.0.18\"'")
        try:
            event = next(self.source)
        except (ProtocolError, socket.error, httplib.IncompleteRead) as e:
            warning('Connection error: {0}.\n'
                    'Try to re-establish connection.'.format(e))
            # Drop the source so the next loop iteration reconnects,
            # resuming from the last seen event id.
            del self.source
            if event is not None:
                self.sse_kwargs['last_id'] = event.id
            continue
        if event.event == 'message':
            if event.data:
                try:
                    element = json.loads(event.data)
                except ValueError as e:
                    warning(
                        'Could not load json data from\n{0}\n{1}'.format(
                            event, e))
                else:
                    if self.streamfilter(element):
                        n += 1
                        yield element
            elif not ignore_first_empty_warning:
                warning('Empty message found.')
            else:
                # The server's initial empty keep-alive is expected.
                ignore_first_empty_warning = False
        elif event.event == 'error':
            warning('Encountered error: {0}'.format(event.data))
        else:
            warning('Unknown event {0} occurred.'.format(event.event))
    else:
        # while-else: runs when the loop condition became false,
        # i.e. the item limit was reached.
        debug(
            '{0}: Stopped iterating due to '
            'exceeding item limit.'.format(self.__class__.__name__),
            _logger)
    del self.source
def __del__(self):
    """Destructor to close all connections in the pool."""
    self.lock.acquire()
    try:
        pywikibot.debug(u"Closing connection pool (%s connections)"
                        % len(self.connections), _logger)
        for key in self.connections:
            for connection in self.connections[key]:
                connection.close()
    except AttributeError:
        # this shows up when logger has been destroyed first
        pass
    finally:
        self.lock.release()
def _createFromData(self, logdata):
    """
    Check for logtype from data, and creates the correct LogEntry.

    @param logdata: log entry data
    @type logdata: dict
    @rtype: LogEntry
    @raises Error: when the entry has no 'type' key.
    """
    try:
        logtype = logdata["type"]
        return LogEntryFactory._getEntryClass(logtype)(logdata, self._site)
    except KeyError:
        # Bug fix: logdata is a dict, so concatenating it to a str with
        # '+' raised TypeError and masked the real problem; format it
        # into the message instead.
        pywikibot.debug("API log entry received:\n{0}".format(logdata),
                        _logger)
        raise Error("Log entry has no 'type' key")
def fix_redirect(self, gid, old, new):
    """Update the MusicBrainz URL entity *gid* from *old* to *new*.

    :param gid str:
    :param old str:
    :param new str:
    """
    # In simulation mode only report what would have happened.
    if wp.config.simulate:
        wp.output("Simulation, not fixing the redirect from %s to %s"
                  % (old, new))
        return
    # Without a MusicBrainz client there is nothing to edit.
    if self.client is None:
        return
    wp.debug("Fixing the redirect from %s to %s" % (old, new), layer="")
    edit_note = self.edit_note % (old, new)
    self.client.edit_url(gid, old, new, edit_note)
def set_maximum_items(self, value: int):
    """
    Set the maximum number of items to be retrieved from the stream.

    If not called, most queries will continue as long as there is
    more data to be retrieved from the stream.

    @param value: The value of maximum number of items to be retrieved
        in total to set.
    """
    if value is None:
        # No limit requested; leave the current setting untouched.
        return
    self._total = int(value)
    debug('{}: Set limit (maximum_items) to {}.'
          .format(self.__class__.__name__, self._total), _logger)
def storecookiedata(self, data):
    """
    Store cookie data.

    The argument data is the raw data, as returned by getCookie().

    Returns nothing.
    """
    # THIS IS OVERRIDDEN IN data/api.py
    filename = config.datafilepath("pywikibot.lwp")
    pywikibot.debug(u"Storing cookies to %s" % filename, _logger)
    # Context manager guarantees the file handle is closed even if
    # write() raises, unlike the previous open/write/close sequence.
    with open(filename, "w") as f:
        f.write(data)
def _createFromData(self, logdata: dict):
    """Instantiate the LogEntry subclass matching *logdata*'s type.

    @param logdata: log entry data
    @rtype: LogEntry
    """
    try:
        entry_type = logdata['type']
    except KeyError:
        # Dump the offending entry for debugging before failing.
        pywikibot.debug('API log entry received:\n{0}'.format(logdata),
                        _logger)
        raise Error("Log entry has no 'type' key")
    entry_class = LogEntryFactory.get_entry_class(entry_type)
    return entry_class(logdata, self._site)
def __del__(self):
    """Destructor to close all connections in the pool."""
    with self.lock:
        try:
            pywikibot.debug(u"Closing connection pool (%s connections)"
                            % len(self.connections), _logger)
            for pool in self.connections.values():
                for connection in pool:
                    connection.close()
        except (AttributeError, TypeError):
            # Shows up when the logger (or other globals) have been
            # destroyed first during interpreter shutdown.
            pass
def run(self):
    """Process queued requests until a shutdown sentinel arrives."""
    # The Queue item is expected to be either an HttpRequest object
    # or None (to shut down the thread).
    pywikibot.debug(u"Thread started, waiting for requests.", _logger)
    # 'while True' without redundant parentheses, consistent with the
    # sibling implementation of this loop.
    while True:
        item = self.queue.get()
        if item is None:
            pywikibot.debug(u"Shutting down thread.", _logger)
            return
        try:
            item.data = self.http.request(*item.args, **item.kwargs)
        finally:
            # Always release the caller's lock, even if the request
            # raised, so the waiter is not blocked forever.
            if item.lock:
                item.lock.release()
def _create_from_data(self, logdata: Dict[str, Any]) -> LogEntry:
    """Build the LogEntry subclass appropriate for *logdata*.

    :param logdata: log entry data
    """
    try:
        entry_type = logdata['type']
    except KeyError:
        # Log the malformed entry before raising the project error.
        pywikibot.debug('API log entry received:\n{}'.format(logdata),
                        _logger)
        raise Error("Log entry has no 'type' key")
    return LogEntryFactory.get_entry_class(entry_type)(logdata, self._site)
def run(self):
    """Service the request queue until told to shut down."""
    # Each queue item is either an HttpRequest or None (the shutdown
    # sentinel).
    pywikibot.debug(u"Thread started, waiting for requests.", _logger)
    while True:
        request = self.queue.get()
        if request is None:
            pywikibot.debug(u"Shutting down thread.", _logger)
            return
        try:
            request.data = self.http.request(*request.args,
                                             **request.kwargs)
        finally:
            # Release the waiter's lock no matter what happened.
            if request.lock:
                request.lock.release()
def _follow_redirect(self, uri, method, body, headers, response, content,
                     max_redirects):
    """Internal function to follow a redirect received by L{request}."""
    # Normalise the request URI; defrag_uri doubles as the cache key.
    (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(
        httplib2.iri2uri(uri))
    if self.cache:
        cachekey = defrag_uri
    else:
        cachekey = None
    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    if "location" not in response and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)
    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if "location" in response:
        location = response['location']
        (scheme, authority, path, query, fragment) = httplib2.parse_uri(
            location)
        if authority is None:
            # No authority means a relative Location; resolve against
            # the original request URI.
            response['location'] = httplib2.urlparse.urljoin(uri, location)
            pywikibot.debug(
                u"Relative redirect: changed [%s] to [%s]"
                % (location, response['location']), _logger)
    if response.status == 301 and method in ["GET", "HEAD"]:
        # Permanent redirect: remember the target and cache the response
        # so later requests can skip the round-trip.
        response['-x-permanent-redirect-url'] = response['location']
        if "content-location" not in response:
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)
    # Conditional headers belong to the original resource, not to the
    # redirect target; strip them before re-issuing the request.
    headers.pop('if-none-match', None)
    headers.pop('if-modified-since', None)
    if "location" in response:
        location = response['location']
        # Per RFC: a 303 response to a non-GET/HEAD request is followed
        # with GET; otherwise keep the original method.
        redirect_method = (
            (response.status == 303) and
            (method not in ["GET", "HEAD"])) and "GET" or method
        return self.request(location, redirect_method, body=body,
                            headers=headers,
                            max_redirects=max_redirects - 1)
    else:
        raise httplib2.RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)
def _createFromData(self, logdata):
    """Return the LogEntry instance matching *logdata*'s 'type'.

    @param logdata: log entry data
    @type logdata: dict
    @rtype: LogEntry
    """
    try:
        kind = logdata['type']
    except KeyError:
        # Record the unexpected entry for diagnosis, then fail loudly.
        pywikibot.debug('API log entry received:\n{0}'.format(logdata),
                        _logger)
        raise Error("Log entry has no 'type' key")
    factory = LogEntryFactory.get_entry_class(kind)
    return factory(logdata, self._site)
def set_maximum_items(self, value):
    """
    Set the maximum number of items to be retrieved from the stream.

    If not called, most queries will continue as long as there is more
    data to be retrieved from the stream.

    @param value: The value of maximum number of items to be retrieved
        in total to set.
    @type value: int
    """
    # None means "unlimited"; keep the current setting untouched.
    if value is None:
        return
    self._total = int(value)
    debug('{0}: Set limit (maximum_items) to {1}.'
          .format(self.__class__.__name__, self._total), _logger)
def _follow_redirect(self, uri, method, body, headers, response, content,
                     max_redirects):
    """Internal function to follow a redirect received by L{request}."""
    # Normalise the request URI; defrag_uri doubles as the cache key.
    (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(
        httplib2.iri2uri(uri))
    if self.cache:
        cachekey = defrag_uri
    else:
        cachekey = None
    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    if "location" not in response and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)
    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if "location" in response:
        location = response['location']
        (scheme, authority, path, query, fragment) = httplib2.parse_uri(
            location)
        # PEP 8: compare with None using 'is', not '=='.
        if authority is None:
            response['location'] = httplib2.urlparse.urljoin(uri, location)
            pywikibot.debug(u"Relative redirect: changed [%s] to [%s]"
                            % (location, response['location']), _logger)
    if response.status == 301 and method in ["GET", "HEAD"]:
        # Permanent redirect: remember target and cache the response.
        response['-x-permanent-redirect-url'] = response['location']
        if "content-location" not in response:
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)
    # Conditional headers apply to the original resource only.
    headers.pop('if-none-match', None)
    headers.pop('if-modified-since', None)
    if "location" in response:
        location = response['location']
        # A 303 response to a non-GET/HEAD request is refetched with
        # GET; conditional expression replaces the old 'and/or' idiom.
        redirect_method = ("GET" if response.status == 303
                           and method not in ["GET", "HEAD"] else method)
        return self.request(location, redirect_method, body=body,
                            headers=headers,
                            max_redirects=max_redirects - 1)
    else:
        # NOTE(review): the sibling implementation raises
        # httplib2.RedirectLimit here; confirm a bare RedirectLimit is
        # actually imported at module level.
        raise RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)
def process_result(self, result):
    """Try to attach a MusicBrainz MBID claim to the Wikidata item
    behind *result*'s wiki page, logging and skipping on any failure.
    """
    # result is a (entity MBID, URL-entity MBID, wiki page) triple.
    entity_gid, url_gid, wikipage = result
    wp.debug("» {wp} https://musicbrainz.org/{entitytype}/{gid}".format(
        entitytype=self._current_entity_type.replace("_", "-"),
        wp=wikipage,
        gid=entity_gid
    ), layer="")
    # Resolve the wiki link to a Wikidata item; each failure mode is
    # logged and makes this result a no-op.
    try:
        itempage = get_wikidata_itempage_from_wikilink(wikipage)
    except wp.NoSuchSite:
        wp.warning("{page} no supported family".format(page=wikipage))
        return
    except (wp.BadTitle, wp.InvalidTitle) as e:
        wp.error("Bad or invalid title received while processing {page}".format(page=wikipage))
        wp.exception(e, tb=True)
        return
    except SkipPage as e:
        wp.warning("{page} is being skipped because: {reason}".format(page=wikipage, reason=e))
        return
    except IsRedirectPage as e:
        # Redirects can be repaired on the MusicBrainz side when we are
        # allowed to edit.
        wp.debug("{page} is a redirect".format(page=wikipage), layer="")
        if self.can_edit:
            self.fix_redirect(url_gid, e.old, e.new)
        return
    except ValueError as e:
        wp.output(e)
        return
    if itempage is None:
        wp.debug(u"There's no wikidata page for {mbid}".format(mbid=entity_gid), layer="")
        return
    # Skip items that already carry this property with the same MBID
    # (property key comparison is case-insensitive).
    if any((key.lower() == self.property_id.lower()
            and claim.target == entity_gid)
           for key, claims in itempage.claims.items()
           for claim in claims):
        wp.debug(u"{page} already has property {pid} with value {mbid}".
                 format(page=wikipage, mbid=entity_gid,
                        pid=self.property_id),
                 layer="")
        self.donefunc(entity_gid)
        return
    wp.debug("{mbid} is not linked in Wikidata".format(
        mbid=entity_gid), layer="")
    self.add_mbid_claim_to_item(itempage, entity_gid)
def set_query_increment(self, value):
    """Set the maximum number of items to be retrieved per API query.

    If not called, the default is to ask for "max" items and let the
    API decide how many to send.
    """
    requested = int(value)
    # Never exceed the maximum the API itself allows.
    self.query_limit = (requested if self.api_limit is None
                        else min(self.api_limit, requested))
    pywikibot.debug(
        u"%s: Set query_limit to %i."
        % (self.__class__.__name__, self.query_limit), _logger)
def update_limit(self):
    """Set query limit for self.module based on api response."""
    for mod in self.module.split('|'):
        for param in self._modules[mod].get("parameters", []):
            if param["name"] == "limit":
                # Logged-in users with the 'apihighlimits' right may
                # request more items per query.
                if self.site.logged_in() and self.site.has_right(
                        'apihighlimits'):
                    self.api_limit = int(param["highmax"])
                else:
                    self.api_limit = int(param["max"])
                if self.prefix is None:
                    self.prefix = self._modules[mod]["prefix"]
                pywikibot.debug(
                    u"%s: Set query_limit to %i."
                    % (self.__class__.__name__, self.api_limit),
                    _logger)
                # Stop at the first module that defines a limit.
                return
def update_limit(self):
    """Set query limit for self.module based on api response."""
    for mod in self.module.split('|'):
        for param in self._modules[mod].get("parameters", []):
            if param["name"] == "limit":
                # High-limit right permits the larger per-query maximum.
                if self.site.logged_in() and self.site.has_right('apihighlimits'):
                    self.api_limit = int(param["highmax"])
                else:
                    self.api_limit = int(param["max"])
                if self.prefix is None:
                    self.prefix = self._modules[mod]["prefix"]
                pywikibot.debug(
                    u"%s: Set query_limit to %i."
                    % (self.__class__.__name__, self.api_limit),
                    _logger)
                # First module defining a limit wins.
                return
def getversiondict():
    """Get version info for the package.

    :return:
        - tag (name for the repository),
        - rev (current revision identifier),
        - date (date of current revision),
        - hash (git hash for the current revision)
    :rtype: ``dict`` of four ``str``
    """
    program_dir = _get_program_dir()
    failures = {}
    detectors = (getversion_git, getversion_svn, getversion_nightly,
                 getversion_package)
    # Try each detector in order; the first success wins (for-else).
    for detect in detectors:
        try:
            tag, rev, date, hsh = detect(program_dir)
        except Exception as e:
            failures[detect] = e
        else:
            break
    else:
        # No detector succeeded: emit placeholder values and suppress
        # the collected exceptions (pywikibot was most likely imported
        # without any version control information at all).
        tag = ''
        rev = '-1 (unknown)'
        date = '0 (unknown)'
        hsh = '(unknown)'
        warn(
            'Unable to detect version; exceptions raised:\n{!r}'.format(
                failures), UserWarning)
        failures = None

    # Git and SVN can silently fail, as it may be a nightly.
    if failures:
        pywikibot.debug(
            'version algorithm exceptions:\n{!r}'.format(failures),
            _logger)

    if isinstance(date, str):
        datestring = date
    elif isinstance(date, time.struct_time):
        datestring = time.strftime('%Y/%m/%d, %H:%M:%S', date)
    else:
        warn('Unable to detect package date', UserWarning)
        datestring = '-2 (unknown)'

    return {'tag': tag, 'rev': rev, 'date': datestring, 'hsh': hsh}
def __missing__(self, key):
    """Debug when the key is missing.

    HiddenKeyError is raised when the user does not have permission.
    KeyError is raised otherwise. It also logs debugging information
    when a key is missing.
    """
    pywikibot.debug('API log entry received:\n' + repr(self), _logger)
    # Keys that the wiki suppresses when the action itself is hidden.
    hidden = {'action', 'logpage', 'ns', 'pageid', 'params', 'title'}
    suppressed = (
        ('actionhidden' in self and key in hidden)
        or ('commenthidden' in self and key == 'comment')
        or ('userhidden' in self and key == 'user'))
    if suppressed:
        raise HiddenKeyError(
            "Log entry ({}) has a hidden '{}' key and you don't have "
            'permission to view it.'.format(self['type'], key))
    raise KeyError("Log entry ({}) has no '{}' key".format(
        self['type'], key))
def is_wikisource_author_page(self, title):
    """Initialise author_ns if site family is 'wikisource' else pass.

    @param title: page title to test
    @return: True if *title* is in the author namespace, None otherwise
    """
    if self.site.family.name != 'wikisource':
        return
    author_ns = 0
    try:
        author_ns = self.site.family.authornamespaces[self.site.lang][0]
    except (AttributeError, KeyError):
        # The family may not define author namespaces for this language.
        pass
    if author_ns:
        author_ns_prefix = self.site.namespace(author_ns)
        pywikibot.debug('Author ns: {0}; name: {1}'
                        .format(author_ns, author_ns_prefix), _logger)
        # startswith is the idiomatic (and non-scanning) form of
        # 'find(...) == 0'.
        if title.startswith(author_ns_prefix + ':'):
            author_page_name = title[len(author_ns_prefix) + 1:]
            verbose_output('Found author ' + author_page_name)
            return True
def pop_connection(self, identifier):
    """Get a connection from identifier's connection pool.

    @param identifier: The pool identifier
    @return: A connection object if found, None otherwise
    """
    # 'with' guarantees the lock is released even on an exception,
    # replacing manual acquire/try/finally.
    with self.lock:
        # Single lookup; an absent key or an empty pool both yield a
        # falsy value (replaces 'in' check plus 'len(...) > 0').
        pool = self.connections.get(identifier)
        if pool:
            pywikibot.debug(u"Retrieved connection from '%s' pool."
                            % identifier, _logger)
            return pool.pop()
        return None
def run(self):
    """Serve queued HTTP requests until a shutdown sentinel arrives."""
    # Queue items are HttpRequest objects, or None to stop the thread.
    pywikibot.debug(u"Thread started, waiting for requests.", _logger)
    while True:
        request = self.queue.get()
        if request is None:
            pywikibot.debug(u"Shutting down thread.", _logger)
            return
        # This needs to be set per request, however it is only used
        # the first time the pooled connection is created.
        self.http.disable_ssl_certificate_validation = \
            request.kwargs.pop('disable_ssl_certificate_validation',
                               False)
        try:
            request.data = self.http.request(*request.args,
                                             **request.kwargs)
        finally:
            # Release the waiter's lock regardless of the outcome.
            if request.lock:
                request.lock.release()
def is_wikisource_author_page(self, title):
    """Initialise author_ns if site family is 'wikisource' else pass.

    @param title: page title to test
    @return: True if *title* is in the author namespace, None otherwise
    """
    if self.site.family.name != 'wikisource':
        return
    author_ns = 0
    try:
        author_ns = self.site.family.authornamespaces[self.site.lang][0]
    except (AttributeError, KeyError):
        # Narrowed from a bare 'except:' which hid real errors
        # (including KeyboardInterrupt); only the expected missing
        # attribute/key cases are ignored, matching the sibling
        # implementation.
        pass
    if author_ns:
        author_ns_prefix = self.site.namespace(author_ns)
        pywikibot.debug(u'Author ns: %d; name: %s'
                        % (author_ns, author_ns_prefix), _logger)
        # startswith is the idiomatic form of 'find(...) == 0'.
        if title.startswith(author_ns_prefix + ':'):
            if pywikibot.config.verbose_output:
                author_page_name = title[len(author_ns_prefix) + 1:]
                pywikibot.output(u'Found author %s' % author_page_name)
            return True
def end_removed(self, rel_id, link_type_id, entity_gid, url_gid,
                entitytype, wikipage):
    """Remove the relationship pointing at a no-longer-existing page.

    :param rel_id str:
    :param link_type_id str:
    :param entity_gid str:
    :param url_gid str:
    :param entitytype str:
    :param wikipage str:
    """
    url_entity = {'type': 'url', 'gid': url_gid, 'url': wikipage}
    other_entity = {'type': entitytype, 'gid': entity_gid}
    # Endpoint order depends on how entitytype sorts relative to 'url'.
    if entitytype < 'url':
        entity0, entity1 = other_entity, url_entity
    else:
        entity0, entity1 = url_entity, other_entity
    wp.debug("Removing non existing page %s" % (wikipage), layer="")
    self.client.edit_relationship(rel_id, entity0, entity1, link_type_id,
                                  {}, {}, {}, True,
                                  self.removed_edit_note % (wikipage),
                                  False)
    self._performed_edit()
def checkMultiplicity(self):
    """Count running processes for site and set process_multiplicity."""
    global pid
    mysite = self.mysite
    pywikibot.debug('Checking multiplicity: pid = {pid}'.format(pid=pid),
                    _logger)
    with self.lock:
        processes = []
        used_pids = set()
        # This process always counts as one.
        count = 1
        now = time.time()
        for proc in self._read_file(raise_exc=True):
            used_pids.add(proc.pid)
            if now - proc.time > self.releasepid:
                continue  # process has expired, drop from file
            # Count other recent processes throttling the same site.
            if now - proc.time <= self.dropdelay \
               and proc.site == mysite \
               and proc.pid != pid:
                count += 1
            # Keep every entry except our own (it is re-appended below
            # with a fresh timestamp).
            if proc.site != self.mysite or proc.pid != pid:
                processes.append(proc)
        # Lazily pick the smallest positive pid not already in use.
        free_pid = (i for i in itertools.count(start=1)
                    if i not in used_pids)
        if not pid:
            pid = next(free_pid)
        self.checktime = time.time()
        processes.append(
            ProcEntry(module_id=self._module_hash(), pid=pid,
                      time=self.checktime, site=mysite))
        self.modules = Counter(p.module_id for p in processes)
        self._write_file(sorted(processes, key=lambda p: p.pid))
        self.process_multiplicity = count
        pywikibot.log(
            'Found {} {} processes running, including this one.'.format(
                count, mysite))