# -*- coding: utf-8 -*-
# Imports assumed by the snippets below. They come from compat-era pywikibot
# scripts, where the core module was still called "wikipedia"; some snippets
# already use the newer "pywikibot" name. Helpers such as familiesDict,
# exceptions, family_check, removeDisabledParts, check_namespaces and
# testSite are defined elsewhere in their respective scripts.
import codecs
import re
import time

import wikipedia
import pywikibot
import config  # compat-era user configuration module


def update_family(families):
    if not families:
        families = familiesDict.keys()
    for family in families:
        wikipedia.output('Checking family %s:' % family)

        original = wikipedia.Family(family).languages_by_size
        obsolete = wikipedia.Family(family).obsolete

        url = 'http://s23.org/wikistats/%s' % familiesDict[family]
        uo = wikipedia.MyURLopener
        f = uo.open(url)
        text = f.read()

        if family == 'wikipedia':
            p = re.compile(
                r"\[\[:([a-z\-]{2,}):\|\1\]\].*?'''([0-9,]{1,})'''</span>\]",
                re.DOTALL)
        else:
            p = re.compile(
                r"\[http://([a-z\-]{2,}).%s.org/wiki/ \1].*?"
                r"'''([0-9,]{1,})'''\]" % family, re.DOTALL)

        new = []
        for lang, cnt in p.findall(text):
            if lang in obsolete or lang in exceptions:
                # Ignore this language
                continue
            new.append(lang)

        if original == new:
            wikipedia.output(u'The lists match!')
        else:
            wikipedia.output(u"The lists don't match, the new list is:")
            text = u'        self.languages_by_size = [\r\n'
            line = ' ' * 11
            for lang in new:
                if len(line) + len(lang) <= 76:
                    line += u" '%s'," % lang
                else:
                    text += u'%s\r\n' % line
                    line = ' ' * 11
                    line += u" '%s'," % lang
            text += u'%s\r\n' % line
            text += u'        ]'
            wikipedia.output(text)

            family_file_name = '../families/%s_family.py' % family
            family_file = codecs.open(family_file_name, 'r', 'utf8')
            old_text = family_text = family_file.read()
            family_file.close()
            old = re.findall(ur'(?msu)^ {8}self.languages_by_size.+?\]',
                             family_text)[0]
            family_text = family_text.replace(old, text)
            family_file = codecs.open(family_file_name, 'w', 'utf8')
            family_file.write(family_text)
            family_file.close()
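# Hedged usage sketch for update_family(). It relies on two module-level
# globals of the original script; the values shown here are illustrative
# assumptions, not the real wikistats page names:
# familiesDict = {'wikipedia': 'wikipedias_html.php'}  # family -> stats page
# exceptions = []  # language codes to skip
# update_family(['wikipedia'])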
def join_family_data(reString, namespace):
    for s in pywikibot.Family().namespaces[namespace].itervalues():
        if isinstance(s, list):
            for e in s:
                reString += '|' + e
        else:
            reString += '|' + s
    return r'\s*(' + reString + r')\s*'
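# Hedged usage sketch for join_family_data(): build a regex that matches any
# localised alias of a namespace, allowing surrounding whitespace. Namespace
# id 6 (File) and the 'File' seed string are illustrative assumptions.
fileR = re.compile(join_family_data('File', 6))
print fileR.pattern  # e.g. \s*(File|Image|Bild|...)\s*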
def check_and_update(families, update_main=False):
    # Note: the update_family() used here comes from a different script than
    # the one above and takes (family, result, update_main).
    for family in families:
        family = wikipedia.Family(family)
        result = family_check.check_family(family)
        update_family(family, result, update_main)
        if update_main:
            # Also update the main family.py file
            update_family(None, result, update_main)
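# Hedged usage sketch (family names illustrative):
# check_and_update(['wikipedia', 'wiktionary'], update_main=True)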
def getLanguageLinks(text, insite=None, pageLink="[[]]",
                     template_subpage=False):
    """Return a dict of interlanguage links found in text.

    The dict uses language codes as keys and Page objects as values.
    Do not call this routine directly; use the Page.interwiki() method
    instead.

    """
    if insite is None:
        insite = pywikibot.getSite()
    fam = insite.family
    # When interwiki links forward to another family, retrieve the pages and
    # other info there
    if fam.interwiki_forward:
        fam = pywikibot.Family(fam.interwiki_forward)
    result = {}
    # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
    # and HTML comments
    tags = ['comments', 'nowiki', 'pre', 'source']
    if not template_subpage:
        tags += ['includeonly']
    text = removeDisabledParts(text, tags)

    # This regular expression will find every link that is possibly an
    # interwiki link.
    # NOTE: language codes are case-insensitive and only consist of basic
    # Latin letters and hyphens.
    # TODO: currently we do not have any, but BCP 47 also allows digits and
    # underscores.
    # TODO: there is no semantic difference between hyphens and
    # underscores -> fold them.
    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    for lang, pagetitle in interwikiR.findall(text):
        lang = lang.lower()
        # Check whether it really is an interwiki link to a known language,
        # or e.g. a category tag or an internal link
        if lang in fam.obsolete:
            lang = fam.obsolete[lang]
        if lang in fam.langs.keys():
            if '|' in pagetitle:
                # ignore text after the pipe
                pagetitle = pagetitle[:pagetitle.index('|')]
            # we want the actual page objects rather than the titles
            site = pywikibot.getSite(code=lang, fam=fam)
            try:
                result[site] = pywikibot.Page(site, pagetitle, insite=insite)
            except pywikibot.InvalidTitle:
                pywikibot.output(u'[getLanguageLinks] Text contains invalid '
                                 u'interwiki link [[%s:%s]].'
                                 % (lang, pagetitle))
                continue
    return result
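# Hedged usage sketch for getLanguageLinks(); the sample wikitext is an
# assumption. The category tag is filtered out because 'category' is not a
# known language code, and the text after the pipe is ignored.
sample = u'Some text. [[de:Beispiel]] [[fr:Exemple|label]] [[Category:Demo]]'
for site, page in getLanguageLinks(sample).iteritems():
    pywikibot.output(u'%s -> %s' % (site, page.title()))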
def get(self, site, type, key=None, default=None):
    # This could probably also provide something for localised settings,
    # but then it first needs to check whether the page is sysop-only.
    if not key:
        key = str(site)
    self.lock.acquire()
    try:
        if type not in self.summaries:
            self.summaries[type] = {}
        if key in self.summaries[type]:
            if (time.time() - self.summaries[type][key][1]) < \
                    self.CommonsDelinker.config['summary_cache']:
                # Return the cached result
                return self.summaries[type][key][0]

        output(u'%s Fetching new summary for %s' % (self, site))
        # FIXME: evil
        if self.CommonsDelinker.config['global']:
            self.check_user_page(site)

        page = wikipedia.Page(site, '%s%s' % (
            self.CommonsDelinker.config['local_settings'], type))
        try:
            # Fetch the summary template, following redirects
            i18n = page.get(get_redirect=True)
            self.summaries[type][key] = (i18n, time.time())
            return i18n
        except wikipedia.NoPage:
            pass
    finally:
        self.lock.release()

    # No i18n available, but it may be available on the Wikipedia of that
    # language. Only do so for wiktionary, wikibooks, wikiquote, wikisource,
    # wikinews and wikiversity. This allows the bot to function even on
    # special wikis like mediawiki.org, meta and species.
    output(u'%s Using default summary for %s' % (self, site))

    if default:
        return default
    if site.family.name != 'wikipedia' and \
            self.CommonsDelinker.config['global']:
        if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
                                'wikisource', 'wikinews', 'wikiversity'):
            if site.lang in config.usernames['wikipedia']:
                newsite = self.CommonsDelinker.get_site(
                    site.lang, wikipedia.Family('wikipedia'))
                return self.get(newsite, type, key=key)
    return self.CommonsDelinker.config['default_settings'].get(type, '')
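# Hedged usage sketch for the get() method above ("summary_cache" stands for
# an assumed, fully initialised instance of the enclosing class and 'delink'
# for an assumed settings-page suffix; neither name comes from the snippet):
# summary = summary_cache.get(site, 'delink',
#                             default=u'Removing broken file link')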
def main():
    all = False
    language = None
    fam = None
    wikimedia = False
    for arg in pywikibot.handleArgs():
        if arg == '-all':
            all = True
        elif arg[0:7] == '-langs:':
            language = arg[7:]
        elif arg[0:10] == '-families:':
            fam = arg[10:]
        elif arg == '-wikimedia':
            wikimedia = True

    mySite = pywikibot.getSite()
    if wikimedia:
        families = ['commons', 'incubator', 'mediawiki', 'meta', 'species',
                    'test', 'wikibooks', 'wikidata', 'wikinews', 'wikipedia',
                    'wikiquote', 'wikisource', 'wikiversity', 'wikivoyage',
                    'wiktionary']
    elif fam is not None:
        families = fam.split(',')
    else:
        families = [mySite.family.name]

    for family in families:
        try:
            fam = pywikibot.Family(family)
        except ValueError:
            pywikibot.output(u'No such family %s' % family)
            continue
        if all:
            for lang in fam.langs.iterkeys():
                testSite(pywikibot.getSite(lang, family))
        elif language is None:
            lang = mySite.lang
            if lang not in fam.langs.keys():
                lang = fam.langs.keys()[-1]
            testSite(pywikibot.getSite(lang, family))
        else:
            languages = language.split(',')
            for lang in languages:
                try:
                    testSite(pywikibot.getSite(lang, family))
                except pywikibot.NoSuchSite:
                    pywikibot.output(u'No such language %s in family %s'
                                     % (lang, family))
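# Standard compat-style entry point (assumed here; testSite() is defined
# elsewhere in this script):
if __name__ == '__main__':
    try:
        main()
    finally:
        pywikibot.stopme()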
def __init__(self, limit=100, mysql_default_server=3,
             mysql_host_prefix='sql-s', mysql_host_suffix='',
             mysql_kwargs={}, no_db=False, use_autoconn=False,
             http_retry_timeout=30, http_max_retries=-1,
             http_callback=lambda *args: None,
             mysql_retry_timeout=60, mysql_max_retries=-1,
             mysql_callback=lambda *args: None):
    self.http = None
    self.http_retry_timeout = http_retry_timeout
    self.http_max_retries = http_max_retries
    self.http_callback = http_callback

    if no_db:
        return

    self.mysql_host_prefix = mysql_host_prefix
    self.mysql_kwargs = mysql_kwargs.copy()  # To be safe
    if 'host' in self.mysql_kwargs:
        del self.mysql_kwargs['host']
    self.use_autoconn = use_autoconn
    self.mysql_retry_timeout = mysql_retry_timeout
    self.mysql_max_retries = mysql_max_retries
    self.mysql_callback = mysql_callback

    self.connections = []
    # Mapping database name -> mysql connection
    self.databases = {}
    # Mapping server id -> mysql connection
    self.servers = {}
    # Mapping database name -> (lang, family)
    self.sites = {}
    self.domains = {}

    self.unknown_families = []
    # Mapping family name -> family object
    self.known_families = {}

    database, cursor = self.connect_mysql(
        mysql_host_prefix + str(mysql_default_server))
    self.servers[mysql_default_server] = (database, cursor)

    # Find out where the databases are located
    cursor.execute('SELECT dbname, domain, server FROM toolserver.wiki '
                   'ORDER BY size DESC LIMIT %s', (limit, ))
    for dbname, domain, server in cursor.fetchall():
        if server not in self.servers:
            self.servers[server] = self.connect_mysql(
                mysql_host_prefix + str(server) + mysql_host_suffix)

        # FIXME: wikimediafoundation!
        # TODO: This is one big mess
        try:
            lang, fam = family(domain)
            if fam not in self.known_families:
                self.known_families[fam] = wikipedia.Family(fam, fatal=False)
        except (RuntimeError, ValueError, SyntaxError):
            self.unknown_families.append(domain)
        else:
            self.sites[dbname] = (lang, fam)

        self.databases[dbname] = self.servers[server]
        self.domains[dbname] = domain
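# Hedged construction sketch for the __init__ above (assumes a
# Toolserver-like environment where the sql-s* MySQL servers are reachable;
# the class name and the kwargs shown are illustrative):
# checker = CheckUsage(limit=50,
#                      mysql_kwargs={'read_default_file': '~/.my.cnf'})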
        # (tail of check_family(family); the enclosing loop iterates over
        # the family's language codes)
        site = wikipedia.getSite(lang, family)
        wikipedia.output(u'Checking %s' % site)
        namespaces = check_namespaces(site)
        if namespaces:
            for id, name, defined_namespace in namespaces:
                try:
                    msg = (u'Namespace %s for %s is '
                           + (u'[%s]. ' if len(name) > 1 else u'%s. ')
                           + (u'[%s]' if len(defined_namespace) > 1
                              else u'%s')
                           + u' is defined in the family file.')
                    wikipedia.output(msg % (id, site, ', '.join(name),
                                            ', '.join(defined_namespace)))
                except Exception:
                    pass
        result[lang] = namespaces
    return result


if __name__ == '__main__':
    try:
        wikipedia.handleArgs()
        family = wikipedia.Family(wikipedia.default_family)
        result = check_family(family)
        wikipedia.output(u'\nWriting raw Python dictionary to stdout.')
        wikipedia.output(u'Format is: (namespace_id, namespace_name, '
                         u'predefined_namespace)')
        print
        print result
    finally:
        wikipedia.stopme()