def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
    text = pywikibot.removeDisabledParts(text)

    # MediaWiki parses templates before parsing external links. Thus, there
    # might be a | or a } directly after a URL which does not belong to
    # the URL itself.

    # First, remove the curly braces of inner templates:
    nestedTemplateR = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    while nestedTemplateR.search(text):
        text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)

    # Then blow up the templates with spaces so that the | and }} will not
    # be regarded as part of the link:
    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                     re.DOTALL)
    while templateWithParamsR.search(text):
        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)

    # Add <blank> at the end of a template
    # URL as last param of multiline template would not be correct
    text = text.replace('}}', ' }}')

    # Remove HTML comments in URLs as well as URLs in HTML comments.
    # Also remove text inside nowiki links etc.
    text = pywikibot.removeDisabledParts(text)
    linkR = pywikibot.compileLinkR(withoutBracketed, onlyBracketed)
    for m in linkR.finditer(text):
        if m.group('url'):
            yield m.group('url')
        else:
            yield m.group('urlb')
def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
    text = pywikibot.removeDisabledParts(text)

    # MediaWiki parses templates before parsing external links. Thus, there
    # might be a | or a } directly after a URL which does not belong to
    # the URL itself.

    # First, remove the curly braces of inner templates:
    nestedTemplateR = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    while nestedTemplateR.search(text):
        text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)

    # Then blow up the templates with spaces so that the | and }} will not
    # be regarded as part of the link:
    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                     re.DOTALL)
    while templateWithParamsR.search(text):
        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)

    linkR = pywikibot.compileLinkR(withoutBracketed, onlyBracketed)

    # Remove HTML comments in URLs as well as URLs in HTML comments.
    # Also remove text inside nowiki links etc.
    text = pywikibot.removeDisabledParts(text)

    for m in linkR.finditer(text):
        if m.group('url'):
            yield m.group('url')
        else:
            yield m.group('urlb')
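A minimal usage sketch for the weblinksIn() generators above, assuming the compat-era pywikibot API they rely on (removeDisabledParts() and compileLinkR() as module-level helpers); the page title is only illustrative.

# Hedged usage sketch: compat-era pywikibot assumed, page title illustrative.
import pywikibot

site = pywikibot.getSite()
page = pywikibot.Page(site, u'Sandbox')
for url in weblinksIn(page.get(), onlyBracketed=True):
    pywikibot.output(u'Found bracketed link: %s' % url)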
def get(self, force=False, fetch_text=True, cache=True, *args):
    # Realistically no one even wants the property info, and datatype is
    # its own function.
    # Cache controls only saving as cache, not fetching from it
    if fetch_text:
        return_this = super(pywikibot.PropertyPage, self).get(force, *args)  # Do it cuz
    else:
        return_this = {}
    # Check that we don't already have it stored
    if not force and hasattr(self, '_constraints'):
        return return_this
    talk = self.toggleTalkPage()
    if not talk.exists():
        text = ''
    else:
        g = mc.get(self.md5())
        if g is not None:
            self._constraints = ast.literal_eval(g)
            return return_this
        else:
            text = talk.get()
    code = mwparserfromhell.parse(text)
    d = {}
    for temp in code.filter_templates(recursive=False):
        if temp.name.lower().startswith('constraint:'):
            nm = temp.name.lower()[11:]
            nm = normalize(nm)
            if nm == 'format':
                value = unicode(temp.get('pattern').value)
                d[nm] = pywikibot.removeDisabledParts(value, tags=['nowiki'])
            elif nm in ['target', 'item']:
                d[nm] = {'property': unicode(temp.get('property').value)}
                if temp.has_param('item'):
                    d[nm]['item'] = unicode(temp.get('item').value)
            elif nm == 'oneof':
                values = unicode(temp.get('values').value)
                values = pywikibot.removeDisabledParts(values, tags=['comments'])
                values = values.replace('{{Q|', '').replace('{{q|', '').replace('}}', '')
                values = values.split(', ')
                d[nm] = list()
                for v in values:
                    d[nm].append('Q' + v)
            elif nm == 'reciprocal':
                d[nm] = unicode(temp.get('property').value)
            else:
                d[nm] = ''  # Just set a key like the API does
    self._constraints = d
    if cache:
        mc.set(self.md5(), self._constraints, expiry)
    return return_this
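The tags= keyword used above restricts which disabled regions are stripped; a small illustration, assuming the compat/textlib behaviour of removeDisabledParts() where 'comments' and 'nowiki' are among the supported tag names.

# Strip only one kind of disabled content instead of everything.
import pywikibot

wikitext = u'42<!-- unsourced --> and <nowiki>{{raw}}</nowiki>'
print pywikibot.removeDisabledParts(wikitext, tags=['comments'])  # comment removed, <nowiki> kept
print pywikibot.removeDisabledParts(wikitext, tags=['nowiki'])    # <nowiki> block removed, comment kept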
def determineClass(self, code, page):
    if page.toggleTalkPage().isRedirectPage():
        return 'redirect'
    if page.namespace() == 101:
        return 'portal'
    elif page.namespace() == 15:
        return 'category'
    elif page.namespace() == 11:
        return 'template'
    if self.level == 'simple':
        return None
    found = list()
    stub = False
    code = mwparserfromhell.parse(pywikibot.removeDisabledParts(unicode(code)))  # wtf
    for template in code.filter_templates(recursive=True):
        if template.has_param('class'):
            found.append(template.get('class').value.strip())
        if (template.name.lower() in self.stub_templates) and (not stub):
            stub = True  # check for auto=stub
    if not found:
        if stub:
            return 'stub'
        return None
    if (self.level == 'conservative') and (len(found) == 1):
        if stub:
            return 'stub'
        return None
    if found.count(found[0]) == len(found):  # verifies that all values are equal
        return found[0]
    if self.level in ['inherit', 'conservative']:
        if stub:
            return 'stub'
        return None
    # can only be 'liberal'
    d = {}
    for value in found:
        value = value.lower().strip()
        if not d.has_key(value):
            d[value] = 1
        else:
            d[value] += 1
    #top = d.keys()[d.values().index(max(d.values()))]
    sorted_d = sorted(d.iteritems(), key=operator.itemgetter(1), reverse=True)
    top = sorted_d[0][1]
    top_value = sorted_d[0][0]
    key = 1
    print sorted_d
    if len(sorted_d) == 1:
        return top_value
    while top == sorted_d[key][1]:
        if self.valueClass(top_value) <= self.valueClass(sorted_d[key][0]):
            top_value = sorted_d[key][0]
        key += 1
        if len(sorted_d) >= key:
            break
    return top_value
def do_page(article):
    pg = pywikibot.Page(site, article)
    if not pg.exists():
        return
    while pg.isRedirectPage():
        pg = pg.getRedirectTarget()
    if pg.namespace() != 2:
        print 'Skipping %s.' % pg.title()
        return
    text = pg.get()
    text = pywikibot.removeDisabledParts(text)
    print '--------%s---------' % pg.title()
    print text[:150]
    print '-------------------'
    x = raw_input('What should the title be? ')
    if x == 's':
        print 'Skipping.'
        return
    elif x == 'o':
        webbrowser.open('http://enwp.org/%s' % pg.title())
        return
    new_title = 'Wikipedia talk:Articles for creation/' + x.strip()
    reason = 'Preferred location for [[WP:AFC|AfC]] submissions'
    new_pg = pywikibot.Page(site, new_title)
    if new_pg.exists():
        print '%s already exists, will add a (2) to the end.' % new_pg.title()
        new_title += ' (2)'
    print 'Moving to %s' % new_title
    pg.move(new_title, reason)
def parseInstructions(page):
    """
    Parses the index template for all of the parameters
    """
    text = page.get()
    #print u'Parsing instructions for [[%s]].' % page.title()
    key = text.find('{{User:HBC Archive Indexerbot/OptIn')
    data = text[key:].split('}}')[0][36:]  # kinda scared about hardcoding so much
    # remove any comments (apparently users do this)
    cleaned = pywikibot.removeDisabledParts(data)
    info = {}
    info['mask'] = []
    info['talkpage'] = page.title()
    for param in cleaned.split('|'):
        param = clean(param)
        if param.startswith('target='):
            target = clean(param[7:])
            if target.startswith('/'):
                target = page.title() + target
            elif target.startswith('./'):
                target = page.title() + target[1:]
            info['target'] = target
        elif param.startswith('mask='):
            mask = clean(param[5:])
            if mask.startswith('/'):
                mask = page.title() + mask
            elif mask.startswith('./'):
                mask = page.title() + mask[1:]
            info['mask'].append(mask)
        elif param.startswith('indexhere='):
            value = param[10:]
            if clean(value.lower()) == 'yes':
                info['indexhere'] = True
            else:
                info['indexhere'] = False
        elif param.startswith('template='):
            info['template'] = clean(param[9:].replace('\n', ''))
        elif param.startswith('leading_zeros='):
            try:
                info['leading_zeros'] = int(clean(param[14:]))
            except ValueError:
                pass
        elif param.startswith('first_archive='):
            info['first_archive'] = clean(param[14:])
    # set default values if not already set
    for key in info.keys():
        if type(info[key]) == type(u''):
            if info[key].isspace() or (not info[key]):
                del info[key]
    if not info.has_key('leading_zeros'):
        info['leading_zeros'] = 0
    if not info.has_key('indexhere'):
        info['indexhere'] = False
    if not info.has_key('template'):
        info['template'] = 'User:HBC Archive Indexerbot/default template'
    if info['template'] == 'template location':
        info['template'] = 'User:HBC Archive Indexerbot/default template'
    return info
def __iter__(self):
    import xmlreader
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        text = pywikibot.removeDisabledParts(entry.text)
        if self.refR.search(text) and not self.referencesR.search(text):
            yield pywikibot.Page(pywikibot.getSite(), entry.title)
def lacksReferences(self, text, verbose=True):
    """
    Checks whether or not the page is lacking a references tag.
    """
    oldTextCleaned = pywikibot.removeDisabledParts(text)
    if self.referencesR.search(oldTextCleaned) or \
       self.referencesTagR.search(oldTextCleaned):
        if verbose:
            pywikibot.output(u'No changes necessary: references tag found.')
        return False
    elif self.referencesTemplates:
        templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
        if re.search(templateR, oldTextCleaned, re.IGNORECASE | re.UNICODE):
            if verbose:
                pywikibot.output(
                    u'No changes necessary: references template found.')
            return False
    if not self.refR.search(oldTextCleaned):
        if verbose:
            pywikibot.output(u'No changes necessary: no ref tags found.')
        return False
    else:
        if verbose:
            pywikibot.output(u'Found ref without references.')
        return True
def __iter__(self):
    import xmlreader
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        text = pywikibot.removeDisabledParts(entry.text)
        if self.refR.search(text) and not self.referencesR.search(text):
            yield pywikibot.Page(pywikibot.Site(), entry.title)
def normalize_usk(thingy):
    thingy = pywikibot.removeDisabledParts(thingy)
    thingy = thingy.strip()
    if thingy.isdigit():
        if int(thingy) in USK:
            item = pywikibot.ItemPage(repo, USK[int(thingy)])
            return item
def lacksReferences(self, text):
    """
    Checks whether or not the page is lacking a references tag.
    """
    oldTextCleaned = pywikibot.removeDisabledParts(text)
    if self.referencesR.search(oldTextCleaned) or \
       self.referencesTagR.search(oldTextCleaned):
        if self.verbose:
            pywikibot.output(u'No changes necessary: references tag found.')
        return False
    elif self.referencesTemplates:
        templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
        if re.search(templateR, oldTextCleaned, re.IGNORECASE | re.UNICODE):
            if self.verbose:
                pywikibot.output(
                    u'No changes necessary: references template found.')
            return False
    if not self.refR.search(oldTextCleaned):
        if self.verbose:
            pywikibot.output(u'No changes necessary: no ref tags found.')
        return False
    else:
        if self.verbose:
            pywikibot.output(u'Found ref without references.')
        return True
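Both lacksReferences() variants assume pre-compiled patterns on the bot instance; the sketch below is an illustrative guess at that setup (the regexes are assumptions, not the originals from noreferences.py).

# Assumed attribute setup for lacksReferences(); patterns are illustrative only.
import re

class ReferencesBotStub(object):
    def __init__(self):
        self.refR = re.compile(r'<ref[^>]*>', re.IGNORECASE)
        self.referencesR = re.compile(r'<references */?>', re.IGNORECASE)
        self.referencesTagR = re.compile(r'<references>.*?</references>',
                                         re.IGNORECASE | re.DOTALL)
        self.referencesTemplates = [u'Reflist', u'Refs']  # illustrative
        self.verbose = True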
def procesPage(self, page):
    """Process a single page/item."""
    item = pywikibot.DataPage(page)
    pywikibot.output('Processing %s' % page)
    if not item.exists():
        pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
        # TODO FIXME: We should provide an option to create the page
    else:
        pagetext = page.get()
        pagetext = pywikibot.removeDisabledParts(pagetext)
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # We found the template we were looking for
            if template.replace(u'_', u' ') == self.templateTitle:
                for field, value in fielddict.items():
                    # This field contains something useful for us
                    if field in self.fields:
                        # Check if the property isn't already set
                        claim = self.fields[field]
                        if claim in item.get().get('claims'):
                            pywikibot.output(
                                u'A claim for %s already exists. Skipping'
                                % (claim,))
                            # TODO FIXME: This is a very crude way of dupe
                            # checking
                        else:
                            # Try to extract a valid page
                            match = re.search(
                                re.compile(
                                    r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]'),
                                value)
                            if match:
                                try:
                                    link = match.group(1)
                                    linkedPage = pywikibot.Page(self.site, link)
                                    if linkedPage.isRedirectPage():
                                        linkedPage = linkedPage.getRedirectTarget()
                                    linkedItem = pywikibot.DataPage(linkedPage)
                                    pywikibot.output(
                                        'Adding %s --> %s'
                                        % (claim, linkedItem.getID()))
                                    refs = self.setSource(self.site().language())
                                    if refs:
                                        item.editclaim(str(claim),
                                                       linkedItem.getID(),
                                                       refs=set(refs))
                                    else:
                                        item.editclaim(str(claim),
                                                       linkedItem.getID())
                                except pywikibot.NoPage:
                                    pywikibot.output(
                                        "[[%s]] doesn't exist so I can't link to it"
                                        % linkedItem.title())
def normalize_pegi(thingy):
    # BECAUSE PEOPLE DO WEIRD THINGS!
    thingy = pywikibot.removeDisabledParts(thingy)
    thingy = thingy.replace('+', '')
    thingy = thingy.strip()
    if thingy.isdigit():
        if int(thingy) in PEGI:
            item = pywikibot.ItemPage(repo, PEGI[int(thingy)])
            return item
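The two normalisation helpers above rely on module-level globals; a hedged sketch of the assumed surrounding setup (the Q-ids are placeholders, not real Wikidata items).

# Assumed context for normalize_usk()/normalize_pegi(); mapping values are placeholders.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
repo = site.data_repository()
USK = {0: 'Q1', 6: 'Q2', 12: 'Q3', 16: 'Q4', 18: 'Q5'}
PEGI = {3: 'Q6', 7: 'Q7', 12: 'Q8', 16: 'Q9', 18: 'Q10'}

item = normalize_pegi(u'16+')  # strips the '+', looks up 16 in PEGI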
def procesPage(self, page):
    """ Proces a single page """
    item = pywikibot.DataPage(page)
    pywikibot.output('Processing %s' % page)
    if not item.exists():
        pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
        # TODO FIXME: We should provide an option to create the page
    else:
        pagetext = page.get()
        pagetext = pywikibot.removeDisabledParts(pagetext)
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # We found the template we were looking for
            if template.replace(u'_', u' ') == self.templateTitle:
                for field, value in fielddict.items():
                    # This field contains something useful for us
                    if field in self.fields:
                        # Check if the property isn't already set
                        claim = self.fields[field]
                        if claim in item.get().get('claims'):
                            pywikibot.output(
                                u'A claim for %s already exists. Skipping'
                                % (claim,))
                            # TODO FIXME: This is a very crude way of dupe
                            # checking
                        else:
                            # Try to extract a valid page
                            match = re.search(
                                re.compile(
                                    r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]'),
                                value)
                            if match:
                                try:
                                    link = match.group(1)
                                    linkedPage = pywikibot.Page(self.site, link)
                                    if linkedPage.isRedirectPage():
                                        linkedPage = linkedPage.getRedirectTarget()
                                    linkedItem = pywikibot.DataPage(linkedPage)
                                    pywikibot.output(
                                        'Adding %s --> %s'
                                        % (claim, linkedItem.getID()))
                                    if self.setSource(self.site().language()):
                                        item.editclaim(
                                            str(claim), linkedItem.getID(),
                                            refs={self.setSource(
                                                self.site().language())})
                                    else:
                                        item.editclaim(str(claim),
                                                       linkedItem.getID())
                                except pywikibot.NoPage:
                                    pywikibot.output(
                                        "[[%s]] doesn't exist so I can't link to it"
                                        % linkedItem.title())
def do_page(self, page):
    print page.title(asLink=True)
    if page.namespace() != 6:
        return
    text = page.get()
    text, gen_fix_summary = self.AWBGenFixes.do_page(text)
    code = mwparserfromhell.parse(text)
    tag = False
    log = '* '
    summary = 'Bot: Updating license tag(s) with image has rationale=yes (errors? [[User:Legobot/Stop/22|stop me]])'
    for template in code.filter_templates(recursive=True):
        name = pywikibot.removeDisabledParts(template.name.lower()).strip()
        print name
        #print self.NFURs
        #time.sleep(5)
        if name in self.NFURs:
            print name
            tag = True
    if tag:
        for template in code.filter_templates(recursive=True):
            name = pywikibot.removeDisabledParts(template.name.lower()).strip()
            if name in self.licenses:
                template.add('image has rationale', 'yes')
                log += '[[:%s]]: Adding <code>|image has rationale=yes</code>' % page.title()
    else:
        print 'Skipping ' + page.title(asLink=True)
        return
    #if gen_fix_summary:
    #    summary += ', also dating ' + gen_fix_summary
    puttext = unicode(code).lstrip('\n')
    pywikibot.showDiff(text, puttext)
    self.output(log)
    self.check_page()
    try:
        page.put(puttext, summary, nocreate=True)
    except pywikibot.exceptions.PageNotSaved:
        pass
    except pywikibot.exceptions.LockedPage:
        pass
def do_page(self, page):
    print page.title(asLink=True)
    if page.namespace() != 6:
        return
    text = page.get()
    text, gen_fix_summary = self.AWBGenFixes.do_page(text)
    code = mwparserfromhell.parse(text)
    tag = False
    log = '* '
    summary = 'Bot: Updating license tag(s) with image has rationale=yes'
    for template in code.filter_templates(recursive=True):
        name = pywikibot.removeDisabledParts(template.name.lower()).strip()
        print name
        #print self.NFURs
        #time.sleep(5)
        if name in self.NFURs:
            print name
            tag = True
    if tag:
        for template in code.filter_templates(recursive=True):
            name = pywikibot.removeDisabledParts(template.name.lower()).strip()
            if name in self.licenses:
                template.add('image has rationale', 'yes')
                log += '[[:%s]]: Adding <code>|image has rationale=yes</code>' % page.title()
    else:
        print 'Skipping ' + page.title(asLink=True)
        return
    if gen_fix_summary:
        summary += ', also dating ' + gen_fix_summary
    puttext = unicode(code).lstrip('\n')
    pywikibot.showDiff(text, puttext)
    self.output(log)
    self.check_page()
    try:
        page.put(puttext, summary, nocreate=True)
    except pywikibot.exceptions.PageNotSaved:
        pass
    except pywikibot.exceptions.LockedPage:
        pass
def rem(text):
    # delete table -- TODO: ^\{\{ or ^[\*\:\#]*\{\{
    text = lre.rmsym(r"\{\|", r"\|\}", text)
    # delete template
    text = lre.rmsym(r"\{\{", r"\}\}", text)
    text = subst.process(text)
    text = pywikibot.removeDisabledParts(text)
    text = pywikibot.removeLanguageLinks(text)
    text = pywikibot.removeCategoryLinks(text)
    text = pywikibot.removeHTMLParts(text)
    return text
def getInterwiki(page):
    text = page.text
    for linkmatch in pywikibot.link_regex.finditer(
            pywikibot.removeDisabledParts(text)):
        linktitle = linkmatch.group("title")
        link = pywikibot.Link(linktitle, page.site)
        try:
            if link.site != page.site:
                if not link.site.family.name in projects:
                    yield link
        except pywikibot.Error:
            continue
def do_page(self, text, fixes=True, date=True):
    if fixes:
        text = self.all_fixes(text)
    code = mwparserfromhell.parse(text)
    summary = {}
    for temp in code.filter_templates(recursive=True):
        name = pywikibot.removeDisabledParts(temp.name.lower()).strip()
        if name in self.redirects.keys():
            new_name = self.redirects[name]
            if new_name.lower() != name:
                # prevents from capitalizing the first letter needlessly
                temp.name = new_name
        if (temp.name.lower() in self.date_these) and date:
            if not temp.has_param('date'):
                temp.add('date', datetime.datetime.today().strftime('%B %Y'))
                if temp.name.lower() in summary.keys():
                    summary[temp.name.lower()] += 1
                else:
                    summary[temp.name.lower()] = 1
    msg = ', '.join('{{%s}} (%s)' % (item, summary[item])
                    for item in summary.keys())
    return unicode(code), msg
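A stand-alone sketch of the dating step performed by do_page() above, using only mwparserfromhell; the template name is just an example, whereas the real bot reads its list from self.date_these.

# Add |date=<current month> to a maintenance template when it is missing.
import datetime
import mwparserfromhell

text = u'Some claim.{{citation needed}}'
code = mwparserfromhell.parse(text)
for temp in code.filter_templates(recursive=True):
    if unicode(temp.name).strip().lower() == 'citation needed':
        if not temp.has_param('date'):
            temp.add('date', datetime.datetime.today().strftime('%B %Y'))
print unicode(code)  # Some claim.{{citation needed|date=<Month Year>}}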
def procesPage(self, site, page): """ Proces a single page """ item = pywikibot.ItemPage.fromPage(page) pywikibot.output('Processing %s' % page) if not item.exists(): pywikibot.output('%s doesn\'t have a wikidata item :' % page) #TODO FIXME: We should provide an option to create the page else: pagetext = page.get() pagetext = pywikibot.removeDisabledParts(pagetext) templates = pywikibot.extract_templates_and_params(pagetext) for (template, fielddict) in templates: # We found the template we were looking for linkedTemplate = pywikibot.Page(self.site, template, ns=10) if linkedTemplate.isRedirectPage(): template2 = linkedTemplate.getRedirectTarget().title() pywikibot.output(u'Template redirection from %s to %s' % (template, template2)) template = template2[9:] if template.replace(u'_', u' ') == self.templateTitle: for field, value in fielddict.items(): # This field contains something useful for us # pywikibot.output(' attribut %s' % field) if (value != "") and (field in self.fields): # Check if the property isn't already set claim = self.fields[field] if claim[2:-2] in item.get().get('claims'): pywikibot.output( u'A claim for %s already exists. Skipping' % (claim, )) # TODO FIXME: This is a very crude way of dupe # checking else: # Try to extract a valid page match = re.search( re.compile( r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]' ), value) pywikibot.output(u' cherche %s ' % value) if True: try: #Date treatement #if claim == "[[P585]]": #try: #pywikibot.output(u' Date: %s ' % value) #pywikibot.output(u' Date: %s ' % parser.parse(value)) #theClaim = pywikibot.Claim(repo, claim[2:-2]) #theClaim.setTarget(parser.parse(value)) #item.addClaim(theClaim) #except ValueError #pywikibot.output(u' Impossible to parse this date : %s ' % value) #continue #continue if value[:2] == "[[" and value[ -2:] == "]]": link = value[2:-2] else: link = value #link = match.group(1) linkedPage = pywikibot.Page( self.site, link) if linkedPage.isRedirectPage(): linkedPage = linkedPage.getRedirectTarget( ) pywikibot.output( u' linkedPage %s ' % linkedPage) linkedItem = pywikibot.ItemPage.fromPage( linkedPage) linkedItem.get() if not linkedItem.exists(): pywikibot.output( '%s doesn\'t have a wikidata item :' % linkedPage) continue #value constraints treatement if (claim in self.valuesConstraints ) and (linkedItem.getID() not in self. valuesConstraints[claim]): pywikibot.output( u'The value of the property %s is %s does not respect the constraint %s' % (claim, linkedItem.title(), self.valuesConstraints[claim]) ) continue #pywikibot.output(u' linkedItem %s ' % linkedItem) pywikibot.output( u' linkedItem.getID() %s ' % linkedItem.title()[1:]) pywikibot.output( 'Adding %s --> %s' % (claim, linkedItem.getID())) repo = site.data_repository( ) # utile self.repo existe ? theClaim = pywikibot.Claim( repo, claim[2:-2]) theClaim.setTarget(linkedItem) item.addClaim(theClaim) if self.source: theClaim.addSource(self.source, bot=True) except pywikibot.NoPage: pywikibot.output( "[[%s]] doesn't exist so I can't link to it" % linkedItem.title()) except pywikibot.exceptions.InvalidTitle: pywikibot.output( "[[%s]] is an invalid title" % link)
def run(self): """ Run the Bot """ try: deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read() except IOError: pywikibot.output( 'You need to download ' 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz ' 'and to ungzip it in the same directory') raise socket.setdefaulttimeout(30) editedpages = 0 for page in self.generator: try: # Load the page's text from the wiki new_text = page.get() if not page.canBeEdited(): pywikibot.output(u"You can't edit page %s" % page.title(asLink=True)) continue except pywikibot.NoPage: pywikibot.output(u'Page %s not found' % page.title(asLink=True)) continue except pywikibot.IsRedirectPage: pywikibot.output(u'Page %s is a redirect' % page.title(asLink=True)) continue # for each link to change for match in linksInRef.finditer( pywikibot.removeDisabledParts(page.get())): link = match.group(u'url') #debugging purpose #print link if u'jstor.org' in link: #TODO: Clean URL blacklist continue ref = RefLink(link, match.group('name')) f = None try: socket.setdefaulttimeout(20) try: f = urllib2.urlopen(ref.url.decode("utf8")) except UnicodeError: ref.url = urllib2.quote(ref.url.encode("utf8"), "://") f = urllib2.urlopen(ref.url) #Try to get Content-Type from server headers = f.info() contentType = headers.getheader('Content-Type') if contentType and not self.MIME.search(contentType): if ref.link.lower().endswith('.pdf') and \ not self.ignorepdf: # If file has a PDF suffix self.getPDFTitle(ref, f) else: pywikibot.output( u'\03{lightyellow}WARNING\03{default} : ' u'media : %s ' % ref.link) if ref.title: if not re.match( u'(?i) *microsoft (word|excel|visio)', ref.title): ref.transform(ispdf=True) repl = ref.refTitle() else: pywikibot.output( u'\03{lightyellow}WARNING\03{default} : ' u'PDF title blacklisted : %s ' % ref.title) repl = ref.refLink() else: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue # Get the real url where we end (http redirects !) redir = f.geturl() if redir != ref.link and \ domain.findall(redir) == domain.findall(link): if soft404.search(redir) and \ not soft404.search(ref.link): pywikibot.output( u'\03{lightyellow}WARNING\03{default} : ' u'Redirect 404 : %s ' % ref.link) continue if dirIndex.match(redir) and \ not dirIndex.match(ref.link): pywikibot.output( u'\03{lightyellow}WARNING\03{default} : ' u'Redirect to root : %s ' % ref.link) continue # uncompress if necessary if headers.get('Content-Encoding') in ('gzip', 'x-gzip'): # XXX: small issue here: the whole page is downloaded # through f.read(). It might fetch big files/pages. # However, truncating an encoded gzipped stream is not # an option, for unzipping will fail. 
compressed = StringIO.StringIO(f.read()) f = gzip.GzipFile(fileobj=compressed) # Read the first 1,000,000 bytes (0.95 MB) linkedpagetext = f.read(1000000) socket.setdefaulttimeout(None) except UnicodeError: # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html # in [[fr:Cyanure]] pywikibot.output( u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.title(asLink=True))) continue except urllib2.HTTPError as e: pywikibot.output( u'HTTP error (%s) for %s on %s' % (e.code, ref.url, page.title(asLink=True)), toStdout=True) # 410 Gone, indicates that the resource has been purposely # removed if e.code == 410 or \ (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)): repl = ref.refDead() new_text = new_text.replace(match.group(), repl) continue except (urllib2.URLError, socket.error, IOError, httplib.error) as e: pywikibot.output(u'Can\'t retrieve page %s : %s' % (ref.url, e)) continue except ValueError: # Known bug of httplib, google for : # "httplib raises ValueError reading chunked content" continue finally: if f: f.close() #remove <script>/<style>/comments/CDATA tags linkedpagetext = self.NON_HTML.sub('', linkedpagetext) meta_content = self.META_CONTENT.search(linkedpagetext) enc = [] s = None if contentType: # use charset from http header s = self.CHARSET.search(contentType) if meta_content: tag = meta_content.group() # Prefer the contentType from the HTTP header : if not contentType: contentType = tag if not s: # use charset from html s = self.CHARSET.search(tag) if s: tmp = s.group('enc').strip("\"' ").lower() naked = re.sub('[ _\-]', '', tmp) # Convert to python correct encoding names if naked == "gb2312": enc.append("gbk") elif naked == "shiftjis": enc.append("shift jis 2004") enc.append("cp932") elif naked == "xeucjp": enc.append("euc-jp") else: enc.append(tmp) else: pywikibot.output(u'No charset found for %s' % ref.link) ## continue # do not process pages without charset if not contentType: pywikibot.output(u'No content-type found for %s' % ref.link) continue elif not self.MIME.search(contentType): pywikibot.output( u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link) repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue # Ugly hacks to try to survive when both server and page # return no encoding. # Uses most used encodings for each national suffix if u'.ru' in ref.link or u'.su' in ref.link: # see http://www.sci.aha.ru/ATL/ra13a.htm : no server # encoding, no page encoding enc = enc + ['koi8-r', 'windows-1251'] elif u'.jp' in ref.link: enc.append("shift jis 2004") enc.append("cp932") elif u'.kr' in ref.link: enc.append("euc-kr") enc.append("cp949") elif u'.zh' in ref.link: enc.append("gbk") u = linkedpagetext # Retrieves the first non empty string inside <title> tags for m in self.TITLE.finditer(u): t = m.group() if t: ref.title = t ref.transform() if ref.title: break if not ref.title: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) pywikibot.output(u'%s : No title found...' % ref.link) continue # XXX Ugly hack if u'é' in ref.title: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) pywikibot.output(u'%s : Hybrid encoding...' % ref.link) continue if self.titleBlackList.match(ref.title): repl = ref.refLink() new_text = new_text.replace(match.group(), repl) pywikibot.output(u'\03{lightred}WARNING\03{default} %s : ' u'Blacklisted title (%s)' % (ref.link, ref.title)) continue # Truncate long titles. 175 is arbitrary if len(ref.title) > 175: ref.title = ref.title[:175] + "..." 
repl = ref.refTitle() new_text = new_text.replace(match.group(), repl) # Add <references/> when needed, but ignore templates ! if page.namespace != 10: if self.norefbot.lacksReferences(new_text, verbose=False): new_text = self.norefbot.addReferences(new_text) new_text = self.deduplicator.process(new_text) if new_text == page.get(): pywikibot.output('No changes were necessary in %s' % page.title(asLink=True)) continue editedpages += 1 self.put_page(page, new_text) if self.limit and editedpages >= self.limit: pywikibot.output('Edited %s pages, stopping.' % self.limit) return if editedpages % 20 == 0: pywikibot.output( '\03{lightgreen}Checking stop page...\03{default}') actualRev = self.stopPage.latestRevision() if actualRev != self.stopPageRevId: pywikibot.output( u'[[%s]] has been edited : Someone wants us to stop.' % self.stopPage) return
def add_text(page=None, addText=None, summary=None, regexSkip=None, regexSkipUrl=None, always=False, up=False, putText=True, oldTextGiven=None, reorderEnabled=True, create=False): if not addText: raise NoEnoughData('You have to specify what text you want to add!') if not summary: summary = i18n.twtranslate(pywikibot.Site(), 'add_text-adding', {'adding': addText[:200]}) # When a page is tagged as "really well written" it has a star in the # interwiki links. This is a list of all the templates used (in regex # format) to make the stars appear. errorCount = 0 site = pywikibot.Site() pathWiki = site.family.nicepath(site.code) if putText: pywikibot.output(u'Loading %s...' % page.title()) if oldTextGiven is None: try: text = page.get() except pywikibot.NoPage: if create: pywikibot.output(u"%s doesn't exist, creating it!" % page.title()) text = u'' else: pywikibot.output(u"%s doesn't exist, skip!" % page.title()) return (False, False, always) except pywikibot.IsRedirectPage: pywikibot.output(u"%s is a redirect, skip!" % page.title()) return (False, False, always) else: text = oldTextGiven # Understand if the bot has to skip the page or not # In this way you can use both -except and -excepturl if regexSkipUrl is not None: url = '%s%s' % (pathWiki, page.title(asUrl=True)) result = re.findall(regexSkipUrl, site.getUrl(url)) if result != []: pywikibot.output( u'''Exception! regex (or word) used with -exceptUrl is in the page. Skip! Match was: %s''' % result) return (False, False, always) if regexSkip is not None: result = re.findall(regexSkip, text) if result != []: pywikibot.output( u'''Exception! regex (or word) used with -except is in the page. Skip! Match was: %s''' % result) return (False, False, always) # If not up, text put below if not up: newtext = text # Translating the \\n into binary \n addText = addText.replace('\\n', config.line_separator) if (reorderEnabled): # Getting the categories categoriesInside = pywikibot.getCategoryLinks(newtext, site) # Deleting the categories newtext = pywikibot.removeCategoryLinks(newtext, site) # Getting the interwiki interwikiInside = pywikibot.getLanguageLinks(newtext, site) # Removing the interwiki newtext = pywikibot.removeLanguageLinks(newtext, site) # Adding the text newtext += u"%s%s" % (config.line_separator, addText) # Reputting the categories newtext = pywikibot.replaceCategoryLinks(newtext, categoriesInside, site, True) # Dealing the stars' issue allstars = [] starstext = pywikibot.removeDisabledParts(text) for star in starsList: regex = re.compile('(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I) found = regex.findall(starstext) if found != []: newtext = regex.sub('', newtext) allstars += found if allstars != []: newtext = newtext.strip() + config.line_separator * 2 allstars.sort() for element in allstars: newtext += '%s%s' % (element.strip(), config.LS) # Adding the interwiki newtext = pywikibot.replaceLanguageLinks(newtext, interwikiInside, site) else: newtext += u"%s%s" % (config.line_separator, addText) else: newtext = addText + config.line_separator + text if putText and text != newtext: pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) pywikibot.showDiff(text, newtext) # Let's put the changes. 
while True: # If someone load it as module, maybe it's not so useful to put the # text in the page if putText: if not always: choice = pywikibot.inputChoice( u'Do you want to accept these changes?', ['Yes', 'No', 'All', 'open in Browser'], ['y', 'n', 'a', 'b'], 'n') if choice == 'a': always = True elif choice == 'n': return (False, False, always) elif choice == 'b': webbrowser.open("http://%s%s" % ( page.site.hostname(), page.site.nice_get_address(page.title()) )) pywikibot.input("Press Enter when finished in browser.") if always or choice == 'y': try: if always: page.put(newtext, summary, minorEdit=page.namespace() != 3) else: page.put_async(newtext, summary, minorEdit=page.namespace() != 3) except pywikibot.EditConflict: pywikibot.output(u'Edit conflict! skip!') return (False, False, always) except pywikibot.ServerError: errorCount += 1 if errorCount < config.max_retries: pywikibot.output(u'Server Error! Wait..') time.sleep(config.retry_wait) continue else: raise pywikibot.ServerError(u'Fifth Server Error!') except pywikibot.SpamfilterError as e: pywikibot.output( u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) return (False, False, always) except pywikibot.PageNotSaved as error: pywikibot.output(u'Error putting page: %s' % error.args) return (False, False, always) except pywikibot.LockedPage: pywikibot.output(u'Skipping %s (locked page)' % page.title()) return (False, False, always) else: # Break only if the errors are one after the other... errorCount = 0 return (True, True, always) else: return (text, newtext, always)
def add_text(page=None, addText=None, summary=None, regexSkip=None, regexSkipUrl=None, always=False, up=False, putText=True, oldTextGiven=None, reorderEnabled=True, create=False): # When a page is tagged as "really well written" it has a star in the # interwiki links. This is a list of all the templates used (in regex # format) to make the stars appear. starsList = [ u'bueno', u'bom interwiki', u'cyswllt[ _]erthygl[ _]ddethol', u'dolen[ _]ed', u'destacado', u'destaca[tu]', u'enllaç[ _]ad', u'enllaz[ _]ad', u'leam[ _]vdc', u'legătură[ _]a[bcf]', u'liamm[ _]pub', u'lien[ _]adq', u'lien[ _]ba', u'liên[ _]kết[ _]bài[ _]chất[ _]lượng[ _]tốt', u'liên[ _]kết[ _]chọn[ _]lọc', u'ligam[ _]adq', u'ligoelstara', u'ligoleginda', u'link[ _][afgu]a', u'link[ _]adq', u'link[ _]f[lm]', u'link[ _]km', u'link[ _]sm', u'linkfa', u'na[ _]lotura', u'nasc[ _]ar', u'tengill[ _][úg]g', u'ua', u'yüm yg', u'רא', u'وصلة مقالة جيدة', u'وصلة مقالة مختارة', ] errorCount = 0 site = pywikibot.getSite() pathWiki = site.family.nicepath(site.lang) site = pywikibot.getSite() if oldTextGiven is None: try: text = page.get() except pywikibot.NoPage: if create: pywikibot.output(u"%s doesn't exist, creating it!" % page.title()) text = u'' else: pywikibot.output(u"%s doesn't exist, skip!" % page.title()) return (False, False, always) except pywikibot.IsRedirectPage: pywikibot.output(u"%s is a redirect, skip!" % page.title()) return (False, False, always) else: text = oldTextGiven # If not up, text put below if not up: newtext = text # Translating the \\n into binary \n addText = addText.replace('\\n', '\n') if (reorderEnabled): # Getting the categories categoriesInside = pywikibot.getCategoryLinks(newtext, site) # Deleting the categories newtext = pywikibot.removeCategoryLinks(newtext, site) # Getting the interwiki interwikiInside = pywikibot.getLanguageLinks(newtext, site) # Removing the interwiki newtext = pywikibot.removeLanguageLinks(newtext, site) # Adding the text newtext += u"\n%s" % addText # Reputting the categories newtext = pywikibot.replaceCategoryLinks(newtext, categoriesInside, site, True) # Dealing the stars' issue allstars = [] starstext = pywikibot.removeDisabledParts(text) for star in starsList: regex = re.compile('(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I) found = regex.findall(starstext) if found != []: newtext = regex.sub('', newtext) allstars += found if allstars != []: newtext = newtext.strip() + '\r\n\r\n' allstars.sort() for element in allstars: newtext += '%s\r\n' % element.strip() # Adding the interwiki newtext = pywikibot.replaceLanguageLinks(newtext, interwikiInside, site) else: newtext += u"\n%s" % addText else: newtext = addText + '\n' + text if putText and text != newtext: pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) #pywikibot.showDiff(text, newtext) # Let's put the changes. while True: # If someone load it as module, maybe it's not so useful to put the # text in the page if putText: if always or choice == 'y': try: pass if always: page.put(newtext, summary, minorEdit=False) else: page.put_async(newtext, summary, minorEdit=False) except pywikibot.EditConflict: pywikibot.output(u'Edit conflict! skip!') return (False, False, always) except pywikibot.ServerError: errorCount += 1 if errorCount < 5: pywikibot.output(u'Server Error! 
Wait..') time.sleep(5) continue else: raise pywikibot.ServerError(u'Fifth Server Error!') except pywikibot.SpamfilterError, e: pywikibot.output( u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) return (False, False, always) except pywikibot.PageNotSaved, error: pywikibot.output(u'Error putting page: %s' % error.args) return (False, False, always) except pywikibot.LockedPage: pywikibot.output(u'Skipping %s (locked page)' % page.title()) return (False, False, always) else: # Break only if the errors are one after the other... errorCount = 0 return (True, True, always)
def standardizePageFooter(self, text): """ Makes sure that interwiki links, categories and star templates are put to the correct position and into the right order. This combines the old instances standardizeInterwiki and standardizeCategories The page footer has the following section in that sequence: 1. categories 2. ## TODO: template beyond categories ## 3. additional information depending on local site policy 4. stars templates for featured and good articles 5. interwiki links """ starsList = [ u'bueno', u'bom interwiki', u'cyswllt[ _]erthygl[ _]ddethol', u'dolen[ _]ed', u'destacado', u'destaca[tu]', u'enllaç[ _]ad', u'enllaz[ _]ad', u'leam[ _]vdc', u'legătură[ _]a[bcf]', u'liamm[ _]pub', u'lien[ _]adq', u'lien[ _]ba', u'liên[ _]kết[ _]bài[ _]chất[ _]lượng[ _]tốt', u'liên[ _]kết[ _]chọn[ _]lọc', u'ligam[ _]adq', u'ligoelstara', u'ligoleginda', u'link[ _][afgu]a', u'link[ _]adq', u'link[ _]f[lm]', u'link[ _]km', u'link[ _]sm', u'linkfa', u'na[ _]lotura', u'nasc[ _]ar', u'tengill[ _][úg]g', u'ua', u'yüm yg', u'רא', u'وصلة مقالة جيدة', u'وصلة مقالة مختارة', ] categories = None interwikiLinks = None allstars = [] # The PyWikipediaBot is no longer allowed to touch categories on the # German Wikipedia. See # http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/1#Position_der_Personendaten_am_.22Artikelende.22 # ignoring nn-wiki of cause of the comment line above iw section if not self.template and not '{{Personendaten' in text and \ not '{{SORTIERUNG' in text and not '{{DEFAULTSORT' in text and \ not self.site.lang in ('et', 'it', 'bg', 'ru'): categories = pywikibot.getCategoryLinks(text, site=self.site) if not self.talkpage: # and pywikibot.calledModuleName() <> 'interwiki': subpage = False if self.template: loc = None try: tmpl, loc = moved_links[self.site.lang] del tmpl except KeyError: pass if loc is not None and loc in self.title: subpage = True interwikiLinks = pywikibot.getLanguageLinks( text, insite=self.site, template_subpage=subpage) # Removing the interwiki text = pywikibot.removeLanguageLinks(text, site=self.site) # Removing the stars' issue starstext = pywikibot.removeDisabledParts(text) for star in starsList: regex = re.compile('(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I) found = regex.findall(starstext) if found != []: text = regex.sub('', text) allstars += found # Adding categories if categories: ##Sorting categories in alphabetic order. beta test only on Persian Wikipedia, TODO fix bug for sorting #if self.site.language() == 'fa': # categories.sort() ##Taking main cats to top # for name in categories: # if re.search(u"(.+?)\|(.{,1}?)",name.title()) or name.title()==name.title().split(":")[0]+title: # categories.remove(name) # categories.insert(0, name) text = pywikibot.replaceCategoryLinks(text, categories, site=self.site) # Adding stars templates if allstars: text = text.strip() + self.site.family.interwiki_text_separator allstars.sort() for element in allstars: text += '%s%s' % (element.strip(), config.line_separator) pywikibot.log(u'%s' %element.strip()) # Adding the interwiki if interwikiLinks: text = pywikibot.replaceLanguageLinks(text, interwikiLinks, site=self.site, template=self.template, template_subpage=subpage) return text
import pywikibot
import mwparserfromhell

site = pywikibot.Site()
category = pywikibot.Category(site, 'Category:Amusement parks')
COUNT = 0
ALL = []
for page in category.articles(recurse=True, namespaces=[0], content=True):
    if page.isRedirectPage():
        continue
    print page
    text = page.get()
    code = mwparserfromhell.parse(text)
    has = False
    for template in code.filter_templates(recursive=True):
        name = pywikibot.removeDisabledParts(unicode(template.name)).lower().strip()
        if name.startswith('infobox'):
            has = True
            break
    if not has:
        if page.title() in ALL:
            print 'Skipping duplicate of ' + page.title()
            continue
        ALL.append(page.title())
        COUNT += 1
        print '+1'
    if COUNT >= 500:
        break

TEXT = ''
for item in ALL:
def standardizePageFooter(self, text): """ Makes sure that interwiki links, categories and star templates are put to the correct position and into the right order. This combines the old instances standardizeInterwiki and standardizeCategories The page footer has the following section in that sequence: 1. categories 2. ## TODO: template beyond categories ## 3. additional information depending on local site policy 4. stars templates for featured and good articles 5. interwiki links """ starsList = [ u"bueno", u"bom interwiki", u"cyswllt[ _]erthygl[ _]ddethol", u"dolen[ _]ed", u"destacado", u"destaca[tu]", u"enllaç[ _]ad", u"enllaz[ _]ad", u"leam[ _]vdc", u"legătură[ _]a[bcf]", u"liamm[ _]pub", u"lien[ _]adq", u"lien[ _]ba", u"liên[ _]kết[ _]bài[ _]chất[ _]lượng[ _]tốt", u"liên[ _]kết[ _]chọn[ _]lọc", u"ligam[ _]adq", u"ligoelstara", u"ligoleginda", u"link[ _][afgu]a", u"link[ _]adq", u"link[ _]f[lm]", u"link[ _]km", u"link[ _]sm", u"linkfa", u"na[ _]lotura", u"nasc[ _]ar", u"tengill[ _][úg]g", u"ua", u"yüm yg", u"רא", u"وصلة مقالة جيدة", u"وصلة مقالة مختارة", ] categories = None interwikiLinks = None allstars = [] hasCommentLine = False # The PyWikipediaBot is no longer allowed to touch categories on the # German Wikipedia. See # http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/1#Position_der_Personendaten_am_.22Artikelende.22 # ignoring nn-wiki of cause of the comment line above iw section if ( not self.template and not "{{Personendaten" in text and not "{{SORTIERUNG" in text and not "{{DEFAULTSORT" in text and not self.site.lang in ("et", "it", "bg", "ru") ): categories = pywikibot.getCategoryLinks(text, site=self.site) if not self.talkpage: # and pywikibot.calledModuleName() <> 'interwiki': subpage = False if self.template: loc = None try: tmpl, loc = moved_links[self.site.lang] del tmpl except KeyError: pass if loc != None and loc in self.title: subpage = True interwikiLinks = pywikibot.getLanguageLinks(text, insite=self.site, template_subpage=subpage) # Removing the interwiki text = pywikibot.removeLanguageLinks(text, site=self.site) # Removing the stars' issue starstext = pywikibot.removeDisabledParts(text) for star in starsList: regex = re.compile("(\{\{(?:template:|)%s\|.*?\}\}[\s]*)" % star, re.I) found = regex.findall(starstext) if found != []: text = regex.sub("", text) allstars += found # nn got a message between the categories and the iw's # and they want to keep it there, first remove it if self.site.lang in msg_interwiki: iw_msg = msg_interwiki[self.site.lang] if isinstance(iw_msg, tuple): iw_reg = iw_msg[1] iw_msg = iw_msg[0] else: iw_reg = u"(%s)" % iw_msg regex = re.compile(iw_reg) found = regex.findall(text) if found: hasCommentLine = True text = regex.sub("", text) # Adding categories if categories: ##Sorting categories in alphabetic order. 
beta test only on Persian Wikipedia, TODO fix bug for sorting # if self.site.language() == 'fa': # categories.sort() ##Taking main cats to top # for name in categories: # if re.search(u"(.+?)\|(.{,1}?)",name.title()) or name.title()==name.title().split(":")[0]+title: # categories.remove(name) # categories.insert(0, name) text = pywikibot.replaceCategoryLinks(text, categories, site=self.site) # Put the iw message back if not self.talkpage and ( (interwikiLinks or hasCommentLine) and self.site.language() == "nn" or (interwikiLinks and hasCommentLine) and self.site.language() == "fr" ): text += config.line_separator * 2 + iw_msg # Adding stars templates if allstars: text = text.strip() + self.site.family.interwiki_text_separator allstars.sort() for element in allstars: text += "%s%s" % (element.strip(), config.line_separator) pywikibot.log(u"%s" % element.strip()) # Adding the interwiki if interwikiLinks: text = pywikibot.replaceLanguageLinks( text, interwikiLinks, site=self.site, template=self.template, template_subpage=subpage ) return text
def add_text(page=None, addText=None, summary=None, regexSkip=None, regexSkipUrl=None, always=False, up=False, putText=True, oldTextGiven=None, reorderEnabled=True, create=False): if not addText: raise NoEnoughData('You have to specify what text you want to add!') site = page.site if not summary: summary = i18n.twtranslate(site, 'add_text-adding', {'adding': addText[:200]}) # When a page is tagged as "really well written" it has a star in the # interwiki links. This is a list of all the templates used (in regex # format) to make the stars appear. errorCount = 0 if putText: pywikibot.output(u'Loading %s...' % page.title()) if oldTextGiven is None: try: text = page.get() except pywikibot.NoPage: if create: pywikibot.output(u"%s doesn't exist, creating it!" % page.title()) text = u'' else: pywikibot.output(u"%s doesn't exist, skip!" % page.title()) return (False, False, always) except pywikibot.IsRedirectPage: pywikibot.output(u"%s is a redirect, skip!" % page.title()) return (False, False, always) else: text = oldTextGiven # Understand if the bot has to skip the page or not # In this way you can use both -except and -excepturl if regexSkipUrl is not None: url = site.nice_get_address(page.title(asUrl=True)) result = re.findall(regexSkipUrl, site.getUrl(url)) if result != []: pywikibot.output( u'''Exception! regex (or word) used with -exceptUrl is in the page. Skip! Match was: %s''' % result) return (False, False, always) if regexSkip is not None: result = re.findall(regexSkip, text) if result != []: pywikibot.output( u'''Exception! regex (or word) used with -except is in the page. Skip! Match was: %s''' % result) return (False, False, always) # If not up, text put below if not up: newtext = text # Translating the \\n into binary \n addText = addText.replace('\\n', config.line_separator) if (reorderEnabled): # Getting the categories categoriesInside = pywikibot.getCategoryLinks(newtext, site) # Deleting the categories newtext = pywikibot.removeCategoryLinks(newtext, site) # Getting the interwiki interwikiInside = pywikibot.getLanguageLinks(newtext, site) # Removing the interwiki newtext = pywikibot.removeLanguageLinks(newtext, site) # Adding the text newtext += u"%s%s" % (config.line_separator, addText) # Reputting the categories newtext = pywikibot.replaceCategoryLinks(newtext, categoriesInside, site, True) # Dealing the stars' issue allstars = [] starstext = pywikibot.removeDisabledParts(text) for star in starsList: regex = re.compile( '(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I) found = regex.findall(starstext) if found != []: newtext = regex.sub('', newtext) allstars += found if allstars != []: newtext = newtext.strip() + config.line_separator * 2 allstars.sort() for element in allstars: newtext += '%s%s' % (element.strip(), config.LS) # Adding the interwiki newtext = pywikibot.replaceLanguageLinks(newtext, interwikiInside, site) else: newtext += u"%s%s" % (config.line_separator, addText) else: newtext = addText + config.line_separator + text if putText and text != newtext: pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) pywikibot.showDiff(text, newtext) # Let's put the changes. 
while True: # If someone load it as module, maybe it's not so useful to put the # text in the page if putText: if not always: choice = pywikibot.inputChoice( u'Do you want to accept these changes?', ['Yes', 'No', 'All', 'open in Browser'], ['y', 'n', 'a', 'b'], 'n') if choice == 'a': always = True elif choice == 'n': return (False, False, always) elif choice == 'b': webbrowser.open( "http://%s%s" % (site.hostname(), site.nice_get_address(page.title(asUrl=True)))) pywikibot.input("Press Enter when finished in browser.") if always or choice == 'y': try: if always: page.put(newtext, summary, minorEdit=page.namespace() != 3) else: page.put_async(newtext, summary, minorEdit=page.namespace() != 3) except pywikibot.EditConflict: pywikibot.output(u'Edit conflict! skip!') return (False, False, always) except pywikibot.ServerError: errorCount += 1 if errorCount < config.max_retries: pywikibot.output(u'Server Error! Wait..') time.sleep(config.retry_wait) continue else: raise pywikibot.ServerError(u'Fifth Server Error!') except pywikibot.SpamfilterError as e: pywikibot.output( u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) return (False, False, always) except pywikibot.PageNotSaved as error: pywikibot.output(u'Error putting page: %s' % error.args) return (False, False, always) except pywikibot.LockedPage: pywikibot.output(u'Skipping %s (locked page)' % page.title()) return (False, False, always) else: # Break only if the errors are one after the other... errorCount = 0 return (True, True, always) else: return (text, newtext, always)
def run(self):
    """ Run the Bot """
    try:
        deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
    except IOError:
        pywikibot.output(
            'You need to download '
            'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
            'and to ungzip it in the same directory')
        raise
    socket.setdefaulttimeout(30)
    editedpages = 0
    for page in self.generator:
        try:
            # Load the page's text from the wiki
            new_text = page.get()
            if not page.canBeEdited():
                pywikibot.output(u"You can't edit page %s"
                                 % page.title(asLink=True))
                continue
        except pywikibot.NoPage:
            pywikibot.output(u'Page %s not found' % page.title(asLink=True))
            continue
        except pywikibot.IsRedirectPage:
            pywikibot.output(u'Page %s is a redirect'
                             % page.title(asLink=True))
            continue

        # for each link to change
        for match in linksInRef.finditer(
                pywikibot.removeDisabledParts(page.get())):

            link = match.group(u'url')
            # debugging purpose
            # print link
            if u'jstor.org' in link:
                # TODO: Clean URL blacklist
                continue

            ref = RefLink(link, match.group('name'))
            f = None
            try:
                socket.setdefaulttimeout(20)
                try:
                    f = urllib2.urlopen(ref.url.decode("utf8"))
                except UnicodeError:
                    ref.url = urllib2.quote(ref.url.encode("utf8"), "://")
                    f = urllib2.urlopen(ref.url)
                # Try to get Content-Type from server
                headers = f.info()
                contentType = headers.getheader('Content-Type')
                if contentType and not self.MIME.search(contentType):
                    if ref.link.lower().endswith('.pdf') and \
                       not self.getOption('ignorepdf'):
                        # If file has a PDF suffix
                        self.getPDFTitle(ref, f)
                    else:
                        pywikibot.output(
                            u'\03{lightyellow}WARNING\03{default} : '
                            u'media : %s ' % ref.link)
                    if ref.title:
                        if not re.match(
                                u'(?i) *microsoft (word|excel|visio)',
                                ref.title):
                            ref.transform(ispdf=True)
                            repl = ref.refTitle()
                        else:
                            pywikibot.output(
                                u'\03{lightyellow}WARNING\03{default} : '
                                u'PDF title blacklisted : %s ' % ref.title)
                            repl = ref.refLink()
                    else:
                        repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue
                # Get the real url where we end (http redirects !)
                redir = f.geturl()
                if redir != ref.link and \
                   domain.findall(redir) == domain.findall(link):
                    if soft404.search(redir) and \
                       not soft404.search(ref.link):
                        pywikibot.output(
                            u'\03{lightyellow}WARNING\03{default} : '
                            u'Redirect 404 : %s ' % ref.link)
                        continue
                    if dirIndex.match(redir) and \
                       not dirIndex.match(ref.link):
                        pywikibot.output(
                            u'\03{lightyellow}WARNING\03{default} : '
                            u'Redirect to root : %s ' % ref.link)
                        continue

                # uncompress if necessary
                if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
                    # XXX: small issue here: the whole page is downloaded
                    # through f.read(). It might fetch big files/pages.
                    # However, truncating an encoded gzipped stream is not
                    # an option, for unzipping will fail.
                    compressed = StringIO.StringIO(f.read())
                    f = gzip.GzipFile(fileobj=compressed)

                # Read the first 1,000,000 bytes (0.95 MB)
                linkedpagetext = f.read(1000000)
                socket.setdefaulttimeout(None)

            except UnicodeError:
                # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                # in [[fr:Cyanure]]
                pywikibot.output(
                    u'\03{lightred}Bad link\03{default} : %s in %s'
                    % (ref.url, page.title(asLink=True)))
                continue
            except urllib2.HTTPError as e:
                pywikibot.output(u'HTTP error (%s) for %s on %s'
                                 % (e.code, ref.url,
                                    page.title(asLink=True)),
                                 toStdout=True)
                # 410 Gone, indicates that the resource has been purposely
                # removed
                if e.code == 410 or \
                   (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
                    repl = ref.refDead()
                    new_text = new_text.replace(match.group(), repl)
                continue
            except (urllib2.URLError,
                    socket.error,
                    IOError,
                    httplib.error) as e:
                pywikibot.output(u'Can\'t retrieve page %s : %s'
                                 % (ref.url, e))
                continue
            except ValueError:
                # Known bug of httplib, google for :
                # "httplib raises ValueError reading chunked content"
                continue
            finally:
                if f:
                    f.close()

            # remove <script>/<style>/comments/CDATA tags
            linkedpagetext = self.NON_HTML.sub('', linkedpagetext)

            meta_content = self.META_CONTENT.search(linkedpagetext)
            enc = []
            s = None
            if contentType:
                # use charset from http header
                s = self.CHARSET.search(contentType)
            if meta_content:
                tag = meta_content.group()
                # Prefer the contentType from the HTTP header :
                if not contentType:
                    contentType = tag
                if not s:
                    # use charset from html
                    s = self.CHARSET.search(tag)
            if s:
                tmp = s.group('enc').strip("\"' ").lower()
                naked = re.sub('[ _\-]', '', tmp)
                # Convert to python correct encoding names
                if naked == "gb2312":
                    enc.append("gbk")
                elif naked == "shiftjis":
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif naked == "xeucjp":
                    enc.append("euc-jp")
                else:
                    enc.append(tmp)
            else:
                pywikibot.output(u'No charset found for %s' % ref.link)
                ## continue  # do not process pages without charset
            if not contentType:
                pywikibot.output(u'No content-type found for %s' % ref.link)
                continue
            elif not self.MIME.search(contentType):
                pywikibot.output(
                    u'\03{lightyellow}WARNING\03{default} : media : %s '
                    % ref.link)
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                continue

            # Ugly hacks to try to survive when both server and page
            # return no encoding.
            # Uses most used encodings for each national suffix
            if u'.ru' in ref.link or u'.su' in ref.link:
                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                # encoding, no page encoding
                enc = enc + ['koi8-r', 'windows-1251']
            elif u'.jp' in ref.link:
                enc.append("shift jis 2004")
                enc.append("cp932")
            elif u'.kr' in ref.link:
                enc.append("euc-kr")
                enc.append("cp949")
            elif u'.zh' in ref.link:
                enc.append("gbk")

            u = linkedpagetext
            # Retrieves the first non empty string inside <title> tags
            for m in self.TITLE.finditer(u):
                t = m.group()
                if t:
                    ref.title = t
                    ref.transform()
                    if ref.title:
                        break

            if not ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : No title found...' % ref.link)
                continue

            # XXX Ugly hack
            if u'é' in ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                continue

            if self.titleBlackList.match(ref.title):
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'\03{lightred}WARNING\03{default} %s : '
                                 u'Blacklisted title (%s)'
                                 % (ref.link, ref.title))
                continue

            # Truncate long titles. 175 is arbitrary
            if len(ref.title) > 175:
                ref.title = ref.title[:175] + "..."

            repl = ref.refTitle()
            new_text = new_text.replace(match.group(), repl)

        # Add <references/> when needed, but ignore templates !
        if page.namespace != 10:
            if self.norefbot.lacksReferences(new_text):
                new_text = self.norefbot.addReferences(new_text)

        new_text = self.deduplicator.process(new_text)

        try:
            self.userPut(page, page.text, new_text, comment=self.msg)
        except pywikibot.EditConflict:
            pywikibot.output(u'Skipping %s because of edit conflict'
                             % page.title())
        except pywikibot.SpamfilterError as e:
            pywikibot.output(
                u'Cannot change %s because of blacklist entry %s'
                % (page.title(), e.url))
        except pywikibot.PageNotSaved as error:
            pywikibot.error(u'putting page: %s' % (error.args,))
        except pywikibot.LockedPage:
            pywikibot.output(u'Skipping %s (locked page)' % page.title())
        except pywikibot.ServerError as e:
            pywikibot.output(u'Server Error : %s' % e)

        if new_text == page.text:
            continue
        else:
            editedpages += 1

        if self.getOption('limit') and editedpages >= self.getOption('limit'):
            pywikibot.output('Edited %s pages, stopping.'
                             % self.getOption('limit'))
            return

        if editedpages % 20 == 0:
            pywikibot.output(
                '\03{lightgreen}Checking stop page...\03{default}')
            actualRev = self.stopPage.latestRevision()
            if actualRev != self.stopPageRevId:
                pywikibot.output(
                    u'[[%s]] has been edited : Someone wants us to stop.'
                    % self.stopPage)
                return
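The charset handling in run() accumulates candidate encodings (from the HTTP header, the HTML meta tag, and per-TLD guesses) but never decodes in this version. A small standalone sketch of that fallback idea, with a hypothetical helper name and made-up sample bytes:

# -*- coding: utf-8 -*-
# Standalone sketch (hypothetical helper, not part of the bot) of the
# charset-fallback strategy behind the `enc` list in run(): candidate
# encodings are tried in order and the first successful decode wins.
def decode_with_fallback(raw_bytes, candidates):
    for codec in candidates:
        try:
            return raw_bytes.decode(codec), codec
        except (UnicodeDecodeError, LookupError):
            continue
    # last resort: decode permissively so the page is never dropped outright
    return raw_bytes.decode('latin_1', 'replace'), 'latin_1'

# e.g. a page whose server sent no charset: ascii fails, utf-8 succeeds
text, used = decode_with_fallback(b'caf\xc3\xa9', ['ascii', 'utf-8'])
assert used == 'utf-8'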
def procesPage(self, site, page):
    """ Process a single page. """
    pywikibot.output('Processing %s' % page)
    try:
        item = pywikibot.ItemPage.fromPage(page)
    except pywikibot.exceptions.NoPage:
        pywikibot.output(u'No wikidata item for: %s ' % page)
        return
    if not item.exists():
        pywikibot.output('%s doesn\'t have a wikidata item' % page)
        # TODO FIXME: We should provide an option to create the page
    else:
        pagetext = page.get()
        pagetext = pywikibot.removeDisabledParts(pagetext)
        templates = pywikibot.extract_templates_and_params(pagetext)
        #pywikibot.output(u'Templates: %s' % templates)
        for (template, fielddict) in templates:
            # We found the template we were looking for
            linkedTemplate = pywikibot.Page(self.site, template, ns=10)
            try:
                if linkedTemplate.isRedirectPage():
                    template2 = linkedTemplate.getRedirectTarget().title()
                    pywikibot.output(
                        u'Template redirection from %s to %s'
                        % (template, template2))
                    template = template2[9:]
            except pywikibot.exceptions.InvalidTitle:
                pywikibot.output("[[%s]] contains illegal char(s)" % template)
            if template.replace(u'_', u' ') == self.templateTitle:
                #pywikibot.output(u'Template: %s' % template)
                for field, value in fielddict.items():
                    # This field contains something useful for us
                    field = field.strip()
                    #pywikibot.output(' field <%s>' % field)
                    #pywikibot.output(' self.fields %s' % (field in self.fields))
                    if (value != "") and (field in self.fields):
                        # Check if the property isn't already set
                        #pywikibot.output(' attribute %s' % field)
                        claim = self.fields[field]
                        if claim[2:-2] in item.get().get('claims'):
                            pywikibot.output(
                                u'A claim for %s already exists. Skipping'
                                % (claim,))
                            # TODO FIXME: This is a very crude way of dupe
                            # checking
                        else:
                            # Try to extract a valid page
                            match = re.search(re.compile(
                                r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]'),
                                value)
                            #pywikibot.output(u' searching %s ' % value)
                            if True:
                                try:
                                    value = value.strip()
                                    # Date treatment
                                    if claim == "[[P585]]" and value != "":
                                        try:
                                            pywikibot.output(u' Date: <%s> ' % value)
                                            laDate = parser.parse(value)
                                            pywikibot.output(u' Date: <%s> ' % laDate)
                                            repo = site.data_repository()  # could self.repo be used here instead?
                                            theClaim = pywikibot.Claim(repo, claim[2:-2])
                                            # pywikibot.output(u' Year: %s, Month: %s, Day: %s ' % laDateText[0:3], laDateText[5:6], laDateText[7:8])
                                            pywikibot.output('Adding %s --> %s' % (claim, laDate))
                                            laDate = pywikibot.WbTime(year=laDate.year,
                                                                      month=laDate.month,
                                                                      day=laDate.day)
                                            theClaim.setTarget(laDate)
                                            item.addClaim(theClaim)
                                            if self.source:
                                                theClaim.addSource(self.source, bot=True)
                                        except ValueError:
                                            pywikibot.output(u' Impossible to parse this date : %s ' % value)
                                            continue
                                        continue
                                    if value[:2] == "[[" and value[-2:] == "]]":
                                        link = value[2:-2]
                                    else:
                                        link = value
                                    #pywikibot.output(u' link: <%s> ' % link)
                                    if link == "":
                                        continue
                                    #link = match.group(1)
                                    linkedPage = pywikibot.Page(self.site, link)
                                    if linkedPage.isRedirectPage():
                                        linkedPage = linkedPage.getRedirectTarget()
                                    #pywikibot.output(u' linkedPage %s ' % linkedPage)
                                    linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                                    linkedItem.get()
                                    if not linkedItem.exists():
                                        pywikibot.output(
                                            '%s doesn\'t have a wikidata item'
                                            % linkedPage)
                                        continue
                                    # Value constraints treatment
                                    if (claim in self.valuesConstraints) and \
                                       (linkedItem.getID() not in self.valuesConstraints[claim]):
                                        pywikibot.output(
                                            u'The value of property %s is %s, '
                                            u'which does not respect the constraint %s'
                                            % (claim, linkedItem.title(),
                                               self.valuesConstraints[claim]))
                                        continue
                                    # "Instance of" constraint treatment
                                    if claim == "[[P541]]":
                                        linkedItem.get()  # you need to call it to access any data.
                                        if linkedItem.claims and ('P31' in linkedItem.claims):
                                            if linkedItem.claims['P31'][0].getTarget().title(withNamespace=False) != "Q4164871":
                                                pywikibot.output(
                                                    u'The P31 value is not Q4164871 but %s '
                                                    % linkedItem.claims['P31'][0].getTarget().title(withNamespace=True))
                                                continue
                                        else:
                                            pywikibot.output(u'The P31 value is missing ')
                                            continue
                                    #pywikibot.output(u' linkedItem %s ' % linkedItem)
                                    #pywikibot.output(u' linkedItem.getID() %s ' % linkedItem.title()[1:])
                                    pywikibot.output('Adding %s --> %s'
                                                     % (claim, linkedItem.getID()))
                                    repo = site.data_repository()  # could self.repo be used here instead?
                                    theClaim = pywikibot.Claim(repo, claim[2:-2])
                                    theClaim.setTarget(linkedItem)
                                    item.addClaim(theClaim)
                                    if self.source:
                                        theClaim.addSource(self.source, bot=True)
                                except pywikibot.NoPage:
                                    pywikibot.output(
                                        "[[%s]] doesn't exist so I can't link to it"
                                        % linkedPage)
                                except pywikibot.exceptions.InvalidTitle:
                                    pywikibot.output(
                                        "[[%s]] is an invalid title" % link)
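The P585 branch above feeds parser.parse() output into pywikibot.WbTime. A minimal standalone sketch of that conversion, assuming parser is python-dateutil's parser (as the call signature suggests) and leaving pywikibot out so the snippet runs on its own:

# -*- coding: utf-8 -*-
# Sketch of the date path in procesPage (P585): parse a free-text date with
# python-dateutil and keep only the year/month/day triple that would feed
# pywikibot.WbTime(year=..., month=..., day=...).  Assumes python-dateutil
# is installed; the helper name is hypothetical.
from dateutil import parser

def to_wbtime_args(value):
    try:
        la_date = parser.parse(value.strip())
    except ValueError:
        return None  # mirrors the "Impossible to parse this date" branch
    return {'year': la_date.year, 'month': la_date.month, 'day': la_date.day}

assert to_wbtime_args('12 March 1994') == {'year': 1994, 'month': 3, 'day': 12}
assert to_wbtime_args('not a date') is None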
def standardizePageFooter(self, text):
    """
    Makes sure that interwiki links, categories and star templates are
    put to the correct position and into the right order. This combines
    the old instances standardizeInterwiki and standardizeCategories.

    The page footer has the following sections in this sequence:
    1. categories
    2. ## TODO: template beyond categories ##
    3. additional information depending on local site policy
    4. stars templates for featured and good articles
    5. interwiki links
    """
    starsList = [
        u'bueno',
        u'bom interwiki',
        u'cyswllt[ _]erthygl[ _]ddethol', u'dolen[ _]ed',
        u'destacado', u'destaca[tu]',
        u'enllaç[ _]ad',
        u'enllaz[ _]ad',
        u'leam[ _]vdc',
        u'legătură[ _]a[bcf]',
        u'liamm[ _]pub',
        u'lien[ _]adq',
        u'lien[ _]ba',
        u'liên[ _]kết[ _]bài[ _]chất[ _]lượng[ _]tốt',
        u'liên[ _]kết[ _]chọn[ _]lọc',
        u'ligam[ _]adq',
        u'ligazón[ _]a[bd]',
        u'ligoelstara',
        u'ligoleginda',
        u'link[ _][afgu]a', u'link[ _]adq', u'link[ _]f[lm]',
        u'link[ _]km', u'link[ _]sm', u'linkfa',
        u'na[ _]lotura',
        u'nasc[ _]ar',
        u'tengill[ _][úg]g',
        u'ua',
        u'yüm yg',
        u'רא',
        u'وصلة مقالة جيدة',
        u'وصلة مقالة مختارة',
    ]

    categories = None
    interwikiLinks = None
    allstars = []

    # The PyWikipediaBot is no longer allowed to touch categories on the
    # German Wikipedia. See
    # https://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/1#Position_der_Personendaten_am_.22Artikelende.22
    # nn-wiki is ignored because of the comment line above the iw section
    if not self.template and '{{Personendaten' not in text and \
       '{{SORTIERUNG' not in text and '{{DEFAULTSORT' not in text and \
       self.site.lang not in ('et', 'it', 'bg', 'ru'):
        categories = pywikibot.getCategoryLinks(text, site=self.site)

    if not self.talkpage:  # and pywikibot.calledModuleName() <> 'interwiki':
        subpage = False
        if self.template:
            loc = None
            try:
                tmpl, loc = moved_links[self.site.lang]
                del tmpl
            except KeyError:
                pass
            if loc is not None and loc in self.title:
                subpage = True
        interwikiLinks = pywikibot.getLanguageLinks(
            text, insite=self.site, template_subpage=subpage)

        # Removing the interwiki
        text = pywikibot.removeLanguageLinks(text, site=self.site)
        # Removing the stars templates
        starstext = pywikibot.removeDisabledParts(text)
        for star in starsList:
            regex = re.compile('(\{\{(?:template:|)%s\|.*?\}\}[\s]*)'
                               % star, re.I)
            found = regex.findall(starstext)
            if found != []:
                text = regex.sub('', text)
                allstars += found

    # Adding categories
    if categories:
        ## Sorting categories in alphabetic order. Beta test only on the
        ## Persian Wikipedia; TODO: fix the sorting bug.
        #if self.site.language() == 'fa':
        #    categories.sort()
        ## Taking main cats to top
        #    for name in categories:
        #        if re.search(u"(.+?)\|(.{,1}?)", name.title()) or name.title() == name.title().split(":")[0] + title:
        #            categories.remove(name)
        #            categories.insert(0, name)
        text = pywikibot.replaceCategoryLinks(text, categories,
                                              site=self.site)
    # Adding stars templates
    if allstars:
        text = text.strip() + self.site.family.interwiki_text_separator
        allstars.sort()
        for element in allstars:
            text += '%s%s' % (element.strip(), config.line_separator)
            pywikibot.log(u'%s' % element.strip())
    # Adding the interwiki
    if interwikiLinks:
        text = pywikibot.replaceLanguageLinks(text, interwikiLinks,
                                              site=self.site,
                                              template=self.template,
                                              template_subpage=subpage)
    return text
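To see what the star-template pass does in isolation, here is a self-contained illustration using one literal entry from starsList and made-up sample wikitext; the surrounding class is not needed:

# -*- coding: utf-8 -*-
# Standalone illustration of the star-template extraction in
# standardizePageFooter: the same regex shape pulls {{linkfa|...}}-style
# templates out of sample wikitext so they can be re-appended later in a
# canonical order.  The sample text is made up for this example.
import re

sample = (u"Some article text.\n"
          u"{{Linkfa|de}}\n"
          u"[[Category:Example]]\n"
          u"{{linkfa|fr}}\n")
star = u'linkfa'  # one entry of starsList
regex = re.compile(r'(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I)

found = regex.findall(sample)   # both star templates, trailing whitespace kept
cleaned = regex.sub('', sample)  # the text with the star templates stripped

assert len(found) == 2
assert u'{{Linkfa|de}}' not in cleaned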