def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
    text = textlib.removeDisabledParts(text)

    # MediaWiki parses templates before parsing external links. Thus, there
    # might be a | or a } directly after a URL which does not belong to
    # the URL itself.
    # First, remove the curly braces of inner templates:
    nestedTemplateR = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    while nestedTemplateR.search(text):
        text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)

    # Then blow up the templates with spaces so that the | and }} will not
    # be regarded as part of the link:
    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                     re.DOTALL)
    while templateWithParamsR.search(text):
        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)

    # Add <blank> at the end of a template
    # URL as last param of multiline template would not be correct
    text = text.replace('}}', ' }}')

    # Remove HTML comments in URLs as well as URLs in HTML comments.
    # Also remove text inside nowiki links etc.
    text = textlib.removeDisabledParts(text)
    linkR = textlib.compileLinkR(withoutBracketed, onlyBracketed)
    for m in linkR.finditer(text):
        if m.group('url'):
            yield m.group('url')
        else:
            yield m.group('urlb')
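# Usage sketch (illustrative, not part of the original script): assuming the
# surrounding module already imports re, pywikibot and pywikibot.textlib as
# the snippet above does, the generator can be driven with any wikitext
# string; the sample text below is made up.
sample_wikitext = ('See [http://example.org/docs the docs] and '
                   '{{cite web|url=http://example.com/page|title=Example}}.')
for found_url in weblinksIn(sample_wikitext):
    print(found_url)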
def loadTypos(self):
    pywikibot.output('Loading typo rules')
    self.typoRules = []

    if self.typos_page_name is None:
        self.typos_page_name = 'Wikipedie:WPCleaner/Typo'
    typos_page = pywikibot.Page(self.site, self.typos_page_name)
    if not typos_page.exists():
        # todo: feedback
        return

    text = textlib.removeDisabledParts(
        typos_page.text, include=['nowiki'], site=self.site)
    load_all = self.load_all is True
    for template, fielddict in textlib.extract_templates_and_params(
            text, remove_disabled_parts=False, strip=False):
        if template.lower() == 'typo':
            try:
                rule = TypoRule.newFromParameters(fielddict, self.site)
            except IncompleteTypoRuleException as exc:
                pywikibot.warning(exc.message)  # pwb.exception?
            except InvalidExpressionException as exc:
                if 'fixed-width' not in exc.message:
                    pywikibot.warning('Invalid %s %s: %s' % (
                        exc.aspect, fielddict['1'], exc.message))
            else:
                rule.id = self.top_id  # fixme: cvar or ivar?
                self.top_id += 1
                if load_all or not rule.needs_decision():
                    self.typoRules.append(rule)

    pywikibot.output('%d typo rules loaded' % len(self.typoRules))
    return self.typoRules
def find_discussion(self, category: pywikibot.Category) -> 'CfdPage':
    """
    Return the relevant discussion.

    @param category: The category being discussed
    """
    if self.section():
        return self

    text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(levels=[4]):
        heading = section.filter_headings()[0]
        section_title = str(heading.title).strip()
        discussion = self.__class__(
            self.site, '{}#{}'.format(self.title(), section_title))
        if category.title() == section_title:
            return discussion
        # Split approximately into close, nom, and others.
        parts = str(section).split('(UTC)')
        if len(parts) < 3:
            continue
        # Parse the nom for category links.
        nom = mwparserfromhell.parse(parts[1], skip_style_tags=True)
        for node in nom.ifilter():
            page = self._cat_from_node(node)
            if page and category == page:
                return discussion
    return self
def find_discussion(self, category):
    """Find the section with the relevant discussion."""
    if self.section():
        return self.title(as_link=True)

    text = removeDisabledParts(self.text, site=self.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(levels=[4]):
        heading = section.filter(forcetype=Heading)[0]
        section_title = str(heading.title).strip()
        discussion = '[[{}#{}]]'.format(self.title(), section_title)
        if category.title() == section_title:
            return discussion
        # Split approximately into close, nom, and others
        parts = str(section).split('(UTC)')
        if len(parts) < 3:
            continue
        # Parse the nom for links
        for wikilink in pywikibot.link_regex.finditer(parts[1]):
            title = wikilink.group('title').strip().split('#')[0]
            if not title:
                continue
            title = pywikibot.Page(self.site, title).title()
            if category.title() == title:
                return discussion
    return self.title(as_link=True)
def __iter__(self):
    from pywikibot import xmlreader
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        text = textlib.removeDisabledParts(entry.text)
        if self.refR.search(text) and not self.referencesR.search(text):
            yield pywikibot.Page(pywikibot.Site(), entry.title)
def replace_gallery_files(
        self, wikicode: mwparserfromhell.wikicode.Wikicode) -> None:
    """
    Replace files in <gallery>.

    :param wikicode: Parsed wikitext
    """
    for tag in wikicode.ifilter_tags():
        if tag.tag.lower() != "gallery":
            continue
        lines = str(tag.contents).splitlines()
        for i, line in enumerate(lines):
            title, sep, caption = removeDisabledParts(line).partition("|")
            if not title:
                continue
            try:
                current_icon = BSiconPage(self.current_page.site, title)
                current_icon.title()
            except (pywikibot.exceptions.Error, ValueError):
                continue
            new_icon = self.opt.bsicons_map.get(current_icon, None)
            if new_icon:
                lines[i] = f"{new_icon.title()}{sep}{caption}"
                self.current_page.replacements.add(
                    Replacement(current_icon, new_icon))
        if self.current_page.replacements:
            tag.contents = "\n".join(lines) + "\n"
def lacksReferences(self, text):
    """Check whether or not the page is lacking a references tag."""
    oldTextCleaned = textlib.removeDisabledParts(text)
    if self.referencesR.search(oldTextCleaned) or \
       self.referencesTagR.search(oldTextCleaned):
        if self.getOption('verbose'):
            pywikibot.output(
                u'No changes necessary: references tag found.')
        return False
    elif self.referencesTemplates:
        templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
        if re.search(templateR, oldTextCleaned,
                     re.IGNORECASE | re.UNICODE):
            if self.getOption('verbose'):
                pywikibot.output(
                    u'No changes necessary: references template found.')
            return False
    if not self.refR.search(oldTextCleaned):
        if self.getOption('verbose'):
            pywikibot.output(u'No changes necessary: no ref tags found.')
        return False
    else:
        if self.getOption('verbose'):
            pywikibot.output(u'Found ref without references.')
        return True
def treat_page(self):
    """Process one page."""
    self.check_disabled()
    target = self.current_page.getCategoryRedirectTarget()
    seen = {self.current_page, target}
    while target.isCategoryRedirect():
        target = target.getCategoryRedirectTarget()
        if target in seen:
            pywikibot.error(
                'Skipping {} due to possible circular redirect at {}.'
                .format(self.current_page, target))
            return
        seen.add(target)
    wikicode = mwparserfromhell.parse(self.current_page.text,
                                      skip_style_tags=True)
    for tpl in wikicode.ifilter_templates():
        try:
            template = pywikibot.Page(
                self.site,
                removeDisabledParts(str(tpl.name), site=self.site),
                ns=10,
            )
            template.title()
        except pywikibot.InvalidTitle:
            continue
        if template in self.templates:
            tpl.add('1', target.title(with_ns=False))
            break
    self.put_current(str(wikicode), summary=self.getOption('summary'))
def lacksReferences(self, text) -> bool:
    """Check whether or not the page is lacking a references tag."""
    oldTextCleaned = textlib.removeDisabledParts(text)
    if self.referencesR.search(oldTextCleaned) \
       or self.referencesTagR.search(oldTextCleaned):
        if self.opt.verbose:
            pywikibot.output('No changes necessary: references tag found.')
        return False

    if self.referencesTemplates:
        templateR = '{{(' + '|'.join(self.referencesTemplates) + ')'
        if re.search(templateR, oldTextCleaned, re.IGNORECASE):
            if self.opt.verbose:
                pywikibot.output(
                    'No changes necessary: references template found.')
            return False

    if not self.refR.search(oldTextCleaned):
        if self.opt.verbose:
            pywikibot.output('No changes necessary: no ref tags found.')
        return False

    if self.opt.verbose:
        pywikibot.output('Found ref without references.')
    return True
def get_action(self, category: pywikibot.Category) -> str:
    """
    Return the discussion action.

    @param category: The category being discussed
    """
    if not self.section():
        return ''

    text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(levels=[4]):
        heading = section.filter_headings()[0]
        if str(heading.title).strip() == self.section():
            break
    else:
        section = None  # Trick pylint.
        return ''

    # Parse the discussion for category links and action.
    for line in str(section).splitlines():
        found = False
        line_wc = mwparserfromhell.parse(line, skip_style_tags=True)
        for node in line_wc.ifilter():
            page = self._cat_from_node(node)
            if page and category == page:
                found = True
                break
        matches = re.findall(r"'''Propose (.+?)'''", line)
        if found and matches:
            return matches[0]
    return ''
def treat(self, page):
    # get all linkedPages
    # check for disambigs
    linksR = re.compile(
        r'\[\[(?P<short>[^\]]*)\]\] *\|\| *\[\[(?P<long>[^\]]*)\]\]')
    res = []
    counter = 0
    if self.opt.test:
        pywikibot.output('Treat(%s)' % page.title(as_link=True))
    for p in linksR.finditer(textlib.removeDisabledParts(page.text)):
        counter += 1
        longn = p.group('long')
        shortn = p.group('short')
        if self.opt.testlinks:
            pywikibot.output('[%s][#%i] S:%s L:%s' % (
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                counter, shortn, longn))
        rpl = pywikibot.Page(pywikibot.Site(), longn)
        rplcount = len(list(rpl.getReferences(namespaces=0)))
        if self.opt.testlinks:
            pywikibot.output(
                'L:%s #%i In %s checking:%s - referenced by %i' % (
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    counter, page.title(as_link=True),
                    rpl.title(as_link=True), rplcount))
        rps = pywikibot.Page(pywikibot.Site(), shortn)
        rpscount = len(list(rps.getReferences(namespaces=0)))
        if self.opt.testlinks:
            pywikibot.output(
                'S:%s #%i In %s checking:%s - referenced by %i' % (
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    counter, page.title(as_link=True),
                    rps.title(as_link=True), rpscount))
        res.append({"long": longn, "refl": rplcount,
                    "short": shortn, "refs": rpscount})
    print(res)
    return res
def __iter__(self):
    import xmlreader
    dump = xmlreader.XmlDump(self.xmlFilename)
    for entry in dump.parse():
        text = textlib.removeDisabledParts(entry.text)
        if self.refR.search(text) and not self.referencesR.search(text):
            yield pywikibot.Page(pywikibot.Site(), entry.title)
def getWordCount(self, text):
    text = textlib.removeDisabledParts(text)
    text = textlib.removeHTMLParts(text)
    text = textlib.removeLanguageLinks(text)
    text = textlib.removeCategoryLinks(text)
    word_list = re.findall(r"[\w']+", text)
    return len(word_list)
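# Usage sketch (illustrative): `bot` stands for an instance of the class that
# defines getWordCount above, and the wikitext sample is made up; a working
# pywikibot configuration is assumed for the textlib helpers.
sample = "'''Example''' text with a note.<!-- hidden --> [[Category:Tests]]"
print(bot.getWordCount(sample))  # number of words left after cleanup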
def checkUserPage(self, page):
    text = page.text
    if self.catsPresent(textlib.removeDisabledParts(text)):
        text = textlib.replaceExcept(text, ur'\[\[kategoria', '[[:Kategoria',
                                     ['comment', 'pre', 'nowiki'],
                                     caseInsensitive=True)
        # text = re.sub('\[\[kategoria', '[[:Kategoria', text, flags=re.I)
        pywikibot.output(u'Kategorie usunięte')  # 'Categories removed'
        page.text = text
        # page.save(summary=u'Bot usuwa stronę użytkownika z kategorii',
        #           apply_cosmetic_changes=False)
        # (summary: 'Bot removes the user page from categories')
    else:
        pywikibot.output(u'Strona użytkownika OK')  # 'User page is OK'
    return
def get_pages_with_descriptions(self, text):
    tags = {'category', 'comment', 'file', 'header', 'hyperlink',
            'interwiki', 'nowiki', 'pre', 'ref', 'source', 'timeline',
            'template'}
    text = textlib.removeDisabledParts(text, tags, site=self.site)
    data = {}
    for match in self.regex.finditer(text):
        title, desc = match.groups()
        page = pywikibot.Page(self.site, title)
        data[page] = self.parse_description(desc)
    return data
def check(text, languages):
    tags = ['comments', 'nowiki', 'pre', 'source']
    text = textlib.removeDisabledParts(text, tags)
    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    for lang, pagetitle in interwikiR.findall(text):
        lang = lang.lower()
        # Check if it really is in fact an interwiki link to a known
        # language, or if it's e.g. a category tag or an internal link
        if lang in languages:
            return True
    return False
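# Usage sketch (illustrative): with the helper above importable, text carrying
# an old-style interwiki link to a known language code should be detected.
known_languages = ['de', 'en', 'fr']
print(check('Some text with [[de:Beispiel]] at the bottom.', known_languages))  # True expected
print(check('Only a category here: [[Category:Foo]]', known_languages))         # False expected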
def treat(self, page):
    match = self.current_rule.find.search(page.text)
    if not match:
        return
    text = textlib.removeDisabledParts(page.text, TypoRule.exceptions,
                                       site=self.site)
    match = self.current_rule.find.search(text)
    if match:
        text = self.pattern.format(page.title(as_link=True), match.group(0))
        pywikibot.stdout(text)
        self.data.append(text)
def processArticle(page):
    text = page.get()
    text = textlib.removeDisabledParts(text)
    # pywikibot.output(u'Working on "%s"' % title)
    global codeRegexp
    global templateRegexp
    result = re.findall(codeRegexp, text)
    template = re.findall(templateRegexp, text)
    if len(result) > 0 and len(template) == 0:
        msg = u"* [[%s]]: " % page.title()
        for res in result:
            msg += str(res)
        log(msg)
        pywikibot.output(msg)
def lacksReferences(self, text):
    """Check whether or not the page is lacking a references tag."""
    oldTextCleaned = textlib.removeDisabledParts(text)
    if self.referencesR.search(oldTextCleaned) or \
       self.referencesTagR.search(oldTextCleaned):
        return False
    elif self.referencesTemplates:
        templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
        if re.search(templateR, oldTextCleaned,
                     re.IGNORECASE | re.UNICODE):
            return False
    if not self.refR.search(oldTextCleaned):
        return False
    else:
        return True
def treat_property_and_talk(self, prop, page):
    self.current_talk_page = page
    # todo: skip sandbox properties
    # todo: removeDisabledParts now?
    code = mwparserfromhell.parse(page.text, skip_style_tags=True)
    for template in code.ifilter_templates():
        if not template.name.matches(self.template_metadata):
            continue
        params = OrderedDict()
        for param in template.params:
            params[str(param.name).strip()] = str(param.value).strip()
        break
    else:
        pywikibot.output('Template "{}" not found'.format(
            self.template_metadata))
        return

    keys = set(self.func_dict.keys()) & set(params.keys())
    # formatter URL must go before example
    if {'formatter URL', 'example'} <= keys:
        keys.remove('formatter URL')
        keys = ['formatter URL'] + list(keys)

    clear_params = []
    for key in keys:
        param = textlib.removeDisabledParts(params[key])
        if param == '-':
            continue
        if param != '':
            pywikibot.output('Found param "{}"'.format(key))
            try:
                remove = self.func_dict[key](param)
            except pywikibot.data.api.APIError as exc:
                remove = False
            if remove:
                clear_params.append(key)

    if self.getOption('importonly'):
        return

    for par in clear_params:
        template.remove(par, keep_field=True)
    for par in set(params.keys()) & set(self.obsolete_params):
        template.remove(par)

    self.current_page = self.current_talk_page
    self.put_current(str(code), show_diff=True,
                     summary='removing migrated/obsolete parameters')
def parse_page(page):
    """Parse a CFD working page."""
    text = removeDisabledParts(page.text, site=page.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(flat=True, include_lead=False):
        heading = section.filter(forcetype=Heading)[0]
        section_title = str(heading.title).lower()
        print(section_title)
        if 'move' in section_title:
            mode = 'move'
            edit_summary = 'Moving {old_cat} to {new_cats} per {cfd}'
        elif 'empty' in section_title:
            mode = 'empty'
            edit_summary = 'Removing {old_cat} per {cfd}'
        else:
            continue
        parse_section(section, page.site, mode, edit_summary)
def parse(self) -> None:
    """Parse the page."""
    text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(flat=True, include_lead=False):
        heading = section.filter_headings()[0]
        section_title = str(heading.title).lower()
        for mode in self.MODES:
            if mode in section_title:
                self.mode = mode
                break
        else:
            continue
        try:
            self._parse_section(str(section))
        except (ValueError, pywikibot.Error):
            pywikibot.exception(tb=True)
    self._check_run()
def sectionList(self, page):
    sections = []
    sectionR = re.compile(ur'(?im)^=+(?P<section>[^<]*?)(<ref.*?)?=+$')
    text = page.text
    # expand templates
    etext = page.expand_text()
    etext = textlib.removeDisabledParts(etext)
    # if self.getOption('test'):
    #     pywikibot.output(etext)
    for s in sectionR.finditer(etext):
        if self.getOption('test'):
            pywikibot.output(u'>>>%s<<<' % s.group('section').strip())
        sections.append(s.group('section').strip())
    return sections
def parse_description(self, text):
    desc = textlib.removeDisabledParts(text, [
        'comment', 'file', 'nowiki', 'template',
        self.FORMATTING_REGEX, self.REF_REGEX])
    desc = LINK_REGEX.sub(self.handle_link, desc)
    # Normalize no-break spaces to plain spaces before trimming.
    desc = desc.replace('\xa0', ' ').strip()
    desc = re.sub(r' *\([^)]+\)$', '', desc)
    desc = desc.partition(';')[0]
    desc = re.sub(r'^.*\) [-–] +', '', desc)
    desc = re.sub(r'^\([^)]+\) +', '', desc)
    while ' ' * 2 in desc:
        desc = desc.replace(' ' * 2, ' ')
    if re.search(r'[^IVX]\.$', desc) or desc.endswith(tuple(',:')):
        desc = desc[:-1].rstrip()
    if desc.startswith(('a ', 'an ')):
        desc = desc.partition(' ')[2]
    return desc
def get_result(self) -> str:
    """Return the discussion result."""
    if not self.section():
        return ''

    text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(levels=[4]):
        heading = section.filter_headings()[0]
        if str(heading.title).strip() == self.section():
            break
    else:
        section = None  # Trick pylint.
        return ''

    for line in str(section).splitlines():
        matches = re.findall(
            r"''The result of the discussion was:''\s+'''(.+?)'''", line)
        if matches:
            return matches[0]
    return ''
def replace_file_links(self, text: str) -> str:
    """
    Return text with file links replaced.

    :param text: Article text
    """
    assert self.site_config is not None
    for match in self.site_config.file_regex.finditer(
            removeDisabledParts(text)):
        try:
            current_icon = BSiconPage(self.current_page.site,
                                      match.group("filename"))
            current_icon.title()
        except (pywikibot.exceptions.Error, ValueError):
            continue
        new_icon = self.opt.bsicons_map.get(current_icon, None)
        if new_icon:
            text = text.replace(match.group("filename"),
                                new_icon.title(with_ns=False))
            self.current_page.replacements.add(
                Replacement(current_icon, new_icon))
    return text
def treat(self, page):
    # get all linkedPages
    # check for disambigs
    linkR = re.compile(
        ur'\[\[(?P<title>.*?)(#(?P<section>.*?))?(\|(?P<label>.*?))?\]\]')
    counter = 0
    reqcounter = 0
    checkedpages = []
    for p in linkR.finditer(textlib.removeDisabledParts(page.text)):
        counter += 1
        t = p.group('title')
        if t in checkedpages or t == '':
            continue
        try:
            rp = pywikibot.Page(pywikibot.Site(), t)
            if not rp.namespace() == 0:
                continue
            if self.getOption('testlinks'):
                pywikibot.output(
                    u'%s #%i (%i) In %s checking:%s'
                    % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                       counter, reqcounter, page.title(asLink=True),
                       rp.title(asLink=True)))
            if not rp.exists():
                reqcounter += 1
                self.addResult(page.title(), rp.title())
            checkedpages.append(t)
        except KeyboardInterrupt:
            pywikibot.output(
                'PICKLING at %s'
                % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            with open('masti/reqlinks.dat', 'wb') as f:
                pickle.dump(self.results, f,
                            protocol=config.pickle_protocol)
            return 'STOP'
        except:
            continue
    return reqcounter
def replace_template_files(
        self, wikicode: mwparserfromhell.wikicode.Wikicode) -> None:
    """
    Replace files in templates.

    :param wikicode: Parsed wikitext
    """
    assert self.site_config is not None
    for tpl in wikicode.ifilter_templates():
        try:
            template = Page(
                self.current_page.site,
                removeDisabledParts(str(tpl.name)),
                ns=self.current_page.site.namespaces.TEMPLATE,
            )
            template.title()
        except (pywikibot.exceptions.Error, ValueError):
            continue
        if template in self.site_config.routemap_templates:
            self._replace_routemap_files(tpl)
        elif template in self.site_config.railway_track_templates:
            self._replace_rt_template_files(tpl)
        else:
            self._replace_bs_template_files(tpl, template)
def treat_page(self) -> None:
    """Process one page."""
    self.check_disabled()
    try:
        errors = self.validate_svg()
    except (AssertionError, RequestException, RuntimeError):
        pywikibot.exception()
        return
    if errors:
        n_errors = len(errors)
        new_tpl = Template('Invalid SVG')
        new_tpl.add('1', n_errors)
        summary = 'W3C invalid SVG: {} error{}'.format(
            n_errors, 's' if n_errors > 1 else '')
    else:
        new_tpl = Template('Valid SVG')
        summary = 'W3C valid SVG'
    wikicode = mwparserfromhell.parse(self.current_page.text,
                                      skip_style_tags=True)
    for tpl in wikicode.ifilter_templates():
        try:
            template = pywikibot.Page(
                self.site,
                removeDisabledParts(str(tpl.name), site=self.site).strip(),
                ns=10,
            )
            template.title()
        except pywikibot.InvalidTitle:
            continue
        if template in self.templates:
            wikicode.replace(tpl, new_tpl)
            break
    else:
        wikicode.insert(0, '\n')
        wikicode.insert(0, new_tpl)
    self.put_current(str(wikicode), summary=summary, minor=not errors)
def treat(self, page):
    """Return page title if param 'text' not in page."""
    if self.getOption('wikipedia'):
        resultR = re.compile(
            ur'(?i)(?P<result>https?://(?P<lang>[^\.]*?)\.(?P<project>wikipedia)\.org/wiki/[^\s\|<\]\}]*)')
    else:
        resultR = re.compile(
            ur'(?i)(?P<result>https?://(?P<lang>[^\.]*?)\.(?P<project>wikipedia|wikisource|wiktionary|wikivoyage)\.org/wiki/[^\s\|<\]\}]*)')
    # allowed filetypes: svg, png, jpeg, tiff, gif, xcf
    imageR = re.compile(ur'(?i).*\.(svg|png|jpeg|jpg|tiff|tif|gif|xcf)$')
    source = textlib.removeDisabledParts(page.text)

    # return all found results
    resultslist = []
    found = False
    for r in re.finditer(resultR, source):
        if self.getOption('test'):
            pywikibot.output(u'R:%s' % r.group('result'))
        img = imageR.search(r.group('result'))
        if not img:
            resultslist.append({
                'link': r.group('result'),
                'lang': r.group('lang'),
                'project': r.group('project'),
            })
            found = True
    if found:
        return {'page': page.title(), 'links': resultslist}
    return None
def find_and_replace(self, text, init):
    new_params = []
    old_params = []
    unknown_params = []
    removed_params = []
    changed = False
    for template, fielddict in textlib.extract_templates_and_params(
            text, remove_disabled_parts=False, strip=False):
        if self.normalize(template) not in (self.template,
                                            self.new_template):
            continue

        changed = self.normalize(template) != self.new_template
        start_match = re.search(r'\{\{\s*((%s)\s*:\s*)?%s\s*' % (
            '|'.join(self.site.namespaces[10]),
            re.escape(template)), text)
        if not start_match:
            if not init:
                pywikibot.error("Couldn't find the template")
            return text, 0

        start = start_match.start()
        if len(fielddict) > 0:
            end = text.index('|', start)
        else:
            end = text.index('}}', start)

        unnamed = {}
        for name, value in chain(fielddict.items(), IterUnnamed(unnamed)):
            end += len('|%s=%s' % (name, value))
            name = name.strip()
            # 'Zastaralé parametry' = 'obsolete parameters',
            # 'Neznámé parametry' = 'unknown parameters' (Czech markers
            # kept as-is: they appear verbatim in the target wikitext).
            value = (value
                     .replace('\n<!-- Zastaralé parametry -->', '')
                     .replace('\n<!-- Neznámé parametry -->', '')
                     .strip())
            try:
                new_name = self.handle_param(name)
            except OldParamException:
                if textlib.removeDisabledParts(value, ['comments']).strip():
                    old_params.append((name, value))
            except RemoveParamException:
                changed = True
                if textlib.removeDisabledParts(value, ['comments']).strip():
                    removed_params.append((name, value))
            except UnknownParamException:
                if textlib.removeDisabledParts(value, ['comments']).strip():
                    unknown_params.append((name, value))
            except AssertionError:
                pywikibot.error('Couldn\'t handle parameter "%s"' % name)
                return text, 0
            except UnnamedParamException:
                unnamed[value] = ''
            else:
                new_params.append((new_name, value))
                if new_name != name:
                    changed = True

        end += len('}}')
        while text[start:end].count('{{') < text[start:end].count('}}'):
            end = text[:end].rindex('}}') + len('}}')

        if text[start:end].count('{{') > text[start:end].count('}}'):
            ballance = 1
            end = start
            while ballance > 0:
                next_close = text.index('}}', end)
                ballance += text[end:next_close].count('{{') - 1
                end = next_close + len('}}')
        if not text[start:end].endswith('}}'):  # elif?
            end = text[:end].rindex('}}') + len('}}')

        if (end < start or not text[start:end].endswith('}}')
                or text[start:end].count('{{') != text[start:end].count('}}')):
            pywikibot.error("Couldn't parse the template")
            return text, 0

        break
    else:
        pywikibot.error("Couldn't parse the template")
        return text, 0

    if not changed:
        pywikibot.output('No parameters changed')
        return text, 0

    while end < len(text) and text[end].isspace():  # todo: also before
        end += 1

    lines = []
    nested = 0
    for line in text[start:end].splitlines():
        if nested == 1 and re.match(r' *\|', line):
            lines.append(line)
        nested += line.count('{{') - line.count('}}')

    space_before = ''
    if len(lines) > 0 and choice(lines).startswith(' '):
        space_before = ' '

    self.handle_params(new_params, old_params, removed_params,
                       unknown_params)
    self.deduplicate(new_params)
    new_params.sort(key=self.key_for_sort)

    new_template = '{{%s' % self.new_template
    if len(new_params) > 0:
        new_template += '\n'
        for param, value in new_params:
            new_template += '%s| %s = %s\n' % (space_before, param, value)
    if len(old_params) > 0:
        new_template += '<!-- Zastaralé parametry -->\n'
        for param, value in old_params:
            new_template += '%s| %s = %s\n' % (space_before, param, value)
    if len(unknown_params) > 0:
        new_template += '<!-- Neznámé parametry -->\n'
        for param, value in unknown_params:
            new_template += '%s| %s = %s\n' % (space_before, param, value)
    new_template += '}}\n'

    return text[:start] + new_template + text[end:], end
def run(self):
    """Run the Bot."""
    try:
        deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
    except IOError:
        raise NotImplementedError(
            '404-links.txt is required for reflinks.py\n'
            'You need to download\n'
            'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n'
            'and to unzip it in the same directory')

    editedpages = 0
    for page in self.generator:
        try:
            # Load the page's text from the wiki
            new_text = page.get()
            if not page.canBeEdited():
                pywikibot.output(u"You can't edit page %s"
                                 % page.title(asLink=True))
                continue
        except pywikibot.NoPage:
            pywikibot.output(u'Page %s not found' % page.title(asLink=True))
            continue
        except pywikibot.IsRedirectPage:
            pywikibot.output(u'Page %s is a redirect'
                             % page.title(asLink=True))
            continue

        # for each link to change
        for match in linksInRef.finditer(
                textlib.removeDisabledParts(page.get())):

            link = match.group(u'url')
            # debugging purpose
            # print link
            if u'jstor.org' in link:
                # TODO: Clean URL blacklist
                continue

            ref = RefLink(link, match.group('name'))
            f = None

            try:
                f = comms.http.fetch(
                    ref.url, use_fake_user_agent=self._use_fake_user_agent)

                # Try to get Content-Type from server
                contentType = f.response_headers.get('content-type')
                if contentType and not self.MIME.search(contentType):
                    if ref.link.lower().endswith('.pdf') and \
                       not self.getOption('ignorepdf'):
                        # If file has a PDF suffix
                        self.getPDFTitle(ref, f)
                    else:
                        pywikibot.output(color_format(
                            '{lightyellow}WARNING{default} : '
                            'media : {0} ', ref.link))
                    if ref.title:
                        if not re.match(
                                u'(?i) *microsoft (word|excel|visio)',
                                ref.title):
                            ref.transform(ispdf=True)
                            repl = ref.refTitle()
                        else:
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'PDF title blacklisted : {0} ', ref.title))
                            repl = ref.refLink()
                    else:
                        repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Get the real url where we end (http redirects !)
                redir = f.data.url
                if redir != ref.link and \
                   domain.findall(redir) == domain.findall(link):
                    if soft404.search(redir) and \
                       not soft404.search(ref.link):
                        pywikibot.output(color_format(
                            '{lightyellow}WARNING{default} : '
                            'Redirect 404 : {0} ', ref.link))
                        continue
                    if dirIndex.match(redir) and \
                       not dirIndex.match(ref.link):
                        pywikibot.output(color_format(
                            u'{lightyellow}WARNING{default} : '
                            u'Redirect to root : {0} ', ref.link))
                        continue

                if f.status != requests.codes.ok:
                    pywikibot.output(u'HTTP error (%s) for %s on %s'
                                     % (f.status, ref.url,
                                        page.title(asLink=True)),
                                     toStdout=True)
                    # 410 Gone, indicates that the resource has been
                    # purposely removed
                    if f.status == 410 or \
                       (f.status == 404 and
                            (u'\t%s\t' % ref.url in deadLinks)):
                        repl = ref.refDead()
                        new_text = new_text.replace(match.group(), repl)
                    continue

                linkedpagetext = f.raw
            except UnicodeError:
                # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                # in [[fr:Cyanure]]
                pywikibot.output(color_format(
                    '{lightred}Bad link{default} : {0} in {1}',
                    ref.url, page.title(asLink=True)))
                continue
            except (URLError,
                    socket.error,
                    IOError,
                    httplib.error) as e:
                pywikibot.output(u'Can\'t retrieve page %s : %s'
                                 % (ref.url, e))
                continue

            # remove <script>/<style>/comments/CDATA tags
            linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)

            meta_content = self.META_CONTENT.search(linkedpagetext)
            enc = []
            s = None
            if contentType:
                # use charset from http header
                s = self.CHARSET.search(contentType)
            if meta_content:
                tag = meta_content.group()
                # Prefer the contentType from the HTTP header :
                if not contentType:
                    contentType = tag
                if not s:
                    # use charset from html
                    s = self.CHARSET.search(str(tag))
            if s:
                tmp = s.group('enc').strip("\"' ").lower()
                naked = re.sub(r'[ _\-]', '', tmp)
                # Convert to python correct encoding names
                if naked == "gb2312":
                    enc.append("gbk")
                elif naked == "shiftjis":
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif naked == "xeucjp":
                    enc.append("euc-jp")
                else:
                    enc.append(tmp)
            else:
                pywikibot.output(u'No charset found for %s' % ref.link)
            if not contentType:
                pywikibot.output(u'No content-type found for %s' % ref.link)
                continue
            elif not self.MIME.search(contentType):
                pywikibot.output(color_format(
                    '{lightyellow}WARNING{default} : media : {0} ',
                    ref.link))
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                continue

            # Ugly hacks to try to survive when both server and page
            # return no encoding.
            # Uses most used encodings for each national suffix
            if u'.ru' in ref.link or u'.su' in ref.link:
                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                # encoding, no page encoding
                enc = enc + ['koi8-r', 'windows-1251']
            elif u'.jp' in ref.link:
                enc.append("shift jis 2004")
                enc.append("cp932")
            elif u'.kr' in ref.link:
                enc.append("euc-kr")
                enc.append("cp949")
            elif u'.zh' in ref.link:
                enc.append("gbk")

            if 'utf-8' not in enc:
                enc.append('utf-8')
            try:
                u = linkedpagetext.decode(enc[0])   # Bug T69410
            except (UnicodeDecodeError, LookupError) as e:
                pywikibot.output(u'%s : Decoding error - %s' % (ref.link, e))
                continue

            # Retrieves the first non empty string inside <title> tags
            for m in self.TITLE.finditer(u):
                t = m.group()
                if t:
                    ref.title = t
                    ref.transform()
                    if ref.title:
                        break

            if not ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : No title found...' % ref.link)
                continue

            # XXX Ugly hack
            if u'é' in ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                continue

            if self.titleBlackList.match(ref.title):
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(color_format(
                    '{lightred}WARNING{default} {0} : '
                    'Blacklisted title ({1})', ref.link, ref.title))
                continue

            # Truncate long titles. 175 is arbitrary
            if len(ref.title) > 175:
                ref.title = ref.title[:175] + "..."

            repl = ref.refTitle()
            new_text = new_text.replace(match.group(), repl)

        # Add <references/> when needed, but ignore templates !
        if page.namespace != 10:
            if self.norefbot.lacksReferences(new_text):
                new_text = self.norefbot.addReferences(new_text)

        new_text = self.deduplicator.process(new_text)
        old_text = page.text

        self.userPut(page, old_text, new_text, summary=self.msg,
                     ignore_save_related_errors=True,
                     ignore_server_errors=True)

        if new_text == old_text:
            continue
        else:
            editedpages += 1

        if self.getOption('limit') and editedpages >= self.getOption('limit'):
            pywikibot.output('Edited %s pages, stopping.'
                             % self.getOption('limit'))
            return

        if self.site_stop_page and editedpages % 20 == 0:
            self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
            if self.stop_page.exists():
                pywikibot.output(color_format(
                    '{lightgreen}Checking stop page...{default}'))
                actual_rev = self.stop_page.latest_revision_id
                if actual_rev != self.stop_page_rev_id:
                    pywikibot.output(
                        '%s has been edited : Someone wants us to stop.'
                        % self.stop_page.title(asLink=True))
                    return
def add_text(page, addText, summary=None, regexSkip=None,
             regexSkipUrl=None, always=False, up=False, putText=True,
             oldTextGiven=None, reorderEnabled=True, create=False):
    """
    Add text to a page.

    @rtype: tuple of (text, newtext, always)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    # When a page is tagged as "really well written" it has a star in the
    # interwiki links. This is a list of all the templates used (in regex
    # format) to make the stars appear.
    errorCount = 0

    if putText:
        pywikibot.output(u'Loading %s...' % page.title())
    if oldTextGiven is None:
        try:
            text = page.get()
        except pywikibot.NoPage:
            if create:
                pywikibot.output(u"%s doesn't exist, creating it!"
                                 % page.title())
                text = u''
            else:
                pywikibot.output(u"%s doesn't exist, skip!" % page.title())
                return (False, False, always)
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"%s is a redirect, skip!" % page.title())
            return (False, False, always)
    else:
        text = oldTextGiven

    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -exceptUrl '
                             'is in the page. Skip!\n'
                             'Match was: %s' % result)
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -except '
                             'is in the page. Skip!\n'
                             'Match was: %s' % result)
            return (False, False, always)

    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', config.line_separator)
        if (reorderEnabled):
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)
            # Adding the text
            newtext += u"%s%s" % (config.line_separator, addText)
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext,
                                                   categoriesInside,
                                                   site, True)
            # Dealing the stars' issue
            allstars = []
            starstext = textlib.removeDisabledParts(text)
            for star in starsList:
                regex = re.compile(
                    r'(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I)
                found = regex.findall(starstext)
                if found != []:
                    newtext = regex.sub('', newtext)
                    allstars += found
            if allstars != []:
                newtext = newtext.strip() + config.line_separator * 2
                allstars.sort()
                for element in allstars:
                    newtext += '%s%s' % (element.strip(), config.LS)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += u"%s%s" % (config.line_separator, addText)
    else:
        newtext = addText + config.line_separator + text

    if putText and text != newtext:
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(text, newtext)

    # Let's put the changes.
    while True:
        # If someone load it as module, maybe it's not so useful to put the
        # text in the page
        if putText:
            if not always:
                choice = pywikibot.input_choice(
                    u'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')], 'n',
                    automatic_quit=False)
                if choice == 'a':
                    always = True
                elif choice == 'n':
                    return (False, False, always)
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            if always or choice == 'y':
                try:
                    if always:
                        page.put(newtext, summary,
                                 minorEdit=page.namespace() != 3)
                    else:
                        page.put_async(newtext, summary,
                                       minorEdit=page.namespace() != 3)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Edit conflict! skip!')
                    return (False, False, always)
                except pywikibot.ServerError:
                    errorCount += 1
                    if errorCount < config.max_retries:
                        pywikibot.output(u'Server Error! Wait..')
                        time.sleep(config.retry_wait)
                        continue
                    else:
                        raise pywikibot.ServerError(u'Fifth Server Error!')
                except pywikibot.SpamfilterError as e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s'
                        % (page.title(), e.url))
                    return (False, False, always)
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)'
                                     % page.title())
                    return (False, False, always)
                except pywikibot.PageNotSaved as error:
                    pywikibot.output(u'Error putting page: %s' % error.args)
                    return (False, False, always)
                else:
                    # Break only if the errors are one after the other...
                    errorCount = 0
                    return (True, True, always)
        else:
            return (text, newtext, always)
def processRE(param, rx):
    cleaned_text = textlib.removeDisabledParts(unicode(param.value.strip()))
    relist = re.findall(rx, cleaned_text)
    return relist
def remove_cats_and_comments(self, text):
    """Remove categories, comments and trailing spaces from wikitext."""
    text = textlib.removeCategoryLinks(text, site=self.site)
    text = textlib.removeDisabledParts(text, tags=['comments'])
    return text.strip()
def run(self):
    """Run the Bot."""
    try:
        deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
    except IOError:
        pywikibot.output(
            'You need to download '
            'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
            'and to ungzip it in the same directory')
        raise
    socket.setdefaulttimeout(30)
    editedpages = 0
    for page in self.generator:
        try:
            # Load the page's text from the wiki
            new_text = page.get()
            if not page.canBeEdited():
                pywikibot.output(u"You can't edit page %s"
                                 % page.title(asLink=True))
                continue
        except pywikibot.NoPage:
            pywikibot.output(u'Page %s not found' % page.title(asLink=True))
            continue
        except pywikibot.IsRedirectPage:
            pywikibot.output(u'Page %s is a redirect'
                             % page.title(asLink=True))
            continue

        # for each link to change
        for match in linksInRef.finditer(
                textlib.removeDisabledParts(page.get())):

            link = match.group(u'url')
            # debugging purpose
            # print link
            if u'jstor.org' in link:
                # TODO: Clean URL blacklist
                continue

            ref = RefLink(link, match.group('name'))
            f = None
            try:
                socket.setdefaulttimeout(20)
                try:
                    f = urlopen(ref.url.decode("utf8"))
                except UnicodeError:
                    ref.url = quote(ref.url.encode("utf8"), "://")
                    f = urlopen(ref.url)
                # Try to get Content-Type from server
                headers = f.info()
                contentType = headers.getheader('Content-Type')
                if contentType and not self.MIME.search(contentType):
                    if ref.link.lower().endswith('.pdf') and \
                       not self.getOption('ignorepdf'):
                        # If file has a PDF suffix
                        self.getPDFTitle(ref, f)
                    else:
                        pywikibot.output(
                            u'\03{lightyellow}WARNING\03{default} : '
                            u'media : %s ' % ref.link)
                    if ref.title:
                        if not re.match(
                                u'(?i) *microsoft (word|excel|visio)',
                                ref.title):
                            ref.transform(ispdf=True)
                            repl = ref.refTitle()
                        else:
                            pywikibot.output(
                                u'\03{lightyellow}WARNING\03{default} : '
                                u'PDF title blacklisted : %s ' % ref.title)
                            repl = ref.refLink()
                    else:
                        repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Get the real url where we end (http redirects !)
                redir = f.geturl()
                if redir != ref.link and \
                   domain.findall(redir) == domain.findall(link):
                    if soft404.search(redir) and \
                       not soft404.search(ref.link):
                        pywikibot.output(
                            u'\03{lightyellow}WARNING\03{default} : '
                            u'Redirect 404 : %s ' % ref.link)
                        continue
                    if dirIndex.match(redir) and \
                       not dirIndex.match(ref.link):
                        pywikibot.output(
                            u'\03{lightyellow}WARNING\03{default} : '
                            u'Redirect to root : %s ' % ref.link)
                        continue

                # uncompress if necessary
                if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
                    # XXX: small issue here: the whole page is downloaded
                    # through f.read(). It might fetch big files/pages.
                    # However, truncating an encoded gzipped stream is not
                    # an option, or unzipping will fail.
                    compressed = io.BytesIO(f.read())
                    f = gzip.GzipFile(fileobj=compressed)

                # Read the first 1,000,000 bytes (0.95 MB)
                linkedpagetext = f.read(1000000)
                socket.setdefaulttimeout(None)

            except UnicodeError:
                # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                # in [[fr:Cyanure]]
                pywikibot.output(
                    u'\03{lightred}Bad link\03{default} : %s in %s'
                    % (ref.url, page.title(asLink=True)))
                continue
            except HTTPError as e:
                pywikibot.output(u'HTTP error (%s) for %s on %s'
                                 % (e.code, ref.url,
                                    page.title(asLink=True)),
                                 toStdout=True)
                # 410 Gone, indicates that the resource has been purposely
                # removed
                if e.code == 410 or \
                   (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
                    repl = ref.refDead()
                    new_text = new_text.replace(match.group(), repl)
                continue
            except (URLError,
                    socket.error,
                    IOError,
                    httplib.error) as e:
                pywikibot.output(u'Can\'t retrieve page %s : %s'
                                 % (ref.url, e))
                continue
            except ValueError:
                # Known bug of httplib, google for :
                # "httplib raises ValueError reading chunked content"
                continue
            finally:
                if f:
                    f.close()

            # remove <script>/<style>/comments/CDATA tags
            linkedpagetext = self.NON_HTML.sub('', linkedpagetext)

            meta_content = self.META_CONTENT.search(linkedpagetext)
            enc = []
            s = None
            if contentType:
                # use charset from http header
                s = self.CHARSET.search(contentType)
            if meta_content:
                tag = meta_content.group()
                # Prefer the contentType from the HTTP header :
                if not contentType:
                    contentType = tag
                if not s:
                    # use charset from html
                    s = self.CHARSET.search(tag)
            if s:
                tmp = s.group('enc').strip("\"' ").lower()
                naked = re.sub(r'[ _\-]', '', tmp)
                # Convert to python correct encoding names
                if naked == "gb2312":
                    enc.append("gbk")
                elif naked == "shiftjis":
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif naked == "xeucjp":
                    enc.append("euc-jp")
                else:
                    enc.append(tmp)
            else:
                pywikibot.output(u'No charset found for %s' % ref.link)
            if not contentType:
                pywikibot.output(u'No content-type found for %s' % ref.link)
                continue
            elif not self.MIME.search(contentType):
                pywikibot.output(
                    u'\03{lightyellow}WARNING\03{default} : media : %s '
                    % ref.link)
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                continue

            # Ugly hacks to try to survive when both server and page
            # return no encoding.
            # Uses most used encodings for each national suffix
            if u'.ru' in ref.link or u'.su' in ref.link:
                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                # encoding, no page encoding
                enc = enc + ['koi8-r', 'windows-1251']
            elif u'.jp' in ref.link:
                enc.append("shift jis 2004")
                enc.append("cp932")
            elif u'.kr' in ref.link:
                enc.append("euc-kr")
                enc.append("cp949")
            elif u'.zh' in ref.link:
                enc.append("gbk")

            if 'utf-8' not in enc:
                enc.append('utf-8')
            try:
                u = linkedpagetext.decode(enc[0])   # Bug 67410
            except (UnicodeDecodeError, LookupError) as e:
                pywikibot.output(u'%s : Decoding error - %s' % (ref.link, e))
                continue

            # Retrieves the first non empty string inside <title> tags
            for m in self.TITLE.finditer(u):
                t = m.group()
                if t:
                    ref.title = t
                    ref.transform()
                    if ref.title:
                        break

            if not ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : No title found...' % ref.link)
                continue

            # XXX Ugly hack
            if u'é' in ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                continue

            if self.titleBlackList.match(ref.title):
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'\03{lightred}WARNING\03{default} %s : '
                                 u'Blacklisted title (%s)'
                                 % (ref.link, ref.title))
                continue

            # Truncate long titles. 175 is arbitrary
            if len(ref.title) > 175:
                ref.title = ref.title[:175] + "..."

            repl = ref.refTitle()
            new_text = new_text.replace(match.group(), repl)

        # Add <references/> when needed, but ignore templates !
        if page.namespace != 10:
            if self.norefbot.lacksReferences(new_text):
                new_text = self.norefbot.addReferences(new_text)

        new_text = self.deduplicator.process(new_text)

        self.userPut(page, page.text, new_text, comment=self.msg,
                     ignore_save_related_errors=True,
                     ignore_server_errors=True)

        if new_text == page.text:
            continue
        else:
            editedpages += 1

        if self.getOption('limit') and editedpages >= self.getOption('limit'):
            pywikibot.output('Edited %s pages, stopping.'
                             % self.getOption('limit'))
            return

        if editedpages % 20 == 0:
            pywikibot.output(
                '\03{lightgreen}Checking stop page...\03{default}')
            actualRev = self.stopPage.latest_revision_id
            if actualRev != self.stopPageRevId:
                pywikibot.output(
                    u'[[%s]] has been edited : Someone wants us to stop.'
                    % self.stopPage)
                return
def run(self):
    """Run the Bot."""
    try:
        deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
    except IOError:
        pywikibot.output(
            'You need to download '
            'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
            'and to ungzip it in the same directory')
        raise
    editedpages = 0
    for page in self.generator:
        try:
            # Load the page's text from the wiki
            new_text = page.get()
            if not page.canBeEdited():
                pywikibot.output(u"You can't edit page %s"
                                 % page.title(asLink=True))
                continue
        except pywikibot.NoPage:
            pywikibot.output(u'Page %s not found' % page.title(asLink=True))
            continue
        except pywikibot.IsRedirectPage:
            pywikibot.output(u'Page %s is a redirect'
                             % page.title(asLink=True))
            continue

        # for each link to change
        for match in linksInRef.finditer(
                textlib.removeDisabledParts(page.get())):

            link = match.group(u'url')
            # debugging purpose
            # print link
            if u'jstor.org' in link:
                # TODO: Clean URL blacklist
                continue

            ref = RefLink(link, match.group('name'))
            f = None

            try:
                f = comms.http.fetch(
                    ref.url, use_fake_user_agent=self._use_fake_user_agent)

                # Try to get Content-Type from server
                contentType = f.response_headers.get('content-type')
                if contentType and not self.MIME.search(contentType):
                    if ref.link.lower().endswith('.pdf') and \
                       not self.getOption('ignorepdf'):
                        # If file has a PDF suffix
                        self.getPDFTitle(ref, f)
                    else:
                        pywikibot.output(color_format(
                            '{lightyellow}WARNING{default} : '
                            'media : {0} ', ref.link))
                    if ref.title:
                        if not re.match(
                                u'(?i) *microsoft (word|excel|visio)',
                                ref.title):
                            ref.transform(ispdf=True)
                            repl = ref.refTitle()
                        else:
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'PDF title blacklisted : {0} ', ref.title))
                            repl = ref.refLink()
                    else:
                        repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Get the real url where we end (http redirects !)
                redir = f.data.url
                if redir != ref.link and \
                   domain.findall(redir) == domain.findall(link):
                    if soft404.search(redir) and \
                       not soft404.search(ref.link):
                        pywikibot.output(color_format(
                            '{lightyellow}WARNING{default} : '
                            'Redirect 404 : {0} ', ref.link))
                        continue
                    if dirIndex.match(redir) and \
                       not dirIndex.match(ref.link):
                        pywikibot.output(color_format(
                            u'{lightyellow}WARNING{default} : '
                            u'Redirect to root : {0} ', ref.link))
                        continue

                if f.status != requests.codes.ok:
                    pywikibot.output(u'HTTP error (%s) for %s on %s'
                                     % (f.status, ref.url,
                                        page.title(asLink=True)),
                                     toStdout=True)
                    # 410 Gone, indicates that the resource has been
                    # purposely removed
                    if f.status == 410 or \
                       (f.status == 404 and
                            (u'\t%s\t' % ref.url in deadLinks)):
                        repl = ref.refDead()
                        new_text = new_text.replace(match.group(), repl)
                    continue

                linkedpagetext = f.content
            except UnicodeError:
                # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                # in [[fr:Cyanure]]
                pywikibot.output(color_format(
                    '{lightred}Bad link{default} : {0} in {1}',
                    ref.url, page.title(asLink=True)))
                continue
            except (URLError,
                    socket.error,
                    IOError,
                    httplib.error) as e:
                pywikibot.output(u'Can\'t retrieve page %s : %s'
                                 % (ref.url, e))
                continue

            # remove <script>/<style>/comments/CDATA tags
            linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)

            meta_content = self.META_CONTENT.search(linkedpagetext)
            enc = []
            s = None
            if contentType:
                # use charset from http header
                s = self.CHARSET.search(contentType)
            if meta_content:
                tag = meta_content.group()
                # Prefer the contentType from the HTTP header :
                if not contentType:
                    contentType = tag
                if not s:
                    # use charset from html
                    s = self.CHARSET.search(str(tag))
            if s:
                tmp = s.group('enc').strip("\"' ").lower()
                naked = re.sub(r'[ _\-]', '', tmp)
                # Convert to python correct encoding names
                if naked == "gb2312":
                    enc.append("gbk")
                elif naked == "shiftjis":
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif naked == "xeucjp":
                    enc.append("euc-jp")
                else:
                    enc.append(tmp)
            else:
                pywikibot.output(u'No charset found for %s' % ref.link)
            if not contentType:
                pywikibot.output(u'No content-type found for %s' % ref.link)
                continue
            elif not self.MIME.search(contentType):
                pywikibot.output(color_format(
                    '{lightyellow}WARNING{default} : media : {0} ',
                    ref.link))
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                continue

            # Ugly hacks to try to survive when both server and page
            # return no encoding.
            # Uses most used encodings for each national suffix
            if u'.ru' in ref.link or u'.su' in ref.link:
                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                # encoding, no page encoding
                enc = enc + ['koi8-r', 'windows-1251']
            elif u'.jp' in ref.link:
                enc.append("shift jis 2004")
                enc.append("cp932")
            elif u'.kr' in ref.link:
                enc.append("euc-kr")
                enc.append("cp949")
            elif u'.zh' in ref.link:
                enc.append("gbk")

            if 'utf-8' not in enc:
                enc.append('utf-8')
            try:
                u = linkedpagetext.decode(enc[0])   # Bug T69410
            except (UnicodeDecodeError, LookupError) as e:
                pywikibot.output(u'%s : Decoding error - %s' % (ref.link, e))
                continue

            # Retrieves the first non empty string inside <title> tags
            for m in self.TITLE.finditer(u):
                t = m.group()
                if t:
                    ref.title = t
                    ref.transform()
                    if ref.title:
                        break

            if not ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : No title found...' % ref.link)
                continue

            # XXX Ugly hack
            if u'é' in ref.title:
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                continue

            if self.titleBlackList.match(ref.title):
                repl = ref.refLink()
                new_text = new_text.replace(match.group(), repl)
                pywikibot.output(color_format(
                    '{lightred}WARNING{default} {0} : '
                    'Blacklisted title ({1})', ref.link, ref.title))
                continue

            # Truncate long titles. 175 is arbitrary
            if len(ref.title) > 175:
                ref.title = ref.title[:175] + "..."

            repl = ref.refTitle()
            new_text = new_text.replace(match.group(), repl)

        # Add <references/> when needed, but ignore templates !
        if page.namespace != 10:
            if self.norefbot.lacksReferences(new_text):
                new_text = self.norefbot.addReferences(new_text)

        new_text = self.deduplicator.process(new_text)

        self.userPut(page, page.text, new_text, summary=self.msg,
                     ignore_save_related_errors=True,
                     ignore_server_errors=True)

        if new_text == page.text:
            continue
        else:
            editedpages += 1

        if self.getOption('limit') and editedpages >= self.getOption('limit'):
            pywikibot.output('Edited %s pages, stopping.'
                             % self.getOption('limit'))
            return

        if editedpages % 20 == 0:
            pywikibot.output(color_format(
                '{lightgreen}Checking stop page...{default}'))
            actualRev = self.stopPage.latest_revision_id
            if actualRev != self.stopPageRevId:
                pywikibot.output(
                    u'[[%s]] has been edited : Someone wants us to stop.'
                    % self.stopPage)
                return
def _match_xml_page_text(text):
    """Match page text."""
    text = textlib.removeDisabledParts(text)
    return _ref_regex.search(text) and not _references_regex.search(text)
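# Usage sketch (illustrative): _ref_regex and _references_regex are module
# globals in the original script; the definitions below are stand-ins with
# the same intent (detect <ref> tags and a <references /> list).
_ref_regex = re.compile('</ref>', re.IGNORECASE)
_references_regex = re.compile('<references.*?>', re.IGNORECASE)

print(bool(_match_xml_page_text('Text.<ref>Source</ref>')))                  # True expected
print(bool(_match_xml_page_text('Text.<ref>Source</ref>\n<references />')))  # False expected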
def linkedImages(page):
    """Return a list of image Pages that this Page links to.

    Only images referenced through "normal" internal links are returned.
    Category links are omitted unless prefixed with ":". Embedded templates
    are omitted (but links within them are returned). All interwiki and
    external links are omitted.

    @param page: the Page whose wikitext is examined
    @return: a list of Page objects.
    """
    Rlink = re.compile(r'\[\[(?P<title>[^\]\|\[]*)(\|[^\]]*)?\]\]')
    result = []
    try:
        thistxt = textlib.removeLanguageLinks(page.get(get_redirect=True),
                                              page.site)
    except pywikibot.NoPage:
        raise
    except pywikibot.IsRedirectPage:
        raise
    except pywikibot.SectionError:
        return []

    thistxt = textlib.removeCategoryLinks(thistxt, page.site)
    # remove HTML comments, pre, nowiki, and includeonly sections
    # from text before processing
    thistxt = textlib.removeDisabledParts(thistxt)
    # resolve {{ns:-1}} or {{ns:Help}}
    # thistxt = page.site.resolvemagicwords(thistxt)

    for match in Rlink.finditer(thistxt):
        try:
            # print(match.group(0))
            title = match.group('title')
            title = title.replace("_", " ").strip(" ")
            # print title
            if title == "":
                # empty link - problem in the page
                continue
            # convert relative link to absolute link
            if title.startswith(".."):
                parts = page.title().split('/')
                parts.pop()
                title = '/'.join(parts) + title[2:]
            elif title.startswith("/"):
                title = '%s/%s' % (page.title(), title[1:])
            if title.startswith("#"):
                # this is an internal section link
                continue
            if not page.site.isInterwikiLink(title):
                page2 = pywikibot.Page(page.site, title)
                try:
                    hash(str(page2))
                except Exception:
                    pywikibot.output(
                        "Page %s contains invalid link to [[%s]]."
                        % (page.title(), title))
                    continue
                if not page2.isImage():
                    continue
                if page2.title(withSection=False) and page2 not in result:
                    result.append(page2)
        except pywikibot.NoUsername:
            continue
        except:
            raise
    return result
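# Usage sketch (illustrative, needs network access and a configured user):
site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, 'Example')
for image in linkedImages(page):
    pywikibot.output(image.title())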
def standardizePageFooter(self, text):
    """
    Standardize page footer.

    Makes sure that interwiki links, categories and star templates are
    put to the correct position and into the right order. This combines
    the old instances standardizeInterwiki and standardizeCategories
    The page footer has the following section in that sequence:
    1. categories
    2. ## TODO: template beyond categories ##
    3. additional information depending on local site policy
    4. stars templates for featured and good articles
    5. interwiki links
    """
    starsList = [
        u'bueno',
        u'bom interwiki',
        u'cyswllt[ _]erthygl[ _]ddethol', u'dolen[ _]ed',
        u'destacado', u'destaca[tu]',
        u'enllaç[ _]ad',
        u'enllaz[ _]ad',
        u'leam[ _]vdc',
        u'legătură[ _]a[bcf]',
        u'liamm[ _]pub',
        u'lien[ _]adq',
        u'lien[ _]ba',
        u'liên[ _]kết[ _]bài[ _]chất[ _]lượng[ _]tốt',
        u'liên[ _]kết[ _]chọn[ _]lọc',
        u'ligam[ _]adq',
        u'ligazón[ _]a[bd]',
        u'ligoelstara',
        u'ligoleginda',
        u'link[ _][afgu]a', u'link[ _]adq', u'link[ _]f[lm]',
        u'link[ _]km', u'link[ _]sm', u'linkfa',
        u'na[ _]lotura',
        u'nasc[ _]ar',
        u'tengill[ _][úg]g',
        u'ua',
        u'yüm yg',
        u'רא',
        u'وصلة مقالة جيدة',
        u'وصلة مقالة مختارة',
    ]

    categories = None
    interwikiLinks = None
    allstars = []

    # The PyWikipediaBot is no longer allowed to touch categories on the
    # German Wikipedia. See
    # https://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/1#Position_der_Personendaten_am_.22Artikelende.22
    # ignoring nn-wiki of cause of the comment line above iw section
    if not self.template and '{{Personendaten' not in text and \
       '{{SORTIERUNG' not in text and '{{DEFAULTSORT' not in text and \
       self.site.code not in ('et', 'it', 'bg', 'ru'):
        categories = textlib.getCategoryLinks(text, site=self.site)

    if not self.talkpage:  # and pywikibot.calledModuleName() <> 'interwiki':
        subpage = False
        if self.template:
            loc = None
            try:
                tmpl, loc = moved_links[self.site.code]
                del tmpl
            except KeyError:
                pass
            if loc is not None and loc in self.title:
                subpage = True
        interwikiLinks = textlib.getLanguageLinks(
            text, insite=self.site, template_subpage=subpage)

        # Removing the interwiki
        text = textlib.removeLanguageLinks(text, site=self.site)
        # Removing the stars' issue
        starstext = textlib.removeDisabledParts(text)
        for star in starsList:
            regex = re.compile(r'(\{\{(?:template:|)%s\|.*?\}\}[\s]*)'
                               % star, re.I)
            found = regex.findall(starstext)
            if found != []:
                text = regex.sub('', text)
                allstars += found

    # Adding categories
    if categories:
        # TODO: Sorting categories in alphabetic order.
        # e.g. using categories.sort()
        # TODO: Taking main cats to top
        # for name in categories:
        #     if re.search(u"(.+?)\|(.{,1}?)", name.title()) or name.title() == name.title().split(":")[0] + title:
        #         categories.remove(name)
        #         categories.insert(0, name)
        text = textlib.replaceCategoryLinks(text, categories,
                                            site=self.site)
    # Adding stars templates
    if allstars:
        text = text.strip() + self.site.family.interwiki_text_separator
        allstars.sort()
        for element in allstars:
            text += '%s%s' % (element.strip(), config.line_separator)
            pywikibot.log(u'%s' % element.strip())
    # Adding the interwiki
    if interwikiLinks:
        text = textlib.replaceLanguageLinks(text, interwikiLinks,
                                            site=self.site,
                                            template=self.template,
                                            template_subpage=subpage)
    return text