def buildDescription(flinfoDescription=u'', flickrreview=False, reviewer=u'',
                     override=u'', addCategory=u'', removeCategories=False):
    """Build the final description for the image.

    The description is based on the info from flickrinfo and improved.
    """
    description = u'== {{int:filedesc}} ==\n%s' % flinfoDescription

    if removeCategories:
        description = textlib.removeCategoryLinks(
            description, pywikibot.Site('commons', 'commons'))

    if override:
        # Strip the Flickr license boilerplate and put the override text
        # directly below the license header instead.
        for boilerplate in (u'{{cc-by-sa-2.0}}\n',
                            u'{{cc-by-2.0}}\n',
                            u'{{flickrreview}}\n',
                            '{{copyvio|Flickr, licensed as "All Rights '
                            'Reserved" which is not a free license '
                            '--~~~~}}\n'):
            description = description.replace(boilerplate, '')
        description = description.replace(
            u'=={{int:license}}==',
            u'=={{int:license}}==\n' + override)
    elif flickrreview and reviewer:
        # Mark the image as reviewed by the given reviewer today.
        description = description.replace(
            '{{flickrreview}}',
            '{{flickrreview|' + reviewer
            + '|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-'
              '{{subst:CURRENTDAY2}}}}')

    if addCategory:
        description = description.replace(u'{{subst:unc}}\n', u'')
        description += u'\n[[Category:' + addCategory + ']]\n'

    return description.replace(u'\r\n', u'\n')
def getWordCount(self, text):
    """Return the number of words in the cleaned-up wikitext.

    Disabled parts, HTML, language links and category links are stripped
    before counting.
    """
    for cleaner in (textlib.removeDisabledParts,
                    textlib.removeHTMLParts,
                    textlib.removeLanguageLinks,
                    textlib.removeCategoryLinks):
        text = cleaner(text)
    return len(re.findall(r"[\w']+", text))
def saveImagePage(imagepage, newcats):
    """Remove the old categories and add the new categories to the image."""
    updated = textlib.removeCategoryLinks(imagepage.text, imagepage.site)
    updated += '\n'
    updated += ''.join('[[Category:' + cat + ']]\n' for cat in newcats)
    # Show the pending change before saving it.
    pywikibot.showDiff(imagepage.text, updated)
    imagepage.text = updated
    imagepage.save('Filtering categories')
def getNewFieldsFromFreetext(self, imagepage):
    """Extract fields from free text for the new information template."""
    text = imagepage.get()
    lang = imagepage.site.lang
    # Strip known boilerplate and license templates before converting
    # the remaining free text into a description.
    for garbage_pattern in sourceGarbage[lang]:
        text = re.sub(garbage_pattern, '', text, flags=re.IGNORECASE)
    for license_pattern, _ in licenseTemplates[lang]:
        text = re.sub(license_pattern, '', text, flags=re.IGNORECASE)
    text = removeCategoryLinks(text, imagepage.site())
    return (self.convertLinks(text.strip(), imagepage.site()),
            self.getUploadDate(imagepage),
            self.getSource(imagepage),
            self.getAuthorText(imagepage))
def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
    """Remove the old categories and add the new categories to the image."""
    newtext = textlib.removeCategoryLinks(imagepage.text, imagepage.site)
    if onlyFilter:
        comment = u'Filtering categories'
    else:
        # Full categorization run: drop old templates and request a
        # category check before re-adding the categories.
        newtext = removeTemplates(newtext)
        newtext += getCheckCategoriesTemplate(usage, galleries, len(newcats))
        comment = (u'Image is categorized by a bot using data from '
                   u'[[Commons:Tools#CommonSense|CommonSense]]')
    newtext += u'\n'
    for cat in newcats:
        newtext += u'[[Category:' + cat + u']]\n'
    pywikibot.showDiff(imagepage.text, newtext)
    imagepage.text = newtext
    imagepage.save(comment)
def buildDescription(flinfoDescription='', flickrreview=False, reviewer='',
                     override='', addCategory='', removeCategories=False):
    """Build the final description for the image.

    The description is based on the info from flickrinfo and improved.

    @param flinfoDescription: the description as generated by flinfo
    @param flickrreview: add the flickrreview template to the description
    @param reviewer: the reviewer name for the flickrreview template
    @param override: text to put below the license header instead of the
        Flickr license boilerplate
    @param addCategory: category to add to the description
    @param removeCategories: remove all category links from the description
    @return: the improved description
    """
    description = flinfoDescription

    # Use template {{Taken on}} for the date.
    # Bug fix: re.search() returns None when the description has no
    # '|Date=' field; the previous unconditional .group(1) raised
    # AttributeError in that case.
    match = re.search(r'\|Date=(.*)\n', description)
    if match and match.group(1):
        datetaken = '{{Taken on|%s}}' % match.group(1)
        description = re.sub(r'\|Date=.*\n',
                             '|Date={}\n'.format(datetaken), description)

    if removeCategories:
        description = textlib.removeCategoryLinks(
            description, pywikibot.Site('commons', 'commons'))

    if override:
        # Strip the Flickr license boilerplate and put the override
        # below the license header instead.
        description = description.replace('{{cc-by-sa-2.0}}\n', '')
        description = description.replace('{{cc-by-2.0}}\n', '')
        description = description.replace('{{flickrreview}}\n', '')
        description = description.replace(
            '{{copyvio|Flickr, licensed as "All Rights Reserved" which is not '
            'a free license --~~~~}}\n', '')
        description = description.replace('=={{int:license}}==',
                                          '=={{int:license}}==\n' + override)
    elif flickrreview and reviewer:
        # Mark the image as reviewed by the given reviewer today.
        description = description.replace(
            '{{flickrreview}}',
            '{{flickrreview|%s|'
            '{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-'
            '{{subst:CURRENTDAY2}}}}' % reviewer)

    if '{{subst:unc}}' not in description:
        # Request category check
        description += '\n{{subst:chc}}\n'
    if addCategory:
        description = description.replace('{{subst:unc}}\n', '')
        description += '\n[[Category:{}]]\n'.format(addCategory)
    return description.replace('\r\n', '\n')
def harvestSortKey(self, match):
    """Move a unanimous category sort key into {{DEFAULTSORT}}."""
    text = match.group()
    if self.defaultsortR.search(text):
        # A DEFAULTSORT is already present; leave the text untouched.
        return text

    categories = textlib.getCategoryLinks(text, site=self.site)
    people_categories = ('Muži', 'Žijící lidé', 'Ženy')
    if not any(cat.title(with_ns=False) in people_categories
               for cat in categories):
        return text

    # Tally every non-blank, tidied sort key in use.
    tally = {}
    for cat in categories:
        raw_key = cat.sortKey
        if not raw_key:
            continue
        tidied = self.tidy_sortkey(raw_key)
        if tidied.strip():
            tally[tidied] = tally.get(tidied, 0.0) + 1

    # Harvest only when exactly one distinct key is used at least
    # four times.
    if len(tally) != 1 or sum(tally.values()) < 4:
        return text

    (winner,) = tally
    for cat in categories:
        if cat.sortKey is not None and self.tidy_sortkey(cat.sortKey) == winner:
            cat.sortKey = None
    categories.sort(key=self.sort_category)

    stripped = textlib.removeCategoryLinks(text, self.site)
    stripped += '\n\n{{DEFAULTSORT:%s}}' % winner
    rebuilt = textlib.replaceCategoryLinks(stripped, categories, self.site)
    before, _, after = rebuilt.rpartition('\n\n')  # fixme: safer
    return before + '\n' + after
def buildDescription(flinfoDescription=u'', flickrreview=False, reviewer=u'',
                     override=u'', addCategory=u'', removeCategories=False):
    """Build the final description for the image.

    The description is based on the info from flickrinfo and improved.
    """
    description = u'%s' % flinfoDescription

    if removeCategories:
        description = textlib.removeCategoryLinks(
            description, pywikibot.Site('commons', 'commons'))

    if override:
        # Strip the Flickr license boilerplate and put the override text
        # directly below the license header instead.
        for boilerplate in (u'{{cc-by-sa-2.0}}\n',
                            u'{{cc-by-2.0}}\n',
                            u'{{flickrreview}}\n',
                            '{{copyvio|Flickr, licensed as "All Rights '
                            'Reserved" which is not a free license '
                            '--~~~~}}\n'):
            description = description.replace(boilerplate, '')
        description = description.replace(
            u'=={{int:license}}==',
            u'=={{int:license}}==\n' + override)
    elif flickrreview and reviewer:
        description = description.replace(
            '{{flickrreview}}',
            '{{flickrreview|' + reviewer
            + '|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-'
              '{{subst:CURRENTDAY2}}}}')

    # Unconditionally drop the "uncategorized" marker and tag the upload
    # with the Wiki Loves Monuments 2017 photowalk templates (the
    # addCategory parameter is intentionally not used in this variant).
    description = description.replace(u'{{subst:unc}}\n', u'')
    description += u'{{Wiki Loves Monuments 2017|xx}}\n'
    description += (u'[[Category:Images from Wiki Loves Monuments 2017 '
                    u'Flickr photowalks]]\n')
    description += (u'[[Category:Images from Wiki Loves Monuments 2017 '
                    u'Flickr photowalks to check]]\n')
    return description.replace(u'\r\n', u'\n')
def add_text(page, addText, summary=None, regexSkip=None,
             regexSkipUrl=None, always=False, up=False, putText=True,
             oldTextGiven=None, reorderEnabled=True, create=False):
    """
    Add text to a page.

    @param page: the page to edit
    @param addText: the text to add (literal '\\n' sequences are
        converted to real newlines)
    @param summary: edit summary; if None a default i18n summary built
        from addText is used
    @param regexSkip: skip the page if this regex matches the page text
    @param regexSkipUrl: skip the page if this regex matches the raw
        page HTML
    @param always: save without asking for confirmation
    @param up: if True, prepend the text; otherwise append it
    @param putText: if True save the page, otherwise only return the
        old and new text
    @param oldTextGiven: use this text instead of fetching the page
    @param reorderEnabled: keep categories/interwiki below the added text
    @param create: create the page if it does not exist
    @rtype: tuple of (text, newtext, always)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    # When a page is tagged as "really well written" it has a star in the
    # interwiki links. This is a list of all the templates used (in regex
    # format) to make the stars appear.
    # NOTE(review): the list itself (starsList) is defined elsewhere in
    # the module; this comment refers to it.

    # Counts consecutive server errors for the retry logic below.
    errorCount = 0

    if putText:
        pywikibot.output(u'Loading %s...' % page.title())
    if oldTextGiven is None:
        try:
            text = page.get()
        except pywikibot.NoPage:
            if create:
                pywikibot.output(u"%s doesn't exist, creating it!"
                                 % page.title())
                text = u''
            else:
                pywikibot.output(u"%s doesn't exist, skip!" % page.title())
                return (False, False, always)
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"%s is a redirect, skip!" % page.title())
            return (False, False, always)
    else:
        text = oldTextGiven
    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -exceptUrl '
                             'is in the page. Skip!\n'
                             'Match was: %s' % result)
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -except '
                             'is in the page. Skip!\n'
                             'Match was: %s' % result)
            return (False, False, always)
    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', config.line_separator)
        if (reorderEnabled):
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)
            # Adding the text
            newtext += u"%s%s" % (config.line_separator, addText)
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext,
                                                   categoriesInside, site,
                                                   True)
            # Dealing the stars' issue
            allstars = []
            starstext = textlib.removeDisabledParts(text)
            for star in starsList:
                regex = re.compile(
                    '(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I)
                found = regex.findall(starstext)
                if found != []:
                    # Move the star templates to the very bottom, just
                    # above the interwiki links.
                    newtext = regex.sub('', newtext)
                    allstars += found
            if allstars != []:
                newtext = newtext.strip() + config.line_separator * 2
                allstars.sort()
                for element in allstars:
                    newtext += '%s%s' % (element.strip(), config.LS)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += u"%s%s" % (config.line_separator, addText)
    else:
        newtext = addText + config.line_separator + text
    if putText and text != newtext:
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(text, newtext)
    # Let's put the changes.
    while True:
        # If someone load it as module, maybe it's not so useful to put the
        # text in the page
        if putText:
            if not always:
                choice = pywikibot.input_choice(
                    u'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')], 'n', automatic_quit=False)
                if choice == 'a':
                    always = True
                elif choice == 'n':
                    return (False, False, always)
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            # If always was True on entry, `choice` is undefined here;
            # the `or` short-circuits so it is never evaluated.
            if always or choice == 'y':
                try:
                    if always:
                        # namespace 3 is User talk: edits there are not
                        # marked minor so the user gets notified.
                        page.put(newtext, summary,
                                 minorEdit=page.namespace() != 3)
                    else:
                        page.put_async(newtext, summary,
                                       minorEdit=page.namespace() != 3)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Edit conflict! skip!')
                    return (False, False, always)
                except pywikibot.ServerError:
                    # Retry up to config.max_retries consecutive
                    # server errors before giving up.
                    errorCount += 1
                    if errorCount < config.max_retries:
                        pywikibot.output(u'Server Error! Wait..')
                        time.sleep(config.retry_wait)
                        continue
                    else:
                        raise pywikibot.ServerError(u'Fifth Server Error!')
                except pywikibot.SpamfilterError as e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s'
                        % (page.title(), e.url))
                    return (False, False, always)
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)'
                                     % page.title())
                    return (False, False, always)
                except pywikibot.PageNotSaved as error:
                    pywikibot.output(u'Error putting page: %s'
                                     % error.args)
                    return (False, False, always)
                else:
                    # Break only if the errors are one after the other...
                    errorCount = 0
                    return (True, True, always)
        else:
            return (text, newtext, always)
def add_text(
    page,
    addText: str,
    summary: Optional[str] = None,
    regexSkip: Optional[str] = None,
    regexSkipUrl: Optional[str] = None,
    always: bool = False,
    up: bool = False,
    putText: bool = True,
    oldTextGiven: Optional[str] = None,
    reorderEnabled: bool = True,
    create: bool = False
) -> Union[Tuple[bool, bool, bool], Tuple[str, str, bool]]:
    """
    Add text to a page.

    @param page: The page to add text to
    @type page: pywikibot.page.BasePage
    @param addText: Text to add
    @param summary: Summary of changes. If None, beginning of addText is used.
    @param regexSkip: Abort if text on page matches
    @param regexSkipUrl: Abort if full url matches
    @param always: Always add text without user confirmation
    @param up: If True, add text to top of page, else add at bottom.
    @param putText: If True, save changes to the page, else return
        (_, newtext, _)
    @param oldTextGiven: If None fetch page text, else use this text
    @param reorderEnabled: If True place text above categories and
        interwiki, else place at page bottom. No effect if up = False.
    @param create: Create page if it does not exist
    @return: If putText=True: (success, success, always)
        else: (_, newtext, _)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    if putText:
        pywikibot.output('Loading {}...'.format(page.title()))

    # get_text handles the NoPage/IsRedirectPage cases and returns None
    # when the page should be skipped.
    text = get_text(page, oldTextGiven, create)
    if text is None:
        return (False, False, always)

    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -exceptUrl '
                             'is in the page. Skip!\n'
                             'Match was: {}'.format(result))
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -except '
                             'is in the page. Skip!\n'
                             'Match was: {}'.format(result))
            return (False, False, always)

    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', '\n')
        if reorderEnabled:
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)
            # Adding the text
            newtext += '\n' + addText
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext, categoriesInside,
                                                   site, True)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += '\n' + addText
    else:
        newtext = addText + '\n' + text

    if not putText:
        # If someone load it as module, maybe it's not so useful to put the
        # text in the page
        return (text, newtext, always)

    if text != newtext:
        pywikibot.output(
            color_format('\n\n>>> {lightpurple}{0}{default} <<<',
                         page.title()))
        pywikibot.showDiff(text, newtext)

    # Let's put the changes.
    error_count = 0
    while True:
        if not always:
            try:
                choice = pywikibot.input_choice(
                    'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')], 'n')
            except QuitKeyboardInterrupt:
                sys.exit('User quit bot run.')
            if choice == 'a':
                always = True
            elif choice == 'n':
                return (False, False, always)
            elif choice == 'b':
                pywikibot.bot.open_webbrowser(page)
                continue

        # either always or choice == 'y' is selected
        # NOTE(review): put_text presumably returns None on a retriable
        # failure (hence the loop and error_count) — verify against the
        # put_text helper definition.
        result = put_text(page, newtext, summary, error_count,
                          asynchronous=not always)
        if result is not None:
            return (result, result, always)
        error_count += 1
def remove_cats_and_comments(self, text):
    """Remove categories, comments and trailing spaces from wikitext."""
    without_cats = textlib.removeCategoryLinks(text, site=self.site)
    without_comments = textlib.removeDisabledParts(without_cats,
                                                   tags=['comments'])
    return without_comments.strip()
def add_text(page, addText, summary=None, regexSkip=None,
             regexSkipUrl=None, always=False, up=False, putText=True,
             oldTextGiven=None, reorderEnabled=True, create=False):
    """
    Add text to a page.

    @param page: the page to edit
    @param addText: the text to add (literal '\\n' sequences are
        converted to config.line_separator)
    @param summary: edit summary; if None a default i18n summary built
        from addText is used
    @param regexSkip: skip the page if this regex matches the page text
    @param regexSkipUrl: skip the page if this regex matches the raw
        page HTML
    @param always: save without asking for confirmation
    @param up: if True, prepend the text; otherwise append it
    @param putText: if True save the page, otherwise only return the
        old and new text
    @param oldTextGiven: use this text instead of fetching the page
    @param reorderEnabled: keep categories/interwiki below the added text
    @param create: create the page if it does not exist
    @rtype: tuple of (text, newtext, always)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    if putText:
        pywikibot.output('Loading {}...'.format(page.title()))

    # get_text handles the NoPage/IsRedirectPage cases and returns None
    # when the page should be skipped.
    text = get_text(page, oldTextGiven, create)
    if text is None:
        return (False, False, always)

    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -exceptUrl '
                             'is in the page. Skip!\n'
                             'Match was: {}'.format(result))
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -except '
                             'is in the page. Skip!\n'
                             'Match was: {}'.format(result))
            return (False, False, always)

    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', config.line_separator)
        if (reorderEnabled):
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)
            # Adding the text
            newtext += '{}{}'.format(config.line_separator, addText)
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext, categoriesInside,
                                                   site, True)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += '{}{}'.format(config.line_separator, addText)
    else:
        newtext = addText + config.line_separator + text

    if putText and text != newtext:
        pywikibot.output(
            color_format('\n\n>>> {lightpurple}{0}{default} <<<',
                         page.title()))
        pywikibot.showDiff(text, newtext)

    # Let's put the changes.
    error_count = 0
    while True:
        # If someone load it as module, maybe it's not so useful to put the
        # text in the page
        if not putText:
            return (text, newtext, always)
        if not always:
            choice = None
            try:
                choice = pywikibot.input_choice(
                    'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')], 'n')
            except QuitKeyboardInterrupt:
                sys.exit('User quit bot run.')
            if choice == 'a':
                always = True
            elif choice == 'n':
                return (False, False, always)
            elif choice == 'b':
                pywikibot.bot.open_webbrowser(page)
        # If always was True on entry, `choice` is undefined here; the
        # `or` short-circuits so it is never evaluated.
        if always or choice == 'y':
            # NOTE(review): put_text presumably returns None on a
            # retriable failure — verify against the put_text helper.
            result = put_text(page, newtext, summary, error_count,
                              asynchronous=not always)
            if result is not None:
                return (result, result, always)
        # NOTE(review): error_count is also incremented when the user
        # chose 'b' (no save attempt happened) — possibly unintended.
        error_count += 1
def add_text(page, addText, summary=None, regexSkip=None,
             regexSkipUrl=None, always=False, up=False, putText=True,
             oldTextGiven=None, reorderEnabled=True, create=False):
    """
    Add text to a page.

    @param page: the page to edit
    @param addText: the text to add (literal '\\n' sequences are
        converted to config.line_separator)
    @param summary: edit summary; if None a default i18n summary built
        from addText is used
    @param regexSkip: skip the page if this regex matches the page text
    @param regexSkipUrl: skip the page if this regex matches the raw
        page HTML
    @param always: save without asking for confirmation
    @param up: if True, prepend the text; otherwise append it
    @param putText: if True save the page, otherwise only return the
        old and new text
    @param oldTextGiven: use this text instead of fetching the page
    @param reorderEnabled: keep categories/interwiki below the added text
    @param create: create the page if it does not exist
    @rtype: tuple of (text, newtext, always)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    # When a page is tagged as "really well written" it has a star in the
    # interwiki links. This is a list of all the templates used (in regex
    # format) to make the stars appear.
    # NOTE(review): the list itself (starsList) is defined elsewhere in
    # the module; this comment refers to it.

    # Counts consecutive server errors for the retry logic below.
    errorCount = 0

    if putText:
        pywikibot.output(u'Loading %s...' % page.title())
    if oldTextGiven is None:
        try:
            text = page.get()
        except pywikibot.NoPage:
            if create:
                pywikibot.output(u"%s doesn't exist, creating it!"
                                 % page.title())
                text = u''
            else:
                pywikibot.output(u"%s doesn't exist, skip!" % page.title())
                return (False, False, always)
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"%s is a redirect, skip!" % page.title())
            return (False, False, always)
    else:
        text = oldTextGiven
    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output(
                'Exception! regex (or word) used with -exceptUrl '
                'is in the page. Skip!\n'
                'Match was: %s' % result)
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output(
                'Exception! regex (or word) used with -except '
                'is in the page. Skip!\n'
                'Match was: %s' % result)
            return (False, False, always)
    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', config.line_separator)
        if (reorderEnabled):
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)
            # Adding the text
            newtext += u"%s%s" % (config.line_separator, addText)
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext, categoriesInside,
                                                   site, True)
            # Dealing the stars' issue
            allstars = []
            starstext = textlib.removeDisabledParts(text)
            for star in starsList:
                regex = re.compile('(\{\{(?:template:|)%s\|.*?\}\}[\s]*)'
                                   % star, re.I)
                found = regex.findall(starstext)
                if found != []:
                    # Move the star templates to the very bottom, just
                    # above the interwiki links.
                    newtext = regex.sub('', newtext)
                    allstars += found
            if allstars != []:
                newtext = newtext.strip() + config.line_separator * 2
                allstars.sort()
                for element in allstars:
                    newtext += '%s%s' % (element.strip(), config.LS)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += u"%s%s" % (config.line_separator, addText)
    else:
        newtext = addText + config.line_separator + text
    if putText and text != newtext:
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(text, newtext)
    # Let's put the changes.
    while True:
        # If someone load it as module, maybe it's not so useful to put the
        # text in the page
        if putText:
            if not always:
                choice = pywikibot.input_choice(
                    u'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')], 'n', automatic_quit=False)
                if choice == 'a':
                    always = True
                elif choice == 'n':
                    return (False, False, always)
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            # If always was True on entry, `choice` is undefined here;
            # the `or` short-circuits so it is never evaluated.
            if always or choice == 'y':
                try:
                    if always:
                        # namespace 3 is User talk: edits there are not
                        # marked minor so the user gets notified.
                        page.put(newtext, summary,
                                 minorEdit=page.namespace() != 3)
                    else:
                        page.put_async(newtext, summary,
                                       minorEdit=page.namespace() != 3)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Edit conflict! skip!')
                    return (False, False, always)
                except pywikibot.ServerError:
                    # Retry up to config.max_retries consecutive
                    # server errors before giving up.
                    errorCount += 1
                    if errorCount < config.max_retries:
                        pywikibot.output(u'Server Error! Wait..')
                        time.sleep(config.retry_wait)
                        continue
                    else:
                        raise pywikibot.ServerError(u'Fifth Server Error!')
                except pywikibot.SpamfilterError as e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s'
                        % (page.title(), e.url))
                    return (False, False, always)
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)'
                                     % page.title())
                    return (False, False, always)
                except pywikibot.PageNotSaved as error:
                    pywikibot.output(u'Error putting page: %s'
                                     % error.args)
                    return (False, False, always)
                else:
                    # Break only if the errors are one after the other...
                    errorCount = 0
                    return (True, True, always)
        else:
            return (text, newtext, always)
def linkedImages(page):
    """Return a list of image (file) Pages that this Page links to.

    Only "normal" internal links of the form [[...]] are considered.
    Language links, category links and links inside disabled parts
    (HTML comments, pre, nowiki, includeonly) are stripped beforehand,
    and only targets that are image pages are kept (no duplicates).

    @param page: the page whose text is scanned for image links
    @return: a list of Page objects
    """
    Rlink = re.compile(r'\[\[(?P<title>[^\]\|\[]*)(\|[^\]]*)?\]\]')
    result = []
    try:
        thistxt = textlib.removeLanguageLinks(page.get(get_redirect=True),
                                              page.site)
    except pywikibot.SectionError:
        # The section the page links point into does not exist.
        return []
    thistxt = textlib.removeCategoryLinks(thistxt, page.site)
    # Remove HTML comments, pre, nowiki, and includeonly sections
    # from text before processing.
    thistxt = textlib.removeDisabledParts(thistxt)
    for match in Rlink.finditer(thistxt):
        try:
            title = match.group('title').replace('_', ' ').strip(' ')
            if title == '':
                # Empty link - problem in the page.
                continue
            # Convert a relative link into an absolute link.
            if title.startswith('..'):
                # Bug fix: this previously read self.title(), but there
                # is no 'self' in this module-level function - the
                # parent of the current page is meant.
                parts = page.title().split('/')
                parts.pop()
                title = '/'.join(parts) + title[2:]
            elif title.startswith('/'):
                title = '%s/%s' % (page.title(), title[1:])
            if title.startswith('#'):
                # This is an internal section link.
                continue
            if page.site.isInterwikiLink(title):
                continue
            page2 = pywikibot.Page(page.site, title)
            try:
                # An invalid title raises when the Page is stringified.
                hash(str(page2))
            except Exception:
                pywikibot.output('Page %s contains invalid link to [[%s]].'
                                 % (page.title(), title))
                continue
            if not page2.isImage():
                continue
            if page2.title(withSection=False) and page2 not in result:
                result.append(page2)
        except pywikibot.NoUsername:
            # Resolving the link required a login we do not have; skip it.
            continue
    return result