def load_page(self):
    """Load the page to be archived and break it up into threads."""
    self.header = ''
    self.threads = []
    self.archives = {}
    self.archived_threads = 0

    # Exclude non-thread headings
    text = self.get()
    marker = findmarker(text)
    text = re.sub(r'^===', marker + r'===', text, flags=re.M)

    # Find threads, avoid archiving categories or interwiki
    header, threads, footer = extract_sections(text, self.site)
    header = header.replace(marker, '')
    if header and footer:
        self.header = '\n\n'.join((header.rstrip(), footer, ''))
    else:
        self.header = header + footer

    for thread_heading, thread_content in threads:
        cur_thread = DiscussionThread(thread_heading.strip('= '),
                                      self.now, self.timestripper)
        lines = thread_content.replace(marker, '').splitlines()
        lines = lines[1:]  # remove heading line
        for line in lines:
            cur_thread.feed_line(line)
        self.threads.append(cur_thread)

    # This extra info is not desirable when run under the unittest
    # framework, which may be run either directly or via setup.py
    if pywikibot.calledModuleName() not in ['archivebot_tests', 'setup']:
        pywikibot.output('%d Threads found on %s'
                         % (len(self.threads), self))
def load_page(self) -> None:
    """Load the page to be archived and break it up into threads."""
    self.header = ''
    self.threads = []
    self.archives = {}
    self.archived_threads = 0

    # Exclude unsupported headings (h1, h3, etc.):
    # adding the marker will make them ignored by extract_sections()
    text = self.get()
    marker = findmarker(text)
    text = re.sub(r'^((=|={3,})[^=])', marker + r'\1', text, flags=re.M)

    # Find threads, avoid archiving categories or interwiki
    header, threads, footer = extract_sections(text, self.site)
    header = header.replace(marker, '')
    if header and footer:
        self.header = '\n\n'.join((header.rstrip(), footer, ''))
    else:
        self.header = header + footer

    for thread_heading, thread_content in threads:
        cur_thread = DiscussionThread(thread_heading.strip('= '),
                                      self.timestripper)
        # remove heading line
        _, *lines = thread_content.replace(marker, '').splitlines()
        for line in lines:
            cur_thread.feed_line(line)
        self.threads.append(cur_thread)

    # This extra info is not desirable when run under the unittest
    # framework, which may be run either directly or via setup.py
    if pywikibot.calledModuleName() not in ['archivebot_tests', 'setup']:
        pywikibot.output('{} thread(s) found on {}'.format(
            len(self.threads), self))
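# A minimal standalone sketch of the heading-masking step used by
# load_page() above. The '@@' marker is a hypothetical stand-in for the
# value returned by findmarker(); any string that cannot occur in the
# page text would do. Prefixing level-1 and level-3+ headings with the
# marker makes extract_sections() treat only the level-2 thread
# headings as section starts.
import re

marker = '@@'
text = ('= h1 heading =\n'
        '== thread one ==\n'
        'first comment\n'
        '=== h3 subsection ===\n'
        'more text\n')
masked = re.sub(r'^((=|={3,})[^=])', marker + r'\1', text, flags=re.M)
print(masked)
# @@= h1 heading =
# == thread one ==
# first comment
# @@=== h3 subsection ===
# more text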
def removeEmptySections(self, text: str) -> str:
    """Cleanup empty sections."""
    # userspace contains article stubs without nobots/in use templates
    if self.namespace == 2:
        return text

    skippings = ['comment', 'category']
    skip_regexes = _get_regexes(skippings, self.site)
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(
        stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    # comments, categories, and interwikis
    skippings = ['comment', 'category', 'interwiki']
    skip_regexes = _get_regexes(skippings, self.site)
    # we want only interwikis, not interlanguage links
    skip_regexes[1] = re.compile(
        skip_regexes[1].pattern.replace(':?', ''))
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = text
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(
        stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    # userspace contains article stubs without nobots/in use templates
    if self.namespace == 2:
        return text

    skippings = ['comment', 'category']
    skip_regexes = _get_regexes(skippings, self.site)
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(
        stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    # comments, categories, and interwikis
    skippings = ['comment', 'category', 'interwiki']
    skip_regexes = _get_regexes(skippings, self.site)
    # we want only interwikis, not interlanguage links
    skip_regexes[1] = re.compile(skip_regexes[1].pattern.replace(':?', ''))
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = text
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
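# A small self-contained illustration (not part of the bot) of the
# depth test shared by the removeEmptySections() variants above: a
# heading's depth is its number of leading '=' characters, and an
# empty section survives only when the next heading is deeper, i.e.
# when it is a parent whose content lives in its subsections. The
# sample sections below are made up for the demonstration.
def heading_depth(heading):
    return len(heading) - len(heading.lstrip('='))

sections = [
    ('== Parent ==', '\n'),              # empty, but has a deeper child
    ('=== Child ===', '\nsome text\n'),  # non-empty: kept
    ('== Empty ==', '\n'),               # empty, no child: dropped
]
new_body = []
for i, (heading, content) in enumerate(sections):
    next_heading = sections[i + 1][0] if i + 1 < len(sections) else ''
    if content.strip() or heading_depth(heading) < heading_depth(next_heading):
        new_body.append(heading + content)
print(''.join(new_body))
# == Parent ==
# === Child ===
# some text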
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: str
    """
    local_args = pywikibot.handle_args(args)
    # default values for options
    extra_summary = None

    for arg in local_args:
        option, sep, value = arg.partition(':')
        if option == '-summary':
            extra_summary = value
        else:
            pywikibot.warning("Unrecognized option {}".format(option))

    def check_option(option, value):
        if not value:
            pywikibot.error(
                "Missing argument for option '{}'".format(option))
            return False
        return True

    page_number_regex = re.compile(r'\|([1-9][0-9]*)\}')
    filename_number_regex = re.compile(r'([1-9][0-9]*)')
    templates_ready = ['QC image', 'File information', 'Self']
    site = pywikibot.Site()
    looked_at = set()
    pages = request_pages()
    for p in pages:
        if p['title'] in looked_at:
            pywikibot.output("Done.")
            break
        else:
            looked_at.add(p['title'])
        try:
            page_title = 'File:' + p['title']
            page = pywikibot.Page(site, page_title)
            click_url = ROOT_URL + 'wiki/' + page.title(underscore=True)
            pywikibot.output("Page '{0}', id={1} | {2}".format(
                page_title, p['id'], click_url))
            ts = page.templatesWithParams()
            if len(ts) > 0:
                found_ready = False
                for t in ts:
                    for r in templates_ready:
                        if r in t[0].title():
                            pywikibot.output(color_format(
                                "Page {lightgreen}{0}{default} "
                                "has template: {1}", page_title, t[0]))
                            found_ready = True
                            break
                if found_ready:
                    pywikibot.output("\tSkipping.")
                    continue
            old_text = page.get()
            # categories = getCategoryLinks(old_text, site)
            # categories_text = '\n'.join(
            #     map(lambda c: c.aslink(), categories))
            header, body, footer = extract_sections(old_text, site)
            summary = None
            licensing = None
            description = None
            for section in body:
                if 'ummary' in section[0] or 'escription' in section[0]:
                    summary = section[1]
                if 'icens' in section[0]:
                    licensing = section[1]
            got_summary_from_header = False
            if summary is None:
                got_summary_from_header = True
                summary = header
            new_text = None
            pywikibot.output(color_format(
                "Editing page {lightblue}{0}{default}.", page_title))
            if summary is not None and len(summary.strip()) > 0:
                summary = summary.strip()
                pywikibot.output("Have \"Summary\":\n\t{}".format(summary))
                i = summary.find('{')
                if i > 0:
                    summary = summary[0:i]
                i = summary.find(' in ')
                if i > 0:
                    summary = summary[0:i]
                summary = summary.strip()
                if summary[-1] == '.':
                    summary = summary[0:-1]
                pywikibot.output(
                    "Will have \"Summary\":\n\t{}".format(summary))
                choice = pywikibot.input_choice(
                    "Is it a good summary?",
                    [('Yes', 'y'), ('No', 'n'), ('open in Browser', 'b')],
                    'n')
                if choice == 'y':
                    description = summary
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            if description is None:
                pywikibot.output(
                    "Type '[s]kip' to skip the image completely.")
                description = pywikibot.input("Please describe the file:")
                if description in ['s', 'skip']:
                    continue
            if licensing is not None:
                pywikibot.output(
                    "Have \"Licensing\":\n\t{}".format(licensing.strip()))
            comic_num = None
            m = page_number_regex.search(old_text)
            if m:
                try:
                    comic_num = int(m.group(1))
                except ValueError:
                    pass
            if comic_num is None:
                m = filename_number_regex.search(page.title())
                if m:
                    try:
                        comic_num = int(m.group(1))
                    except ValueError:
                        pass
            if comic_num is not None:
                pywikibot.output("Have comic #:\n\t{}".format(comic_num))
                choice = pywikibot.input_choice(
                    "Is it a good comic number?",
                    [('Yes', 'y'), ('No', 'n'), ('open in Browser', 'b')],
                    'n')
                if choice != 'y':
                    comic_num = None
                if choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            while comic_num is None:
                try:
                    pywikibot.output("Need comic number. Type 0 to skip.")
                    comic_num = int(pywikibot.input("Comic number: "))
                except ValueError:
                    pass
            if comic_num == 0:
                comic_num = ''
            new_text = dedent("""
                == Summary ==
                {{{{QC image|{0}|{1}}}}}

                == Licensing ==
                {{{{Fairuse}}}}
                """.format(description, comic_num)).strip()
            header = header.strip()
            if not got_summary_from_header and len(header) > 0:
                new_text = header + '\n\n' + new_text
            footer = footer.strip()
            if len(footer) > 0:
                new_text += '\n\n' + footer
            # check if the edit is sensible
            if old_text == new_text:
                pywikibot.output("No changes. Nothing to do.")
                continue
            # report what will happen
            pywikibot.showDiff(old_text, new_text, context=3)
            summary = ("add [[Template:QC image]]; mark as fair use "
                       "([[User:AndrybakBot#Image maintenance"
                       "|Image maintenance bot task]])")
            if extra_summary:
                summary = summary + " ({})".format(extra_summary)
            pywikibot.output(color_format(
                "Summary will be\n\t{lightblue}{0}{default}", summary))
            choice = pywikibot.input_choice(
                "Do you want to accept these changes?",
                [('Yes', 'y'), ('No', 'n'), ('open in Browser', 'b')],
                'n')
            # if choice == 'y':
            #     pywikibot.output("Test run, doing nothing.")
            #     continue
            if choice == 'n':
                pywikibot.output("Okay, doing nothing.")
                continue
            elif choice == 'b':
                pywikibot.bot.open_webbrowser(page)
            elif choice == 'y':
                error_count = 0
                while True:
                    result = put_text(page, new_text, summary, error_count)
                    if result is not None:
                        pywikibot.output(
                            "Got result of saving: {}".format(result))
                        break
                    error_count += 1
            elif choice == 'q':
                break
        except pywikibot.NoPage:
            pywikibot.error(
                "{} doesn't exist, skipping.".format(page.title()))
            continue
        except pywikibot.IsRedirectPage:
            pywikibot.error(
                "{} is a redirect, skipping.".format(page.title()))
            continue
        except pywikibot.Error as e:
            pywikibot.bot.suggest_help(exception=e)
            continue
        except QuitKeyboardInterrupt:
            sys.exit("User quit bot run.")
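# A quick illustration of the two number-extraction regexes used in
# main() above. The sample wikitext and file name are made up; the
# first pattern finds a template parameter sitting directly before the
# closing braces (as in the {{QC image|...}} text this script writes),
# and the second falls back to the first number in the file name.
import re

page_number_regex = re.compile(r'\|([1-9][0-9]*)\}')
filename_number_regex = re.compile(r'([1-9][0-9]*)')

wikitext = '== Summary ==\n{{QC image|an example description|1234}}\n'
m = page_number_regex.search(wikitext)
print(m.group(1))  # 1234

m = filename_number_regex.search('File:Comic 987.png')
print(m.group(1))  # 987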