def load_page(self):
    """Load the page to be archived and break it up into threads."""
    self.header = ''
    self.threads = []
    self.archives = {}
    self.archived_threads = 0

    # Exclude non-thread headings
    text = self.get()
    marker = findmarker(text)
    text = re.sub(r'^===', marker + r'===', text, flags=re.M)

    # Find threads, avoid archiving categories or interwiki
    header, threads, footer = extract_sections(text, self.site)
    header = header.replace(marker, '')
    if header and footer:
        self.header = '\n\n'.join((header.rstrip(), footer, ''))
    else:
        self.header = header + footer

    for thread_heading, thread_content in threads:
        cur_thread = DiscussionThread(thread_heading.strip('= '),
                                      self.now, self.timestripper)
        lines = thread_content.replace(marker, '').splitlines()
        lines = lines[1:]  # remove heading line
        for line in lines:
            cur_thread.feed_line(line)
        self.threads.append(cur_thread)

    # This extra info is not desirable when run under the unittest
    # framework, which may be run either directly or via setup.py
    if pywikibot.calledModuleName() not in ['archivebot_tests', 'setup']:
        pywikibot.output('%d Threads found on %s'
                         % (len(self.threads), self))
def load_page(self) -> None:
    """Load the page to be archived and break it up into threads."""
    self.header = ''
    self.threads = []
    self.archives = {}
    self.archived_threads = 0

    # Exclude unsupported headings (h1, h3, etc.):
    # adding the marker will make them ignored by extract_sections()
    text = self.get()
    marker = findmarker(text)
    text = re.sub(r'^((=|={3,})[^=])', marker + r'\1', text, flags=re.M)

    # Find threads, avoid archiving categories or interwiki
    header, threads, footer = extract_sections(text, self.site)
    header = header.replace(marker, '')
    if header and footer:
        self.header = '\n\n'.join((header.rstrip(), footer, ''))
    else:
        self.header = header + footer

    for thread_heading, thread_content in threads:
        cur_thread = DiscussionThread(thread_heading.strip('= '),
                                      self.timestripper)
        # remove heading line
        _, *lines = thread_content.replace(marker, '').splitlines()
        for line in lines:
            cur_thread.feed_line(line)
        self.threads.append(cur_thread)

    # This extra info is not desirable when run under the unittest
    # framework, which may be run either directly or via setup.py
    if pywikibot.calledModuleName() not in ['archivebot_tests', 'setup']:
        pywikibot.output('{} thread(s) found on {}'.format(
            len(self.threads), self))
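# A minimal standalone sketch of the heading-masking step used by
# load_page() above. The '@@' marker is a hypothetical stand-in for the
# value returned by findmarker(); any string that cannot occur in the
# page text would do. Prefixing level-1 and level-3+ headings with the
# marker makes extract_sections() treat only the level-2 thread
# headings as section starts.
import re

marker = '@@'
text = ('= h1 heading =\n'
        '== thread one ==\n'
        'first comment\n'
        '=== h3 subsection ===\n'
        'more text\n')
masked = re.sub(r'^((=|={3,})[^=])', marker + r'\1', text, flags=re.M)
print(masked)
# @@= h1 heading =
# == thread one ==
# first comment
# @@=== h3 subsection ===
# more text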
def removeEmptySections(self, text: str) -> str:
    """Cleanup empty sections."""
    # userspace contains article stubs without nobots/in use templates
    if self.namespace == 2:
        return text

    skippings = ['comment', 'category']
    skip_regexes = _get_regexes(skippings, self.site)
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(
        stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    # comments, categories, and interwikis
    skippings = ['comment', 'category', 'interwiki']
    skip_regexes = _get_regexes(skippings, self.site)
    # we want only interwikis, not interlanguage links
    skip_regexes[1] = re.compile(
        skip_regexes[1].pattern.replace(':?', ''))
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = text
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(
        stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    # userspace contains article stubs without nobots/in use templates
    if self.namespace == 2:
        return text

    skippings = ['comment', 'category']
    skip_regexes = _get_regexes(skippings, self.site)
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(
        stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    # comments, categories, and interwikis
    skippings = ['comment', 'category', 'interwiki']
    skip_regexes = _get_regexes(skippings, self.site)
    # we want only interwikis, not interlanguage links
    skip_regexes[1] = re.compile(skip_regexes[1].pattern.replace(':?', ''))
    # site defined templates
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    # empty lists
    skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

    # get stripped sections
    stripped_text = text
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    strip_sections = textlib.extract_sections(stripped_text, self.site)[1]

    # get proper sections
    header, sections, footer = textlib.extract_sections(text, self.site)

    # iterate stripped sections and create a new page body
    new_body = []
    for i, strip_section in enumerate(strip_sections):
        current_heading = sections[i][0]
        try:
            next_heading = sections[i + 1][0]
        except IndexError:
            next_heading = ''
        current_dep = (len(current_heading)
                       - len(current_heading.lstrip('=')))
        next_dep = len(next_heading) - len(next_heading.lstrip('='))
        if strip_section[1].strip() or current_dep < next_dep:
            new_body.extend(sections[i])

    return header + ''.join(new_body) + footer
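# A small self-contained illustration (not part of the bot) of the
# depth test shared by the removeEmptySections() variants above: a
# heading's depth is its number of leading '=' characters, and an
# empty section survives only when the next heading is deeper, i.e.
# when it is a parent whose content lives in its subsections. The
# sample sections below are made up for the demonstration.
def heading_depth(heading):
    return len(heading) - len(heading.lstrip('='))

sections = [
    ('== Parent ==', '\n'),              # empty, but has a deeper child
    ('=== Child ===', '\nsome text\n'),  # non-empty: kept
    ('== Empty ==', '\n'),               # empty, no child: dropped
]
new_body = []
for i, (heading, content) in enumerate(sections):
    next_heading = sections[i + 1][0] if i + 1 < len(sections) else ''
    if content.strip() or heading_depth(heading) < heading_depth(next_heading):
        new_body.append(heading + content)
print(''.join(new_body))
# == Parent ==
# === Child ===
# some text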
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: str
    """
    local_args = pywikibot.handle_args(args)
    # default values for options
    extra_summary = None

    for arg in local_args:
        option, sep, value = arg.partition(':')
        if option == '-summary':
            extra_summary = value
        else:
            pywikibot.warning("Unrecognized option {}".format(option))

    def check_option(option, value):
        if not value:
            pywikibot.error(
                "Missing argument for option '{}'".format(option))
            return False
        return True

    page_number_regex = re.compile(r'\|([1-9][0-9]*)\}')
    filename_number_regex = re.compile(r'([1-9][0-9]*)')
    templates_ready = ['QC image', 'File information', 'Self']
    site = pywikibot.Site()
    looked_at = set()
    pages = request_pages()
    for p in pages:
        if p['title'] in looked_at:
            pywikibot.output("Done.")
            break
        else:
            looked_at.add(p['title'])
        try:
            page_title = 'File:' + p['title']
            page = pywikibot.Page(site, page_title)
            click_url = ROOT_URL + 'wiki/' + page.title(underscore=True)
            pywikibot.output("Page '{0}', id={1} | {2}".format(
                page_title, p['id'], click_url))
            ts = page.templatesWithParams()
            if len(ts) > 0:
                found_ready = False
                for t in ts:
                    for r in templates_ready:
                        if r in t[0].title():
                            pywikibot.output(color_format(
                                "Page {lightgreen}{0}{default} "
                                "has template: {1}", page_title, t[0]))
                            found_ready = True
                            break
                if found_ready:
                    pywikibot.output("\tSkipping.")
                    continue
            old_text = page.get()
            # categories = getCategoryLinks(old_text, site)
            # categories_text = '\n'.join(
            #     map(lambda c: c.aslink(), categories))
            header, body, footer = extract_sections(old_text, site)
            summary = None
            licensing = None
            description = None
            for section in body:
                if 'ummary' in section[0] or 'escription' in section[0]:
                    summary = section[1]
                if 'icens' in section[0]:
                    licensing = section[1]
            got_summary_from_header = False
            if summary is None:
                got_summary_from_header = True
                summary = header
            new_text = None
            pywikibot.output(color_format(
                "Editing page {lightblue}{0}{default}.", page_title))
            if summary is not None and len(summary.strip()) > 0:
                summary = summary.strip()
                pywikibot.output("Have \"Summary\":\n\t{}".format(summary))
                i = summary.find('{')
                if i > 0:
                    summary = summary[0:i]
                i = summary.find(' in ')
                if i > 0:
                    summary = summary[0:i]
                summary = summary.strip()
                if summary[-1] == '.':
                    summary = summary[0:-1]
                pywikibot.output(
                    "Will have \"Summary\":\n\t{}".format(summary))
                choice = pywikibot.input_choice(
                    "Is it a good summary?",
                    [('Yes', 'y'), ('No', 'n'), ('open in Browser', 'b')],
                    'n')
                if choice == 'y':
                    description = summary
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            if description is None:
                pywikibot.output(
                    "Type '[s]kip' to skip the image completely.")
                description = pywikibot.input("Please describe the file:")
                if description in ['s', 'skip']:
                    continue
            if licensing is not None:
                pywikibot.output(
                    "Have \"Licensing\":\n\t{}".format(licensing.strip()))
            comic_num = None
            m = page_number_regex.search(old_text)
            if m:
                try:
                    comic_num = int(m.group(1))
                except ValueError:
                    pass
            if comic_num is None:
                m = filename_number_regex.search(page.title())
                if m:
                    try:
                        comic_num = int(m.group(1))
                    except ValueError:
                        pass
            if comic_num is not None:
                pywikibot.output("Have comic #:\n\t{}".format(comic_num))
                choice = pywikibot.input_choice(
                    "Is it a good comic number?",
                    [('Yes', 'y'), ('No', 'n'), ('open in Browser', 'b')],
                    'n')
                if choice != 'y':
                    comic_num = None
                if choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            while comic_num is None:
                try:
                    pywikibot.output("Need comic number. Type 0 to skip.")
                    comic_num = int(pywikibot.input("Comic number: "))
                except ValueError:
                    pass
            if comic_num == 0:
                comic_num = ''
            new_text = dedent("""
                == Summary ==
                {{{{QC image|{0}|{1}}}}}

                == Licensing ==
                {{{{Fairuse}}}}
                """.format(description, comic_num)).strip()
            header = header.strip()
            if not got_summary_from_header and len(header) > 0:
                new_text = header + '\n\n' + new_text
            footer = footer.strip()
            if len(footer) > 0:
                new_text += '\n\n' + footer
            # check if the edit is sensible
            if old_text == new_text:
                pywikibot.output("No changes. Nothing to do.")
                continue
            # report what will happen
            pywikibot.showDiff(old_text, new_text, context=3)
            summary = ("add [[Template:QC image]]; mark as fair use "
                       "([[User:AndrybakBot#Image maintenance"
                       "|Image maintenance bot task]])")
            if extra_summary:
                summary = summary + " ({})".format(extra_summary)
            pywikibot.output(color_format(
                "Summary will be\n\t{lightblue}{0}{default}", summary))
            choice = pywikibot.input_choice(
                "Do you want to accept these changes?",
                [('Yes', 'y'), ('No', 'n'), ('open in Browser', 'b')],
                'n')
            # if choice == 'y':
            #     pywikibot.output("Test run, doing nothing.")
            #     continue
            if choice == 'n':
                pywikibot.output("Okay, doing nothing.")
                continue
            elif choice == 'b':
                pywikibot.bot.open_webbrowser(page)
            elif choice == 'y':
                error_count = 0
                while True:
                    result = put_text(page, new_text, summary, error_count)
                    if result is not None:
                        pywikibot.output(
                            "Got result of saving: {}".format(result))
                        break
                    error_count += 1
            elif choice == 'q':
                break
        except pywikibot.NoPage:
            pywikibot.error(
                "{} doesn't exist, skipping.".format(page.title()))
            continue
        except pywikibot.IsRedirectPage:
            pywikibot.error(
                "{} is a redirect, skipping.".format(page.title()))
            continue
        except pywikibot.Error as e:
            pywikibot.bot.suggest_help(exception=e)
            continue
        except QuitKeyboardInterrupt:
            sys.exit("User quit bot run.")
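# A quick illustration of the two number-extraction regexes used in
# main() above. The sample wikitext and file name are made up; the
# first pattern finds a template parameter sitting directly before the
# closing braces (as in the {{QC image|...}} text this script writes),
# and the second falls back to the first number in the file name.
import re

page_number_regex = re.compile(r'\|([1-9][0-9]*)\}')
filename_number_regex = re.compile(r'([1-9][0-9]*)')

wikitext = '== Summary ==\n{{QC image|an example description|1234}}\n'
m = page_number_regex.search(wikitext)
print(m.group(1))  # 1234

m = filename_number_regex.search('File:Comic 987.png')
print(m.group(1))  # 987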