Example #1
    def __init__(self, generator, **kwargs):
        """- generator : Page generator."""
        self.availableOptions.update({
            'ignorepdf': False,  # boolean
            'limit': None,  # int, stop after n modified pages
            'summary': None,
        })

        super(ReferencesRobot, self).__init__(**kwargs)
        self.generator = generator
        self.site = pywikibot.Site()
        self._use_fake_user_agent = config.fake_user_agent_default.get(
            'reflinks', False)
        # Check for a localized version of the manual page
        manual = 'mw:Manual:Pywikibot/refLinks'
        code = None
        for alt in [self.site.code] + i18n._altlang(self.site.code):
            if alt in localized_msg:
                code = alt
                break
        if code:
            manual += '/{0}'.format(code)
        if self.getOption('summary') is None:
            self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
        else:
            self.msg = self.getOption('summary')

        local = i18n.translate(self.site, badtitles)
        if local:
            bad = '(' + globalbadtitles + '|' + local + ')'
        else:
            bad = globalbadtitles
        self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
        self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
        self.deduplicator = DuplicateReferences(self.site)

        self.site_stop_page = i18n.translate(self.site, stop_page)
        if self.site_stop_page:
            self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
            if self.stop_page.exists():
                self.stop_page_rev_id = self.stop_page.latest_revision_id
            else:
                pywikibot.warning('The stop page {0} does not exist'
                                  .format(self.stop_page.title(as_link=True)))

        # Regex to match the content-type meta tag in the HTML source
        self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
        # Extract the encoding from a charset property (from content-type!)
        self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
        # Extract html title from page
        self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
        # Matches content inside <script>/<style>/HTML comments
        self.NON_HTML = re.compile(
            br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
            br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')

        # Authorized mime types for HTML pages
        self.MIME = re.compile(
            r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
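
To see how the two encoding regexes above cooperate, here is a minimal standalone sketch of the charset-detection step; the HTML snippet and variable names are invented for illustration:

    import re

    META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
    CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')

    html = (b'<html><head><meta http-equiv="Content-Type"'
            b' content="text/html; charset=utf-8"></head></html>')
    tag = META_CONTENT.search(html)  # locate the content-type meta tag
    if tag:
        match = CHARSET.search(tag.group().decode('ascii', 'ignore'))
        if match:
            print(match.group('enc'))  # -> utf-8
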
Example #2
    def __init__(self, **kwargs):
        """Initializer."""
        super().__init__(**kwargs)
        self._use_fake_user_agent = config.fake_user_agent_default.get(
            'reflinks', False)
        # Check for a localized version of the manual page
        manual = 'mw:Manual:Pywikibot/refLinks'
        code = None
        for alt in [self.site.code] + i18n._altlang(self.site.code):
            if alt in localized_msg:
                code = alt
                break
        if code:
            manual += '/{}'.format(code)

        if self.opt.summary:
            self.msg = self.opt.summary
        else:
            self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())

        local = i18n.translate(self.site, badtitles)
        if local:
            bad = '({}|{})'.format(globalbadtitles, local)
        else:
            bad = globalbadtitles

        self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
        self.norefbot = noreferences.NoReferencesBot(verbose=False)
        self.deduplicator = DuplicateReferences(self.site)

        self.site_stop_page = i18n.translate(self.site, stop_page)
        if self.site_stop_page:
            self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
            if self.stop_page.exists():
                self.stop_page_rev_id = self.stop_page.latest_revision_id
            else:
                pywikibot.warning('The stop page {} does not exist'.format(
                    self.stop_page.title(as_link=True)))

        # Regex to match the content-type meta tag in the HTML source
        self.META_CONTENT = re.compile(
            br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
        # Extract the encoding from a charset property (from content-type!)
        self.CHARSET = re.compile(
            r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
        # Extract html title from page
        self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
        # Matches content inside <script>/<style>/HTML comments
        self.NON_HTML = re.compile(
            br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
            br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')

        # Authorized mime types for HTML pages
        self.MIME = re.compile(
            r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
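
Compared with example #1, this version also matches bare charset meta tags and tolerates quoted charset values. A minimal sketch (snippet invented; note the quotes are captured and must be stripped by the caller):

    import re

    META_CONTENT = re.compile(br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
    CHARSET = re.compile(
        r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')

    tag = META_CONTENT.search(b'<head><meta charset="utf-8"></head>')
    match = CHARSET.search(tag.group().decode('ascii', 'ignore'))
    print(match.group('enc').strip('\'"'))  # -> utf-8
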
Example #3
    def treat_page(self):
        """Load the given page, do some changes, and save it."""
        # let's define some basic variables
        urtext = self.current_page.text
        urtitle = self.current_page.title()

        #eng_site = pywikibot.Site('en')
        #eng_title = ''

        interDict = {}
        try:
            site = pywikibot.Site('ur', 'wikipedia')
            urpage = pywikibot.Page(site, urtitle)

            langlst = urpage.iterlanglinks()

            for i in langlst:
                lang = str(i.site).split(':')[1]
                interDict[lang] = i.title
                if lang == 'en':
                    break

            # If there is no inter-wiki page then exit the program
            if len(interDict) == 0:
                print('Link Dictionary is empty')
                sys.exit()

            eng_title = interDict['en']
        except Exception:  # SystemExit from sys.exit() above now propagates
            pywikibot.output(
                '\03{lightred}Unable to fetch interwiki links!\03{default}')
            return False

        site = pywikibot.Site('en', 'wikipedia')
        enpage = pywikibot.Page(site, eng_title)

        wikitext = enpage.get()
        wikicode = mwp.parse(wikitext)

        # Build references for tags of type ref, sfn, or sfnp
        reftags = self.refTagForming(wikicode)

        dlinks = {}
        for k, v in reftags.items():
            dkey = 'و' + str(k) + 'و'
            if v[0] in sfn:
                dlinks[dkey] = str(v[1])
            else:
                dlinks[dkey] = '<ref>' + str(v[1]) + '</ref>'

        urtext = urpage.text
        for r in tuple(dlinks.items()):
            urtext = urtext.replace(*r)

        # Remove duplicate references: keep the full reference for the first
        # named instance, then reuse the name alone for later instances
        deDupRef = reflinks.DuplicateReferences(site)
        urtext = deDupRef.process(urtext)

        #write_output(urtext, 'AfterdeDuplicate.txt')

        # Use noreferences to add a references list to the article
        norefbot = noreferences.NoReferencesBot(None)
        if norefbot.lacksReferences(urtext):
            urpage.text = norefbot.addReferences(urtext)
        else:
            urpage.text = urtext + '\n'

        # save the page
        urpage.save(summary=self.summary, minor=False)
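
The substitution step above hinges on plain string replacement of numbered placeholders delimited by the Urdu letter 'و'. A minimal sketch of the idea, with invented data:

    # Hypothetical translated text still carrying numbered placeholders.
    urtext = 'First sentence.و1و Second sentence.و2و'

    # Mapping built from the English article's reference tags.
    dlinks = {
        'و1و': '<ref>first source</ref>',
        'و2و': '<ref name="second" />',
    }

    for placeholder, ref in dlinks.items():
        urtext = urtext.replace(placeholder, ref)
    print(urtext)  # placeholders replaced by full <ref> markup
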
Example #4
    def __init__(self, generator, **kwargs):
        """- generator : Page generator."""
        self.availableOptions.update({
            'ignorepdf': False,  # boolean
            'repair': False,  # boolean
            'limit': None,  # int, stop after n modified pages
            'summary': None,
        })

        super(ReferencesRobot, self).__init__(**kwargs)
        self.generator = generator
        self.site = pywikibot.Site()
        self._use_fake_user_agent = config.fake_user_agent_default.get(
            'reflinks', False)
        # Check for a localized version of the manual page
        #manual = 'mw:Manual:Pywikibot/refLinks'
        manual = 'Wikipedysta:MastiBot/refLinks'
        code = None
        for alt in [self.site.code] + i18n._altlang(self.site.code):
            if alt in localized_msg:
                code = alt
                break
        if code:
            manual += '/%s' % code
        if self.getOption('summary') is None:
            self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
        else:
            self.msg = self.getOption('summary')

        local = i18n.translate(self.site, badtitles)
        if local:
            bad = '(' + globalbadtitles + '|' + local + ')'
        else:
            bad = globalbadtitles
        self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
        self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
        #self.deduplicator = DuplicateReferences()

        self.site_stop_page = i18n.translate(self.site, stop_page)
        if self.site_stop_page:
            self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
            if self.stop_page.exists():
                self.stop_page_rev_id = self.stop_page.latest_revision_id
            else:
                pywikibot.warning('The stop page %s does not exist' %
                                  self.stop_page.title(asLink=True))

        # Regex that matches bare references
        if self.getOption('repair'):
            self.linksInRef = re.compile(
                ur'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)(\??[^\s<]*?)[^\]\.])(\]|\]\.)?( [^<]*?<!-- Tytuł wygenerowany przez bota -->[ \t]*\])[ \t]*<\/ref>'
            )
        else:
            self.linksInRef = re.compile(
                ur'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)(\??[^\s<]*?)[^\]\.])(\]|\]\.)?[ \t]*<\/ref>'
            )

        # Regex to match the content-type meta tag in the HTML source
        self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
        # Extract the encoding from a charset property (from content-type!)
        self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
        # Extract html title from page
        #self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
        self.TITLE = re.compile(
            r'(?is)(<title[^>]*?>)(?P<title>.*?)(?=</title>)')
        # Matches content inside <script>/<style>/HTML comments
        self.NON_HTML = re.compile(
            br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
            br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')
        # Extract html language from page
        self.LANG = re.compile(
            r'(?i)(<html[^>]*?lang\s*?=\s*?|<meta\s*?HTTP-EQUIV\s*?=\s*?\"Content-Language\"\s*?CONTENT\s*?=\s*?|<meta property\s*?=\s*?\"og:locale\"\s*?content\s*?=\s*?)\"(?P<lang>.*?)[\_\-\"]'
        )

        # Authorized mime types for HTML pages
        self.MIME = re.compile(
            r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
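
The added LANG regex can be checked in isolation; a quick sketch with invented HTML snippets:

    import re

    LANG = re.compile(
        r'(?i)(<html[^>]*?lang\s*?=\s*?|<meta\s*?HTTP-EQUIV\s*?=\s*?\"Content-Language\"\s*?CONTENT\s*?=\s*?|<meta property\s*?=\s*?\"og:locale\"\s*?content\s*?=\s*?)\"(?P<lang>.*?)[\_\-\"]'
    )

    print(LANG.search('<html lang="pl-PL">').group('lang'))  # -> pl
    print(LANG.search(
        '<meta property="og:locale" content="pl_PL" />').group('lang'))  # -> pl
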
Example #5
    def treat_page(self):
        """Load the given page, do some changes, and save it."""
        # let's define some basic variables
        urtext = self.current_page.text
        urlang = self.current_page.site.code
        urtitle = self.current_page.title()
        urcat = []
        eng_site = pywikibot.Site('en')
        eng_title = ''

        interDict = {}
        try:
            site = pywikibot.Site('ur', 'wikipedia')
            urpage = pywikibot.Page(site, urtitle)
            langlst = urpage.iterlanglinks()

            for i in langlst:
                lang = str(i.site).split(':')[1]
                interDict[lang] = i.title

            eng_title = interDict['en']
        except Exception:  # e.g. KeyError when no English interwiki link exists
            pywikibot.output(
                '\03{lightred}Unable to fetch interwiki links!\03{default}')
            return False

        site = pywikibot.Site('en', 'wikipedia')
        enpage = pywikibot.Page(site, eng_title)

        wikitext = enpage.get()
        wikicode = mwp.parse(wikitext)

        # Extract sfn templates and convert them into <ref> tags
        sfnlist = []
        for template in wikicode.filter_templates():
            if template.name in ('sfn', 'sfnp'):
                sfnlist.append(template)
                templ_rep = '<ref>' + str(template) + '</ref>'
                wikicode.replace(template, templ_rep)

        alltags = wikicode.filter_tags()
        reftags = {}

        # Helper for the commented-out duplicate-name lookup below
        def search(myDict, search1):
            for key, value in myDict.items():
                if search1 in value:
                    return key

        i = 1
        for tag in alltags:
            if tag.tag == 'ref':
                if tag.attributes == []:  # check if attributes list is empty
                    refval = 'NoRefName'  # Reference has no name so assigning "NoRefName"
                else:
                    name = tag.attributes[0]
                    refval = name.value

                if tag.contents is None:
                    #conval = search(reftags,refval)
                    #reftags[i] = (refval,reftags[conval][1])
                    pass
                else:
                    reftags[i] = (refval, tag.contents)
                    i += 1

        dlinks = {}
        for k, v in reftags.items():
            dkey = 'و' + str(k) + 'و'
            dlinks[dkey] = '<ref>' + str(v[1]) + '</ref>'

        urtext = urpage.text
        for r in tuple(dlinks.items()):
            urtext = urtext.replace(*r)

        # newln = '\n'
        # Use noreferences to add a references section if one is missing
        self.norefbot = noreferences.NoReferencesBot(None)
        if self.norefbot.lacksReferences(urtext):
            urpage.text = self.norefbot.addReferences(urtext)
        else:
            urpage.text = urtext + '\n'

        print(urpage.text)

        # save the page
        urpage.save(summary=self.summary, minor=False)
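
The sfn-to-ref conversion loop can be exercised on its own. A minimal sketch, assuming mwparserfromhell (imported as mwp above) is installed and using an invented sample text:

    import mwparserfromhell as mwp

    wikitext = 'A fact.{{sfn|Smith|2001|p=25}} More text.<ref>a source</ref>'
    wikicode = mwp.parse(wikitext)

    # Wrap each {{sfn}}/{{sfnp}} template in a <ref> tag, as in the loop above.
    for template in wikicode.filter_templates():
        if template.name.matches('sfn') or template.name.matches('sfnp'):
            wikicode.replace(template, '<ref>' + str(template) + '</ref>')

    print(wikicode)
    # -> A fact.<ref>{{sfn|Smith|2001|p=25}}</ref> More text.<ref>a source</ref>
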
Example #6
    dlinks = {}
    for k, v in reftags.items():
        dkey = 'و' + str(k) + 'و'
        reftext = str(v[1])
        dlinks[dkey] = '<ref>' + reftext + '</ref>'

    urtext = urpage.text
    for r in tuple(dlinks.items()):
        urtext = urtext.replace(*r)

    # newln = '\n'
    # hawalajat = '{{حوالہ جات}}'
    # urduref = '== حوالہ جات ==' + newln + hawalajat + newln
    # if hawalajat not in urtext:
    #     urpage.text = urtext + newln*2 + urduref + newln
    # else:
    #     urpage.text = urtext + newln*2

    # Use noreferences to add a references list to the article
    norefbot = noreferences.NoReferencesBot(None)
    if norefbot.lacksReferences(urtext):
        urpage.text = norefbot.addReferences(urtext)
    else:
        urpage.text = urtext + '\n'

    print('Printing appended Urdu Page')
    print(urpage.text)

    # save the page
    #urpage.save(summary=self.summary, minor=False)
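
For completeness, a standalone sketch of the noreferences step shared by the last few examples; the import path and sample text are assumptions, and the camelCase lacksReferences/addReferences API matches the pywikibot vintage shown above:

    from scripts import noreferences  # path may differ per pywikibot install

    text = 'Some text with a citation.<ref>a source</ref>'
    norefbot = noreferences.NoReferencesBot(None)
    if norefbot.lacksReferences(text):  # no references section yet?
        text = norefbot.addReferences(text)  # append a localized section
    print(text)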