def __init__(self, generator, **kwargs):
    """- generator : Page generator."""
    self.availableOptions.update({
        'ignorepdf': False,  # boolean
        'limit': None,  # int, stop after n modified pages
        'summary': None,
    })

    super(ReferencesRobot, self).__init__(**kwargs)
    self.generator = generator
    self.site = pywikibot.Site()
    self._use_fake_user_agent = config.fake_user_agent_default.get(
        'reflinks', False)
    # Check
    manual = 'mw:Manual:Pywikibot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/{0}'.format(code)

    if self.getOption('summary') is None:
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
    else:
        self.msg = self.getOption('summary')

    local = i18n.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
    self.deduplicator = DuplicateReferences(self.site)

    self.site_stop_page = i18n.translate(self.site, stop_page)
    if self.site_stop_page:
        self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
        if self.stop_page.exists():
            self.stop_page_rev_id = self.stop_page.latest_revision_id
        else:
            pywikibot.warning('The stop page {0} does not exist'
                              .format(self.stop_page.title(as_link=True)))

    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
    # Extract html title from page
    self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
        br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')

    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def __init__(self, **kwargs):
    """Initializer."""
    super().__init__(**kwargs)
    self._use_fake_user_agent = config.fake_user_agent_default.get(
        'reflinks', False)
    # Check
    manual = 'mw:Manual:Pywikibot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/{}'.format(code)

    if self.opt.summary:
        self.msg = self.opt.summary
    else:
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())

    local = i18n.translate(self.site, badtitles)
    if local:
        bad = '({}|{})'.format(globalbadtitles, local)
    else:
        bad = globalbadtitles

    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(verbose=False)
    self.deduplicator = DuplicateReferences(self.site)

    self.site_stop_page = i18n.translate(self.site, stop_page)
    if self.site_stop_page:
        self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
        if self.stop_page.exists():
            self.stop_page_rev_id = self.stop_page.latest_revision_id
        else:
            pywikibot.warning('The stop page {} does not exist'.format(
                self.stop_page.title(as_link=True)))

    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(
        br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(
        r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
    # Extract html title from page
    self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
        br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')

    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
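# A small, self-contained sketch (not part of the bot itself) of how the two
# patterns above are meant to cooperate when sniffing a fetched page's
# encoding: META_CONTENT locates the <meta> tag in the raw bytes, and CHARSET
# then pulls the declared encoding out of its decoded text. The sample HTML
# below is invented for illustration.
import re

META_CONTENT = re.compile(br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')

raw = b'<html><head><meta charset="utf-8"><title>x</title></head></html>'
tag = META_CONTENT.search(raw)
if tag:
    match = CHARSET.search(tag.group().decode('latin-1'))
    if match:
        # the 'enc' group keeps surrounding quotes, so strip them before use
        encoding = match.group('enc').strip('"\'')  # -> 'utf-8'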
def treat_page(self):
    """Load the given page, do some changes, and save it."""
    # let's define some basic variables
    urtext = self.current_page.text
    urtitle = self.current_page.title()
    # eng_site = pywikibot.Site('en')
    # eng_title = ''
    interDict = {}

    try:
        site = pywikibot.Site('ur', 'wikipedia')
        urpage = pywikibot.Page(site, urtitle)
        langlst = urpage.iterlanglinks()
        for i in langlst:
            lang = str(i.site).split(':')[1]
            interDict[lang] = i.title
            if lang == 'en':
                break
        # If there is no inter-wiki page then exit the program
        if len(interDict) == 0:
            print('Link Dictionary is empty')
            sys.exit()
        eng_title = interDict['en']
    except Exception:
        pywikibot.output(
            '\03{lightred}Unable to fetch interwiki links!\03{default}')
        return False

    site = pywikibot.Site('en', 'wikipedia')
    enpage = pywikibot.Page(site, eng_title)
    wikitext = enpage.get()
    wikicode = mwp.parse(wikitext)

    # Form references where the tag value is ref, sfn or sfnp
    reftags = self.refTagForming(wikicode)

    dlinks = {}
    for k, v in reftags.items():
        dkey = 'و' + str(k) + 'و'
        if v[0] in sfn:
            dlinks[dkey] = str(v[1])
        else:
            dlinks[dkey] = '<ref>' + str(v[1]) + '</ref>'

    urtext = urpage.text
    for r in tuple(dlinks.items()):
        urtext = urtext.replace(*r)

    # Remove duplicate references by keeping the first named instance and
    # then reusing the named reference without content
    deDupRef = reflinks.DuplicateReferences(site)
    urtext = deDupRef.process(urtext)
    # write_output(urtext, 'AfterdeDuplicate.txt')

    # Use noreferences to add a reference list section to the article
    norefbot = noreferences.NoReferencesBot(None)
    if norefbot.lacksReferences(urtext):
        urpage.text = norefbot.addReferences(urtext)
    else:
        urpage.text = urtext + '\n'

    # save the page
    urpage.save(summary=self.summary, minor=False)
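# A hedged illustration (not taken from the bot) of the marker-replacement
# step in treat_page above: the Urdu article is assumed to already contain
# numbered placeholders wrapped in the letter 'و' (e.g. 'و1و'), and each
# placeholder is swapped for the full reference text harvested from the
# English article. All strings here are invented samples.
dlinks = {'و1و': '<ref>{{cite web|url=http://example.org|title=Sample}}</ref>'}
urtext = 'A sentence in the Urdu article.و1و'
for r in tuple(dlinks.items()):
    urtext = urtext.replace(*r)
# urtext now ends with the expanded <ref>...</ref> instead of the marker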
def __init__(self, generator, **kwargs):
    """- generator : Page generator."""
    self.availableOptions.update({
        'ignorepdf': False,  # boolean
        'repair': False,  # boolean
        'limit': None,  # int, stop after n modified pages
        'summary': None,
    })

    super(ReferencesRobot, self).__init__(**kwargs)
    self.generator = generator
    self.site = pywikibot.Site()
    self._use_fake_user_agent = config.fake_user_agent_default.get(
        'reflinks', False)
    # Check
    # manual = 'mw:Manual:Pywikibot/refLinks'
    manual = 'Wikipedysta:MastiBot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/%s' % code

    if self.getOption('summary') is None:
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
    else:
        self.msg = self.getOption('summary')

    local = i18n.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
    # self.deduplicator = DuplicateReferences()

    self.site_stop_page = i18n.translate(self.site, stop_page)
    if self.site_stop_page:
        self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
        if self.stop_page.exists():
            self.stop_page_rev_id = self.stop_page.latest_revision_id
        else:
            pywikibot.warning('The stop page %s does not exist'
                              % self.stop_page.title(asLink=True))

    # Regex that matches bare references
    if self.getOption('repair'):
        self.linksInRef = re.compile(
            ur'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)(\??[^\s<]*?)[^\]\.])(\]|\]\.)?( [^<]*?<!-- Tytuł wygenerowany przez bota -->[ \t]*\])[ \t]*<\/ref>')
    else:
        self.linksInRef = re.compile(
            ur'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)(\??[^\s<]*?)[^\]\.])(\]|\]\.)?[ \t]*<\/ref>')

    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
    # Extract html title from page
    # self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    self.TITLE = re.compile(
        r'(?is)(<title[^>]*?>)(?P<title>.*?)(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
        br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')
    # Extract html language from page
    self.LANG = re.compile(
        r'(?i)(<html[^>]*?lang\s*?=\s*?|<meta\s*?HTTP-EQUIV\s*?=\s*?\"Content-Language\"\s*?CONTENT\s*?=\s*?|<meta property\s*?=\s*?\"og:locale\"\s*?content\s*?=\s*?)\"(?P<lang>.*?)[\_\-\"]')

    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
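# A rough sketch (illustration only, invented sample) of what the non-repair
# linksInRef pattern above is built to capture: a bare external link wrapped
# directly in <ref> tags, with the address exposed in the 'url' group so the
# bot can fetch the page and generate a title for it.
import re

linksInRef = re.compile(
    r'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)'
    r'(\??[^\s<]*?)[^\]\.])(\]|\]\.)?[ \t]*<\/ref>')

sample = 'Some fact.<ref>[http://www.example.com/page.html]</ref>'
m = linksInRef.search(sample)
if m:
    url = m.group('url')  # -> 'http://www.example.com/page.html'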
def treat_page(self):
    """Load the given page, do some changes, and save it."""
    # let's define some basic variables
    urtext = self.current_page.text
    urlang = self.current_page.site.code
    urtitle = self.current_page.title()
    urcat = []
    eng_site = pywikibot.Site('en')
    eng_title = ''
    interDict = {}

    try:
        site = pywikibot.Site('ur', 'wikipedia')
        urpage = pywikibot.Page(site, urtitle)
        langlst = urpage.iterlanglinks()
        for i in langlst:
            lang = str(i.site).split(':')[1]
            interDict[lang] = i.title
        eng_title = interDict['en']
    except Exception:
        pywikibot.output(
            '\03{lightred}Unable to fetch interwiki links!\03{default}')
        return False

    site = pywikibot.Site('en', 'wikipedia')
    enpage = pywikibot.Page(site, eng_title)
    wikitext = enpage.get()
    wikicode = mwp.parse(wikitext)

    # Extract sfn templates and convert them into <ref> tags
    sfnlist = []
    for template in wikicode.filter_templates():
        if template.name in ('sfn', 'sfnp'):
            sfnlist.append(template)
            templ_rep = '<ref>' + str(template) + '</ref>'
            wikicode.replace(template, templ_rep)

    alltags = wikicode.filter_tags()
    reftags = {}

    def search(myDict, search1):
        for key, value in myDict.items():
            if search1 in value:
                return key

    i = 1
    for tag in alltags:
        if tag.tag == 'ref':
            if tag.attributes == []:  # check if attributes list is empty
                refval = 'NoRefName'  # reference has no name, so assign "NoRefName"
            else:
                name = tag.attributes[0]
                refval = name.value
            if tag.contents is None:
                # conval = search(reftags, refval)
                # reftags[i] = (refval, reftags[conval][1])
                pass
            else:
                reftags[i] = (refval, tag.contents)
            i += 1

    dlinks = {}
    for k, v in reftags.items():
        dkey = 'و' + str(k) + 'و'
        dlinks[dkey] = '<ref>' + str(v[1]) + '</ref>'

    urtext = urpage.text
    for r in tuple(dlinks.items()):
        urtext = urtext.replace(*r)

    # newln = '\n'
    # Use noreferences to add the references template if not present
    self.norefbot = noreferences.NoReferencesBot(None)
    if self.norefbot.lacksReferences(urtext):
        urpage.text = self.norefbot.addReferences(urtext)
    else:
        urpage.text = urtext + '\n'

    print(urpage.text)

    # save the page
    urpage.save(summary=self.summary, minor=False)
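# Minimal sketch of the sfn-to-<ref> conversion performed near the top of
# treat_page above, assuming mwparserfromhell is importable as mwp; the sample
# wikitext is invented. Wrapping each {{sfn}} in explicit <ref> tags lets the
# later filter_tags() pass pick it up like any other reference.
import mwparserfromhell as mwp

sample = ('Grass is green.{{sfn|Smith|2001|p=25}} '
          'The sky is blue.<ref name="sky">NASA</ref>')
wikicode = mwp.parse(sample)
for template in wikicode.filter_templates():
    if template.name.matches('sfn'):
        wikicode.replace(template, '<ref>' + str(template) + '</ref>')
# str(wikicode) now contains '<ref>{{sfn|Smith|2001|p=25}}</ref>'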
dlinks = {}
for k, v in reftags.items():
    dkey = 'و' + str(k) + 'و'
    reftext = str(v[1])
    dlinks[dkey] = '<ref>' + reftext + '</ref>'

urtext = urpage.text
for r in tuple(dlinks.items()):
    urtext = urtext.replace(*r)

# newln = '\n'
# hawalajat = '{{حوالہ جات}}'
# urduref = '== حوالہ جات ==' + newln + hawalajat + newln
# if hawalajat not in urtext:
#     urpage.text = urtext + newln*2 + urduref + newln
# else:
#     urpage.text = urtext + newln*2

# Use noreferences to add a reference list to the article
norefbot = noreferences.NoReferencesBot(None)
if norefbot.lacksReferences(urtext):
    urpage.text = norefbot.addReferences(urtext)
else:
    urpage.text = urtext + '\n'

print('Printing appended Urdu Page')
print(urpage.text)

# save the page
# urpage.save(summary=self.summary, minor=False)