def test_intersect_newpages_twice(self):
    """Test newpages intersection."""
    site = self.get_site()
    first = pagegenerators.NewpagesPageGenerator(site=site, total=10)
    second = pagegenerators.NewpagesPageGenerator(site=site, total=10)
    # Two identical newpages generators must intersect to the same set.
    self.assertEqualItertools([first, second])
def _get_test_unconnected_page(site):
    """Get unconnected page from site for tests.

    Scans up to ten new pages in namespace 1 and returns the first one
    whose page properties carry no 'wikibase_item'; returns None
    implicitly when every candidate is already connected.
    """
    candidates = pagegenerators.NewpagesPageGenerator(
        site=site, total=10, namespaces=[1, ])
    for candidate in candidates:
        if not candidate.properties().get('wikibase_item'):
            return candidate
def test_intersect_newpages_and_recentchanges(self):
    """Test intersection between newpages and recentchanges."""
    site = self.get_site()
    new_pages = pagegenerators.NewpagesPageGenerator(site=site, total=50)
    recent = pagegenerators.RecentChangesPageGenerator(site=site, total=200)
    self.assertEqualItertools([new_pages, recent])
def test_intersect_newpages_csd(self):
    """Test intersection of newpages with the speedy-deletion category."""
    site = self.get_site()
    csd_category = pywikibot.Category(
        site, 'Category:Candidates_for_speedy_deletion')
    self.assertEqualItertools([
        pagegenerators.NewpagesPageGenerator(site=site, total=10),
        pagegenerators.CategorizedPageGenerator(csd_category),
    ])
# Dump the titles of recently created main-namespace pages to a text file.

# Pages per file
limit = 500
output = "core/articles/newpages.txt"

# Remove old file so a fresh run starts from an empty dump.
print("removing old file...")
try:
    os.remove(output)
except FileNotFoundError:
    pass

site = pywikibot.Site()
# BUG FIX: the generator was previously created with site=None, silently
# discarding the Site object built just above; pass the real site.
gen = pagegenerators.NewpagesPageGenerator(site=site, namespaces=[0],
                                           total=limit)

print("now writing...")
# 'with' guarantees the file is closed even if the generator raises.
with open(output, "a") as output_file:
    for page in gen:
        output_file.write(page.title() + "\n")
        print(page.title())

print("saved to", output)
print("done")
def main():
    """Parse command-line arguments and dispatch the category-sync bot.

    NOTE(review): this is Python 2 code (``ur'...'`` literal below); it will
    not parse under Python 3. Several branches ``break`` out of the argument
    loop, so only the first matching "terminal" option is honoured.
    """
    # Mutable state filled in while scanning the command line.
    summary_commandline, gen, template = None, None, None
    namespaces, PageTitles, exceptions = [], [], []
    encat, newcatfile = '', ''
    autoText, autoTitle = False, False
    recentcat, newcat = False, False
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if arg == '-autotitle':
            autoTitle = True
        elif arg == '-autotext':
            autoText = True
        elif arg.startswith('-page'):
            # Bare '-page' prompts interactively; '-page:Title' takes the rest.
            if len(arg) == 5:
                PageTitles.append(
                    pywikibot.input(u'Which page do you want to chage?'))
            else:
                PageTitles.append(arg[6:])
            break
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-template:'):
            template = arg[10:]
        elif arg.startswith('-facat:'):
            # Strip any namespace prefix, then translate the Persian category
            # name to its English counterpart via englishdictionry().
            encat = arg[7:].replace(u'Category:', u'').replace(
                u'category:', u'').replace(u'رده:', u'')
            encat = englishdictionry(u'رده:' + encat, fa_site,
                                     en_site).replace(u'Category:',
                                                      u'').replace(
                                                          u'category:', u'')
            break
        elif arg.startswith('-encat:'):
            encat = arg[7:].replace(u'Category:', u'').replace(
                u'category:', u'').replace(u'رده:', u'')
            break
        elif arg.startswith('-newcatfile:'):
            newcatfile = arg[12:]
            break
        elif arg.startswith('-recentcat'):
            # Optional ':N' suffix limits how many recent changes to fetch.
            arg = arg.replace(':', '')
            if len(arg) == 10:
                genfa = pagegenerators.RecentchangesPageGenerator()
            else:
                genfa = pagegenerators.RecentchangesPageGenerator(
                    number=int(arg[10:]))
            genfa = pagegenerators.DuplicateFilterPageGenerator(genfa)
            # Namespace 14 = Category pages only.
            genfa = pagegenerators.NamespaceFilterPageGenerator(genfa, [14])
            preloadingGen = pagegenerators.PreloadingGenerator(genfa, 60)
            recentcat = True
            break
        elif arg.startswith('-newcat'):
            arg = arg.replace(':', '')
            if len(arg) == 7:
                genfa = pagegenerators.NewpagesPageGenerator(step=100,
                                                             namespaces=14)
            else:
                genfa = pagegenerators.NewpagesPageGenerator(
                    step=int(arg[7:]), namespaces=14)
            preloadingGen = pagegenerators.PreloadingGenerator(genfa, 60)
            newcat = True
            break
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-summary:'):
            pywikibot.setAction(arg[9:])
            summary_commandline = True
        else:
            # Fall through to the generic pywikibot generator factory.
            generator = genFactory.handleArg(arg)
            if generator:
                gen = genFactory.getCombinedGenerator(gen)
    if encat != '':
        # encatlist() returns (fa-candidate list, en-category list) —
        # presumably; confirm against its definition elsewhere in this file.
        encatfalist, encatlists = encatlist(encat)
        if encatlists:
            for encat in encatlists:
                encat = englishdictionry(encat, en_site, fa_site)
                if encat:
                    run([encat])
        if encatfalist is not False:
            run(encatfalist)
    if PageTitles:
        pages = [
            pywikibot.Page(fa_site, PageTitle) for PageTitle in PageTitles
        ]
        gen = iter(pages)
    if recentcat:
        # Each recently changed category is expanded to its member pages.
        for workpage in preloadingGen:
            workpage = workpage.title()
            cat = pywikibot.Category(fa_site, workpage)
            gent = pagegenerators.CategorizedPageGenerator(cat)
            run(gent)
        pywikibot.stopme()
        sys.exit()
    if newcat:
        for workpage in preloadingGen:
            workpage = workpage.title()
            workpage = englishdictionry(workpage, fa_site, en_site)
            if workpage is not False:
                encatfalist, encatlists = encatlist(workpage)
                if encatlists:
                    for encat in encatlists:
                        encat = englishdictionry(encat, en_site, fa_site)
                        if encat:
                            run([encat])
                if encatfalist is not False:
                    run(encatfalist)
        pywikibot.stopme()
        sys.exit()
    if newcatfile:
        # Read wiki links from a UTF-8 file and process each linked title.
        text2 = codecs.open(newcatfile, 'r', 'utf8')
        text = text2.read()
        linken = re.findall(ur'\[\[.*?\]\]', text, re.S)
        if linken:
            for workpage in linken:
                # '[[Title|label]]' -> 'Title'
                workpage = workpage.split(u'|')[0].replace(u'[[',
                                                           u'').replace(
                                                               u']]',
                                                               u'').strip()
                workpage = englishdictionry(workpage, fa_site, en_site)
                if workpage is not False:
                    encatfalist, encatlists = encatlist(workpage)
                    if encatlists:
                        run(encatlists)
                    if encatfalist is not False:
                        run(encatfalist)
        pywikibot.stopme()
        sys.exit()
    if not gen:
        pywikibot.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    run(preloadingGen)
def main():
    """Sync new English Wikipedia biographies with Wikidata.

    Iterates recently created main-namespace pages; for pages that already
    have a Wikidata item it fills in missing P31 (instance of) and P21
    (gender) claims, otherwise it searches for a candidate item by label
    and birth year, adding a sitelink or creating a fresh item.
    """
    lang = 'en'
    wikisite = pywikibot.Site(lang, 'wikipedia')
    wdsite = pywikibot.Site('wikidata', 'wikidata')
    repo = wdsite.data_repository()
    # Number of new pages to scan; overridable via argv[1].
    total = 100
    if len(sys.argv) >= 2:
        total = int(sys.argv[1])
    gen = pagegenerators.NewpagesPageGenerator(site=wikisite,
                                               namespaces=[0],
                                               total=total)
    pre = pagegenerators.PreloadingGenerator(gen, groupsize=50)
    for page in pre:
        if page.isRedirectPage():
            continue
        if not pageIsBiography(page=page):
            continue
        print('\n==', page.title().encode('utf-8'), '==')
        gender = calculateGender(page=page)
        item = ''
        try:
            item = pywikibot.ItemPage.fromPage(page)
        except:  # NOTE(review): bare except hides real failures; narrow it.
            pass
        if item:
            print('Page has item')
            try:
                item.get()
            except:  # NOTE(review): bare except — should catch API errors only.
                print('Error while retrieving item, skiping...')
                continue
            # Read existing instance-of / gender claims, if any.
            p31 = ''
            p21 = ''
            claims = item.claims
            if claims:
                if 'P31' in item.claims:
                    p31 = item.claims['P31'][0].getTarget()
                if 'P21' in item.claims:
                    p21 = item.claims['P21'][0].getTarget()
            print(page.title().encode('utf-8'), item, gender, p31, p21)
            if not p31:
                addHumanClaim(repo=repo, item=item)
            if not p21:
                addGenderClaim(repo=repo, item=item, gender=gender)
        else:
            print('Page without item')
            #search for a valid item, otherwise create
            # Only act on pages by newbie authors that pass minimum quality.
            if authorIsNewbie(page=page):
                if pageIsRubbish(page=page) or \
                   (not pageCategories(page=page)) or \
                   (not pageReferences(page=page)) or \
                   (not len(list(page.getReferences(namespaces=[0])))):
                    continue
                print(page.title().encode('utf-8'), 'need item', gender)
                wtitle = page.title()
                # Drop any trailing disambiguator, e.g. "Name (painter)".
                wtitle_ = wtitle.split('(')[0].strip()
                searchitemurl = 'https://www.wikidata.org/wiki/Special:ItemDisambiguation?language=&label=%s' % (
                    urllib.parse.quote(wtitle_))
                # Screen-scrapes the HTML of Special:ItemDisambiguation —
                # fragile; breaks if the page wording or markup changes.
                raw = getURL(searchitemurl)
                if 'Sorry, no item with that label was found' in raw:
                    print('No useful item found. Creating a new one...')
                    #create item
                    newitemlabels = {lang: wtitle_}
                    newitem = pywikibot.ItemPage(repo)
                    newitem.editLabels(
                        labels=newitemlabels,
                        summary="BOT - Creating item for [[:%s:%s|%s]] (%s): %s %s"
                        % (lang, wtitle, wtitle, lang, 'human', gender))
                    newitem.get()
                    addHumanClaim(repo=repo, item=newitem)
                    addGenderClaim(repo=repo, item=newitem, gender=gender)
                    newitem.setSitelink(
                        page,
                        summary='BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)'
                        % (lang, page.title(), page.title(), lang))
                else:
                    print(searchitemurl.encode('utf-8'))
                    #check birthdate and if it matches add interwiki
                    m = re.findall(
                        r'<li class="wikibase-disambiguation"><a title="(Q\d+)"',
                        raw)
                    # Too many homonymous candidates — too ambiguous to decide.
                    if len(m) > 3:
                        continue
                    for itemfoundq in m:
                        itemfound = pywikibot.ItemPage(repo, itemfoundq)
                        itemfound.get()
                        # Skip candidates already linked to this wiki.
                        if ('%swiki' % (lang)) in itemfound.sitelinks:
                            continue
                        if 'P569' in itemfound.claims:
                            # P569 = date of birth; match against the page's
                            # "Category:<year> births".
                            birthyear = itemfound.claims['P569'][0].getTarget(
                            ).year
                            if birthyear and re.search(
                                    r'(?i)\[\[ *Category *\: *%s births *\]\]'
                                    % (birthyear), page.text):
                                print(
                                    '%s birthyear found in item. Category:%s births found in page'
                                    % (birthyear, birthyear))
                                print('Adding sitelink %s:%s' %
                                      (lang, page.title().encode('utf-8')))
                                itemfound.setSitelink(
                                    page,
                                    summary=
                                    'BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)'
                                    % (lang, page.title(), page.title(),
                                       lang))
                                if not 'P31' in itemfound.claims:
                                    addHumanClaim(repo=repo, item=itemfound)
                                if not 'P21' in itemfound.claims:
                                    addGenderClaim(repo=repo,
                                                   item=itemfound,
                                                   gender=gender)
                                break
def main():
    """Sync new biographies on several Wikipedias with Wikidata.

    For each language in ``langs``, scans recently created main-namespace
    pages; pages with an item get claims added via addBiographyClaims(),
    pages without one are matched against wbsearchentities candidates by
    birth year, otherwise a new item is created and sitelinked.
    """
    wdsite = pywikibot.Site('wikidata', 'wikidata')
    repo = wdsite.data_repository()
    langs = ['en', 'fr', 'de']
    for lang in langs:
        wikisite = pywikibot.Site(lang, 'wikipedia')
        # Pages to scan per language; overridable via argv[1].
        total = 100
        if len(sys.argv) >= 2:
            total = int(sys.argv[1])
        gen = pagegenerators.NewpagesPageGenerator(site=wikisite,
                                                   namespaces=[0],
                                                   total=total)
        #cat = pywikibot.Category(wikisite, 'Category:Articles without Wikidata item')
        #gen = pagegenerators.CategorizedPageGenerator(cat, recurse=False)
        pre = pagegenerators.PreloadingGenerator(gen, groupsize=50)
        for page in pre:
            if page.isRedirectPage():
                continue
            if not pageIsBiography(page=page, lang=lang):
                continue
            print('\n==', page.title().encode('utf-8'), '==')
            gender = calculateGender(page=page, lang=lang)
            item = ''
            try:
                item = pywikibot.ItemPage.fromPage(page)
            except:  # NOTE(review): bare except hides real failures.
                pass
            if item:
                print('Page has item')
                print('https://www.wikidata.org/wiki/%s' % (item.title()))
                addBiographyClaims(repo=repo,
                                   wikisite=wikisite,
                                   item=item,
                                   page=page,
                                   lang=lang)
            else:
                print('Page without item')
                #search for a valid item, otherwise create
                if authorIsNewbie(page=page, lang=lang):
                    print("Newbie author, checking quality...")
                    if pageIsRubbish(page=page, lang=lang) or \
                       (not pageCategories(page=page, lang=lang)) or \
                       (not pageReferences(page=page, lang=lang)) or \
                       (not len(list(page.getReferences(namespaces=[0])))):
                        print("Page didnt pass minimum quality, skiping")
                        continue
                    print(page.title().encode('utf-8'), 'need item', gender)
                    wtitle = page.title()
                    # Drop any trailing disambiguator, e.g. "Name (painter)".
                    wtitle_ = wtitle.split('(')[0].strip()
                    searchitemurl = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&search=%s&language=%s&format=xml' % (
                        urllib.parse.quote(wtitle_), lang)
                    raw = getURL(searchitemurl)
                    print(searchitemurl.encode('utf-8'))
                    #check birthdate and if it matches, then add data
                    # '' (not 0) distinguishes "no search ran" from
                    # "all candidates were eliminated" in the check below.
                    numcandidates = ''  #do not set to zero
                    if not '<search />' in raw:
                        m = re.findall(r'id="(Q\d+)"', raw)
                        numcandidates = len(m)
                        print("Found %s candidates" % (numcandidates))
                        if numcandidates > 5:  #too many candidates, skiping
                            print("Too many, skiping")
                            continue
                        for itemfoundq in m:
                            itemfound = pywikibot.ItemPage(repo, itemfoundq)
                            itemfound.get()
                            if ('%swiki' % (lang)) in itemfound.sitelinks:
                                print("Candidate %s has sitelink, skiping" %
                                      (itemfoundq))
                                numcandidates -= 1
                                continue
                            pagebirthyear = calculateBirthDate(page=page,
                                                               lang=lang)
                            # 'YYYY-MM-DD'-style string -> int year, or ''.
                            pagebirthyear = pagebirthyear and int(
                                pagebirthyear.split('-')[0]) or ''
                            if not pagebirthyear:
                                print("Page doesnt have birthdate, skiping")
                                break  #break, dont continue. Without birthdate we cant decide correctly
                            # Precision 9/10/11 = year/month/day.
                            if 'P569' in itemfound.claims and itemfound.claims[
                                    'P569'][0].getTarget().precision in [
                                        9, 10, 11
                                    ]:  #https://www.wikidata.org/wiki/Help:Dates#Precision
                                itemfoundbirthyear = int(
                                    itemfound.claims['P569'][0].getTarget(
                                    ).year)
                                print(
                                    "candidate birthdate = %s, page birthdate = %s"
                                    % (itemfoundbirthyear, pagebirthyear))
                                # Reject years that are not 4 digits long.
                                mindatelen = 4
                                if len(str(
                                        itemfoundbirthyear)) != mindatelen or len(
                                            str(pagebirthyear)) != mindatelen:
                                    print("%s birthdate length != %s" %
                                          (itemfoundq, mindatelen))
                                    continue
                                #reduce candidates if birthyear are different
                                minyeardiff = 3
                                if itemfoundbirthyear >= pagebirthyear + minyeardiff or itemfoundbirthyear <= pagebirthyear - minyeardiff:
                                    print(
                                        "Candidate %s birthdate out of range, skiping"
                                        % (itemfoundq))
                                    numcandidates -= 1
                                    continue
                                #but only assume it is the same person if birthyears match
                                if itemfoundbirthyear == pagebirthyear:
                                    print(
                                        '%s birthyear found in candidate %s. Category:%s births found in page. OK!'
                                        % (itemfoundbirthyear, itemfoundq,
                                           itemfoundbirthyear))
                                    print('Adding sitelink %s:%s' %
                                          (lang,
                                           page.title().encode('utf-8')))
                                    try:
                                        itemfound.setSitelink(
                                            page,
                                            summary=
                                            'BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)'
                                            % (lang, page.title(),
                                               page.title(), lang))
                                    except:  # NOTE(review): bare except.
                                        print("Error adding sitelink. Skiping.")
                                        break
                                    addBiographyClaims(repo=repo,
                                                       wikisite=wikisite,
                                                       item=itemfound,
                                                       page=page,
                                                       lang=lang)
                                    break
                    #no item found, or no candidates are useful
                    # Note: '' == 0 is False, so a fresh search with zero
                    # eliminations only matches via the '<search />' test.
                    if '<search />' in raw or (numcandidates == 0):
                        print('No useful item found. Creating a new one...')
                        #create item
                        newitemlabels = {lang: wtitle_}
                        newitem = pywikibot.ItemPage(repo)
                        newitem.editLabels(
                            labels=newitemlabels,
                            summary=
                            "BOT - Creating item for [[:%s:%s|%s]] (%s): %s %s"
                            % (lang, wtitle, wtitle, lang, 'human', gender))
                        newitem.get()
                        try:
                            newitem.setSitelink(
                                page,
                                summary=
                                'BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)'
                                % (lang, page.title(), page.title(), lang))
                        except:
                            print("Error adding sitelink. Skiping.")
                            # NOTE(review): this break leaves the whole page
                            # loop for the current language — confirm intent.
                            break
                        addBiographyClaims(repo=repo,
                                           wikisite=wikisite,
                                           item=newitem,
                                           page=page,
                                           lang=lang)
def newPages(all=False):
    """Tag new (or all) Vikidia articles with a maintenance banner.

    Builds job lists ('orphelin', 'catégoriser', 'portail', 'impasse') for
    each page and updates/removes the {{Maintenance}} banner accordingly.
    Returns a wiki-formatted log string of the modifications made.

    NOTE(review): this is Python 2 code (``print u'...'`` statement below);
    it will not parse under Python 3. ``all`` shadows the builtin.
    """
    global nbrModif, nbrTotal
    log = u''
    #BUGFIX
    # Presumably a null save to work around a session issue — TODO confirm.
    bugfixPage = pywikibot.Page(site,u"Utilisateur:LinedBot")
    bugfixPage.save('')
    #END OF FIX
    # Category handles used for classification; some are expanded to sets
    # of their subcategories.
    homonCat = pywikibot.Category(site,u"Homonymie")
    ebaucheCat = pywikibot.Category(site,u"Ébauche")
    ebaucheCat = set(ebaucheCat.subcategories(recurse=3))
    hiddenCat = pywikibot.Category(site,u"Catégorie cachée")
    hiddenCat = set(hiddenCat.subcategories())
    portalCat = pywikibot.Category(site,u"Liste d'articles")
    portalCat = set(portalCat.subcategories())
    ignoreCat = pywikibot.Category(site,u"Page ignorée par les robots")
    concoursCat = pywikibot.Category(site,u"Article VikiConcours")
    # Materialized once so the membership tests below are reused per page.
    deadendPagesList = list(pagegenerators.DeadendPagesPageGenerator(site=site))
    lonelyPagesList = list(pagegenerators.LonelyPagesPageGenerator(site=site))
    if all:
        pagesList = pagegenerators.AllpagesPageGenerator(namespace=0,includeredirects=False,site=site)
    else:
        pagesList = pagegenerators.NewpagesPageGenerator(total=50,site=site)
    for page in pagesList:
        try:
            pageTemp = page.get()
        except pywikibot.NoPage:
            pywikibot.output(u"Page %s does not exist; skipping."
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Page %s is a redirect; skipping."
                             % page.title(asLink=True))
        except pywikibot.LockedPage:
            pywikibot.output(u"Page %s is locked; skipping."
                             % page.title(asLink=True))
        else:
            # Only handle the page if it is neither a disambiguation page
            # nor a VikiConcours contest article.
            pageCat = page.categories()
            if (not homonCat in pageCat) and (not concoursCat in pageCat):
                # Only compute banner jobs if the page is not bot-ignored.
                jobList = []
                if not ignoreCat in pageCat:
                    # Page has no incoming links -> 'orphelin' (orphan).
                    if page in lonelyPagesList:
                        jobList.append(u'orphelin')
                    # No real (non-hidden, non-stub) category -> 'catégoriser'.
                    realCat = list(set(pageCat) - set(hiddenCat) - set(ebaucheCat))
                    nbCat = len(list(realCat))
                    if nbCat == 0:
                        jobList.append(u'catégoriser')
                    # Belongs to no portal -> 'portail'.
                    nbPort = len(set(pageCat) & set(portalCat))
                    if nbPort == 0:
                        jobList.append(u'portail')
                    # Links to no other page -> 'impasse' (dead end).
                    if page in deadendPagesList:
                        jobList.append(u'impasse')
                    """
                    # si la page fait plus de 2000 octets et ne contient aucun lien externe
                    if len(pageTemp) > 2000 and len(list(page.extlinks())) == 0:
                        jobList.append(u'sourcer')
                    """
                else:
                    print u'Skipping [[' + page.title() + ']], page in ignore list.'
                # Strip the existing banner and merge old jobs with new ones.
                pageTemp, oldJobList = removeBanner(pageTemp)
                jobList = updateJobList(oldJobList, jobList)
                job = u''
                # Symmetric difference: save only if the job set changed,
                # i.e. (A-B)+(B-A) is non-empty.
                diff = list(set(oldJobList).symmetric_difference(set(jobList)))
                if diff != []:
                    nbrTotal += 1
                    if len(jobList) > 0:
                        job = ','.join(jobList)
                        banner = u'{{Maintenance|job=' + job + '|date=~~~~~}}\n\n'
                        pageTemp = banner + pageTemp
                        summary = u'[[VD:Robot|Robot]] : Mise à jour du bandeau de maintenance.'
                    else:
                        summary = u'[[VD:Robot|Robot]] : Retrait du bandeau de maintenance.'
                    # Save with a callback; count/log only successful edits.
                    c = callback.Callback()
                    page.text = pageTemp
                    page.save(summary,callback=c)
                    if c.error == None:
                        nbrModif += 1
                        log +=u'*' + '{{Utilisateur:LinedBot/ExtLinker|' + page.title() + u'}} : Mise à jour du bandeau {{m|maintenance}} avec les paramètres suivants : ' + job + '\n'
    return log
def get_test_unconnected_page(site):
    """Return a single freshly created page from *site*."""
    newest = pagegenerators.NewpagesPageGenerator(site=site, total=1)
    return next(newest)