def fetchWikipediaLanguages(self):
    site = wikipedia('en')
    apiRes = site('sitematrix',
                  smtype='language',
                  smstate='all',
                  smlangprop='code|name|site|dir|localname',
                  smsiteprop='dbname|code|sitename|url|lang',
                  smlimit='max')['sitematrix']
    for ind in apiRes:
        if ind == 'count':
            continue
        # Example entry: {'code': 'zu', 'name': 'isiZulu', 'dir': 'ltr', 'localname': 'zulu',
        #   'site': [{'url': 'https://zu.wikipedia.org', 'dbname': 'zuwiki', 'code': 'wiki',
        #             'lang': 'zu', 'sitename': 'Wikipedia'}, ...]}
        data = apiRes[ind]
        for wikiSite in data['site']:
            # Skip closed wikis and keep only the Wikipedia ('wiki') entry for each language
            if 'closed' in wikiSite and wikiSite['closed']:
                continue
            if wikiSite['code'] == 'wiki':
                self.allLanguages.append(wikiSite['dbname'])
                break
def subcats(self, wiki, category):
    site = wikipedia(wiki.replace('wiki', ''))
    categories = []
    # Collect all subcategories (namespace 14) of the given category
    for r in site.query(list='categorymembers', cmtitle=category, cmlimit="500"):
        for cat in r.categorymembers:
            if cat.get('ns') != 14:
                continue
            categories.append(cat.get('title'))
    mapp = {}
    for subcat in categories:
        print(subcat)
        # Strip the "Category:" namespace prefix
        subcat = subcat.partition(':')[2]
        formatted = self.handle_subcategories(wiki.replace('wiki', ''), subcat, 1)
        mapp[subcat] = formatted
    with open('test1.json', 'w', encoding='utf-8') as file_w:
        file_w.write(json.dumps(mapp, ensure_ascii=False, indent=4, sort_keys=True))
    return mapp
def page_exists(wiki, title):
    # The be_x_old database name corresponds to the be-tarask language code
    if wiki == 'be_x_oldwiki':
        wiki = 'be-taraskwiki'
    site = wikipedia(wiki.replace('wiki', ''))
    res = site.query(titles=[title])
    for r in res:
        for page in r.pages:
            if page.get('missing'):
                return False
    return True
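# A minimal usage sketch for page_exists above; the dbname and title are
# hypothetical examples, not taken from the original code.
if page_exists('enwiki', 'Python (programming language)'):
    print('Page exists on en.wikipedia')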
def getWikidataItems(self, allItems):
    self.fetchWikipediaLanguages()
    self.wdSite = wikipedia('www', 'wikidata')
    filename = config['filename-api']
    if os.path.isfile(filename):
        # Reuse a previously saved API response instead of querying again
        with open(filename, 'r', encoding='utf-8') as file_r:
            data = json.loads(file_r.read())
        self.oneBatch(data, False)
    else:
        # The Wikidata API accepts at most 50 ids per request
        for group in chunker(allItems, 50):
            self.oneBatch(group)
        saveToFile(filename, json.dumps(self.itemData_FULL, ensure_ascii=False))
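# chunker is not shown in this snippet; a minimal sketch of what it presumably
# does (yield fixed-size slices of a sequence). The name and signature are taken
# from the call above, the body is an assumption.
def chunker(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]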
def test_query_pages(self):
    """Iterate over query results for two iterations (list=allpages)"""
    site = wikipedia(session=Tests.session)
    last_result = None
    for res in site.query(list='allpages', aplimit=1):
        self.assertIn('allpages', res)
        self.assertIsInstance(res['allpages'], list)
        self.assertEqual(len(res['allpages']), 1)
        self.assertIsInstance(res['allpages'][0], dict)
        self.assertIsNotNone(res['allpages'][0]['ns'])
        self.assertIsNotNone(res['allpages'][0]['title'])
        self.assertIsNotNone(res['allpages'][0]['pageid'])
        if last_result is None:
            last_result = res['allpages'][0]['pageid']
        else:
            self.assertNotEqual(res['allpages'][0]['pageid'], last_result)
            break
def get_pages() -> Tuple[List[Dict], List[int]]:
    """
    Get all Lua modules from the wiki, excluding documentation pages.
    :return: a list of page info dicts and a list of pageids where errors occurred
    """
    site = wikipedia('en')
    pages = []
    modules_names = []
    error_pages = []
    # Ask the API for Lua module pages (namespace 828), 500 (max) per iteration
    for r in site.query(list='allpages', apnamespace="828", aplimit="max"):
        # Iterate over the results
        for page in r.allpages:
            # Skip documentation pages, test cases, user modules and submodules already seen
            if ("/doc" not in page.title and "testcase" not in page.title
                    and "Module:User:" not in page.title
                    and page.title.split("/")[0] not in modules_names):
                try:
                    # Remember the base module name so its submodules are not fetched again
                    modules_names.append(page.title.split("/")[0])
                    # Get the module's Lua content
                    for module in site.iterate("parse", pageid=page.pageid, prop="wikitext"):
                        data = {
                            'title': module.title,
                            'pageid': module.pageid,
                            'size': len(module.wikitext)
                        }
                        pages.append(data)
                        print(f"{module.title} successfully added")
                        save_script(module)
                        # Wait 1 second between requests
                        time.sleep(1)
                except Exception:
                    # Record pages where an error occurred
                    error_pages.append(page.pageid)
                    print(f"An error occurred while downloading the module: {page.title}")
    return pages, error_pages
from pywikiapi import wikipedia
import json
from helpers import chunks

fileNameInput = 'data/petscan_data_all_wikidata_items.txt'
fileOutputRaw = 'data/wikidata_raw_output.txt'
fileOutput = 'data/wikidata_output.txt'
site = wikipedia('www', 'wikidata')
CHUNK_SIZE = 50
batchCounter = 0
ALL_DATA = {}
BY_LANG = {}


def formatData():
    for entity in ALL_DATA:
        for wiki in ALL_DATA[entity]:
            if wiki in BY_LANG:
                BY_LANG[wiki].append(ALL_DATA[entity][wiki])
            else:
                BY_LANG[wiki] = [ALL_DATA[entity][wiki]]


def oneBatch(wdItems):
    global batchCounter
    batchCounter += 1
    print(batchCounter)
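# A hedged sketch of what the truncated oneBatch above might do next: fetch the
# sitelinks for one chunk of items with the wbgetentities action and store them
# in ALL_DATA keyed by entity id and wiki dbname. The exact fields kept by the
# original script are an assumption; oneBatchFetch is an illustrative name.
def oneBatchFetch(wdItems):
    res = site('wbgetentities', ids='|'.join(wdItems), props='sitelinks')
    for qid, entity in res['entities'].items():
        ALL_DATA[qid] = {
            wiki: link['title']
            for wiki, link in entity.get('sitelinks', {}).items()
        }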
def __init__(self):
    self.site = wikipedia('www', 'wikidata')
from pywikiapi import wikipedia

site = wikipedia(
    'en',
    headers={
        'User-Agent': 'Mozilla/5.0 (compatible; EarwigBotCCI/0.1; [email protected])'
    })
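# A short, hedged usage sketch for the site object above; the title queried is
# an arbitrary example. Every request made through this object is sent with the
# custom User-Agent header configured above.
for res in site.query(prop='info', titles=['Main Page']):
    for page in res.pages:
        print(page.title, page.pageid)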
def test_get_metadata(self):
    """Get en.wikipedia metadata"""
    site = wikipedia(session=Tests.session)
    result = site('query', meta='siteinfo')
    self.assertEqual(result.query.general.mainpage, 'Main Page')
def test_url(self):
    """Test default WMF site object creation"""
    site = wikipedia(session=Tests.session)
    self.assertEqual(site.url, 'https://en.wikipedia.org/w/api.php')