Example #1
    def fetchWikipediaLanguages(self):
        site = wikipedia('en')

        # One direct API call; smlimit='max' returns every language at once.
        apiRes = site('sitematrix',
                      smtype='language',
                      smstate='all',
                      smlangprop='code|name|site|dir|localname',
                      smsiteprop='dbname|code|sitename|url|lang',
                      smlimit='max')['sitematrix']

        for ind in apiRes:
            # The sitematrix result carries a 'count' key next to the
            # numbered language entries; skip it.
            if ind == 'count':
                continue
            # Each entry looks like:
            # {'code': 'zu', 'name': 'isiZulu', 'dir': 'ltr', 'localname': 'zulu',
            #  'site': [{'url': 'https://zu.wikipedia.org', 'dbname': 'zuwiki',
            #            'code': 'wiki', 'lang': 'zu', 'sitename': 'Wikipedia'}, ...]}
            data = apiRes[ind]

            for wiki in data['site']:
                # Skip wikis that have been closed.
                if wiki.get('closed'):
                    continue

                # Keep the Wikipedia ('wiki') entry and record its dbname.
                if wiki['code'] == 'wiki':
                    self.allLanguages.append(wiki['dbname'])
                    break
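Calling the site object directly, as in site('sitematrix', ...), issues a single API request and returns the parsed response with no continuation handling. A minimal standalone sketch of the same call (the printed fields follow the response shape shown in the comment above):

from pywikiapi import wikipedia

site = wikipedia('en')
matrix = site('sitematrix', smtype='language',
              smlangprop='code|localname', smlimit='max')['sitematrix']
for key, entry in matrix.items():
    if key == 'count':  # skip the count key next to the numbered entries
        continue
    print(entry['code'], entry['localname'])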
Example #2
    def subcats(self, wiki, category):
        site = wikipedia(wiki.replace('wiki', ''))
        categories = []
        for r in site.query(list='categorymembers',
                            cmtitle=category,
                            cmlimit="500"):
            for cat in r.categorymembers:
                # Namespace 14 is the Category namespace: keep subcategories only.
                if cat.get('ns') != 14:
                    continue

                categories.append(cat.get('title'))

        mapp = {}

        for category in categories:
            print(category)
            # Strip the localized "Category:" prefix from the title.
            category = category.partition(':')[2]

            formatted = self.handle_subcategories(wiki.replace('wiki', ''),
                                                  category, 1)

            mapp.update({category: formatted})

        with open('test1.json', 'w', encoding='utf-8') as file_w:
            file_w.write(
                json.dumps(mapp, ensure_ascii=False, indent=4, sort_keys=True))

        return mapp
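site.query() transparently follows the API's continuation tokens, so the loop above sees every batch of up to 500 members. The namespace check can also be pushed to the server with cmtype=subcat, a standard categorymembers parameter; a minimal sketch (the category title is a placeholder):

from pywikiapi import wikipedia

site = wikipedia('en')
subcategories = []
for r in site.query(list='categorymembers',
                    cmtitle='Category:Physics',  # placeholder category
                    cmtype='subcat',             # server-side namespace filter
                    cmlimit='max'):
    subcategories.extend(member.title for member in r.categorymembers)
print(len(subcategories))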
Example #3
def page_exists(wiki, title):
    # The be_x_oldwiki database name maps to the renamed be-tarask subdomain.
    if wiki == 'be_x_oldwiki':
        wiki = 'be-taraskwiki'
    site = wikipedia(wiki.replace('wiki', ''))
    res = site.query(titles=[title])
    for r in res:
        for page in r.pages:
            # The API flags nonexistent titles with a 'missing' marker.
            if page.get('missing'):
                return False

    return True
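A hypothetical usage of this helper, assuming database-style wiki names as input (the second title is a made-up example of a missing page):

print(page_exists('enwiki', 'Main Page'))              # expected: True
print(page_exists('enwiki', 'Some nonexistent title')) # expected: False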
Example #4
    def getWikidataItems(self, allItems):
        self.fetchWikipediaLanguages()
        self.wdSite = wikipedia('www', 'wikidata')

        filename = config['filename-api']

        if os.path.isfile(filename):
            # The cache below is written with json.dumps, so read it back
            # with json.loads rather than eval().
            with open(filename, 'r', encoding='utf-8') as file_r:
                data = json.loads(file_r.read())
            self.oneBatch(data, False)
        else:
            for group in chunker(allItems, 50):
                self.oneBatch(group)

            saveToFile(filename,
                       json.dumps(self.itemData_FULL, ensure_ascii=False))
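chunker is a project helper that is not shown in this example; a minimal stand-in with the behavior the call site implies (the name and signature are assumptions):

def chunker(seq, size):
    # Yield consecutive slices of at most `size` items each.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

list(chunker([1, 2, 3, 4, 5], 2))  # [[1, 2], [3, 4], [5]]

Batches of 50 line up with the usual per-request cap on wbgetentities ids for anonymous clients.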
Example #5
    def test_query_pages(self):
        """Iterate over query results for two iterations (list=allpages)"""
        site = wikipedia(session=Tests.session)
        last_result = None
        for res in site.query(list='allpages', aplimit=1):
            self.assertIn('allpages', res)
            self.assertIsInstance(res['allpages'], list)
            self.assertEqual(len(res['allpages']), 1)
            self.assertIsInstance(res['allpages'][0], dict)
            self.assertIsNotNone(res['allpages'][0]['ns'])
            self.assertIsNotNone(res['allpages'][0]['title'])
            self.assertIsNotNone(res['allpages'][0]['pageid'])
            if last_result is None:
                last_result = res['allpages'][0]['pageid']
            else:
                self.assertNotEqual(res['allpages'][0]['pageid'], last_result)
                break
Example #6
import time
from typing import Dict, List, Tuple


def get_pages() -> Tuple[List[Dict], List[int]]:
    """
    Get all Lua modules from the wiki, excluding doc pages
    :return: A list of page info and a list of pageids where errors occurred
    """
    site = wikipedia('en')
    pages = []
    modules_names = []
    error_pages = []
    # Namespace 828 is the Module (Scribunto) namespace; request as many
    # pages per batch as the API allows.
    for r in site.query(list='allpages', apnamespace="828", aplimit="max"):
        for page in r.allpages:
            # Skip documentation pages, test cases, user modules, and
            # submodules of a module that was already collected.
            if ("/doc" not in page.title and "testcase" not in page.title
                    and "Module:User:" not in page.title
                    and page.title.split("/")[0] not in modules_names):
                try:
                    # Record the base name so submodules are skipped later.
                    modules_names.append(page.title.split("/")[0])
                    # Fetch the module's Lua source.
                    for module in site.iterate("parse",
                                               pageid=page.pageid,
                                               prop="wikitext"):
                        data = {
                            'title': module.title,
                            'pageid': module.pageid,
                            'size': len(module.wikitext)
                        }
                        pages.append(data)
                        print(f"{module.title} successfully added")
                        save_script(module)
                        # Be polite to the API: wait 1 second between modules.
                        time.sleep(1)
                except Exception:
                    # Record pages that raised errors.
                    error_pages.append(page.pageid)
                    print(
                        f"An error occurred while downloading the module: {page.title}"
                    )
    return pages, error_pages
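A minimal standalone sketch of the same wikitext fetch for a single module, assuming formatversion 2 semantics where wikitext comes back as a plain string, consistent with len(module.wikitext) above (the module title is a placeholder):

from pywikiapi import wikipedia

site = wikipedia('en')
res = site('parse', page='Module:Example', prop='wikitext')  # placeholder title
print(len(res['parse']['wikitext']))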
Example #7
from pywikiapi import wikipedia
import json
from helpers import chunks

fileNameInput = 'data/petscan_data_all_wikidata_items.txt'
fileOutputRaw = 'data/wikidata_raw_output.txt'
fileOutput = 'data/wikidata_output.txt'

site = wikipedia('www', 'wikidata')

CHUNK_SIZE = 50

batchCounter = 0

ALL_DATA = {}
BY_LANG = {}


def formatData():
    for entity in ALL_DATA:
        for wiki in ALL_DATA[entity]:
            if wiki in BY_LANG:
                BY_LANG[wiki].append(ALL_DATA[entity][wiki])
            else:
                BY_LANG[wiki] = [ALL_DATA[entity][wiki]]


def oneBatch(wdItems):
    global batchCounter
    batchCounter += 1
    print(batchCounter)
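The fragment above ends right after the counter is printed. A hedged sketch of a plausible continuation, assuming each chunk of at most CHUNK_SIZE ids is resolved with a single wbgetentities call and merged into ALL_DATA in the shape formatData() expects (everything past the print is an assumption, not the original code):

def oneBatch(wdItems):
    global batchCounter
    batchCounter += 1
    print(batchCounter)
    # Assumed continuation: one wbgetentities call per chunk of <= 50 ids.
    res = site('wbgetentities', ids='|'.join(wdItems), props='sitelinks')
    for qid, entity in res['entities'].items():
        # Map each wiki dbname to the linked page title on that wiki.
        ALL_DATA[qid] = {
            wiki: link['title']
            for wiki, link in entity.get('sitelinks', {}).items()
        }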
Example #8
    def __init__(self):
        self.site = wikipedia('www', 'wikidata')
Example #9
from pywikiapi import wikipedia

site = wikipedia(
    'en',
    headers={
        'User-Agent':
        'Mozilla/5.0 (compatible; EarwigBotCCI/0.1; [email protected])'
    })
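Custom headers like this satisfy the Wikimedia User-Agent policy. The same effect can be had by passing a preconfigured requests session, as the test examples below do with session=Tests.session; a minimal sketch (the agent string and contact address are placeholders):

import requests
from pywikiapi import wikipedia

session = requests.Session()
session.headers['User-Agent'] = 'MyTool/0.1 (contact: example@example.org)'  # placeholder
site = wikipedia('en', session=session)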
Example #10
    def test_get_metadata(self):
        """Get en.wikipedia metadata"""
        site = wikipedia(session=Tests.session)
        result = site('query', meta='siteinfo')
        self.assertEqual(result.query.general.mainpage, 'Main Page')
Example #11
    def test_url(self):
        """Test default WMF site object creation"""
        site = wikipedia(session=Tests.session)
        self.assertEqual(site.url, 'https://en.wikipedia.org/w/api.php')