def test_category_init(self):
    self.assertRaises(ValueError, wptools.category, pageid='TEST')
    self.assertRaises(ValueError, wptools.category, 'TEST', pageid=123)
    cat = wptools.category('TEST')
    self.assertEqual(cat.params, {'lang': 'en', 'title': 'TEST'})
    self.assertTrue('requests' not in cat.data)
    try:
        cat = wptools.category(namespace='NOTINT')
        self.fail("failed to raise ValueError")
    except ValueError:
        pass
def test_category_get_members(self):
    cat = wptools.category('TEST')
    cat.cache['category'] = category.cache
    cat._set_data('category')
    self.assertEqual(len(cat.data['members']), 68)
    self.assertEqual(len(cat.data['subcategories']), 24)
    self.assertTrue('requests' not in cat.data)
def __get_category(self, category_name):
    '''
    Recursively fetch a category and collect the pages within it.
    If the category has subcategories, recurse into each of them;
    otherwise add its member page titles to self.pages (a set).
    @return: None
    '''
    # Fetch the member data for this wiki category
    wiki_data = wptools.category(category_name).get_members().data
    try:
        subcategories = wiki_data['subcategories']
    except KeyError:
        # Stop condition: no subcategories, so collect the member pages
        for member in wiki_data['members']:
            print(f"Added to Wikipedia pages list : {member['title']}")
            self.pages.add(member['title'])
        # Cache intermediate results every 100 pages
        if self.config['extraction']['subcache']:
            if len(self.pages) % 100 == 0:
                save_data('\n'.join(self.pages),
                          get_file_path(self.output_path, 'subcache'))
        return
    # For DEBUG - limit to the first 20 subcategories
    if self.config['debug']['DEBUG']:
        subcategories = subcategories[:20]
    for cat in subcategories:
        print(f"Entered Wikipedia category : {cat['title']}")
        self.__get_category(cat['title'])
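# A minimal standalone sketch (not from the original project) of the leaf-only
# traversal implemented by __get_category above: recurse while subcategories
# exist, and collect member titles only at the leaves. The starting category
# and the bare 'pages' set are illustrative assumptions.
import wptools


def collect_leaf_pages(category_name, pages):
    data = wptools.category(category_name, silent=True).get_members().data
    if 'subcategories' in data:
        for subcat in data['subcategories']:
            collect_leaf_pages(subcat['title'], pages)
    else:
        for member in data.get('members', []):
            pages.add(member['title'])


pages = set()
collect_leaf_pages('Category:Esperanto', pages)
print(len(pages), 'page titles collected')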
def get_cat_pages(self):
    # Get the titles of the corpus pages for the French network
    cat = wptools.category(self.category, lang=self.language)
    membres = cat.get_members()
    self.info_pages = pd.DataFrame.from_dict(membres.data['members'])
    self.info_pages = self.info_pages[['pageid', 'title']]
    # Fetch the page object for each title
    self.info_pages['content'] = self.info_pages['title'].apply(
        self.get_page_data)
    # Parse the page data into DataFrame columns
    self.info_pages['summary'] = self.info_pages['content'].apply(
        self.get_summary)
    self.info_pages['url'] = self.info_pages['content'].apply(self.get_url)
    self.info_pages['length'] = self.info_pages['content'].apply(
        self.get_length)
    self.info_pages['links'] = self.info_pages['content'].apply(
        self.get_links)
    self.info_pages['modification'] = self.info_pages['content'].apply(
        self.get_modification)
    self.info_pages['wikibase'] = self.info_pages['content'].apply(
        self.get_wikibase)
    self.info_pages['wikidata_url'] = self.info_pages['content'].apply(
        self.get_wikidata_url)
    self.info_pages['aliases'] = self.info_pages['content'].apply(
        self.get_aliases)
    self.info_pages = self.info_pages.rename(columns={'pageid': 'Id'})
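# The accessor helpers used above (get_page_data, get_summary, get_url, ...)
# are defined elsewhere in this class. A plausible sketch, assuming each
# 'content' cell holds a wptools page whose .data dict was filled by
# get_query(); the exact key names are assumptions to verify against the
# installed wptools version:

def get_page_data(self, title):
    return wptools.page(title, lang=self.language, silent=True).get_query()

def get_summary(self, page):
    return page.data.get('extract')

def get_url(self, page):
    return page.data.get('url')

def get_length(self, page):
    return page.data.get('length')

# ...the remaining accessors follow the same .data.get(key) pattern.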
def collect_data(self, category, depth):
    if depth:
        print("Extracting data for subcategories of {} at depth {}".format(
            category, depth))
        cat = wptools.category(category)
        cat_members = cat.get_members()
        if 'members' in cat_members.data:
            for cat_member in cat_members.data['members']:
                if cat_member['pageid'] not in self.get_ids():
                    try:
                        page = wptools.page(
                            pageid=cat_member['pageid']).get_parse()
                        url = page.get_query().data['url']
                        # Strip <ref> and other HTML-like markup
                        text = BeautifulSoup(
                            page.data['wikitext'], 'html.parser').get_text()
                        # Drop remaining {{...}} templates and [[...]] links
                        clean_text = re.sub(
                            r'\s*{.*}\s*|\s*\[.*\]\s*', '', text)
                        print('Saving page with Id: {} and URL: {}'.format(
                            cat_member['pageid'], url))
                        self.insert_page_content(
                            cat_member['pageid'], category, url, clean_text)
                    except Exception as e:
                        print("Exception occurred: {}".format(e))
        if 'subcategories' in cat_members.data:
            sub_cats = cat_members.data['subcategories']
            for sub_cat in sub_cats:
                self.categories.append(sub_cat)
                self.collect_data(sub_cat['title'], depth - 1)
def download_category(category, lang, name, include_subcat, results, level=""): if not os.path.exists(name): os.mkdir(name) wiki_cat = wptools.category(category, lang=lang, silent=True) wiki_cat.get_members() category_short = category.split(":")[1] print(level + " Scrapping category:{}".format(category)) if category_short in [x['title'] for x in wiki_cat.data['members']]: l = list( filter(lambda x: x['title'] == category_short, wiki_cat.data['members'])) wiki_cat.data['members'] = l if 'subcategories' in wiki_cat.data: wiki_cat.data.pop('subcategories') if include_subcat and 'subcategories' in wiki_cat.data: for subcat in wiki_cat.data['subcategories']: download_category(subcat['title'], lang, name, include_subcat, results, level + ">") for topic in wiki_cat.data['members']: title = topic['title'] if title in [x['title'] for x in results]: #print("title:{} ----- (exists)".format(title)) continue try: page = wptools.page(title, lang=lang, silent=True).get_query() except ValueError: results.append({"title": title, "ok": False}) continue thumb_img = page.images(fields='url', token='thumb') if thumb_img is None or len( thumb_img) == 0 or thumb_img[0]['url'] is None: print("title:{} ----- (no image)".format(title)) results.append({"title": title, "ok": False}) continue extract = "" if 'extract' in page.data: extract = page.data['extract'] page.get_more() tags = [ tag.split(':')[1].replace(' ', '_') for tag in page.data['categories'] ] url = thumb_img[0]['url'] info = { "title": title, "ok": True, "url": url, "extract": extract, "tags": tags } print("title:{} info:{}".format(title, extract[:100].replace('\n', ''))) results.append(info) save_info(info=results, folder=name)
def test_category_query(self):
    cat = wptools.category('TEST')
    qobj = wptools.query.WPToolsQuery()
    self.assertEqual(
        cat._query('random', qobj),
        ('https://en.wikipedia.org/w/api.php?'
         'action=query&format=json&formatversion=2'
         '&list=random&rnlimit=1&rnnamespace=14'))
    self.assertEqual(
        cat._query('category', qobj),
        ('https://en.wikipedia.org/w/api.php?'
         'action=query&format=json&formatversion=2'
         '&list=categorymembers&cmlimit=500&cmtitle=TEST'))
def test_category_get_members_continue(self):
    cat = wptools.category('TEST')
    cat.cache['category'] = category_cmcontinue.cache
    cat._set_data('category')
    self.assertTrue('continue' in cat.data)
    self.assertEqual(len(cat.data['members']), 1)
    qry = cat._query('category', wptools.query.WPToolsQuery())
    self.assertTrue('&cmcontinue=page|' in qry)
    self.assertTrue(qry.endswith('|42525291'))
    cat.cache['category'] = category.cache
    cat._set_data('category')
    self.assertTrue('cmcontinue' not in cat.data)
    self.assertEqual(len(cat.data['members']), 69)
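# For reference, a minimal sketch of the continuation protocol this test
# exercises: when a categorymembers listing is truncated, the MediaWiki API
# returns a 'continue' block whose 'cmcontinue' token must be echoed back on
# the next request until it disappears. This talks to the public API directly
# with requests; wptools wraps the same mechanism behind get_members().
import requests


def list_category_members(title, lang='en'):
    api = 'https://{}.wikipedia.org/w/api.php'.format(lang)
    params = {'action': 'query', 'format': 'json', 'formatversion': 2,
              'list': 'categorymembers', 'cmtitle': title, 'cmlimit': 500}
    members = []
    while True:
        data = requests.get(api, params=params).json()
        members.extend(data['query']['categorymembers'])
        if 'continue' not in data:
            return members
        params['cmcontinue'] = data['continue']['cmcontinue']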
def get_categories_and_members(self, category, depth):
    """
    Start from the given category and download Wikipedia content up to
    the specified depth of subcategories.
    :param category: title of the category to explore
    :param depth: number of subcategory levels left to descend
    :return: None
    """
    print(u'Checking for subcategories of {} at depth {}'.format(category, depth))
    if depth:
        # Get details of this category; members are pages in this category
        cat = wptools.category(category)
        cat_members = cat.get_members()
        # First let's save any members (pages) of this category
        if 'members' in cat_members.data:
            for cat_member in cat_members.data['members']:
                # Check to see if we have this page already, ignore if we do
                if cat_member['pageid'] not in self.get_page_ids():
                    # If we don't have this page, then get the page content
                    page = wptools.page(pageid=cat_member['pageid']).get_parse()
                    # Get URL in Wikipedia
                    url = page.get_query().data['url']
                    # Remove <ref> and other HTML syntax
                    text = BeautifulSoup(page.data['wikitext'], 'html.parser').get_text()
                    # Remove other markup such as [[...]] and {{...}}
                    clean_content = re.sub(r'\s*{.*}\s*|\s*\[.*\]\s*', '', text)
                    # Now store
                    print('Saving pageid {} / url {}'.format(cat_member['pageid'], url))
                    self._save_page_content(category, cat_member['pageid'], url, clean_content)
        # Now iterate through any subcategories
        if 'subcategories' in cat_members.data:
            for subcat in cat_members.data['subcategories']:
                self.categories.append(subcat)
                # Recurse until we've explored Wikipedia to the requested depth
                self.get_categories_and_members(subcat['title'], depth - 1)
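# Hedged usage sketch: with the 'if depth:' guard and the depth - 1 recursion
# above, depth=2 saves the members of the starting category and of its direct
# subcategories, then stops once depth reaches 0. The class name
# WikiDownloader is an assumption for illustration.
downloader = WikiDownloader()
downloader.get_categories_and_members('Category:Machine learning', depth=2)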
import os
from time import sleep

import wptools


def extract_category_members(cat_name, prefix):
    sleep(1)  # be polite to the API
    cat = wptools.category(cat_name)
    prefix2 = prefix + "/" + cat_name.replace("Category:", "").replace(
        "category:", "").replace(" ", "_")
    # Guard against runaway recursion producing overlong paths
    if len(prefix2) > 2500:
        return
    # Fetch members once and reuse the result instead of repeating the request
    data = cat.get_members().data
    members = [
        x['title'] for x in data['members']
        if "list of" not in x['title'].lower()
    ]
    if len(members) > 2:
        if not os.path.exists(path + prefix):
            os.makedirs(path + prefix)
        with open(path + prefix2 + '.txt', 'w') as f:
            f.write('\n'.join(members))
    if 'subcategories' in data:
        for c_name in data['subcategories']:
            extract_category_members(c_name['title'], prefix2)
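# Hedged usage sketch for extract_category_members above: 'path' is a
# module-level output root the function reads as a global, and the starting
# category is a placeholder. Each category with more than two member pages
# becomes a .txt file of titles, with subcategories mirrored as nested names.
path = './corpus'
extract_category_members('Category:Linguistics', '')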
def test_category_get_members_namespace(self):
    cat = wptools.category('TEST', namespace=0)
    cat.cache['category'] = category.cache
    cat._set_data('category')
    self.assertEqual(len(cat.data['members']), 68)
    self.assertTrue('requests' not in cat.data)
def test_category_caching(self):
    cat = wptools.category('TEST', silent=SILENT_FLAG)
    cat.cache['category'] = {'response': None}
    cat.get_members()
    self.assertEqual(len(cat.data), 0)
    self.assertTrue('requests' not in cat.data)
def test_category_random(self):
    cat = wptools.category('TEST')
    cat.cache = {'random': random_query.cache}
    cat._set_data('random')
    self.assertEqual(cat.data['title'], 'RANDOM TEST TITLE')
def test_category_get_members(self):
    cat = wptools.category('TEST')
    cat.cache['category'] = category.cache
    cat._set_data('category')
    # assertTrue(len(...), 92) would always pass, since the second argument
    # is just the failure message; assert the member count explicitly
    self.assertEqual(len(cat.data['members']), 92)
def test_category_caching(self):
    cat = wptools.category('TEST', silent=True)
    cat.cache['category'] = {'response': None}
    cat.get_members()
    self.assertEqual(len(cat.data), 0)
def test_category_init(self):
    self.assertRaises(ValueError, wptools.category, pageid='TEST')
    self.assertRaises(ValueError, wptools.category, 'TEST', pageid=123)
    cat = wptools.category('TEST')
    self.assertEqual(cat.params, {'lang': 'en', 'title': 'TEST'})