예제 #1
0
    def test_category_init(self):
        """Check constructor validation and the initial state of a category.

        The three rejected call shapes must raise ValueError; a plain
        title must produce the default params and an empty request log.
        """
        self.assertRaises(ValueError, wptools.category, pageid='TEST')
        self.assertRaises(ValueError, wptools.category, 'TEST', pageid=123)

        cat = wptools.category('TEST')
        self.assertEqual(cat.params, {'lang': 'en', 'title': 'TEST'})
        self.assertNotIn('requests', cat.data)

        # Same callable form as the checks above, instead of a hand-rolled
        # try / self.fail / except scaffold.
        self.assertRaises(ValueError, wptools.category, namespace='NOTINT')
예제 #2
0
 def test_category_get_members(self):
     """Parse the ``category.cache`` fixture and check the resulting counts."""
     subject = wptools.category('TEST')
     subject.cache['category'] = category.cache
     subject._set_data('category')

     parsed = subject.data
     for key, expected_count in (('members', 68), ('subcategories', 24)):
         self.assertEqual(len(parsed[key]), expected_count)
     # No request bookkeeping should appear when data comes from the cache.
     self.assertNotIn('requests', parsed)
    def __get_category(self, category_name):
        '''
        Recursively walk a Wikipedia category and collect page titles.

        Fetches the members of ``category_name`` through wptools. If the
        result contains a 'subcategories' entry, every subcategory is visited
        recursively. Otherwise the titles of the category's member pages are
        added to ``self.pages`` (a set).

        NOTE(review): pages that sit directly inside a category which also
        has subcategories are never added - only categories without
        subcategories contribute titles. This is unchanged from the previous
        behaviour; confirm it is intended.

        @param category_name: title of the category to fetch
        @return: None
        '''
        wiki_data = wptools.category(category_name).get_members().data

        try:
            subcategories = wiki_data['subcategories']
        except KeyError:
            # Stop condition: no subcategories, so record the member pages.
            # Only KeyError is caught so that real failures (and Ctrl-C) are
            # not silently mistaken for "this category is a leaf".
            for member in wiki_data.get('members', []):
                print(f"Added to Wikipedia pages list : {member['title']}")
                self.pages.add(member['title'])

                # Periodically dump the titles collected so far, every time
                # the set size reaches a multiple of 100.
                if (self.config['extraction']['subcache']
                        and len(self.pages) % 100 == 0):
                    save_data('\n'.join(self.pages),
                              get_file_path(self.output_path, 'subcache'))
            return

        # For DEBUG - only follow the first 20 subcategories
        if self.config['debug']['DEBUG']:
            subcategories = subcategories[:20]

        for cat in subcategories:
            print(f"Entered to Wikipedia Category : {cat['title']}")
            self.__get_category(cat['title'])
 def get_cat_pages(self):
     """Build ``self.info_pages`` from the member pages of ``self.category``.

     The members are fetched with wptools in ``self.language``, reduced to
     their 'pageid' and 'title' columns, enriched with one column per
     extractor below, and finally 'pageid' is renamed to 'Id'.
     """
     listing = wptools.category(self.category, lang=self.language).get_members()
     self.info_pages = pd.DataFrame.from_dict(listing.data['members'])
     self.info_pages = self.info_pages[['pageid', 'title']]

     # Raw per-page data; every derived column below is computed from it.
     self.info_pages['content'] = self.info_pages['title'].apply(
         self.get_page_data)

     # (column name, extractor applied to 'content'), in insertion order.
     derived_columns = (
         ('summary', self.get_summary),
         ('url', self.get_url),
         ('length', self.get_length),
         ('links', self.get_links),
         ('modification', self.get_modification),
         ('wikibase', self.get_wikibase),
         ('wikidata_url', self.get_wikidata_url),
         ('aliases', self.get_aliases),
     )
     for column, extractor in derived_columns:
         self.info_pages[column] = self.info_pages['content'].apply(extractor)

     self.info_pages = self.info_pages.rename(columns={'pageid': 'Id'})
    def collect_data(self, category, depth):
        """Store the pages of ``category`` and recurse into its subcategories.

        Does nothing when ``depth`` is falsy. Otherwise every member page
        whose id is not already reported by ``self.get_ids()`` is fetched,
        stripped of markup and passed to ``self.insert_page_content``; each
        subcategory is appended to ``self.categories`` and processed with
        ``depth - 1``.
        """
        if not depth:
            return

        print("Extracting data for subcategories of {} at depth {}".format(
            category, depth))
        listing = wptools.category(category).get_members().data

        for entry in listing.get('members', ()):
            page_id = entry['pageid']
            if page_id in self.get_ids():
                continue
            # Best effort: a failure on one page is reported and skipped.
            try:
                parsed_page = wptools.page(pageid=page_id).get_parse()
                url = parsed_page.get_query().data['url']
                soup = BeautifulSoup(parsed_page.data['wikitext'],
                                     'html.parser')
                # Drop {...} and [...] spans left over after tag removal.
                clean_text = re.sub(
                    r'\s*{.*}\s*|\s*\[.*\]\s*', '', soup.get_text())
                print('Saving page with Id: {} and URL: {}'.format(
                    page_id, url))
                self.insert_page_content(page_id, category, url, clean_text)
            except Exception as e:
                print("Exception occured: {}".format(e))

        for child in listing.get('subcategories', ()):
            self.categories.append(child)
            self.collect_data(child['title'], depth - 1)
예제 #6
0
def _strip_namespace(title):
    """Return ``title`` without its leading ``Namespace:`` prefix, if any."""
    _, separator, remainder = title.partition(":")
    return remainder if separator else title


def download_category(category, lang, name, include_subcat, results, level=""):
    """Collect thumbnail, extract and tags for every page of a category.

    One dict per page is appended to ``results`` and the whole list is saved
    with ``save_info`` after each successfully described page. Pages whose
    title is already present in ``results`` are skipped. Pages that cannot be
    queried or that have no thumbnail are recorded with ``"ok": False``.

    Args:
        category: Category title, normally in ``Namespace:Name`` form.
        lang: Wikipedia language code passed to wptools.
        name: Output folder; created if it does not exist.
        include_subcat: When true, subcategories are processed recursively
            before the pages of ``category`` itself.
        results: List of result dicts, extended in place.
        level: Prefix printed before progress lines; grows by ">" per
            recursion level.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(name, exist_ok=True)
    wiki_cat = wptools.category(category, lang=lang, silent=True)
    wiki_cat.get_members()
    # partition keeps titles that themselves contain ':' intact and does not
    # fail when the namespace prefix is missing.
    category_short = _strip_namespace(category)
    print(level + " Scrapping category:{}".format(category))

    members = wiki_cat.data.get('members', [])
    # If the category has a page with the same name as itself, keep only that
    # page and do not descend into subcategories.
    eponymous = [m for m in members if m['title'] == category_short]
    if eponymous:
        members = eponymous
        wiki_cat.data['members'] = eponymous
        wiki_cat.data.pop('subcategories', None)

    if include_subcat:
        for subcat in wiki_cat.data.get('subcategories', []):
            download_category(subcat['title'], lang, name, include_subcat,
                              results, level + ">")

    # Built once after the recursion above, then kept in sync, instead of
    # rebuilding a list of titles for every topic.
    seen_titles = {entry['title'] for entry in results}
    for topic in members:
        title = topic['title']
        if title in seen_titles:
            continue
        # Every path below appends an entry for this title.
        seen_titles.add(title)
        try:
            page = wptools.page(title, lang=lang, silent=True).get_query()
        except ValueError:
            results.append({"title": title, "ok": False})
            continue
        thumb_img = page.images(fields='url', token='thumb')
        if not thumb_img or thumb_img[0]['url'] is None:
            print("title:{} ----- (no image)".format(title))
            results.append({"title": title, "ok": False})
            continue
        extract = page.data.get('extract', "")
        page.get_more()
        tags = [
            _strip_namespace(tag).replace(' ', '_')
            for tag in page.data.get('categories', [])
        ]
        info = {
            "title": title,
            "ok": True,
            "url": thumb_img[0]['url'],
            "extract": extract,
            "tags": tags
        }
        print("title:{} info:{}".format(title,
                                        extract[:100].replace('\n', '')))
        results.append(info)
        save_info(info=results, folder=name)
예제 #7
0
 def test_category_query(self):
     """Check the API URLs built for the 'random' and 'category' actions."""
     api_prefix = ('https://en.wikipedia.org/w/api.php?'
                   'action=query&format=json&formatversion=2')
     expected_urls = (
         ('random', api_prefix + '&list=random&rnlimit=1&rnnamespace=14'),
         ('category',
          api_prefix + '&list=categorymembers&cmlimit=500&cmtitle=TEST'),
     )

     subject = wptools.category('TEST')
     builder = wptools.query.WPToolsQuery()
     for action, expected in expected_urls:
         self.assertEqual(subject._query(action, builder), expected)
예제 #8
0
    def test_category_get_members_continue(self):
        """Check continuation handling across two cached responses."""
        subject = wptools.category('TEST')

        # First response: the category_cmcontinue fixture.
        subject.cache['category'] = category_cmcontinue.cache
        subject._set_data('category')
        self.assertIn('continue', subject.data)
        self.assertEqual(len(subject.data['members']), 1)

        # The follow-up query must carry the continuation token.
        next_query = subject._query('category', wptools.query.WPToolsQuery())
        self.assertIn('&cmcontinue=page|', next_query)
        self.assertTrue(next_query.endswith('|42525291'))

        # Second response: the category fixture, parsed on the same object.
        subject.cache['category'] = category.cache
        subject._set_data('category')
        self.assertNotIn('cmcontinue', subject.data)
        self.assertEqual(len(subject.data['members']), 69)
    def get_categories_and_members(self, category, depth):
        """
        Save the pages of a category, then descend into its subcategories.

        A progress line is always printed. When ``depth`` is falsy nothing
        else happens. Otherwise each member page whose id is not already in
        ``self.get_page_ids()`` is fetched, cleaned and handed to
        ``self._save_page_content``; each subcategory is appended to
        ``self.categories`` and explored with ``depth - 1``.

        :param category: title of the category to process
        :param depth: remaining number of category levels to explore
        :return: None
        """
        print(u'Checking for subcategories of {} at depth {}'.format(category, depth))
        if not depth:
            return

        category_data = wptools.category(category).get_members().data

        # Members are the pages that belong to this category.
        for member in category_data.get('members', ()):
            member_id = member['pageid']
            if member_id in self.get_page_ids():
                # Already stored, nothing to do.
                continue

            page = wptools.page(pageid=member_id).get_parse()
            url = page.get_query().data['url']

            # Strip HTML-like tags first, then {...} and [...] spans.
            soup = BeautifulSoup(page.data['wikitext'], 'html.parser')
            clean_content = re.sub(r'\s*{.*}\s*|\s*\[.*\]\s*', '', soup.get_text())

            print('Saving pageid {} / url {}'.format(member_id, url))
            self._save_page_content(category, member_id, url, clean_content)

        for subcat in category_data.get('subcategories', ()):
            self.categories.append(subcat)
            # Recurse with one level less until depth is used up.
            self.get_categories_and_members(subcat['title'], depth - 1)
예제 #10
0
def extract_category_members(cat_name, prefix):
    """Write the page titles of a category tree to nested text files.

    For ``cat_name`` a file ``<path><prefix>/<name>.txt`` is written with one
    title per line, provided more than two titles remain after dropping
    those containing "list of". Each subcategory is then processed
    recursively, using this category's path as its prefix. ``path`` is a
    module-level name defined outside this function.

    Args:
        cat_name: Category title; "Category:"/"category:" is removed and
            spaces become underscores when building the file name.
        prefix: Path of the parent category, relative to ``path``.
    """
    sleep(1)  # presumably throttles requests to the API - TODO confirm
    cat = wptools.category(cat_name)
    prefix2 = prefix + "/" + cat_name.replace("Category:", "").replace(
        "category:", "").replace(" ", "_")

    # Stop descending once the accumulated path gets too long.
    if len(prefix2) > 2500:
        return

    # Fetch once and reuse, instead of calling get_members() three times.
    data = cat.get_members().data

    members = [
        x['title'] for x in data.get('members', [])
        if "list of" not in x['title'].lower()
    ]
    if len(members) > 2:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path + prefix, exist_ok=True)
        # Titles may contain non-ASCII characters; do not depend on the
        # platform default encoding.
        with open(path + prefix2 + '.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(members))

    for subcategory in data.get('subcategories', []):
        extract_category_members(subcategory['title'], prefix2)
예제 #11
0
 def test_category_get_members_namespace(self):
     """Parse the ``category.cache`` fixture for a category built with namespace=0."""
     subject = wptools.category('TEST', namespace=0)
     subject.cache['category'] = category.cache
     subject._set_data('category')

     parsed = subject.data
     self.assertEqual(len(parsed['members']), 68)
     self.assertNotIn('requests', parsed)
예제 #12
0
 def test_category_caching(self):
     """With a cached entry present, get_members() must leave data empty."""
     subject = wptools.category('TEST', silent=SILENT_FLAG)
     # Pre-seed the cache with an entry whose response is None.
     subject.cache['category'] = {'response': None}
     subject.get_members()

     self.assertEqual(len(subject.data), 0)
     self.assertNotIn('requests', subject.data)
예제 #13
0
 def test_category_random(self):
     """Parse the ``random_query.cache`` fixture and check the title it yields."""
     subject = wptools.category('TEST')
     subject.cache = dict(random=random_query.cache)
     subject._set_data('random')

     picked_title = subject.data['title']
     self.assertEqual(picked_title, 'RANDOM TEST TITLE')
예제 #14
0
 def test_category_get_members(self):
     """Parse the ``category.cache`` fixture and check the member count."""
     cat = wptools.category('TEST')
     cat.cache['category'] = category.cache
     cat._set_data('category')
     # assertTrue(value, 92) used 92 as the failure *message*, so the check
     # passed for any non-empty list; assertEqual compares the count.
     # NOTE(review): 92 is taken from the original call - confirm it matches
     # the fixture.
     self.assertEqual(len(cat.data['members']), 92)
예제 #15
0
 def test_category_caching(self):
     """With a cached entry present, get_members() must leave data empty."""
     subject = wptools.category('TEST', silent=True)
     # Pre-seed the cache with an entry whose response is None.
     subject.cache['category'] = {'response': None}
     subject.get_members()

     entry_count = len(subject.data)
     self.assertEqual(entry_count, 0)
예제 #16
0
 def test_category_init(self):
     """Check rejected constructor arguments and the default params."""
     with self.assertRaises(ValueError):
         wptools.category(pageid='TEST')
     with self.assertRaises(ValueError):
         wptools.category('TEST', pageid=123)

     expected_params = {'lang': 'en', 'title': 'TEST'}
     subject = wptools.category('TEST')
     self.assertEqual(subject.params, expected_params)