Пример #1
0
 def test_skip_by_class(self):
     """Links with the "image" or "internal" class must not be extracted."""
     image_link = '<a href="/wiki/foo" class="image"><img src="url" /></a>'
     internal_link = '<a class="internal" href="/wiki/foo" title="foo">foo</a>'
     fake_file = FakeWikiFile(image_link + internal_link)
     extracted = list(extract_pages(fake_file.soup))
     # neither of the two anchors may yield a page
     assert extracted == []
Пример #2
0
def scrap_portal(language, lang_config):
    """Scrape the portal index page and then the pages it links to.

    The index title is read from the 'portal_index' key of ``lang_config``;
    when it is missing (or None) nothing is scraped. Otherwise
    ``_call_scraper`` is invoked twice: first with a temporary file that
    lists only the index title, then with the PORTAL_PAGES file, which is
    filled here with the pages extracted from the index HTML.
    """
    index_title = lang_config.get('portal_index')
    if index_title is None:
        # no portal configured, so there is nothing to scrape
        logger.info("Not scraping portals, url not configured.")
        return

    logger.info("Scraping portal main page %s", index_title)
    with NamedTemporaryFile(
            'wt', encoding='utf8', dir='/tmp/', prefix='cdpedia-') as title_list:
        title_list.write(index_title + '\n')
        # flush before handing over the file name; presumably the scraper
        # reads the file from disk by that name
        title_list.flush()
        _call_scraper(language, title_list.name)

    # locate the index article on disk, under the articles directory
    subdirs, quoted_name = to3dirs.get_path_file(index_title)
    index_html_path = os.path.join(location.articles, subdirs, quoted_name)

    logger.info("Parsing portal page")
    with open(index_html_path, 'rt', encoding='utf8') as index_html:
        soup = bs4.BeautifulSoup(index_html, features="html.parser")

    pages_list_path = os.path.join(location.langdir, PORTAL_PAGES)
    total = 0  # stays at zero when the index yields no pages
    with open(pages_list_path, 'wt', encoding='utf8') as pages_list:
        extracted = preprocessors.extract_pages(soup)
        for total, title in enumerate(extracted, start=1):
            pages_list.write(title + '\n')

    logger.info("Scraping portal sub pages (total=%d)", total)
    _call_scraper(language, pages_list_path)

    logger.info("Portal scraping done")
Пример #3
0
 def test_extract_portal_link_redirect(self):
     """A portal link carrying the "mw-redirect" class is still extracted."""
     markup = (
         '<a href="/wiki/Portal:Astron%C3%A1utica" class="mw-redirect" '
         'title="Portal:Astronáutica">Astronáutica</a>'
     )
     fake_file = FakeWikiFile(markup)
     extracted = list(extract_pages(fake_file.soup))
     # the percent-encoded href comes back decoded
     assert extracted == ['Portal:Astronáutica']
Пример #4
0
 def test_extract_portal_link_normal(self):
     """A plain link to a portal page is extracted."""
     markup = (
         '<a href="/wiki/Portal:Exploraci%C3%B3n_espacial" '
         'title="Portal:Exploración espacial">Exploración espacial</a>'
     )
     fake_file = FakeWikiFile(markup)
     extracted = list(extract_pages(fake_file.soup))
     # the expected name follows the href (underscore kept), not the title
     assert extracted == ['Portal:Exploración_espacial']
Пример #5
0
 def test_remove_link_fragment(self):
     """The "#fragment" part of a link is dropped from the page name."""
     fake_file = FakeWikiFile('<a href="/wiki/foo#bar">foobar</a>')
     extracted = list(extract_pages(fake_file.soup))
     assert extracted == ['foo']
Пример #6
0
 def test_skip_non_wiki_urls(self):
     """Links whose href lacks the '/wiki/' prefix are not extracted."""
     fake_file = FakeWikiFile('<a href="/nowiki/foo">foo</a>')
     extracted = list(extract_pages(fake_file.soup))
     assert extracted == []
Пример #7
0
 def test_extract_link(self):
     """A regular wiki link is extracted, with its href percent-decoded."""
     markup = '<a href="/wiki/N%C3%BAmero_natural" title="Número natural">número natural</a>'
     fake_file = FakeWikiFile(markup)
     extracted = list(extract_pages(fake_file.soup))
     assert extracted == ['Número_natural']