import json
from urllib.parse import quote

# Project-under-test imports; the exact module path is assumed here:
# from scraper import EN_WIKIPEDIA_PATH, SimpleWebScraper, WikipediaArticle


def test_extract_text_from_html_wikipedia_page():
    culture_path = EN_WIKIPEDIA_PATH / "Culture" / "body"
    culture_page = WikipediaArticle(culture_path.read_text(encoding="utf-8"))
    main_text = culture_page.get_main_text()

    assert main_text.startswith("Culture")
    assert main_text.endswith("tradition of textual theory.")

    # Paragraphs are separated by blank lines in the extracted text.
    paragraphs = main_text.split("\n\n")
    assert len(paragraphs) == 35
def test_extract_language_links_from_html_wikipedia_page():
    culture_path = EN_WIKIPEDIA_PATH / "Culture" / "body"
    culture_page = WikipediaArticle(culture_path.read_text(encoding="utf-8"))
    language_links = culture_page.get_language_links()
    assert len(language_links) == 171

    # Links are ordered by language subdomain: "af" comes first.
    expected_link = "https://af.wikipedia.org/wiki/Kultuur"
    assert language_links[0] == expected_link

    # Non-ASCII article titles are percent-encoded in the URL.
    expected_link = "https://zh.wikipedia.org/wiki/" + quote("文化")
    assert language_links[-1] == expected_link
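# The two tests above pin down the WikipediaArticle contract: it accepts raw
# HTML (str, or bytes plus an encoding), and exposes get_main_text() and
# get_language_links(), with links ordered by language subdomain. A minimal
# sketch of one way to satisfy that contract with BeautifulSoup follows; the
# selectors are assumptions about Wikipedia's markup, not the implementation
# under test.

from bs4 import BeautifulSoup


class WikipediaArticleSketch:

    def __init__(self, html, encoding="utf-8"):
        if isinstance(html, bytes):
            html = html.decode(encoding)
        self.soup = BeautifulSoup(html, "html.parser")

    def get_main_text(self):
        # Join the article body's paragraphs with blank lines.
        content = self.soup.find(id="mw-content-text")
        paragraphs = [p.get_text().strip() for p in content.find_all("p")]
        return "\n\n".join(p for p in paragraphs if p)

    def get_language_links(self):
        # Interlanguage links live in the sidebar; this class name is an
        # assumption about current Wikipedia markup.
        anchors = self.soup.select("a.interlanguage-link-target")
        return sorted(a["href"] for a in anchors)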
def test_web_scraper_fetch():
    scraper = SimpleWebScraper()
    headers, body = scraper.fetch("https://fr.wikipedia.org/wiki/Tomate")

    assert isinstance(headers, dict)
    assert isinstance(body, bytes)
    assert headers['Content-Type'] == "text/html; charset=UTF-8"

    article = WikipediaArticle(body, encoding="utf-8")
    expected_link = "https://en.wikipedia.org/wiki/Tomato"
    assert expected_link in article.get_language_links()

    main_text = article.get_main_text()
    assert main_text.startswith("Solanum lycopersicum\n\nLe plant de tomates")
def test_web_scraper_fetch_and_save(tmpdir):
    scraper = SimpleWebScraper(output_folder=tmpdir)
    result_folder = scraper.fetch_and_save(
        "https://fr.wikipedia.org/wiki/Pomme_de_terre")
    # The URL is mirrored as a folder hierarchy under the output folder.
    assert result_folder == (
        tmpdir / "fr.wikipedia.org" / "wiki" / "Pomme_de_terre")

    with open(result_folder / "headers.json") as f:
        headers = json.load(f)
    assert headers['Content-Type'] == "text/html; charset=UTF-8"

    body = (result_folder / "body").read_bytes()
    article = WikipediaArticle(body)
    expected_link = "https://en.wikipedia.org/wiki/Potato"
    assert expected_link in article.get_language_links()
    assert article.get_main_text().startswith(
        "Solanum tuberosum\n\nLa pomme de terre, ou patate[1]")
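# The two scraper tests above rely on this behaviour: fetch() returns a
# (headers, body) pair, and fetch_and_save() mirrors the URL path under
# output_folder, writing the headers as JSON next to the raw body. A minimal
# sketch of that shape built on urllib follows; it is an assumed design, not
# the implementation under test.

import json
from pathlib import Path
from urllib.request import urlopen


class SimpleWebScraperSketch:

    def __init__(self, output_folder=None):
        self.output_folder = Path(output_folder) if output_folder else None

    def fetch(self, url):
        with urlopen(url) as response:
            headers = dict(response.headers)
            body = response.read()
        return headers, body

    def fetch_and_save(self, url):
        headers, body = self.fetch(url)
        # Mirror "https://host/path" as output_folder / host / path.
        host_and_path = url.split("://", 1)[1]
        result_folder = self.output_folder.joinpath(*host_and_path.split("/"))
        result_folder.mkdir(parents=True, exist_ok=True)
        (result_folder / "headers.json").write_text(json.dumps(headers))
        (result_folder / "body").write_bytes(body)
        return result_folder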