def test_navigation():
    """Check URL classification by is_navigation_page and is_not_crawlable."""
    # (url, expected result) pairs; identity comparison keeps the checks
    # strict about getting real booleans back.
    navigation_cases = (
        ('https://test.org/', False),
        ('https://test.org/page/1', True),
        ('https://test.org/?p=11', True),
    )
    for url, expected in navigation_cases:
        assert is_navigation_page(url) is expected

    # login endpoints in several spellings are rejected, a plain page is not
    crawlability_cases = (
        ('https://test.org/login', True),
        ('https://test.org/login/', True),
        ('https://test.org/login.php', True),
        ('https://test.org/page', False),
    )
    for url, expected in crawlability_cases:
        assert is_not_crawlable(url) is expected
def find_new_links(htmlstring, base_url, known_links, language=None, rules=None):
    """Collect links from a page that have not been seen before.

    Args:
        htmlstring: HTML content handed to the baseline extractor and to
            the link extractor.
        base_url: URL passed on to the link extractor together with the HTML.
        known_links: collection of already seen links; it is updated in
            place through ``.add()`` for every link that is kept.
        language: optional target language code. When given and language
            identification is enabled (``LANGID_FLAG``), a page detected as
            being in another language yields no links at all.
        rules: optional object exposing ``can_fetch(useragent, url)``
            (presumably a ``urllib.robotparser.RobotFileParser`` -- confirm
            against callers); links it refuses are dropped.

    Returns:
        A tuple ``(new_links, known_links)``: the list of newly accepted
        links in extraction order and the updated collection of known links.
    """
    fresh_links = []

    # Optional language gate: extract the text with the baseline and run
    # the language identifier on it.
    if language is not None and LANGID_FLAG is True:
        _, page_text, _ = baseline(htmlstring)
        prediction = cld3.get_language(page_text)
        language_mismatch = prediction is not None and prediction.language != language
        if language_mismatch:
            return fresh_links, known_links

    candidates = extract_links(htmlstring, base_url, False, language=language, with_nav=True)
    for url in candidates:
        # robots.txt rules come first, then the known/crawlable sanity checks
        permitted = rules is None or rules.can_fetch("*", url)
        if (
            permitted
            and is_known_link(url, known_links) is not True
            and not is_not_crawlable(url)
        ):
            fresh_links.append(url)
            known_links.add(url)

    return fresh_links, known_links