# Context assumed by the snippets below (imports are not part of the original):
import re
import logging

LOGGER = logging.getLogger(__name__)

# URL helpers such as check_url, clean_url, normalize_url, scrub_url,
# validate_url, extract_domain, fix_relative_urls and lang_filter come from
# the courlan package (some live in submodules such as courlan.urlutils or
# courlan.filters, depending on the version). Module-level constants like
# WHITELISTED_PLATFORMS and BLACKLIST (compiled regexes), FEED_TYPES
# (a collection of feed MIME types) and the HTML loader load_html are
# assumed to be defined elsewhere in the source package.


def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain is not None:
                # don't take links from another domain and make an exception
                # for main platforms
                if newdomain != domainname and not WHITELISTED_PLATFORMS.search(newdomain):
                    LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
                else:
                    if re.search(r'\.xml$|\.xml[.?#]', link):
                        state = 'sitemap'
                    else:
                        state = 'link'
            else:
                LOGGER.error("Couldn't extract domain: %s", link)
    return link, state
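# Usage sketch (illustrative values, not from the original code): classify one
# candidate URL, assuming handle_link and the courlan helpers above behave as
# in the library.
def demo_handle_link():
    result = handle_link(
        'https://example.org/sitemap_news.xml',  # candidate link
        'https://example.org/sitemap.xml',       # sitemap currently processed
        'example.org',                           # reference domain of the crawl
        'https://example.org',                   # base URL for relative links
        None,                                    # no target language filtering
    )
    # expected: ('https://example.org/sitemap_news.xml', 'sitemap'),
    # since the domains match and the link carries an '.xml' extension
    print(result)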
def test_examples():
    '''test README examples'''
    assert check_url('https://github.com/adbar/courlan') == ('https://github.com/adbar/courlan', 'github.com')
    assert check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True) == ('https://httpbin.org/redirect-to', 'httpbin.org')
    assert clean_url('HTTPS://WWW.DWDS.DE:80/') == 'https://www.dwds.de'
    assert validate_url('http://1234') == (False, None)
    assert validate_url('http://www.example.org/')[0] is True
    assert normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) == 'http://test.net/foo.html?page=2&post=abc'
def test_scrub():
    # clean: scrub + normalize
    assert clean_url(5) is None
    assert clean_url('ø\xaa') == 'øª'
    # scrub
    assert scrub_url(' https://www.dwds.de') == 'https://www.dwds.de'
    assert scrub_url('<![CDATA[https://www.dwds.de]]>') == 'https://www.dwds.de'
    assert scrub_url('https://www.dwds.de/test?param=test&other=test') == 'https://www.dwds.de/test?param=test&other=test'
    assert scrub_url('https://www.dwds.de/garbledhttps://www.dwds.de/') == 'https://www.dwds.de/garbled'
    assert scrub_url('https://g__https://www.dwds.de/') == 'https://www.dwds.de'
    # exception for archive URLs
    assert scrub_url('https://web.archive.org/web/20131021165347/https://www.imdb.com/') == 'https://web.archive.org/web/20131021165347/https://www.imdb.com'
    # social sharing
    assert scrub_url('https://twitter.com/share?&text=Le%20sabre%20de%20bambou%20%232&via=NouvellesJapon&url=https://nouvellesdujapon.com/le-sabre-de-bambou-2') == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'
    assert scrub_url('https://www.facebook.com/sharer.php?u=https://nouvellesdujapon.com/le-sabre-de-bambou-2') == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'
    # end of URL
    assert scrub_url('https://www.test.com/&') == 'https://www.test.com'
    # white space
    assert scrub_url('\x19https://www.test.com/\x06') == 'https://www.test.com'
    # markup
    assert scrub_url('https://www.test.com/</a>') == 'https://www.test.com'
    # garbled URLs e.g. due to quotes
    assert scrub_url('https://www.test.com/"' + '<p></p>' * 100) == 'https://www.test.com'
    assert scrub_url('https://www.test.com/"' * 50) != 'https://www.test.com'
    # simply too long, left untouched
    my_url = 'https://www.test.com/' + 'abcdefg' * 100
    assert scrub_url(my_url) == my_url
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page.
       Adapted from http://www.aaronsw.com/2002/feedfinder/'''
    # parse the page to look for feeds
    tree = load_html(htmlstring)
    # safeguard
    if tree is None:
        LOGGER.debug('Invalid HTML/Feed page: %s', baseurl)
        return []
    feed_urls = []
    for linkelem in tree.xpath('//link[@rel="alternate"]'):
        # discard elements without links
        if 'href' not in linkelem.attrib:
            continue
        # most common case
        if 'type' in linkelem.attrib and linkelem.get('type') in FEED_TYPES:
            feed_urls.append(linkelem.get('href'))
        # websites like geo.de
        elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
            feed_urls.append(linkelem.get('href'))
    # backup
    if not feed_urls:
        for linkelem in tree.xpath('//a[@href]'):
            if linkelem.get('href')[-4:].lower() in ('.rss', '.rdf', '.xml'):
                feed_urls.append(linkelem.get('href'))
            elif linkelem.get('href')[-5:].lower() == '.atom':
                feed_urls.append(linkelem.get('href'))
            elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
                feed_urls.append(linkelem.get('href'))
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if BLACKLIST.search(link):
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls), len(output_urls))
    return output_urls
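# Usage sketch (illustrative): a minimal home page advertising an RSS feed.
# Assumes determine_feed and its helpers are defined as above and that
# FEED_TYPES includes 'application/rss+xml'.
def demo_determine_feed():
    htmlstring = (
        '<html><head>'
        '<link rel="alternate" type="application/rss+xml" href="/feed.xml"/>'
        '</head><body></body></html>'
    )
    # expected: ['https://example.org/feed.xml'] after relative URL resolution
    # and cleaning (the reference URL itself would be filtered out)
    print(determine_feed(htmlstring, 'https://example.org', 'https://example.org'))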
def handle_link(link, sitemapurl, domainname, baseurl, target_lang=None):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain != domainname:
                LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
            else:
                if re.search(r'\.xml$|\.xml[.?#]', link):
                    state = 'sitemap'
                else:
                    state = 'link'
    return link, state
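# Quick check of the sitemap-detection pattern shared by both handle_link
# variants above (illustrative): '.xml' at the end of the path or followed by
# a separator marks a sitemap, anything else stays a regular page link.
def demo_sitemap_pattern():
    pattern = re.compile(r'\.xml$|\.xml[.?#]')
    for url in (
        'https://example.org/sitemap.xml',         # match: trailing extension
        'https://example.org/sitemap.xml?page=2',  # match: query separator
        'https://example.org/sitemap.xml.gz',      # match: compressed variant
        'https://example.org/page.html',           # no match: regular page
    ):
        print(url, bool(pattern.search(url)))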
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page'''
    feed_urls = []
    # try to find RSS URL
    for feed_url in re.findall(r'<link[^<>]+?type="application/rss\+xml"[^<>]+?href="(.+?)"', htmlstring):
        feed_urls.append(feed_url)
    for feed_url in re.findall(r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/rss\+xml"', htmlstring):
        feed_urls.append(feed_url)
    # try to find Atom URL
    if not feed_urls:
        for feed_url in re.findall(r'<link[^<>]+?type="application/atom\+xml"[^<>]+?href="(.+?)"', htmlstring):
            feed_urls.append(feed_url)
        for feed_url in re.findall(r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/atom\+xml"', htmlstring):
            feed_urls.append(feed_url)
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if 'comments' in link:
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls), len(output_urls))
    return output_urls