def test_urlutils():
    '''Test URL manipulation tools'''
    # domain extraction
    assert extract_domain('h') is None
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    # url parsing
    result = _parse('https://httpbin.org/')
    assert isinstance(result, ParseResult)
    newresult = _parse(result)
    assert isinstance(newresult, ParseResult)
    with pytest.raises(TypeError):
        result = _parse(float(1.23))
    assert get_base_url('https://example.org/path') == 'https://example.org'
    with pytest.raises(ValueError):
        assert get_host_and_path('123') is None
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_host_and_path('https://example.org/') == ('https://example.org', '/')
    assert get_host_and_path('https://example.org') == ('https://example.org', '/')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')
    # keeping track of known URLs
    known_links = {'https://test.org'}
    assert is_known_link('https://test.org/1', known_links) is False
    assert is_known_link('https://test.org', known_links) is True
    assert is_known_link('http://test.org', known_links) is True
    assert is_known_link('http://test.org/', known_links) is True
    assert is_known_link('https://test.org/', known_links) is True
def refresh_detection(htmlstring, homepage):
    "Check if there could be a redirection by meta-refresh tag."
    if '"refresh"' in htmlstring or '"REFRESH"' in htmlstring:
        try:
            html_tree = load_html(htmlstring)
            # test meta-refresh redirection
            # https://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python
            attr = html_tree.xpath('//meta[@http-equiv="refresh"]/@content|//meta[@http-equiv="REFRESH"]/@content')[0]
            _, text = attr.split(';')
            text = text.strip().lower()
            if text.startswith('url='):
                url2 = text[4:]
                if not url2.startswith('http'):
                    # relative URL, adapt
                    _, base_url = get_hostinfo(url2)
                    url2 = fix_relative_urls(base_url, url2)
                # second fetch
                newhtmlstring = fetch_url(url2)
                if newhtmlstring is None:
                    logging.warning('failed redirect: %s', url2)
                    return None, None
                htmlstring, homepage = newhtmlstring, url2
                logging.info('successful redirect: %s', url2)
        except (IndexError, etree.ParserError, etree.XMLSyntaxError, etree.XPathEvalError) as err:
            logging.info('no redirect found: %s %s', homepage, err)
    return htmlstring, homepage
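
# A minimal usage sketch (not part of the original module): on HTML without a
# meta-refresh tag, refresh_detection() is expected to return its inputs
# unchanged; the sample markup and URL below are illustrative assumptions.
def _refresh_detection_sketch():
    htmlstring = '<html><head><title>Test</title></head><body><p>Text</p></body></html>'
    result, url = refresh_detection(htmlstring, 'https://example.org')
    assert result == htmlstring and url == 'https://example.org'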
def init_crawl(homepage, todo, known_links, language=None, shortform=False, rules=None):
    """Start crawl by initializing variables and potentially examining the starting page."""
    # config=DEFAULT_CONFIG
    _, base_url = get_hostinfo(homepage)
    known_links = known_links or set()
    i = 0
    # fetch and parse robots.txt file if necessary
    if rules is None:
        rules = urllib.robotparser.RobotFileParser()
        rules.set_url(base_url + '/robots.txt')
        # exceptions happening here
        try:
            rules.read()
        except Exception as exc:
            LOGGER.error('cannot read robots.txt: %s', exc)
            rules = None
    # initialize crawl by visiting homepage if necessary
    if todo is None:
        todo = deque([homepage])
        todo, known_links, i, _ = crawl_page(i, base_url, todo, known_links, lang=language,
                                             shortform=shortform, rules=rules, initial=True)
    return todo, known_links, base_url, i, rules
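
# A minimal crawl-loop sketch (not part of the original module), assuming the
# crawl_page() call signature used in init_crawl() above; the start URL and
# page limit are placeholders and a real run requires network access.
def _crawl_loop_sketch(homepage='https://www.example.org/', max_pages=10):
    todo, known_links, base_url, i, rules = init_crawl(homepage, None, None)
    while todo and i < max_pages:
        todo, known_links, i, _ = crawl_page(i, base_url, todo, known_links, rules=rules)
    return known_links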
def find_feed_urls(url, target_lang=None):
    """Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, baseurl = get_hostinfo(url)
    if domainname is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    urlfilter = None
    downloaded = fetch_url(url)
    if downloaded is not None:
        # assume it's a feed
        feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
        if len(feed_links) == 0:
            # assume it's a web page
            for feed in determine_feed(downloaded, baseurl, url):
                feed_string = fetch_url(feed)
                feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
            # filter triggered, prepare it
            if len(url) > len(baseurl) + 2:
                urlfilter = url
        # return links found
        if len(feed_links) > 0:
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s feed links found for %s', len(feed_links), domainname)
            return feed_links
        LOGGER.debug('No usable feed links found: %s', url)
    else:
        LOGGER.warning('Could not download web page: %s', url)
        if url.strip('/') != baseurl:
            return try_homepage(baseurl, target_lang)
    # try alternative: Google News
    if target_lang is not None:
        downloaded = fetch_url('https://news.google.com/rss/search?q=site:' + baseurl +
                               '&hl=' + target_lang + '&scoring=n&num=100')
        if downloaded is not None:
            feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s Google news links found for %s', len(feed_links), domainname)
            return feed_links
    return []
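
# Usage sketch (not part of the original module), assuming network access:
# the homepage URL and language code below are placeholders.
def _find_feed_urls_sketch():
    feed_links = find_feed_urls('https://www.example.org/', target_lang='en')
    for link in feed_links:
        print(link)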
def sitemap_search(url, target_lang=None):
    """Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, baseurl = get_hostinfo(url)
    if domainname is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    urlfilter = None
    sitemaps_seen = set()
    # determine sitemap URL
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = baseurl + '/sitemap.xml'
        # filter triggered, prepare it
        if len(url) > len(baseurl) + 2:
            urlfilter = url
    sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang)
    sitemaps_seen.add(sitemapurl)
    if sitemapurls == [] and len(linklist) > 0:
        linklist = filter_urls(linklist, urlfilter)
        LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
        return linklist
    # try sitemaps in robots.txt file if nothing has been found
    if sitemapurls == [] and linklist == []:
        sitemapurls = find_robots_sitemaps(baseurl)
        # try additional URLs just in case
        if sitemapurls == []:
            sitemapurls = [''.join([baseurl, '/', g]) for g in GUESSES]
    # iterate through nested sitemaps and results
    i = 1
    while sitemapurls:
        sitemapurl = sitemapurls.pop()
        sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl,
                                                             target_lang, sitemapurls, linklist)
        # sanity check: keep track of visited sitemaps and exclude them
        sitemaps_seen.add(sitemapurl)
        sitemapurls = [s for s in sitemapurls if s not in sitemaps_seen]
        # counter and safeguard
        i += 1
        if i > MAX_SITEMAPS_SEEN:
            break
    linklist = filter_urls(linklist, urlfilter)
    LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
    return linklist
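
# Usage sketch (not part of the original module), assuming network access:
# sitemap_search() accepts either a homepage or a direct sitemap URL; both
# example URLs below are placeholders.
def _sitemap_search_sketch():
    links = sitemap_search('https://www.example.org/', target_lang='en')
    more_links = sitemap_search('https://www.example.org/sitemap.xml')
    return links + more_links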
def probe_alternative_homepage(homepage):
    "Check if the homepage is redirected and return appropriate values."
    response = fetch_url(homepage, decode=False)
    if response is None or response == '':
        return None, None, None
    # get redirected URL here?
    if response.geturl() != homepage:
        logging.info('followed redirect: %s', response.geturl())
        homepage = response.geturl()
    # decode response
    htmlstring = decode_response(response.data)
    # is there a meta-refresh on the page?
    htmlstring, homepage = refresh_detection(htmlstring, homepage)
    logging.info('fetching homepage OK: %s', homepage)
    _, base_url = get_hostinfo(homepage)
    return htmlstring, homepage, base_url
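
# Usage sketch (not part of the original module): probe_alternative_homepage()
# returns the downloaded HTML, the possibly redirected homepage URL and its
# base URL; the start URL is a placeholder and a real run needs network access.
def _probe_homepage_sketch():
    htmlstring, homepage, base_url = probe_alternative_homepage('https://www.example.org/')
    if htmlstring is not None:
        print(homepage, base_url, len(htmlstring))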