def test_urlutils():
    '''Test URL manipulation tools'''
    # domain extraction
    assert extract_domain('h') is None
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    # url parsing
    result = _parse('https://httpbin.org/')
    assert isinstance(result, ParseResult)
    newresult = _parse(result)
    assert isinstance(newresult, ParseResult)
    with pytest.raises(TypeError):
        result = _parse(float(1.23))

    assert get_base_url('https://example.org/path') == 'https://example.org'
    with pytest.raises(ValueError):
        assert get_host_and_path('123') is None
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_host_and_path('https://example.org/') == ('https://example.org', '/')
    assert get_host_and_path('https://example.org') == ('https://example.org', '/')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')

    # keeping track of known URLs
    known_links = {'https://test.org'}
    assert is_known_link('https://test.org/1', known_links) is False
    assert is_known_link('https://test.org', known_links) is True
    assert is_known_link('http://test.org', known_links) is True
    assert is_known_link('http://test.org/', known_links) is True
    assert is_known_link('https://test.org/', known_links) is True
def store_todo_links(todo, new_links, shortform=False):
    """Store the retrieved internal links in the todo-list while prioritizing
       the navigation ones."""
    # add links to deque
    if todo is None:
        todo = deque()
    # prioritize navigation links
    # TODO: use the shortest links if there are no navigation links?
    for link in new_links:
        if shortform is True:
            link = get_host_and_path(link)[1]
        if is_navigation_page(link):
            todo.appendleft(link)
        else:
            todo.append(link)
    # unique list while preserving order
    return deque(OrderedDict.fromkeys(todo))
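
# A minimal usage sketch (not part of the original code): the helper name
# _example_store_todo_links is hypothetical, and it assumes that
# is_navigation_page() flags paths such as '/page/2' as navigation links.
def _example_store_todo_links():
    from collections import deque  # local import to keep the sketch self-contained
    todo = deque(['https://example.org/article-1'])
    new_links = ['https://example.org/page/2', 'https://example.org/article-2']
    # the navigation link is pushed to the front, the article link to the back,
    # and duplicates are removed while the order is preserved:
    # deque(['https://example.org/page/2',
    #        'https://example.org/article-1',
    #        'https://example.org/article-2'])
    return store_todo_links(todo, new_links)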
def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, inputdict=None):
    '''Filter, convert input URLs and add them to domain-aware processing dictionary'''
    # init
    if inputdict is None:
        inputdict = defaultdict(deque)
    # deduplicate while keeping order
    inputlist = list(OrderedDict.fromkeys(inputlist))
    # filter
    if blacklist:
        inputlist = [u for u in inputlist if re.sub(r'https?://', '', u) not in blacklist]
    if url_filter:
        filtered_list = []
        while inputlist:
            u = inputlist.pop()
            for f in url_filter:
                if f in u:
                    filtered_list.append(u)
                    break
        inputlist = filtered_list
    # validate and store in dict
    for url in inputlist:
        # validate URL
        if validate_url(url)[0] is False:
            continue
        # segment URL and add to domain dictionary
        try:
            hostinfo, urlpath = get_host_and_path(url)
            inputdict[hostinfo].append(urlpath)
        except ValueError:
            LOGGER.warning('Could not parse URL, discarding: %s', url)
    return inputdict
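
# A minimal usage sketch (not part of the original code): the helper name
# _example_add_to_compressed_dict is hypothetical. It assumes validate_url()
# accepts the three well-formed URLs below and rejects the last entry.
def _example_add_to_compressed_dict():
    urls = [
        'https://example.org/page1',
        'https://example.org/page2',
        'https://httpbin.org/html',
        'not-a-url',
    ]
    # expected result, with paths grouped per host:
    # {'https://example.org': deque(['/page1', '/page2']),
    #  'https://httpbin.org': deque(['/html'])}
    return add_to_compressed_dict(urls)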
def cli_crawler(args, n=30, domain_dict=None):
    '''Start a focused crawler which downloads a fixed number of URLs within a website
       and prints the links found in the process'''
    config = use_config(filename=args.config_file)
    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
    counter, crawlinfo, backoff_dict = None, {}, {}
    # load input URLs
    if domain_dict is None:
        domain_dict = load_input_dict(args)
    # load crawl data
    for website in domain_dict:
        homepage = website + domain_dict[website].popleft()
        crawlinfo[website] = {}
        domain_dict[website], crawlinfo[website]['known'], crawlinfo[website]['base'], \
            crawlinfo[website]['count'], crawlinfo[website]['rules'] = init_crawl(
                homepage, None, set(), language=args.target_language, shortform=True)
        # update info
        # TODO: register changes?
        # if base_url != website:
        # ...
    # iterate until the threshold is reached
    while domain_dict:
        bufferlist, download_threads, domain_dict, backoff_dict = load_download_buffer(
            domain_dict, backoff_dict, sleep_time, threads=args.parallel)
        # start several threads
        for url, result in buffered_downloads(bufferlist, download_threads, decode=False):
            website, _ = get_host_and_path(url)
            crawlinfo[website]['count'] += 1
            # handle result
            if result is not None and result != '':
                domain_dict[website], crawlinfo[website]['known'], htmlstring = process_response(
                    result, domain_dict[website], crawlinfo[website]['known'],
                    crawlinfo[website]['base'], args.target_language,
                    shortform=True, rules=crawlinfo[website]['rules'])
                # only store content pages, not navigation
                if not is_navigation_page(url):  # + response.geturl()
                    if args.list:
                        write_result(url, args)
                    else:
                        counter = process_result(htmlstring, args, url, counter, config)
                # just in case a crawl delay is specified in robots.txt
                sleep(get_crawl_delay(crawlinfo[website]['rules']))
            # else:
            #     LOGGER.debug('No result for URL: %s', url)
            #     if args.archived is True:
            #         errors.append(url)
        # early exit if the maximum count is reached
        if any(dictvalue['count'] >= n for dictvalue in crawlinfo.values()):
            break
    # print results
    for website in sorted(domain_dict):
        for urlpath in sorted(domain_dict[website]):
            sys.stdout.write(website + urlpath + '\n')
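
# A minimal invocation sketch (not part of the original code): the real CLI
# builds a complete `args` namespace via argparse; only the options read by
# cli_crawler() are shown here, and the call stays commented out because it
# would perform actual downloads.
# from argparse import Namespace
# from collections import defaultdict, deque
#
# args = Namespace(config_file=None, target_language=None, parallel=4, list=True)
# seeds = defaultdict(deque, {'https://example.org': deque(['/'])})
# cli_crawler(args, n=10, domain_dict=seeds)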