def test_crawler_parse_hrefs(crawler):
    host_link = make_hyperlink("https://www.example.com")
    links = [
        "https://www.example.com#with-fragment",
        "https://www.example.com?with=query",
        "https://www.example.com/?with=query#with-fragment",
        "#with-fragment",
        "?with=query",
        "/?with=query#with-fragment",
        "/some/path",
        "/another/path",
        "https://www.example.com/",
        "https://www.example.com/",
        "https://www.example.com/third/path",
        "https://www.dont-find.com",
        "https://www.subdomain.example.com",
    ]
    input_hrefs = make_hyperlink_set([make_hyperlink(link) for link in links])

    assert crawler._parse_hrefs(input_hrefs, host_link) == make_hyperlink_set(
        [
            host_link,
            host_link + "/some/path",
            host_link + "/another/path",
            host_link + "/third/path",
        ]
    )
def recover(self):
    retrieved_data = self.db.select_from_table(
        table=self.metadata_table_name,
        columns='*',
        where=f"crawler_tag='{self.tag}'",
        order_by='id',
        asc_or_desc='DESC',
    )
    # most recent row for this crawler tag whose status column is 'Found'
    last_found_entry = next(filter(lambda row: row[2] == 'Found', retrieved_data))
    url = make_hyperlink(last_found_entry[3])
    current_href = last_found_entry[4]
    hrefs = last_found_entry[5].split(',')
    # resume from the href that was in progress, dropping those already handled
    href_index = hrefs.index(current_href)
    hrefs = hrefs[href_index:]
    hrefs = [make_hyperlink(href) for href in hrefs]
    hrefs = make_hyperlink_set(hrefs)
    return url, hrefs
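# A hedged, self-contained sketch (not part of the project) of the metadata row
# shape recover() appears to assume, inferred from the indices used above:
# row[2] = status, row[3] = url, row[4] = current href, row[5] = comma-separated hrefs.
# The column names below are illustrative only.
_example_metadata_row = (
    7,                            # id
    "my-crawler",                 # crawler_tag
    "Found",                      # status matched by recover()
    "https://www.example.com",    # url the crawl was working on
    "/some/path",                 # href that was in progress
    "/some/path,/another/path",   # remaining hrefs as a comma-separated string
)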
def test_anchor_tag_parser_multiple_links_with_duplicates(links):
    html, hrefs = (
        make_html(make_a_tags(links)),
        {make_hyperlink(link) for link in links},
    )
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == hrefs
    assert parser.found_links == make_hyperlink_set(hrefs)
def test_hyperlink_set_behaves_like_set():
    links = {"/hello", "/world", "/?hello=world"}

    # check __init__
    hrefs = make_hyperlink_set(links)

    # check __len__
    assert len(hrefs) == 3

    # check add
    hrefs.add(make_hyperlink("/?hello=world&world=hello"))

    # check __len__ again
    assert len(hrefs) == 4

    # check __contains__
    for link in links:
        assert make_hyperlink(link) in hrefs

    # check __iter__
    found = set()
    for href in hrefs:
        found.add(href)
    assert found == hrefs.collection
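# A minimal sketch, not the project's implementation, of a set-like container that
# would satisfy the behaviour exercised above (make_hyperlink_set presumably returns
# something similar); the class name and details here are assumptions.
class HyperlinkSetSketch:
    def __init__(self, links=None):
        self.collection = set(links or [])

    def add(self, link):
        self.collection.add(link)

    def __len__(self):
        return len(self.collection)

    def __contains__(self, link):
        return link in self.collection

    def __iter__(self):
        return iter(self.collection)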
def handle_starttag(self, tag: str, attrs: list) -> None:
    # https://docs.python.org/3/library/html.parser.html#html.parser.HTMLParser.handle_starttag
    # HTMLParser lowercases tag and attribute names for us

    # grab only <a> tags
    if tag == "a":
        for attr, value in attrs:
            # grab only hrefs
            if attr == "href":
                href = make_hyperlink(value)
                self.found_links.add(href)
def crawl(self, domain: str) -> Set[str]:
    """crawl any site for all urls"""
    domain = make_hyperlink(domain)
    self._queue.put(domain)

    # get robots
    # todo: only do this if we obey robots?
    robots = self._get_robots(domain)

    with self._executor() as executor:
        while True:
            # exit if we have crawled all urls found
            if self._seen_urls == self._done_urls and self._seen_urls.is_not_empty():
                # return results
                return self._render_results()

            # wait for more urls to enter the queue, or return if we time out
            try:
                url = self._queue.get(timeout=self.timeout)
            except queue.Empty:
                # return results
                return self._render_results()

            # if the url has already been done, start the loop again
            if url in self._done_urls:
                continue

            # if we are to obey the robots then we need to see what we can scrape
            if self.obey_robots:
                # start again if we can't fetch a url
                if not robots.can_fetch(self.user_agent, str(url)):
                    print(f"{self.user_agent} can't crawl {url}")
                    continue

                # there is a bug in py3.6 https://bugs.python.org/issue35922
                # this try/except keeps 3.6 working
                try:
                    # wait for the delay if we can scrape but must crawl slowly
                    if robots.crawl_delay(self.user_agent):
                        delay = int(robots.crawl_delay(self.user_agent))
                        print(f"{self.user_agent} has a delay of {delay}, waiting...")
                        time.sleep(delay)
                except AttributeError:
                    pass

            # submit crawl_url to executor
            executor.submit(self._crawl_url, url)
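# A hedged usage note: the constructor is not shown in this snippet, so the call
# below is hypothetical; crawl() blocks until the seen and done sets match (or the
# queue get() times out) and then returns the rendered set of urls found.
#
#     crawler = Crawler(...)  # hypothetical construction, arguments unknown here
#     urls = crawler.crawl("https://www.example.com")
#     for url in sorted(urls):
#         print(url)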
def _get_hrefs(self, url: Hyperlink) -> HyperlinkSet:
    """get hrefs from url with requester"""
    resp = self._requester(
        url,
        check_head_first=self.check_head,
        follow_redirects=(not self.record_redirects),
    )
    # if we want to record redirects and the response returns a redirect,
    # then we grab the "Location" header from the response,
    # because there will be no links to scrape from the text
    if self.record_redirects and str(resp.status_code).startswith("3"):
        hrefs = make_hyperlink_set([make_hyperlink(resp.headers["Location"])])
    # else we scrape from the text
    else:
        hrefs = get_hrefs_from_html(resp.text)
    return hrefs
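# A hypothetical requester sketch (not the project's implementation) showing the call
# shape _get_hrefs relies on: a callable taking (url, check_head_first=..., follow_redirects=...)
# and returning an object with .status_code, .headers and .text. Built on the requests
# library; the real requester may differ.
import requests


def simple_requester(url, check_head_first=False, follow_redirects=True):
    if check_head_first:
        # a cheap HEAD request lets callers bail out before downloading a large body
        requests.head(str(url), allow_redirects=follow_redirects).raise_for_status()
    return requests.get(str(url), allow_redirects=follow_redirects)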
def href(self):
    return make_hyperlink(self.url)
def test_anchor_tag_parser_single_link(link):
    html, href = make_html(make_a_tag(link)), make_hyperlink(link)
    parser = AnchorTagParser()
    parser.feed(html)
    assert parser.found_links.collection == {href}
    assert parser.found_links == make_hyperlink_set([href])
def test_get_hrefs_from_html_unique(input_links, output_results):
    html = make_html(make_a_tags(input_links))
    hrefs = {make_hyperlink(link) for link in output_results}
    assert get_hrefs_from_html(html).collection == hrefs
    assert get_hrefs_from_html(html) == make_hyperlink_set(hrefs)
def test_hyperlink_join_with_relative_links(input_link, output_result):
    href = make_hyperlink(input_link)
    domain = "https://helloworld.com"
    assert str(href.join(domain)) == domain + output_result
def test_hyperlink_is_absolute_or_relative(input_link, is_absolute_link):
    href = make_hyperlink(input_link)
    assert href.is_absolute == is_absolute_link
    assert href.is_relative != is_absolute_link
def test_hyperlink(input_link, output_result):
    href = make_hyperlink(input_link)
    assert str(href) == output_result
def test_hyperlink_set_relative_links_join_all(input_links, output_links):
    links = make_hyperlink_set(input_links)
    domain = "https://www.google.com"
    assert links.join_all(domain) == make_hyperlink_set(
        [make_hyperlink(domain + link) for link in output_links]
    )
def test_hyperlink_normalisation(input_link, output_result):
    assert make_hyperlink(input_link).url == output_result