    def test_retrieveLinks(self):
        '''Test method for HttpLinksCollector.retrieve_links.'''
        # Test 1 - check that an HTTPError 401 is logged and no links are returned.
        starting_url = "http://www.nature.com"
        target_url = \
            "http://www.nature.com/nature/journal/v438/n7070/full/438900a.html"
        http_links_collector = HttpLinksCollector(starting_url)
        links_retrieved = http_links_collector.retrieve_links(target_url)
        self.assertFalse(links_retrieved,
                         "Retrieved links from: '" + target_url + "'")

        # Test 2 - check URLError for an unsupported protocol (irc).
        starting_url = "http://www.nature.com"
        target_url = "irc://irc.freenode.net/wikimedia-ayuda"
        http_links_collector = HttpLinksCollector(starting_url)
        links_retrieved = http_links_collector.retrieve_links(target_url)
        self.assertFalse(links_retrieved,
                         "Retrieved links from: '" + target_url + "'")
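# --- Illustration (not part of the original module) -------------------------
# The test above assumes that HttpLinksCollector.retrieve_links() logs fetch
# failures (an HTTP 401, an unsupported irc:// scheme) and returns an empty
# result instead of raising. A minimal Python 3 sketch of that contract; the
# class name and the urllib-based fetching are assumptions, not the original
# implementation:

import logging
from urllib.error import HTTPError, URLError
from urllib.request import urlopen


class HttpLinksCollectorSketch(object):
    '''Hypothetical stand-in showing only the error-handling contract.'''

    def __init__(self, starting_url):
        self.starting_url = starting_url
        self.logger = logging.getLogger(self.__class__.__name__)

    def retrieve_links(self, target_url, depth=1, level=1):
        '''Return the links found at target_url, or [] on fetch errors.'''
        try:
            response = urlopen(target_url)
        except HTTPError as error:
            # e.g. a 401 answer is logged and swallowed (Test 1 above).
            self.logger.error("HTTPError %s for %s", error.code, target_url)
            return []
        except URLError as error:
            # e.g. an unsupported scheme such as irc:// (Test 2 above).
            self.logger.error("Cannot open %s: %s", target_url, error.reason)
            return []
        # A real implementation would parse the response for <a href> values.
        response.close()
        return []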
    def crawler_start(self):
        '''Start crawling.

        * Checks the input parameters.
        * Prints the result of the crawl as a JSON dictionary.
        '''
        # argparse definition rules
        parser = argparse.ArgumentParser(description="Let's crawl a website")
        parser.add_argument('url', nargs=1, help='target URL')
        parser.add_argument('-n', '--number-of-levels', type=int, default=1,
                            help='how deep the crawl should go')
        # Parse the command-line arguments.
        args = parser.parse_args()
        target_url = args.url.pop()
        depth = args.number_of_levels

        # Starting level to retrieve links from.
        level = 1
        links = {}
        http_links_collector = HttpLinksCollector(target_url)
        links_list = http_links_collector.retrieve_links(target_url, depth, level)
        links[target_url] = links_list

        # Print the result as pretty-printed JSON.
        links_result = json.dumps(links, sort_keys=True, indent=4)
        self.logger.info(links_result)
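# --- Illustration (not part of the original module) -------------------------
# crawler_start() relies on two argparse details: 'url' is declared with
# nargs=1, so it parses into a one-element list (hence args.url.pop()), and
# the '--number-of-levels' flag is exposed as args.number_of_levels. A
# standalone sketch that rebuilds the same parser and feeds it an explicit
# argv list (the example URL is illustrative):

import argparse


def _build_parser():
    parser = argparse.ArgumentParser(description="Let's crawl a website")
    parser.add_argument('url', nargs=1, help='target URL')
    parser.add_argument('-n', '--number-of-levels', type=int, default=1,
                        help='how deep the crawl should go')
    return parser


if __name__ == '__main__':
    args = _build_parser().parse_args(['http://www.example.com', '-n', '2'])
    print(args.url.pop(), args.number_of_levels)  # -> http://www.example.com 2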