def add(self, element, priority):
    """Queue a URL under the given priority unless it is already known.

    The URL is normalized with ``url_normalize`` first, and the
    duplicate check against ``self.hashtable`` is done on that
    normalized form. A URL that has not been seen is pushed onto
    ``self.heap`` as a ``(priority, url)`` tuple and then recorded in
    ``self.hashtable``; a URL that has been seen is ignored.
    """
    normalized = url_normalize(element)  # only use normalized urls
    if normalized in self.hashtable:
        # Already queued once; nothing to do.
        return
    entry = (priority, normalized)
    heapq.heappush(self.heap, entry)
    self.hashtable.add(normalized)
""" self.pre_process() LOGGER.info("starting at (%s)... "% self.root) count = 0 while self.unparsed_urls.heap and count < self.max_limit: # getting link to get url = self.unparsed_urls.get() count += 1 # fetching page page = self.fetch_url(url) if page.status not in [404, 403, 500] and 'text/html' in page.headers['content-type']: LOGGER.info("visited: %s " % (url)) self.process_page(page) self.process_page_links(page.body, page.url) return count if __name__ == '__main__': try: INPUT_URL = sys.argv[1] ALLOW_EXTERNAL = int(sys.argv[2]) ALLOW_REDIRECTS = int(sys.argv[3]) MAX_LIMIT = int(sys.argv[4]) except IndexError: LOGGER.info("Error: Incorrect start url / external options were passed\n note: all three parameters required") exit() BELA = Balerion(url_normalize(INPUT_URL), ALLOW_EXTERNAL, ALLOW_REDIRECTS, MAX_LIMIT) BELA.crawl()