# Requires these module-level imports:
#     import re
#     from queue import Queue, Empty as QueueEmpty
#     from traceback import format_exc
#     from urllib.parse import urlparse
def crawl(self):
    """Crawl outward from the root URL, following links up to self.depth."""
    # Seed the work queue with the links found on the root page.
    page = Linkfetcher(self.root)
    page.linkfetch()
    queue = Queue()
    for url in page.urls:
        queue.put(url)
    followed = [self.root]
    n = 0

    while True:
        try:
            # get_nowait() raises Empty once the queue is exhausted;
            # a plain get() would block forever instead of breaking.
            url = queue.get_nowait()
        except QueueEmpty:
            break

        n += 1
        if url not in followed:
            try:
                host = urlparse(url).netloc
                # When locked, only follow URLs on the same host as the root.
                if self.locked and re.match(".*%s" % self.host, host):
                    followed.append(url)
                    self.followed += 1
                    page = Linkfetcher(url)
                    page.linkfetch()
                    for link in page:
                        if link not in self.urls:
                            self.links += 1
                            queue.put(link)
                            self.urls.append(link)
                    # Stop once the configured depth limit (if any) is exceeded.
                    if n > self.depth and self.depth > 0:
                        break
            except Exception as e:
                print("ERROR: The URL '%s' can't be processed due to (%s)" % (url, e))
                print(format_exc())
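For context, crawl() would normally be driven by a small command-line entry point. The sketch below is a minimal, hypothetical driver: it assumes the surrounding class is named Crawler, that its constructor takes the root URL, a depth limit, and a locked flag, and that crawl() populates the followed and links counters used above; none of those names are confirmed by this excerpt.

# Hypothetical driver for the crawler shown above (assumed Crawler signature).
import sys

def main():
    root = sys.argv[1] if len(sys.argv) > 1 else "http://example.com"
    crawler = Crawler(root, depth=2, locked=True)  # assumed constructor arguments
    crawler.crawl()
    print("Followed %d pages, found %d links." % (crawler.followed, crawler.links))

if __name__ == "__main__":
    main()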
def getlinks(url): """Get Links from the Linkfetcher class.""" page = Linkfetcher(url) page.linkfetch() for i, url in enumerate(page): print("%d ==> %s" % (i, url))