Пример #1
0
    def crawl(self):
        """Crawl outward from ``self.root``, following queued URLs.

        The root page is fetched with ``Linkfetcher`` and every entry of its
        ``urls`` attribute is queued.  URLs are then taken off the queue one
        at a time.  A URL is followed only when it has not been followed
        before, ``self.locked`` is truthy and its network location matches
        the pattern built from ``self.host``.

        For every followed URL ``self.followed`` is incremented and the page
        is fetched; each item the page yields that is not already in
        ``self.urls`` increments ``self.links``, is appended to ``self.urls``
        and is queued.

        The loop stops when the queue is empty, or, right after a page has
        been followed, once more than ``self.depth`` URLs have been dequeued
        (only when ``self.depth`` is greater than 0).  Any exception raised
        while processing a URL is printed together with its traceback and
        the crawl moves on to the next URL.
        """
        page = Linkfetcher(self.root)
        page.linkfetch()
        queue = Queue()
        for url in page.urls:
            queue.put(url)

        # Local bookkeeping only; a set gives constant-time membership tests.
        followed = set([self.root])

        n = 0  # number of URLs dequeued so far, followed or not

        while True:
            try:
                # Non-blocking fetch: a plain get() waits forever on an empty
                # queue and never raises, so the crawl would hang here.
                # NOTE(review): assumes Queue is the standard-library queue
                # class and QueueEmpty its Empty exception -- confirm against
                # the file's imports.
                url = queue.get_nowait()
            except QueueEmpty:
                break

            n += 1

            if url in followed:
                continue

            try:
                host = urlparse.urlparse(url)[1]

                # NOTE(review): nothing is followed when self.locked is falsy,
                # and self.host is interpolated into the pattern unescaped --
                # both kept as in the original; confirm they are intended.
                if self.locked and re.match(".*%s" % self.host, host):
                    followed.add(url)
                    self.followed += 1
                    page = Linkfetcher(url)
                    page.linkfetch()
                    # Distinct loop name so ``url`` still refers to the page
                    # being processed if the error handler below runs.
                    for link in page:
                        if link not in self.urls:
                            self.links += 1
                            queue.put(link)
                            self.urls.append(link)

                    if n > self.depth and self.depth > 0:
                        break
            except Exception as e:
                # Best-effort crawl: report the failure and keep going.
                print("ERROR: The URL '%s' can't be processed due to (%s)" % (url, e))
                print(format_exc())
Пример #2
0
def getlinks(url):
    """Fetch ``url`` through Linkfetcher and print what it yields.

    Each item obtained by iterating over the fetcher is printed on its own
    line as ``<index> ==> <item>``, with the index starting at 0.
    """
    fetcher = Linkfetcher(url)
    fetcher.linkfetch()
    position = 0
    for link in fetcher:
        line = "%d ==> %s" % (position, link)
        print(line)
        position += 1
Пример #3
0
def getlinks(url):
    """Fetch ``url`` with Linkfetcher and print every item it yields.

    Each item produced by iterating over the fetcher is printed on its own
    line as ``<index> ==> <item>``, with the index starting at 0.
    """
    page = Linkfetcher(url)
    page.linkfetch()
    # ``link`` rather than ``url`` so the parameter is not shadowed.
    for i, link in enumerate(page):
        # Single parenthesised argument: same output under the Python 2 print
        # statement, and valid syntax for the Python 3 print function.
        print("%d ==> %s" % (i, link))