Example #1
    def start_simple(self):
        # Create the crawler and seed it with a user-supplied start URL.
        crawler = Crawl()
        crawler.add_url(self.ask_for_link())

        # Load that page, extract its links, and queue each one for later.
        crawler.load_next_page()
        links = crawler.crawl_next_page_for_links()
        for link in links:
            crawler.add_url(link)
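
Both examples call into a Crawl class that is not shown on this page. Below is a minimal sketch of what such a class might look like, assuming it fetches pages with urllib and collects <a href> targets. The attribute names (urls, current_page) and the optional url parameter are assumptions chosen so that both call styles work: Example #1 calls crawl_next_page_for_links() with no argument after load_next_page(), while Example #2 below passes the URL in directly.

from html.parser import HTMLParser
from urllib.request import urlopen


class _LinkParser(HTMLParser):
    """Collects the href attribute of every <a> tag it sees."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)


class Crawl:
    """Hypothetical crawler: queues URLs, fetches pages, extracts links."""

    def __init__(self):
        self.urls = []          # assumed: pending URLs added via add_url()
        self.current_page = ""  # assumed: HTML of the last loaded page

    def add_url(self, url):
        self.urls.append(url)

    def load_next_page(self):
        # Fetch the oldest queued URL and keep its HTML for parsing.
        url = self.urls.pop(0)
        with urlopen(url) as response:
            self.current_page = response.read().decode("utf-8", errors="replace")

    def crawl_next_page_for_links(self, url=None):
        # Example #2 passes a URL directly; Example #1 relies on a prior
        # load_next_page() call. Support both call styles.
        if url is not None:
            with urlopen(url) as response:
                self.current_page = response.read().decode("utf-8", errors="replace")
        parser = _LinkParser()
        parser.feed(self.current_page)
        return parser.links
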
Example #2
def main():
    # Start by asking the user for a link and seed the work list with it.
    toCrawl = []
    crawled = []
    toCrawl.append(getLinkToPage())
    crawlercl = Crawl()

    # Cap how many pages the crawler will visit.
    maxPagesSearched = 100
    i = 0

    # Crawl pages until the work list is empty or the page cap is hit.
    while toCrawl:
        # Take the first entry of the "toCrawl" list (FIFO order).
        crawl = toCrawl.pop(0)

        # Skip links that are already in the "crawled" list.
        if crawl in crawled:
            continue

        links = crawlercl.crawl_next_page_for_links(crawl)

        # put into "crawled" list
        crawled.append(crawl)

        # add new found links to "toCrawl" list
        for link in links:
            # ensure that the link is not in the "crawled" list
            if link not in crawled:
                toCrawl.append(makeAbsoluteLink(link, crawl))
                print(makeAbsoluteLink(link, crawl))

        # stop the loop once maxPagesSearched pages have been visited
        i += 1
        if i >= maxPagesSearched:
            break

    return 0
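
Example #2 also depends on two helpers that are not shown: getLinkToPage() and makeAbsoluteLink(). The sketch below is a guess based purely on how the example calls them, assuming the first prompts the user for a start URL and the second resolves a possibly relative href against the page it was found on.

from urllib.parse import urljoin


def getLinkToPage():
    # Assumed behavior: prompt the user for the crawl's start URL.
    return input("Enter a URL to crawl: ").strip()


def makeAbsoluteLink(link, base):
    # Resolve a relative href (e.g. "/about") against the URL of the page
    # it came from; urljoin handles both relative and absolute links.
    return urljoin(base, link)

One design note on the loop itself: keeping crawled as a list makes each "in crawled" membership test O(n); a set would make those checks O(1), though the list version above matches the original code.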