示例#1
0
    def crawl(current_url):
        """Fetch *current_url*, queue its outgoing links, and index the page.

        Prints the current queue/crawled sizes, then (unless the URL contains
        ``.vhd``) downloads the page, parses it as HTML, adds every anchor
        whose ``href`` starts with ``http`` to ``Crawler.queue`` if it is not
        already queued or crawled, moves ``current_url`` from the queue to
        ``Crawler.crawled``, passes the parsed page to ``Indexer.indexer`` and
        calls ``Crawler.save_lists()``.

        Any ordinary error raised along the way is reported on stdout; the URL
        is then dropped from the queue and the lists are saved, so a single
        bad page does not stop the crawl.

        Args:
            current_url: URL of the page to crawl.
        """
        # NOTE(review): no ``self``/``cls`` parameter -- presumably decorated
        # as a static method outside this view; confirm.
        print('Total in Queue', len(Crawler.queue), '| Total Crawled', len(Crawler.crawled))

        # NOTE(review): URLs containing '.vhd' are skipped but are NOT removed
        # from Crawler.queue here; confirm the caller does not keep retrying them.
        if '.vhd' in current_url:
            return

        try:
            with urllib.request.urlopen(current_url) as response:
                html = response.read()

            soup = BeautifulSoup(html, "html.parser")
            print(" crawling", current_url)
            for link in soup.find_all('a', attrs={'href': re.compile(r"^http")}):
                href = link.get('href')
                if href not in Crawler.queue and href not in Crawler.crawled:
                    Crawler.queue.add(href)
            Crawler.crawled.add(current_url)
            Crawler.queue.discard(current_url)
            Indexer.indexer(current_url, soup)
            Crawler.save_lists()
        except Exception as exc:
            # Best-effort crawl: report the failure and move on.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the crawler can still be stopped.
            print("ERROR", current_url, exc)
            Crawler.queue.discard(current_url)
            Crawler.save_lists()