def crawlAllUrl(self, outputFlag=False, crawlAmountLimit=CRAWL_AMOUNT_LIMIT):
    while len(Crawler.urlList) > 0:
        # lock the shared queue before taking the next url off it; re-check
        # emptiness under the lock because other threads pop from it too
        Crawler.urlRecordLock.acquire()
        if len(Crawler.urlList) == 0:
            Crawler.urlRecordLock.release()
            break
        url = Crawler.urlList.pop()
        pathname = self.url2Pathname(url)
        Crawler.urlNotDone.pop(pathname, None)
        if Crawler.crawledAmount >= crawlAmountLimit:
            Crawler.urlRecordLock.release()
            break
        Crawler.urlRecordLock.release()
        result = self.crawlUrl(NORMAL_SITE, url, outputFlag)
        try:
            urlArr = urlparse.urlparse(url)
            # if the url could not be crawled, charge the error to its site
            if result == False:
                Crawler.urlRecordLock.acquire()
                if urlArr.netloc in Crawler.errorCounter:
                    Crawler.errorCounter[urlArr.netloc] += 1
                else:
                    Crawler.errorCounter[urlArr.netloc] = 1
                Crawler.urlRecordLock.release()
                continue
            # skip sites that have already produced too many errors
            if Crawler.errorCounter.get(urlArr.netloc, 0) > MIN_ERRORS_ALLOWED_FOR_A_SITE:
                continue
            # strip a trailing file name from the path so relative links
            # resolve against the containing directory
            _path = urlArr.path
            rightMostSlashIndex = _path.rfind('/')
            lastSegment = _path[rightMostSlashIndex:]
            if lastSegment.find('.') != -1:
                _path = _path[:rightMostSlashIndex]
            hostPath = urlArr.scheme + '://' + urlArr.netloc + _path
            # extract the links on the fetched page and queue them
            parser = LinkParser()
            parser.setFlag(NORMAL_SITE)
            parser.setHostPath(hostPath)
            parser.feed(result)
            urlList = parser.hrefsList
            Crawler.urlRecordLock.acquire()
            self.addUrlList(urlList)
            Crawler.crawledAmount += 1
            Crawler.urlRecordLock.release()
            parser.close()
        except Exception, e:
            #print(e)
            self.reportError(url, msg[ERROR_HTML_PARSE])
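# crawlAllUrl keys its bookkeeping on self.url2Pathname(url), which is not
# shown in this section. A minimal sketch of what such a helper might look
# like, assuming it only needs to turn a url into a filesystem-safe key
# (the real implementation may differ):
import urlparse

def _url2pathname_sketch(url):
    # hypothetical stand-in for Crawler.url2Pathname: join host and path,
    # replacing separators so the result can double as a file name
    parts = urlparse.urlparse(url)
    return (parts.netloc + parts.path).replace('/', '_')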
def main():
    initResult = init.initGlobal()
    if initResult == False:
        return
    # read the search keyword from the user
    print("Please enter your keyword")
    keyword = raw_input()
    keyword = keyword.replace(' ', '+')
    # start crawling from the search engine results page
    crawler = Crawler()
    startTime = time.time()
    crawler.loadRecord(LOG_OF_CRAWLED_URL)
    crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
    crawler.addSearchEngineUrl(keyword)
    htmlcode = crawler.crawlUrl(GOOGLE)
    parser = LinkParser()
    parser.setFlag(GOOGLE)
    parser.feed(htmlcode)
    top10 = parser.hrefsList
    crawler.addUrlList(top10, GOOGLE)
    parser.close()
    # run the work with THREAD_NUM threads
    threadPool = []
    while len(threadPool) < THREAD_NUM:
        th = threading.Thread(None, crawl)
        threadPool.append(th)
    for item in threadPool:
        item.start()
    for item in threadPool:
        item.join()
    crawler.flush()
    endTime = time.time()
    print("time used:")
    print(endTime - startTime)
    # pause so the window stays open until the user presses enter
    raw_input()
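# main() hands each thread a module-level crawl function that this section
# does not define. A minimal sketch, assuming the worker only needs to
# delegate to a Crawler instance (the queue, counters, and lock live in
# class attributes, so every instance shares them):
def crawl():
    # hypothetical worker body: each thread drains the shared url queue
    # until it is empty or the crawl limit is reached
    worker = Crawler()
    worker.crawlAllUrl()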