def downloadFunc(self, queue):
    """Worker loop: download queued URLs and enqueue newly found navigation URLs.

    While ``queue`` is not empty, a URL is taken from ``self.queue``, fetched
    with ``getRawContent`` and scanned with ``self.getURLsFromRawContent``.
    Every extracted link that matches ``self.rNavigation`` and is not yet a key
    of ``self.navigations`` is recorded there and put on ``queue``.

    A failed download (no content, but an error message) is counted in
    ``self.errorURLs`` and the URL is re-queued until its failure count
    exceeds ``self.countOfErrorTry``.

    When ``queue`` is empty the worker sleeps ``self.sleepingTimesForGetJob``
    seconds. After ``self.countOfTryingGetJob`` consecutive idle rounds it calls
    ``self.saveURLsAndLog()`` and returns.

    :param queue: object providing ``empty()`` and ``put()``.
    :return: None
    """
    # NOTE(review): the loop tests and feeds the ``queue`` argument but reads
    # from and re-queues failures to ``self.queue``; presumably both are the
    # same object -- confirm against the caller.
    # NOTE(review): ``empty()`` followed by a blocking ``get()`` can block if
    # another consumer drains the queue in between -- confirm whether several
    # workers share this queue.
    idleRounds = 0
    while True:
        while not queue.empty():
            idleRounds = 0
            url = self.queue.get()
            print(threading.current_thread().name + '\tdownload the url: %s' % url)

            rawContent, errormsg = getRawContent(url)
            if rawContent is None and errormsg is not None:
                # Count the failure and retry only while under the retry limit.
                failures = self.errorURLs.get(url, 0) + 1
                self.errorURLs[url] = failures
                if failures <= self.countOfErrorTry:
                    self.queue.put(url)
                print(errormsg)
                continue

            links = self.getURLsFromRawContent(rawContent, self.rBase, url)
            for link in links:
                if not re.findall(self.rNavigation, link, re.S):
                    continue
                if link in self.navigations:
                    continue
                self.navigations[link] = True
                print(link)
                queue.put(link)
                # NOTE(review): nesting was reconstructed from a
                # whitespace-collapsed source; confirm that this periodic save
                # belongs to the "new navigation URL" branch.
                if len(self.navigations) % 20 == 0:
                    self.saveURLsAndLog()

            print("Queue's size is: \t%s" % str(self.queue.qsize()) + '\t' +
                  "Navigations 's size is: \t%s" % str(len(self.navigations)))

        idleRounds += 1
        print(threading.current_thread().name + ' sleep ' +
              str(self.sleepingTimesForGetJob) + ' seconds for ' +
              str(idleRounds) + ' time.')
        time.sleep(self.sleepingTimesForGetJob)

        # ">=" rather than "==": a limit of zero or less must still terminate
        # the worker instead of letting the counter run past it forever.
        if idleRounds >= self.countOfTryingGetJob:
            self.saveURLsAndLog()
            return
def grabPage(self, page):
    """Download ``page.url`` once, save the raw content and log the outcome.

    Pages whose ``grabed`` flag is already set are skipped. On a failed
    download (no content, but an error message) the error is logged through
    ``self.printAndLog(errormsg, True)`` and the page stays un-grabbed.
    On success the shared counter ``self.countOfGrabedPages`` is incremented
    under ``self.lock``, the page is stored with ``self.savePage`` and marked
    as grabbed; every 25th page also triggers ``self.saveGrabedLog()``.

    :param page: object with ``grabed`` and ``url`` attributes; ``rawContent``
        is assigned here.
    :return: None
    """
    if page.grabed:
        return

    url = page.url
    msg = threading.current_thread().name + '\n'
    msg += 'start download the url: ' + url + '\n'
    msg += "start time is : %s" % time.ctime() + '\n'
    msg += 'internet: %s.\t' % time.ctime() + '\n'

    page.rawContent, errormsg = getRawContent(url, self.timeout)
    if page.rawContent is None and errormsg is not None:
        errormsg += ('\tTime is: %s.\t' % time.ctime() +
                     'The url: %s\t' % url +
                     'The count of grabed pages: %s\t' % str(self.countOfGrabedPages) +
                     '\n')
        self.printAndLog(errormsg, True)
        return

    # Processing page data is CPU intensive, so only the raw content is saved
    # here and the processing happens later.
    # Snapshot the counter while holding the lock: re-reading the shared
    # attribute afterwards could observe another thread's increment and skip
    # (or repeat) the every-25-pages checkpoint.
    with self.lock:
        self.countOfGrabedPages += 1
        grabbedSoFar = self.countOfGrabedPages

    self.savePage(page)
    page.grabed = True
    if grabbedSoFar % 25 == 0:
        self.saveGrabedLog()

    msg += "end time is : %s" % time.ctime() + '\n'
    msg += 'the number of grabed pages is: ' + str(grabbedSoFar) + '\n\n'
    self.printAndLog(msg)