Example #1
File: crawel.py  Project: lzbfeng/leiwumi
    # Requires module-level imports: re, threading, time
    def downloadFunc(self, queue):
        selfKilling = False
        countOfWaitingForQueue = 0
        while not selfKilling:
            while not queue.empty():
                countOfWaitingForQueue = 0
                url = queue.get()
                print(threading.current_thread().name + '\tdownload the url: %s' % url)
                rawContent, errormsg = getRawContent(url)

                # On a failed fetch, count the error and re-queue the URL
                # until it exhausts its retry budget (countOfErrorTry).
                if rawContent is None and errormsg is not None:
                    if url not in self.errorURLs:
                        self.errorURLs[url] = 1
                    else:
                        self.errorURLs[url] += 1
                    if self.errorURLs[url] <= self.countOfErrorTry:
                        queue.put(url)

                    print(errormsg)
                    continue

                # Extract links from the page and enqueue any navigation
                # URLs not seen before. The loop variable is renamed so it
                # no longer shadows the outer `url`.
                urls = self.getURLsFromRawContent(rawContent, self.rBase, url)
                for link in urls:
                    if re.findall(self.rNavigation, link, re.S):
                        if link not in self.navigations:
                            self.navigations[link] = True
                            print(link)
                            queue.put(link)

                # Periodically persist progress.
                if len(self.navigations) % 20 == 0:
                    self.saveURLsAndLog()
                print("Queue's size is: \t%s" % queue.qsize() + '\t' +
                      "Navigations' size is: \t%s" % len(self.navigations))
            # The queue is empty: sleep and poll again; after
            # countOfTryingGetJob consecutive empty polls, save and exit.
            countOfWaitingForQueue += 1
            print(threading.current_thread().name + ' sleep ' +
                  str(self.sleepingTimesForGetJob) + ' seconds for ' +
                  str(countOfWaitingForQueue) + ' time.')
            time.sleep(self.sleepingTimesForGetJob)
            if countOfWaitingForQueue == self.countOfTryingGetJob:
                self.saveURLsAndLog()
                selfKilling = True
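
Example #1 is a self-terminating worker: each thread drains the shared queue, and once the queue has stayed empty for countOfTryingGetJob consecutive polls the worker saves its state and exits. The snippet does not show how the workers are launched, so below is a minimal driver sketch assuming a crawler object that exposes downloadFunc(queue) as above; the names run_crawl, crawler, seed_urls, and workerCount are illustrative, not from the project.

# Hypothetical driver for Example #1's downloadFunc; run_crawl, crawler,
# seed_urls, and workerCount are assumed names, not from lzbfeng/leiwumi.
import queue
import threading

def run_crawl(crawler, seed_urls, workerCount=4):
    jobQueue = queue.Queue()
    for url in seed_urls:
        jobQueue.put(url)

    # Each worker loops inside downloadFunc until it has seen the queue
    # empty countOfTryingGetJob times in a row, then returns.
    workers = [threading.Thread(target=crawler.downloadFunc,
                                args=(jobQueue,),
                                name='worker-%d' % i)
               for i in range(workerCount)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()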
Example #2
    def grabPage(self, page):
        # Skip pages that have already been fetched.
        if page.grabed:
            return

        url = page.url
        msg = threading.current_thread().name + '\n'
        msg += 'start download the url: ' + url + '\n'
        msg += 'start time is: %s' % time.ctime() + '\n'

        # Timestamp taken just before going to the network.
        msg += 'internet: %s.\t' % time.ctime() + '\n'

        page.rawContent, errormsg = getRawContent(url, self.timeout)

        if page.rawContent is None and errormsg is not None:
            errormsg += ('\tTime is: %s.\t' % time.ctime() +
                         'The url: %s\t' % url +
                         'The count of grabed pages: %s\t' % self.countOfGrabedPages + '\n')
            self.printAndLog(errormsg, True)
            return

        # Changed strategy: processing page data is CPU-intensive, so we
        # just save the raw content here and process it after a while.

        # Guard the shared counter; grabPage runs on several threads.
        with self.lock:
            self.countOfGrabedPages += 1

        self.savePage(page)
        page.grabed = True

        # Periodically persist the grab log.
        if self.countOfGrabedPages % 25 == 0:
            self.saveGrabedLog()

        msg += 'end time is: %s' % time.ctime() + '\n'
        msg += 'the number of grabed pages is: %s' % self.countOfGrabedPages + '\n\n'
        self.printAndLog(msg)
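
Both snippets depend on a getRawContent helper that is not shown. From its call sites the contract can be inferred: it takes a URL (plus an optional timeout in Example #2) and returns a (content, errormsg) pair in which exactly one side is None. A sketch of such a helper, using urllib.request as an assumed stand-in for whatever crawel.py actually uses:

# A sketch of the unshown getRawContent helper, reconstructed from its
# call sites only; the real implementation in crawel.py may differ.
import urllib.request

def getRawContent(url, timeout=30):
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.read(), None
    except OSError as e:  # urllib.error.URLError subclasses OSError
        # Mirror the snippets' contract: content is None, errormsg is set.
        return None, 'failed to fetch %s: %s' % (url, e)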