Exemplo n.º 1
0
class ReptileLib(threading.Thread):
    '''
    爬虫线程库
    '''
    def __init__(self):
        '''
        全局数据控制
        '''
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name = "reptilelib" )  
        print "... init ReptileLib ..."
        #信号队列 由人机界面控制程序运行
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()  

        #控制reptile线程是否运行
        self.continueRun = [True]
        #控制reptilelib 主程序及服务器是否运行 是否完全关闭
        self.reptileLibRun = [True]

        self.curSiteID = [0]
        #urlQueue and init in lib
        self.urlQueue = UrlQueue()
        
        self.urlist = Urlist()
        #为了列表的共享性 初始的数据初始化[] 之后不能随意改变
        self.homeUrls = []
        self.pages = []
        self.maxPages = []
        self.reptilectrl = ReptileCtrl(
            homeUrls = self.homeUrls,
            continueRun = self.continueRun,
            urlist = self.urlist,
            urlQueue = self.urlQueue,
            maxPages = self.maxPages,
            pages = self.pages,
            outSignalQueue = self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        #run init thread
        self.runInit()
    
    def runInit(self):
        '''
        run init thread 
        '''
        self.controlserver.start()
        self.start()

    def run(self):
        '''
        运行主程序
        signal:
        {
            type:type
        }
        '''
        print "... run while ..."

        while True:
            print '.. while ReptileLib running ..'
            signal = self.inSignalQueue.get()
            print 'get signal', signal
            _type = signal['type']
            print 'get type', _type

            if _type is 'init':
                '''
                全新运行
                '''
                print '.. init from empty project ..'
                self.init(
                    homeUrls = signal['homeurls'] ,
                    maxPages = signal['maxpages'] ,
                    threadNum = signal['reptilenum']
                    )

            elif _type is 'resume':
                print '.. resume from database ..'
                self.reptilectrl.resume()
            
            elif _type is 'stop':
                print '.. stop ..'
                self.reptilectrl.stop()
            
            elif _type is 'halt':
                print '.. halt ..'
                self.reptilectrl.halt()
            
            elif _type is 'status':
                '''
                ask for status
                '''
                print '.. status ..'
                #put status in queue
                self.reptilectrl.status()
                
            elif _type is 'start':
                '''
                run reptiles
                '''
                print '.. run reptile threads ..'
                print 'It works!'
                self.continueRun[0] = True
                self.initThreads()
                self.threadsRun()

        print 'ReptileLib core stopped!'
        print 'Reptile stopped'

    def init(self, homeUrls, maxPages, threadNum):
        '''
        完全初始化
        首次运行
        注意: 重复init时,为了list的共享数据特性
        每次需要清空[] 然后再重新赋值
        '''
        def clearList(_List):
            if not _List: return
            _size = len(_List)
            for i in range(_size):
                _List.pop()

        def initList(_List, List):
            #first clear list
            clearList(_List)
            for l in List:
                _List.append(l)

        initList(self.homeUrls ,homeUrls)
        initList(self.maxPages, maxPages)
        self.threadNum = threadNum
        self.maxPages = maxPages
        
        #self.htmldb = HtmlDB(self.htmlparser)
        #init self.pages 
        #self.pages used to calculate num of pages downloaded
        clearList(self.pages)
        for i in range(len(homeUrls)):
            self.pages.append(0)

        #init urlQueue
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        self.urlist.init(len(self.homeUrls))

        #存储 homeUrls
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)


    def initThreads(self):
        self.thlist = []
        #default: from site 0
        self.curSiteID[0] = 0

        for i in range(self.threadNum):  
            #此处前缀也需要变化
            #修改  根据站点前缀命名爬虫
            th = Reptile(
                name = "reptile%d"%i, 
                urlQueue = self.urlQueue,
                urlist = self.urlist,
                Flock = self.Flock,
                homeUrls = self.homeUrls,
                maxPageNums = self.maxPages,
                pages = self.pages,
                curSiteID = self.curSiteID,
                continueRun = self.continueRun
            )
            self.thlist.append(th)  


    def threadsRun(self):
        for th in self.thlist:
            th.start()
Exemplo n.º 2
0
class ReptileLib(threading.Thread):
    """
    爬虫线程库
    """

    def __init__(self):
        """
        全局数据控制
        """
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print "... init ReptileLib ..."
        # 信号队列 由人机界面控制程序运行
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()

        # 控制reptile线程是否运行
        self.continueRun = [False]
        # 控制reptilelib 主程序及服务器是否运行 是否完全关闭
        self.reptileLibRun = [True]

        # urlQueue and init in lib
        self.urlQueue = UrlQueue()

        self.urlist = Urlist()
        # 为了列表的共享性 初始的数据初始化[] 之后不能随意改变
        self.homeUrls = []
        self.pages = []
        self.imagenum = []
        self.imagenum.append(0)
        print "-" * 50
        print ".. init self.imagenum", self.imagenum, type(self.imagenum)
        print "-" * 50
        self.maxPages = []

        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            imagenum=self.imagenum,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # run init thread
        self.runInit()

    def runInit(self):
        """
        run init thread 
        """
        self.controlserver.start()
        self.start()

    def run(self):
        """
        运行主程序
        signal:
        {
            type:type
        }
        """
        print "... run while ..."

        while True:
            print ".. while ReptileLib running .."
            signal = self.inSignalQueue.get()
            print "get signal", signal
            _type = signal["type"]
            print "get type", _type

            if _type is "init":
                """
                全新运行
                """
                print ".. init from empty project .."
                self.init(homeUrls=signal["homeurls"], maxPages=signal["maxpages"], threadNum=signal["reptilenum"])

            elif _type is "resume":
                print ".. resume from database .."
                self.reptilectrl.resume()

            elif _type is "stop":
                print ".. stop .."
                self.reptilectrl.stop()

            elif _type is "halt":
                print ".. halt .."
                self.reptilectrl.halt()

            elif _type is "status":
                """
                ask for status
                """
                print ".. status .."
                # put status in queue
                self.reptilectrl.status()

            elif _type is "start":
                """
                run reptiles
                """
                print ".. run reptile threads .."
                print "It works!"
                if not self.continueRun[0]:
                    self.continueRun[0] = True
                    self.initThreads()
                    self.threadsRun()

        print "ReptileLib core stopped!"
        print "Reptile stopped"

    def init(self, homeUrls, maxPages, threadNum):
        """
        完全初始化
        首次运行
        注意: 重复init时,为了list的共享数据特性
        每次需要清空[] 然后再重新赋值
        """

        def clearList(_List):
            if not _List:
                return
            _size = len(_List)
            for i in range(_size):
                _List.pop()

        def initList(_List, List):
            # first clear list
            clearList(_List)
            for l in List:
                print l
                _List.append(l)

        print ".. init homeUrls"
        initList(self.homeUrls, homeUrls)
        initList(self.maxPages, maxPages)
        self.threadNum = threadNum
        self.maxPages = maxPages

        print ".. init maxPages:", self.maxPages
        print ".. init pages", self.pages

        # self.htmldb = HtmlDB(self.htmlparser)
        # init self.pages
        # self.pages used to calculate num of pages downloaded
        clearList(self.pages)
        for i in range(len(homeUrls)):
            self.pages.append(0)

        # init urlQueue
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        # self.urlist.init(len(self.homeUrls))

        # 存储 homeUrls
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        self.thlist = []
        # default: from site 0
        print "$" * 50
        print "init thread imagenum", self.imagenum, type(self.imagenum)
        print "$" * 50

        for i in range(self.threadNum):
            # 此处前缀也需要变化
            # 修改  根据站点前缀命名爬虫
            th = Reptile(
                name="reptile%d" % i,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                imagenum=self.imagenum,
                continueRun=self.continueRun,
            )
            self.thlist.append(th)

    def threadsRun(self):
        for th in self.thlist:
            th.start()