示例#1
0
def debug_reptile():
    '''
    Build a single Reptile for one hard-coded site and start it.

    A debugging entry point: it creates the URL queue and URL list for
    the one configured home URL, constructs a Reptile without a lock
    (Flock is None), calls conn() on it, queues one entry by hand and
    finally calls start().
    '''
    seed_sites = [['CAU', 'http://www.cau.edu.cn']]
    page_limits = [30]

    url_queue = UrlQueue()
    url_queue.init(seed_sites)

    url_list = Urlist()
    url_list.init(len(seed_sites))

    # One-element lists are passed for pages / curSiteID / continueRun,
    # the same shape the Reptile keyword arguments receive elsewhere.
    reptile_kwargs = {
        'name': 'reptile 1',
        'urlQueue': url_queue,
        'urlist': url_list,
        'Flock': None,
        'homeUrls': seed_sites,
        'maxPageNums': page_limits,
        'pages': [0],
        'curSiteID': [0],
        'continueRun': [True],
    }
    reptile = Reptile(**reptile_kwargs)
    reptile.conn()

    # The initFrontPage() call is left disabled here; a single entry is
    # appended by hand instead (first argument 0 is presumably the site
    # index -- confirm against UrlQueue.append).
    url_queue.append(0, ['CAU', ''])
    reptile.start()
示例#2
0
    def __init__(self):
        '''
        Global data control.

        Creates the state shared between this thread, the control server
        and the controller object, then calls self.runInit().
        '''
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print("... init ReptileLib ...")

        # Signal queues: the user-facing control side drives the program
        # through these.
        self.inSignalQueue, self.outSignalQueue = Q.Queue(), Q.Queue()
        self.Flock = threading.RLock()

        # One-element lists serve as flags that can be shared by reference.
        # continueRun: whether the reptile threads keep running.
        self.continueRun = [True]
        # reptileLibRun: whether the ReptileLib main program and server keep
        # running, i.e. whether everything is shut down completely.
        self.reptileLibRun = [True]
        self.curSiteID = [0]

        # urlQueue is created here and initialised later by the lib.
        self.urlQueue = UrlQueue()
        self.urlist = Urlist()

        # These lists are shared by reference: they start out empty and
        # must not be rebound afterwards, only modified in place.
        self.homeUrls, self.pages, self.maxPages = [], [], []

        ctrl_kwargs = dict(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            outSignalQueue=self.outSignalQueue,
        )
        self.reptilectrl = ReptileCtrl(**ctrl_kwargs)
        self.controlserver = ControlServer(
            self.inSignalQueue,
            self.outSignalQueue,
        )

        # Run the init thread(s).
        self.runInit()
示例#3
0
class ReptileLib(threading.Thread):
    '''
    Crawler thread library.

    A thread that reads control signals from ``inSignalQueue`` and
    dispatches them: it (re)initialises the shared crawl state, delegates
    resume/stop/halt/status to ``ReptileCtrl`` and creates and starts the
    ``Reptile`` threads.
    '''
    def __init__(self):
        '''
        Global data control.

        Creates the state shared between this thread, the control server
        and the controller object, then calls ``runInit()`` which starts
        the control server and this thread.
        '''
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print("... init ReptileLib ...")
        # Signal queues: the user-facing control side drives the program
        # through these.
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()

        # Controls whether the reptile threads keep running.
        self.continueRun = [True]
        # Controls whether the ReptileLib main loop and server keep
        # running, i.e. whether everything is shut down completely.
        self.reptileLibRun = [True]

        self.curSiteID = [0]
        # urlQueue is created here and initialised later in init().
        self.urlQueue = UrlQueue()

        self.urlist = Urlist()
        # These lists are shared by reference: they start out empty and
        # must never be rebound afterwards, only modified in place.
        self.homeUrls = []
        self.pages = []
        self.maxPages = []
        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # Run the init thread(s).
        self.runInit()

    def runInit(self):
        '''
        Start the control server and then this thread.
        '''
        self.controlserver.start()
        self.start()

    def run(self):
        '''
        Main loop: block on ``inSignalQueue`` and dispatch each signal.

        signal:
        {
            type: type
        }
        Handled types are 'init' (which also reads the 'homeurls',
        'maxpages' and 'reptilenum' keys), 'resume', 'stop', 'halt',
        'status' and 'start'.

        The loop runs while ``self.reptileLibRun[0]`` is true. Because
        ``get()`` blocks, the flag is only re-checked after a signal has
        been handled.
        '''
        print("... run while ...")

        while self.reptileLibRun[0]:
            print('.. while ReptileLib running ..')
            signal = self.inSignalQueue.get()
            print('get signal %s' % (signal,))
            _type = signal['type']
            print('get type %s' % (_type,))

            # Compare by value: strings arriving through the queue are not
            # guaranteed to be the same objects as these literals, so an
            # identity test ("is") can silently drop valid signals.
            if _type == 'init':
                # Fresh run from an empty project.
                print('.. init from empty project ..')
                self.init(
                    homeUrls=signal['homeurls'],
                    maxPages=signal['maxpages'],
                    threadNum=signal['reptilenum'],
                )

            elif _type == 'resume':
                print('.. resume from database ..')
                self.reptilectrl.resume()

            elif _type == 'stop':
                print('.. stop ..')
                self.reptilectrl.stop()

            elif _type == 'halt':
                print('.. halt ..')
                self.reptilectrl.halt()

            elif _type == 'status':
                # Ask for status; the controller puts it in a queue.
                print('.. status ..')
                self.reptilectrl.status()

            elif _type == 'start':
                # Create and run the reptile threads.
                print('.. run reptile threads ..')
                print('It works!')
                self.continueRun[0] = True
                self.initThreads()
                self.threadsRun()

            else:
                # Report instead of silently ignoring an unknown request.
                print('.. unknown signal type: %s ..' % (_type,))

        print('ReptileLib core stopped!')
        print('Reptile stopped')

    def init(self, homeUrls, maxPages, threadNum):
        '''
        Full initialisation for a first run.

        Note: ``self.homeUrls``, ``self.maxPages`` and ``self.pages`` are
        shared by reference with other objects, so on every (re)init they
        are emptied and refilled in place and never rebound.

        homeUrls  -- home URL entries, copied into ``self.homeUrls``
        maxPages  -- per-site page limits, copied into ``self.maxPages``
        threadNum -- number of Reptile threads created by initThreads()
        '''
        # Slice assignment replaces the contents while keeping the very
        # same list objects that were handed out in __init__.
        self.homeUrls[:] = homeUrls
        self.maxPages[:] = maxPages
        self.threadNum = threadNum

        # self.pages counts the pages downloaded; one zeroed counter per
        # home URL.
        self.pages[:] = [0] * len(homeUrls)

        # Initialise the URL queue and URL list for the configured sites.
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        self.urlist.init(len(self.homeUrls))

        # Persist the home URLs.
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        '''
        Create ``self.threadNum`` Reptile threads (not yet started) in
        ``self.thlist``; requires init() to have set ``self.threadNum``.
        '''
        self.thlist = []
        # Default: start from site 0.
        self.curSiteID[0] = 0

        for i in range(self.threadNum):
            # TODO (from the original author): the name prefix should
            # change so reptiles are named after the site prefix.
            th = Reptile(
                name="reptile%d" % i,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                curSiteID=self.curSiteID,
                continueRun=self.continueRun,
            )
            self.thlist.append(th)

    def threadsRun(self):
        '''
        Start every thread created by initThreads().
        '''
        for th in self.thlist:
            th.start()