def debug_reptile():
    '''
    Manual smoke test: crawl a single hard-coded site ("CAU") with one
    Reptile worker thread, bypassing ReptileLib entirely.
    '''
    home_urls = [
        ['CAU', 'http://www.cau.edu.cn'],
    ]
    page_limits = [
        30,
    ]

    # Shared URL queue / visited-URL list, sized for the single site.
    url_queue = UrlQueue()
    url_queue.init(home_urls)

    visited = Urlist()
    visited.init(len(home_urls))

    worker = Reptile(
        name = 'reptile 1',
        urlQueue = url_queue,
        urlist = visited,
        Flock = None,
        homeUrls = home_urls,
        maxPageNums = page_limits,
        pages = [0],
        curSiteID = [0],
        continueRun = [True]
    )
    worker.conn()
    #url_queue.initFrontPage()
    # Seed the queue with the site's front page (empty relative path).
    url_queue.append(0, ['CAU', ''])
    worker.start()
def __init__(self):
    '''
    Global shared-data setup for the crawler library.

    Creates every structure the worker threads, the control logic and
    the control server communicate through, then starts the background
    threads via runInit().
    '''
    self.htmldb = HtmlDB()
    threading.Thread.__init__(self, name = "reptilelib" )
    print "... init ReptileLib ..."
    # Signal queues: the human/machine interface drives this library by
    # pushing command dicts into inSignalQueue and reading replies from
    # outSignalQueue.
    self.inSignalQueue = Q.Queue()
    self.outSignalQueue = Q.Queue()
    self.Flock = threading.RLock()
    # Controls whether the reptile worker threads keep running.
    self.continueRun = [True]
    # Controls whether the reptilelib main loop and the server keep
    # running, i.e. whether to shut down completely.
    self.reptileLibRun = [True]
    self.curSiteID = [0]
    # urlQueue and urlist are created here and initialised later by the
    # library's init().
    self.urlQueue = UrlQueue()
    self.urlist = Urlist()
    # Shared-list contract: these start as empty lists and must only
    # ever be mutated in place (never rebound) afterwards, so that
    # every object holding a reference sees the same data.
    self.homeUrls = []
    self.pages = []
    self.maxPages = []
    self.reptilectrl = ReptileCtrl( homeUrls = self.homeUrls, continueRun = self.continueRun, urlist = self.urlist, urlQueue = self.urlQueue, maxPages = self.maxPages, pages = self.pages, outSignalQueue = self.outSignalQueue, )
    self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
    # run init thread
    self.runInit()
class ReptileLib(threading.Thread): ''' 爬虫线程库 ''' def __init__(self): ''' 全局数据控制 ''' self.htmldb = HtmlDB() threading.Thread.__init__(self, name = "reptilelib" ) print "... init ReptileLib ..." #信号队列 由人机界面控制程序运行 self.inSignalQueue = Q.Queue() self.outSignalQueue = Q.Queue() self.Flock = threading.RLock() #控制reptile线程是否运行 self.continueRun = [True] #控制reptilelib 主程序及服务器是否运行 是否完全关闭 self.reptileLibRun = [True] self.curSiteID = [0] #urlQueue and init in lib self.urlQueue = UrlQueue() self.urlist = Urlist() #为了列表的共享性 初始的数据初始化[] 之后不能随意改变 self.homeUrls = [] self.pages = [] self.maxPages = [] self.reptilectrl = ReptileCtrl( homeUrls = self.homeUrls, continueRun = self.continueRun, urlist = self.urlist, urlQueue = self.urlQueue, maxPages = self.maxPages, pages = self.pages, outSignalQueue = self.outSignalQueue, ) self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue) #run init thread self.runInit() def runInit(self): ''' run init thread ''' self.controlserver.start() self.start() def run(self): ''' 运行主程序 signal: { type:type } ''' print "... run while ..." while True: print '.. while ReptileLib running ..' signal = self.inSignalQueue.get() print 'get signal', signal _type = signal['type'] print 'get type', _type if _type is 'init': ''' 全新运行 ''' print '.. init from empty project ..' self.init( homeUrls = signal['homeurls'] , maxPages = signal['maxpages'] , threadNum = signal['reptilenum'] ) elif _type is 'resume': print '.. resume from database ..' self.reptilectrl.resume() elif _type is 'stop': print '.. stop ..' self.reptilectrl.stop() elif _type is 'halt': print '.. halt ..' self.reptilectrl.halt() elif _type is 'status': ''' ask for status ''' print '.. status ..' #put status in queue self.reptilectrl.status() elif _type is 'start': ''' run reptiles ''' print '.. run reptile threads ..' print 'It works!' self.continueRun[0] = True self.initThreads() self.threadsRun() print 'ReptileLib core stopped!' 
print 'Reptile stopped' def init(self, homeUrls, maxPages, threadNum): ''' 完全初始化 首次运行 注意: 重复init时,为了list的共享数据特性 每次需要清空[] 然后再重新赋值 ''' def clearList(_List): if not _List: return _size = len(_List) for i in range(_size): _List.pop() def initList(_List, List): #first clear list clearList(_List) for l in List: _List.append(l) initList(self.homeUrls ,homeUrls) initList(self.maxPages, maxPages) self.threadNum = threadNum self.maxPages = maxPages #self.htmldb = HtmlDB(self.htmlparser) #init self.pages #self.pages used to calculate num of pages downloaded clearList(self.pages) for i in range(len(homeUrls)): self.pages.append(0) #init urlQueue self.urlQueue.init(self.homeUrls) self.urlQueue.initFrontPage() self.urlist.init(len(self.homeUrls)) #存储 homeUrls self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages) def initThreads(self): self.thlist = [] #default: from site 0 self.curSiteID[0] = 0 for i in range(self.threadNum): #此处前缀也需要变化 #修改 根据站点前缀命名爬虫 th = Reptile( name = "reptile%d"%i, urlQueue = self.urlQueue, urlist = self.urlist, Flock = self.Flock, homeUrls = self.homeUrls, maxPageNums = self.maxPages, pages = self.pages, curSiteID = self.curSiteID, continueRun = self.continueRun ) self.thlist.append(th) def threadsRun(self): for th in self.thlist: th.start()