class Collector:
    '''
    Collect the relevant information from the stored pages
    and write it back as Record rows.
    '''
    def __init__(self):
        self.htmldb = HtmlDB()
        self.htmlnum = None

    def run(self):
        '''
        Main routine: walk every stored html document and build a Record for it.
        '''
        self.clearRecords()
        self.htmlnum = self.htmldb.getHtmlNum()
        for i in range(self.htmlnum):
            htmlinfo = self.htmldb.setRecordHandle(i)
            dectitle = htmlinfo.title
            title = self.htmldb.getTitle()
            _content = self.htmldb.getContent()
            pagedec = self.transPageDec(_content)
            url = htmlinfo.url
            date = htmlinfo.date
            record = Record(
                title=title,
                dectitle=dectitle,
                url=url,
                decsource=pagedec,
                date=date
            )
            record.save()

    def transPageDec(self, source):
        '''Truncate the page content to the configured abstract length.'''
        length = config.getint('indexer', 'page_dec_length')
        return source[:length]

    def clearRecords(self):
        '''
        Drop all old records before every indexing run.
        '''
        Record.objects.all().delete()
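# Minimal usage sketch for Collector (an assumption, not part of the project:
# it relies on HtmlDB, Record and config being importable in this module and
# on running inside the Django environment so Record.objects is available):
if __name__ == '__main__':
    collector = Collector()
    collector.run()                         # rebuild the Record table from the stored html
    print 'indexed pages:', collector.htmlnum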
class Reptile(threading.Thread):
    '''
    A single crawler thread (earlier revision, kept for reference).
    '''
    def __init__(self, name, urlQueue, urlist, Flock, homeUrls,
                 maxPageNums, pages, curSiteID=[0], continueRun=[True]):
        '''
        pages: counts the number of downloaded pages
        '''
        self.__name = name
        threading.Thread.__init__(self, name=name)
        # own data
        self.__pages = pages
        self.__homeUrls = homeUrls
        self.__urlist = urlist
        self.__urlQueue = urlQueue
        self.Flock = Flock
        self.__curSiteID = curSiteID    # shared [int]: site currently being crawled
        self.__temSiteID = -1
        self.__homeurl = None
        self.continueRun = continueRun
        # information reported back to the UserFrame ----
        # max number of pages to download per site
        self.__maxPageNums = maxPageNums
        # -------------------------------------------
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)

    def requestSource(self, path):
        # self.conn() (defined elsewhere in this revision) is expected to
        # return an httplib-style connection to the current site
        conn = self.conn()
        print '.. conn', conn
        conn.request("GET", path)
        r1 = conn.getresponse()
        data = r1.read()
        # the response status still needs to be checked here
        return data

    def getPage(self, path):
        print '>> path to load', path
        try:
            r = self.requestSource(path)
        except:
            r = None
        return r

    def run(self):
        while True:
            if not self.continueRun[0]:
                print self.__name, "stopped!"
                return
            # start from temSiteID
            print '.. temSiteID : ', self.__temSiteID
            pathinfo = self.__urlQueue.pop(self.__curSiteID[0])
            # pathinfo is (siteID, (title, url))
            print '.. get pathinfo', pathinfo
            if not pathinfo:
                # every queue is empty, so quit the thread
                print '.. get pathinfo empty'
                break
            self.__temHomeUrl = self.__homeUrls[self.__curSiteID[0]][1]
            try:
                htmlsource = self.getPage(pathinfo[1][1])
            except:
                print 'pathinfo bool'
                continue
            if not htmlsource:
                print 'htmlsource is wrong'
                continue
            print '.. get htmlsource len', len(htmlsource)
            # check that the source really is html
            if not self.htmlparser.init(htmlsource):
                print '.. source is not html'
                continue
            # push the extracted paths onto the queues
            pageStdUrl = self.urlparser.transToStdUrl(self.__temHomeUrl, pathinfo[1][1])
            self.addNewInQueue(pageStdUrl)
            # convert the source to xml and store it in the database
            print '.. start to save html'
            self.Flock.acquire()
            self.htmldb.saveHtml(self.__curSiteID[0], pathinfo[1][0], pageStdUrl, htmlsource)
            self.Flock.release()
        print '.. ', self.__name, 'quit!'

    def addNewInQueue(self, pageStdUrl):
        '''
        Extract the link list straight from the html source
        and append each link to its own inqueue.
        '''
        urlist = self.htmlparser.getLinks()
        print 'get urlist'
        for url in urlist:
            print url[0], url[1]
        for urlInfor in urlist:
            # urlInfor is [title, path]
            stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1])
            siteId = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1])
            if siteId != -1:
                # belongs to one of the crawled sites
                if not self.__urlist.find(stdUrl):
                    # not yet recorded in urlist
                    print '.. Add in Queue', stdUrl
                    self.Flock.acquire()
                    self.__urlQueue.append(siteId, (urlInfor[0], stdUrl))
                    self.Flock.release()
class ReptileLib(threading.Thread):
    '''
    Crawler thread pool (earlier revision, kept for reference).
    '''
    def __init__(self):
        '''
        Holds the data shared by all crawler threads.
        '''
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print "... init ReptileLib ..."
        # signal queues: the user interface controls the crawler through these
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()
        # whether the reptile threads keep running
        self.continueRun = [True]
        # whether the reptilelib main loop and server keep running at all
        self.reptileLibRun = [True]
        self.curSiteID = [0]
        # urlQueue and urlist are initialised here in the lib
        self.urlQueue = UrlQueue()
        self.urlist = Urlist()
        # these lists are shared by reference, so they are created once as []
        # and must never be rebound afterwards
        self.homeUrls = []
        self.pages = []
        self.maxPages = []
        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # run init thread
        self.runInit()

    def runInit(self):
        '''
        Start the control server and this thread.
        '''
        self.controlserver.start()
        self.start()

    def run(self):
        '''
        Main loop.  Every signal is a dict of the form {'type': type, ...}.
        '''
        print "... run while ..."
        while True:
            print '.. while ReptileLib running ..'
            signal = self.inSignalQueue.get()
            print 'get signal', signal
            _type = signal['type']
            print 'get type', _type
            if _type == 'init':
                # start a brand-new project
                print '.. init from empty project ..'
                self.init(
                    homeUrls=signal['homeurls'],
                    maxPages=signal['maxpages'],
                    threadNum=signal['reptilenum']
                )
            elif _type == 'resume':
                print '.. resume from database ..'
                self.reptilectrl.resume()
            elif _type == 'stop':
                print '.. stop ..'
                self.reptilectrl.stop()
            elif _type == 'halt':
                print '.. halt ..'
                self.reptilectrl.halt()
            elif _type == 'status':
                # status request: put the answer on the out queue
                print '.. status ..'
                self.reptilectrl.status()
            elif _type == 'start':
                # launch the reptile threads
                print '.. run reptile threads ..'
                print 'It works!'
                self.continueRun[0] = True
                self.initThreads()
                self.threadsRun()
        print 'ReptileLib core stopped!'
        print 'Reptile stopped'

    def init(self, homeUrls, maxPages, threadNum):
        '''
        Full initialisation for a first run.
        Note: because the lists are shared by reference, a repeated init
        must empty each [] in place and then refill it.
        '''
        def clearList(_List):
            if not _List:
                return
            _size = len(_List)
            for i in range(_size):
                _List.pop()

        def initList(_List, List):
            # first clear the list, then copy the new values in
            clearList(_List)
            for l in List:
                _List.append(l)

        initList(self.homeUrls, homeUrls)
        initList(self.maxPages, maxPages)
        self.threadNum = threadNum
        self.maxPages = maxPages  # note: rebinds the name; initList above already filled the shared list
        # self.pages counts the number of pages downloaded per site
        clearList(self.pages)
        for i in range(len(homeUrls)):
            self.pages.append(0)
        # init urlQueue
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        self.urlist.init(len(self.homeUrls))
        # store homeUrls
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        self.thlist = []
        # default: start from site 0
        self.curSiteID[0] = 0
        for i in range(self.threadNum):
            # the thread name could also carry the site prefix here
            th = Reptile(
                name="reptile%d" % i,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                curSiteID=self.curSiteID,
                continueRun=self.continueRun
            )
            self.thlist.append(th)

    def threadsRun(self):
        for th in self.thlist:
            th.start()
class UrlQueue:
    '''
    The url queue.
    '''
    def __init__(self):
        self.__siteNum = None
        self.sizes = []
        self.size = 0

    def init(self, homeUrls):
        '''
        homeUrls is a list of [title, url] pairs.
        '''
        self.homeUrls = homeUrls
        self.htmldb = HtmlDB(None)  # no htmlparser needed here
        self.clear()
        self.__siteNum = len(self.homeUrls)
        for i in range(self.__siteNum):
            self.sizes.append(0)

    def clear(self):
        '''
        Empty the whole urlqueue when a brand-new project starts.
        '''
        self.htmldb.clearUrlQueue()

    def append(self, siteID, toDocID, stdUrlInfo):
        '''
        stdUrlInfo = [title, url]
        toDocID: id of the page the entry belongs to
                 -1 for a normal page, >= 0 for a file attached to that page
        The url must already be an absolute address.
        '''
        self.htmldb.saveUrlQueue(stdUrlInfo, siteID, toDocID)
        self.size += 1
        self.sizes[siteID] += 1

    def initFrontPage(self):
        '''
        Put each homeUrl into the queue as the front page, so by default the
        crawler downloads the home page of every site first.
        '''
        for i, url in enumerate(self.homeUrls):
            self.append(i, -1, url)
        '''
        # single-site test data
        for i in range(8):
            self.append(0, -1, ['信电学院', "http://www.ciee.cn/ciee/"])
        homeurls = [
            ['今日新闻', 'http://news.cau.edu.cn/list.php?mid=1'],
            ['媒体农大', 'http://news.cau.edu.cn/list.php?mid=4'],
            ['推荐新闻', 'http://news.cau.edu.cn/list.php?mid=3'],
            ['农大科技', 'http://news.cau.edu.cn/list.php?lid=3'],
        ]
        '''

    def pop(self):
        '''
        Pop the next cached url.  If the queue is empty, wait a moment
        (mimicking Queue.get) and then check once more before giving up.
        '''
        if not self.size:
            time.sleep(3)
        if self.size > 0:
            url = self.htmldb.getCacheUrl()
            self.sizes[url.siteID] -= 1
            self.size -= 1
            return url
        else:
            return None

    def show(self):
        # note: the helpers below still refer to the old in-memory self.__queue
        for i, qu in enumerate(self.__queue):
            print 'the %dth queue len is %d' % (i, qu.qsize())
            for u in qu:
                print u

    def getNums(self):
        '''
        Return the length of every queue.
        '''
        nums = []
        for q in self.__queue:
            nums.append(q.qsize())
        return nums

    def getAll(self):
        return self.__queue

    def resume(self, homeurls, queues):
        '''
        queues = [ [ [title, path], ], ]
        '''
        _size = len(queues)
        self.init(homeurls)
        for i, queue in enumerate(queues):
            for u in queue:
                self.__queue[i].put(u)
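# Rough usage sketch for UrlQueue (assumes the HtmlDB persistence layer is
# available; the sample homeUrls value below is illustrative only):
queue = UrlQueue()
queue.init([['今日新闻', 'http://news.cau.edu.cn/list.php?mid=1']])
queue.initFrontPage()          # seed the queue with each site's home page
entry = queue.pop()            # cached-url object exposing .siteID, .title, .url, .toDocID
if entry is not None:
    # re-queue another page of the same site; -1 marks an ordinary page
    queue.append(entry.siteID, -1, [entry.title, entry.url])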
class ReptileCtrl:
    '''
    Crawler control.  Runs as a control thread and takes orders
    from the user interface.
    '''
    def __init__(self, homeUrls, continueRun, urlist, urlQueue,
                 maxPages, pages, imagenum, outSignalQueue):
        '''
        Data this controller needs to hold:
        homeUrls, urlist, urlqueue, pages, maxpages
        '''
        self.homeUrls = homeUrls
        # whether the reptile threads keep running
        self.continueRun = continueRun
        self.urlist = urlist
        self.urlQueue = urlQueue
        self.pages = pages
        self.imagenum = imagenum
        self.maxPages = maxPages
        # queue for messages sent back to the control program
        self.outSignalQueue = outSignalQueue
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)

    def stop(self):
        '''
        Stop immediately.
        '''
        print '.. stop ..'
        self.continueRun[0] = False

    def halt(self):
        '''
        Halt and persist: urlist, urlqueue, pages, maxpages.
        '''
        print '.. halt ..'
        self.continueRun[0] = False
        # give the threads a moment to finish; halt_wait_time is a module-level setting
        time.sleep(halt_wait_time)
        self.htmldb.saveHomeUrls(self.homeUrls, self.maxPages, self.pages)
        # save the urlist
        urlist = self.urlist.getAll()
        self.htmldb.saveList(urlist)
        # save the urlqueue
        urlqueue = self.urlQueue.getAll()
        self.htmldb.saveQueue(urlqueue)

    def resume(self):
        '''
        Resume from the database: re-initialise urlqueue and urlist.
        '''
        def clearList(_List):
            _size = len(_List)
            for i in range(_size):
                _List.pop()

        def resumeHomeurl(homeurls):
            '''
            homeurls = [ [title, url], ]
            '''
            # empty the shared list first
            clearList(self.homeUrls)
            for u in homeurls:
                self.homeUrls.append(u)

        def resumePages(localpages, pages):
            '''
            Resume pages or maxpages, e.g. maxpages = [1, 2, 3]
            '''
            clearList(localpages)
            for i in pages:
                localpages.append(i)

        status = self.htmldb.getStatus()
        _homeurls = status['homeurls']
        resumeHomeurl(_homeurls)
        # resume maxpages
        resumePages(self.maxPages, status['maxpages'])
        # resume pages
        resumePages(self.pages, status['pages'])
        # resume urlist
        self.urlist.resume(status['urlist'])
        # resume urlqueue
        self.urlQueue.resume(status['homeurls'], status['urlqueue'])

    def status(self):
        '''
        Put the current status on the out queue.
        '''
        signal = {
            'type': 'status',
            'pages': self.pages,
            'imagenum': self.imagenum[0],
            'queue_num': self.urlQueue.sizes,
            'list_num': self.urlist.getNum(),
        }
        print signal
        self.outSignalQueue.put(signal)
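# Sketch of how a UI thread might poll the controller's status reply
# (assumes reptilectrl and outSignalQueue were wired up as in ReptileLib;
# the values shown in the comment are illustrative):
reptilectrl.status()
reply = outSignalQueue.get()   # e.g. {'type': 'status', 'pages': [12, 3],
                               #       'imagenum': 5, 'queue_num': [40, 7], 'list_num': 62}
print 'downloaded pages per site:', reply['pages']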
class QueryFrame:
    def __init__(self):
        self.htmldb = HtmlDB()
        self.titles = self.htmldb.get_all_titles()

    def index(self, request):
        Get = request.GET
        t = get_template('query/index.html')
        # which search mode is highlighted: [web, image, file]
        hi = ['', '', '']
        if 'type' in Get:
            _type = Get['type']
            if Get['type'] == 'file':
                hi[2] = 'hi'
            elif Get['type'] == 'image':
                hi[1] = 'hi'
            else:
                hi[0] = 'hi'
        else:
            hi[0] = 'hi'
            _type = 'web'
        # site selection
        if 'site' in request.GET:
            # here site runs 0, 1, 2, 3 ...
            # 0 means the whole index, 1 means siteID = 0, and so on
            site = int(request.GET['site'])
            if site == 0:
                title = "内网"
            else:
                title = self.htmldb.get_title(site)
        else:
            # default: search the whole index
            site = 0
            title = '内网全文'
        titles = self.htmldb.get_titles()
        html = t.render(Context({'site': site, 'title': title, 'titles': titles,
                                 'type': _type, 'hi': hi}))
        return HttpResponse(html)

    def site_ctrl(self, request):
        t = get_template('query/site_ctrl.html')
        html = t.render(Context({}))
        return HttpResponse(html)

    def more_sites(self, request):
        '''
        Show more sites.
        '''
        t = get_template('query/more.html')
        titles = self.htmldb.get_all_titles()
        html = t.render(Context({'titles': titles}))
        return HttpResponse(html)

    def search(self, request):
        '''
        Main query view.
        '''
        Get = request.GET
        if 'query_text' in request.GET:
            text = request.GET['query_text']
            print '.. search text', text
        if 'page' in request.GET:
            page = int(request.GET['page'])
        else:
            # pages start from 1
            page = 1
        if 'site' in request.GET:
            siteID = int(request.GET['site'])
        else:
            siteID = 0
        if 'type' in Get:
            if Get['type'] == 'image':
                # image query
                t = get_template('query/search_image.html')
                res = queryer.searchImages(text, siteID, page)
                print res
            elif Get['type'] == 'file':
                t = get_template('query/search_file.html')
                res = queryer.searchFiles(text, siteID, page)
            else:
                t = get_template('query/search.html')
                res = queryer.searchText(text, siteID, page)
        else:
            t = get_template('query/search.html')
            res = queryer.searchText(text, siteID, page)
        if siteID != 0:
            res['title'] = self.titles[siteID - 1]
        elif siteID == 7:
            # note: unreachable as written, because siteID != 0 already matches 7
            res['title'] = '农学院'
        else:
            res['title'] = "全域"
        html = t.render(Context(res))
        return HttpResponse(html)
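# A possible urls.py wiring for the views above.  This is a sketch only: the
# project's real URL configuration is not shown here, the instance name is an
# assumption, and the import path varies with the (old, Python 2) Django version.
from django.conf.urls.defaults import patterns, url

query_frame = QueryFrame()

urlpatterns = patterns('',
    url(r'^$', query_frame.index),
    url(r'^search/$', query_frame.search),
    url(r'^more/$', query_frame.more_sites),
    url(r'^site_ctrl/$', query_frame.site_ctrl),
)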
# stdlib modules used by this crawler thread; the project-level classes
# (UrlParser, HtmlParser, HtmlDB, ImageParser, TextFileParser) are imported
# from elsewhere in the project
import threading
import urllib2
import gzip
import StringIO

class Reptile(threading.Thread):
    """
    A single crawler thread.
    """
    def __init__(self, name, urlQueue, urlist, Flock, homeUrls,
                 maxPageNums, pages, imagenum, continueRun=[True]):
        """
        pages: counts the number of downloaded pages per site
        """
        self.__name = name
        threading.Thread.__init__(self, name=name)
        # own data
        self.__homeUrls = homeUrls
        self.__urlist = urlist
        self.__urlQueue = urlQueue
        self.Flock = Flock
        self.__curSiteID = [0]  # curSiteID
        self.__temSiteID = -1
        self.__homeurl = None
        self.__pageinfo = None
        self.continueRun = continueRun
        # information reported back to the UserFrame ----
        # per-site page limits and number of downloaded pages
        self.__maxPageNums = maxPageNums
        self.pages = pages
        self.imagenum = imagenum
        # -------------------------------------------
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)
        self.imageparser = ImageParser(name)
        self.textfileparser = TextFileParser()

    def requestSource(self, url):
        request = urllib2.Request(url)
        request.add_header("Accept-encoding", "gzip")
        try:
            page = self.opener.open(request, timeout=2)  # 2 second timeout
            if page.code == 200:
                predata = page.read()
                page.close()
                pdata = StringIO.StringIO(predata)
                gzipper = gzip.GzipFile(fileobj=pdata)
                try:
                    data = gzipper.read()
                except IOError:
                    # the response was not gzip-compressed after all
                    data = predata
                length = len(data)
                if length < 300 or length > 3000000:
                    return False
                # begin to parse the page
                return data
        except:
            print "time out"

    def underPageLimit(self):
        """
        Check whether the current site is still below its page limit.
        """
        _type = self.urlparser.typeDetect(self.__pathinfo.url)[0]
        # an empty type means an html page; 'image'/'doc' mean files
        if _type:
            # images and other files are not counted
            return True
        if self.pages[self.__temSiteID] >= self.__maxPageNums[self.__temSiteID]:
            return False
        return True

    def run(self):
        """
        Main loop.
        """
        self.opener = urllib2.build_opener()
        while self.continueRun[0]:
            try:
                self.Flock.acquire()
                self.__pathinfo = self.__urlQueue.pop()
                self.Flock.release()
            except:
                print "nothing in urlqueue"
                print "dropped"
                return
            if not self.__pathinfo:
                # every queue is empty, so quit the thread
                print ".. get pathinfo empty"
                break
            print ".. get pathinfo", self.__pathinfo.url, self.__name
            self.__temSiteID = self.__pathinfo.siteID
            self.__temHomeUrl = self.__homeUrls[self.__temSiteID]
            # skip the page if the site already hit its page limit
            if not self.underPageLimit():
                continue
            source = self.requestSource(self.__pathinfo.url)
            if not source:
                print "htmlsource is empty"
                continue
            filetype = self.urlparser.typeDetect(self.__pathinfo.url)
            _type = filetype[0]
            print ".. get file type", filetype, self.__name
            if not _type:
                self.dealHtml(source)
            elif _type == "image":
                self.dealImage(source, filetype[1])
                print "self.imagenum", self.imagenum
                self.imagenum[0] += 1
            elif _type == "doc":
                self.dealDoc()
                self.imagenum[0] += 1
            else:
                print "some unknown type..."

    def dealHtml(self, source):
        """
        Full handling of an html file, from parsing to storage.
        """
        print ".. get source len", len(source)
        # anything this short is treated as useless
        if len(source) < 300:
            return
        # check that the source really is html
        if not self.htmlparser.init(source):
            print ".. source is not html"
            return
        # urls taken from the urlqueue are already absolute
        self.pages[self.__temSiteID] += 1
        # collect the link and src lists
        urlist = self.htmlparser.getLinks()
        urlist += self.htmlparser.getSrcs()
        # save the html
        self.Flock.acquire()
        docID = self.htmldb.saveHtml(self.__pathinfo.siteID, self.__pathinfo.title,
                                     self.__pathinfo.url, source)
        self.Flock.release()
        self.addNewInQueue(docID, self.__pathinfo.url, urlist)

    def dealImage(self, source, extention):
        """
        Full handling of an image file, from parsing to storage.
        """
        try:
            self.imageparser.deal(source, extention, self.__pathinfo.url,
                                  self.__pathinfo.toDocID)
        except:
            return

    def dealDoc(self):
        """
        Full handling of a doc file, from parsing to storage.
        """
        self.textfileparser.deal(self.__pathinfo.title, self.__pathinfo.url,
                                 self.__pathinfo.toDocID)

    def addNewInQueue(self, docID, pageStdUrl, urlist):
        """
        Push the paths extracted from the html source onto their queues.
        docID:  id of the page that was just stored
        urlist: mixed list of html links and file addresses
        """
        # images are handled together with pages: they are also made absolute
        # and checked against the duplicate list
        for urlInfor in urlist:
            # urlInfor is [title, path]
            stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1])
            siteID = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1])
            _type = self.urlparser.typeDetect(stdUrl)[0]
            if siteID != -1:
                # belongs to one of the crawled sites
                if not self.__urlist.find(stdUrl):
                    # not yet recorded in urlist
                    print ".. Add in Queue", stdUrl, _type
                    if not _type:
                        # ordinary page
                        self.Flock.acquire()
                        # siteID, toDocID, urlinfo
                        self.__urlQueue.append(siteID, -1, (urlInfor[0], stdUrl))
                        self.Flock.release()
                    else:
                        # image or other file
                        self.Flock.acquire()
                        # siteID, toDocID, urlinfo
                        self.__urlQueue.append(siteID, docID, (urlInfor[0], stdUrl))
                        self.Flock.release()
# stdlib modules used by the thread pool; ControlServer, ReptileCtrl, UrlQueue,
# Urlist, HtmlDB and Reptile come from elsewhere in the project
import threading
import Queue as Q

class ReptileLib(threading.Thread):
    """
    Crawler thread pool.
    """
    def __init__(self):
        """
        Holds the data shared by all crawler threads.
        """
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print "... init ReptileLib ..."
        # signal queues: the user interface controls the crawler through these
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()
        # whether the reptile threads keep running
        self.continueRun = [False]
        # whether the reptilelib main loop and server keep running at all
        self.reptileLibRun = [True]
        # urlQueue and urlist are initialised here in the lib
        self.urlQueue = UrlQueue()
        self.urlist = Urlist()
        # these lists are shared by reference, so they are created once as []
        # and must never be rebound afterwards
        self.homeUrls = []
        self.pages = []
        self.imagenum = []
        self.imagenum.append(0)
        print "-" * 50
        print ".. init self.imagenum", self.imagenum, type(self.imagenum)
        print "-" * 50
        self.maxPages = []
        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            imagenum=self.imagenum,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # run init thread
        self.runInit()

    def runInit(self):
        """
        Start the control server and this thread.
        """
        self.controlserver.start()
        self.start()

    def run(self):
        """
        Main loop.  Every signal is a dict of the form {'type': type, ...}.
        """
        print "... run while ..."
        while True:
            print ".. while ReptileLib running .."
            signal = self.inSignalQueue.get()
            print "get signal", signal
            _type = signal["type"]
            print "get type", _type
            if _type == "init":
                # start a brand-new project
                print ".. init from empty project .."
                self.init(homeUrls=signal["homeurls"],
                          maxPages=signal["maxpages"],
                          threadNum=signal["reptilenum"])
            elif _type == "resume":
                print ".. resume from database .."
                self.reptilectrl.resume()
            elif _type == "stop":
                print ".. stop .."
                self.reptilectrl.stop()
            elif _type == "halt":
                print ".. halt .."
                self.reptilectrl.halt()
            elif _type == "status":
                # status request: put the answer on the out queue
                print ".. status .."
                self.reptilectrl.status()
            elif _type == "start":
                # launch the reptile threads (only if not already running)
                print ".. run reptile threads .."
                print "It works!"
                if not self.continueRun[0]:
                    self.continueRun[0] = True
                    self.initThreads()
                    self.threadsRun()
        print "ReptileLib core stopped!"
        print "Reptile stopped"

    def init(self, homeUrls, maxPages, threadNum):
        """
        Full initialisation for a first run.
        Note: because the lists are shared by reference, a repeated init
        must empty each [] in place and then refill it.
        """
        def clearList(_List):
            if not _List:
                return
            _size = len(_List)
            for i in range(_size):
                _List.pop()

        def initList(_List, List):
            # first clear the list, then copy the new values in
            clearList(_List)
            for l in List:
                print l
                _List.append(l)

        print ".. init homeUrls"
        initList(self.homeUrls, homeUrls)
        initList(self.maxPages, maxPages)
        self.threadNum = threadNum
        self.maxPages = maxPages  # note: rebinds the name; initList above already filled the shared list
        print ".. init maxPages:", self.maxPages
        print ".. init pages", self.pages
        # self.pages counts the number of pages downloaded per site
        clearList(self.pages)
        for i in range(len(homeUrls)):
            self.pages.append(0)
        # init urlQueue
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        # self.urlist.init(len(self.homeUrls))
        # store homeUrls
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        self.thlist = []
        # default: start from site 0
        print "$" * 50
        print "init thread imagenum", self.imagenum, type(self.imagenum)
        print "$" * 50
        for i in range(self.threadNum):
            # the thread name could also carry the site prefix here
            th = Reptile(
                name="reptile%d" % i,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                imagenum=self.imagenum,
                continueRun=self.continueRun,
            )
            self.thlist.append(th)

    def threadsRun(self):
        for th in self.thlist:
            th.start()
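# Rough sketch of driving the library through its signal queue (the homeurls,
# maxpages and reptilenum values below are illustrative, not from the project):
reptilelib = ReptileLib()              # also starts the ControlServer and main loop
reptilelib.inSignalQueue.put({
    'type': 'init',
    'homeurls': [['今日新闻', 'http://news.cau.edu.cn/list.php?mid=1']],
    'maxpages': [100],
    'reptilenum': 4,
})
reptilelib.inSignalQueue.put({'type': 'start'})   # spawn the Reptile threads
reptilelib.inSignalQueue.put({'type': 'status'})  # reply appears on outSignalQueue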