class UrlQueue:
    '''
    Multi-site URL queue whose entries are persisted through HtmlDB.

    The queue contents live in the HtmlDB backing store; this object only
    keeps counters describing how many URLs are pending:

    * ``size``  - total number of pending URLs
    * ``sizes`` - number of pending URLs per site, indexed by siteID
    '''

    # Seconds pop() waits for producers when the queue looks empty.
    EMPTY_WAIT_SECONDS = 3

    def __init__(self):
        self.__siteNum = None
        self.sizes = []
        self.size = 0

    def init(self, homeUrls):
        '''
        Start a fresh queue for the given sites.

        homeUrls is a list of [title, url] pairs, one per site; the index
        of a pair in the list is used as that site's siteID.
        '''
        self.homeUrls = homeUrls
        self.htmldb = HtmlDB(None)
        self.clear()
        self.__siteNum = len(self.homeUrls)
        # Rebuild the per-site counters instead of appending to the old
        # list, so that calling init() again does not leave stale entries.
        self.sizes = [0] * self.__siteNum
        self.size = 0

    def clear(self):
        '''
        Empty the whole url queue; used when starting a brand-new project.
        '''
        self.htmldb.clearUrlQueue()
        # Keep the counters consistent with the now-empty backing store.
        self.size = 0
        self.sizes = [0] * len(self.sizes)

    def append(self, siteID, toDocID, stdUrlInfo):
        '''
        Enqueue one URL.

        siteID:     index of the site the URL belongs to
        toDocID:    id of the page this entry is attached to;
                    -1 for a normal page, > 0 for a file
        stdUrlInfo: [title, url]; url must be an absolute address
        '''
        self.htmldb.saveUrlQueue(stdUrlInfo, siteID, toDocID)
        self.size += 1
        self.sizes[siteID] += 1

    def initFrontPage(self):
        '''
        Enqueue every home url as the front page of its site, so the
        crawler downloads the home pages first.
        '''
        for siteID, homeUrl in enumerate(self.homeUrls):
            self.append(siteID, -1, homeUrl)

    def pop(self):
        '''
        Take the next URL record from the backing store.

        When the queue looks empty, wait EMPTY_WAIT_SECONDS once to give
        producers a chance to add URLs, then check again.

        Returns the record obtained from HtmlDB.getCacheUrl(), or None
        when nothing is available.
        '''
        if not self.size:
            # Local import: the module-level import block is not visible
            # from this class, and the original call (thread.sleep) does
            # not exist.
            import time
            time.sleep(self.EMPTY_WAIT_SECONDS)
        if self.size <= 0:
            return None
        url = self.htmldb.getCacheUrl()
        if url is None:
            # NOTE(review): assumes getCacheUrl() may return None when the
            # store is empty - confirm against HtmlDB.
            return None
        # Both counters must shrink, otherwise the queue never looks empty.
        self.size -= 1
        self.sizes[url.siteID] -= 1
        return url

    def show(self):
        '''
        Print the number of pending URLs of every site.
        '''
        for siteID, num in enumerate(self.sizes):
            print('the %dth queue len is %d' % (siteID, num))

    def getNums(self):
        '''
        Return a list with the number of pending URLs of each site.
        '''
        return list(self.sizes)

    def getAll(self):
        '''
        Not available: the queued URLs are stored in HtmlDB and there is no
        in-memory copy to return.
        '''
        raise NotImplementedError(
            'UrlQueue keeps its URLs in HtmlDB; no in-memory queues exist')

    def resume(self, homeurls, queues):
        '''
        Re-create the queue from a saved state.

        homeurls: list of [title, url] home pages, one per site
        queues:   one list per site, each holding [title, path] entries:
                  [ [ [title, path], ... ], ... ]
        '''
        self.init(homeurls)
        for siteID, queue in enumerate(queues):
            for urlInfo in queue:
                # NOTE(review): saved entries carry no toDocID, so they are
                # re-queued as normal pages (-1) - confirm with callers.
                self.append(siteID, -1, urlInfo)