class Reptile(threading.Thread):
    """A single crawler thread."""

    def __init__(self, name, urlQueue, urlist, Flock, homeUrls,
                 maxPageNums, pages, imagenum, continueRun=[True]):
        """
        pages:       per-site counters of downloaded pages
        continueRun: shared one-element flag; the thread keeps running
                     while continueRun[0] is True
        """
        self.__name = name
        threading.Thread.__init__(self, name=name)
        # own data
        self.__homeUrls = homeUrls
        self.__urlist = urlist
        self.__urlQueue = urlQueue
        self.Flock = Flock
        self.__curSiteID = [0]  # curSiteID
        self.__temSiteID = -1
        self.__homeurl = None
        self.__pageinfo = None
        self.continueRun = continueRun
        # some information to send to UserFrame ----
        # per-site limits on how many pages to collect
        self.__maxPageNums = maxPageNums
        # per-site counters of downloaded pages
        self.pages = pages
        self.imagenum = imagenum
        # -------------------------------------------
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)
        self.imageparser = ImageParser(name)
        self.textfileparser = TextFileParser()

    def requestSource(self, url):
        """Download url and return its body, or a false value on failure."""
        request = urllib2.Request(url)
        request.add_header("Accept-encoding", "gzip")
        try:
            page = self.opener.open(request, timeout=2)  # 2-second timeout
            try:
                if page.code != 200:
                    return False
                predata = page.read()
                pdata = StringIO.StringIO(predata)
                gzipper = gzip.GzipFile(fileobj=pdata)
                try:
                    data = gzipper.read()
                except IOError:
                    # the response was not actually gzip-compressed
                    data = predata
                length = len(data)
                if length < 300 or length > 3000000:
                    return False
                # hand the page back for parsing
                return data
            finally:
                page.close()
        except Exception:
            print "time out"

    def underPageLimit(self):
        """Return True while the current site is still under its page limit."""
        _type = self.urlparser.typeDetect(self.__pathinfo.url)[0]
        # an empty type means an HTML page; "image"/"doc" mean files
        if _type:
            # images and other files do not count toward the limit
            return True
        if self.pages[self.__temSiteID] >= self.__maxPageNums[self.__temSiteID]:
            return False
        return True

    def run(self):
        """Main loop of the crawler thread."""
        self.opener = urllib2.build_opener()
        while self.continueRun[0]:
            self.Flock.acquire()
            try:
                self.__pathinfo = self.__urlQueue.pop()
            except Exception:
                print "nothing in urlqueue"
                print "dropped"
                return
            finally:
                self.Flock.release()
            # pathinfo carries (siteID, (title, path))
            if not self.__pathinfo:
                # every queue is empty, so leave the thread
                print ".. get pathinfo empty"
                break
            print ".. get pathinfo", self.__pathinfo.url, self.__name
            self.__temSiteID = self.__pathinfo.siteID
            self.__temHomeUrl = self.__homeUrls[self.__temSiteID]
            # skip the URL if its site has reached the page limit
            if not self.underPageLimit():
                continue
            source = self.requestSource(self.__pathinfo.url)
            if not source:
                print "htmlsource is empty"
                continue
            filetype = self.urlparser.typeDetect(self.__pathinfo.url)
            _type = filetype[0]
            print ".. get file type", filetype, self.__name
            if not _type:
                # an HTML page: parse it and store it in the database
                self.dealHtml(source)
            elif _type == "image":
                self.dealImage(source, filetype[1])
                print "self.imagenum", self.imagenum
                self.imagenum[0] += 1
            elif _type == "doc":
                self.dealDoc()
                self.imagenum[0] += 1
            else:
                print "some unknown type..."
    def dealHtml(self, source):
        """Handle an HTML file all the way from parsing to storage."""
        print ".. get source len", len(source)
        # treat very short sources as invalid
        if len(source) < 300:
            return
        # make sure the source really is HTML
        if not self.htmlparser.init(source):
            print ".. source is not html"
            return
        # start processing; URLs taken from the queue are already absolute
        self.pages[self.__temSiteID] += 1
        # collect the link and src lists
        urlist = self.htmlparser.getLinks()
        urlist += self.htmlparser.getSrcs()
        # save the html
        self.Flock.acquire()
        docID = self.htmldb.saveHtml(self.__pathinfo.siteID,
                                     self.__pathinfo.title,
                                     self.__pathinfo.url,
                                     source)
        self.Flock.release()
        self.addNewInQueue(docID, self.__pathinfo.url, urlist)

    def dealImage(self, source, extention):
        """Handle an image file all the way from parsing to storage."""
        try:
            self.imageparser.deal(source, extention,
                                  self.__pathinfo.url, self.__pathinfo.toDocID)
        except Exception:
            return

    def dealDoc(self):
        """Handle a document file all the way from parsing to storage."""
        self.textfileparser.deal(self.__pathinfo.title,
                                 self.__pathinfo.url,
                                 self.__pathinfo.toDocID)

    def addNewInQueue(self, docID, pageStdUrl, urlist):
        """
        Extract the path list straight from the html source and append each
        entry to its site's in-queue.

        docID:  id under which the page has just been stored
        urlist: mixed list of html and file addresses, as [title, path] pairs
        """
        # images are handled together with pages: they also need to be
        # absolutized and checked for duplicates
        for urlInfor in urlist:  # [title, path]
            stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1])
            siteID = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1])
            _type = self.urlparser.typeDetect(stdUrl)[0]
            if siteID != -1:
                # the url belongs to one of the monitored sites
                if not self.__urlist.find(stdUrl):
                    # not yet recorded in urlist
                    print ".. Add in Queue", stdUrl, _type
                    if not _type:
                        # an ordinary web page
                        self.Flock.acquire()
                        # siteID, toDocID, urlinfo
                        self.__urlQueue.append(siteID, -1, (urlInfor[0], stdUrl))
                        self.Flock.release()
                    else:
                        # an image or another file type
                        self.Flock.acquire()
                        # siteID, toDocID, urlinfo
                        self.__urlQueue.append(siteID, docID, (urlInfor[0], stdUrl))
                        self.Flock.release()
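

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module).  It shows
# how a pool of Reptile threads could be wired to the shared state the
# constructor expects.  The queue and visited-list objects, their constructor
# names (UrlQueue, UrlList) and how they are seeded are assumptions here; in
# the real project they are created elsewhere and passed in.
#
#   homeUrls    = ["http://example.com"]      # one entry per monitored site
#   maxPageNums = [100]                       # per-site page limit
#   pages       = [0]                         # per-site page counters
#   imagenum    = [0]                         # shared image counter
#   continueRun = [True]                      # set continueRun[0] = False to stop
#   Flock       = threading.RLock()
#   urlist      = UrlList()                   # assumed project class
#   urlQueue    = UrlQueue(homeUrls)          # assumed project class, pre-seeded
#
#   spiders = [Reptile("reptile-%d" % i, urlQueue, urlist, Flock, homeUrls,
#                      maxPageNums, pages, imagenum, continueRun)
#              for i in range(4)]
#   for spider in spiders:
#       spider.start()
#   for spider in spiders:
#       spider.join()
#
# One-element lists are used instead of plain ints and booleans so that every
# thread (and any monitoring UI) reads and updates the same shared values.
# ---------------------------------------------------------------------------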