import lxml.html

# PageFilter, UrlFilter, HtmlModel, UrlModel, timestamp, logger and the
# thread pool's WorkRequest are assumed to be imported elsewhere in this module.


class Parser(WorkRequest):
    '''Inherits from the thread pool's WorkRequest class and implements the
    worker function.
    Responsibilities:
        Filter html pages, decide whether each page qualifies for storage,
        and put qualifying pages onto the data queue.
        Parse html pages, filter out qualifying urls, and put them onto the
        url queue.
    '''

    def __init__(self, depth, startUrls, keyword, htmlQueue, dataQueue,
                 urlQueue, exitEvent):
        self.__htmlQueue = htmlQueue
        self.__dataQueue = dataQueue
        self.__urlQueue = urlQueue
        self.__keyword = keyword
        self.__depth = depth
        self.__startUrls = startUrls
        self.__exitEvent = exitEvent
        # pageFilter decides whether a page should be stored
        self.__myPageFilter = PageFilter(keyword)
        # urlFilter decides whether a url should be downloaded
        self.__myUrlFilter = UrlFilter(self.__startUrls)

    def getRepeatSetSize(self):
        return self.__myUrlFilter.getRepeatSetSize()

    def __parsePage(self):
        '''Core routine of the parsing module.'''
        htmlNode = self.__htmlQueue.get()

        # Page filtering: decide whether this page should be stored
        if self.__myPageFilter.isGood(htmlNode.html):
            dataNode = HtmlModel(htmlNode.url, '', htmlNode.time, htmlNode.depth)
            self.__dataQueue.put(dataNode)

        # Depth control: stop following links once the crawl depth is reached
        if htmlNode.depth >= self.__depth:
            return

        linkList = []
        try:
            # Extract all links from the html page using lxml
            doc = lxml.html.document_fromstring(htmlNode.html)
            doc.make_links_absolute(htmlNode.url)
            links = doc.iterlinks()
            for link in links:
                linkList.append(link[2])
        except Exception as e:
            logger.warning('Parse page exception: %s', str(e))
            return

        if len(linkList) == 0:
            logger.warning('Parse page success, but link is null: %s', htmlNode.url)
            return

        # Url filtering: drop duplicates, unwanted suffixes and off-site links
        linkList = self.__myUrlFilter.urlfilter(linkList)

        # Put the qualifying urls back onto the url queue
        for url in linkList:
            urlNode = UrlModel(url, htmlNode.url, timestamp(), htmlNode.depth + 1)
            self.__urlQueue.put(urlNode)
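# --- Usage sketch (not part of the original module) ---
# A minimal wiring example, assuming the standard-library Queue and
# threading.Event satisfy the queue/event interfaces Parser expects;
# the depth, keyword and start url values are placeholders.
from Queue import Queue
from threading import Event

htmlQueue, dataQueue, urlQueue = Queue(), Queue(), Queue()
exitEvent = Event()

parser = Parser(depth=2, startUrls=['http://example.com/'], keyword='python',
                htmlQueue=htmlQueue, dataQueue=dataQueue, urlQueue=urlQueue,
                exitEvent=exitEvent)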
from downLoadPage import DownLoadPage
from urlFilter import UrlFilter
from urlparse import urljoin, urlparse
from bs4 import BeautifulSoup
import time

urlfilter = UrlFilter()


class GetLinks(object):

    def __init__(self, html, currentUrl):
        self.html = html
        self.links = []
        self.currentUrl = currentUrl
        # Name the parser explicitly to avoid bs4's "no parser specified" warning
        self.soup = BeautifulSoup(html, 'html.parser')

    def getLinks(self, originalUrl, get='get'):
        if get == 'get':
            # GET-style pages: collect the href attribute of every <a> tag
            results = self.soup.find_all('a', href=True)
            for i in results:
                href = i.get('href')
                # Resolve relative links against the current page's url
                if not href.startswith('http'):
                    href = urljoin(self.currentUrl, href)
                if (urlfilter.judgeUrlFormat(href, originalUrl)
                        and urlfilter.filterSameLink(href)       # drop duplicates
                        and urlfilter.filterSimilarLink(href)):  # drop similar urls
                    self.links.append(href)
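# --- Usage sketch (not part of the original module) ---
# Extracts absolute, filtered links from a hard-coded page; the html string
# and urls below are placeholders, and the exact output depends on how the
# project's UrlFilter is configured.
if __name__ == '__main__':
    html = '<html><body><a href="/about.html">About</a></body></html>'
    getter = GetLinks(html, 'http://example.com/index.html')
    getter.getLinks('http://example.com/')
    print getter.links  # e.g. ['http://example.com/about.html']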