def __init__(self, start_urls, domains=None, regexRules=None, downloader=None,
             scheduler=None, saver=None, pageProcessor=None):
    super(Spider, self).__init__()
    self.start_urls = start_urls
    self.domains = domains
    self.regexRules = regexRules
    self.downloader = downloader or SyncDownloader()
    self.scheduler = scheduler or redisScheduler()
    # By default, print results to the console
    self.saver = saver or ConsoleSaver()
    self.pageProcessor = pageProcessor or Processor()
    # Prefix (normalize) each start URL
    for url_index in range(len(self.start_urls)):
        self.start_urls[url_index] = self.__fix_urls(self.start_urls[url_index])
    # Seed the scheduler's initial URL collection with every valid start URL
    for url in self.start_urls:
        if self.__judge_urls(url):
            self.scheduler.pushUrl(pUrl(url), 0)
    self.config = Config()
    # Worker threads owned by this Spider, keyed by thread name
    self.threads = {}
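# Usage sketch (not from this excerpt; Spider's public entry point is not
# shown, so only construction is illustrated). Every omitted component
# falls back to its default: SyncDownloader, redisScheduler, ConsoleSaver,
# and Processor.
#
#   spider = Spider(
#       start_urls=['http://news.nwsuaf.edu.cn/'],
#       domains=['news.nwsuaf.edu.cn'],
#   )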
def __run(self):
    while True:
        # Check the current thread's status flag
        thread = self.threads[threading.currentThread().getName()]
        if thread['status'] == 'Stopping':
            # Finish the current thread
            break
        elif thread['status'] == 'Suspending':
            # Suspend the current thread; sleep briefly instead of busy-spinning
            time.sleep(1)
            continue
        URL, _ = self.scheduler.popUrl()
        if URL is None and threading.activeCount() == 2:
            # Only the main thread and the monitor thread remain, so stop
            break
        elif URL is None:
            # Queue is temporarily empty; wait one second and retry
            time.sleep(1)
        else:
            # print(URL.getUrl)
            # step 1: download the page
            document = self.downloader.download(URL)
            if document:
                # step 2: extract the URLs found in the document
                urls, document = document.parserLinks()
                # step 3: filter and schedule the URLs from step 2
                for url in urls:
                    if self.regexRules is None:
                        if self.__judge_urls(url.getUrl):
                            self.scheduler.pushUrl(pUrl(self.__fix_urls(url.getUrl)), 0)
                    elif self.regexRules.isMatched(url.getUrl) and self.__judge_urls(url.getUrl):
                        self.scheduler.pushUrl(pUrl(self.__fix_urls(url.getUrl)), 0)
                # step 4: run the page processor over the document
                document = self.pageProcessor.pageParser(document)
                # step 5: save the items extracted from the document
                self.saver.save(document.getItems())
                document.clear()
                self.scheduler.done()
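# Sketch (assumption, not from the source): a method one might add to
# Spider to launch worker threads matching __run's protocol. The 'status'
# field and per-name bookkeeping mirror the self.threads lookup above.
import threading

def start_workers(self, count=4):
    for i in range(count):
        t = threading.Thread(target=self.__run, name='Worker-%d' % i)
        # Register the thread before starting it so __run finds its entry
        self.threads[t.getName()] = {'status': 'Running', 'thread': t}
        t.start()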
def popUrl(self):
    """Pop a URL from the redis queue."""
    url = None
    score = 0
    try:
        if self.qtype == 'q':
            # Plain FIFO queue
            url = self.__Queue.pop()
        elif self.qtype == 'p':
            # Priority queue: pop returns the member together with its score
            url, score = self.__Queue.pop(withscores=True)
    except Exception as error:
        log.info('redisScheduler.RedisScheduler.popUrl ERROR(reason: %s)', error)
    if url is None:
        # Signal an empty queue to the caller (see Spider.__run)
        return None, None
    return pUrl(url, score), score
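# Standalone sketch (assumption: for qtype 'q' the wrapped __Queue is a
# redis list, for 'p' a redis sorted set). It shows the two underlying
# redis operations popUrl() distinguishes; the key names are hypothetical.
import redis

r = redis.Redis()

def pop_fifo(key='spider:queue'):
    # FIFO queue backed by a redis list
    return r.lpop(key)

def pop_priority(key='spider:pqueue'):
    # Priority queue backed by a sorted set: lowest score pops first
    popped = r.zpopmin(key)          # [(member, score)] or []
    return popped[0] if popped else (None, 0)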
def parserLinks(self):
    links = []
    seen = set()
    try:
        # Collect every <a> tag that carries an href attribute
        tagsOfa = self.parserDoc.select('a')
        for info in tagsOfa:
            if 'href' not in info.attrs:
                continue
            url, sign = self.__fixUrl(info.attrs['href'])
            # sign marks whether the href could be fixed into a usable URL
            if sign and url not in seen:
                seen.add(url)
                links.append(pUrl(url))
    except Exception as error:
        log.error('parser htmlDoc(%r) ERROR:%r', self.pUrl.getUrl, error)
    return links, self
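# Standalone sketch of the same link-extraction idea using BeautifulSoup
# directly (assumption: parserDoc above is a BeautifulSoup tree, which is
# what the select('a') call suggests).
from bs4 import BeautifulSoup

def extract_links(html):
    """Collect unique href values from all <a> tags, preserving order."""
    seen = set()
    links = []
    for tag in BeautifulSoup(html, 'html.parser').select('a'):
        href = tag.attrs.get('href')
        if href and href not in seen:
            seen.add(href)
            links.append(href)
    return links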
def done(self):
    pass

def getSize(self):
    """Get the queue size from the redis server."""
    return self.__Queue.getSize()

def isEmpty(self):
    """Check whether the redis queue is empty."""
    return self.__Queue.getSize() == 0

def join(self):
    pass

if __name__ == '__main__':
    r = redisScheduler()
    r.pushUrl(pUrl('www'), 0)
    # popUrl() returns (pUrl, score); unpack before reading the URL
    url, _ = r.popUrl()
    print(url.getUrl)
def setNewUrls(self, newUrls):
    self.newUrls = newUrls
    return self

def getItems(self):
    return self.items

def setItems(self, key, value):
    try:
        self.items[key] = value
    except Exception:
        # Ignore items that cannot be stored (e.g. unhashable keys)
        pass

def getParserDoc(self):
    return self.parserDoc

def clear(self):
    # Release everything attached to this document after it has been saved
    self.items = {}
    self.newUrls = []
    self.parserDoc = None

if __name__ == '__main__':
    import requests
    url = "http://news.nwsuaf.edu.cn/yxxw/75964.htm"
    htmlDoc = requests.get(url).text
    # parserLinks() returns (links, document); iterate only the links
    links, _ = Document(pUrl(url), htmlDoc).parserLinks()
    for URL in links:
        print(URL.getUrl)
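# Sketch (assumption, not from the source): a minimal page processor
# matching the pageProcessor.pageParser(document) call in Spider.__run.
# It relies only on getParserDoc()/setItems() shown above and assumes the
# parsed tree is BeautifulSoup-like.
class TitleProcessor(object):
    def pageParser(self, document):
        doc = document.getParserDoc()
        title = doc.title.string if doc is not None and doc.title else None
        document.setItems('title', title)
        return document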
# coding: utf-8
"""Synchronous downloader."""
from core.downloader import Downloader
from core.downloader.Net import net
from parserDoc.parserDoc import Document
from defines.pUrl import pUrl


class SyncDownloader(Downloader):
    def __init__(self):
        super(SyncDownloader, self).__init__()

    def download(self, purl):
        # Fetch the raw HTML; wrap it in a Document only on success
        htmlDoc = net.getInstance().get(purl.getUrl)
        document = None
        if htmlDoc is not None:
            document = Document(purl, htmlDoc)
        return document


if __name__ == '__main__':
    print(SyncDownloader().download(pUrl('http://www.baidu.com')))
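# Standalone sketch (not from the source): the same Downloader contract
# implemented directly on top of requests, for comparison with the
# net.getInstance() singleton used above.
import requests

class RequestsDownloader(Downloader):
    def download(self, purl):
        try:
            resp = requests.get(purl.getUrl, timeout=10)
            resp.raise_for_status()
        except requests.RequestException:
            # Treat any network or HTTP error as a failed download
            return None
        return Document(purl, resp.text)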