class Strategy(object):
    """Crawling strategy.

    Drives one crawl: pulls URLs off a work queue, routes each URL to a
    handler (`enter` / `need` / `auto`) via `distributor`, de-duplicates with
    a Bloom filter, and adaptively throttles request rate through a
    SleepTime "blocking" model when failures occur.

    NOTE(review): Python 2 only (`print` statements, `unicode`, `xrange`,
    `dict.iteritems`, the `Queue` module). Paths are built with hard-coded
    backslashes ('\\logs\\', etc.), so this runs on Windows only — consider
    os.path.join for portability (TODO confirm before changing).
    """

    def __init__(self, name, path, switch=False, solr=None):
        # name   : strategy/site name; used for log dir, logger name, Routor.
        # path   : base working directory; logs go under path + '\logs\<name>'.
        # switch : when True, enables the adaptive sleep-time "learning" mode
        #          and the retry-via-failQueue behavior in link_and_check.
        # solr   : passed through to the two `deal` processors — presumably a
        #          Solr connection/handle; verify against `deal`'s signature.
        self.name = name
        self.switch = switch
        # NOTE(review): '\l' is not an escape sequence in Python 2, so
        # '\logs\\' is literally backslash + 'logs' + backslash — Windows-only.
        self.path = path + '\logs\\' + self.name
        if not os.path.exists(path + '\logs'):
            os.mkdir(path + '\logs')
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self.routor = Routor(name, path)  # URL pattern router (project class)
        self.queue = Queue.Queue(maxsize=0)      # main work queue (unbounded)
        self.failQueue = Queue.Queue(maxsize=0)  # failure queue (unbounded)
        self.logger = self.newLogging(name)
        self.bloomfilter = BloomFilter()  # URL de-duplication
        self.count = 0                    # number of URLs successfully routed
        # Seed the crawl with the first route's pattern.
        self.queue.put(self.routor.route[0]['pattern'])
        self.sleeptime = self.routor.route[-1]['sleeptime']
        self.block = SleepTime(self.sleeptime)  # anti-blocking / throttling module
        # Consecutive-failure counter read by self.block.isBlocked().
        # NOTE(review): never incremented in this class — presumably mutated
        # by FailException (which receives `self`); confirm in FailException.
        self.fail = 0
        self.job = deal(name, 'job', path, solr)          # 'job' page processor
        self.company = deal(name, 'company', path, solr)  # 'company' page processor

    def newLogging(self, name):
        """Build and return a DEBUG-level logger writing to both a file under
        self.path and the console.

        NOTE(review): handlers are added unconditionally — calling this twice
        with the same name would duplicate every log line.
        """
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG)
        # Create a handler that writes to a log file.
        fh = logging.FileHandler(self.path + '\\' + name + '.log')
        fh.setLevel(logging.DEBUG)
        # Create another handler that prints to the console.
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        # Define the output format shared by both handlers.
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        # Attach the handlers to the logger.
        logger.addHandler(fh)
        logger.addHandler(ch)
        return logger

    """功能函数包装"""
    # --- Decorator wrapping the handler methods below ------------------------
    def link_and_check(func):
        # Decorator: fetches `url` (args[1]) with requests, injects the body
        # as kw['content'], then calls the wrapped method. On failure, logs,
        # optionally queues the URL for retry, and updates the adaptive
        # sleep time. args[0] is the Strategy instance (`self`).
        # NOTE(review): no functools.wraps — the wrapped method loses its
        # name/docstring.
        def _wrapper(*args, **kw):
            url = unicode(args[1])
            headers = {'Accept': 'text/html;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8,gb2312;q=0.7,*;q=0.3', 'Accept-Encoding': 'gzip', 'Connection': 'close', 'Referer': None}
            headers['User-Agent'] = getHeader()  # rotating UA, presumably — see getHeader
            # NOTE: if fetching still fails, Referer can be set to the target
            # site's host.
            try:
                try:
                    req = requests.get(url, timeout=5, headers=headers)
                except Exception as e:
                    # str(type(e))[8:] strips the "<class '" prefix of the
                    # exception's type repr.
                    raise FailException(args[0], 'bad requests:' + str(type(e))[8:])
                # NOTE(review): `if req:` uses Response truthiness, i.e.
                # status_code < 400 — a 4xx/5xx response falls through
                # silently with no else branch.
                if req:
                    if req.content:
                        kw['content'] = req.content
                        return func(*args, **kw)
                    else:
                        args[0].logger.warning('No Content in URL: %s' % url)
                        raise FailException(args[0], 'No Content in URL')
            except FailException as e:
                args[0].logger.warning('URL: %s | info: %s' % (url, e.info))
                args[0].logger.warning('fail: %s | Stime: %s' % (args[0].fail, args[0].sleeptime))
                if args[0].switch:
                    # Learning mode: keep the failed URL for a later retry.
                    print 'put in failQueue'
                    args[0].failQueue.put(url)
            finally:
                if args[0].switch:
                    # Ask the throttling model for the new sleep time based on
                    # the consecutive-failure count.
                    tim = args[0].block.isBlocked(args[0].fail)
                    if isinstance(tim, tuple):
                        # A tuple means (sleeptime, learning_finished_flag).
                        if tim[1]:
                            # Learning finished: freeze the learned sleep time
                            # into the router and leave learning mode.
                            args[0].switch = False
                            args[0].routor.setST(tim[0])
                        tim = tim[0]
                    args[0].sleeptime = tim
                    print args[0].sleeptime
                # Once stable (no recent failure) or out of learning mode,
                # drain the failure queue back onto the main work queue.
                if args[0].fail == 0 or not args[0].switch:
                    if not args[0].failQueue.empty():
                        for x in range(args[0].failQueue.qsize()):
                            u = args[0].failQueue.get()
                            args[0].queue.put(u)
        return _wrapper

    """功能函数"""
    # --- Handler methods (each wrapped by link_and_check) --------------------
    @link_and_check
    def enter(self, url, **kw):
        # Handle URLs that must be entered to harvest sub-links from a
        # designated area of the page. New (unseen) links go onto the queue.
        text = kw['content']
        area = self.getArea(text, kw['loc'])
        linklist = self.getAllAch(area)
        for link in linklist:
            if not self.bloomfilter.isContain(link):
                self.queue.put(link)
                self.bloomfilter.insert(link)
        self.fail = 0  # page handled successfully: reset the failure counter

    @link_and_check
    def need(self, url, **kw):
        # Handle a target (content) page: run it through the matching `deal`
        # processor and append the result to a local save file.
        text = kw['content']
        if kw['ctg'] == 'job':
            filename = '\job_save.log'
            self.job.txt = text
            self.job.url = url
            forsave = self.job.send('update')[0]
        else:
            filename = '\company_save.log'
            self.company.txt = text
            self.company.url = url
            forsave = self.company.send('update')[0]
        with open(self.path + filename, 'a') as f:
            f.write(forsave)
        self.fail = 0  # page handled successfully: reset the failure counter

    def auto(self, url, **kw):
        # Handle URLs that require the URL generator: expand a template URL
        # over kw['replace'] (either an explicit list, or a [start, end) pair
        # expanded to str(start)..str(end-1)) and dispatch each generated URL.
        self.logger.warning('BEGIN USING ATUO generator!')
        # Put the router into sub-model mode for the generated URLs; restored
        # below. Semantics of submodel live in Routor — TODO confirm.
        self.routor.match(url, submodel=True)
        if len(kw['replace']) == 2:
            replace = [str(n) for n in xrange(kw['replace'][0], kw['replace'][1])]
        else:
            replace = kw['replace']
        for x in replace:
            u = urlGenerator(url, kw['between'], x)
            if not self.bloomfilter.isContain(u):
                self.distributor(u)
                self.bloomfilter.insert(u)
        self.routor.match(url, submodel=False)

    """策略核心"""
    # --- Strategy core -------------------------------------------------------
    def core(self):
        """Main crawl loop: block on the queue and dispatch each URL.

        NOTE(review): with the timed-exit code commented out, `isFinish`
        never becomes True — the loop only ends if queue.get() raises, and
        `t` printed at the end is the (near-zero) time measured before the
        loop, not the real elapsed time.
        """
        ti = time.time()
        isFinish = False      # exit flag
        t = time.time() - ti  # timed-exit bookkeeping
        # try:
        #     while not isFinish:
        #         # size=self.queue.qsize()
        #         # self.logger.info('before get url, Queue size = %s'%size)
        #         url=self.queue.get()
        #         self.distributor(url)
        #         t=time.time()-ti
        #         if t>3600:
        #             isFinish=True
        #             print 'COUNTE = ',self.count
        #             self.logger.info('COUNT = %s'%self.count)
        # except:
        #     print 'FINISH ! In Time:',t
        #     print self.queue.qsize()
        #     self.logger.info('FINISH ! In Time: %s'%t)
        while not isFinish:
            url = self.queue.get()
            self.distributor(url)
            # Exit mechanism, for testing:
            # t=time.time()-ti
            # if t>3600:
            #     isFinish=True
            #     print 'COUNTE = ',self.count
            #     self.logger.info('COUNT = %s'%self.count)
        print 'FINISH ! In Time:', t
        print self.queue.qsize()
        self.logger.info('FINISH ! In Time: %s' % t)

    def distributor(self, url):
        # Dispatch a URL: ask the router which model ('enter'/'need'/'auto')
        # matches it and invoke the corresponding handler.
        afterRoute = self.routor.match(url)
        if afterRoute:
            self.count += 1
            self.logger.info('%s: %s' % (afterRoute['model'], url))
            if afterRoute['model'] == 'enter':
                self.enter(url, **afterRoute['args'])
            elif afterRoute['model'] == 'need':
                self.need(url, **afterRoute['args'])
            elif afterRoute['model'] == 'auto':
                self.auto(url, **afterRoute['args'])
            # time.sleep(self.sleeptime)
        else:
            self.logger.warning('URL: %s is not found in Pattern !' % url)

    """工具方法"""
    # --- Utility methods -----------------------------------------------------
    def getArea(self, text, loc):
        # Extract the text between the marker pairs given in `loc`
        # ({start_marker: end_marker}); return the first non-empty match.
        # Raises FailException (after dumping the page) when nothing matches.
        for k, v in loc.iteritems():
            l = [k, v]
            t = getContent(text, l)
            if t:
                return t
        print text
        raise FailException(self, 'No Area is Done')

    def getAllAch(self, text):
        # Collect links ("anchors") from the given HTML fragment and return
        # them as a list: all <a href> values starting with http://, plus any
        # "http://..." string literals found inside <script> blocks.
        # Raises FailException when the fragment has no <a> links at all.
        # NOTE(review): BeautifulSoup() without an explicit parser — result
        # depends on which parser is installed.
        soup = BeautifulSoup(text)
        linklist = [link.get('href') for link in soup.find_all('a')]
        if len(linklist) == 0:
            raise FailException(self, 'No link in content')
        # Keep only absolute http:// links.
        legallink = []
        for link in linklist:
            link = str(link)
            if re.match(r'http://.*', link):
                legallink.append(link)
        linklist = legallink
        # Also harvest quoted http:// URLs embedded in script blocks.
        for script in soup.find_all('script'):
            scr = str(script)
            r = re.findall(r'"http://.*?"', scr)
            for sc in r:
                if sc:
                    rs = re.search(r'"http://.*?"', sc)
                    if rs:
                        l = rs.group().replace('"', '')
                        linklist.append(l)
        return linklist