Example #1

# Python 2 standard-library imports this class relies on
import os
import re
import time
import Queue
import logging

import requests
from bs4 import BeautifulSoup

# Project-local helpers used below but not shown in this example:
# Routor, BloomFilter, SleepTime, deal, FailException, getHeader,
# getContent, urlGenerator

class Strategy(object):
	"""爬取策略"""
	def __init__(self, name, path, switch=False, solr=None):
		self.name = name
		self.switch = switch
		logdir = os.path.join(path, 'logs')
		self.path = os.path.join(logdir, self.name)
		if not os.path.exists(logdir):
			os.mkdir(logdir)
		if not os.path.exists(self.path):
			os.mkdir(self.path)
		self.routor = Routor(name, path)
		self.queue = Queue.Queue(maxsize=0)
		self.failQueue = Queue.Queue(maxsize=0)  # queue of failed URLs, retried later
		self.logger = self.newLogging(name)
		self.bloomfilter = BloomFilter()
		self.count = 0
		self.queue.put(self.routor.route[0]['pattern'])
		self.sleeptime = self.routor.route[-1]['sleeptime']
		self.block = SleepTime(self.sleeptime)  # throttling module
		self.fail = 0
		self.job = deal(name, 'job', path, solr)
		self.company = deal(name, 'company', path, solr)

	def newLogging(self, name):
		logger = logging.getLogger(name)
		logger.setLevel(logging.DEBUG)
		# handler that writes to a log file
		fh = logging.FileHandler(os.path.join(self.path, name + '.log'))
		fh.setLevel(logging.DEBUG)
		# second handler that echoes to the console
		ch = logging.StreamHandler()
		ch.setLevel(logging.DEBUG)
		# shared output format for both handlers
		formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
		fh.setFormatter(formatter)
		ch.setFormatter(formatter)
		# attach the handlers to the logger
		logger.addHandler(fh)
		logger.addHandler(ch)
		return logger
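	# Caveat: logging.getLogger(name) returns a process-wide shared logger, so
	# calling newLogging twice with the same name attaches duplicate handlers
	# and every record is emitted twice. A minimal guard sketch, keeping the
	# setup above unchanged, would check before attaching:
	#
	#	if not logger.handlers:
	#		logger.addHandler(fh)
	#		logger.addHandler(ch)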
	"""功能函数包装"""
	def link_and_check(func):
		def _wrapper(*args, **kw):
			url = unicode(args[1])
			headers = {
				'Accept': 'text/html;q=0.9,*/*;q=0.8',
				'Accept-Charset': 'ISO-8859-1,utf-8,gb2312;q=0.7,*;q=0.3',
				'Accept-Encoding': 'gzip',
				'Connection': 'close',
				'Referer': None,
			}
			headers['User-Agent'] = getHeader()
			# Note: if fetching still fails, Referer can be set to the host of the target site
			try:
				try:
					req = requests.get(url, timeout=5, headers=headers)
				except Exception as e:
					raise FailException(args[0], 'bad requests:' + str(type(e))[8:])
				if req and req.content:
					kw['content'] = req.content
					return func(*args, **kw)
				else:
					args[0].logger.warning('No Content in URL: %s' % url)
					raise FailException(args[0], 'No Content in URL')
			except FailException as e:
				args[0].logger.warning('URL: %s | info: %s' % (url, e.info))
				args[0].logger.warning('fail: %s | Stime: %s' % (args[0].fail, args[0].sleeptime))
				if args[0].switch:
					print 'put in failQueue'
					args[0].failQueue.put(url)
			finally:
				if args[0].switch:
					tim = args[0].block.isBlocked(args[0].fail)
					if isinstance(tim, tuple):
						if tim[1]:
							# learning phase is done: freeze the sleep time
							args[0].switch = False
							args[0].routor.setST(tim[0])
							tim = tim[0]
					args[0].sleeptime = tim
					print args[0].sleeptime
					if args[0].fail == 0 or not args[0].switch:
						if not args[0].failQueue.empty():
							for x in range(args[0].failQueue.qsize()):
								u = args[0].failQueue.get()
								args[0].queue.put(u)
		return _wrapper
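	# The decorator above is the single download path for the workers below:
	# it fetches args[1] (the URL) with a rotated User-Agent, injects the
	# response body into the wrapped method as kw['content'], routes failures
	# into failQueue, and lets SleepTime adapt self.sleeptime while
	# self.switch is on.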
	"""功能函数"""
	@link_and_check
	def enter(self,url,**kw):
		# handle URLs whose pages should be entered to collect sub-links from a given region
		text=kw['content']
		area=self.getArea(text,kw['loc'])
		linklist=self.getAllAch(area)
		for link in linklist:
			if not self.bloomfilter.isContain(link):
				self.queue.put(link)
				self.bloomfilter.insert(link)
		self.fail=0

	@link_and_check
	def need(self, url, **kw):
		# handle target pages: take their text and save it locally
		text = kw['content']
		if kw['ctg'] == 'job':
			filename = 'job_save.log'
			self.job.txt = text
			self.job.url = url
			forsave = self.job.send('update')[0]
		else:
			filename = 'company_save.log'
			self.company.txt = text
			self.company.url = url
			forsave = self.company.send('update')[0]
		with open(os.path.join(self.path, filename), 'a') as f:
			f.write(forsave)

		self.fail=0


	def auto(self, url, **kw):
		# handle URLs that need generated variants (pagination and the like)
		self.logger.warning('BEGIN USING AUTO generator!')
		self.routor.match(url, submodel=True)
		if len(kw['replace']) == 2:
			replace = [str(n) for n in xrange(kw['replace'][0], kw['replace'][1])]
		else:
			replace = kw['replace']

		for x in replace:
			u=urlGenerator(url,kw['between'],x)
			if not self.bloomfilter.isContain(u):
				self.distributor(u)
				self.bloomfilter.insert(u)
		self.routor.match(url,submodel=False)
				
	"""策略核心"""
	def core(self):
		ti=time.time()
		isFinish=False  # exit flag
		t=time.time()-ti  # elapsed time, for the timed exit below

		while not isFinish:
			url=self.queue.get()
			self.distributor(url)

			# timed-exit mechanism, used for testing
			# t=time.time()-ti
			# if t>3600:
			# 	isFinish=True
			# 	print 'COUNT = ',self.count
			# 	self.logger.info('COUNT = %s'%self.count)

		print 'FINISH ! In Time:',t
		print self.queue.qsize()
		self.logger.info('FINISH ! In Time: %s'%t)

	def distributor(self,url):
		# dispatch a URL to the handler its route pattern selects
		afterRoute=self.routor.match(url)
		if afterRoute:
			self.count+=1
			self.logger.info('%s: %s'%(afterRoute['model'],url))
			if afterRoute['model']=='enter':
				self.enter(url,**afterRoute['args'])
			elif afterRoute['model']=='need':
				self.need(url,**afterRoute['args'])
			elif afterRoute['model']=='auto':
				self.auto(url,**afterRoute['args'])
			# time.sleep(self.sleeptime)
		else:
			self.logger.warning('URL: %s is not found in Pattern !'%url)

	"""工具方法"""
	def getArea(self,text,loc):
		# extract the text between the given begin/end markers
		for k,v in loc.iteritems():
			l=[k,v]
			t=getContent(text,l)
			if t:
				return t
		print text
		raise FailException(self,'No Area is Done')

	def getAllAch(self, text):
		# collect the links in the given text, filter them, and return a list
		soup = BeautifulSoup(text, 'html.parser')
		linklist = [link.get('href') for link in soup.find_all('a')]
		if len(linklist) == 0:
			raise FailException(self, 'No link in content')
		legallink = []
		for link in linklist:
			link = str(link)
			if re.match(r'http://.*', link):
				legallink.append(link)
		linklist = legallink
		# also pick up absolute URLs embedded in inline scripts
		for script in soup.find_all('script'):
			scr = str(script)
			for sc in re.findall(r'"http://.*?"', scr):
				linklist.append(sc.replace('"', ''))
		return linklist
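
# A minimal usage sketch, not part of the original example: the site name
# 'demo_site' and the base path are hypothetical placeholders, and Routor is
# assumed to find its route/pattern config for that name under the given path.
if __name__ == '__main__':
	s = Strategy('demo_site', os.getcwd(), switch=True)
	# core() blocks: it pulls URLs from s.queue (seeded in __init__ with the
	# first route pattern) and dispatches each one through distributor().
	s.core()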