def parse_zero(self,response):
		# Process a first-level page (level 0): read the per-request settings from
		# response.meta and work out max_pages for this level.
		# Original note: this is the function that actually processes the first-level
		# page; the code above it is only a dispatcher that decides which parameters
		# each level needs and passes them on separately.
		# Convention: accepts a Segement parameter or an xpath parameter. The former means
		# the listing is paginated: the links produced by pagination are submitted to the
		# bound handler function and then passed on to the normal next level.
		# Per-request settings carried in response.meta; each one is None when absent.
		Index_Url = response.meta.get('Index_Url',None)
		Segement = response.meta.get('Segement',None)
		First = response.meta.get('First',None)
		Second = response.meta.get('Second',None)
		Third = response.meta.get('Third',None)
		Final_Xpath = response.meta.get('Final_Xpath',None)
		# Defaults that stay in effect unless the branches below overwrite them.
		max_pages = 1
		urls = ""
		level = 0
		is_sege = 0

		# Decide where to go next: if Segement contains 'Max_Page', this function must hand
		# off to the corresponding pagination handler segment_xxx, which then returns the
		# urls to parse_next; otherwise it goes straight to the normal parse_next.
		# NOTE(review): Segement defaults to None above, in which case .has_key raises
		# AttributeError -- confirm that callers always put 'Segement' into meta.
		if Segement.has_key('Max_Page'):
				if not Segement['Max_Page'].has_key('json'):
						# Non-JSON case: join the text extracted by the configured xpath and
						# take the match of the configured regex as the page count.
						try:
								max_pages = re.search(Segement['Max_Page']['re'],''.join(response.xpath(Segement['Max_Page']['xpath']).extract())).group()
						# A failed extraction/match is only printed; max_pages then keeps its default of 1.
						except Exception,e:
								print Exception,":",e
						# Python 2: turn a unicode match into a UTF-8 byte string before int().
						if isinstance(max_pages,unicode):
								max_pages = max_pages.encode('utf-8')
						# T_P_C is defined elsewhere; presumably it adjusts the page count for
						# this spider and level -- TODO confirm.
						if isinstance(max_pages,int) or isinstance(max_pages,str):
								max_pages = T_P_C(self.name,int(max_pages),level)
						# NOTE(review): max_pages is an int (default) or a str at this point on every
						# path visible here, so this raise looks unreachable -- confirm the intent.
						else:
								raise ValueError("parse_zero: ERROR,in the splashing,can not find the Max_page,please check!!!")
						# U_G is defined elsewhere; its result is unpacked as a (urls, start_url) pair.
						urls,start_url = U_G(Index_Url,self.name,level)
				# If a 'json' key is present, read the page count from the JSON response instead.
				else:
						res_json = json.loads(response.body_as_unicode())
						depth = 0
						# A list 'index' is used as a path of keys: descend one key per step.
						if isinstance(Segement['Max_Page']['index'],list):
								try:
									while depth < len(Segement['Max_Page']['index']):
										res_json = res_json.get(Segement['Max_Page']['index'][depth])
										depth += 1
								# Failures during the descent are only printed.
								except Exception,e:
									print Exception,":",e 
								# NOTE(review): int(res_json) sits outside the try, so a failed descent
								# (e.g. res_json is None) raises here instead of being reported above.
								max_pages = T_P_C(self.name,int(res_json),level)
						else:
示例#2
0
	def parse_third(self,response):
		# Process a third-level page (level 3): read the 'Third' settings from
		# self.detail and work out max_pages for this level.
		# Convention: accepts a Segement parameter or an xpath parameter. The former means
		# the listing is paginated: the links produced by pagination are submitted to the
		# bound handler function and then passed on to the normal next level.
		Index_Url = response.meta.get('Index_Url',None)
		# Falsy settings (e.g. an empty dict) are replaced by None.
		Third = self.detail['Third'] if self.detail['Third'] else None
		# Defaults that stay in effect unless the branches below overwrite them.
		max_pages = 1
		urls = ""
		level = 3
		is_sege = 0

		# Decide where to go next: if the settings contain 'Max_Page', this function must hand
		# off to the corresponding pagination handler segment_xxx, which then returns the
		# urls to parse_next; otherwise it goes straight to the normal parse_next.
		# NOTE(review): Third can be None (see above), in which case .has_key raises
		# AttributeError -- confirm that self.detail['Third'] is always a non-empty dict here.
		if Third.has_key('Max_Page'):
			if not Third['Max_Page'].has_key('json'):
				# Non-JSON case: join the text extracted by the configured xpath and
				# take the match of the configured regex as the page count.
				try:
					max_pages = re.search(Third['Max_Page']['re'],''.join(response.xpath(Third['Max_Page']['xpath']).extract())).group()
				# A failed extraction/match is only printed; max_pages then keeps its default of 1.
				except Exception,e:
					print Exception,":",e
				# Python 2: turn a unicode match into a UTF-8 byte string before int().
				if isinstance(max_pages,unicode):
					max_pages = max_pages.encode('utf-8')
				# T_P_C is defined elsewhere; presumably it adjusts the page count for
				# this spider and level -- TODO confirm.
				if isinstance(max_pages,int) or isinstance(max_pages,str):
					max_pages = T_P_C(self.name,int(max_pages),level)
				# NOTE(review): max_pages is an int (default) or a str at this point on every
				# path visible here, so this raise looks unreachable -- confirm the intent.
				else:
					raise ValueError("parse_Third: ERROR 1,in the splashing parse,can not find the Max_page,please check!!!")
				# U_G is defined elsewhere; its result is unpacked as a (urls, start_url) pair.
				urls,start_url = U_G(Index_Url,self.name,level)
				# If a 'json' key is present, read the page count from the JSON response instead.
			else:
				res_json = json.loads(response.body_as_unicode())
				depth = 0
				# A list 'index' is used as a path of keys: descend one key per step.
				if isinstance(Third['Max_Page']['index'],list):
					try:
						while depth < len(Third['Max_Page']['index']):
							res_json = res_json.get(Third['Max_Page']['index'][depth])
							depth += 1
					# Failures during the descent are only printed.
					except Exception,e:
						print Exception,":",e 
					# NOTE(review): int(res_json) sits outside the try, so a failed descent
					# (e.g. res_json is None) raises here instead of being reported above.
					max_pages = T_P_C(self.name,int(res_json),level)
				# An int 'index' is passed to T_P_C directly as the page count.
				elif isinstance(Third['Max_Page']['index'],int):
					max_pages = T_P_C(self.name,Third['Max_Page']['index'],level)
示例#3
0
				# JSON case: parse the response body and read the page count from it.
				res_json = json.loads(response.body_as_unicode())
				depth = 0
				# A list 'index' is used as a path of keys: descend one key per step.
				if isinstance(Zero['Max_Page']['index'],list):
					try:
						while depth < len(Zero['Max_Page']['index']):
							res_json = res_json.get(Zero['Max_Page']['index'][depth])
							depth += 1
					# Failures during the descent are only printed.
					except Exception,e:
						print Exception,":",e 
					# NOTE(review): int(res_json) sits outside the try, so a failed descent
					# (e.g. res_json is None) raises here instead of being reported above.
					max_pages = T_P_C(self.name,int(res_json),level)
				# An int 'index' is passed to T_P_C directly as the page count.
				elif isinstance(Zero['Max_Page']['index'],int):
					max_pages = T_P_C(self.name,Zero['Max_Page']['index'],level)
				else:
					raise ValueError("parse_zero: ERROR 1 ,in the json parse,can not find the Max_page,please check!!!")
				# Original note: make U_G the same kind of function as R_2_A; it mainly deals with pagination.
				# U_G is defined elsewhere; its result is unpacked as a (urls, start_url) pair.
				urls,start_url = U_G(Index_Url,self.name,level)
			# If the site does not tell you how many pages there are at all, the value has to
			# be supplied manually, via the function below.
			# T_P_B is defined elsewhere; presumably it applies that manual value -- TODO confirm.
			max_pages = T_P_B(self.name,max_pages,level)	
			print "最大页数是:%d"%max_pages
			# Once the page count is known, bind the pagination handler function (bind it if
			# a 'segement' parameter exists; otherwise go directly to the next level).
			# Of course, going to the next level only means the pages obtained from this level's
			# pagination need no further processing here; whether rendering is needed still has to be decided.
			if Zero['Max_Page'].has_key('segement'):
				if Zero['Max_Page'].has_key('splash'):
					# Use the trailing digits of start_url as the first page number; on failure
					# the error is printed and begin stays 0.
					begin = 0
					try:
						begin = re.search('\d+$',start_url).group()
					except Exception,e:
						print Exception,":",e,".parse_zero: ERROR 1-2,can not find the start page number in the splash page,please check!!!"
					# begin may be a string (regex match), hence int(); the range includes max_pages.
					for i in range(int(begin),max_pages+1):
						# T_T_P is defined elsewhere; presumably it maps the loop counter to the
						# page value the site expects -- TODO confirm.
						i = T_T_P(i,self.name,level)
						# urls is used as a str.format template with a 'page' field.
						url = urls.format(page=str(i))