예제 #1
0
	def parse_third(self,response):
		"""Extract links and optional extra fields from ``response``.

		Configuration is read from ``response.meta``:

		* ``Index_Url`` -- required (a missing key raises ``KeyError``); passed
		  to ``Relative_to_Absolute`` together with the extracted links and
		  ``self.name``.
		* ``Target_Detail_Page`` -- mapping whose ``'xpath'`` entry selects the
		  links, and whose optional ``'Some_Info'`` entry maps a field name to
		  the XPath that field is read from.
		* ``Final_Xpath`` -- read here; not used in the visible part of the method.

		Extraction of each ``Some_Info`` field is best-effort: a field whose
		XPath matches nothing (or fails for any other reason) is reported on
		stdout and left out of ``Some_Info``.
		"""
		Index_Url = response.meta['Index_Url']
		# NOTE(review): Target_Detail_Page is fetched with a None default but is
		# subscripted immediately below, so it is effectively required as well.
		Target_Detail_Page = response.meta.get('Target_Detail_Page')
		Final_Xpath = response.meta.get('Final_Xpath')
		detail_url = Relative_to_Absolute(
			Index_Url,
			response.xpath(Target_Detail_Page['xpath']).extract(),
			self.name,
		)
		Some_Info = {}
		if 'Some_Info' in Target_Detail_Page:
			for key, field_xpath in Target_Detail_Page['Some_Info'].items():
				try:
					# Only the first match is kept; no match raises IndexError.
					Some_Info[key] = response.xpath(field_xpath).extract()[0]
				except Exception as e:
					# Single-argument print() behaves the same on Python 2 and 3;
					# report the concrete exception type, not the base class.
					print("%s : %s" % (type(e).__name__, e))
예제 #2
0
	def parse_json2(self,response):
		"""Collect values from a JSON response by following a configured key path.

		``response.meta['All_Detail_Page']['index']`` is the key path. Every key
		except the last is followed down through nested objects with ``.get``;
		the last key is then read from each element of the collection reached
		that way. The collected values are handed to ``Relative_to_Absolute``
		together with ``Index_Url`` and ``self.name``. If that call raises, the
		error is printed and the unconverted values are kept in ``detail_url``.
		"""
		Index_Url = response.meta.get('Index_Url')
		All_Detail_Page = response.meta.get('All_Detail_Page')
		# NOTE(review): the next three values are read but not used in the
		# visible part of this method.
		Signal_Detail_Page = response.meta.get('Signal_Detail_Page')
		Target_Detail_Page = response.meta.get('Target_Detail_Page')
		Final_Xpath = response.meta.get('Final_Xpath')
		res_json = json.loads(response.body_as_unicode())
		# assumes 'index' is a list of keys -- TODO confirm against the config
		index_path = All_Detail_Page['index']
		# Descend to the innermost container. Stop one key short of the end,
		# because the last key is read from every item of that container.
		for key in index_path[:-1]:
			res_json = res_json.get(key)
		detail_url = [item.get(index_path[-1]) for item in res_json]
		try:
			detail_url = Relative_to_Absolute(Index_Url,detail_url,self.name)
		except Exception as e:
			# Single-argument print() behaves the same on Python 2 and 3;
			# report the concrete exception type, not the base class.
			print("%s : %s" % (type(e).__name__, e))
예제 #3
0
		Signal_Detail_Page = response.meta.get('Signal_Detail_Page',None)
		Target_Detail_Page = response.meta.get('Target_Detail_Page',None)
		Final_Xpath = response.meta.get('Final_Xpath',None)
		Some_Info = {}
		if 'Some_Info' in All_Detail_Page.keys():
				keys = All_Detail_Page['Some_Info'].keys()
				for key in keys:
						try:
								Some_Info[key] = response.xpath(All_Detail_Page['Some_Info'][key]).extract()[0]
						except Exception,e:
								print Exception,":",e
		#一个页面可能会需要多个提取的xpath,这里就指定为一个list了
		detail_url = []
		
		for xpath in All_Detail_Page['xpath']:
				for url in Relative_to_Absolute(Index_Url,response.xpath(xpath).extract(),self.name):
						detail_url.append(url)
		#在考虑在每一层加一个判断,相当于如果没有(第一个)要传递给下一层的数据,就直接传递给final_parse(注:在传递给final_parse时需要判断是否需要渲染,这里我暂时先默认都渲染,但是之后可以考虑在config.json的Final_Xpath加一个flag,1表示需要渲染,0表示不需要)
		if Signal_Detail_Page is None:
				for url in detail_url:
						request = Request(url,callback = self.parse_final,dont_filter=True,meta={
											'splash':{
											'endpoint':'render.html',
											'args':{
													#只有aiyiyi需要load 10s,才能拿到播放量
													'wait':0.5,
													'images':0,
													'render_all':1
													}
											}
									})