Пример #1
0
class netTvSpider(scrapy.Spider):
	# Config-driven spider for Tencent video listings (entries keyed by
	# `name` in config.json).
	name ='tencent_sp'
	# Bug fix: Scrapy expects `allowed_domains` (plural); the original
	# misspelled `allowed_domain` attribute was silently ignored.  An empty
	# list imposes no off-site restriction, so crawl behavior is unchanged.
	allowed_domains = []
		
	def __init__(self,*args,**kwargs):
		"""Initialize spider state and precompute the one-month-ago cutoff.

		Bug fix: the original passed ``tm_mon - 1`` straight to
		``datetime.datetime``, which raised ValueError every January
		(month 0) and for days absent from the previous month
		(e.g. March 31 -> February 31).  The month now wraps to December
		of the previous year and the day is clamped to the target
		month's length.
		"""
		import calendar  # local import: only needed for the day clamp below
		super(netTvSpider,self).__init__(*args,**kwargs)
		self.now = time.time()
		lt = time.localtime(self.now)
		# Roll the month back by one, wrapping January -> previous December.
		if lt.tm_mon == 1:
			year, month = lt.tm_year - 1, 12
		else:
			year, month = lt.tm_year, lt.tm_mon - 1
		# Clamp the day so e.g. March 31 maps to the last day of February.
		day = min(lt.tm_mday, calendar.monthrange(year, month)[1])
		self.one_month_ago = datetime.datetime(year, month, day)
		self.config = []
		self.Index_Url = ""
			
	
	def start_requests(self):
		"""Read config.json and yield one request per configured index URL.

		Each config entry for this spider is a list of 2-5 dicts: the
		first holds Index_Url / Is_Json / Max_Page, the last holds
		Final_Xpath, and any middle dicts hold the optional detail-page
		settings (All_Detail_Page / Signal_Detail_Page /
		Target_Detail_Page).  The four near-identical branches of the
		original (one per list length) are collapsed into one generic
		loop that forwards whichever settings are present via meta.
		"""
		# `with` already closes the file; the original's explicit
		# f.close() inside the block was redundant.
		with open('config.json','r') as f:
			data = json.load(f)
		for entry in data.items():
			# Py2: JSON keys are unicode, so encode before comparing
			# to the (byte-string) spider name.
			if entry[0].encode('utf-8') == self.name:
				self.config.append(entry)

		# Meta keys that appear only in the longer config entries.
		optional_keys = ('All_Detail_Page', 'Signal_Detail_Page', 'Target_Detail_Page')

		for v in self.config:
			parts = v[1]
			# The original only handled entries of 2-5 dicts; keep that.
			if not 2 <= len(parts) <= 5:
				continue
			# Merge the positional dicts into one settings mapping
			# (later dicts win, matching the original positional reads).
			settings = {}
			for part in parts:
				settings.update(part)

			# Kept as an instance attribute for backward compatibility
			# with any code that reads self.Index_Url.
			self.Index_Url = settings['Index_Url']
			meta_common = {
				'Max_Page': settings['Max_Page'],
				'Final_Xpath': settings['Final_Xpath']
			}
			for opt in optional_keys:
				if opt in settings:
					meta_common[opt] = settings[opt]

			for url in self.Index_Url:
				if settings['Is_Json'] == 1:
					# JSON indexes need no Splash rendering.
					request = Request(url, callback=self.parse_json)
				else:
					# NOTE(review): the original passed dont_filter=True
					# only for 4-dict entries; it is applied uniformly
					# here, matching Scrapy's own default for start
					# requests -- confirm no duplicate index URLs exist.
					splash_meta = {
						'splash': {
							'endpoint': 'render.html',
							'args': {
								'wait': 0.5,
								'images': 0,
								'render_all': 1
							}
						}
					}
					request = Request(url, callback=self.parse_splash, dont_filter=True, meta=splash_meta)
				request.meta['Index_Url'] = url
				request.meta.update(meta_common)
				yield request
				

	def parse_splash(self,response):
		"""Parse a Splash-rendered index page: recover the crawl settings
		from ``response.meta`` and work out the total page count.

		NOTE(review): this method appears truncated in the current view;
		only the visible page-count setup is documented here.
		"""
		# Accept every meta key whether or not it was actually set; the
		# checks happen at use time -- a missing (None) key means control
		# should go straight to the final parse step.
		Index_Url = response.meta.get('Index_Url',None)
		Max_Page = response.meta.get('Max_Page',None)
		All_Detail_Page = response.meta.get('All_Detail_Page',None)
		Signal_Detail_Page = response.meta.get('Signal_Detail_Page',None)
		Target_Detail_Page = response.meta.get('Target_Detail_Page',None)
		Final_Xpath = response.meta.get('Final_Xpath',None)
		# Fallback page count if extraction below fails.
		max_pages = 2
		try:
				# Max_Page is a dict with an 'xpath' locating the pager text
				# and a 're' pattern extracting the page number from it.
				max_pages = re.search(Max_Page['re'],''.join(response.xpath(Max_Page['xpath']).extract())).group()
		except Exception,e:
				print Exception,":",e
		# Replaces the trailing \d+ of the index URL with a page
		# placeholder; extend get_HeadUrl() if other URL shapes appear.
		urls = get_HeadUrl(Index_Url,self.name)
		try:
				# Per-site adjustment of the raw page count.
				max_pages = Total_page_circulate(self.name,int(max_pages))
		except Exception,e:
				print Exception,":",e
Пример #2
0
		# NOTE(review): fragment of a JSON-index callback (likely
		# parse_json) -- Index_Url / Max_Page / All_Detail_Page are bound
		# earlier in the method, outside the visible lines.
		Signal_Detail_Page = response.meta.get('Signal_Detail_Page',None)
		Target_Detail_Page = response.meta.get('Target_Detail_Page',None)
		Final_Xpath = response.meta.get('Final_Xpath',None)
		res_json = json.loads(response.body_as_unicode())
		
		# Walk the nested JSON along the key path Max_Page['index'] to
		# reach the value holding the total page count.
		depth = 0
		try:
				while depth < len(Max_Page['index']):
						res_json = res_json.get(Max_Page['index'][depth])
						depth += 1
		except Exception,e:
				print Exception,":",e
		# Build the paged-URL template from the index URL.
		urls = get_HeadUrl(Index_Url,self.name)	
		
		print "now the res_json is %s"%res_json
		# Per-site adjustment of the raw page count.
		max_pages = Total_page_circulate(self.name,int(res_json))
		print "最大页数是:%d"%max_pages
		# No detail-page config: request every listing page directly.
		if All_Detail_Page is None:
				for i in range(1,max_pages+1):
						# Map the logical page number to the site's real one.
						i = Turn_True_Page(i,self.name)
						url = urls.format(page=str(i))
						request = Request(url,callback = self.parse_final,dont_filter=True,meta={
											'splash':{
											'endpoint':'render.html',
											'args':{
													'wait':0.5,
													'images':0,
													'render_all':1
													}
											}
								})