import json
import re
import time

import scrapy
from scrapy import Request


class tvSpider(scrapy.Spider):
    name = 'mangguo_tv'
    allowed_domains = []

    def __init__(self, *args, **kwargs):
        super(tvSpider, self).__init__(*args, **kwargs)
        self.now = time.time()
        self.config = []
        self.Index_Url = ""

    def start_requests(self):
        # Load config.json and keep only the entries that belong to this
        # spider. (A sample of the expected config shape is sketched after
        # this class.)
        with open('config.json', 'r') as f:
            data = json.load(f)
        for i in data.items():
            if i[0] == self.name:
                self.config.append(i)

        # Each entry's value is a list of 2-5 dicts; the list length decides
        # how many detail-page layers sit between the index page and the
        # final XPath extraction.
        for v in self.config:
            if len(v[1]) == 2:
                self.Index_Url = v[1][0]['Index_Url']
                Is_Json = v[1][0]['Is_Json']
                Max_Page = v[1][0]['Max_Page']
                Final_Xpath = v[1][1]['Final_Xpath']
                if Is_Json == 1:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_json)
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request
                else:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_splash, meta={
                            'splash': {
                                'endpoint': 'render.html',
                                'args': {
                                    'wait': 0.5,
                                    'images': 0,
                                    'render_all': 1,
                                },
                            },
                        })
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request

            if len(v[1]) == 3:
                self.Index_Url = v[1][0]['Index_Url']
                Is_Json = v[1][0]['Is_Json']
                Max_Page = v[1][0]['Max_Page']
                All_Detail_Page = v[1][1]['All_Detail_Page']
                Final_Xpath = v[1][2]['Final_Xpath']
                if Is_Json == 1:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_json)
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['All_Detail_Page'] = All_Detail_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request
                else:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_splash, meta={
                            'splash': {
                                'endpoint': 'render.html',
                                'args': {
                                    'wait': 0.5,
                                    'images': 0,
                                    'render_all': 1,
                                },
                            },
                        })
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['All_Detail_Page'] = All_Detail_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request

            if len(v[1]) == 4:
                self.Index_Url = v[1][0]['Index_Url']
                Is_Json = v[1][0]['Is_Json']
                Max_Page = v[1][0]['Max_Page']
                All_Detail_Page = v[1][1]['All_Detail_Page']
                Signal_Detail_Page = v[1][2]['Signal_Detail_Page']
                Final_Xpath = v[1][3]['Final_Xpath']
                if Is_Json == 1:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_json)
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['All_Detail_Page'] = All_Detail_Page
                        request.meta['Signal_Detail_Page'] = Signal_Detail_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request
                else:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_splash, dont_filter=True, meta={
                            'splash': {
                                'endpoint': 'render.html',
                                'args': {
                                    'wait': 0.5,
                                    'images': 0,
                                    'render_all': 1,
                                },
                            },
                        })
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['All_Detail_Page'] = All_Detail_Page
                        request.meta['Signal_Detail_Page'] = Signal_Detail_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request

            if len(v[1]) == 5:
                self.Index_Url = v[1][0]['Index_Url']
                Is_Json = v[1][0]['Is_Json']
                Max_Page = v[1][0]['Max_Page']
                All_Detail_Page = v[1][1]['All_Detail_Page']
                Signal_Detail_Page = v[1][2]['Signal_Detail_Page']
                Target_Detail_Page = v[1][3]['Target_Detail_Page']
                Final_Xpath = v[1][4]['Final_Xpath']
                if Is_Json == 1:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_json)
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['All_Detail_Page'] = All_Detail_Page
                        request.meta['Signal_Detail_Page'] = Signal_Detail_Page
                        request.meta['Target_Detail_Page'] = Target_Detail_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request
                else:
                    for url in self.Index_Url:
                        request = Request(url, callback=self.parse_splash, meta={
                            'splash': {
                                'endpoint': 'render.html',
                                'args': {
                                    'wait': 0.5,
                                    'images': 0,
                                    'render_all': 1,
                                },
                            },
                        })
                        request.meta['Index_Url'] = url
                        request.meta['Max_Page'] = Max_Page
                        request.meta['All_Detail_Page'] = All_Detail_Page
                        request.meta['Signal_Detail_Page'] = Signal_Detail_Page
                        request.meta['Target_Detail_Page'] = Target_Detail_Page
                        request.meta['Final_Xpath'] = Final_Xpath
                        yield request

    def parse_splash(self, response):
        # Accept every meta key whether or not it was set; each one is checked
        # at use time, and a missing key means the response should go straight
        # to parse_final.
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)

        # Pull the total page count out of the rendered page, falling back to
        # a default of 2 if the pattern does not match.
        max_pages = 2
        try:
            max_pages = re.search(
                Max_Page['re'],
                ''.join(response.xpath(Max_Page['xpath']).extract())
            ).group()
        except Exception as e:
            print("Exception:", e)

        # get_HeadUrl() replaces the trailing \d+ of the index URL with a page
        # placeholder; extend it when a site uses a different URL pattern.
        # (A sketch of this helper follows the class.)
        urls = get_HeadUrl(Index_Url, self.name)
        try:
            max_pages = Total_page_circulate(self.name, int(max_pages))
        except Exception as e:
            print("Exception:", e)
    def parse_json(self, response):
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)

        # Walk down the key path in Max_Page['index'] to reach the field of
        # the JSON response that holds the total page count.
        res_json = json.loads(response.text)
        depth = 0
        try:
            while depth < len(Max_Page['index']):
                res_json = res_json.get(Max_Page['index'][depth])
                depth += 1
        except Exception as e:
            print("Exception:", e)

        urls = get_HeadUrl(Index_Url, self.name)
        print("now the res_json is %s" % res_json)
        max_pages = Total_page_circulate(self.name, int(res_json))
        print("max page count: %d" % max_pages)

        if All_Detail_Page is None:
            # No detail-page layer configured: paginate the index and hand
            # every page straight to parse_final.
            for i in range(1, max_pages + 1):
                i = Turn_True_Page(i, self.name)
                url = urls.format(page=str(i))
                request = Request(url, callback=self.parse_final, dont_filter=True, meta={
                    'splash': {
                        'endpoint': 'render.html',
                        'args': {
                            'wait': 0.5,
                            'images': 0,
                            'render_all': 1,
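
# A minimal sketch of the config.json shape start_requests() expects, inferred
# only from the key lookups above. Each entry is keyed by a spider name; its
# value is a list of 2-5 dicts: the first always carries Index_Url / Is_Json /
# Max_Page, the optional middle ones add detail-page layers (All_Detail_Page,
# Signal_Detail_Page, Target_Detail_Page), and the last holds Final_Xpath.
# All concrete URLs and XPaths below are made-up examples, not the author's
# real configuration:
#
# {
#     "mangguo_tv": [
#         {
#             "Index_Url": ["https://example.com/list?page=1"],
#             "Is_Json": 0,
#             "Max_Page": {"xpath": "//div[@class='pager']//text()", "re": "\\d+"}
#         },
#         {"Final_Xpath": {"title": "//h1/text()"}}
#     ]
# }
#
# When Is_Json is 1, Max_Page instead carries an "index" list naming the JSON
# key path that parse_json() walks to reach the total page count.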
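
# Sketches of the helpers referenced above. get_HeadUrl, Total_page_circulate,
# and Turn_True_Page live elsewhere in the project; these bodies are
# assumptions based only on how the spider calls them, not the real
# implementations.

def get_HeadUrl(index_url, spider_name):
    # Assumed behaviour, matching the comment in parse_splash: swap the
    # trailing \d+ of the index URL for a {page} placeholder so the parse
    # callbacks can fill in each page via urls.format(page=...).
    return re.sub(r'\d+$', '{page}', index_url)

def Total_page_circulate(spider_name, max_pages, cap=50):
    # Assumed behaviour: clamp the page count reported by the site to a
    # per-spider ceiling so a bogus value cannot schedule thousands of pages.
    return min(max_pages, cap)

def Turn_True_Page(page, spider_name):
    # Assumed behaviour: translate a logical page number into whatever the
    # site actually expects (identity here; some sites want offsets instead).
    return page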