def parse_third(self, response):
    """Collect detail-page URLs and optional extra fields from a response.

    Reads ``Index_Url``, ``Target_Detail_Page`` and ``Final_Xpath`` from
    ``response.meta``.  The XPath stored in ``Target_Detail_Page['xpath']``
    is evaluated against the response and the extracted values are handed
    to ``Relative_to_Absolute`` together with ``Index_Url`` and the spider
    name.  If ``Target_Detail_Page`` contains a ``'Some_Info'`` mapping,
    every value in it is evaluated as an XPath and the first extracted
    match is stored under the same key; a failing entry is printed and
    skipped.

    Raises:
        KeyError: if ``'Index_Url'`` is absent from ``response.meta``.
        TypeError: if ``'Target_Detail_Page'`` is absent, because the
            ``None`` default is subscripted right away.
    """
    Index_Url = response.meta['Index_Url']
    Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
    # NOTE(review): Final_Xpath, detail_url and Some_Info are not consumed
    # anywhere in the visible body -- presumably code further down uses
    # them; confirm before removing.
    Final_Xpath = response.meta.get('Final_Xpath', None)

    detail_url = Relative_to_Absolute(
        Index_Url,
        response.xpath(Target_Detail_Page['xpath']).extract(),
        self.name,
    )

    Some_Info = {}
    if 'Some_Info' in Target_Detail_Page:
        for key, info_xpath in Target_Detail_Page['Some_Info'].items():
            try:
                # Only the first match is kept; an empty result raises
                # IndexError and lands in the handler below.
                Some_Info[key] = response.xpath(info_xpath).extract()[0]
            except Exception as e:
                # Best-effort: a bad entry must not abort the whole page.
                # A single pre-formatted argument prints the same text under
                # both the print statement and the print function.
                print("%s : %s" % (Exception, e))
def parse_json2(self, response):
    """Extract detail-page URLs from a JSON response body.

    ``All_Detail_Page['index']`` (taken from ``response.meta``) is treated
    as a list of keys describing a path into the decoded JSON: every key
    except the last is followed with ``.get`` to reach an iterable of
    objects, and the last key is then read from each of those objects.
    The collected values are passed to ``Relative_to_Absolute`` with
    ``Index_Url`` and the spider name; if that call raises, the error is
    printed and the collected values are left unchanged.

    Raises:
        TypeError: if ``'All_Detail_Page'`` is absent from
            ``response.meta``, because the ``None`` default is subscripted.
    """
    meta = response.meta
    Index_Url = meta.get('Index_Url', None)
    All_Detail_Page = meta.get('All_Detail_Page', None)
    # NOTE(review): the next three values (and the final detail_url) are not
    # consumed anywhere in the visible body -- presumably code further down
    # uses them; confirm before removing.
    Signal_Detail_Page = meta.get('Signal_Detail_Page', None)
    Target_Detail_Page = meta.get('Target_Detail_Page', None)
    Final_Xpath = meta.get('Final_Xpath', None)

    res_json = json.loads(response.body_as_unicode())

    # Descend through every key but the last: the innermost container has to
    # be iterated item by item, so the walk stops one level above the leaf.
    key_path = All_Detail_Page['index']
    for key in key_path[:-1]:
        res_json = res_json.get(key)

    # The last key of the path is the field read from each item.
    detail_url = [item.get(key_path[-1]) for item in res_json]

    try:
        detail_url = Relative_to_Absolute(Index_Url, detail_url, self.name)
    except Exception as e:
        # Best-effort: on failure keep the values as collected.  A single
        # pre-formatted argument prints the same text under both the print
        # statement and the print function.
        print("%s : %s" % (Exception, e))
Signal_Detail_Page = response.meta.get('Signal_Detail_Page',None) Target_Detail_Page = response.meta.get('Target_Detail_Page',None) Final_Xpath = response.meta.get('Final_Xpath',None) Some_Info = {} if 'Some_Info' in All_Detail_Page.keys(): keys = All_Detail_Page['Some_Info'].keys() for key in keys: try: Some_Info[key] = response.xpath(All_Detail_Page['Some_Info'][key]).extract()[0] except Exception,e: print Exception,":",e #一个页面可能会需要多个提取的xpath,这里就指定为一个list了 detail_url = [] for xpath in All_Detail_Page['xpath']: for url in Relative_to_Absolute(Index_Url,response.xpath(xpath).extract(),self.name): detail_url.append(url) #在考虑在每一层加一个判断,相当于如果没有(第一个)要传递给下一层的数据,就直接传递给final_parse(注:在传递给final_parse时需要判断是否需要渲染,这里我暂时先默认都渲染,但是之后可以考虑在config.json的Final_Xpath加一个flag,1表示需要渲染,0表示不需要) if Signal_Detail_Page is None: for url in detail_url: request = Request(url,callback = self.parse_final,dont_filter=True,meta={ 'splash':{ 'endpoint':'render.html', 'args':{ #只有aiyiyi需要load 10s,才能拿到播放量 'wait':0.5, 'images':0, 'render_all':1 } } })