def parse_json(self, response):
    """Derive the page count for an index page whose body is JSON.

    The crawl settings are read from ``response.meta``; keys that were not
    attached to the request come back as ``None``.  When
    ``Max_Page['index']`` is a list it is treated as a path of keys that
    is followed through the decoded JSON body with ``.get``.  The value
    reached is converted with ``int`` and handed to
    ``Total_page_circulate`` together with the spider name.

    NOTE(review): ``max_pages`` and most of the settings read below are
    not used by the code visible here -- this method looks truncated;
    confirm against the complete file.
    """
    meta = response.meta
    Index_Url = meta.get('Index_Url', None)
    Max_Page = meta.get('Max_Page', None)
    All_Detail_Page = meta.get('All_Detail_Page', None)
    Signal_Detail_Page = meta.get('Signal_Detail_Page', None)
    Target_Detail_Page = meta.get('Target_Detail_Page', None)
    Final_Xpath = meta.get('Final_Xpath', None)

    res_json = json.loads(response.body_as_unicode())

    key_path = Max_Page['index']
    if isinstance(key_path, list):
        try:
            # Descend one key at a time; a missing key turns the cursor
            # into None and the next .get() raises, which lands below.
            for key in key_path:
                res_json = res_json.get(key)
        except Exception as e:
            print("%s : %s" % (Exception, e))

    max_pages = Total_page_circulate(self.name, int(res_json))
class movieSpider(scrapy.Spider):
    """Config-driven spider registered under the name ``youku_movie``.

    ``start_requests`` loads ``config.json`` from the working directory and
    keeps the entry whose key equals the spider name.  The entry's value is
    a sequence of 2 to 5 dicts ("stages"):

    * first stage: ``Index_Url`` (iterable of URLs), ``Is_Json`` and
      ``Max_Page``;
    * optional middle stages, in order: ``All_Detail_Page``,
      ``Signal_Detail_Page``, ``Target_Detail_Page``;
    * last stage: ``Final_Xpath``.

    Every index URL becomes one request whose ``meta`` carries those
    settings.  ``Is_Json == 1`` routes the request to ``parse_json``;
    anything else routes it through Splash to ``parse_splash``.
    """

    name = 'youku_movie'
    # NOTE(review): Scrapy reads ``allowed_domains`` (plural).  The original
    # attribute name is kept unchanged -- confirm whether it is intentional.
    allowed_domain = []

    # Keys of the optional middle stages, in the order they appear in a
    # config entry between the first stage and the Final_Xpath stage.
    _DETAIL_STAGE_KEYS = (
        'All_Detail_Page',
        'Signal_Detail_Page',
        'Target_Detail_Page',
    )

    def __init__(self, *args, **kwargs):
        super(movieSpider, self).__init__(*args, **kwargs)
        self.now = time.time()
        self.config = []
        self.Index_Url = ""

    @staticmethod
    def _splash_meta():
        """Return a new ``meta`` dict asking Splash for a ``render.html`` page.

        A fresh dict is built on every call so no two requests share the
        nested ``splash`` mapping.
        """
        return {
            'splash': {
                'endpoint': 'render.html',
                'args': {
                    'wait': 0.5,
                    'images': 0,
                    'render_all': 1,
                },
            },
        }

    def _index_requests(self, stages):
        """Yield one index request per URL of a single config entry.

        ``stages`` is the value stored under the spider name in
        ``config.json``.  Entries with fewer than 2 or more than 5 stages
        yield nothing, exactly as before.
        """
        stage_count = len(stages)
        if stage_count < 2 or stage_count > 5:
            return

        first_stage = stages[0]
        self.Index_Url = first_stage['Index_Url']
        Is_Json = first_stage['Is_Json']

        # Settings forwarded to the callback through request.meta.
        forwarded = {'Max_Page': first_stage['Max_Page']}
        middle_keys = self._DETAIL_STAGE_KEYS[:stage_count - 2]
        for offset, key in enumerate(middle_keys):
            forwarded[key] = stages[offset + 1][key]
        forwarded['Final_Xpath'] = stages[stage_count - 1]['Final_Xpath']

        for url in self.Index_Url:
            if Is_Json == 1:
                request = Request(url, callback=self.parse_json)
            else:
                # NOTE(review): only the 4-stage layout bypassed the
                # duplicate filter in the original code; that is preserved
                # here.  Confirm whether the other layouts should too.
                request = Request(url,
                                  callback=self.parse_splash,
                                  dont_filter=(stage_count == 4),
                                  meta=self._splash_meta())
            request.meta['Index_Url'] = url
            request.meta.update(forwarded)
            yield request

    def start_requests(self):
        """Load ``config.json`` and yield the index requests for this spider."""
        # The with-block closes the file; no explicit close() is needed.
        with open('config.json', 'r') as f:
            data = json.load(f)

        for entry in data.iteritems():
            if entry[0].encode('utf-8') == self.name:
                self.config.append(entry)

        for entry in self.config:
            for request in self._index_requests(entry[1]):
                yield request

    def parse_splash(self, response):
        """Work out the page count of a Splash-rendered index page.

        The page count is extracted by applying the regular expression
        ``Max_Page['re']`` to the joined text selected by
        ``Max_Page['xpath']``.  If that fails, the error is printed and the
        fallback value 2 is used.

        NOTE(review): ``urls`` and ``max_pages`` are computed but never used
        in the code visible here -- this method looks truncated; confirm
        against the complete file.
        """
        # Every setting is read whether or not it was attached; a missing
        # one is None and is meant to be checked at the point of use (an
        # absent detail stage means going straight to the final parse).
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)

        max_pages = 2
        try:
            page_text = ''.join(response.xpath(Max_Page['xpath']).extract())
            max_pages = re.search(Max_Page['re'], page_text).group()
        except Exception as e:
            print("%s : %s" % (Exception, e))

        # Replaces the trailing \d+ of the index URL; for other URL shapes,
        # extend get_HeadUrl().
        urls = get_HeadUrl(Index_Url, self.name)
        try:
            max_pages = Total_page_circulate(self.name, int(max_pages))
        except Exception as e:
            print("%s : %s" % (Exception, e))
All_Detail_Page = response.meta.get('All_Detail_Page', None) Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None) Target_Detail_Page = response.meta.get('Target_Detail_Page', None) Final_Xpath = response.meta.get('Final_Xpath', None) res_json = json.loads(response.body_as_unicode()) depth = 0 try: while depth < len(Max_Page['index']): res_json = res_json.get(Max_Page['index'][depth]) depth += 1 except Exception, e: print Exception, ":", e urls = get_HeadUrl(Index_Url, self.name) max_pages = Total_page_circulate(self.name, int(res_json)) print "最大页数是:%d" % max_pages if All_Detail_Page is None: for i in range(1, max_pages + 1): i = Turn_True_Page(i, self.name) url = urls.format(page=str(i)) request = Request(url, callback=self.parse_final, dont_filter=True, meta={ 'splash': { 'endpoint': 'render.html', 'args': { 'wait': 0.5, 'images': 0, 'render_all': 1
class movieSpider(scrapy.Spider):
    """Config-driven spider registered under the name ``letv_movie``.

    ``start_requests`` loads ``config.json`` from the working directory and
    keeps the entry whose key equals the spider name.  The entry's value is
    a sequence of 2 to 5 dicts ("stages"):

    * first stage: ``Splash``, ``Index_Url`` (iterable of URLs),
      ``Is_Json`` and ``Max_Page``;
    * optional middle stages, in order: ``All_Detail_Page``,
      ``Signal_Detail_Page``, ``Target_Detail_Page``;
    * last stage: ``Final_Xpath``.

    Every index URL becomes one request whose ``meta`` carries those
    settings.  ``Is_Json == 1`` routes the request to ``parse_json``;
    anything else routes it through Splash to ``parse_splash``.
    """

    name = 'letv_movie'
    # NOTE(review): Scrapy reads ``allowed_domains`` (plural).  The original
    # attribute name is kept unchanged -- confirm whether it is intentional.
    allowed_domain = []

    # Keys of the optional middle stages, in the order they appear in a
    # config entry between the first stage and the Final_Xpath stage.
    _DETAIL_STAGE_KEYS = (
        'All_Detail_Page',
        'Signal_Detail_Page',
        'Target_Detail_Page',
    )

    def __init__(self, *args, **kwargs):
        super(movieSpider, self).__init__(*args, **kwargs)
        self.now = time.time()
        self.config = []
        self.Index_Url = ""

    @staticmethod
    def _splash_meta():
        """Return a new ``meta`` dict asking Splash for a ``render.html`` page.

        A fresh dict is built on every call so no two requests share the
        nested ``splash`` mapping.
        """
        return {
            'splash': {
                'endpoint': 'render.html',
                'args': {
                    'wait': 0.5,
                    'images': 0,
                    'render_all': 1,
                },
            },
        }

    def _index_requests(self, stages):
        """Yield one index request per URL of a single config entry.

        ``stages`` is the value stored under the spider name in
        ``config.json``.  Entries with fewer than 2 or more than 5 stages
        yield nothing, exactly as before.
        """
        stage_count = len(stages)
        if stage_count < 2 or stage_count > 5:
            return

        first_stage = stages[0]
        # Remembered on the instance: parse_splash consults it to decide
        # whether listing pages are rendered through Splash.
        self.Splash = first_stage['Splash']
        self.Index_Url = first_stage['Index_Url']
        Is_Json = first_stage['Is_Json']

        # Settings forwarded to the callback through request.meta.
        forwarded = {'Max_Page': first_stage['Max_Page']}
        middle_keys = self._DETAIL_STAGE_KEYS[:stage_count - 2]
        for offset, key in enumerate(middle_keys):
            forwarded[key] = stages[offset + 1][key]
        forwarded['Final_Xpath'] = stages[stage_count - 1]['Final_Xpath']

        for url in self.Index_Url:
            if Is_Json == 1:
                request = Request(url, callback=self.parse_json)
            else:
                # NOTE(review): only the 4-stage layout bypassed the
                # duplicate filter in the original code; that is preserved
                # here.  Confirm whether the other layouts should too.
                request = Request(url,
                                  callback=self.parse_splash,
                                  dont_filter=(stage_count == 4),
                                  meta=self._splash_meta())
            request.meta['Index_Url'] = url
            request.meta.update(forwarded)
            yield request

    def start_requests(self):
        """Load ``config.json`` and yield the index requests for this spider."""
        # The with-block closes the file; no explicit close() is needed.
        with open('config.json', 'r') as f:
            data = json.load(f)

        for entry in data.iteritems():
            if entry[0].encode('utf-8') == self.name:
                self.config.append(entry)

        for entry in self.config:
            for request in self._index_requests(entry[1]):
                yield request

    def parse_splash(self, response):
        """Fan a Splash-rendered index page out into one request per page.

        The page count is extracted by applying the regular expression
        ``Max_Page['re']`` to the joined text selected by
        ``Max_Page['xpath']``:

        * digits are passed through ``Total_page_circulate``;
        * an empty match defers to ``Total_Page_Byyourself``;
        * any other text raises ``CloseSpider``;
        * if the extraction itself fails, the error is printed and the
          fallback count ``'2'`` is used.

        Without an ``All_Detail_Page`` stage the listing pages go straight
        to ``parse_final`` (through Splash only when ``self.Splash`` is
        truthy).  Otherwise they go to ``parse_first``, always through
        Splash, with the remaining settings forwarded in ``meta``.

        Raises:
            CloseSpider: the extracted page-count text is neither digits
                nor empty.
        """
        # Every setting is read whether or not it was attached; a missing
        # one is None and is checked at the point of use (an absent detail
        # stage means going straight to the final parse).
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)

        # The fallback is a string so it flows through the same
        # isdigit()/int() path as an extracted value; the previous int
        # fallback made .isdigit() raise AttributeError.
        max_pages = '2'
        try:
            page_text = ''.join(response.xpath(Max_Page['xpath']).extract())
            max_pages = re.search(Max_Page['re'], page_text).group()
        except Exception as e:
            print("%s : %s" % (Exception, e))

        if max_pages.isdigit():
            max_pages = Total_page_circulate(self.name, int(max_pages))
        elif max_pages == '':
            max_pages = Total_Page_Byyourself(self.name)
        else:
            raise CloseSpider("渲染页面中,找不到Max_Page ,请重新确认 ,爬虫关闭!!!")

        urls = get_HeadUrl(Index_Url, self.name)
        print("最大页数是:%d" % max_pages)

        if All_Detail_Page is None:
            render_with_splash = self.Splash
            for page in range(1, max_pages + 1):
                page = Turn_True_Page(page, self.name)
                url = urls.format(page=str(page))
                if render_with_splash:
                    request = Request(url,
                                      callback=self.parse_final,
                                      dont_filter=True,
                                      meta=self._splash_meta())
                else:
                    request = Request(url,
                                      callback=self.parse_final,
                                      dont_filter=True)
                request.meta['Final_Xpath'] = Final_Xpath
                yield request
        else:
            for page in range(1, int(max_pages) + 1):
                try:
                    page = Turn_True_Page(page, self.name)
                    url = urls.format(page=str(page))
                except Exception as e:
                    print("%s : %s" % (Exception, e))
                    # Without a URL there is nothing to request; skip this
                    # page instead of reusing a stale or undefined url.
                    continue
                request = Request(url,
                                  callback=self.parse_first,
                                  dont_filter=True,
                                  meta=self._splash_meta())
                request.meta['Index_Url'] = Index_Url
                request.meta['All_Detail_Page'] = All_Detail_Page
                request.meta['Signal_Detail_Page'] = Signal_Detail_Page
                request.meta['Target_Detail_Page'] = Target_Detail_Page
                request.meta['Final_Xpath'] = Final_Xpath
                yield request