def parse_detail(self, response):
    # Build one JingMTItem per article detail page.
    jingmeiti = JingMTItem()
    jingmeiti['spider_url'] = response.url
    jingmeiti['md5'] = get_md5(response.url.encode('utf-8'))
    jingmeiti['title'] = response.meta.get("title", "")

    # The "original article" badge is absent on reposted articles.
    wenzhang = response.xpath(
        '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postoriginal"]/text()'
    ).extract()
    if len(wenzhang) == 0:
        wenzhang = ['非原创文章']  # i.e. "non-original article"
    jingmeiti['wenzhang'] = wenzhang[0]

    pattern = response.xpath(
        '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postcat"]/a/text()'
    ).extract()
    jingmeiti['pattern'] = pattern[0]

    # The author is usually a link; fall back to the span's plain text when it is not.
    writer = response.xpath(
        '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postauthor"]/a/text()'
    ).extract()
    if len(writer) == 0:
        writer = response.xpath(
            '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postauthor"]/text()'
        ).extract()
    jingmeiti['writer'] = writer[0]

    public_time = response.xpath(
        '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postclock"]/text()'
    ).extract()
    jingmeiti['public_time'] = public_time[0]

    # //*[@id="page-content"]/div/div/div[1]/div[2]/div[2]
    htmls = response.xpath('//div[@class="post-content"]').extract()
    html = htmls[0]
    if isinstance(html, bytes):
        html = html.decode()
    jingmeiti['html'] = html
    yield jingmeiti
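# get_md5() is imported from elsewhere in the project and is not shown in this
# section. The sketch below shows the assumed shape of that helper (a thin hashlib
# wrapper returning the hex digest); the name get_md5_sketch is used here so as not
# to shadow the real import, and the actual implementation may differ.
import hashlib

def get_md5_sketch(data):
    """Return the hex MD5 digest of a URL passed as bytes (or as str, for convenience)."""
    if isinstance(data, str):
        data = data.encode('utf-8')
    return hashlib.md5(data).hexdigest()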
def parse_detail(self, response):
    if len(response.text) != 0:
        title = response.xpath(
            '//div[@id="wrapper"]/div[@class="sjbt"]/h1/text()').extract()
        if len(title) != 0:
            yitiku = YitikuShijuanItem()
            title = title[0]
            yitiku['title'] = title

            # Map the two-character subject name to its numeric id; 11 is the fallback.
            subject = response.xpath(
                '//div[@class="path"]/label[2]/text()').extract()
            if len(subject) != 0:
                subject = subject[0][-2:]
                for key, value in self.subject_item.items():
                    if subject in key:
                        subject = value
                yitiku['subject'] = subject
            else:
                yitiku['subject'] = 11

            # Header block: applicable grade, paper type, province, year,
            # question count and view count.
            heads_news = response.xpath(
                '//div[@id="wrapper"]/div[@class="sjbt"]/div').extract()
            if len(heads_news) != 0:
                heads_news = heads_news[0]
                grade = re.findall('<span>适用年级:(.+?)</span>', heads_news)
                yitiku['grade'] = grade[0]
                pattern = re.findall('<span>试卷类型:(.+?)</span>', heads_news)
                yitiku['pattern'] = pattern[0]
                province = re.findall('<span>适用省份:(.+?)</span>', heads_news)
                yitiku['province'] = province[0]
                year = re.findall('<span>试卷年份:(.+?)</span>', heads_news)
                yitiku['year'] = year[0]
                ti_number = re.findall('<span>题数:(.+?)</span>', heads_news)
                yitiku['ti_number'] = ti_number[0]
                watch_number = re.findall('<span>浏览数:(.+?)</span>', heads_news)
                yitiku['watch_number'] = watch_number[0]

            shijuan = response.xpath(
                '//div[@id="wrapper"]/div[@class="box1000"]').extract()
            # //*[@id="js_qs"]/li[2]/a
            question_urls = response.xpath(
                '//ul[@id="js_qs"]/li[@class="icon5"]/a/@href').extract()
            question_url = []
            for i in question_urls:
                new_urls = 'http://www.yitiku.cn/shijuan' + i
                question_url.append(new_urls)
            yitiku['question_urls'] = question_url
            yitiku['html'] = shijuan[0]
            yitiku['spider_url'] = response.url
            yitiku['md5'] = get_md5(response.url.encode('utf-8'))
            yitiku['spider_source'] = 59
            yitiku['source_id'] = response.meta.get('source_id', "")
            yield yitiku

    # if response.url not in self.parsed_all_url:
    #     self.parsed_all_url.append(response.url)
    yield scrapy.Request(
        url=response.url,
        dont_filter=True,
        headers={"User-Agent": random.choice(self.User_Agents)},
        callback=self.parse)
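# self.subject_item is defined elsewhere in the spider. From the loop above (the
# two-character subject name is matched against each key and replaced by a numeric
# id, with 11 as the fallback), it is assumed to be a mapping of the form
# {'<subject name>': <numeric id>, ...}; the actual names and ids are not shown here.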
def parse_fourth_url(self, response):
    next_url_number = response.xpath(
        '//div[@class="page"]/a[@style="display:none"]/text()').extract()
    yitiku = YitikuPageUrlItem()
    url_list = []
    md5_list = []
    if len(next_url_number) == 0:
        for i in range(1, 5):
            new_urls = response.url + '?page=' + str(i)
            md5 = get_md5(new_urls.encode('utf-8'))
            md5_list.append(md5)
            url_list.append(new_urls)
    else:
        for i in range(1, int(next_url_number[0]) + 1):
            new_urls = response.url + '?page=' + str(i)
            md5 = get_md5(new_urls.encode('utf-8'))
            md5_list.append(md5)
            url_list.append(new_urls)
    yitiku['spider_url'] = url_list
    yitiku['md5'] = md5_list
    yield yitiku
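# Illustration only (the list URL below is hypothetical): if response.url is
# 'http://www.yitiku.cn/shijuan/list-1001' and the pager's hidden anchor holds '12',
# parse_fourth_url yields a single YitikuPageUrlItem whose spider_url list is
#   http://www.yitiku.cn/shijuan/list-1001?page=1 ... ?page=12
# with one matching MD5 per URL; when the hidden anchor is absent it defaults to
# pages 1 through 4.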
def parse_detail(self, response):
    # Build a JmdItem from the article detail page.
    if response.status == 200:
        try:
            title = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/div[@class="newContent"]/h1/text()').extract()
        except:
            title = ''
        if len(title) != 0:
            jiemodui = JmdItem()
            jiemodui['title'] = title[0]
            jiemodui['spider_url'] = response.url
            jiemodui['md5'] = get_md5(response.url.encode('utf-8'))
            try:
                public_time = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/div[@class="newContent"]/p/span/time/text()').extract()
                if len(public_time) != 0:
                    jiemodui['public_time'] = public_time[0]
                else:
                    jiemodui['public_time'] = ''
            except:
                pass
            try:
                writer = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/div[@class="newContent"]/p/a/text()').extract()
                if len(writer) != 0:
                    jiemodui['writer'] = writer[0]
                else:
                    jiemodui['writer'] = ''
            except:
                pass
            try:
                # First comment under the article, decoded if it comes back as bytes.
                comment = response.xpath('//ul[@class="minuteDiscuss clearfix"]/li/dl/dd/p[@class="xiangxi"]/text()').extract()
                if len(comment) != 0:
                    comment = comment[0]
                    if isinstance(comment, bytes):
                        comment = comment.decode()
                    jiemodui['comment'] = comment
                else:
                    jiemodui['comment'] = ''
            except:
                pass
            try:
                html = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/article').extract()
                jiemodui['html'] = html[0]
            except:
                pass
            yield jiemodui
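# JmdItem is imported from the project's items module and is not shown here. From the
# fields assigned above it is assumed to declare at least the following; the class
# below is an illustrative sketch (note the hypothetical name), not the project's
# actual item definition.
import scrapy

class JmdItemSketch(scrapy.Item):
    title = scrapy.Field()
    spider_url = scrapy.Field()
    md5 = scrapy.Field()
    public_time = scrapy.Field()
    writer = scrapy.Field()
    comment = scrapy.Field()
    html = scrapy.Field()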
def parse_detail(self, response):
    if len(response.text) != 0:
        subject = response.xpath(
            '//div[@class="full full03"]/div/div[@class="path"]/a[2]/text()'
        ).extract()
        if len(subject) != 0:
            yitiku = YitikuShitiItem()

            # Answer: either a rich-text block or a plain bold label.
            answer1 = response.xpath(
                '//li[@class="noborder"]/div').extract()
            answer2 = response.xpath(
                '//li[@class="noborder"]/b/text()').extract()
            answer = answer1 or answer2
            if len(answer) == 0:
                yitiku['answer'] = ''
            else:
                yitiku['answer'] = answer[0]

            analy = response.xpath(
                '//div[@class="quesTxt quesTxt2"]/ul[2]/li[1]/div'
            ).extract()
            if len(analy) != 0:
                yitiku['analy'] = analy[0]
            else:
                yitiku['analy'] = ''

            # Breadcrumb text carries the grade (first two characters) and the
            # subject (last two characters), which is mapped to its numeric id.
            yitiku['grade'] = subject[0][:2]
            subject = subject[0][-2:]
            yitiku['subject'] = subject
            for key, value in self.subject_item.items():
                if subject in key:
                    subject = value
                    yitiku['subject'] = subject

            yitiku['book'] = response.meta.get("book", "")
            yitiku['version'] = response.meta.get("version", "")

            pattern = response.xpath(
                '//div[@class="detailsTitle"]/h3/text()').extract()
            if len(pattern) != 0:
                yitiku['pattern'] = pattern[0]
            else:
                yitiku['pattern'] = ''

            source_shijuan = response.xpath(
                '//div[@class="quesdiv"]/h1').extract()
            if len(source_shijuan) != 0:
                yitiku['source_shijuan'] = source_shijuan[0]
            else:
                yitiku['source_shijuan'] = ''

            difficulty = response.xpath(
                '//div[@class="handle"]/div/u[1]/i/text()').extract()
            if len(difficulty) != 0:
                yitiku['difficulty'] = difficulty[0]
            else:
                yitiku['difficulty'] = ''

            kaodian = response.xpath(
                '//div[@class="quesTxt quesTxt2"]/ul/li/div/a/text()'
            ).extract()
            if len(kaodian) != 0:
                yitiku['kaodian'] = kaodian[0]
            else:
                yitiku['kaodian'] = ''

            shijuan = response.xpath('//div[@class="quesdiv"]').extract()
            if len(shijuan) != 0:
                yitiku['topic'] = shijuan[0]
            yitiku['spider_url'] = response.url
            yitiku['md5'] = get_md5(response.url.encode('utf-8'))
            yitiku['spider_source'] = 59
            yitiku['html'] = response.text
            yield yitiku
def parse_detail_shiti(self, response):
    # Question id carried through the request meta.
    question_id = response.meta.get("source_id", 0)
    if len(response.text) != 0:
        subject = response.xpath(
            '//div[@class="full full03"]/div/div[@class="path"]/a[2]/text()'
        ).extract()
        if len(subject) != 0:
            yitiku = YitikuItem()
            answer1 = response.xpath(
                '//li[@class="noborder"]/div').extract()
            answer2 = response.xpath(
                '//li[@class="noborder"]/b/text()').extract()
            answer = answer1 or answer2
            if len(answer) == 0:
                yitiku['answer'] = ''
            else:
                yitiku['answer'] = answer[0]

            analy = response.xpath(
                '//div[@class="quesTxt quesTxt2"]/ul[2]/li[1]/div'
            ).extract()
            if len(analy) != 0:
                yitiku['analy'] = analy[0]
            else:
                yitiku['analy'] = ''

            yitiku['grade'] = subject[0][:2]
            subject = subject[0][-2:]
            yitiku['subject'] = subject
            for key, value in self.subject_item.items():
                if subject in key:
                    subject = value
                    yitiku['subject'] = subject

            pattern = response.xpath(
                '//div[@class="detailsTitle"]/h3/text()').extract()
            if len(pattern) != 0:
                yitiku['pattern'] = pattern[0]
            else:
                yitiku['pattern'] = ''

            source_shijuan = response.xpath(
                '//div[@class="quesdiv"]/h1').extract()
            if len(source_shijuan) != 0:
                yitiku['source_shijuan'] = source_shijuan[0]
            else:
                yitiku['source_shijuan'] = ''

            difficulty = response.xpath(
                '//div[@class="handle"]/div/u[1]/i/text()').extract()
            if len(difficulty) != 0:
                yitiku['difficulty'] = difficulty[0]
            else:
                yitiku['difficulty'] = ''

            kaodian = response.xpath(
                '//div[@class="quesTxt quesTxt2"]/ul/li/div/a/text()'
            ).extract()
            if len(kaodian) != 0:
                yitiku['kaodian'] = kaodian[0]
            else:
                yitiku['kaodian'] = ''

            shijuan = response.xpath('//div[@class="quesdiv"]').extract()
            if len(shijuan) != 0:
                yitiku['topic'] = shijuan[0]
            yitiku['spider_url'] = response.url
            yitiku['md5'] = get_md5(response.url.encode('utf-8'))
            yitiku['spider_source'] = 59
            yitiku['html'] = response.text
            yitiku['source_id'] = question_id
            yield yitiku

    # if question_id <= self.end_number:
    #     self.first_number = question_id + 1
    #     new_url = 'http://www.yitiku.cn/shiti/' + str(self.first_number) + '.html'
    #     time.sleep(0.02)
    #     yield scrapy.Request(url=new_url, dont_filter=True,
    #                          headers={"User-Agent": random.choice(self.User_Agents)},
    #                          cookies=self.cookies, callback=self.parse_detail_shiti,
    #                          meta={"source_id": self.first_number})
    yield scrapy.Request(
        url=response.url,
        dont_filter=True,
        callback=self.parse,
        headers={"User-Agent": random.choice(self.User_Agents)})
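# Almost every field above follows the same "extract(), check len(), take [0] or a
# default" pattern. Scrapy's SelectorList already exposes extract_first(default=...),
# so the pattern can be collapsed into a single call. The helper below is a
# hypothetical sketch of that idea, not something the spiders above define.

def first_or_default(selector_list, default=''):
    # Return the first extracted string, or the default when nothing matched.
    value = selector_list.extract_first()
    return value if value is not None else default

# e.g. yitiku['difficulty'] = first_or_default(
#          response.xpath('//div[@class="handle"]/div/u[1]/i/text()'))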