Example #1
 def parse_detail(self, response):
     jingmeiti = JingMTItem()
     jingmeiti['spider_url'] = response.url
     jingmeiti['md5'] = get_md5(response.url.encode('utf-8'))
     jingmeiti['title'] = response.meta.get("title", "")
     wenzhang = response.xpath(
         '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postoriginal"]/text()'
     ).extract()
     if len(wenzhang) == 0:
         wenzhang = ['非原创文章']
     jingmeiti['wenzhang'] = wenzhang[0]
     pattern = response.xpath(
         '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postcat"]/a/text()'
     ).extract()
     jingmeiti['pattern'] = pattern[0]
     writer = response.xpath(
         '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postauthor"]/a/text()'
     ).extract()
     if len(writer) == 0:
         writer = response.xpath(
             '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postauthor"]/text()'
         ).extract()
     jingmeiti['writer'] = writer[0]
     public_time = response.xpath(
         '//div[@class="post-title"]/div[@class="post_icon"]/span[@class="postclock"]/text()'
     ).extract()
     jingmeiti['public_time'] = public_time[0]
     #//*[@id="page-content"]/div/div/div[1]/div[2]/div[2]
     htmls = response.xpath('//div[@class="post-content"]').extract()
     html = htmls[0]
     if isinstance(html, bytes):
         html = html.decode()
     jingmeiti['html'] = html
     yield jingmeiti
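
All of these examples populate a scrapy.Item subclass (JingMTItem here, the Yitiku* and Jmd items below) and hash the page URL with a get_md5 helper, neither of which is shown. The sketch below is only an assumption of what those pieces might look like; the field names are taken from the assignments above, everything else is hypothetical.

    # Assumption-based sketch; the project's real items.py and md5 helper are
    # not part of the example, only the field names above are taken from it.
    import hashlib

    import scrapy


    class JingMTItem(scrapy.Item):
        spider_url = scrapy.Field()
        md5 = scrapy.Field()
        title = scrapy.Field()
        wenzhang = scrapy.Field()
        pattern = scrapy.Field()
        writer = scrapy.Field()
        public_time = scrapy.Field()
        html = scrapy.Field()


    def get_md5(url_bytes):
        """Return the hex MD5 digest of an already UTF-8 encoded URL."""
        return hashlib.md5(url_bytes).hexdigest()
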
Example #2
 def parse_detail(self, response):
     if len(response.text) != 0:
         title = response.xpath(
             '//div[@id="wrapper"]/div[@class="sjbt"]/h1/text()').extract()
         if len(title) != 0:
             yitiku = YitikuShijuanItem()
             title = title[0]
             yitiku['title'] = title
             subject = response.xpath(
                 '//div[@class="path"]/label[2]/text()').extract()
             if len(subject) != 0:
                 subject = subject[0][-2:]
                 for key, value in self.subject_item.items():
                     if subject in key:
                         subject = value
                         break
                 yitiku['subject'] = subject
             else:
                 yitiku['subject'] = 11
             heads_news = response.xpath(
                 '//div[@id="wrapper"]/div[@class="sjbt"]/div').extract()
             if len(heads_news) != 0:
                 heads_news = heads_news[0]
                 grade = re.findall('<span>适用年级:(.+?)</span>', heads_news)
                 yitiku['grade'] = grade[0]
                 pattern = re.findall('<span>试卷类型:(.+?)</span>', heads_news)
                 yitiku['pattern'] = pattern[0]
                 province = re.findall('<span>适用省份:(.+?)</span>',
                                       heads_news)
                 yitiku['province'] = province[0]
                 year = re.findall('<span>试卷年份:(.+?)</span>', heads_news)
                 yitiku['year'] = year[0]
                 ti_number = re.findall('<span>题数:(.+?)</span>', heads_news)
                 yitiku['ti_number'] = ti_number[0]
                 watch_number = re.findall('<span>浏览数:(.+?)</span>',
                                           heads_news)
                 yitiku['watch_number'] = watch_number[0]
             shijuan = response.xpath(
                 '//div[@id="wrapper"]/div[@class="box1000"]').extract()
             #//*[@id="js_qs"]/li[2]/a //*[@id="js_qs"]/li[2]/a
             question_urls = response.xpath(
                 '//ul[@id="js_qs"]/li[@class="icon5"]/a/@href').extract()
             question_url = []
             for i in question_urls:
                 new_urls = 'http://www.yitiku.cn/shijuan' + i
                 question_url.append(new_urls)
             yitiku['question_urls'] = question_url
             yitiku['html'] = shijuan[0]
             yitiku['spider_url'] = response.url
             yitiku['md5'] = get_md5(response.url.encode('utf-8'))
             yitiku['spider_source'] = 59
             yitiku['source_id'] = response.meta.get('source_id', "")
             yield yitiku
             # if response.url not in self.parsed_all_url:
             #     self.parsed_all_url.append(response.url)
     yield scrapy.Request(
         url=response.url,
         dont_filter=True,
         headers={"User-Agent": random.choice(self.User_Agents)},
         callback=self.parse)
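
parse_detail expects a source_id in response.meta and finishes by re-queuing the listing URL into self.parse with a random User-Agent. A minimal sketch of the listing-page callback that could drive it is shown below; the spider name, start URL path, link XPath, and User_Agents pool are assumptions rather than the original spider.

    # Hypothetical spider skeleton feeding parse_detail (Example #2); only the
    # meta/source_id and random User-Agent pattern is taken from the code above.
    import random

    import scrapy


    class YitikuShijuanSpider(scrapy.Spider):
        name = 'yitiku_shijuan'                       # assumed name
        start_urls = ['http://www.yitiku.cn/shijuan']
        User_Agents = ['Mozilla/5.0 (X11; Linux x86_64)']  # placeholder pool

        def parse(self, response):
            # follow each paper link on the listing page into parse_detail,
            # which would be the method shown in Example #2
            for href in response.xpath('//a[contains(@href, "/shijuan/")]/@href').extract():
                yield scrapy.Request(
                    url=response.urljoin(href),
                    headers={"User-Agent": random.choice(self.User_Agents)},
                    callback=self.parse_detail,
                    meta={"source_id": href})  # id derivation is project-specific
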
Example #3
 def parse_fourth_url(self, response):
     next_url_number = response.xpath(
         '//div[@class="page"]/a[@style="display:none"]/text()').extract()
     yitiku = YitikuPageUrlItem()
     url_list = []
     md5_list = []
     # the hidden pager link carries the total page count; default to 4 pages
     page_count = 4 if len(next_url_number) == 0 else int(next_url_number[0])
     for i in range(1, page_count + 1):
         new_urls = response.url + '?page=' + str(i)
         md5 = get_md5(new_urls.encode('utf-8'))
         md5_list.append(md5)
         url_list.append(new_urls)
     yitiku['spider_url'] = url_list
     yitiku['md5'] = md5_list
     yield yitiku
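
Example #3 emits a single YitikuPageUrlItem whose spider_url and md5 fields are parallel lists: entry i of each list describes the same paginated URL. The pipeline that consumes it is not shown; the sketch below is purely illustrative and not from the original project.

    # Hypothetical pipeline for YitikuPageUrlItem; only meant to show how the
    # two parallel lists line up.
    class PageUrlPipeline:
        def process_item(self, item, spider):
            for url, url_md5 in zip(item['spider_url'], item['md5']):
                spider.logger.info('collected page %s (md5=%s)', url, url_md5)
            return item
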
Example #4
    def parse_detail(self, response):
        if response.status == 200:
            try:
                title = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/div[@class="newContent"]/h1/text()').extract()
            except:
                title = ''

            if len(title) != 0:
                jiemodui = JmdItem()
                jiemodui['title'] = title[0]
                jiemodui['spider_url'] = response.url
                jiemodui['md5'] = get_md5(response.url.encode('utf-8'))
                try:
                    public_time = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/div[@class="newContent"]/p/span/time/text()').extract()
                    if len(public_time) != 0:
                        jiemodui['public_time'] = public_time[0]
                    else:
                        jiemodui['public_time'] = ''
                except:
                    pass

                try:
                    writer = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/div[@class="newContent"]/p/a/text()').extract()
                    if len(writer) != 0:
                        jiemodui['writer'] = writer[0]
                    else:
                        jiemodui['writer'] = ''
                except:
                    pass

                try:
                    comment = response.xpath('//ul[@class="minuteDiscuss clearfix"]/li/dl/dd/p[@class="xiangxi"]/text()').extract()
                    if len(comment) != 0:
                        comment = comment[0]
                        # extract() already yields text; decode defensively if bytes appear
                        if isinstance(comment, bytes):
                            comment = comment.decode()
                        jiemodui['comment'] = comment
                    else:
                        jiemodui['comment'] = ''
                except:
                    pass

                try:
                    html = response.xpath('//div[@id="div_MainCotent"]/section[@class="detailsMain center"]/div[@class="CSDMain"]/article').extract()
                    jiemodui['html'] = html[0]
                except:
                    pass
                yield jiemodui
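
Example #4 repeats the same try/except plus length-check pattern for every field. Scrapy selectors already provide extract_first(default) for this, and a small helper such as the hypothetical one below would collapse the boilerplate; neither appears in the original spider.

    # Illustrative helper, not part of the original spider.
    def first_or_default(response, xpath, default=''):
        """Return the first extracted match for xpath, or default when empty."""
        values = response.xpath(xpath).extract()
        return values[0] if values else default

    # built-in equivalent using the real Scrapy API:
    #     writer = response.xpath(writer_xpath).extract_first('')
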
Example #5
 def parse_detail(self, response):
     if len(response.text) != 0:
         subject = response.xpath(
             '//div[@class="full full03"]/div/div[@class="path"]/a[2]/text()'
         ).extract()
         if len(subject) != 0:
             yitiku = YitikuShitiItem()
             answer1 = response.xpath(
                 '//li[@class="noborder"]/div').extract()
             answer2 = response.xpath(
                 '//li[@class="noborder"]/b/text()').extract()
             answer = answer1 or answer2
             if len(answer) == 0:
                 yitiku['answer'] = ''
             else:
                 yitiku['answer'] = answer[0]
             analy = response.xpath(
                 '//div[@class="quesTxt quesTxt2"]/ul[2]/li[1]/div'
             ).extract()
             if len(analy) != 0:
                 yitiku['analy'] = analy[0]
             else:
                 yitiku['analy'] = ''
             yitiku['grade'] = subject[0][:2]
             subject = subject[0][-2:]
             # map the two-character subject name onto its numeric code
             for key, value in self.subject_item.items():
                 if subject in key:
                     subject = value
                     break
             yitiku['subject'] = subject
             yitiku['book'] = response.meta.get("book", "")
             yitiku['version'] = response.meta.get("version", "")
             pattern = response.xpath(
                 '//div[@class="detailsTitle"]/h3/text()').extract()
             if len(pattern) != 0:
                 yitiku['pattern'] = pattern[0]
             else:
                 yitiku['pattern'] = ''
             source_shijuan = response.xpath(
                 '//div[@class="quesdiv"]/h1').extract()
             if len(source_shijuan) != 0:
                 yitiku['source_shijuan'] = source_shijuan[0]
             else:
                 yitiku['source_shijuan'] = ''
             difficulty = response.xpath(
                 '//div[@class="handle"]/div/u[1]/i/text()').extract()
             if len(difficulty) != 0:
                 yitiku['difficulty'] = difficulty[0]
             else:
                 yitiku['difficulty'] = ''
             kaodian = response.xpath(
                 '//div[@class="quesTxt quesTxt2"]/ul/li/div/a/text()'
             ).extract()
             if len(kaodian) != 0:
                 yitiku['kaodian'] = kaodian[0]
             else:
                 yitiku['kaodian'] = ''
             shijuan = response.xpath('//div[@class="quesdiv"]').extract()
             if len(shijuan) != 0:
                 yitiku['topic'] = shijuan[0]
             yitiku['spider_url'] = response.url
             yitiku['md5'] = get_md5(response.url.encode('utf-8'))
             yitiku['spider_source'] = 59
             yitiku['html'] = response.text
             yield yitiku
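
Examples #2, #5, and #6 all map a two-character subject name onto a numeric code by substring-matching against self.subject_item, and Example #2 stores 11 when no subject can be read at all. The mapping itself is never shown; the keys and codes below are placeholders chosen only to illustrate the lookup.

    # subject_item is never defined in these examples; these entries are
    # placeholders, only the "substring match -> code" lookup mirrors the code.
    subject_item = {
        '语文': 1,   # Chinese
        '数学': 2,   # Mathematics
        '英语': 3,   # English
    }

    def map_subject(name, mapping=subject_item, default=11):
        # default of 11 mirrors the fallback used in Example #2
        for key, value in mapping.items():
            if name in key:
                return value
        return default
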
Example #6
 def parse_detail_shiti(self, response):
     # numeric question id passed in via meta by the scheduling callback
     question_id = response.meta.get("source_id", 0)
     if len(response.text) != 0:
         subject = response.xpath(
             '//div[@class="full full03"]/div/div[@class="path"]/a[2]/text()'
         ).extract()
         if len(subject) != 0:
             yitiku = YitikuItem()
             answer1 = response.xpath(
                 '//li[@class="noborder"]/div').extract()
             answer2 = response.xpath(
                 '//li[@class="noborder"]/b/text()').extract()
             answer = answer1 or answer2
             if len(answer) == 0:
                 yitiku['answer'] = ''
             else:
                 yitiku['answer'] = answer[0]
             analy = response.xpath(
                 '//div[@class="quesTxt quesTxt2"]/ul[2]/li[1]/div'
             ).extract()
             if len(analy) != 0:
                 yitiku['analy'] = analy[0]
             else:
                 yitiku['analy'] = ''
             yitiku['grade'] = subject[0][:2]
             subject = subject[0][-2:]
             # map the two-character subject name onto its numeric code
             for key, value in self.subject_item.items():
                 if subject in key:
                     subject = value
                     break
             yitiku['subject'] = subject
             pattern = response.xpath(
                 '//div[@class="detailsTitle"]/h3/text()').extract()
             if len(pattern) != 0:
                 yitiku['pattern'] = pattern[0]
             else:
                 yitiku['pattern'] = ''
             source_shijuan = response.xpath(
                 '//div[@class="quesdiv"]/h1').extract()
             if len(source_shijuan) != 0:
                 yitiku['source_shijuan'] = source_shijuan[0]
             else:
                 yitiku['source_shijuan'] = ''
             difficulty = response.xpath(
                 '//div[@class="handle"]/div/u[1]/i/text()').extract()
             if len(difficulty) != 0:
                 yitiku['difficulty'] = difficulty[0]
             else:
                 yitiku['difficulty'] = ''
             kaodian = response.xpath(
                 '//div[@class="quesTxt quesTxt2"]/ul/li/div/a/text()'
             ).extract()
             if len(kaodian) != 0:
                 yitiku['kaodian'] = kaodian[0]
             else:
                 yitiku['kaodian'] = ''
             shijuan = response.xpath('//div[@class="quesdiv"]').extract()
             if len(shijuan) != 0:
                 yitiku['topic'] = shijuan[0]
             yitiku['spider_url'] = response.url
             yitiku['md5'] = get_md5(response.url.encode('utf-8'))
             yitiku['spider_source'] = 59
             yitiku['html'] = response.text
             yitiku['source_id'] = question_id
             yield yitiku
     # if question_id <= self.end_number:
     #     self.first_number = question_id + 1
     #     new_url = 'http://www.yitiku.cn/shiti/' + str(self.first_number) + '.html'
     #     time.sleep(0.02)
     #     yield scrapy.Request(url=new_url, dont_filter=True, headers={"User-Agent": random.choice(self.User_Agents)},
     #                     cookies=self.cookies, callback=self.parse_detail_shiti, meta={"source_id": self.first_number})
     yield scrapy.Request(
         url=response.url,
         dont_filter=True,
         callback=self.parse,
         headers={"User-Agent": random.choice(self.User_Agents)})