Example #1
    def parse(self, response):
        # page = response.css('#page-container::attr(data-pagemax)').extract()  # get the page count
        # names = response.css('#rank-view-list > div > ul > li > div.book-mid-info > h4 > a::text').extract()  # test: fetch the novel names
        for i in response.css(
                '#rank-view-list > div > ul > li > div.book-mid-info > h4'):
            item = QidianItem()
            item['name'] = i.css('a::text').extract()[0]
            item['url'] = "http:" + i.css('a::attr(href)').extract()[0]
            yield item

        # Early-test code, kept commented out:
        # for name in names:
        #     item['name'] = name
        #     yield item

        # for pageurl in range(1, int(page[0]) + 1):
        #     tempurl = r"https://www.qidian.com/rank/yuepiao?chn=21&page=" + str(pageurl)
        #     item['pages'] = tempurl
        # print("printing here: " + str(page[0]))  # test-print the data to check it is correct
        # yield item

        next_page = response.css(
            '#page-container::attr(data-pagemax)').extract()
        # print(next_page)  # total page count, used for pagination
        if next_page:  # extract() returns a list, so test for emptiness, not None
            max_page = int(next_page[0]) + 1
            for i in range(1, max_page):
                url = response.urljoin(
                    r"https://www.qidian.com/rank/yuepiao?chn=21&page=" +
                    str(i))  # next page
                # yield scrapy.Request(url, self.parse)  # older style
                yield response.follow(url, callback=self.parse)
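
Every example on this page assumes a QidianItem class from the project's items.py, whose fields differ per project. A minimal sketch covering the fields Example #1 uses (the field names come from the code above; everything else is an assumption):

    import scrapy

    class QidianItem(scrapy.Item):
        # A scrapy.Item only accepts keys that are declared as Fields.
        name = scrapy.Field()   # novel title
        url = scrapy.Field()    # novel detail-page URL
        pages = scrapy.Field()  # only used by the commented-out test code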
Example #2
    def parse_content(self, response):
        title_sum = []
        for bb in response.xpath('//div[@class="main-text-wrap"]'):
            title = bb.xpath(
                '//div[@class="text-head"]/h3[@class="j_chapterName"]/text()'
            ).extract()
            content = bb.xpath(
                '//div[@class="read-content j_readContent"]/p/text()'
            ).extract()
            # print(title)
            # Split the title into a character list to locate the space.
            kong_list = list(''.join(title))
            # print(type(kong_list))
            a = ' '
            if a in kong_list:  # if the list contains a space ...
                kong_ge = list(''.join(title)).index(' ')  # index of the space
                # print(kong_list[1:kong_ge-1])  # the chapter number inside "第N章"
                kong_ge_str = "".join(kong_list[1:kong_ge - 1])  # list -> str
                # print(kong_ge_str)
                title_sum.append(kong_ge_str)

            # else:
            #     print(kong_list)
            # title_sum.append(title)
            # The collected titles come back out of order and still need to be
            # sorted into reading order (the author planned to use bubble sort
            # here); see the sketch after this example.
            # print(title, len(title))

        item = QidianItem()
        item['title'] = title_sum  # sort(reverse=False) = ascending (default); or sorted(n, key=lambda x: CN[x])
        # item['content'] = content
        yield item
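
The sorting step the comments above describe was never written. A minimal sketch, assuming the extracted titles look like "第3章 …" with the chapter number in digits (Chinese-numeral chapters would need the CN lookup table the comment hints at):

    import re

    def sort_chapter_titles(titles):
        # Sort titles such as '第3章 风起' by their chapter number.
        def chapter_no(t):
            m = re.search(r'第(\d+)章', t)
            return int(m.group(1)) if m else 0
        return sorted(titles, key=chapter_no)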
Example #3
 def parse(self, response):
     bot = Selector(response)
     csrfToken = self.get_cookies('_csrfToken', response)
     contents = bot.xpath('//tbody/tr')
     for content in contents:
         item = QidianItem()
         item['book_type'] = content.xpath(
             'td[1]/a[1]/text()').extract_first()
         item['book_sub_type'] = content.xpath(
             'td[1]/a[2]/text()').extract_first()
         item['book_name'] = content.xpath(
             'td[2]/a[1]/text()').extract_first()
         item['book_url'] = 'https:' + content.xpath(
             'td[2]/a[1]/@href').extract_first()
         item['total_words'] = content.xpath(
             'td[4]/span/text()').extract_first()
         item['author'] = content.xpath('td[5]/a/text()').extract_first()
         item['last_upload_date'] = content.xpath(
             'td[6]/text()').extract_first()
         yield scrapy.Request(item['book_url'],
                              meta={
                                  'item': item,
                                  'csrfToken': csrfToken
                              },
                              callback=self.parse_detail)
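
The parse_detail callback is not shown in this example. A minimal sketch of how it would read the item and token back out of response.meta (the detail-page field is an assumption):

    def parse_detail(self, response):
        item = response.meta['item']
        csrf_token = response.meta['csrfToken']  # available for later API calls
        # Fill in whatever detail-page fields the project defines, e.g.:
        item['intro'] = response.xpath(
            '//p[@class="intro"]/text()').extract_first()
        yield item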
Example #4
    def parse(self, response):
        li_list = response.css(".book-img-text li")
        for li in li_list:
            item = QidianItem()
            item["title"] = li.css(".book-mid-info h4 a::text")[0].extract()
            item["url"] = "https:" + li.css(
                ".book-mid-info h4 a::attr(href)")[0].extract()
            item["author"] = li.css(".book-mid-info .author a")[0].xpath(
                "./text()")[0].extract()
            category = ""
            a_list = li.css(".book-mid-info .author a")[1:]
            for a in a_list:
                a_text = a.css("a::text")[0].extract()
                category += a_text
                category += " "
            item["category"] = category.strip()
            item["status"] = li.css(
                ".book-mid-info .author span::text")[0].extract()
            item["bref"] = li.css(
                ".book-mid-info .intro::text")[0].extract().strip()

            yield scrapy.Request(item['url'],
                                 callback=self.book_intro,
                                 meta={"item": item},
                                 headers={
                                     "User-Agent":
                                     "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                                     "Chrome/63.0.3239.132 Safari/537.36"
                                 })
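
A side note on the meta={"item": item} pattern used above: Scrapy 1.7+ prefers cb_kwargs for passing data to callbacks, which keeps meta free for middleware keys. The same request rewritten that way (assuming book_intro accepts the extra argument) would look like:

    yield scrapy.Request(item['url'],
                         callback=self.book_intro,
                         cb_kwargs={"item": item})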
Example #5

    def parse_book(self, response):
        # Walk the book detail page.
        head = response.xpath("//div[@class='book-info ']")  # root node

        title = head.xpath(".//h1/em/text()").get()  # book title
        author = head.xpath(".//h1/span/a/text()").get()  # author
        statu = head.xpath(".//p/span/text()").getall()  # book status
        status = ''
        for status1 in statu:
            status = status + status1 + ' '
        type1 = head.xpath(".//p[@class='tag']/a/text()").getall()  # categories
        type_str = ''
        for ty in type1:
            type_str = type_str + ty + ' '
        brief = head.xpath(".//p[@class='intro']/text()").get()  # short description
        image = response.xpath(
            ".//div[@class='book-img']//img/@src").get()  # cover image
        image = response.urljoin(image)
        contents = response.xpath(
            ".//div[@class='book-intro']/p/text()").getall()  # long description
        contents = list(map(lambda content: content.strip(), contents))
        content = ''
        for content1 in contents:
            content = content + content1 + '\n'
        url = response.url
        item = QidianItem(title=title,
                          author=author,
                          status=status,
                          type=type_str,
                          brief=brief,
                          contents=content,
                          image=image,
                          url=url)
        yield item
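
The three accumulation loops in parse_book can be written with str.join, which avoids repeated string concatenation and matches the original output exactly (each element followed by its separator):

    status = ' '.join(statu) + ' ' if statu else ''
    type_str = ' '.join(type1) + ' ' if type1 else ''
    content = '\n'.join(contents) + '\n' if contents else ''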
Example #6
    def parse_page(self, response):
        if self.num >= 21000:
            return
        selector = Selector(response)

        item = QidianItem()
        item['url'] = response.url

        book_info = selector.xpath('//div[@class="book-info "]')
        item['link'] = book_info.xpath(
            '//a[@class="red-btn J-getJumpUrl "]/@href').extract()[0]
        item['name'] = book_info.xpath('h1/em//text()').extract()[0]

        ast = selector.xpath('//div[@class="book-intro"]/p')[0]
        abcst = ast.xpath('text()').extract()
        abst = ""

        for d in abcst:
            abst += d

        # Chain the replacements so each one applies; separate assignments to
        # item['intro'] would discard all but the last.
        abst = abst.replace('\n', '').replace('\r', '').replace('\t', ' ')
        item['intro'] = abst.strip()

        tmp = book_info.xpath('p')
        book_tags = tmp[0]
        book_tags_hrefs = book_tags.xpath('a/text()')
        item['major_category'] = book_tags_hrefs[0].extract()

        na = item['name']
        mc = item['major_category']
        intr = item['intro']
        li = item['link']

        if li == "" or mc not in self.dic or self.dic[mc] >= self.max_number:
            return

        self.dic[mc] += 1
        self.num += 1

        yield Request("http:" + li, callback=self.parse_content)

        # Append each field to its own plain-text file.
        with open('title2.txt', 'a') as f0, open('tag2.txt', 'a') as f1, \
                open('abs2.txt', 'a') as f2, open('link2.txt', 'a') as f3:
            f0.write(na + '\n')
            f1.write(mc + '\n')
            f2.write(intr + '\n')
            f3.write(li + '\n')

        # print('line82 ' + mc)
        # print('line83 ' + str(self.dic))

        yield item
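
Writing side-effect files from inside a callback works, but the idiomatic home for this kind of output is an item pipeline, which Scrapy opens and closes with the spider. A minimal sketch (class and file names are illustrative):

    class TextFilePipeline:
        def open_spider(self, spider):
            self.f = open('title2.txt', 'a', encoding='utf-8')

        def process_item(self, item, spider):
            self.f.write(item['name'] + '\n')
            return item

        def close_spider(self, spider):
            self.f.close()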
Example #7
 def parse_info(self, response):
     # Builds an lxml tree directly (requires `from lxml import etree`);
     # response.xpath(...) would work just as well here.
     selector = etree.HTML(response.text)
     item = QidianItem()
     item['id'] = response.meta['id']
     item['name'] = selector.xpath('//div[@class="book-info "]/h1/em/text()')[0]
     item['author'] = selector.xpath('//div[@class="book-info "]/h1/span/a/text()')[0]
     item['introduce'] = selector.xpath('//p[@class="intro"]/text()')[0]
     item['yuepiao'] = selector.xpath('//p[@class="num"]/i/text()')[0]
     item['dashang'] = selector.xpath('//i[@class="rewardNum"]/text()')[0]
     yield item
Example #8
 def parse_item(self, response):
     item = QidianItem()
     item['title'] = response.xpath(
         '//*[@id="divBookInfo"]/div[1]/h1/text()').extract()[0].strip()
     item['name'] = response.xpath(
         '//*[@id="divBookInfo"]/div[1]/span/a/span/text()').extract(
         )[0].strip()
     item['week_click'] = response.xpath(
         '//*[@id="contentdiv"]/div/div[1]/table/tr/td[2]/text()').extract(
         )[1].strip()
     return item
Example #9
File: spider.py  Project: yun2ye/qidian
class SpiderSpider(scrapy.Spider):
    name = 'spider'
    allowed_domains = ['qidian.com']
    start_urls = ['http://qidian.com/']
    item = QidianItem()  # NOTE: a single item shared by every request; concurrent callbacks overwrite each other's fields

    # Fetch the target novel's detail page.
    def parse(self, response):
        # Change this selector to pick a different novel to crawl.
        url_list = response.xpath(
            '/html/body/div[1]/div[7]/div[1]/div/ul/li[1]/strong/a/@href'
        ).extract()
        for url in url_list:
            yield scrapy.Request(url='https:' + url,
                                 meta={'item': self.item},
                                 callback=self.parse_one)

    def parse_one(self, response):
        item = response.meta['item']
        item['text'] = []
        item['chapter_name'] = []
        item['name'] = response.xpath(
            '/html/body/div/div[6]/div[1]/div[2]/h1/em/text()').extract_first(
            )
        chapter_list = response.xpath(
            '//*[@id="j-catalogWrap"]/div[2]/div/ul/li/a/@href').extract()
        yield scrapy.Request(url='https:' + chapter_list[0],
                             meta={'item': self.item},
                             callback=self.parse_two)

    def parse_two(self, response):
        item = response.meta['item']
        item['text_list'] = response.xpath(
            '//*[@class="read-content j_readContent"]/p/text()').extract()
        item['chapter_name'].append(
            response.xpath(
                '//*[@class="j_chapterName"]/span/text()').extract_first())
        url = response.xpath('//*[@id="j_chapterNext"]/@href').extract_first()
        nextChapterVip = re.findall(r'g_data.nextChapterVip = (\d);',
                                    response.text)[0]
        nextId = re.findall(r'nextId :(.*?),', response.text)[0]
        item['text'].append(' \n\n'.join(item['text_list']))
        if nextChapterVip == '0' and nextId != '-1':
            yield scrapy.Request(url='https:' + url,
                                 meta={'item': self.item},
                                 callback=self.parse_two)
        else:
            yield item
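
Because self.item is shared class state, parallel requests in this spider mutate the same object. A safer variant, sketched here, creates a fresh item per request chain instead of reusing the class attribute:

    def parse(self, response):
        url_list = response.xpath(
            '/html/body/div[1]/div[7]/div[1]/div/ul/li[1]/strong/a/@href'
        ).extract()
        for url in url_list:
            yield scrapy.Request(url='https:' + url,
                                 meta={'item': QidianItem()},
                                 callback=self.parse_one)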
Example #10
 def parse(self, response):
     # List of novel categories.
     novel_type_list = response.xpath(
         '//dl[@class ="cf"]//dd//span//i/text()').extract()
     # List of category URLs.
     novel_type_url_list = response.xpath(
         '//dl[@class ="cf"]//dd//a/@href').extract()
     # Keep only the first 12 categories (玄幻, 奇幻, 武侠, 仙侠, 都市, 现实,
     # 军事, 历史, 游戏, 体育, 科幻, 悬疑灵异); the remaining entries point at
     # pages with a different structure.
     for novel_type, novel_type_url in zip(novel_type_list[0:-2],
                                           novel_type_url_list[0:-2]):
         item = QidianItem()
         item['novel_type'] = novel_type
         # URL of the category listing page.
         URL = "https://www.qidian.com" + novel_type_url
         yield scrapy.Request(URL,
                              callback=self.parse_kind_parse,
                              meta={'item': item})
Example #11
 def parse(self, response):
     items = response.css('.all-book-list li')
     for item in items:
         # Create a fresh item per row; reusing one instance across the
         # loop makes every yielded item share the same underlying dict.
         data = QidianItem()
         data['name'] = item.css('.book-mid-info a::text').extract_first()
         data['author'] = item.css('.author a.name::text').extract_first()
         data['img'] = item.css(
             '.book-img-box a img::attr(src)').extract_first()
         data['url'] = item.css(
             '.book-img-box a::attr(href)').extract_first()
         data['state'] = item.css('.author span::text').extract_first()
         data['type'] = item.css(
             '.author a.go-sub-type::text').extract_first()
         data['intro'] = item.css('.intro::text').extract_first().strip()
         data['auturl'] = item.css(
             '.author a.name::attr(href)').extract_first()
         yield data
Example #12
 def parse_content(self, response):
     for bb in response.xpath('//div[@class="main-text-wrap"]'):
         title = bb.xpath(
             '//div[@class="text-head"]/h3[@class="j_chapterName"]/text()'
         ).extract()
         content = bb.xpath(
             '//div[@class="read-content j_readContent"]/p/text()'
         ).extract()
         # print(title)
         # Split the title into a character list to locate the space.
         kong_list = list(''.join(title))
         # print(type(kong_list))
         item = QidianItem()
         item['title'] = title  # sort(reverse=False) = ascending (default); or sorted(n, key=lambda x: CN[x])
         item['content'] = content
         yield item
Example #13
    def parse(self, response):
        """  This function parses a property page.

        :param response: 请求
        :return:  人会
        @url http://vip.book.sina.com.cn/weibobook/cate.php?cate_id=1036&w=0&s=0&order=1&vt=4&page=3
        @returns items 1
        @scrapes book_id src title img_url state author chan_name synoptic platform platform_src

        """

        l = ItemLoader(item=QidianItem(), response=response)
        l.add_xpath(
            'book_id',
            '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_name"]/a/@href',
            re='[0-9]+')
        l.add_xpath(
            'src',
            '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_name"]/a/@href'
        )
        l.add_xpath(
            'title',
            '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_name"]/a/text()'
        )
        l.add_xpath(
            'img_url',
            '//div[@class="book_list"]/ul//li/div[@class="img_box"]/a/img/@src'
        )
        l.add_xpath(
            'state',
            '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_author"]/text()',
            re='(?<=【).*?(?=】)')
        l.add_xpath(
            'author',
            '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_author"]/span/text()'
        )
        l.add_xpath('chan_name', '//div[@class="all-fr-title"]/text()')
        l.add_xpath(
            'synoptic',
            '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="info"]/a/text()'
        )
        l.add_value('platform', '新浪读书')
        l.add_value('platform_src', 'http://vip.book.sina.com.cn')
        return l.load_item()
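
The @url, @returns, and @scrapes lines in the docstring are Scrapy spider contracts: running `scrapy check <spider_name>` fetches the @url page, asserts that parse() yields at least one item, and checks that all the listed fields are populated on it.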
Example #14
    def get_third(self, response):
        # The list of <p> nodes making up one chapter.
        Third = response.xpath("//div[@class='read-content j_readContent']//p")
        # print(len(Third))
        # print(Third)

        # Assemble the chapter text, one paragraph per line.
        story = ''
        for i in Third:
            story += (i.xpath('./text()').extract()[0]).strip() + '\n'
        print(story)

        item = QidianItem()
        item['story'] = story
        # return item
        yield item
Example #15
    def parse_item(self, response):
        item = QidianItem()

        item['book_name'] = response.xpath(
            '//div[@class="book-info "]/h1/em/text()').extract()[0]
        item['author'] = response.xpath(
            '//div[@class="book-info "]/h1/span/a/text()').extract()[0]
        item['status'] = response.xpath(
            '//span[@class="blue"]/text()').extract()[0]
        classname = response.xpath(
            '//div[@class="book-info "]/p[3]/em[1]/span/@class').extract()[0]
        # Qidian obfuscates the word count with a custom WOFF font; the
        # span's class name doubles as the font file's name.
        url = "https://qidian.gtimg.com/qd_anti_spider/{}.woff".format(
            classname)
        resp = response.body.decode('utf-8')

        pattern = re.compile(
            r'</style><span class="\w+">(.*?);</span></em><cite>')
        word_list = pattern.search(resp).group(1).split(';')
        word_count = parse_font(url, word_list)
        item['word_count'] = word_count + '万字'

        item['ticket'] = response.xpath(
            '//*[@id="monthCount"]/text()').extract_first()
        yield item
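
The parse_font helper is not shown. A plausible sketch, assuming the usual Qidian scheme where word_list holds entity codes like '&#100305' and the font's cmap maps each code point to a glyph named after a digit (an illustration, not the original implementation):

    import io
    import re
    import requests
    from fontTools.ttLib import TTFont

    GLYPH_TO_CHAR = {
        'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
        'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
        'period': '.',
    }

    def parse_font(url, word_list):
        font = TTFont(io.BytesIO(requests.get(url).content))
        cmap = font.getBestCmap()  # code point -> glyph name
        chars = []
        for word in word_list:
            code = int(re.sub(r'\D', '', word))  # strip the '&#' wrapper
            chars.append(GLYPH_TO_CHAR[cmap[code]])
        return ''.join(chars)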
Example #16
    def parse_page(self, response):
        selector = Selector(response)

        item = QidianItem()
        item['url'] = response.url

        #book image
        book_img = selector.xpath('//a[@id="bookImg"]/img')
        item['image'] = 'http:' + book_img[0].xpath(
            '@src').extract()[0].strip()

        #book info
        book_info = selector.xpath('//div[@class="book-info "]')

        item['name'] = book_info.xpath('h1/em//text()').extract()[0]
        item['author'] = book_info.xpath('h1/span/a/text()').extract()[0]

        item['intro'] = selector.xpath(
            '//div[@class="book-intro"]/p/text()').extract()[0]
        item['intro'] = item['intro'].replace('\n', '')
        item['intro'] = item['intro'].replace('\r', '')
        item['intro'] = item['intro'].replace('\t', ' ')
        item['intro'] = item['intro'].strip()

        tmp = book_info.xpath('p')
        book_tags = tmp[0]
        book_statistics = tmp[2].xpath('em/text()')
        book_statistics_desc = tmp[2].xpath('cite//text()')
        assert len(book_statistics_desc) == 7

        #book status
        tag_spans_texts = book_tags.xpath('span/text()')
        item['sign_status'] = u'未签'
        for book_tags_span in tag_spans_texts:
            status = book_tags_span.extract()
            if status == u'连载' or status == u'完本':
                item['progress'] = status
            elif status == u'签约':
                item['sign_status'] = status
            elif status == u'VIP' or status == u'免费':
                item['pay_status'] = status

        #book category
        book_tags_hrefs = book_tags.xpath('a/text()')
        item['major_category'] = book_tags_hrefs[0].extract()
        item['minor_category'] = book_tags_hrefs[1].extract()

        #book statistics
        text_count_desc = book_statistics_desc[0].extract()
        click_count_desc = book_statistics_desc[1].extract()
        weekly_click_count_desc = book_statistics_desc[3].extract()
        recommend_count_desc = book_statistics_desc[4].extract()
        weekly_recommend_count_desc = book_statistics_desc[6].extract()

        total_text_count = float(book_statistics[0].extract())
        if text_count_desc[0] == u'万':
            total_text_count = total_text_count * 10000.0
        total_text_count = int(total_text_count)

        total_click_count = float(book_statistics[1].extract())
        if click_count_desc[0] == u'万':
            total_click_count = total_click_count * 10000.0
        total_click_count = int(total_click_count)

        # Parse e.g. "会员周点击3.25万" (weekly VIP clicks; 万 = x10,000).
        beg_pos = weekly_click_count_desc.find(u'会员周点击') + len(u'会员周点击')
        end_pos = weekly_click_count_desc.find(u'万', beg_pos)
        adjust_end_pos = end_pos
        if adjust_end_pos < 0:
            adjust_end_pos = len(weekly_click_count_desc)
        vip_weekly_click_count = float(
            weekly_click_count_desc[beg_pos:adjust_end_pos])
        if end_pos > 0:
            vip_weekly_click_count = vip_weekly_click_count * 10000.0
        vip_weekly_click_count = int(vip_weekly_click_count)

        toal_recommend_count = float(book_statistics[2].extract())
        if recommend_count_desc[0] == u'万':
            toal_recommend_count = toal_recommend_count * 10000.0
        toal_recommend_count = int(toal_recommend_count)

        # Parse e.g. "周13.52万" (weekly recommendations; 万 = x10,000).
        beg_pos = weekly_recommend_count_desc.find(u'周') + len(u'周')
        end_pos = weekly_recommend_count_desc.find(u'万', beg_pos)
        adjust_end_pos = end_pos
        if adjust_end_pos < 0:
            adjust_end_pos = len(weekly_recommend_count_desc)
        weekly_recommend_count = float(
            weekly_recommend_count_desc[beg_pos:adjust_end_pos])
        if end_pos > 0:
            weekly_recommend_count = weekly_recommend_count * 10000.0
        weekly_recommend_count = int(weekly_recommend_count)

        item['total_text_count'] = total_text_count
        item['total_click_count'] = total_click_count
        item['vip_weekly_click_count'] = vip_weekly_click_count
        item['toal_recommend_count'] = toal_recommend_count
        item['weekly_recommend_count'] = weekly_recommend_count

        try:
            item['monthly_pass_count'] = selector.xpath(
                '//i[@id="monthCount"]/text()').extract()[0]
        except IndexError:  # element absent for some books
            item['monthly_pass_count'] = 0

        item['weekly_reward_count'] = selector.xpath(
            '//i[@id="rewardNum"]/text()').extract()[0]

        #from ajax
        '''
        self.browser.get(response.url)        
        page = self.browser.page_source
        selector = Selector(text=page)
        
        score_lhs = selector.xpath('//cite[@id="score1"]/text()').extract()[0]
        score_rhs = selector.xpath('//i[@id="score2"]/text()').extract()[0]
        item['score'] = score_lhs + '.' + score_rhs
        item['evaluate_users'] = selector.xpath('//p[@id="j_userCount"]/span/text()').extract()[0]            
        '''

        yield item
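
The four nearly identical 万-scaling blocks above could be collapsed into one helper. A minimal sketch (the name is illustrative):

    def scale_by_unit(value_text, unit_text):
        # A leading 万 in the unit string means the value is in tens of thousands.
        value = float(value_text)
        if unit_text and unit_text[0] == u'万':
            value *= 10000.0
        return int(value)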
Example #17
 def catalog_item(self, response):
     l = ItemLoader(item=QidianItem(), response=response)
     l.add_xpath('title', '//div[@class="new_charpet"]/a/text()')
     l.add_xpath('src', '//div[@class="new_charpet"]/a/@href')
     return l.load_item()
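
By default add_xpath collects every match into a list. If scalar fields are wanted, the usual fix is an output processor; a minimal sketch (the import path varies by Scrapy version):

    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst

    class QidianLoader(ItemLoader):
        default_output_processor = TakeFirst()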