Пример #1
0
    def get_text(self, response):
        '''
        Build a BiqugeItem for a single chapter page.

        Extracts the book name, the chapter title and the chapter body from
        ``response``, and derives an ``order_id`` from the title so that
        chapters can be sorted.
        '''
        chapter = BiqugeItem()

        # Book name: text of the second link inside the "con_top" div.
        book_nodes = response.xpath('.//div[@class="con_top"]/a[2]/text()')
        chapter['bookname'] = book_nodes.extract()[0]

        # Chapter title; kept in a local because the sort key is derived
        # from it below.
        heading = response.xpath('.//h1/text()').extract()[0]
        chapter['title'] = heading

        # Sort key: whatever get_tit_num() pulls out of the title, passed
        # through Cn2An() (presumably a Chinese-numeral-to-number
        # conversion -- confirm against those helpers).
        chapter['order_id'] = Cn2An(get_tit_num(heading))

        # Body: glue the direct text nodes of the content div together,
        # trim the ends and drop ideographic spaces (U+3000).
        fragments = response.xpath('.//div[@id="content"]/text()').extract()
        joined = ''.join(fragments)
        chapter['body'] = joined.strip().replace('\u3000', '')

        return chapter
Пример #2
0
    def parse_book_index(self, response):
        """Parse a book index page.

        Yields one request per chapter link found under ``div#list``; each
        request carries a ``PassingItem`` holding the chapter's position in
        ``meta['item']`` and is handled by ``self.parse_book_content``.
        Finally yields the ``BiqugeItem`` describing the book itself.
        """
        item = BiqugeItem()
        # The second-to-last segment of the request URL is stored as the
        # book identifier.
        item['id_name'] = response.request.url.split('/')[-2]
        item['name'] = response.css('div#info h1::text')[0].extract()
        item['author'] = response.css('div#info p::text')[0].extract()
        item['brief'] = ''.join(response.css('div#intro::text').extract())
        item['update_chapter'] = response.css(
            'div#info p a::text')[-1].extract()

        book_chapter_url_list = response.css(
            'div#list dd a::attr(href)').extract()
        for index, book_chapter_url in enumerate(book_chapter_url_list):
            if book_chapter_url.startswith('//'):
                # re.sub() returns a new string; the result has to be
                # rebound, otherwise the doubled slash stays in the URL.
                book_chapter_url = re.sub(r'//', '/', book_chapter_url)
            id_item = PassingItem()
            id_item['chapter_id'] = index
            request = scrapy.http.Request(
                url=''.join(['http://www.qu.la', book_chapter_url]),
                headers=headers,
                callback=self.parse_book_content)
            request.meta['item'] = id_item
            yield request
        # This function is a generator, so a `return item` would be
        # discarded by the caller; the book item must be yielded.
        yield item
Пример #3
0
    def get_content(self, response):
        '''
        Extract the text of one chapter and yield it as a BiqugeItem.

        The category levels, author and chapter label are copied from
        ``response.meta``; the title and body are read from the page.

        :param response: response for a chapter page
        :return: generator yielding a single BiqugeItem
        '''
        heading = response.xpath(
            '//div[@class="bookname"]/h1/text()')[0].extract()

        text_nodes = response.xpath('//div[@id="content"]//text()')
        print(heading)

        def _clean(node):
            # Drop ideographic spaces, CRLF pairs and tabs from one text node.
            raw = node.extract()
            return raw.replace("\u3000", '').replace('\r\n', '').replace(
                '\t', '')

        # Every text node contributes one line to the body.
        body = "".join(_clean(node) + '\n' for node in text_nodes)

        chapter = BiqugeItem()

        # level1 / level2: category labels, author, chapter: third-level label.
        for key in ('level1', 'level2', 'author', 'chapter'):
            chapter[key] = response.meta[key]

        chapter["title"] = heading
        chapter["content"] = body

        yield chapter
Пример #4
0
 def parse_content(self, response):
     """Yield a BiqugeItem holding the novel name, page title and body text.

     The novel name comes from ``response.meta['xiaoshuoming']``; the
     chapter name is the page's ``<title>`` text; the body is the
     concatenation of the direct text nodes of the element with
     ``id="content"``.
     """
     chapter = BiqugeItem()
     fragments = response.xpath('//*[@id="content"]/text()').extract()
     chapter['xiaoshuoming'] = response.meta['xiaoshuoming'].strip()
     page_title = response.xpath('/html/head/title/text()').extract()
     chapter['zhangjieming'] = page_title[0]
     # Concatenate the text nodes without any separator.
     chapter['conter'] = ''.join(fragments)
     yield chapter
Пример #5
0
 def parse_item(self, response):
     """Yield a BiqugeItem describing the book shown on ``response``.

     Each field other than ``detail_url`` is the first match of its XPath
     query, or ``None`` when the query matches nothing.
     """
     book = BiqugeItem()
     book['detail_url'] = response.url

     # Item field -> XPath query whose first result fills that field.
     field_queries = (
         ('name', "//h1/text()"),
         ('cover_img', "//div[@id='fmimg']/img/@src"),
         ('author', "//div[@id='info']/p[1]/a/text()"),
         ('introduce', "//div[@id='intro']/p/text()"),
     )
     for field, query in field_queries:
         book[field] = response.xpath(query).extract_first()

     yield book
Пример #6
0
	def get_text(self, response):
		"""Return a BiqugeItem with the book name, chapter name and body.

		The body is the joined direct text nodes of the ``content`` div,
		trimmed at both ends and with ideographic spaces (U+3000) removed.
		"""
		chapter = BiqugeItem()

		book_nodes = response.xpath('//div[@class="con_top"]/a[2]/text()')
		chapter['bookname'] = book_nodes.extract()[0]

		heading_nodes = response.xpath('//h1/text()')
		chapter['chapter_name'] = heading_nodes.extract()[0]

		fragments = response.xpath('//div[@id="content"]/text()').extract()
		joined = "".join(fragments)
		chapter['body'] = joined.strip().replace('\u3000', '')

		return chapter
Пример #7
0
 def parse_chapter(self, response):
     """Yield a BiqugeItem for the chapter page in ``response``.

     ``book_name`` and ``chapter_name`` are the first XPath match (or
     ``None``); ``content`` is the string value of the ``content`` div;
     ``chapter_url`` is the URL of the response.
     """
     chapter = BiqugeItem()
     chapter['book_name'] = response.xpath(
         '//div[@class="con_top"]/a[3]/text()').extract_first()
     chapter['chapter_name'] = response.xpath(
         '//div[@class="bookname"]/h1/text()').extract_first()
     # string(...) collapses the whole div into one text value.
     chapter['content'] = response.xpath(
         'string(//div[@id="content"])').extract_first()
     chapter['chapter_url'] = response.url
     yield chapter
Пример #8
0
 def parse_detail(self, response):
     """Yield a BiqugeItem for one chapter detail page.

     ``index`` and ``bookname`` are copied from the mapping passed in
     ``response.meta['item']``; ``title`` and ``content`` are read from
     the page.
     """
     passed = response.meta['item']
     print(passed)

     chapter = BiqugeItem()
     chapter['title'] = response.xpath(
         'string(//div[@class="bookname"]/h1)').get()

     raw_text = response.xpath('string(//*[@id="booktext"])').get()
     # A pair of ideographic spaces is turned into a line break.
     chapter['content'] = raw_text.replace('\u3000\u3000', '\n')

     chapter['index'] = passed['index']
     chapter['bookname'] = passed['bookname']
     yield chapter
Пример #9
0
 def parse(self, response):
     """Yield one request per chapter link listed on the book index page.

     Each request targets the chapter's absolute URL, is handled by
     ``self.cather`` and carries a BiqugeItem (1-based order, book name,
     chapter name, chapter URL) in ``meta['item']``.
     """
     book_title = response.xpath('//div[@id="info"]/h1/text()').get()
     titles = response.xpath('//div[@id="list"]/dl/dd/a/text()').getall()
     hrefs = response.xpath('//div[@id="list"]/dl/dd/a/@href').getall()

     # Titles and hrefs are paired positionally; numbering starts at 1.
     for position, (title, href) in enumerate(zip(titles, hrefs), start=1):
         chapter_url = r'https://www.biquge.com.cn{}'.format(href)

         entry = BiqugeItem()
         entry['order'] = position
         entry['book_name'] = book_title
         entry['capter_name'] = title
         entry['detail_page'] = chapter_url

         yield scrapy.Request(chapter_url,
                              callback=self.cather,
                              meta={'item': entry})
Пример #10
0
 def parse_content(self, response):
     """Yield a BiqugeItem with the book title, chapter name and body text.

     ``title`` is the text of the last link in the ``con_top`` div and
     ``chapter`` the ``<h1>`` inside the ``bookname`` div (each ``None``
     when absent). ``content`` is the joined direct text nodes of the
     ``content`` div with every whitespace character removed.
     """
     item = BiqugeItem()
     item['title'] = response.xpath(
         '//div[@class="con_top"]/a[last()]/text()').extract_first()
     item['chapter'] = response.xpath(
         '//div[@class="bookname"]/h1/text()').extract_first()

     raw_text = ''.join(
         response.xpath('//div[@id="content"]/text()').extract())
     item['content'] = re.sub(r'\s', '', raw_text)
     # NOTE(review): the previous code followed this with
     # `item['content'] = response.url`, which threw the scraped text
     # away. If the page URL should be stored too, it needs its own
     # field on BiqugeItem -- confirm the field name before adding it.
     yield item
Пример #11
0
    def get_information_and_chapter(self, response):
        """Yield the novel's metadata item, then requests for its chapters.

        The metadata (description, name, author) is read from the page's
        ``og:*`` meta tags; the novel URL comes from
        ``response.meta['novel_a']``. A chapter request, handled by
        ``self.get_chapter_content``, is yielded for every chapter link
        whose name ``Sql.select_chapter_name`` does not report as ``1``.
        """
        item = BiqugeItem()
        description = ''.join(
            response.xpath(
                '//meta[@property="og:description"]/@content').extract())
        item['content'] = description.replace(' ', '').replace('\n', '')

        # Keep the link to the novel.
        novel_url = response.meta['novel_a']
        item['url'] = novel_url

        # Novel name.
        item['name'] = ''.join(
            response.xpath(
                '//meta[@property="og:novel:book_name"]/@content').extract())

        # Novel author.
        item['author'] = ''.join(
            response.xpath(
                '//meta[@property="og:novel:author"]/@content').extract())

        # The novel id is every digit found in the URL, concatenated.
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        novel_id = ''.join(re.findall(r'\d', novel_url))
        item['novel_id'] = novel_id
        yield item

        # Each match is an (href, chapter name) pair.
        chapter_links = re.findall('<dd><a href="(.*?)">(.*?)</a>',
                                   response.text)
        # `num` is the 1-based position of the link on the page; it keeps
        # counting across chapters that are skipped.
        for num, (href, chapter_name) in enumerate(chapter_links, start=1):
            chapter_url = self.base_url + '/book/' + novel_id + '/' + href
            if Sql.select_chapter_name(chapter_name) == 1:
                print('章节已经存在')
                continue
            yield Request(chapter_url,
                          self.get_chapter_content,
                          meta={
                              'num': num,
                              'chapter_url': chapter_url,
                              'chapter_name': chapter_name,
                              'novel_id': novel_id
                          })
Пример #12
0
 def get_content(self, response):
     """Return a BiqugeItem for the chapter page in ``response``.

     ``book_id`` and ``zhang_id`` are taken from the 4th and 5th
     ``/``-separated pieces of the response URL (the latter without its
     file extension). ``content`` concatenates the first run of
     non-whitespace characters of every direct text node of the element
     with ``id="content"``.
     """
     item = BiqugeItem()
     resp_url = response.url
     url_parts = resp_url.split('/')  # split once, reuse for both ids
     item['url'] = resp_url
     item['book_id'] = url_parts[3]
     item['zhang_id'] = url_parts[4].split('.')[0]
     item['book_name'] = response.xpath(
         '//div[@class="bookname"]/div/a[3]/text()').extract_first()
     # Characters 3..6 of the third text node of the "con_top" div.
     # NOTE(review): raises TypeError when that node is missing
     # (extract_first() returns None) -- confirm it is always present.
     item['book_cl'] = response.xpath(
         '//div[@class="con_top"]/text()[3]').extract_first()[3:7]
     item['title'] = response.xpath(
         '//div[@class="bookname"]/h1/text()').extract_first()

     pieces = []
     for node in response.xpath('//*[@id="content"]/text()'):
         # Raw string: '\S' in a normal literal is an invalid escape.
         # The regex is evaluated once per node instead of twice.
         tokens = node.re(r'\S+')
         if tokens:
             # NOTE(review): only the first token of each node is kept,
             # so text after an inner space is dropped -- confirm intent.
             pieces.append(tokens[0])
     item['content'] = ''.join(pieces)
     return item
Пример #13
0
 def parse_chapter(self, response):
     """Yield one request per ``<dd>`` chapter entry on a book index page.

     Every request targets the chapter's absolute URL, is handled by
     ``self.parse_content`` and carries a BiqugeItem with the book and
     chapter details in ``meta['item']``. ``chapter_num`` is 1-based.
     """
     book_id = response.meta["book_id"]
     chapters = response.xpath("//dd")
     if not chapters:
         # Nothing to emit; also skips the book-level lookups below, as
         # happened implicitly when they lived inside the loop.
         return

     # Book-level values are identical for every chapter, so they are
     # looked up once rather than once per chapter.
     book_name = response.xpath("//*[@id='info']/h1/text()").extract()[0]
     book_type = response.xpath(
         "//*[@class='con_top']/a[2]/text()").extract()[0]
     author = response.xpath("//*[@id='info']/p[1]/text()").extract()[0]
     # Drop the first 7 characters of the author line (presumably a
     # fixed label preceding the name -- confirm against the page).
     book_author = author[7:]
     book_url = response.url

     for chapter_num, chapter in enumerate(chapters, start=1):
         item = BiqugeItem()
         item["book_id"] = book_id
         item["book_name"] = book_name
         item["book_type"] = book_type
         item["book_author"] = book_author
         item["book_url"] = book_url
         item["chapter_name"] = chapter.xpath("./a/text()").extract()[0]
         item["chapter_num"] = chapter_num
         chapter_href = chapter.xpath("./a/@href").extract()[0]
         item["chapter_url"] = "http://www.xbiquge.la%s" % chapter_href
         request = scrapy.Request(url=item["chapter_url"],
                                  callback=self.parse_content)
         request.meta["item"] = item
         yield request
Пример #14
0
    def parse_item(self, response):
        """Yield a BiqugeItem (book name, author, URL) per book on a list page.

        Every ``a.blue`` link inside a ``cover`` div is treated as one book.
        Its name, href and the text of the first ``<a>`` sibling following
        it (the author) are read relative to that link, so a book with a
        missing author cannot shift the authors of the books after it.
        ``author`` is ``None`` when there is no such sibling text; links
        without text or href are skipped.
        """
        book_links = response.xpath("//div[@class='cover']//a[@class='blue']")

        for link in book_links:
            bookname = link.xpath(".//text()").get()
            href = link.xpath("./@href").get()
            if bookname is None or href is None:
                continue
            author = link.xpath("./following-sibling::a[1]//text()").get()

            # Handed to the item pipeline for storage.
            yield BiqugeItem(bookname=bookname,
                             author=author,
                             book_url='http://m.paoshu8.com' + href)
Пример #15
0
    def parse_novel_link(self, response):
        """Yield one request per chapter link found on a novel's index page.

        A single BiqugeItem describing the novel (link, id, name, author,
        introduction) is built once and attached to every request as
        ``meta['book_info']``; the requests are handled by
        ``self.parse_chapter_link``.
        """
        site_root = 'http://www.qu.la'
        page_url = response.url

        novel = BiqugeItem()
        novel['novel_link'] = page_url

        # Id: the URL with the book prefix and all remaining slashes removed.
        without_prefix = page_url.replace('http://www.qu.la/book/', '')
        novel['novel_id'] = without_prefix.replace('/', '')

        # NOTE(review): stored as the full list of matches, unlike the
        # other fields which take a single string -- confirm intended.
        novel['novel_name'] = response.xpath(
            '//*[@id="info"]/h1/text()').extract()

        author_line = response.xpath(
            '//*[@id="info"]/p[1]/text()').extract()[0]
        novel['author'] = author_line.replace('作\xa0\xa0者:', '')

        # First intro text node with spaces, CRs and LFs removed.
        intro = response.xpath('//*[@id="intro"]/text()').extract()[0]
        for unwanted in (' ', '\r', '\n'):
            intro = intro.replace(unwanted, '')
        novel['introduce'] = intro

        hrefs = response.xpath('//*[@id="list"]/dl/dd/a/@href').extract()
        for href in hrefs:
            yield Request(site_root + href,
                          callback=self.parse_chapter_link,
                          meta={'book_info': novel})