def get_text(self, response):
    '''
    Extract the title and body text of each chapter,
    and derive an order_id field used for sorting rows in the table.
    '''
    item = BiqugeItem()
    # Novel name
    item['bookname'] = response.xpath(
        './/div[@class="con_top"]/a[2]/text()').extract()[0]
    # Chapter title; kept in a local variable so the chapter number can be pulled out of it
    title = response.xpath('.//h1/text()').extract()[0]
    item['title'] = title
    # Convert the Chinese numeral in the title into the id used for ordering
    item['order_id'] = Cn2An(get_tit_num(title))
    # The body text needs special handling
    body = response.xpath('.//div[@id="content"]/text()').extract()
    # Join the extracted fragments into one string, then strip layout
    # characters such as the full-width space \u3000
    text = ''.join(body).strip().replace('\u3000', '')
    item['body'] = text
    return item
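# Cn2An and get_tit_num are called above but not defined in this section.
# A minimal sketch of what they might look like, assuming get_tit_num pulls
# the Chinese-numeral chapter number out of a title such as "第一百二十三章 ..."
# and Cn2An converts that numeral string to an int. Both the title format and
# the implementations are assumptions, not the author's confirmed code.
import re

CN_DIGITS = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
             '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
CN_UNITS = {'十': 10, '百': 100, '千': 1000}

def get_tit_num(title):
    # Grab the numeral between "第" and "章", e.g. "一百二十三"
    match = re.search(r'第([零一二两三四五六七八九十百千]+)章', title)
    return match.group(1) if match else '零'

def Cn2An(cn):
    # Left-to-right accumulation: digit characters set the pending value,
    # unit characters multiply it and fold it into the running total
    total, pending = 0, 0
    for ch in cn:
        if ch in CN_DIGITS:
            pending = CN_DIGITS[ch]
        elif ch in CN_UNITS:
            total += (pending or 1) * CN_UNITS[ch]
            pending = 0
    return total + pending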
def parse_book_index(self, response):
    item = BiqugeItem()
    item['id_name'] = response.request.url.split('/')[-2]
    item['name'] = response.css('div#info h1::text')[0].extract()
    item['author'] = response.css('div#info p::text')[0].extract()
    item['brief'] = ''.join(response.css('div#intro::text').extract())
    item['update_chapter'] = response.css(
        'div#info p a::text')[-1].extract()
    book_chapter_url_list = response.css(
        'div#list dd a::attr(href)').extract()
    for index, book_chapter_url in enumerate(book_chapter_url_list):
        # Collapse a leading double slash so the join below yields a valid URL
        # (the original called re.sub without assigning its result, a no-op)
        if book_chapter_url.startswith('//'):
            book_chapter_url = re.sub(r'//', '/', book_chapter_url, count=1)
        id_item = PassingItem()
        id_item['chapter_id'] = index
        request = scrapy.http.Request(
            url=''.join(['http://www.qu.la', book_chapter_url]),
            headers=headers,
            callback=self.parse_book_content)
        request.meta['item'] = id_item
        yield request
    # This callback is a generator, so the item must be yielded;
    # a bare `return item` here would be silently discarded by Scrapy
    yield item
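# The manual '//' fix-up and string join above are fragile. Scrapy's
# response.urljoin resolves relative and protocol-relative hrefs against the
# page's own URL, so the chapter loop could be written as below; a sketch
# assuming the same headers, PassingItem and parse_book_content callback.
def parse_book_index_urljoin(self, response):
    for index, href in enumerate(
            response.css('div#list dd a::attr(href)').extract()):
        id_item = PassingItem()
        id_item['chapter_id'] = index
        yield scrapy.http.Request(
            url=response.urljoin(href),  # handles '/x', '//host/x' and 'x.html'
            headers=headers,
            callback=self.parse_book_content,
            meta={'item': id_item})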
def get_content(self, response):
    '''
    Scrape the body text of a chapter.
    :param response:
    :return:
    '''
    title = response.xpath(
        '//div[@class="bookname"]/h1/text()')[0].extract()
    content_list = response.xpath('//div[@id="content"]//text()')
    content = ''
    for i in content_list:
        # Strip full-width spaces, line breaks and tabs from each fragment
        content += i.extract().replace('\u3000', '').replace(
            '\r\n', '').replace('\t', '') + '\n'
    item = BiqugeItem()
    item['level1'] = response.meta['level1']    # first-level category
    item['level2'] = response.meta['level2']    # second-level category
    item['author'] = response.meta['author']    # author
    item['chapter'] = response.meta['chapter']  # third-level category: the chapter
    item['title'] = title                       # chapter title
    item['content'] = content                   # body text
    yield item
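# get_content reads level1/level2/author/chapter out of response.meta, so an
# upstream callback must have attached them when scheduling the request. A
# sketch of what that request chain might look like; the XPaths and the
# method name parse_chapter_list are assumptions:
def parse_chapter_list(self, response):
    for dd in response.xpath('//div[@id="list"]/dl/dd'):
        yield scrapy.Request(
            response.urljoin(dd.xpath('./a/@href').extract_first()),
            callback=self.get_content,
            meta={
                'level1': response.meta['level1'],  # passed through unchanged
                'level2': response.meta['level2'],
                'author': response.meta['author'],
                'chapter': dd.xpath('./a/text()').extract_first(),
            })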
def parse_content(self, response):
    item = BiqugeItem()
    contents = response.xpath('//*[@id="content"]/text()').extract()
    item['xiaoshuoming'] = response.meta['xiaoshuoming'].strip()  # novel name
    item['zhangjieming'] = response.xpath(
        '/html/head/title/text()').extract()[0]  # chapter name, taken from the page <title>
    item['conter'] = ''.join(contents)  # chapter body
    yield item
def parse_item(self, response):
    item = BiqugeItem()
    item['detail_url'] = response.url
    item['name'] = response.xpath("//h1/text()").extract_first()
    item['cover_img'] = response.xpath(
        "//div[@id='fmimg']/img/@src").extract_first()
    item['author'] = response.xpath(
        "//div[@id='info']/p[1]/a/text()").extract_first()
    item['introduce'] = response.xpath(
        "//div[@id='intro']/p/text()").extract_first()
    yield item
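# Every callback in this section fills a BiqugeItem, but its definition is
# never shown. A minimal sketch covering the fields used directly above;
# the real project evidently declares a different field set per spider, so
# treat this as an illustration rather than the canonical class.
import scrapy

class BiqugeItem(scrapy.Item):
    detail_url = scrapy.Field()
    name = scrapy.Field()
    cover_img = scrapy.Field()
    author = scrapy.Field()
    introduce = scrapy.Field()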
def get_text(self, response):
    item = BiqugeItem()
    item['bookname'] = response.xpath(
        '//div[@class="con_top"]/a[2]/text()').extract()[0]
    item['chapter_name'] = response.xpath('//h1/text()').extract()[0]
    novel_text = response.xpath('//div[@id="content"]/text()').extract()
    # Join the fragments and strip full-width spaces
    text = ''.join(novel_text).strip().replace('\u3000', '')
    item['body'] = text
    return item
def parse_chapter(self, response):
    # Extract the chapter data
    item = BiqugeItem()
    book_name = response.xpath(
        '//div[@class="con_top"]/a[3]/text()').extract_first()
    chapter_name = response.xpath(
        '//div[@class="bookname"]/h1/text()').extract_first()
    # string(...) flattens every text node inside the content div
    content = response.xpath('string(//div[@id="content"])').extract_first()
    chapter_url = response.url
    item['book_name'] = book_name
    item['chapter_name'] = chapter_name
    item['content'] = content
    item['chapter_url'] = chapter_url
    yield item
def parse_detail(self, response):
    item = BiqugeItem()
    item['title'] = response.xpath(
        'string(//div[@class="bookname"]/h1)').get()
    # Turn the full-width double-space paragraph indents into line breaks
    item['content'] = response.xpath(
        'string(//*[@id="booktext"])').get().replace('\u3000\u3000', '\n')
    item['index'] = response.meta['item']['index']
    item['bookname'] = response.meta['item']['bookname']
    yield item
def parse(self, response):
    book_name = response.xpath('//div[@id="info"]/h1/text()').get()
    capters = response.xpath('//div[@id="list"]/dl/dd/a/text()').getall()
    detail_pages = response.xpath('//div[@id="list"]/dl/dd/a/@href').getall()
    for order, (capter, detail_page) in enumerate(
            zip(capters, detail_pages), start=1):
        item = BiqugeItem()
        item['order'] = order
        item['book_name'] = book_name
        item['capter_name'] = capter
        detail_page = 'https://www.biquge.com.cn{}'.format(detail_page)
        item['detail_page'] = detail_page
        yield scrapy.Request(detail_page,
                             callback=self.cather,
                             meta={'item': item})
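# The cather callback scheduled above is not shown in this section. A
# plausible sketch, assuming the detail page carries the chapter body in a
# div#content node as on the other Biquge mirrors handled here; the
# 'content' field name is an assumption.
def cather(self, response):
    item = response.meta['item']
    # string(...) flattens all text nodes inside the content div
    item['content'] = response.xpath('string(//div[@id="content"])').get()
    yield item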
def parse_content(self, response):
    item = BiqugeItem()
    title = response.xpath(
        '//div[@class="con_top"]/a[last()]/text()').extract_first()
    chapter = response.xpath(
        '//div[@class="bookname"]/h1/text()').extract_first()
    # Join the text nodes and strip all whitespace characters
    content = re.sub(
        r'\s', '',
        ''.join(response.xpath('//div[@id="content"]/text()').extract()))
    item['title'] = title
    item['chapter'] = chapter
    item['content'] = content
    # The original reassigned item['content'] = response.url here, wiping out
    # the body text just collected; recording the URL under its own key
    # (field name assumed) preserves both values.
    item['url'] = response.url
    yield item
def get_information_and_chapter(self, response):
    item = BiqugeItem()
    item['content'] = ''.join(
        response.xpath('//meta[@property="og:description"]/@content').extract()
    ).replace(' ', '').replace('\n', '')
    # Save the novel's URL
    novel_url = response.meta['novel_a']
    item['url'] = novel_url
    # Extract the novel's name
    item['name'] = ''.join(response.xpath(
        '//meta[@property="og:novel:book_name"]/@content').extract())
    # Extract the author
    item['author'] = ''.join(response.xpath(
        '//meta[@property="og:novel:author"]/@content').extract())
    # Build the novel id from the digits in the URL
    novel_id = ''.join(re.findall(r'\d', novel_url))
    item['novel_id'] = novel_id
    yield item
    urls = re.findall('<dd><a href="(.*?)">(.*?)</a>', response.text)
    for num, (href, chapter_name) in enumerate(urls, start=1):
        chapter_url = self.base_url + '/book/' + novel_id + '/' + href
        # Skip chapters that are already stored in the database
        if Sql.select_chapter_name(chapter_name) == 1:
            print('Chapter already exists')
        else:
            yield Request(chapter_url,
                          self.get_chapter_content,
                          meta={
                              'num': num,
                              'chapter_url': chapter_url,
                              'chapter_name': chapter_name,
                              'novel_id': novel_id
                          })
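# Sql.select_chapter_name is the spider's dedup check against the database;
# its definition is not shown. A minimal sketch under the assumption that it
# runs an EXISTS-style query via pymysql and returns 1 when the chapter name
# is already stored. The connection settings, table and column names are all
# assumptions.
import pymysql

class Sql:
    conn = pymysql.connect(host='localhost', user='root',
                           password='', db='biquge', charset='utf8mb4')

    @classmethod
    def select_chapter_name(cls, chapter_name):
        with cls.conn.cursor() as cursor:
            cursor.execute(
                'SELECT 1 FROM chapters WHERE chapter_name = %s LIMIT 1',
                (chapter_name,))
            return 1 if cursor.fetchone() else 0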
def get_content(self, response):
    item = BiqugeItem()
    resp_url = response.url
    item['url'] = resp_url
    # URL layout: http://host/<book_id>/<chapter_id>.html
    item['book_id'] = resp_url.split('/')[3]
    item['zhang_id'] = resp_url.split('/')[4].split('.')[0]  # chapter id
    item['book_name'] = response.xpath(
        '//div[@class="bookname"]/div/a[3]/text()').extract_first()
    item['book_cl'] = response.xpath(
        '//div[@class="con_top"]/text()[3]').extract_first()[3:7]  # category name sliced from the breadcrumb
    item['title'] = response.xpath(
        '//div[@class="bookname"]/h1/text()').extract_first()
    contents = response.xpath('//*[@id="content"]/text()')
    s = ''
    for content in contents:
        # Keep every non-whitespace run; the original took only [0], which
        # dropped everything after the first run in each text node
        s += ''.join(content.re(r'\S+'))
    item['content'] = s
    return item
def parse_chapter(self, response):
    book_id = response.meta["book_id"]
    for index, chapter in enumerate(response.xpath("//dd")):
        item = BiqugeItem()
        item["book_id"] = book_id
        item["book_name"] = response.xpath(
            "//*[@id='info']/h1/text()").extract()[0]
        item["book_type"] = response.xpath(
            "//*[@class='con_top']/a[2]/text()").extract()[0]
        author = response.xpath("//*[@id='info']/p[1]/text()").extract()[0]
        # Drop the leading author label, e.g. "作    者:"
        item["book_author"] = author[7:]
        item["book_url"] = response.url
        item["chapter_name"] = chapter.xpath("./a/text()").extract()[0]
        item["chapter_num"] = index + 1
        item["chapter_url"] = "http://www.xbiquge.la" + chapter.xpath(
            "./a/@href").extract()[0]
        request = scrapy.Request(url=item["chapter_url"],
                                 callback=self.parse_content)
        request.meta["item"] = item
        yield request
def parse_item(self, response):
    # Scrape every book name, author and link on the listing page
    booknames = response.xpath(
        "//div[@class='cover']//a[@class='blue']//text()").getall()
    authors = response.xpath(
        "//div[@class='cover']//a[@class='blue']/following-sibling::a[1]//text()"
    ).getall()
    book_urls = response.xpath(
        "//div[@class='cover']//a[@class='blue']/@href").getall()
    # Zip the parallel lists into items and hand each one to the pipeline
    for bookname, author, book_url in zip(booknames, authors, book_urls):
        yield BiqugeItem(bookname=bookname,
                         author=author,
                         book_url='http://m.paoshu8.com' + book_url)
def parse_novel_link(self, response):
    base_url = 'http://www.qu.la'
    book = BiqugeItem()
    book['novel_link'] = response.url
    book['novel_id'] = response.url.replace('http://www.qu.la/book/',
                                            '').replace('/', '')
    # extract_first() keeps the field a string; the original stored the whole
    # list returned by extract(), unlike every other field here
    book['novel_name'] = response.xpath(
        '//*[@id="info"]/h1/text()').extract_first()
    # Drop the author label (the \xa0 are non-breaking spaces)
    book['author'] = response.xpath(
        '//*[@id="info"]/p[1]/text()').extract()[0].replace(
            '作\xa0\xa0者:', '')
    book['introduce'] = response.xpath(
        '//*[@id="intro"]/text()').extract()[0].replace(' ', '').replace(
            '\r', '').replace('\n', '')
    chapter_link = response.xpath('//*[@id="list"]/dl/dd/a/@href').extract()
    for i in chapter_link:
        yield Request(base_url + i,
                      callback=self.parse_chapter_link,
                      meta={'book_info': book})
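# parse_chapter_link, the chapter-page callback referenced above, is not part
# of this section. A sketch of a plausible implementation, assuming chapter
# pages carry the title in div.bookname h1 and the body in div#content as in
# the other qu.la callbacks here; the field names on the item are assumptions.
def parse_chapter_link(self, response):
    book = response.meta['book_info']
    item = BiqugeItem()
    item['novel_id'] = book['novel_id']
    item['title'] = response.xpath(
        '//div[@class="bookname"]/h1/text()').extract_first()
    # Join the body fragments and strip full-width spaces, as get_text does
    item['content'] = ''.join(
        response.xpath('//div[@id="content"]/text()').extract()
    ).strip().replace('\u3000', '')
    yield item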