Example #1
import scrapy  # Examples #2 and #3 need these same imports
from xbiquge.pipelines import XbiqugePipeline  # import path as in Examples #4-#7
from xbiquge.items import XbiqugeItem  # items module path assumed from the standard Scrapy layout


class SancunSpider(scrapy.Spider):
    name = 'tsxsy_bbsgx'
    allowed_domains = ['www.bbsgx.com']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "http://www.bbsgx.com"
    url_firstchapter = "http://www.bbsgx.com/book_211549/65039517.html"
    name_txt = "./novels/贴身小神医bbsgx"

    pipeline = XbiqugePipeline()
    pipeline.createtable(name)
    item = XbiqugeItem()
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = ['http://www.bbsgx.com/book_211549/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        dl = response.css('#list dl dd')  # extract the chapter-link entries
        for dd in dl:
            self.url_c = self.url_ori + '/book_211549/' + dd.css(
                'a::attr(href)').extract()[0]  # build each chapter's full URL
            #print(self.url_c)
            #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
            yield scrapy.Request(
                self.url_c, callback=self.parse_c
            )  # yield a request so parse_c can extract the chapter URL, prev/next page links and chapter content
            #print(self.url_c)
    def parse_c(self, response):
        #item = XbiqugeItem()
        #item['name'] = self.name
        #item['url_firstchapter'] = self.url_firstchapter
        #item['name_txt'] = self.name_txt
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            '#wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(2)::attr(href)'
        ).extract()[0]
        self.item['next_page'] = self.url_ori + response.css(
            '#wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(4)::attr(href)'
        ).extract()[0]
        title = response.css(
            '#wrapper > div.content_read > div > div.bookname > h1::text'
        ).extract()[0]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace(
            '\15', '\n')  # join the chapter title and body; '\15' is the octal escape for ^M (carriage return), replaced with a newline
        yield self.item  # yield the item to the pipelines module

        if self.item['url'][33:41] == self.item['next_page'][33:41]:  # same chapter split across pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
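The pipelines.py module these spiders rely on is not included in the listings. As a rough sketch of what the createtable()/process_item() side of XbiqugePipeline could look like for Example #1, assuming a MySQL backend accessed through pymysql (the connection settings, table layout and the class name XbiqugePipelineSketch are placeholders, not the project's actual code):

import pymysql


class XbiqugePipelineSketch:
    """Hypothetical stand-in for a MySQL-backed XbiqugePipeline."""

    def __init__(self):
        # connection parameters are placeholders
        self.conn = pymysql.connect(host='localhost', user='novel', password='secret',
                                    database='xbiquge', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def createtable(self, name):
        # one table per novel, keyed by the chapter url
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS " + name + " ("
            " url VARCHAR(255) PRIMARY KEY,"
            " preview_page VARCHAR(255),"
            " next_page VARCHAR(255),"
            " content MEDIUMTEXT)")
        self.conn.commit()

    def process_item(self, item, spider):
        # upsert the chapter row yielded by parse_c; REPLACE INTO avoids duplicate rows on re-crawls
        self.cursor.execute(
            "REPLACE INTO " + item['name'] +
            " (url, preview_page, next_page, content) VALUES (%s, %s, %s, %s)",
            (item['url'], item['preview_page'], item['next_page'], item['content']))
        self.conn.commit()
        return item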
Example #2
File: kjtsg.py  Project: sfccl/xbiquge_w
class SancunSpider(scrapy.Spider):
    name = 'kjtsg'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "http://www.xbiquge.la"
    url_firstchapter = "http://www.xbiquge.la/15/15480/8186557.html"
    name_txt = "./novels/科技图书馆"

    pipeline = XbiqugePipeline()
    pipeline.clearcollection(
        name)  # empty the novel's collection; a MongoDB collection is the counterpart of a MySQL table
    item = XbiqugeItem()
    item['id'] = 0  # add an id field to make querying easier
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = ['http://www.xbiquge.la/15/15480/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        dl = response.css('#list dl dd')  # extract the chapter-link entries
        for dd in dl:
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[
                0]  # build each chapter's full URL
            #print(self.url_c)
            #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
            yield scrapy.Request(
                self.url_c, callback=self.parse_c
            )  # yield a request so parse_c can extract the chapter URL, prev/next page links and chapter content
            #print(self.url_c)
    def parse_c(self, response):
        #item = XbiqugeItem()
        #item['name'] = self.name
        #item['url_firstchapter'] = self.url_firstchapter
        #item['name_txt'] = self.name_txt
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace(
            '\15', '\n')  # join the chapter title and body; '\15' is the octal escape for ^M (carriage return), replaced with a newline
        yield self.item  # yield the item to the pipelines module
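Example #2 swaps the storage backend to MongoDB, as the clearcollection() comment notes. A minimal sketch of what that pipeline could look like with pymongo; the connection URI, database name and class name are assumptions, not the project's code:

import pymongo


class XbiqugeMongoPipelineSketch:
    """Hypothetical MongoDB-flavoured stand-in for XbiqugePipeline."""

    def __init__(self):
        self.client = pymongo.MongoClient("mongodb://localhost:27017/")  # URI is a placeholder
        self.db = self.client["xbiquge"]  # database name is a guess

    def clearcollection(self, name):
        # drop every document of this novel so the crawl starts from scratch
        self.db[name].delete_many({})

    def get_collection(self, name):
        # hand the raw collection object back to the spider (Example #3 relies on this)
        return self.db[name]

    def process_item(self, item, spider):
        # store each chapter as one document in the novel's collection
        self.db[item['name']].insert_one(dict(item))
        return item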
Example #3
class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    #start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人间"
    url_chapters = url_firstchapter[0:32]
    pipeline = XbiqugePipeline()
    novelcollection = pipeline.get_collection(
        name)  # get the novel's collection object; a MongoDB collection is the counterpart of a MySQL table
    #--------------------------------------------
    # If a record's next_page is the table-of-contents URL, delete it; otherwise repeated crawls would leave
    # multiple records pointing at the contents page and the latest chapters could not be fetched.
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print(
            "record containing the contents-page url:",
            novelcollection.find({
                "next_page": url_chapters
            }, {
                "_id": 0,
                "id": 1,
                "url": 1,
                "next_page": 1
            }).next())
        novelcollection.remove({"next_page": url_chapters})
        print("已删除包含目录页面url的记录。")
    #--------------------------------------------
    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})
    item = XbiqugeItem()
    item['id'] = novelcounts  # initialize id to the number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = [self.url_chapters]
        print("小说目录url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):  # compare URLs on the page with those already in MongoDB; only crawl chapters not stored yet
        f = open("/root/xbiquge_w/url_list.txt", "w")  # open a file to log the URLs that will be crawled
        count_bingo = 0  # number of chapters already present in the collection
        dl = response.css('#list dl dd')  # extract the chapter-link entries
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[
                0]  # build each chapter's full URL
            #print("url from the page:", self.url_c)
            self.novelurls = self.novelcollection.find({}, {
                "_id": 0,
                "id": 1,
                "url": 1
            })  # re-run the query to get a fresh cursor so the inner loop can iterate from the beginning
            for url in self.novelurls:
                #print("url from mongodb:", url)
                if url["url"] == self.url_c:  # the URL is already stored, so stop searching
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:  # already crawled: skip to the next chapter without issuing a request
                continue
            #print("crawling url:", self.url_c)
            f.write("crawling url: " + self.url_c + "\n")
            #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
            yield scrapy.Request(
                self.url_c, callback=self.parse_c
            )  # yield a request so parse_c can extract the chapter URL, prev/next page links and chapter content
            #print(self.url_c)
        f.close()
        print("数据集已有记录数count_bingo:", count_bingo)

    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css(
            'div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        #print(text)
        self.item['content'] = title + "\n" + text.replace(
            '\15', '\n')  # join the chapter title and body; '\15' is the octal escape for ^M (carriage return), replaced with a newline
        yield self.item  # yield the item to the pipelines module

        if self.item['url'][32:39] == self.item['next_page'][
                32:39]:  # same chapter split across several pages
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
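A note on the duplicate check in Example #3: the inner loop re-runs the find() query for every chapter link, so each link costs a MongoDB round trip. A sketch of an alternative (not part of the original project) that loads the stored URLs into a set once and tests membership in memory:

    # Alternative parse() sketch for the same spider class.
    def parse(self, response):
        crawled_urls = {doc["url"] for doc in
                        self.novelcollection.find({}, {"_id": 0, "url": 1})}
        for dd in response.css('#list dl dd'):
            url_c = self.url_ori + dd.css('a::attr(href)').extract_first()
            if url_c in crawled_urls:  # chapter already stored, skip it
                continue
            yield scrapy.Request(url_c, callback=self.parse_c)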
Example #4
# -*- coding:utf-8 -*-
#Call the content2txt method of XbiqugePipeline in pipelines.py to generate a txt file
import sys
from xbiquge.pipelines import XbiqugePipeline
dbname = 'wdssny'
firsturl = 'https://www.xbiquge.la/80/80058/31226099.html'
txtname = './novels/我的杀手女友'
url_chapters = firsturl[0:32]
#XbiqugePipeline().content2txt(dbname,firsturl,txtname)
myset = XbiqugePipeline().get_collection(dbname)
url_counts = myset.find({
    "url": url_chapters
}, {
    "url": 1,
    "_id": 0,
    "next_page": 1
}).count()
print(url_counts)
#print(obj_url)
#print(obj_url.hasNext())
#if obj_url.next() != '' :
#try:
#    print("对象内容:",obj_url.next())
#except StopIteration:
#    print("数据集中没有找到数据")
Example #5
# -*-coding:utf-8-*-
from xbiquge.pipelines import XbiqugePipeline

XbiqugePipeline().createtable()
Example #6
File: spider2txt.py  Project: sfccl/xbiquge
# -*- coding:utf-8 -*-
#Call the content2txt method of XbiqugePipeline in pipelines.py to generate a txt file
from xbiquge.pipelines import XbiqugePipeline

XbiqugePipeline().content2txt()
Example #7
# -*- coding:utf-8 -*-
#Call the content2txt method of XbiqugePipeline in pipelines.py to generate a txt file
from xbiquge.pipelines import XbiqugePipeline

dbname = 'ss'
firsturl = 'https://www.xbiquge.la/49/49527/21336447.html'
txtname = './novels/绍宋'
XbiqugePipeline().content2txt(dbname, firsturl, txtname)
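content2txt() itself is not shown in these listings. One plausible sketch of how it might assemble the text file, assuming the MongoDB documents carry the url, next_page and content fields written by the spiders above (the connection URI, database name and function name are placeholders):

import pymongo


def content2txt_sketch(dbname, firsturl, txtname):
    """Hypothetical illustration of what XbiqugePipeline.content2txt might do."""
    client = pymongo.MongoClient("mongodb://localhost:27017/")  # URI is a placeholder
    collection = client["xbiquge"][dbname]  # database name is a guess
    with open(txtname + ".txt", "w", encoding="utf-8") as f:
        # walk the chapter chain through next_page, starting from the first chapter
        doc = collection.find_one({"url": firsturl})
        while doc is not None:
            f.write(doc["content"] + "\n")
            doc = collection.find_one({"url": doc["next_page"]})


content2txt_sketch(dbname, firsturl, txtname)

The loop ends once next_page points at a URL with no stored record, e.g. the table-of-contents page whose record Example #3 deletes up front.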