示例#1
0
class NovelSpider(NovelSpiderBase):
    """Stub spider named via ``SpiderTypes.getTypeName_BookBen()``.

    None of the extraction hooks is implemented yet: each one ignores its
    argument and returns an empty string.
    """

    # name = "NovelBookben"
    name = SpiderTypes.getTypeName_BookBen()

    def __init__(self):
        super().__init__()

    # --- index-page hooks -------------------------------------------------

    def getXpathList(self, response):
        """Placeholder; always returns ``""``."""
        return ''

    def getXpathMainInfo(self, response):
        """Placeholder; always returns ``""``."""
        return ''

    def getStrItem_Link(self, item):
        """Placeholder; always returns ``""``."""
        return ''

    def getStrItem_Idex(self, item):
        """Placeholder; always returns ``""``."""
        return ''

    def getStrMainInfo_Name(self, info):
        """Placeholder; always returns ``""``."""
        return ''

    def getStrMainInfo_Author(self, info):
        """Placeholder; always returns ``""``."""
        return ''

    # --- content-page hooks -----------------------------------------------

    def getXpathItem_Main(self, response):
        """Placeholder; always returns ``""``."""
        return ''

    def getStrItem_Title(self, xpath_main):
        """Placeholder; always returns ``""``."""
        return ''

    def getStrItem_Content(self, xpath_main):
        """Placeholder; always returns ``""``."""
        return ''
示例#2
0
class NovelShiZhangFuRen(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_LWXiaoShuo520()``.

    Every ``getStr*`` hook takes the first node of its query and therefore
    raises ``IndexError`` when the query matches nothing.
    """

    # name = "NovelLWXS520"
    name = SpiderTypes.getTypeName_LWXiaoShuo520()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``dccss`` divs inside the ``defaulthtml4`` table."""
        query = '//div[@id="defaulthtml4"]/table/tbody/tr/td/div[@class="dccss"]'
        return response.xpath(query)

    def getXpathMainInfo(self, response):
        """Select the same nodes as :meth:`getXpathList`."""
        query = '//div[@id="defaulthtml4"]/table/tbody/tr/td/div[@class="dccss"]'
        return response.xpath(query)

    def getStrItem_Link(self, item):
        """Return ``start_urls[0]`` joined with the item's first ``<a href>``."""
        hrefs = item.xpath('./a/@href').extract()
        relative = hrefs[0]
        return self.start_urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select the divs in the cells of the ``border_l_r`` table."""
        query = '//table[@class="border_l_r"]/tbody/tr/td/div'
        return response.xpath(query)

    def getStrItem_Name(self, xpath_main):
        """Return the first ``<h2>`` text below ``xpath_main``."""
        headings = xpath_main.xpath('./h2/text()').extract()
        return headings[0]

    def getStrItem_Author(self, xpath_main):
        """Return the first text node of the ``border_b`` div."""
        texts = xpath_main.xpath('./div[@class="border_b"]/text()').extract()
        return texts[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text below ``xpath_main``."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Content(self, xpath_main):
        """Return all paragraph text nodes of the nested content table as a list."""
        paragraphs = xpath_main.xpath('./table/tbody/tr/td/div/p/text()')
        return paragraphs.extract()
示例#3
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_QiShuLou()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelQiShuLou"
    name = SpiderTypes.getTypeName_QiShuLou()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``book-list clearfix`` block."""
        query = '//div[@id="content-list"]/div[@class="book-list clearfix"]/ul/li'
        return response.xpath(query)

    def getXpathMainInfo(self, response):
        """Select the ``book-describe`` div of the ``book-intro clearfix`` block."""
        query = '//div[@id="content-list"]/div[@class="book-intro clearfix"]/div[@class="book-describe"]'
        return response.xpath(query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text of ``info``."""
        headings = info.xpath('./h1/text()').extract()
        return headings[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text of ``info``."""
        paragraphs = info.xpath('./p/text()').extract()
        return paragraphs[0]

    def getStrItem_Link(self, item):
        """Return the item's first ``<a href>`` unchanged."""
        hrefs = item.xpath('./a/@href').extract()
        return hrefs[0]

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select the ``post clearfix`` article inside ``#pagewrap``."""
        query = '//div[@id="pagewrap"]/article[@class="post clearfix"]'
        return response.xpath(query)

    def getStrItem_Title(self, xpath_main):
        """Return the post header ``<h1>`` text with layout whitespace removed."""
        raw = xpath_main.xpath('.//header[@class="post-header clearfix"]/h1/text()').extract()[0]
        title = raw.strip()
        # Drop double spaces first, then CR / LF / TAB, in that order.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return every descendant text node of ``xpath_main`` as a list."""
        text_nodes = xpath_main.xpath('.//text()')
        return text_nodes.extract()
示例#4
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_YanQingKu()``.

    The content page keeps chapter title and author in one ``title`` div;
    the text is split at the literal marker "作者".
    """

    # name = "NovelGuaZiBpi"
    name = SpiderTypes.getTypeName_YanQingKu()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.yqk.net/yanqing"

    def getXpathMainInfo(self, response):
        """Select the ``base`` div(s) of the page."""
        return response.xpath('//div[@class="base"]')

    def getStrMainInfo_Name(self, info):
        """Return the first ``p/strong/a`` text of ``info`` (IndexError if none)."""
        return info.xpath('./p/strong/a/text()').extract()[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``p/a`` text of ``info`` (IndexError if none)."""
        return info.xpath('./p/a/text()').extract()[0]

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of the ``chapter`` definition list."""
        return response.xpath('//dl[@class="chapter"]/dd')

    def getStrItem_Link(self, item):
        """Return the item's first ``<a href>`` unchanged (IndexError if none)."""
        return item.xpath('./a/@href').extract()[0]

    def getStrItem_Idex(self, item):
        """Return the part of the item's href that precedes ``.html``.

        Raises ``IndexError`` when the item has no link or the href does not
        contain ``.html``.
        """
        lastLink = item.xpath('./a/@href').extract()[0]
        # The dot is escaped so only a literal ".html" ends the index part.
        return re.findall(r"(.*)\.html.*", lastLink)[0]

    def getXpathItem_Main(self, response):
        """Select the ``main`` div(s) of the page."""
        return response.xpath('//div[@class="main"]')

    def _getCleanTitleText(self, xpath_main):
        """Return the first ``title`` div text with layout whitespace removed."""
        text = xpath_main.xpath('.//div[@class="title"]/text()').extract()[0].strip()
        # Same order as before: double spaces, then CR / LF / TAB.
        for junk in ('  ', '\r', '\n', '\t'):
            text = text.replace(junk, '')
        return text

    def getStrItem_Title(self, xpath_main):
        """Return the title text before the "作者" marker.

        When the marker is absent the whole cleaned text is returned;
        slicing with ``find``'s -1 used to drop the last character.
        """
        title = self._getCleanTitleText(xpath_main)
        marker_at = title.find("作者")
        if marker_at < 0:
            return title
        return title[:marker_at]

    def getStrItem_Author(self, xpath_main):
        """Return the title text from the "作者" marker onwards, or ``""``.

        A marker at position 0 is now accepted, matching the split point
        used by :meth:`getStrItem_Title`.
        """
        title = self._getCleanTitleText(xpath_main)
        marker_at = title.find("作者")
        if marker_at < 0:
            return ""
        return title[marker_at:]

    def getStrItem_Content(self, xpath_main):
        """Return every text node below the ``content`` div as a list."""
        return xpath_main.xpath('.//div[@class="content"]//text()').extract()
示例#5
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_SDKK88()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelSDKK88"
    name = SpiderTypes.getTypeName_SDKK88()

    def __init__(self):
        super().__init__()
        # Prefix added to the hrefs returned by getStrItem_Link.
        self.headLink = "http://www.sbkk88.com"

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``leftList`` list."""
        return response.xpath(
            '//div[@class="mingzhuMain"]/div[@class="mingzhuLeft"]/ul[@class="leftList"]/li'
        )

    def getXpathMainInfo(self, response):
        """Select the same nodes as :meth:`getXpathList`."""
        return response.xpath(
            '//div[@class="mingzhuMain"]/div[@class="mingzhuLeft"]/ul[@class="leftList"]/li'
        )

    def getStrMainInfo_Name(self, info):
        """Return the ``mingzhuTitle`` heading text after its last colon.

        Falls back to the whole heading when it has no colon or nothing
        follows the colon. Previously a heading without a colon raised
        ``IndexError`` before the fallback could run.
        """
        heading = info.xpath(
            '//div[@class="mingzhuMain"]/div[@class="mingzhuLeft"]/div[@class="mingzhuTitle"]/h1/text()'
        ).extract()[0]
        matches = re.findall(".*:(.*)", heading)
        if matches and matches[0]:
            return matches[0]
        return heading

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        part_url = item.xpath('./a/@href').extract()[0]
        return self.headLink + part_url

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        return item.xpath('./a/text()').extract()[0]

    def getXpathItem_Main(self, response):
        """Return ``response`` itself; the item hooks query it directly."""
        return response

    def getStrItem_Author(self, xpath_main):
        """Return the first ``<h3>`` text of the second ``./dd`` node."""
        return xpath_main.xpath('./dd')[1].xpath('./h3/text()').extract()[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text of the ``f_title1`` div."""
        return xpath_main.xpath('//div[@id="f_title1"]/h1/text()').extract()[0]

    def getStrItem_Content(self, xpath_main):
        """Return the paragraph text nodes of the ``f_article`` div as a list."""
        return xpath_main.xpath(
            '//div[@id="f_content1"]/div[@id="f_article"]/p/text()').extract()
示例#6
0
class NovelYanYang(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_2KXiaoShuo()``.

    Apart from :meth:`getStrItem_Link`, hooks that index ``[0]`` raise
    ``IndexError`` when their query matches nothing.
    """

    # name = "Novel2KXS"
    name = SpiderTypes.getTypeName_2KXiaoShuo()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.2kxs.com"

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of the ``book`` definition list."""
        return response.xpath('//dl[@class="book"]/dd')

    def getXpathMainInfo(self, response):
        """Select the ``title`` div inside ``#bookinfo``."""
        return response.xpath('//div[@id="bookinfo"]/div[@id="title"]')

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text of ``info``."""
        return info.xpath('./h1/text()').extract()[0]

    def getStrMainInfo_Author(self, info):
        """Return the first link text inside the ``author`` address element."""
        return info.xpath('./address[@class="author"]/a/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return ``urls[0]`` joined with the item's href, or ``""``.

        An empty string is returned when the item has no ``<a href>`` or the
        href does not contain ``BooksSetting.getHtmlLast()``.
        """
        # Explicit emptiness check instead of a bare ``except:`` so that
        # unrelated errors are no longer swallowed.
        hrefs = item.xpath('./a/@href').extract()
        part_url = hrefs[0] if hrefs else ""
        if BooksSetting.getHtmlLast() in part_url:
            return self.urls[0] + part_url
        return ""

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        return item.xpath('./a/text()').extract()[0]

    def getXpathItem_Main(self, response):
        """Select the ``#box`` div."""
        return response.xpath('//div[@id="box"]')

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h2>`` text with layout whitespace removed."""
        title = xpath_main.xpath('./h2/text()').extract()[0].strip()
        # Same order as before: double spaces, then CR / LF / TAB.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``Text`` paragraphs as a list."""
        return xpath_main.xpath('./p[@class="Text"]/text()').extract()
示例#7
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_ShuXiangGe()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelShuXiangGe"
    name = SpiderTypes.getTypeName_ShuXiangGe()

    def __init__(self):
        super().__init__()

    def getXpathMainInfo(self, response):
        """Select the ``book`` div inside the ``info`` block."""
        query = '//div[@class="mu_contain"]/div[@class="info"]/div[@class="book"]'
        return response.xpath(query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``h1/a`` text of ``info``."""
        names = info.xpath('./h1/a/text()').extract()
        return names[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``dl/dt`` text of ``info``."""
        terms = info.xpath('./dl/dt/text()').extract()
        return terms[0]

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``mulu_list`` list."""
        query = '//div[@class="warpper"]/div[@class="mu_contain"]/ul[@class="mulu_list"]/li'
        return response.xpath(query)

    def getStrItem_Link(self, item):
        """Return ``urls[0]`` joined with the item's first ``<a href>``."""
        hrefs = item.xpath('./a/@href').extract()
        relative = hrefs[0]
        return self.urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select the cells of the ``content`` table."""
        query = '//table[@id="content"]/tbody/tr/td'
        return response.xpath(query)

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text with layout whitespace removed."""
        title = xpath_main.xpath('./h1/text()').extract()[0].strip()
        # Drop double spaces first, then CR / LF / TAB, in that order.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``htmlContent`` div as a list."""
        body = xpath_main.xpath('./div[@id="htmlContent"]/text()')
        return body.extract()
示例#8
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_YueDu163()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelYueDu163"
    name = SpiderTypes.getTypeName_YueDu163()

    def __init__(self):
        super().__init__()
        # Prefix added to the hrefs returned by getStrItem_Link.
        self.headLink = "http://yuedu.163.com"

    def getXpathList(self, response):
        """Select ``./div/div/ul`` relative to the main-info node."""
        main_info = self.getXpathMainInfo(response)
        return main_info.xpath('./div/div/ul')

    def getXpathMainInfo(self, response):
        """Select the ``g-mn`` div(s) of the page."""
        query = '//div[@class="g-mn"]'
        return response.xpath(query)

    def getStrMainInfo_Name(self, info):
        """Return the ``title`` attribute of the book-detail ``<h3>``."""
        query = './div[@class="m-bookdetail"]/div[@class="f-fl"]/h3/@title'
        titles = info.xpath(query).extract()
        return titles[0]

    def getStrMainInfo_Author(self, info):
        """Return the first link text inside the book-detail ``<h3>`` span."""
        query = './div[@class="m-bookdetail"]/div[@class="f-fl"]/h3/span/a/text()'
        authors = info.xpath(query).extract()
        return authors[0]

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        hrefs = item.xpath('./a/@href').extract()
        relative = hrefs[0]
        return self.headLink + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select the ``ne-content J_NEContent`` div of the article."""
        query = '//div[@class="article J_Article"]/div[@class="portrait-page-box J_PortraitMoveBox"]/div[@class="article-content"]/div[@class="ne-content J_NEContent"]'
        return response.xpath(query)

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text with layout whitespace removed."""
        title = xpath_main.xpath('./h1/text()').extract()[0].strip()
        # Drop double spaces first, then CR / LF / TAB, in that order.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the direct ``<p>`` children as a list."""
        paragraphs = xpath_main.xpath('./p/text()')
        return paragraphs.extract()
示例#9
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_XinShuBao()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelXinShuBao"
    name = SpiderTypes.getTypeName_XinShuBao()

    def __init__(self):
        super().__init__()

    def getXpathMainInfo(self, response):
        """Select the ``info`` div inside ``#maininfo``."""
        query = '//div[@class="box_con"]/div[@id="maininfo"]/div[@id="info"]'
        return response.xpath(query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text of ``info``."""
        headings = info.xpath('./h1/text()').extract()
        return headings[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text of ``info``."""
        paragraphs = info.xpath('./p/text()').extract()
        return paragraphs[0]

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``#list`` block inside ``#btycz``."""
        query = '//div[@id="btycz"]/div[@id="list"]/ul/li'
        return response.xpath(query)

    def getStrItem_Link(self, item):
        """Return the item's first ``<a href>`` unchanged."""
        hrefs = item.xpath('./a/@href').extract()
        return hrefs[0]

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select the ``box_con`` div inside ``content_read``."""
        query = '//div[@class="content_read"]/div[@class="box_con"]'
        return response.xpath(query)

    def getStrItem_Title(self, xpath_main):
        """Return the ``bookname`` ``<h1>`` text with layout whitespace removed."""
        title = xpath_main.xpath('./div[@class="bookname"]/h1/text()').extract()[0].strip()
        # Drop double spaces first, then CR / LF / TAB, in that order.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``content`` div as a list."""
        body = xpath_main.xpath('./div[@id="content"]/text()')
        return body.extract()
示例#10
0
class NovelShiZhangFuRen(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_LeWenXiaoShuo()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelLeWenXiaoShuo"
    name = SpiderTypes.getTypeName_LeWenXiaoShuo()

    def __init__(self):
        super().__init__()
        # Prefix added to the hrefs returned by getStrItem_Link.
        self.headLink = "http://www.lwxiaoshuo.com"
        self.web_head = BooksSetting.getHtml()
        self.web_last = ".html"

    def getXpathList(self, response):
        """Select the ``<tbody>`` of the table styled ``MARGIN-BOTTOM: 10px``."""
        return response.xpath('//table[@style="MARGIN-BOTTOM: 10px"]/tbody')

    def getXpathMainInfo(self, response):
        """Select the same nodes as :meth:`getXpathList`."""
        return response.xpath('//table[@style="MARGIN-BOTTOM: 10px"]/tbody')

    def getStrItem_Name(self, xpath_main):
        """Return the first ``div/h1`` text of ``xpath_main``."""
        return xpath_main.xpath('./div/h1/text()').extract()[0]

    def getStrItem_Author(self, xpath_main):
        """Return the first text node of the ``border_b`` div."""
        return xpath_main.xpath(
            './div/div[@class="border_b"]/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the first ``dccss`` link href.

        ``extract()`` returns a list, and adding it to ``headLink`` always
        raised ``TypeError``. The first href is used instead, as the sibling
        spiders do; ``IndexError`` is raised when no link matches.
        """
        part_url = item.xpath('.//tr/td/div[@class="dccss"]/a/@href').extract()[0]
        return self.headLink + part_url

    def getStrItem_Idex(self, item):
        """Return the texts of all ``dccss`` links below ``item`` as a list."""
        # NOTE(review): sibling spiders return a single string here
        # (``extract()[0]``); confirm which shape the base class expects.
        return item.xpath('.//tr/td/div[@class="dccss"]/a/text()').extract()

    def getXpathItem_Main(self, response):
        """Select the cells of the ``border_l_r`` table."""
        return response.xpath('//table[@class="border_l_r"]/tbody/tr/td')

    def getStrItem_Title(self, xpath_main):
        """Return the first ``div/h2`` text with layout whitespace removed."""
        title = xpath_main.xpath('./div/h2/text()').extract()[0].strip()
        # Same order as before: double spaces, then CR / LF / TAB.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return all paragraph text nodes of the nested content table as a list."""
        return xpath_main.xpath('./table/tbody/tr/td/div/p/text()').extract()
示例#11
0
class NovelSangWu(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_SangWu()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelSangWu"
    name = SpiderTypes.getTypeName_SangWu()
    start_urls = ["http://www.sangwu.org/book/5/5952/"]

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select every ``<dd>`` element of the page."""
        return response.xpath('//dd')

    def getXpathMainInfo(self, response):
        """Select the ``<li>`` entries of the ``wp b2 info_chapterlist`` block."""
        return response.xpath('//div[@class="wp b2 info_chapterlist"]/ul/li')

    def getStrItem_Link(self, item):
        """Return ``start_urls[0]`` joined with the item's first ``<a href>``.

        ``extract()`` returns a list, and adding it to the start URL always
        raised ``TypeError``. The first href is used instead, as the sibling
        spiders do; ``IndexError`` is raised when the item has no link.
        """
        part_url = item.xpath('./a/@href').extract()[0]
        return self.start_urls[0] + part_url

    def getStrItem_Idex(self, item):
        """Return the texts of the item's direct ``<a>`` children as a list."""
        # NOTE(review): sibling spiders return a single string here
        # (``extract()[0]``); confirm which shape the base class expects.
        return item.xpath('./a/text()').extract()

    def getXpathItem_Main(self, response):
        """Select the ``readmain`` div(s) of the page."""
        return response.xpath('//div[@class="readmain"]')

    def getStrItem_Name(self, xpath_main):
        """Return the first ``<h2>`` text of the ``bookname`` div."""
        return xpath_main.xpath(
            './div[@class="bookname"]/h2/text()').extract()[0]

    def getStrItem_Author(self, xpath_main):
        """Return the same ``bookname`` ``<h2>`` text as :meth:`getStrItem_Name`."""
        return xpath_main.xpath(
            './div[@class="bookname"]/h2/text()').extract()[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text of the ``bookname`` div."""
        return xpath_main.xpath(
            './div[@class="bookname"]/h1/text()').extract()[0]

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``centent`` div as a list."""
        return xpath_main.xpath('./div[@class="centent"]/text()').extract()
示例#12
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_JJWXC()``.

    Apart from :meth:`getStrItem_Link`, hooks that index into their result
    raise ``IndexError`` when the query matches too few nodes.
    """

    # name = "NovelJJWXC"
    name = SpiderTypes.getTypeName_JJWXC()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``chapter`` rows of the ``cytable`` table."""
        return response.xpath('//table[@class="cytable"]/tbody/tr[@itemprop="chapter"]')

    def getXpathMainInfo(self, response):
        """Select the same rows as :meth:`getXpathList`."""
        return response.xpath('//table[@class="cytable"]/tbody/tr[@itemprop="chapter"]')

    def getStrItem_Name(self, xpath_main):
        """Return the ``noveltitle`` heading text with layout whitespace removed."""
        name = xpath_main.xpath('.//td[@class="noveltitle"]/h1/a/span/text()').extract()[0].strip()
        # Same order as before: double spaces, then CR / LF / TAB.
        for junk in ('  ', '\r', '\n', '\t'):
            name = name.replace(junk, '')
        return name

    def getStrItem_Author(self, xpath_main):
        """Return the first direct link text of the ``noveltitle`` cell."""
        return xpath_main.xpath('.//td[@class="noveltitle"]/a/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return the headline link href of ``item``, or ``""`` if it has none."""
        # Explicit emptiness check instead of a bare ``except:`` so that
        # unrelated errors are no longer swallowed.
        hrefs = item.xpath('./td/span[@itemprop="headline"]/div[@style="float:left"]/a/@href').extract()
        return hrefs[0] if hrefs else ""

    def getStrItem_Idex(self, item):
        """Return the headline link text of ``item``."""
        return item.xpath('./td/span[@itemprop="headline"]/div[@style="float:left"]/a/text()').extract()[0]

    def getXpathItem_Main(self, response):
        """Select the ``oneboolt`` table."""
        return response.xpath('//table[@id="oneboolt"]')

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h2>`` text of the second ``noveltext`` child div, cleaned."""
        heading_div = xpath_main.xpath('.//div[@class="noveltext"]/div')[1]
        title = heading_div.xpath('./h2/text()').extract()[0].strip()
        # Same order as before: double spaces, then CR / LF / TAB.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of the ``noveltext`` div as a list."""
        return xpath_main.xpath('.//div[@class="noveltext"]/text()').extract()
示例#13
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_BookBao()``.

    Hooks that index into their result raise ``IndexError`` when the query
    matches too few nodes.
    """

    # name = "NovelBookbao8"
    name = SpiderTypes.getTypeName_BookBao()

    def __init__(self):
        super().__init__()
        # Prefix added to the hrefs returned by getStrItem_Link.
        self.headLink = "https://www.bookbao8.com"

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``wp b2 info_chapterlist`` block."""
        query = '//div[@class="wp b2 info_chapterlist"]/ul/li'
        return response.xpath(query)

    def getXpathMainInfo(self, response):
        """Select the same nodes as :meth:`getXpathList`."""
        query = '//div[@class="wp b2 info_chapterlist"]/ul/li'
        return response.xpath(query)

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        hrefs = item.xpath('./a/@href').extract()
        relative = hrefs[0]
        return self.headLink + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select the ``<dl>`` inside the ``bdsub`` div."""
        query = '//div[@class="bdsub"]/dl'
        return response.xpath(query)

    def getStrItem_Name(self, xpath_main):
        """Return the ``h1/a`` text of the first ``<dd>``."""
        first_dd = xpath_main.xpath('./dd')[0]
        names = first_dd.xpath('./h1/a/text()').extract()
        return names[0]

    def getStrItem_Author(self, xpath_main):
        """Return the ``<h3>`` text of the second ``<dd>``."""
        second_dd = xpath_main.xpath('./dd')[1]
        authors = second_dd.xpath('./h3/text()').extract()
        return authors[0]

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h1>`` text of the first ``<dd>``, whitespace removed."""
        first_dd = xpath_main.xpath('./dd')[0]
        title = first_dd.xpath('./h1/text()').extract()[0].strip()
        # Drop double spaces first, then CR / LF / TAB, in that order.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``contents`` ``<dd>`` as a list."""
        body = xpath_main.xpath('./dd[@id="contents"]/text()')
        return body.extract()
示例#14
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_7xxs()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "Novel7xxs"
    name = SpiderTypes.getTypeName_7xxs()

    def __init__(self):
        super().__init__()
        # Prefix added to the hrefs returned by getStrItem_Link.
        self.headLink = "http://www.7xxs.net"

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of the ``#list`` definition list."""
        return response.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd')

    def getXpathMainInfo(self, response):
        """Select the ``#maininfo`` div inside ``box_con``."""
        return response.xpath('//div[@class="box_con"]/div[@id="maininfo"]')

    def getStrMainInfo_Name(self, info):
        """Return the first text node of the ``intro`` div."""
        return info.xpath('./div[@id="intro"]/text()').extract()[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text of the ``info`` div."""
        return info.xpath('./div[@id="info"]/p/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        lastLink = item.xpath('./a/@href').extract()[0]
        return self.headLink + lastLink

    def getStrItem_Idex(self, item):
        """Return the href segment between the last ``/`` and ``.html``.

        Raises ``IndexError`` when the item has no link or the href does not
        match that shape.
        """
        lastLink = item.xpath('./a/@href').extract()[0]
        # The dot is escaped so only a literal ".html" ends the index part.
        return re.findall(r".*/(.*)\.html.*", lastLink)[0]

    def getXpathItem_Main(self, response):
        """Select the ``box_con`` div inside ``content_read``."""
        return response.xpath('//div[@class="content_read"]/div[@class="box_con"]')

    def getStrItem_Title(self, xpath_main):
        """Return the ``bookname`` ``<h1>`` text with layout whitespace removed."""
        title = xpath_main.xpath('.//div[@class="bookname"]/h1/text()').extract()[0].strip()
        # Same order as before: double spaces, then CR / LF / TAB.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``content`` div as a list."""
        return xpath_main.xpath('.//div[@id="content"]/text()').extract()
示例#15
0
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_BiQuGuan()``.

    Hooks that index ``[0]`` raise ``IndexError`` when their query matches
    nothing.
    """

    # name = "NovelBQG"
    name = SpiderTypes.getTypeName_BiQuGuan()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of the ``#list`` definition list."""
        query = '//div[@id="wrapper"]/div[@class="box_con"]/div[@id="list"]/dl/dd'
        return response.xpath(query)

    def getXpathMainInfo(self, response):
        """Select the ``info`` div inside ``#maininfo``."""
        query = '//div[@id="wrapper"]/div[@class="box_con"]/div[@id="maininfo"]/div[@id="info"]'
        return response.xpath(query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text of ``info``."""
        headings = info.xpath('./h1/text()').extract()
        return headings[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text of ``info``."""
        paragraphs = info.xpath('./p/text()').extract()
        return paragraphs[0]

    def getStrItem_Link(self, item):
        """Return ``urls[0]`` joined with the item's first ``<a href>``."""
        hrefs = item.xpath('./a/@href').extract()
        relative = hrefs[0]
        return self.urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        anchor_texts = item.xpath('./a/text()').extract()
        return anchor_texts[0]

    def getXpathItem_Main(self, response):
        """Select every ``box_con`` div of the page."""
        query = '//div[@class="box_con"]'
        return response.xpath(query)

    def getStrItem_Title(self, xpath_main):
        """Return the ``bookname`` ``<h1>`` text with layout whitespace removed."""
        title = xpath_main.xpath('.//div[@class="bookname"]/h1/text()').extract()[0].strip()
        # Drop double spaces first, then CR / LF / TAB, in that order.
        for junk in ('  ', '\r', '\n', '\t'):
            title = title.replace(junk, '')
        return title

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``content`` div as a list."""
        body = xpath_main.xpath('.//div[@id="content"]/text()')
        return body.extract()
示例#16
0
class NovelSpider(NovelSpiderBase):
    """XPath hooks for the spider named by ``SpiderTypes.getTypeName_MaoPu()``.

    Name, author and title all read the same ``<h1>`` text. Hooks that index
    ``[0]`` raise ``IndexError`` when their query matches nothing.
    """

    # name = "NovelMaoPu"
    name = SpiderTypes.getTypeName_MaoPu()

    def __init__(self):
        super().__init__()
        # NOTE(review): this is the bookbao8 host and is not read by any
        # method of this class - confirm whether it is intentional.
        self.headLink = "https://www.bookbao8.com"

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``mulu_list`` list."""
        query = '//div[@class="mu_contain"]/ul[@class="mulu_list"]/li'
        return response.xpath(query)

    def getXpathMainInfo(self, response):
        """Select the ``<li>`` entries of the ``wp b2 info_chapterlist`` block."""
        query = '//div[@class="wp b2 info_chapterlist"]/ul/li'
        return response.xpath(query)

    def getStrItem_Link(self, item):
        """Return ``start_urls[0]`` joined with the item's first ``<a href>``."""
        hrefs = item.xpath('./a/@href').extract()
        relative = hrefs[0]
        return self.start_urls[0] + relative

    def getStrItem_Idex(self, item):
        """Not implemented; always returns ``""``."""
        return ''

    def getXpathItem_Main(self, response):
        """Select the ``#content`` div."""
        query = '//div[@id="content"]'
        return response.xpath(query)

    def getStrItem_Name(self, xpath_main):
        """Return the first ``<h1>`` text of ``xpath_main``."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Author(self, xpath_main):
        """Return the first ``<h1>`` text of ``xpath_main``."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text of ``xpath_main``."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of the ``chapter-content`` div as a list."""
        body = xpath_main.xpath('./div[@class="chapter-content"]/text()')
        return body.extract()
示例#17
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_MaoPu()``."""
     type_name = SpiderTypes.getTypeName_MaoPu()
     return type_name
示例#18
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_LeWenXiaoShuo()``."""
     type_name = SpiderTypes.getTypeName_LeWenXiaoShuo()
     return type_name
示例#19
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_SDKK88()``."""
     type_name = SpiderTypes.getTypeName_SDKK88()
     return type_name
示例#20
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_XS74()``."""
     type_name = SpiderTypes.getTypeName_XS74()
     return type_name
示例#21
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_QiShuLou()``."""
     type_name = SpiderTypes.getTypeName_QiShuLou()
     return type_name
示例#22
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_BookBao()``."""
     type_name = SpiderTypes.getTypeName_BookBao()
     return type_name
示例#23
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_DiJiuZWW()``."""
     type_name = SpiderTypes.getTypeName_DiJiuZWW()
     return type_name
示例#24
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_LWXiaoShuo520()``."""
     type_name = SpiderTypes.getTypeName_LWXiaoShuo520()
     return type_name
示例#25
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_YanQingKu()``."""
     type_name = SpiderTypes.getTypeName_YanQingKu()
     return type_name
示例#26
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_JJWXC()``."""
     type_name = SpiderTypes.getTypeName_JJWXC()
     return type_name
示例#27
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_7xxs()``."""
     type_name = SpiderTypes.getTypeName_7xxs()
     return type_name
示例#28
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_BiQuGuan()``."""
     type_name = SpiderTypes.getTypeName_BiQuGuan()
     return type_name
示例#29
0
 def getScrapyType(self):
     """Return the value of ``SpiderTypes.getTypeName_XinShuBao()``."""
     type_name = SpiderTypes.getTypeName_XinShuBao()
     return type_name