Example #1
    def get_link(self, response):
        item = ManhuaItem()
        result = response.xpath('//script/text()').extract()
        # The image path is embedded in an inline <script>; pull the newkuku path out of it.
        href = re.search(r"<IMG SRC=[\s\S]*?(newkuku[\s\S]*?\.jpg)'>", result[0])
        jpg_url = 'http://n5.1whour.com/' + href.group(1)
        floder_name = response.meta['floder_name']
        # Name the file after the page number in the requesting URL, e.g. /12.htm -> 12.jpg.
        page = re.search(r'/(\d+)\.htm', response.meta['url']).group(1)
        filename = floder_name + '/' + page + '.jpg'

        item['url'] = jpg_url
        item['floder_name'] = floder_name
        item['filename'] = filename
        yield item
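All of these snippets populate a ManhuaItem whose definition is never shown, and each project declares its own fields. As a hypothetical sketch, the item that Examples #1 and #7 assume (field names taken from the code above) might look like:

    import scrapy

    class ManhuaItem(scrapy.Item):
        # Hypothetical items.py; the real definition is not part of these examples.
        url = scrapy.Field()          # full image URL
        floder_name = scrapy.Field()  # chapter directory ("floder" is the original spelling)
        filename = scrapy.Field()     # e.g. "<floder_name>/<page>.jpg"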
Example #2
    def parse(self, response):
        item = ManhuaItem()
        item['img_urls'] = response.xpath(
            '/html/body/div[@class="main"]/div[1]/div[1]/div[2]/div[4]/a/img/@src'
        ).extract()
        yield item

        # Follow the "next page" link. Guard the extracted href, not the joined
        # string: the original check ran after concatenation, so it was always
        # true, and concatenating None would have raised a TypeError first.
        new_url_first = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/div[3]/a[10]/@href'
        ).extract_first()
        if new_url_first:
            new_url = 'http://www.xieeqiao.com/manhua/' + new_url_first
            yield scrapy.Request(new_url, callback=self.parse)
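A slightly safer way to build the pagination request is response.urljoin (the approach Example #8 takes) instead of hand-concatenating the site prefix; new_url_first is the href extracted above:

    # Sketch: let Scrapy resolve the (possibly relative) href against response.url.
    if new_url_first:
        yield scrapy.Request(response.urljoin(new_url_first), callback=self.parse)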
Example #3
    def parse_tu(self, response):
        # The page embeds all image URLs in the second inline <script> as one
        # '","'-joined string; grab it and split it back into individual URLs.
        neirong = response.xpath('/html/body/script[2]/text()').extract()[0]
        pattern = re.compile(r'http:.*jpg-smh\.middle')
        urljpg = pattern.findall(neirong)[0]
        jpg = re.split(r'","', urljpg)
        for url in jpg:
            item = ManhuaItem()
            # e.g. .../<chapter>/<page>-smh.middle.jpg ->
            # path "ok/<chapter>", filename "<page>"
            item["path"] = os.path.join("ok", url.split('/')[-2])
            item["filename"] = url.split('/')[-1].split("-")[0]
            item["image_url"] = url

            yield item
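The path/filename/image_url fields suggest the item is consumed by an image-download pipeline, which isn't shown. A minimal hypothetical sketch built on Scrapy's stock ImagesPipeline could look like this (the class name and storage layout are assumptions; the item=None keyword requires Scrapy 2.4+):

    import os
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline

    class ManhuaImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # One download request per item; each item carries a single URL.
            yield scrapy.Request(item['image_url'])

        def file_path(self, request, response=None, info=None, *, item=None):
            # Store each page under its chapter directory: ok/<chapter>/<page>.jpg
            return os.path.join(item['path'], item['filename'] + '.jpg')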
Example #4
    def mai(self, response):
        URL = 'https://manhua.dmzj.com'
        for i in response.css('div.cartoon_online_border ul li').extract():
            text = ManhuaItem()
            # pq is pyquery (from pyquery import PyQuery as pq), used to re-parse
            # each <li> fragment that Scrapy extracted as raw HTML.
            i = pq(i)
            text['name'] = i('a').attr('title')
            text['chapter'] = i('a').text()
            # Assume at most 12 pages per chapter; the reader addresses pages
            # with a '#@page=N' fragment.
            text['link'] = [
                URL + i('a').attr('href') + '#@page=' + str(t)
                for t in range(1, 13)
            ]
            yield text

            for uri in text['link']:
                yield scrapy.Request(url=uri, callback=self.download)
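A quick standalone illustration of the PyQuery calls used above, on a made-up <li> fragment:

    from pyquery import PyQuery as pq

    li = pq('<li><a href="/chapter/1" title="Some Manhua">Chapter 1</a></li>')
    print(li('a').attr('title'))  # -> Some Manhua
    print(li('a').text())         # -> Chapter 1
    print(li('a').attr('href'))   # -> /chapter/1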
Example #5
    def parse1(self, response):
        urls = response.xpath('//dd/a[1]/@href').extract()
        dir_names = response.xpath('//dd/a[1]/text()').extract()
        for url, dir_name in zip(urls, dir_names):
            # Chapter names look like "<title> <N>话"; parse N and keep only
            # chapters in [start_order_down, start_order_up], unless is_all is set.
            order = float(dir_name.split(' ')[1][:-1])
            if self.is_all or self.start_order_down <= order <= self.start_order_up:
                item = ManhuaItem()
                item['link_url'] = self.server_link + url
                item['dir_name'] = dir_name
                yield scrapy.Request(url=item['link_url'],
                                     meta={'item': item},
                                     callback=self.parse2)
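The order extraction above assumes chapter links are titled "<name> <N>话" ("chapter N"). A worked example with a made-up name:

    dir_name = '某漫画 12话'                     # assumed format: "<title> <N>话"
    order = float(dir_name.split(' ')[1][:-1])  # '12话' -> '12' -> 12.0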
Example #6
    def download(self, response):
        self.logger.info('=' * 30 + response.text)
        try:
            text = ManhuaItem()
            text['name'] = response.css(
                'h1.hotrmtexth1 a::attr(title)').extract_first()
            text['chapter'] = response.css(
                'span.redhot1::text').extract_first()
            self.logger.info('*' * 30 + text['name'] + '*' * 10 +
                             text['chapter'])
            text['link'] = 'https:' + response.xpath(
                '//*[@id="center_box"]/img/@src').extract_first()
            self.logger.warning('*' * 30 + text['link'])  # logger.warn is deprecated
            yield text
            self.logger.debug(str(text))
        except Exception:
            # extract_first() returns None on a miss, so the concatenations above
            # can raise TypeError; log it instead of silently swallowing everything.
            self.logger.exception('failed to parse %s', response.url)
Example #7
    def get_chapterurls(self, response):
        result = response.xpath('//td[@align="center"]/text()').extract()
        text = result[2]
        # The page-count cell reads like "共N页" ("N pages in total").
        result = re.search(r'共(\d+)页', text)
        name = result.group(0)
        maxnum = result.group(1)
        floder_name = response.meta['floder_name'] + ' ' + name

        # This response is itself page 1; emit its image directly.
        item = ManhuaItem()
        result2 = response.xpath('//script/text()').extract()
        href = re.search(r"<IMG SRC=[\s\S]*?(newkuku[\s\S]*?\.jpg)'>", result2[0])
        jpg_url = 'http://n5.1whour.com/' + href.group(1)
        filename = floder_name + '/' + '1.jpg'
        item['url'] = jpg_url
        item['floder_name'] = floder_name
        item['filename'] = filename
        yield item

        # Request the remaining pages. Page 1 was already handled above; the
        # original loop started at 1 and fetched it a second time.
        for i in range(2, int(maxnum) + 1):
            photo_url = response.meta['new_url'].replace('1.htm', str(i) + '.htm')
            yield scrapy.Request(photo_url, headers=self.headers,
                                 callback=self.get_link,
                                 meta={'floder_name': floder_name, 'url': photo_url},
                                 dont_filter=True)
Example #8
    def parse(self, response):
        imgs = response.xpath('//*[@id="comicContain"]/li')
        # Literal URL prefix (despite the name, not a regex): keep only real page images.
        pattern = 'https://manhua.qpic.cn/manhua_detail'
        xiayiyexpath = '//*[@id="mainControlNext"]/@href'  # "next page" link
        imgtitle = response.css('title::text').extract_first().split('-')[0]
        count = 1
        for i in imgs:
            imgurl = i.xpath('img/@src').extract_first()
            if imgurl and imgurl.startswith(pattern):
                # A fresh item per image; the original reused one item instance,
                # mutating it after it had already been yielded.
                item = ManhuaItem()
                item['title'] = imgtitle  # Python 2-era .encode('utf-8') dropped
                item['url'] = imgurl
                item['imgname'] = 'img' + str(count)
                count += 1
                yield item
        next_page = response.xpath(xiayiyexpath).extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
Example #9
    def parse_detailed(self, response):
        item = ManhuaItem()
        jshtml = response.xpath('//div[@class="jshtml"]')
        # Title
        item['name'] = jshtml.xpath('./ul/li[1]/text()').extract_first().split(
            ':')[-1]
        # Cover image
        item['img'] = response.xpath(
            '//div[@id="offlinebtn-container"]/img/@data-url').extract_first()
        # Status
        item['state'] = jshtml.xpath(
            './ul/li[2]/text()').extract_first().split(':')[-1]
        # Author
        item['author'] = jshtml.xpath(
            './ul/li[3]/text()').extract_first().split(':')[-1]
        # Genre
        item['_type'] = jshtml.xpath(
            './ul/li[4]/text()').extract_first().split(':')[-1]
        # Synopsis
        item['title'] = jshtml.xpath(
            './div/div[@class="wz clearfix t1"]/div/text()').extract_first()
        # Last update
        item['update'] = jshtml.xpath('./ul/li[5]/text()').extract_first()
        # Chapters: map chapter name -> absolute URL, oldest first
        # (furl comes from the furl library: from furl import furl).
        data_chapter = {}
        for chapter in response.xpath('//ul[@id="topic1"]//li')[::-1]:
            chapter_name = chapter.xpath('./a/@title').extract_first()
            href = chapter.xpath('./a/@href').extract_first()
            chapter_url = furl(response.url).remove(path=True).join(href).url
            data_chapter[chapter_name] = chapter_url
        # json.dumps (needs `import json`) replaces the original
        # str(data_chapter).replace("'", '"'), which broke on quotes in names.
        item['chapter'] = json.dumps(data_chapter, ensure_ascii=False)
        yield item
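The furl expression above rebuilds each chapter URL from the site root. A small illustration with made-up URLs (requires pip install furl):

    from furl import furl

    base = 'http://example.com/manhua/12345/'  # stand-in for response.url
    href = '/chapter/1.html'                   # stand-in for a chapter @href
    # remove(path=True) strips the path; join() then resolves href against the root.
    print(furl(base).remove(path=True).join(href).url)
    # -> http://example.com/chapter/1.html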