Example #1
File: pika.py  Project: Fouppy/scrape
    def parse_item(self, response):
        # Build a MangaItem from the book's detail page; .extract() stores
        # each field as a list of matched strings.
        item = MangaItem()
        item['name'] = response.css("h1.titre-big").xpath('./text()').extract()
        item['release_date'] = response.css("div.date_sortie").xpath('./text()').extract()
        item['collection'] = response.css("div.categorie").xpath('./text()').extract()
        item['cover'] = response.css("div.item-fiche-livre div.mediao__figure img").xpath('./@src').extract()
        item['tome'] = response.css("div.block_infos_techniques div:nth-child(2)").xpath('./text()').extract()
        yield item
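Since .extract() returns a list, every field above holds a list of strings. If single values are preferred, the same selectors can be written with Scrapy's .get() shorthand; a minimal illustrative variant, not part of the Fouppy/scrape project:

        # Illustrative variant only: ::text / ::attr() plus .get() yields the
        # first matching string (or None) instead of a list.
        item['name'] = response.css("h1.titre-big::text").get()
        item['release_date'] = response.css("div.date_sortie::text").get()
        item['cover'] = response.css("div.item-fiche-livre div.mediao__figure img::attr(src)").get()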
Example #2
    def parse_image_page(self, response):
        # Collect every image URL on the chapter page using the spider's
        # configured XPath expression.
        srcs = response.xpath(self.xpath.get("image")).extract()
        self.log(srcs)
        # The page <title> is assumed to look like "<manga> - <chapter>";
        # the first word of the left part is used as the manga name.
        title = response.xpath("//title/text()").extract_first().split("-")
        manga_name = title[0].split(" ")[0]
        chapter_name = title[1].strip()

        logging.info("downloading " + manga_name + ": " + chapter_name)

        item = MangaItem(image_urls=srcs, manga_name=manga_name, chapter_name=chapter_name)
        return item
Example #3
    def parsePage(self, response):
        # Emit one MangaItem for the current page, then follow the "下一页"
        # (next page) link until the chapter runs out of pages.
        item = MangaItem()
        item['book'] = response.meta['book']
        item['page'] = response.css(".navigation .button-success::text").get()
        # The real image URL is embedded in an inline <script> as mhurl="...".
        item['src'] = 'http://p17.manhuapan.com/' + response.css('script').re(r'mhurl="(.*?)"')[0]
        yield item
        if '下一页' == response.css('.navigation .pure-button.pure-button-primary::text').getall()[-1]:
            next_url = response.css('.navigation .pure-button.pure-button-primary::attr("href")').getall()[-1]
            request = Request(response.meta['url'] + next_url, self.parsePage)
            request.meta['book'] = response.meta['book']
            request.meta['url'] = response.meta['url']
            yield request
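The two meta assignments can also be folded into the Request constructor, which accepts a meta dict directly; an equivalent sketch using the standard scrapy.Request API:

        from scrapy import Request

        # Equivalent to the block above: pass meta when building the request.
        request = Request(
            response.meta['url'] + next_url,
            callback=self.parsePage,
            meta={'book': response.meta['book'], 'url': response.meta['url']},
        )
        yield request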
Example #4
    def parse_image_page(self, response):
        srcs = response.xpath(self.xpath.get("image")).extract()
        self.log(srcs)
        manga_name = response.xpath("//h1/text()").extract_first()
        # chapter_name = title[1].strip()

        logging.info("downloading " + manga_name)

        item = MangaItem(image_urls=srcs, manga_name=manga_name, chapter_name="")
        # item["image_urls"] = srcs
        # item["chapter_name"] = chapter_name
        # item["manga_name"] = manga_name
        return item
Example #5
    def parse_item(self, response):
        item = MangaItem()

        manga_attr = response.xpath("//table/tr/td")
        if len(manga_attr) == 0:
            return []

        item['tags'] = []

        #title
        item['title'] = response.css(".odd_anim_title_m").xpath(
            ".//h1/text()").extract()
        #synonym
        item['synonym'] = manga_attr[0].xpath("./text()").extract()
        #og name
        item['og_name'] = manga_attr[1].xpath("./text()").extract()
        #author url
        item['author_url'] = manga_attr[2].xpath('./a/@href').extract()
        #author
        item['author'] = manga_attr[2].xpath("./a/text()").extract()
        #area tag
        item['tags'] += manga_attr[3].xpath("./a/text()").extract()
        #status tag
        item['tags'] += manga_attr[4].xpath("./a/text()").extract()
        #hits
        item['hits'] = manga_attr[5].xpath("./a/text()").extract()
        #type tag
        item['tags'] += manga_attr[6].xpath("./a/text()").extract()
        #category tag
        item['tags'] += manga_attr[7].xpath("./a/text()").extract()
        #cover
        item['cover'] = response.css(
            "div.anim_intro_ptext > a > img::attr(src)").extract()

        urls = response.css(
            "div.cartoon_online_border > ul > li > a::attr(href)").extract()
        nums = range(1, len(urls) + 1)
        item['chapters'] = dict(zip(nums, urls))

        # The comic's numeric id is embedded in an inline <script>; if found,
        # fetch the hit-counter JSON and let parse_item2 finish filling the item.
        id_text = response.xpath("//script")[4].extract()
        m = re.search(r"\"([0-9]+)\"", id_text)
        if m:
            id = m.group(1)
            req = Request(url="http://manhua.dmzj.com/hits/" + id + ".json",
                          callback=self.parse_item2)
            req.meta["item"] = item
            return req
        else:
            return item
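These snippets come from different projects, so each defines its own MangaItem with its own fields. For reference, a minimal Item declaration covering the fields used in Examples #2 and #4 might look like the sketch below; the field names are inferred from usage, not taken from any of these projects' items.py:

    import scrapy

    class MangaItem(scrapy.Item):
        # Fields inferred from Examples #2 and #4; each real project declares
        # its own (possibly different) set of fields.
        image_urls = scrapy.Field()    # default input field of Scrapy's ImagesPipeline
        images = scrapy.Field()        # filled in by the pipeline after download
        manga_name = scrapy.Field()
        chapter_name = scrapy.Field()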