def parse_item(self, response):
    """Scrape one book page into a single ``MangaItem``.

    Every field is filled with the list returned by ``extract()`` for a CSS
    container selector followed by an XPath relative to that container, so a
    selector that matches nothing yields an empty list rather than an error.

    Args:
        response: The page response; must provide ``css()``.

    Yields:
        The populated ``MangaItem``.
    """
    # (item field, CSS selector for the container, XPath relative to it)
    field_queries = (
        ("name", "h1.titre-big", "./text()"),
        ("release_date", "div.date_sortie", "./text()"),
        ("collection", "div.categorie", "./text()"),
        ("cover", "div.item-fiche-livre div.mediao__figure img", "./@src"),
        ("tome", "div.block_infos_techniques div:nth-child(2)", "./text()"),
    )

    record = MangaItem()
    for field, css_query, xpath_query in field_queries:
        record[field] = response.css(css_query).xpath(xpath_query).extract()
    yield record
def parse_image_page(self, response):
    """Build a ``MangaItem`` with the image URLs and names for one chapter page.

    The image XPath is looked up under the ``"image"`` key of ``self.xpath``.
    The manga and chapter names are derived from the page ``<title>``, which is
    split on ``"-"``: the first space-delimited word of the first segment is
    the manga name and the stripped second segment is the chapter name.

    A missing ``<title>`` or a title without ``"-"`` no longer raises; the
    affected name is set to an empty string instead.

    Args:
        response: The page response; must provide ``xpath()``.

    Returns:
        A ``MangaItem`` with ``image_urls``, ``manga_name`` and
        ``chapter_name`` set.
    """
    srcs = response.xpath(self.xpath.get("image")).extract()
    self.log(srcs)

    # default="" keeps a page without a <title> from failing on None.split().
    title_parts = response.xpath("//title/text()").extract_first(default="").split("-")
    manga_name = title_parts[0].split(" ")[0]
    # Only the segment right after the first "-" is used, as before.
    chapter_name = title_parts[1].strip() if len(title_parts) > 1 else ""

    logging.info("downloading %s: %s", manga_name, chapter_name)
    return MangaItem(image_urls=srcs, manga_name=manga_name, chapter_name=chapter_name)
def parsePage(self, response):
    """Yield the item for the current page, then a request for the next page.

    The image path is taken from the first ``mhurl="..."`` match found in the
    page's ``<script>`` elements and prefixed with the image host. When the
    last primary navigation button is labelled ``下一页``, a follow-up request
    for that button's ``href`` is yielded, carrying the same ``book`` and
    ``url`` meta values forward.

    Pages without any primary navigation button (or without an ``href`` on
    them) simply end the chain instead of raising ``IndexError``.

    Args:
        response: The page response; ``response.meta`` must contain ``book``,
            and ``url`` whenever a next page exists.

    Yields:
        A ``MangaItem`` with ``book``, ``page`` and ``src`` set, optionally
        followed by a ``Request`` handled by this same callback.

    Raises:
        IndexError: If no ``mhurl="..."`` assignment is found in the page
            scripts (there is no image to emit in that case).
    """
    book = response.meta['book']

    item = MangaItem()
    item['book'] = book
    item['page'] = response.css(".navigation .button-success::text").get()
    item['src'] = 'http://p17.manhuapan.com/' + response.css('script').re(r'mhurl="(.*?)"')[0]
    yield item

    button_css = '.navigation .pure-button.pure-button-primary'
    labels = response.css(button_css + '::text').getall()
    # The last primary button is the "next page" link on all but the final page.
    if not labels or labels[-1] != '下一页':
        return

    hrefs = response.css(button_css + '::attr("href")').getall()
    if not hrefs:
        return

    base_url = response.meta['url']
    yield Request(
        base_url + hrefs[-1],
        self.parsePage,
        meta={'book': book, 'url': base_url},
    )
def parse_image_page(self, response):
    """Build a ``MangaItem`` with the image URLs and manga name for one page.

    The image XPath is looked up under the ``"image"`` key of ``self.xpath``.
    The manga name is the text of the first ``<h1>``; the chapter name is
    always the empty string.

    Args:
        response: The page response; must provide ``xpath()``.

    Returns:
        A ``MangaItem`` with ``image_urls``, ``manga_name`` and
        ``chapter_name`` set. ``manga_name`` is ``None`` when the page has no
        ``<h1>`` text.
    """
    srcs = response.xpath(self.xpath.get("image")).extract()
    self.log(srcs)

    manga_name = response.xpath("//h1/text()").extract_first()
    # Lazy %-formatting: string concatenation raised TypeError when the page
    # had no <h1> and manga_name was None.
    logging.info("downloading %s", manga_name)

    return MangaItem(image_urls=srcs, manga_name=manga_name, chapter_name="")
def parse_item(self, response):
    """Scrape a manga detail page and chain a request for its hit counter.

    Metadata is read positionally from the ``//table/tr/td`` cells:
    0 synonym, 1 original name, 2 author (link and text), 3 area, 4 status,
    5 hits, 6 type, 7 category. Cells 3, 4, 6 and 7 are merged, in that
    order, into ``tags``. Chapter links are stored as a dict keyed by their
    1-based position in the chapter list.

    The manga id is the first double-quoted run of digits in the fifth
    ``<script>`` element. When it is found, a request for the hits JSON is
    returned with the partially filled item in ``meta["item"]``; otherwise
    the item is returned directly.

    Args:
        response: The page response; must provide ``xpath()`` and ``css()``.

    Returns:
        ``[]`` when the page has fewer than the eight expected table cells,
        a ``Request`` handled by ``self.parse_item2`` when the manga id is
        found, or the ``MangaItem`` itself otherwise.
    """
    manga_attr = response.xpath("//table/tr/td")
    # Cells 0..7 are indexed below, so anything shorter is not a detail page.
    if len(manga_attr) < 8:
        return []

    item = MangaItem()
    item['title'] = response.css(".odd_anim_title_m").xpath(".//h1/text()").extract()
    item['synonym'] = manga_attr[0].xpath("./text()").extract()
    item['og_name'] = manga_attr[1].xpath("./text()").extract()
    item['author_url'] = manga_attr[2].xpath('./a/@href').extract()
    item['author'] = manga_attr[2].xpath("./a/text()").extract()
    item['hits'] = manga_attr[5].xpath("./a/text()").extract()

    # Area, status, type and category cells all contribute to one tag list.
    tags = []
    for cell_index in (3, 4, 6, 7):
        tags += manga_attr[cell_index].xpath("./a/text()").extract()
    item['tags'] = tags

    item['cover'] = response.css("div.anim_intro_ptext > a > img::attr(src)").extract()

    urls = response.css("div.cartoon_online_border > ul > li > a::attr(href)").extract()
    item['chapters'] = dict(enumerate(urls, start=1))

    # The id is expected in the fifth <script>; fall back to returning the
    # item as-is when the page carries fewer scripts or no quoted number.
    scripts = response.xpath("//script")
    id_text = scripts[4].extract() if len(scripts) > 4 else ""
    match = re.search(r"\"([0-9]+)\"", id_text)
    if not match:
        return item

    manga_id = match.group(1)
    return Request(
        url="http://manhua.dmzj.com/hits/" + manga_id + ".json",
        callback=self.parse_item2,
        meta={"item": item},
    )