def parse_detail(self, content, url):
    item = BookInfoItem()
    # "抱歉,页面无法访问..." means "Sorry, the page cannot be accessed..."
    if u'抱歉,页面无法访问...' in content:
        return item
    sel = etree.HTML(content)
    item['folder_url'] = urljoin(
        url,
        sel.xpath('//a[@id="bookImg"]/img/@src')[0].replace('\r', ''))
    item['title'] = sel.xpath('//div[@class="book-info"]/h1/em/text()')[0]
    item['url'] = url
    # drop the two trailing characters appended to the author link text
    item['author'] = sel.xpath(
        '//div[@class="book-info"]/h1/a/text()')[0][:-2]
    item['category'] = sel.xpath(
        '//div[@class="crumbs-nav center1020"]/span/a/text()')[1]
    item['sub_category'] = sel.xpath(
        '//div[@class="crumbs-nav center1020"]/span/a/text()')[2]
    item['status'] = sel.xpath(
        '//span[@class="tag"]/i[@class="blue"]/text()')[0]
    word_count = sel.xpath('//p[@class="total"]/span/text()')[0]
    site = sel.xpath('//p[@class="total"]/em/text()')[0]
    item['word_count'] = int(self.transform_word_count(word_count, site))
    item['introduction'] = self.cleaner.fit_transform('\n'.join(
        sel.xpath(
            '//div[@class="book-information cf"]//p[@class="intro"]/text()'
        )))
    return item
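# BookInfoItem is instantiated above but not defined in this section. A minimal
# sketch of how it might be declared as a Scrapy Item, covering only the fields
# the parsers populate (an assumption; the real project may define it elsewhere
# with more fields or a different base class):
import scrapy


class BookInfoItem(scrapy.Item):
    folder_url = scrapy.Field()     # cover image URL
    title = scrapy.Field()
    url = scrapy.Field()
    author = scrapy.Field()
    category = scrapy.Field()
    sub_category = scrapy.Field()
    status = scrapy.Field()         # serialization status text
    word_count = scrapy.Field()
    introduction = scrapy.Field()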
def parse_detail(self, content, url):
    self.logger.debug('Received info')
    item = BookInfoItem()
    # "抱歉,页面无法访问..." means "Sorry, the page cannot be accessed..."
    if '抱歉,页面无法访问...' in content:
        return item
    sel = etree.HTML(content)
    item['folder_url'] = urljoin(
        url,
        sel.xpath(
            '//div[@class="book-information cf"]/div[1]/a/img/@src')[0]).strip()
    item['title'] = sel.xpath(
        '//div[@class="book-information cf"]/div[2]/h1/em/text()')[0]
    item['url'] = url
    word_count = sel.xpath(
        '//div[@class="book-information cf"]/div[@class="book-info "]/p[3]/em[1]/text()'
    )[0]
    site = sel.xpath(
        '//div[@class="book-information cf"]/div[@class="book-info "]/p[3]/cite[1]/text()'
    )[0]
    item['word_count'] = int(self.transform_word_count(word_count, site))
    item['author'] = sel.xpath(
        '//div[@class="book-information cf"]/div[2]/h1/span/a/text()')[0]
    item['category'] = sel.xpath(
        '//div[@class="book-information cf"]/div[2]/p[1]/a[1]/text()')[0]
    item['sub_category'] = sel.xpath(
        '//div[@class="book-information cf"]/div[2]/p[1]/a[2]/text()')[0]
    item['status'] = sel.xpath(
        '//div[@class="book-information cf"]/div[2]/p[1]/span[1]/text()')[0]
    item['introduction'] = self.cleaner.fit_transform('\n'.join(
        sel.xpath('//div[@class="book-intro"]/p/text()')))
    return item
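# transform_word_count() is called by several parsers but not shown here. A
# hypothetical sketch of what such a helper could look like, assuming the
# second argument carries the unit text scraped next to the number and that
# "万" means ten thousand (not the project's actual implementation):
def transform_word_count(self, word_count, site):
    # strip non-numeric decoration, then scale by the unit if present
    count = float(word_count.strip())
    if u'万' in site:
        count *= 10000
    return count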
def parse_detail(self, content, url):
    item = BookInfoItem()
    # "不存在的网页" means "this page does not exist"
    if u'不存在的网页' in content:
        return item
    sel = etree.HTML(content)
    item['folder_url'] = urljoin(
        url, sel.xpath('//div[@id="fmimg"]/img/@src')[0])
    item['title'] = sel.xpath('//div[@id="info"]/h1/text()')[0]
    item['url'] = url
    item['word_count'] = 0
    # skip the first seven characters, i.e. the label prefix before the name
    item['author'] = sel.xpath('//div[@id="info"]/p/text()')[0][7:]
    item['category'] = sel.xpath('//div[@class="con_top"]/a/text()')[1]
    item['sub_category'] = ''
    item['status'] = sel.xpath(
        '//meta[@property="og:novel:status"]/@content')[0]
    item['introduction'] = self.cleaner.fit_transform('\n'.join(
        sel.xpath('//div[@id="intro"]/p/text()')))
    return item
def parse_detail(self, content, url):
    self.logger.debug('Received info')
    item = BookInfoItem()
    # "出现错误" means "an error occurred"
    if u'出现错误' in content:
        return item
    sel = etree.HTML(content)
    item['folder_url'] = urljoin(
        url, sel.xpath('//img[@class="BookImg"]/@src')[0])
    item['title'] = sel.xpath('//img[@class="BookImg"]/@alt')[0]
    item['url'] = url
    item['author'] = sel.xpath('//h2[@class="BookAuthor"]/a/text()')[0]
    # the node text is a "label：value" pair; keep the part after the colon
    item['category'] = sel.xpath(
        '//h2[@class="BookAuthor"]/text()')[1].strip().split(u'：')[-1]
    item['sub_category'] = ''
    item['word_count'] = 0
    # the status span on this page is unreliable, so leave it empty
    item['status'] = ''  # sel.xpath('//span[@id="adbanner_1"]/text()')[0][:3]
    item['introduction'] = self.cleaner.fit_transform('\n'.join(
        sel.xpath('//h3[@class="BookIntro"]/text()')))
    return item
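# Every parser pushes the introduction through self.cleaner.fit_transform().
# The cleaner itself is not shown; a hypothetical minimal implementation with
# the same interface (the real one may normalise much more than whitespace):
class WhitespaceCleaner(object):
    def fit_transform(self, text):
        # strip stray whitespace around each line and drop blank lines
        lines = (line.strip() for line in text.splitlines())
        return '\n'.join(line for line in lines if line)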
def parse_detail(self, content, url):
    self.logger.debug('Received info')
    item = BookInfoItem()
    # "找不到页面" means "page not found"
    if u'找不到页面' in content:
        return item
    sel = etree.HTML(content)
    item['folder_url'] = urljoin(
        url, sel.xpath('//dl[@class="bookprofile"]/dt/img/@src')[0])
    item['title'] = sel.xpath('//div[@class="title"]/h1/text()')[0]
    item['url'] = url
    item['author'] = sel.xpath('//div[@class="title"]/span/a/text()')[0]
    # the node text is a "label：value" pair; keep the part after the colon
    item['category'] = sel.xpath(
        '//p[@class="sub-cols"]/span/text()')[2].strip().split(u'：')[-1]
    item['sub_category'] = ''
    # the same node supplies both the number (last character trimmed) and the
    # full text from which transform_word_count() picks up the unit
    word_count = sel.xpath('//p[@class="sub-data"]/span/em/text()')[0][:-1]
    site = sel.xpath('//p[@class="sub-data"]/span/em/text()')[0]
    item['word_count'] = int(self.transform_word_count(word_count, site))
    item['status'] = sel.xpath('//p[@class="sub-cols"]/span/text()')[1]
    item['introduction'] = self.cleaner.fit_transform('\n'.join(
        sel.xpath('//dl[@class="introcontent"]/dd/p/text()')))
    return item
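# A rough illustration of how one of these parsers might be driven. The fetch
# layer, the requests dependency, and the fetch_book_info() helper are
# assumptions for the example, not part of the spiders above:
import requests


def fetch_book_info(spider, url):
    # download the detail page and hand the raw HTML plus its URL to the parser
    resp = requests.get(url, timeout=10)
    # many of these detail pages are not UTF-8; let requests guess the encoding
    resp.encoding = resp.apparent_encoding
    return spider.parse_detail(resp.text, url)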