示例#1
0
 def parse_detail(self, content, url):
     """Build a ``BookInfoItem`` from the HTML of a book detail page.

     Args:
         content: Page HTML as text; it is parsed with ``etree.HTML``.
         url: Address of the page. It is stored on the item and used as
             the base when resolving the cover image link.

     Returns:
         The populated item, or an empty item when ``content`` contains
         the "page cannot be accessed" notice.

     Raises:
         IndexError: If an expected node is missing, because XPath
             results are indexed without a length check.
     """
     item = BookInfoItem()
     if u'抱歉,页面无法访问...' in content:
         # Error page: there is nothing to extract.
         return item

     tree = etree.HTML(content)

     def first(expr):
         # First match of ``expr``; raises IndexError when nothing matches.
         return tree.xpath(expr)[0]

     # Carriage returns are removed from the src before it is resolved.
     cover_src = first('//a[@id="bookImg"]/img/@src').replace('\r', '')
     item['folder_url'] = urljoin(url, cover_src)
     item['title'] = first('//div[@class="book-info"]/h1/em/text()')
     item['url'] = url

     # The last two characters of the author text are dropped
     # (presumably a trailing label -- confirm against the live page).
     author_text = first('//div[@class="book-info"]/h1/a/text()')
     item['author'] = author_text[:-2]

     # Breadcrumb link texts: index 1 is the category, index 2 the
     # sub-category.
     crumbs = tree.xpath(
         '//div[@class="crumbs-nav center1020"]/span/a/text()')
     item['category'] = crumbs[1]
     item['sub_category'] = crumbs[2]

     item['status'] = first('//span[@class="tag"]/i[@class="blue"]/text()')

     # Both texts under p.total are handed to transform_word_count
     # (defined elsewhere); its result is cast to int.
     total_span = first('//p[@class="total"]/span/text()')
     total_em = first('//p[@class="total"]/em/text()')
     item['word_count'] = int(
         self.transform_word_count(total_span, total_em))

     intro_lines = tree.xpath(
         '//div[@class="book-information cf"]//p[@class="intro"]/text()')
     item['introduction'] = self.cleaner.fit_transform(
         '\n'.join(intro_lines))
     return item
示例#2
0
 def parse_detail(self, content, url):
     """Extract book metadata from a detail page into a ``BookInfoItem``.

     Args:
         content: Page HTML as text; it is parsed with ``etree.HTML``.
         url: Address of the page. It is stored on the item and used as
             the base when resolving the cover image link.

     Returns:
         The populated item, or an empty item when ``content`` contains
         the "page cannot be accessed" notice.

     Raises:
         IndexError: If an expected node is missing, because XPath
             results are indexed without a length check.
     """
     self.logger.debug('Received info')
     item = BookInfoItem()
     page_unavailable = '抱歉,页面无法访问...' in content
     if page_unavailable:
         return item

     doc = etree.HTML(content)

     cover_srcs = doc.xpath(
         '//div[@class="book-information cf"]/div[1]/a/img/@src')
     # Whitespace is stripped from the joined URL, not from the raw src.
     cover_url = urljoin(url, cover_srcs[0])
     item['folder_url'] = cover_url.strip()

     titles = doc.xpath(
         '//div[@class="book-information cf"]/div[2]/h1/em/text()')
     item['title'] = titles[0]
     item['url'] = url

     # The first <em> and first <cite> of the third paragraph are both
     # handed to transform_word_count (defined elsewhere); its result
     # is cast to int.
     em_texts = doc.xpath(
         '//div[@class="book-information cf"]/div[@class="book-info "]/p[3]/em[1]/text()'
     )
     count_em = em_texts[0]
     cite_texts = doc.xpath(
         '//div[@class="book-information cf"]/div[@class="book-info "]/p[3]/cite[1]/text()'
     )
     count_cite = cite_texts[0]
     item['word_count'] = int(
         self.transform_word_count(count_em, count_cite))

     authors = doc.xpath(
         '//div[@class="book-information cf"]/div[2]/h1/span/a/text()')
     item['author'] = authors[0]

     categories = doc.xpath(
         '//div[@class="book-information cf"]/div[2]/p[1]/a[1]/text()')
     item['category'] = categories[0]

     sub_categories = doc.xpath(
         '//div[@class="book-information cf"]/div[2]/p[1]/a[2]/text()')
     item['sub_category'] = sub_categories[0]

     statuses = doc.xpath(
         '//div[@class="book-information cf"]/div[2]/p[1]/span[1]/text()'
     )
     item['status'] = statuses[0]

     intro_lines = doc.xpath('//div[@class="book-intro"]/p/text()')
     item['introduction'] = self.cleaner.fit_transform(
         '\n'.join(intro_lines))
     return item
示例#3
0
 def parse_detail(self, content, url):
     """Fill a ``BookInfoItem`` from the HTML of a book detail page.

     Word count is fixed at 0 and the sub-category is left empty;
     neither value is read from the page.

     Args:
         content: Page HTML as text; it is parsed with ``etree.HTML``.
         url: Address of the page. It is stored on the item and used as
             the base when resolving the cover image link.

     Returns:
         The populated item, or an empty item when ``content`` contains
         the "page does not exist" notice.

     Raises:
         IndexError: If an expected node is missing, because XPath
             results are indexed without a length check.
     """
     item = BookInfoItem()
     if u'不存在的网页' in content:
         return item

     page = etree.HTML(content)

     cover_src = page.xpath('//div[@id="fmimg"]/img/@src')[0]
     item['folder_url'] = urljoin(url, cover_src)
     item['title'] = page.xpath('//div[@id="info"]/h1/text()')[0]
     item['url'] = url
     item['word_count'] = 0

     # The first seven characters of the first info paragraph are
     # skipped (presumably a label prefix -- confirm against the page).
     info_line = page.xpath('//div[@id="info"]/p/text()')[0]
     item['author'] = info_line[7:]

     # The second link text in the top navigation is used as category.
     nav_links = page.xpath('//div[@class="con_top"]/a/text()')
     item['category'] = nav_links[1]
     item['sub_category'] = ''

     status_values = page.xpath(
         '//meta[@property="og:novel:status"]/@content')
     item['status'] = status_values[0]

     intro_lines = page.xpath('//div[@id="intro"]/p/text()')
     item['introduction'] = self.cleaner.fit_transform(
         '\n'.join(intro_lines))
     return item
示例#4
0
 def parse_detail(self, content, url):
     """Fill a ``BookInfoItem`` from the HTML of a book detail page.

     Sub-category and status are left empty and word count is fixed at
     0; none of these is read from the page.

     Args:
         content: Page HTML as text; it is parsed with ``etree.HTML``.
         url: Address of the page. It is stored on the item and used as
             the base when resolving the cover image link.

     Returns:
         The populated item, or an empty item when ``content`` contains
         the "an error occurred" notice.

     Raises:
         IndexError: If an expected node is missing, because XPath
             results are indexed without a length check.
     """
     self.logger.debug('Received info')
     item = BookInfoItem()
     if u'出现错误' in content:
         return item

     root = etree.HTML(content)

     cover_src = root.xpath('//img[@class="BookImg"]/@src')[0]
     item['folder_url'] = urljoin(url, cover_src)
     # The title is taken from the cover image's alt text.
     item['title'] = root.xpath('//img[@class="BookImg"]/@alt')[0]
     item['url'] = url
     item['author'] = root.xpath('//h2[@class="BookAuthor"]/a/text()')[0]

     # The second bare text node of the author heading is stripped and
     # whatever follows its last ':' becomes the category.
     heading_texts = root.xpath('//h2[@class="BookAuthor"]/text()')
     category_text = heading_texts[1].strip()
     item['category'] = category_text.split(u':')[-1]

     item['sub_category'] = ''
     item['word_count'] = 0
     # Status is not scraped. An earlier version took the first three
     # characters of the text in span#adbanner_1.
     item['status'] = ''

     intro_lines = root.xpath('//h3[@class="BookIntro"]/text()')
     item['introduction'] = self.cleaner.fit_transform(
         '\n'.join(intro_lines))
     return item
示例#5
0
 def parse_detail(self, content, url):
     """Fill a ``BookInfoItem`` from the HTML of a book detail page.

     The sub-category is left empty; it is not read from the page.

     Args:
         content: Page HTML as text; it is parsed with ``etree.HTML``.
         url: Address of the page. It is stored on the item and used as
             the base when resolving the cover image link.

     Returns:
         The populated item, or an empty item when ``content`` contains
         the "page not found" notice.

     Raises:
         IndexError: If an expected node is missing, because XPath
             results are indexed without a length check.
     """
     self.logger.debug('Received info')
     item = BookInfoItem()
     if u'找不到页面' in content:
         return item

     tree = etree.HTML(content)

     cover_src = tree.xpath('//dl[@class="bookprofile"]/dt/img/@src')[0]
     item['folder_url'] = urljoin(url, cover_src)
     item['title'] = tree.xpath('//div[@class="title"]/h1/text()')[0]
     item['url'] = url
     item['author'] = tree.xpath('//div[@class="title"]/span/a/text()')[0]

     # The "sub-cols" span texts feed two fields: index 2 gives the
     # category (text after its last ':'), index 1 gives the status.
     sub_cols = tree.xpath('//p[@class="sub-cols"]/span/text()')
     item['category'] = sub_cols[2].strip().split(u':')[-1]
     item['sub_category'] = ''

     # The same <em> text supplies both arguments of
     # transform_word_count (defined elsewhere): first the text without
     # its final character, then the full text. The result is cast to int.
     em_text = tree.xpath('//p[@class="sub-data"]/span/em/text()')[0]
     item['word_count'] = int(
         self.transform_word_count(em_text[:-1], em_text))

     item['status'] = sub_cols[1]

     intro_lines = tree.xpath('//dl[@class="introcontent"]/dd/p/text()')
     item['introduction'] = self.cleaner.fit_transform(
         '\n'.join(intro_lines))
     return item