def get_menu(self, url=None):
    """Yield chapter dicts from the novel's menu page.

    Each yielded dict has ``source_url`` (absolute URL, built from
    ``self._base_url``) and ``title``.  If the menu page has not been
    fetched yet (``self._novel_html is None``), *url* is fetched and
    parsed first; with neither a cached page nor a *url*, the generator
    produces nothing.
    """
    if self._novel_html is None:
        if not url:
            # NOTE(review): `return False` inside a generator only stops
            # iteration (the False is the StopIteration value, invisible
            # to a plain for-loop) — callers cannot observe it directly.
            return False
        page = parse_url(url, 'utf-8', params=self._params)
        self._novel_html = etree.HTML(page)
    # Skip the first 9 entries — presumably "latest chapters" duplicates
    # at the top of the list; TODO confirm against the site layout.
    hrefs = self._novel_html.xpath('//*[@id="list"]/dd/a/@href')[9:]
    names = self._novel_html.xpath('//*[@id="list"]/dd/a/text()')[9:]
    for href, name in zip(hrefs, names):
        yield {'source_url': self._base_url + href, 'title': name}
def get_info(self, novel_url):
    """Fetch a novel's detail page (gbk-encoded) and return its metadata.

    Caches the parsed page on ``self._novel_html`` for later use, then
    returns a dict with ``image`` (absolute cover URL), ``intro``
    (first intro paragraph) and ``state`` (derived by
    ``self._novel_state`` from the update-info line).
    """
    body = parse_url(novel_url, 'gbk', params=self._params)
    page = etree.HTML(body)
    self._novel_html = page
    cover = page.xpath('//*[@id="fmimg"]/img/@src')[0]
    summary = page.xpath('//*[@id="intro"]/p/text()')[0]
    latest = page.xpath('//*[@id="info"]/p[3]/text()')[0]
    return {
        'image': self._base_url + cover,
        'intro': summary,
        'state': self._novel_state(latest),
    }
def get_info(self, novel_url):
    """Fetch a novel's detail page (utf-8) and return its metadata.

    Caches the parsed page on ``self._novel_html``, then returns a dict
    with ``image`` (absolute cover URL), ``intro`` (third text node of
    the intro element) and ``state``.  The update-info line looks like
    ``label:YYYY-MM-DD hh:mm`` — the date part between the first colon
    and the first space is handed to ``self._novel_state``; TODO confirm
    the exact site format.
    """
    body = parse_url(novel_url, 'utf-8', params=self._params)
    page = etree.HTML(body)
    self._novel_html = page
    cover = page.xpath('//*[@id="fmimg"]/a/img/@src')[0]
    summary = page.xpath('//*[@id="intro"]/text()')[2]
    latest = page.xpath('//*[@id="info"]/p[3]/text()')[0]
    date_part = latest.split(':')[1].split(' ')[0]
    return {
        'image': self._base_url + cover,
        'intro': summary,
        'state': self._novel_state(date_part),
    }
def get_url(self, category):
    """Yield every novel listed under *category* on the xs147 site.

    Each yielded dict has ``source_url``, ``name`` and ``author``
    (leading ``/`` separators stripped from the author text node).
    """
    listing_url = self._xs147.format(category=category)
    page = etree.HTML(parse_url(listing_url, 'utf-8', params=self._params))
    # All three node sets come from the same <li> elements, so the
    # lists stay aligned for zip().
    item = '//*[@id="main"]/div[@class="novelslist"]/div[1]/ul/li'
    hrefs = page.xpath(item + '/a/@href')
    titles = page.xpath(item + '/a/text()')
    writers = page.xpath(item + '/text()')
    for href, title, writer in zip(hrefs, titles, writers):
        yield {
            'source_url': href,
            'name': title,
            'author': writer.lstrip('/'),
        }
def get_url(self, category):
    """Yield every novel under *category* on the biquge site.

    The listing page has two columns (``div.l`` and ``div.r``); both
    are scraped and concatenated, left column first.  Note the author
    sits in ``span[3]`` on the left column but ``span[2]`` on the
    right — presumably the left column carries an extra span; TODO
    confirm against the live markup.
    """
    listing_url = self._biquge.format(category=category)
    page = etree.HTML(parse_url(listing_url, 'gbk', params=self._params))
    left = '//*[@id="newscontent"]/div[@class="l"]/ul/li'
    right = '//*[@id="newscontent"]/div[@class="r"]/ul/li'
    hrefs = (page.xpath(left + '/span[1]/a/@href')
             + page.xpath(right + '/span[1]/a/@href'))
    titles = (page.xpath(left + '/span[1]/a/text()')
              + page.xpath(right + '/span[1]/a/text()'))
    writers = (page.xpath(left + '/span[3]/text()')
               + page.xpath(right + '/span[2]/text()'))
    for href, title, writer in zip(hrefs, titles, writers):
        yield {
            'source_url': href,
            'name': title,
            'author': writer,
        }
def get_chapter(self, chapter_url):
    """Return the chapter's body text as one string.

    Fetches *chapter_url* (gbk-encoded), collects every direct text
    node of the ``#content`` element and joins them without separators.
    """
    body = parse_url(chapter_url, 'gbk', params=self._params)
    page = etree.HTML(body)
    fragments = page.xpath('//*[@id="content"]/text()')
    return ''.join(fragments)