Exemplo n.º 1
0
class LLXSCrawler(Crawler):
    """Crawler for the LLXS novel site.

    Pages on this site are GBK-encoded; every fetch goes through
    ``_fetch_utf8`` so the rest of the pipeline works on UTF-8 text.
    """

    def __init__(self):
        Crawler.__init__(self)
        self.parser = LLXSParser()

    def _fetch_utf8(self, url):
        """Fetch *url* as GBK and return the page converted to UTF-8.

        Factored out because the fetch+convert pair was repeated in
        ``crawl`` (twice) and ``crawl_content``.
        """
        html = utils.http_get(url, encode='gbk')
        return self.parser.to_utf8(html)

    def crawl(self, url):
        """Crawl the novel page at *url* and return a novel dict.

        The returned dict comes from the parser's ``parse_novel_page``
        and is augmented with ``chapter_list``, ``chapters``,
        ``update_time`` and ``last_chapter``.
        """
        # Novel metadata page; 'list_url' points at the chapter index.
        html = self._fetch_utf8(url)
        novel = self.parser.parse_novel_page(url, html)
        list_url = novel['list_url']

        # Chapter index page.
        html = self._fetch_utf8(list_url)
        chapter_list = self.parser.parse_list_page(list_url, html)
        novel['chapter_list'] = chapter_list
        novel['chapters'] = len(chapter_list)
        novel['update_time'] = time.time()
        # Guard: an empty index would otherwise raise IndexError on [-1].
        novel['last_chapter'] = chapter_list[-1]['url'] if chapter_list else None
        return novel

    def crawl_content(self, url):
        """Fetch a single chapter page and return its parsed content."""
        html = self._fetch_utf8(url)
        return self.parser.parse_content_page(url, html)
Exemplo n.º 2
0
 def __init__(self):
     """Set up base-crawler state and attach the LLXS-specific parser."""
     # Explicit base-class call (old-style; equivalent to super().__init__()
     # for single inheritance from Crawler).
     Crawler.__init__(self)
     self.parser = LLXSParser()