Пример #1
0
 def create_single_html_book(self, book_package):
     u"""
     Render all pages of a book package into one standalone HTML file.

     The file is written as ``<title>.html`` inside a freshly recreated
     ``<title>`` directory under ``Path.result_path``; the book's images and
     the shared stylesheets are copied next to it.

     :param book_package: object exposing ``get_title()`` and ``book_list``;
         every book exposes ``page_list`` and every page a ``content`` attribute
     :return: the book title, or None when the title is empty
     """
     title = book_package.get_title()
     if not title:
         # Skip books without a title: u'./' + title would then name the
         # result directory itself and the rmdir below would hit it
         # (the "rm -rf /" disaster the original author warns about).
         return
     book_dir = u'./' + title
     Path.reset_path()
     try:
         Path.chdir(Path.result_path)
         Path.rmdir(book_dir)
         Path.mkdir(book_dir)
         Path.chdir(book_dir)

         pages = [page for book in book_package.book_list for page in book.page_list]
         body = u' \r\n '.join([Match.html_body(page.content) for page in pages])
         # The single-file layout keeps the images in ./images beside the HTML file.
         body = body.replace(u'../images/', u'./images/')

         with open(TemplateConfig.content_base_uri) as template:
             # The stylesheets are copied beside the HTML file, so the ../style/ prefix goes away.
             content = template.read().format(title=title, body=body).replace(u'../style/', u'./')
         with open(title + u'.html', 'w') as html:
             html.write(content)

         Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
         for css_name in (u'customer.css', u'markdown.css', u'normalize.css'):
             Path.copy(Path.www_css + u'/' + css_name, u'./' + css_name)
         # Path.copy(Path.www_css + u'/article.css', u'./article.css')         # TODO: needs slimming down
     finally:
         # Run the final reset even when rendering or copying fails
         # (reset_path presumably restores the working directory -- confirm).
         Path.reset_path()
     return title
Пример #2
0
 def create_single_html_book(self, book_package):
     u"""
     Render all pages of a book package into one standalone HTML file.

     The file is written as ``<title>.html`` inside a freshly recreated
     ``<title>`` directory under ``Path.result_path``; the book's images and
     the shared stylesheets are copied next to it.

     :param book_package: object exposing ``get_title()`` and ``book_list``;
         every book exposes ``page_list`` and every page a ``content`` attribute
     :return: the book title, or None when the title is empty
     """
     title = book_package.get_title()
     if not title:
         # Skip books without a title: u'./' + title would then name the
         # result directory itself and the rmdir below would hit it
         # (the "rm -rf /" disaster the original author warns about).
         return
     book_dir = u'./' + title
     Path.reset_path()
     try:
         Path.chdir(Path.result_path)
         Path.rmdir(book_dir)
         Path.mkdir(book_dir)
         Path.chdir(book_dir)

         pages = [page for book in book_package.book_list for page in book.page_list]
         body = u' \r\n '.join([Match.html_body(page.content) for page in pages])
         # The single-file layout keeps the images in ./images beside the HTML file.
         body = body.replace(u'../images/', u'./images/')

         with open(TemplateConfig.content_base_uri) as template:
             # The stylesheets are copied beside the HTML file, so the ../style/ prefix goes away.
             content = template.read().format(title=title, body=body).replace(u'../style/', u'./')
         with open(title + u'.html', 'w') as html:
             html.write(content)

         Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
         for css_name in (u'customer.css', u'markdown.css', u'normalize.css'):
             Path.copy(Path.www_css + u'/' + css_name, u'./' + css_name)
         # Path.copy(Path.www_css + u'/article.css', u'./article.css')         # TODO: needs slimming down
     finally:
         # Run the final reset even when rendering or copying fails
         # (reset_path presumably restores the working directory -- confirm).
         Path.reset_path()
     return title
Пример #3
0
        def parse_sinablog_author(command):
            u"""
            Build the task for a Sina blog author.

            :param command: home page URL of a Sina blog author
            :return: SingleTask with the spider URLs and the book filters filled in
            """
            people_id = Match.sinablog_author(command).group('sinablog_people_id')
            Debug.logger.debug(u"sinablog_people_id:" + str(people_id))

            task = SingleTask()
            task.author_id = people_id
            task.kind = 'sinablog_author'

            # Article list, home page and profile page all derive from the same id.
            article_list_url = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'
            home_url = 'http://blog.sina.com.cn/u/{}'
            profile_url = 'http://blog.sina.com.cn/s/profile_{}.html'
            task.spider.href_article_list = article_list_url.format(people_id)
            task.spider.href = home_url.format(people_id)
            task.spider.href_profile = profile_url.format(people_id)

            task.book.kind = 'sinablog_author'
            task.book.sql.info_extra = 'creator_id = "{}"'.format(people_id)
            task.book.sql.article_extra = 'author_id = "{}"'.format(people_id)
            task.book.author_id = people_id
            return task
Пример #4
0
        def parse_taoguba_author(command):
            u"""
            Build the task for a Taoguba thread.

            :param command: URL of the first page of a Taoguba thread
            :return: SingleTask whose spider URLs all point at ``command``
            """
            article_id = Match.taoguba_article(command).group('article_id')
            Debug.logger.debug(u"taoguba_article_id:" + str(article_id))

            task = SingleTask()
            # The article id is stored in every author_id slot of this task.
            task.author_id = article_id
            task.kind = 'taoguba_author'

            # todo article info -- list, home and profile URLs are all the raw command for now
            task.spider.href_article_list = command
            task.spider.href = command
            task.spider.href_profile = command

            task.book.kind = 'taoguba_author'
            creator_filter = 'creator_id = "{}"'
            author_filter = 'author_id = "{}"'
            task.book.sql.info_extra = creator_filter.format(article_id)
            task.book.sql.article_extra = author_filter.format(article_id)
            task.book.author_id = article_id
            return task
Пример #5
0
        def parse_author(command):
            u"""
            Build the task for a Zhihu author.

            Loads the saved login token and looks the author up through
            ZhihuClient, because the SQL queries filter on ``people.id`` rather
            than on the id taken from the URL. Exits the process when the token
            file cannot be read or the login has expired.

            :param command: author URL that ``Match.author`` matches
            :return: SingleTask with the profile URL and the SQL queries
            """
            url_author_id = Match.author(command).group('author_id')

            task = SingleTask()
            task.kind = 'author'
            task.spider.href = 'https://www.zhihu.com/people/{}'.format(url_author_id)
            task.book.kind = 'author'

            client = ZhihuClient()
            try:
                client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
            except IOError:
                # Token file missing or unreadable: ask the user to log in first.
                print(u"没有找到登录信息文件,请先登录")
                sys.exit()
            except NeedLoginException:
                # Stored login expired: ask the user to log in again.
                print(u"登录信息过期,请重新登录")
                sys.exit()

            people = client.people(url_author_id)
            # Read one attribute before using .id -- workaround the original
            # author references as zhihu-oauth, issues #4.
            _ = people.follower_count
            hashed_id = people.id

            task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(hashed_id)
            # The continuation-line indentation is part of the SQL literal; keep it as is.
            task.book.sql.question = 'select * from Question where question_id in (select question_id from \
            Answer where author_id = "{}")'.format(hashed_id)
            task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(hashed_id)
            return task
Пример #6
0
 def parse_column(command):
     u"""
     Build the task for a Zhihu column.

     :param command: column URL that ``Match.column`` matches
     :return: SingleTask with the column page URL and its SQL queries
     """
     column_id = Match.column(command).group('column_id')

     info_sql = 'select * from ColumnInfo where column_id = "{}" '
     article_sql = 'select * from Article where column_id = "{}" '

     task = SingleTask()
     task.kind = 'column'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
     task.book.kind = 'column'
     task.book.sql.info = info_sql.format(column_id)
     # No question query for columns: the articles go through sql.answer.
     task.book.sql.question = ''
     task.book.sql.answer = article_sql.format(column_id)
     return task
Пример #7
0
        def parse_question(command):
            u"""
            Build the task for a single Zhihu question.

            :param command: question URL that ``Match.question`` matches
            :return: SingleTask with the question URL and SQL filter fragments
            """
            question_id = Match.question(command).group('question_id')

            # sql.info keeps its surrounding blanks; question/answer use the bare filter.
            padded_filter = ' question_id = "{}" '.format(question_id)
            bare_filter = 'question_id = "{}"'.format(question_id)

            task = SingleTask()
            task.kind = 'question'
            task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
            task.book.kind = 'question'
            task.book.sql.info = padded_filter
            task.book.sql.question = bare_filter
            task.book.sql.answer = bare_filter
            return task
Пример #8
0
 def parse_article(command):
     u"""
     Build the task for a single Zhihu column article.

     :param command: article URL that ``Match.article`` matches
     :return: SingleTask with the article URL and SQL filter fragments
     """
     matched = Match.article(command)
     column_id = matched.group('column_id')
     article_id = matched.group('article_id')

     # Info and answer share the same column/article filter.
     article_filter = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)

     task = SingleTask()
     task.kind = 'article'
     task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
     task.book.kind = 'article'
     task.book.sql.info = article_filter
     task.book.sql.question = ''
     task.book.sql.answer = article_filter
     return task
Пример #9
0
 def parse_jianshu_notebooks(command):
     u"""
     Build the task for a Jianshu notebook.

     :param command: notebook URL that ``Match.jianshu_notebooks`` matches
     :return: SingleTask with the notebook URL and its SQL queries
     """
     notebooks_id = Match.jianshu_notebooks(command).group('notebooks_id')

     info_sql = 'select * from jianshu_notebooks_info where notebooks_id = "{}"'
     article_sql = (
         'select * from jianshu_article where href in (select href from '
         'jianshu_notebooks_index where notebooks_id = "{}")'
     )

     task = SingleTask()
     task.kind = 'jianshu_notebooks'
     # Original author's open question: should this URL come from a config file?
     task.spider.href = 'http://www.jianshu.com/notebooks/{}/latest'.format(notebooks_id)
     task.book.kind = 'jianshu_notebooks'
     task.book.sql.info = info_sql.format(notebooks_id)
     task.book.sql.answer = article_sql.format(notebooks_id)
     return task
Пример #10
0
 def parse_jianshu_collection(command):
     u"""
     Build the task for a Jianshu collection.

     :param command: collection URL that ``Match.jianshu_collection`` matches
     :return: SingleTask with the collection URL and its SQL queries
     """
     fake_id = Match.jianshu_collection(command).group('collection_id')

     info_sql = 'select * from jianshu_collection_info where collection_fake_id = "{}"'
     article_sql = (
         'select * from jianshu_article where href in (select href from '
         'jianshu_collection_index where collection_fake_id = "{}")'
     )

     task = SingleTask()
     task.kind = 'jianshu_collection'
     task.spider.href = 'http://www.jianshu.com/collection/{}'.format(fake_id)
     task.book.kind = 'jianshu_collection'
     # The id taken from the URL is matched against the collection_fake_id column.
     task.book.sql.info = info_sql.format(fake_id)
     task.book.sql.answer = article_sql.format(fake_id)
     return task
Пример #11
0
 def parse_topic(command):
     u"""
     Build the task for a Zhihu topic.

     :param command: topic URL that ``Match.topic`` matches
     :return: SingleTask with the topic URL and its SQL queries
     """
     topic_id = Match.topic(command).group('topic_id')

     info_sql = 'select * from TopicInfo where topic_id = "{}"'
     question_sql = (
         'select * from Question where question_id in (select question_id from '
         'Answer where href in (select href from TopicIndex where topic_id = "{}"))'
     )
     answer_sql = (
         'select * from Answer where href in (select href from '
         'TopicIndex where topic_id = "{}")'
     )

     task = SingleTask()
     task.kind = 'topic'
     task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
     task.book.kind = 'topic'
     task.book.sql.info = info_sql.format(topic_id)
     task.book.sql.question = question_sql.format(topic_id)
     task.book.sql.answer = answer_sql.format(topic_id)
     return task
Пример #12
0
 def parse_collection(command):
     u"""
     Build the task for a Zhihu collection.

     :param command: collection URL that ``Match.collection`` matches
     :return: SingleTask with the collection URL and its SQL queries
     """
     matched = Match.collection(command)
     collection_id = matched.group('collection_id')

     task = SingleTask()
     task.kind = 'collection'
     task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
     task.book.kind = 'collection'

     info_sql = 'select * from CollectionInfo where collection_id = "{}"'
     task.book.sql.info = info_sql.format(collection_id)
     # The continuation-line indentation below is part of the SQL literals; keep it as is.
     task.book.sql.question = 'select * from Question where question_id in (select question_id from \
     Answer where href in (select href from CollectionIndex where collection_id = "{}"))'.format(collection_id)
     task.book.sql.answer = 'select * from Answer where href in (select href from \
     CollectionIndex where collection_id = "{}")'.format(collection_id)
     return task
Пример #13
0
        def parse_jianshu_author(command):
            u"""
            Build the task for a Jianshu author.

            :param command: the author's home page,
                e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles
            :return: SingleTask with the spider URL and the book filters filled in
            """
            author_id = Match.jianshu_author(command).group('jianshu_id')

            creator_filter = 'creator_id = "{}"'
            author_filter = 'author_id = "{}"'

            task = SingleTask()
            task.author_id = author_id
            task.kind = 'jianshu_author'
            task.spider.href = 'http://www.jianshu.com/users/{}/latest_articles'.format(author_id)
            task.book.kind = 'jianshu_author'
            task.book.sql.info_extra = creator_filter.format(author_id)
            task.book.sql.article_extra = author_filter.format(author_id)
            task.book.author_id = author_id
            return task
Пример #14
0
        def parse_csdnblog_author(command):
            u"""
            Build the task for a CSDN blog author.

            :param command: the author's home page, e.g. http://blog.csdn.net/elton_xiao
            :return: SingleTask with the spider URL and the book filters filled in
            """
            author_id = Match.csdnblog_author(command).group('csdnblog_author_id')

            creator_filter = 'creator_id = "{}"'
            author_filter = 'author_id = "{}"'

            task = SingleTask()
            # Original author's open question: is task.author_id needed at all?
            task.author_id = author_id
            task.kind = 'csdnblog_author'
            task.spider.href = 'http://blog.csdn.net/{}'.format(author_id)
            task.book.kind = 'csdnblog_author'
            task.book.sql.info_extra = creator_filter.format(author_id)
            task.book.sql.article_extra = author_filter.format(author_id)
            task.book.author_id = author_id
            return task
Пример #15
0
        def parse_cnblogs_author(command):
            u"""
            Build the task for a cnblogs author.

            :param command: the author's home page, e.g. http://www.cnblogs.com/buptzym/
            :return: SingleTask with the spider URL and the SQL queries filled in
            """
            author_id = Match.cnblogs_author(command).group('cnblogs_id')

            info_sql = 'select * from cnblogs_author_info where creator_id = "{}"'
            article_sql = 'select * from cnblogs_article where author_id = "{}"'

            task = SingleTask()
            task.kind = 'cnblogs_author'
            task.spider.href = 'http://www.cnblogs.com/{}/'.format(author_id)
            task.book.kind = 'cnblogs_author'
            task.book.sql.info = info_sql.format(author_id)
            task.book.sql.answer = article_sql.format(author_id)
            # task.book.sql.info_extra = 'creator_id = "{}"'.format(cnblogs_author_id)
            # task.book.sql.article_extra = 'author_id = "{}"'.format(cnblogs_author_id)
            task.book.author_id = author_id
            return task
Пример #16
0
        def parse_sinablog_author(command):
            u"""
            Build the task for a Sina blog author.

            :param command: home page URL of a Sina blog author
            :return: SingleTask with the spider URLs and the book filters filled in
            """
            people_id = Match.sinablog_author(command).group('sinablog_people_id')
            Debug.logger.debug(u"sinablog_people_id:" + str(people_id))

            task = SingleTask()
            task.author_id = people_id
            task.kind = 'sinablog_author'

            # Article list, home page and profile page all derive from the same id.
            article_list_url = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'
            home_url = 'http://blog.sina.com.cn/u/{}'
            profile_url = 'http://blog.sina.com.cn/s/profile_{}.html'
            task.spider.href_article_list = article_list_url.format(people_id)
            task.spider.href = home_url.format(people_id)
            task.spider.href_profile = profile_url.format(people_id)

            task.book.kind = 'sinablog_author'
            task.book.sql.info_extra = 'creator_id = "{}"'.format(people_id)
            task.book.sql.article_extra = 'author_id = "{}"'.format(people_id)
            task.book.author_id = people_id
            return task
Пример #17
0
    def parse_command(raw_command=''):
        u"""
        Analyse a single command and return the task to carry out.

        The kind of the command is detected with ``Match.detect_recipe_kind`` and
        the matching nested parser builds the task. Kinds without a registered
        parser are handled like 'unknown': the command is logged and None is
        returned.

        :param raw_command: original URL, e.g. http://blog.sina.com.cn/u/1287694611
        :return: task, or None when the command cannot be parsed
        Task layout
        *   kind
            *   string, see TypeClass.type_list
        *   spider
            *   href
                *   original URL, e.g. http://www.zhihu.com/question/33578941
                *   no trailing '/'
        *   book
            *   kind
            *   info
            *   question
            *   answer
        """

        def parse_question(command):
            u"""Build the task for a single Zhihu question."""
            result = Match.question(command)
            question_id = result.group('question_id')
            task = SingleTask()
            task.kind = 'question'

            task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
            task.book.kind = 'question'
            task.book.sql.info = ' question_id = "{}" '.format(question_id)
            task.book.sql.question = 'question_id = "{}"'.format(question_id)
            task.book.sql.answer = 'question_id = "{}"'.format(question_id)
            return task

        def parse_answer(command):
            u"""Build the task for a single Zhihu answer."""
            result = Match.answer(command)
            question_id = result.group('question_id')
            answer_id = result.group('answer_id')
            task = SingleTask()
            task.kind = 'answer'
            task.spider.href = 'https://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)

            task.book.kind = 'answer'
            task.book.sql.info = ' question_id = "{}" '.format(question_id)
            task.book.sql.question = ' question_id = "{}" '.format(question_id)
            task.book.sql.answer = ' question_id = "{}" and answer_id = "{}" '.format(question_id, answer_id)
            return task

        def parse_author(command):
            u"""
            Build the task for a Zhihu author.

            Loads the saved login token and resolves the author through
            ZhihuClient; exits the process when the token file cannot be read
            or the login has expired.
            """
            result = Match.author(command)
            author_id = result.group('author_id')
            task = SingleTask()
            task.kind = 'author'
            task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
            task.book.kind = 'author'

            client = ZhihuClient()
            try:
                client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
            except IOError:
                # Token file missing or unreadable: ask the user to log in first.
                print(u"没有找到登录信息文件,请先登录")
                sys.exit()
            except NeedLoginException:
                # Stored login expired: ask the user to log in again.
                print(u"登录信息过期,请重新登录")
                sys.exit()
            people_oauth = client.people(author_id)
            _ = people_oauth.follower_count    # zhihu-oauth, issues #4
            # The SQL queries filter on people.id, not on the id taken from the URL.
            author_id_hash = people_oauth.id
            task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
            task.book.sql.question = 'select * from Question where question_id in (select question_id from \
            Answer where author_id = "{}")'.format(author_id_hash)
            task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
            return task

        def parse_collection(command):
            u"""Build the task for a Zhihu collection."""
            result = Match.collection(command)
            collection_id = result.group('collection_id')
            task = SingleTask()
            task.kind = 'collection'
            task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
            task.book.kind = 'collection'
            task.book.sql.info = 'select * from CollectionInfo where collection_id = "{}"'.format(
                collection_id
            )
            task.book.sql.question = 'select * from Question where question_id in (select question_id from \
            Answer where href in (select href from CollectionIndex where collection_id = "{}"))'.format(collection_id)
            task.book.sql.answer = 'select * from Answer where href in (select href from \
            CollectionIndex where collection_id = "{}")'.format(collection_id)
            return task

        def parse_topic(command):
            u"""Build the task for a Zhihu topic."""
            result = Match.topic(command)
            topic_id = result.group('topic_id')
            task = SingleTask()
            task.kind = 'topic'
            task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
            task.book.kind = 'topic'
            task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(topic_id)
            task.book.sql.question = 'select * from Question where question_id in (select question_id from ' + \
                'Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(topic_id)
            task.book.sql.answer = 'select * from Answer where href in (select href from ' + \
                'TopicIndex where topic_id = "{}")'.format(topic_id)
            return task

        def parse_article(command):
            u"""Build the task for a single Zhihu column article."""
            result = Match.article(command)
            column_id = result.group('column_id')
            article_id = result.group('article_id')
            task = SingleTask()
            task.kind = 'article'
            task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
            task.book.kind = 'article'
            task.book.sql.info = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
            task.book.sql.question = ''
            task.book.sql.answer = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
            return task

        def parse_column(command):
            u"""Build the task for a Zhihu column."""
            result = Match.column(command)
            column_id = result.group('column_id')
            task = SingleTask()
            task.kind = 'column'
            task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
            task.book.kind = 'column'
            task.book.sql.info = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
            task.book.sql.question = ''
            task.book.sql.answer = 'select * from Article where column_id = "{}" '.format(column_id)
            return task

        def parse_sinablog_author(command):
            u"""
            Build the task for a Sina blog author.

            :param command: home page URL of a Sina blog author
            :return: task:
            """
            result = Match.sinablog_author(command)
            sinablog_author_id = result.group('sinablog_people_id')
            Debug.logger.debug(u"sinablog_people_id:" + str(sinablog_author_id))
            task = SingleTask()

            task.author_id = sinablog_author_id
            task.kind = 'sinablog_author'
            task.spider.href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.\
                format(sinablog_author_id)
            task.spider.href = 'http://blog.sina.com.cn/u/{}'.format(sinablog_author_id)
            task.spider.href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(sinablog_author_id)
            task.book.kind = 'sinablog_author'
            task.book.sql.info_extra = 'creator_id = "{}"'.format(sinablog_author_id)
            task.book.sql.article_extra = 'author_id = "{}"'.format(sinablog_author_id)
            task.book.author_id = sinablog_author_id
            return task

        def parse_jianshu_author(command):
            u"""
            Build the task for a Jianshu author.

            :param command: homepage of someone, e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles
            :return: task:
            """
            result = Match.jianshu_author(command)
            jianshu_id = result.group('jianshu_id')

            task = SingleTask()
            task.author_id = jianshu_id
            task.kind = 'jianshu_author'
            task.spider.href = 'http://www.jianshu.com/users/{}/latest_articles'.format(jianshu_id)
            task.book.kind = 'jianshu_author'
            task.book.sql.info_extra = 'creator_id = "{}"'.format(jianshu_id)
            task.book.sql.article_extra = 'author_id = "{}"'.format(jianshu_id)
            task.book.author_id = jianshu_id
            return task

        def parse_jianshu_collection(command):
            u"""Build the task for a Jianshu collection."""
            result = Match.jianshu_collection(command)
            collection_id = result.group('collection_id')
            task = SingleTask()
            task.kind = 'jianshu_collection'
            task.spider.href = 'http://www.jianshu.com/collection/{}'.format(collection_id)
            task.book.kind = 'jianshu_collection'
            task.book.sql.info = 'select * from jianshu_collection_info where collection_fake_id = "{}"'.format(
                collection_id
            )
            task.book.sql.answer = 'select * from jianshu_article where href in (select href from ' + \
                'jianshu_collection_index where collection_fake_id = "{}")'.format(collection_id)
            return task

        def parse_jianshu_notebooks(command):
            u"""Build the task for a Jianshu notebook."""
            result = Match.jianshu_notebooks(command)
            notebooks_id = result.group('notebooks_id')
            task = SingleTask()
            task.kind = 'jianshu_notebooks'
            task.spider.href = 'http://www.jianshu.com/notebooks/{}/latest'.format(notebooks_id)  # config file???
            task.book.kind = 'jianshu_notebooks'
            task.book.sql.info = 'select * from jianshu_notebooks_info where notebooks_id = "{}"'.format(
                notebooks_id
            )
            task.book.sql.answer = 'select * from jianshu_article where href in (select href from ' + \
                'jianshu_notebooks_index where notebooks_id = "{}")'.format(notebooks_id)
            return task

        def parse_cnblogs_author(command):
            u"""
            Build the task for a cnblogs author.

            :param command: home page, e.g. http://www.cnblogs.com/buptzym/
            :return: task
            """
            result = Match.cnblogs_author(command)
            cnblogs_author_id = result.group('cnblogs_id')
            task = SingleTask()
            task.kind = 'cnblogs_author'
            task.spider.href = 'http://www.cnblogs.com/{}/'.format(cnblogs_author_id)
            task.book.kind = 'cnblogs_author'
            task.book.sql.info = 'select * from cnblogs_author_info where creator_id = "{}"'.format(cnblogs_author_id)
            task.book.sql.answer = 'select * from cnblogs_article where author_id = "{}"'.format(cnblogs_author_id)
            # task.book.sql.info_extra = 'creator_id = "{}"'.format(cnblogs_author_id)
            # task.book.sql.article_extra = 'author_id = "{}"'.format(cnblogs_author_id)
            task.book.author_id = cnblogs_author_id
            return task

        def parse_csdnblog_author(command):
            u"""
            Build the task for a CSDN blog author.

            :param command: homepage of someone, e.g. http://blog.csdn.net/elton_xiao
            :return: task
            """
            result = Match.csdnblog_author(command)
            csdnblog_author_id = result.group('csdnblog_author_id')

            task = SingleTask()
            task.author_id = csdnblog_author_id     # ??? don't need?
            task.kind = 'csdnblog_author'
            task.spider.href = 'http://blog.csdn.net/{}'.format(csdnblog_author_id)
            task.book.kind = 'csdnblog_author'
            task.book.sql.info_extra = 'creator_id = "{}"'.format(csdnblog_author_id)
            task.book.sql.article_extra = 'author_id = "{}"'.format(csdnblog_author_id)
            task.book.author_id = csdnblog_author_id
            return task

        def parse_generic(command):
            u"""
            Build the task for the kinds registered with the generic parser
            ('yiibai' and 'talkpython' in the table below).

            :param command: original URL; it is also used as the id in the SQL queries
            :return: task
            """
            from .tools.type import Type
            task = SingleTask()
            # Every known type is tried; the last matcher that accepts the
            # command decides author_id and kind.
            for command_type in Type.type_list:
                result = getattr(Match, command_type)(command)
                if result:
                    task.author_id = result.group('subject_id')
                    task.kind = command_type
            task.spider.href = command
            task.book.kind = task.kind
            task.book.sql.info = 'select * from generic_info where creator_id = "{}"'.format(command)
            task.book.sql.answer = 'select * from generic_article where author_id = "{}"'.format(command)
            # The book is keyed by the URL itself rather than by the extracted subject id.
            task.book.author_id = task.spider.href
            return task

        def parse_error(command):
            u"""Log a non-empty command that could not be parsed; always returns None."""
            if command:
                Debug.logger.info(u"""Could not analysis:{}, please check it out and try again。""".format(command))
            return

        parser = {
            'answer': parse_answer,
            'question': parse_question,
            'author': parse_author,
            'collection': parse_collection,
            'topic': parse_topic,
            'article': parse_article,
            'column': parse_column,
            'sinablog_author': parse_sinablog_author,
            'cnblogs_author': parse_cnblogs_author,
            'jianshu_author': parse_jianshu_author,
            'jianshu_collection': parse_jianshu_collection,
            'jianshu_notebooks': parse_jianshu_notebooks,
            'csdnblog_author': parse_csdnblog_author,
            'yiibai': parse_generic,
            'talkpython': parse_generic,
            'unknown': parse_error,
        }

        kind = Match.detect_recipe_kind(raw_command)
        # A detected kind without a registered parser is treated like 'unknown'
        # (logged, None returned) instead of raising KeyError.
        return parser.get(kind, parse_error)(raw_command)