def create_single_html_book(self, book_package):
    u"""
    Render every page of ``book_package`` into one standalone HTML file.

    A folder named after the book title is recreated under
    ``Path.result_path``; ``<title>.html`` is written inside it and the
    book's images plus three stylesheets are copied next to it.

    :param book_package: object providing ``get_title()`` and ``book_list``;
        every book exposes ``page_list`` and every page exposes ``content``
    :return: the book title, or None when the title is empty (nothing is built)
    """
    title = book_package.get_title()
    if not title:
        # Skip books without a title: with an empty title the rmdir below
        # would target u'./' itself (the "rm -rf /" disaster).
        return

    Path.reset_path()
    try:
        Path.chdir(Path.result_path)
        book_dir = u'./' + title
        Path.rmdir(book_dir)
        Path.mkdir(book_dir)
        Path.chdir(book_dir)

        pages = []
        for book in book_package.book_list:
            pages.extend(book.page_list)

        body = u' \r\n '.join([Match.html_body(page.content) for page in pages])
        # Images are copied next to the html file, so image links become local.
        body = body.replace(u'../images/', u'./images/')

        with open(TemplateConfig.content_base_uri) as template:
            content = template.read().format(title=title, body=body)
        # Stylesheets are copied into the same folder as the html file.
        content = content.replace(u'../style/', u'./')

        with open(title + u'.html', 'w') as html:
            html.write(content)

        Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
        for css_name in (u'customer.css', u'markdown.css', u'normalize.css'):
            Path.copy(Path.www_css + u'/' + css_name, u'./' + css_name)
        # TODO: article.css is deliberately not copied yet; the stylesheet
        # set still needs trimming.
        return title
    finally:
        # Run the path reset on every exit, not only on success, so a failure
        # while rendering or copying does not skip it.
        # NOTE(review): assumed to restore the working directory - confirm.
        Path.reset_path()
def parse_sinablog_author(command):
    u"""
    Build the task for a Sina blog author.

    :param command: URL of a Sina blog author's home page
    :return: SingleTask carrying the author's list/home/profile URLs and the
        SQL filters for the book
    """
    people_id = Match.sinablog_author(command).group('sinablog_people_id')
    Debug.logger.debug(u"sinablog_people_id:" + str(people_id))

    article_list_url = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(people_id)
    home_url = 'http://blog.sina.com.cn/u/{}'.format(people_id)
    profile_url = 'http://blog.sina.com.cn/s/profile_{}.html'.format(people_id)

    task = SingleTask()
    task.author_id = people_id
    task.kind = 'sinablog_author'

    task.spider.href_article_list = article_list_url
    task.spider.href = home_url
    task.spider.href_profile = profile_url

    task.book.kind = 'sinablog_author'
    task.book.sql.info_extra = 'creator_id = "{}"'.format(people_id)
    task.book.sql.article_extra = 'author_id = "{}"'.format(people_id)
    task.book.author_id = people_id
    return task
def parse_taoguba_author(command):
    u"""
    Build the task for a Taoguba post.

    The article id captured from the URL is used as the author id; all three
    spider URLs are the command itself.

    :param command: URL of the first page of a Taoguba post
    :return: SingleTask for that post
    """
    article_id = Match.taoguba_article(command).group('article_id')
    Debug.logger.debug(u"taoguba_article_id:" + str(article_id))

    task = SingleTask()
    task.author_id = article_id
    task.kind = 'taoguba_author'

    task.spider.href_article_list = command
    # TODO: article info
    task.spider.href = command
    task.spider.href_profile = command

    task.book.kind = 'taoguba_author'
    task.book.sql.info_extra = 'creator_id = "{}"'.format(article_id)
    task.book.sql.article_extra = 'author_id = "{}"'.format(article_id)
    task.book.author_id = article_id
    return task
def parse_author(command):
    u"""
    Build the task for a Zhihu author page.

    The author id captured from the URL is looked up through ``ZhihuClient``
    and the resulting ``people.id`` is the value the SQL statements filter on.

    :param command: author URL accepted by ``Match.author``
    :return: SingleTask with ``spider.href`` and ``book.sql.info`` /
        ``question`` / ``answer`` filled in
    :raises SystemExit: with status 1 when the saved login token file is
        missing or the login has expired
    """
    author_id = Match.author(command).group('author_id')

    task = SingleTask()
    task.kind = 'author'
    task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
    task.book.kind = 'author'

    client = ZhihuClient()
    try:
        client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
    except IOError:
        print(u"没有找到登录信息文件,请先登录")
        # Non-zero status so calling scripts can tell the run failed.
        sys.exit(1)
    except NeedLoginException:
        print(u"登录信息过期,请重新登录")
        sys.exit(1)

    people_oauth = client.people(author_id)
    # Attribute read kept on purpose; see zhihu-oauth issue #4.
    # NOTE(review): presumably forces the profile to load before `id` is
    # read - confirm against that issue.
    _ = people_oauth.follower_count
    author_id_hash = people_oauth.id

    task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
    # Adjacent literals instead of a backslash inside the string, so the
    # source indentation is not embedded in the SQL text.
    task.book.sql.question = (
        'select * from Question where question_id in (select question_id from '
        'Answer where author_id = "{}")'.format(author_id_hash)
    )
    task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
    return task
def parse_column(command):
    u"""
    Build the task for a Zhihu column.

    :param command: column URL accepted by ``Match.column``
    :return: SingleTask whose SQL selects the column info and its articles
    """
    column_id = Match.column(command).group('column_id')
    info_sql = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
    article_sql = 'select * from Article where column_id = "{}" '.format(column_id)

    task = SingleTask()
    task.kind = 'column'
    task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
    task.book.kind = 'column'
    task.book.sql.info = info_sql
    # No question query is set for a column; the articles go in `answer`.
    task.book.sql.question = ''
    task.book.sql.answer = article_sql
    return task
def parse_question(command):
    u"""
    Build the task for a single Zhihu question.

    :param command: question URL accepted by ``Match.question``
    :return: SingleTask whose SQL fragments filter on the question id
    """
    question_id = Match.question(command).group('question_id')
    by_question = 'question_id = "{}"'.format(question_id)

    task = SingleTask()
    task.kind = 'question'
    task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
    task.book.kind = 'question'
    # The info fragment is padded with a space on both sides; the other two
    # fragments are not.
    task.book.sql.info = ' question_id = "{}" '.format(question_id)
    task.book.sql.question = by_question
    task.book.sql.answer = by_question
    return task
def parse_article(command):
    u"""
    Build the task for a single Zhihu column article.

    :param command: article URL accepted by ``Match.article``
    :return: SingleTask whose SQL fragments filter on column id and article id
    """
    matched = Match.article(command)
    column_id = matched.group('column_id')
    article_id = matched.group('article_id')
    # Info and answer share the same filter text.
    article_filter = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)

    task = SingleTask()
    task.kind = 'article'
    task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
    task.book.kind = 'article'
    task.book.sql.info = article_filter
    task.book.sql.question = ''
    task.book.sql.answer = article_filter
    return task
def parse_jianshu_notebooks(command):
    u"""
    Build the task for a Jianshu notebook.

    :param command: notebook URL accepted by ``Match.jianshu_notebooks``
    :return: SingleTask whose SQL selects the notebook info and the articles
        listed in ``jianshu_notebooks_index`` for that notebook
    """
    notebooks_id = Match.jianshu_notebooks(command).group('notebooks_id')

    task = SingleTask()
    task.kind = 'jianshu_notebooks'
    # Open question left by the author: should this URL come from a config file?
    task.spider.href = 'http://www.jianshu.com/notebooks/{}/latest'.format(notebooks_id)
    task.book.kind = 'jianshu_notebooks'
    task.book.sql.info = (
        'select * from jianshu_notebooks_info where notebooks_id = "{}"'.format(notebooks_id)
    )
    task.book.sql.answer = (
        'select * from jianshu_article where href in (select href from '
        'jianshu_notebooks_index where notebooks_id = "{}")'.format(notebooks_id)
    )
    return task
def parse_jianshu_collection(command):
    u"""
    Build the task for a Jianshu collection.

    :param command: collection URL accepted by ``Match.jianshu_collection``
    :return: SingleTask whose SQL selects the collection info and the articles
        listed in ``jianshu_collection_index`` for that collection
    """
    collection_id = Match.jianshu_collection(command).group('collection_id')

    task = SingleTask()
    task.kind = 'jianshu_collection'
    task.spider.href = 'http://www.jianshu.com/collection/{}'.format(collection_id)
    task.book.kind = 'jianshu_collection'
    # The id taken from the URL is matched against the `collection_fake_id` column.
    task.book.sql.info = (
        'select * from jianshu_collection_info where collection_fake_id = "{}"'.format(collection_id)
    )
    task.book.sql.answer = (
        'select * from jianshu_article where href in (select href from '
        'jianshu_collection_index where collection_fake_id = "{}")'.format(collection_id)
    )
    return task
def parse_topic(command):
    u"""
    Build the task for a Zhihu topic.

    :param command: topic URL accepted by ``Match.topic``
    :return: SingleTask whose SQL selects the topic info plus the questions
        and answers reachable through ``TopicIndex``
    """
    topic_id = Match.topic(command).group('topic_id')

    task = SingleTask()
    task.kind = 'topic'
    task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
    task.book.kind = 'topic'
    task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(topic_id)
    task.book.sql.question = (
        'select * from Question where question_id in (select question_id from '
        'Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(topic_id)
    )
    task.book.sql.answer = (
        'select * from Answer where href in (select href from '
        'TopicIndex where topic_id = "{}")'.format(topic_id)
    )
    return task
def parse_collection(command):
    u"""
    Build the task for a Zhihu collection.

    :param command: collection URL accepted by ``Match.collection``
    :return: SingleTask whose SQL selects the collection info plus the
        questions and answers reachable through ``CollectionIndex``
    """
    collection_id = Match.collection(command).group('collection_id')

    task = SingleTask()
    task.kind = 'collection'
    task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
    task.book.kind = 'collection'
    task.book.sql.info = (
        'select * from CollectionInfo where collection_id = "{}"'.format(collection_id)
    )
    # Adjacent literals instead of a backslash inside the string, so the
    # source indentation is not embedded in the SQL text.
    task.book.sql.question = (
        'select * from Question where question_id in (select question_id from '
        'Answer where href in (select href from CollectionIndex '
        'where collection_id = "{}"))'.format(collection_id)
    )
    task.book.sql.answer = (
        'select * from Answer where href in (select href from '
        'CollectionIndex where collection_id = "{}")'.format(collection_id)
    )
    return task
def parse_jianshu_author(command):
    u"""
    Build the task for a Jianshu author.

    :param command: author home page,
        e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles
    :return: SingleTask for that author
    """
    jianshu_id = Match.jianshu_author(command).group('jianshu_id')
    home_url = 'http://www.jianshu.com/users/{}/latest_articles'.format(jianshu_id)

    task = SingleTask()
    task.author_id = jianshu_id
    task.kind = 'jianshu_author'
    task.spider.href = home_url
    task.book.kind = 'jianshu_author'
    task.book.sql.info_extra = 'creator_id = "{}"'.format(jianshu_id)
    task.book.sql.article_extra = 'author_id = "{}"'.format(jianshu_id)
    task.book.author_id = jianshu_id
    return task
def parse_csdnblog_author(command):
    u"""
    Build the task for a CSDN blog author.

    :param command: author home page, e.g. http://blog.csdn.net/elton_xiao
    :return: SingleTask for that author
    """
    author_id = Match.csdnblog_author(command).group('csdnblog_author_id')
    home_url = 'http://blog.csdn.net/{}'.format(author_id)

    task = SingleTask()
    # Open question left by the author: is task.author_id needed at all?
    task.author_id = author_id
    task.kind = 'csdnblog_author'
    task.spider.href = home_url
    task.book.kind = 'csdnblog_author'
    task.book.sql.info_extra = 'creator_id = "{}"'.format(author_id)
    task.book.sql.article_extra = 'author_id = "{}"'.format(author_id)
    task.book.author_id = author_id
    return task
def parse_cnblogs_author(command):
    u"""
    Build the task for a cnblogs author.

    :param command: author home page, e.g. http://www.cnblogs.com/buptzym/
    :return: SingleTask whose SQL selects the author info and the author's articles
    """
    author_id = Match.cnblogs_author(command).group('cnblogs_id')
    info_sql = 'select * from cnblogs_author_info where creator_id = "{}"'.format(author_id)
    article_sql = 'select * from cnblogs_article where author_id = "{}"'.format(author_id)

    task = SingleTask()
    task.kind = 'cnblogs_author'
    task.spider.href = 'http://www.cnblogs.com/{}/'.format(author_id)
    task.book.kind = 'cnblogs_author'
    task.book.sql.info = info_sql
    task.book.sql.answer = article_sql
    # Disabled: the *_extra filters are left unset for cnblogs.
    # task.book.sql.info_extra = 'creator_id = "{}"'.format(cnblogs_author_id)
    # task.book.sql.article_extra = 'author_id = "{}"'.format(cnblogs_author_id)
    task.book.author_id = author_id
    return task
def parse_sinablog_author(command):
    u"""
    Build the task for a Sina blog author.

    :param command: URL of a Sina blog author's home page
    :return: SingleTask carrying the author's list/home/profile URLs and the
        SQL filters for the book
    """
    matched = Match.sinablog_author(command)
    blogger_id = matched.group('sinablog_people_id')
    Debug.logger.debug(u"sinablog_people_id:" + str(blogger_id))

    task = SingleTask()
    task.author_id = blogger_id
    task.kind = 'sinablog_author'

    # Spider entry points: article list, home page, profile page.
    task.spider.href_article_list = (
        'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(blogger_id)
    )
    task.spider.href = 'http://blog.sina.com.cn/u/{}'.format(blogger_id)
    task.spider.href_profile = (
        'http://blog.sina.com.cn/s/profile_{}.html'.format(blogger_id)
    )

    # Book metadata and SQL filters.
    task.book.kind = 'sinablog_author'
    task.book.sql.info_extra = 'creator_id = "{}"'.format(blogger_id)
    task.book.sql.article_extra = 'author_id = "{}"'.format(blogger_id)
    task.book.author_id = blogger_id
    return task
def parse_command(raw_command=''):
    u"""
    Parse a single command and return the task it describes.

    The command kind comes from ``Match.detect_recipe_kind`` and is dispatched
    to one of the nested parsers below. A kind with no registered parser is
    handled like ``'unknown'``: it is logged and None is returned.

    :param raw_command: original URL, e.g. http://blog.sina.com.cn/u/1287694611
    :return: task, or None when the command cannot be parsed

    Task layout
    *   kind
        *   string, see TypeClass.type_list
    *   spider
        *   href
            *   original URL, e.g. http://www.zhihu.com/question/33578941
            *   no trailing "/"
    *   book
        *   kind
        *   info
        *   question
        *   answer
    :raises SystemExit: with status 1 from the Zhihu author parser when the
        saved login token is missing or expired
    """

    def parse_question(command):
        u"""Task for a single Zhihu question."""
        result = Match.question(command)
        question_id = result.group('question_id')
        task = SingleTask()
        task.kind = 'question'
        task.spider.href = 'https://www.zhihu.com/question/{}'.format(question_id)
        task.book.kind = 'question'
        task.book.sql.info = ' question_id = "{}" '.format(question_id)
        task.book.sql.question = 'question_id = "{}"'.format(question_id)
        task.book.sql.answer = 'question_id = "{}"'.format(question_id)
        return task

    def parse_answer(command):
        u"""Task for a single answer under a Zhihu question."""
        result = Match.answer(command)
        question_id = result.group('question_id')
        answer_id = result.group('answer_id')
        task = SingleTask()
        task.kind = 'answer'
        task.spider.href = 'https://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)
        task.book.kind = 'answer'
        task.book.sql.info = ' question_id = "{}" '.format(question_id)
        task.book.sql.question = ' question_id = "{}" '.format(question_id)
        task.book.sql.answer = ' question_id = "{}" and answer_id = "{}" '.format(question_id, answer_id)
        return task

    def parse_author(command):
        u"""Task for a Zhihu author; needs a saved login token."""
        result = Match.author(command)
        author_id = result.group('author_id')
        task = SingleTask()
        task.kind = 'author'
        task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
        task.book.kind = 'author'

        client = ZhihuClient()
        try:
            client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
        except IOError:
            print(u"没有找到登录信息文件,请先登录")
            # Non-zero status so calling scripts can tell the run failed.
            sys.exit(1)
        except NeedLoginException:
            print(u"登录信息过期,请重新登录")
            sys.exit(1)

        people_oauth = client.people(author_id)
        # Attribute read kept on purpose; see zhihu-oauth issue #4.
        # NOTE(review): presumably forces the profile to load - confirm.
        _ = people_oauth.follower_count
        author_id_hash = people_oauth.id

        task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
        # Adjacent literals instead of a backslash inside the string, so the
        # source indentation is not embedded in the SQL text.
        task.book.sql.question = (
            'select * from Question where question_id in (select question_id from '
            'Answer where author_id = "{}")'.format(author_id_hash)
        )
        task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
        return task

    def parse_collection(command):
        u"""Task for a Zhihu collection."""
        result = Match.collection(command)
        collection_id = result.group('collection_id')
        task = SingleTask()
        task.kind = 'collection'
        task.spider.href = 'https://www.zhihu.com/collection/{}'.format(collection_id)
        task.book.kind = 'collection'
        task.book.sql.info = 'select * from CollectionInfo where collection_id = "{}"'.format(
            collection_id
        )
        task.book.sql.question = (
            'select * from Question where question_id in (select question_id from '
            'Answer where href in (select href from CollectionIndex '
            'where collection_id = "{}"))'.format(collection_id)
        )
        task.book.sql.answer = (
            'select * from Answer where href in (select href from '
            'CollectionIndex where collection_id = "{}")'.format(collection_id)
        )
        return task

    def parse_topic(command):
        u"""Task for a Zhihu topic."""
        result = Match.topic(command)
        topic_id = result.group('topic_id')
        task = SingleTask()
        task.kind = 'topic'
        task.spider.href = 'https://www.zhihu.com/topic/{}'.format(topic_id)
        task.book.kind = 'topic'
        task.book.sql.info = 'select * from TopicInfo where topic_id = "{}"'.format(topic_id)
        task.book.sql.question = (
            'select * from Question where question_id in (select question_id from '
            'Answer where href in (select href from TopicIndex where topic_id = "{}"))'.format(topic_id)
        )
        task.book.sql.answer = (
            'select * from Answer where href in (select href from '
            'TopicIndex where topic_id = "{}")'.format(topic_id)
        )
        return task

    def parse_article(command):
        u"""Task for a single Zhihu column article."""
        result = Match.article(command)
        column_id = result.group('column_id')
        article_id = result.group('article_id')
        task = SingleTask()
        task.kind = 'article'
        task.spider.href = 'https://zhuanlan.zhihu.com/{}/{}'.format(column_id, article_id)
        task.book.kind = 'article'
        task.book.sql.info = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
        task.book.sql.question = ''
        task.book.sql.answer = ' column_id = "{}" and article_id = "{}" '.format(column_id, article_id)
        return task

    def parse_column(command):
        u"""Task for a Zhihu column."""
        result = Match.column(command)
        column_id = result.group('column_id')
        task = SingleTask()
        task.kind = 'column'
        task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
        task.book.kind = 'column'
        task.book.sql.info = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
        task.book.sql.question = ''
        task.book.sql.answer = 'select * from Article where column_id = "{}" '.format(column_id)
        return task

    def parse_sinablog_author(command):
        u"""
        :param command: URL of a Sina blog author's home page
        :return: task
        """
        result = Match.sinablog_author(command)
        sinablog_author_id = result.group('sinablog_people_id')
        Debug.logger.debug(u"sinablog_people_id:" + str(sinablog_author_id))
        task = SingleTask()
        task.author_id = sinablog_author_id
        task.kind = 'sinablog_author'
        task.spider.href_article_list = (
            'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(sinablog_author_id)
        )
        task.spider.href = 'http://blog.sina.com.cn/u/{}'.format(sinablog_author_id)
        task.spider.href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(sinablog_author_id)
        task.book.kind = 'sinablog_author'
        task.book.sql.info_extra = 'creator_id = "{}"'.format(sinablog_author_id)
        task.book.sql.article_extra = 'author_id = "{}"'.format(sinablog_author_id)
        task.book.author_id = sinablog_author_id
        return task

    def parse_jianshu_author(command):
        u"""
        :param command: homepage of someone,
            e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles
        :return: task
        """
        result = Match.jianshu_author(command)
        jianshu_id = result.group('jianshu_id')
        task = SingleTask()
        task.author_id = jianshu_id
        task.kind = 'jianshu_author'
        task.spider.href = 'http://www.jianshu.com/users/{}/latest_articles'.format(jianshu_id)
        task.book.kind = 'jianshu_author'
        task.book.sql.info_extra = 'creator_id = "{}"'.format(jianshu_id)
        task.book.sql.article_extra = 'author_id = "{}"'.format(jianshu_id)
        task.book.author_id = jianshu_id
        return task

    def parse_jianshu_collection(command):
        u"""Task for a Jianshu collection."""
        result = Match.jianshu_collection(command)
        collection_id = result.group('collection_id')
        task = SingleTask()
        task.kind = 'jianshu_collection'
        task.spider.href = 'http://www.jianshu.com/collection/{}'.format(collection_id)
        task.book.kind = 'jianshu_collection'
        task.book.sql.info = 'select * from jianshu_collection_info where collection_fake_id = "{}"'.format(
            collection_id
        )
        task.book.sql.answer = (
            'select * from jianshu_article where href in (select href from '
            'jianshu_collection_index where collection_fake_id = "{}")'.format(collection_id)
        )
        return task

    def parse_jianshu_notebooks(command):
        u"""Task for a Jianshu notebook."""
        result = Match.jianshu_notebooks(command)
        notebooks_id = result.group('notebooks_id')
        task = SingleTask()
        task.kind = 'jianshu_notebooks'
        # Open question left by the author: should this URL come from a config file?
        task.spider.href = 'http://www.jianshu.com/notebooks/{}/latest'.format(notebooks_id)
        task.book.kind = 'jianshu_notebooks'
        task.book.sql.info = 'select * from jianshu_notebooks_info where notebooks_id = "{}"'.format(
            notebooks_id
        )
        task.book.sql.answer = (
            'select * from jianshu_article where href in (select href from '
            'jianshu_notebooks_index where notebooks_id = "{}")'.format(notebooks_id)
        )
        return task

    def parse_cnblogs_author(command):
        u"""
        :param command: home page, e.g. http://www.cnblogs.com/buptzym/
        :return: task
        """
        result = Match.cnblogs_author(command)
        cnblogs_author_id = result.group('cnblogs_id')
        task = SingleTask()
        task.kind = 'cnblogs_author'
        task.spider.href = 'http://www.cnblogs.com/{}/'.format(cnblogs_author_id)
        task.book.kind = 'cnblogs_author'
        task.book.sql.info = 'select * from cnblogs_author_info where creator_id = "{}"'.format(cnblogs_author_id)
        task.book.sql.answer = 'select * from cnblogs_article where author_id = "{}"'.format(cnblogs_author_id)
        # Disabled: the *_extra filters are left unset for cnblogs.
        # task.book.sql.info_extra = 'creator_id = "{}"'.format(cnblogs_author_id)
        # task.book.sql.article_extra = 'author_id = "{}"'.format(cnblogs_author_id)
        task.book.author_id = cnblogs_author_id
        return task

    def parse_csdnblog_author(command):
        u"""
        :param command: homepage of someone, e.g. http://blog.csdn.net/elton_xiao
        :return: task
        """
        result = Match.csdnblog_author(command)
        csdnblog_author_id = result.group('csdnblog_author_id')
        task = SingleTask()
        # Open question left by the author: is task.author_id needed at all?
        task.author_id = csdnblog_author_id
        task.kind = 'csdnblog_author'
        task.spider.href = 'http://blog.csdn.net/{}'.format(csdnblog_author_id)
        task.book.kind = 'csdnblog_author'
        task.book.sql.info_extra = 'creator_id = "{}"'.format(csdnblog_author_id)
        task.book.sql.article_extra = 'author_id = "{}"'.format(csdnblog_author_id)
        task.book.author_id = csdnblog_author_id
        return task

    def parse_generic(command):
        u"""
        Task for the generic sites: try every matcher named in
        ``Type.type_list`` and build the task from the first one that matches.

        :param command: original URL
        :return: task, or None when no matcher accepts the command
        """
        from .tools.type import Type
        task = SingleTask()
        # NOTE(review): the indentation of this loop was lost in the source;
        # it is reconstructed as "fill in the task and return on the first
        # match". Confirm against version control.
        for command_type in Type.type_list:
            result = getattr(Match, command_type)(command)
            if result:
                task.author_id = result.group('subject_id')
                task.kind = command_type
                task.spider.href = command
                task.book.kind = task.kind
                # The generic tables are keyed by the full URL, not subject_id.
                task.book.sql.info = 'select * from generic_info where creator_id = "{}"'.format(command)
                task.book.sql.answer = 'select * from generic_article where author_id = "{}"'.format(command)
                task.book.author_id = task.spider.href
                return task

    def parse_error(command):
        u"""Log a command that could not be parsed; always returns None."""
        if command:
            Debug.logger.info(u"""Could not analysis:{}, please check it out and try again。""".format(command))
        return

    parser = {
        'answer': parse_answer,
        'question': parse_question,
        'author': parse_author,
        'collection': parse_collection,
        'topic': parse_topic,
        'article': parse_article,
        'column': parse_column,
        'sinablog_author': parse_sinablog_author,
        'cnblogs_author': parse_cnblogs_author,
        'jianshu_author': parse_jianshu_author,
        'jianshu_collection': parse_jianshu_collection,
        'jianshu_notebooks': parse_jianshu_notebooks,
        'csdnblog_author': parse_csdnblog_author,
        'yiibai': parse_generic,
        'talkpython': parse_generic,
        'unknown': parse_error,
    }
    kind = Match.detect_recipe_kind(raw_command)
    # A kind without a registered parser is reported like 'unknown' instead
    # of raising KeyError.
    return parser.get(kind, parse_error)(raw_command)