def parse_jianshu_author(command):
    u"""
    Build the task used to collect one jianshu author's articles.

    :param command: homepage of a jianshu user, e.g.
        http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles
    :return: SingleTask whose kind, spider href and book filters are all
        derived from the ``jianshu_id`` group matched in ``command``
    """
    # The match result is used without a None check, so callers are expected
    # to pass a command that Match.jianshu_author accepts.
    matched = Match.jianshu_author(command)
    author_id = matched.group('jianshu_id')
    kind = 'jianshu_author'

    task = SingleTask()
    task.author_id = author_id
    task.kind = kind
    # The spider always starts from the canonical "latest articles" page,
    # rebuilt from the id rather than reusing the raw command.
    href_template = 'http://www.jianshu.com/users/{}/latest_articles'
    task.spider.href = href_template.format(author_id)

    book = task.book
    book.kind = kind
    book.sql.info_extra = 'creator_id = "{}"'.format(author_id)
    book.sql.article_extra = 'author_id = "{}"'.format(author_id)
    book.author_id = author_id
    return task
def parse_jianshu(command):
    u"""
    Build the task used to collect a jianshu user's articles.

    :param command: address of a jianshu user's homepage; it must contain the
        ``jianshu_id`` group recognised by ``Match.jianshu``
    :return: SingleTask of kind ``'jianshu'`` keyed by that id
    """
    # NOTE(review): an identical parse_jianshu definition appears elsewhere in
    # this file; if both live in the same scope, the later one shadows the
    # earlier one.
    matched = Match.jianshu(command)
    user_id = matched.group('jianshu_id')
    kind = 'jianshu'

    task = SingleTask()
    task.author_id = user_id
    task.kind = kind
    href_template = 'http://www.jianshu.com/users/{}/latest_articles'
    task.spider.href = href_template.format(user_id)

    book = task.book
    book.kind = kind
    book.sql.info_extra = 'creator_id = "{}"'.format(user_id)
    book.sql.article_extra = 'author_id = "{}"'.format(user_id)
    book.author_id = user_id
    return task
def parse_csdnblog_author(command):
    u"""
    Build the task used to collect one CSDN blog author's articles.

    :param command: homepage of a CSDN blogger, e.g.
        http://blog.csdn.net/elton_xiao
    :return: SingleTask of kind ``'csdnblog_author'`` keyed by the
        ``csdnblog_author_id`` group matched in ``command``
    """
    # The match result is used without a None check, so callers are expected
    # to pass a command that Match.csdnblog_author accepts.
    matched = Match.csdnblog_author(command)
    author_id = matched.group('csdnblog_author_id')
    kind = 'csdnblog_author'

    task = SingleTask()
    # NOTE(review): the original author questioned whether this assignment
    # is needed ("??? don't need?"); kept as-is.
    task.author_id = author_id
    task.kind = kind
    task.spider.href = 'http://blog.csdn.net/{}'.format(author_id)

    book = task.book
    book.kind = kind
    book.sql.info_extra = 'creator_id = "{}"'.format(author_id)
    book.sql.article_extra = 'author_id = "{}"'.format(author_id)
    book.author_id = author_id
    return task
def parse_jianshu(command):
    u"""
    Turn a jianshu homepage address into a collection task.

    :param command: address of a jianshu user's homepage; ``Match.jianshu``
        must find a ``jianshu_id`` group in it
    :return: SingleTask of kind ``'jianshu'`` whose spider href and book
        filters are built from that id
    """
    # NOTE(review): an identical parse_jianshu definition appears elsewhere in
    # this file; if both live in the same scope, the later one shadows the
    # earlier one.
    jianshu_match = Match.jianshu(command)
    uid = jianshu_match.group('jianshu_id')

    task = SingleTask()
    task.author_id = uid
    task.kind = 'jianshu'
    task.spider.href = 'http://www.jianshu.com/users/{}/latest_articles'.format(uid)

    # Book metadata mirrors the task: same kind, same id in both SQL filters.
    book = task.book
    book.kind = 'jianshu'
    sql = book.sql
    sql.info_extra = 'creator_id = "{}"'.format(uid)
    sql.article_extra = 'author_id = "{}"'.format(uid)
    book.author_id = uid
    return task
def parse_generic(command):
    u"""
    Build a task for a command by probing every known command type.

    Each name in ``Type.type_list`` is looked up as a matcher on ``Match``
    and applied to ``command``; on a successful match the task fields are
    (re)filled from that match.

    :param command: the raw command string; it is used both as the spider
        href and as the id inside the generated SQL statements
    :return: the SingleTask created at the start of the call
    """
    # Imported here, as in the original, rather than at module level.
    from src.tools.type import Type

    task = SingleTask()
    for command_type in Type.type_list:
        matcher = getattr(Match, command_type)
        hit = matcher(command)
        if not hit:
            continue

        task.author_id = hit.group('subject_id')
        task.kind = command_type
        task.spider.href = command

        book = task.book
        book.kind = task.kind
        book.sql.info = 'select * from generic_info where creator_id = "{}"'.format(command)
        book.sql.answer = 'select * from generic_article where author_id = "{}"'.format(command)
        book.author_id = task.spider.href

    # NOTE(review): the return is assumed to sit after the loop (a later
    # matching type overwrites an earlier one, and an unfilled task is
    # returned when nothing matches) -- confirm against the original layout.
    return task
def parse_sinablog_author(command):
    u"""
    Build the task used to collect one Sina blog author's articles.

    :param command: address of a Sina blog author's homepage; it must contain
        the ``sinablog_people_id`` group recognised by
        ``Match.sinablog_author``
    :return: SingleTask of kind ``'sinablog_author'`` carrying the article
        list, homepage and profile URLs plus the book filters for that id
    """
    # The match result is used without a None check, so callers are expected
    # to pass a command that Match.sinablog_author accepts.
    matched = Match.sinablog_author(command)
    author_id = matched.group('sinablog_people_id')
    Debug.logger.debug(u"sinablog_people_id:" + str(author_id))

    kind = 'sinablog_author'
    task = SingleTask()
    task.author_id = author_id
    task.kind = kind

    # Three entry points for the spider, all derived from the same id.
    spider = task.spider
    spider.href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(author_id)
    spider.href = 'http://blog.sina.com.cn/u/{}'.format(author_id)
    spider.href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(author_id)

    book = task.book
    book.kind = kind
    book.sql.info_extra = 'creator_id = "{}"'.format(author_id)
    book.sql.article_extra = 'author_id = "{}"'.format(author_id)
    book.author_id = author_id
    return task
def parse_SinaBlog(command):
    u"""
    Build the task used to collect a Sina blog author's articles.

    :param command: address of a Sina blog author's homepage; it must contain
        the ``SinaBlog_people_id`` group recognised by ``Match.SinaBlog``
    :return: SingleTask of kind ``'SinaBlog'`` carrying the article list,
        homepage and profile URLs plus the book filters for that id
    """
    # The match result is used without a None check, so callers are expected
    # to pass a command that Match.SinaBlog accepts.
    matched = Match.SinaBlog(command)
    people_id = matched.group('SinaBlog_people_id')
    Debug.logger.debug(u"SinaBlog_people_id:" + str(people_id))

    kind = 'SinaBlog'
    task = SingleTask()
    task.author_id = people_id
    task.kind = kind

    # Three entry points for the spider, all derived from the same id.
    spider = task.spider
    spider.href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(people_id)
    spider.href = 'http://blog.sina.com.cn/u/{}'.format(people_id)
    spider.href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(people_id)

    book = task.book
    book.kind = kind
    book.sql.info_extra = 'creator_id = "{}"'.format(people_id)
    book.sql.article_extra = 'author_id = "{}"'.format(people_id)
    book.author_id = people_id

    # Log the id as stored on the book, read back from the task itself.
    Debug.logger.debug(u"在parse_SinaBlog中, task.book.author_id为" + str(task.book.author_id))
    return task