def parse_csdnblog_author(command):
    u"""
    Build the task for a CSDN blog author from the author's homepage URL.

    The author id is read from the ``csdnblog_author_id`` group of the match
    returned by ``Match.csdnblog_author`` and is then used to fill in the
    spider URL and the SQL filter fragments of the task's book.

    :param command: homepage of someone, e.g. http://blog.csdn.net/elton_xiao
    :return: a ``SingleTask`` whose ``kind`` is ``'csdnblog_author'``
    """
    # NOTE(review): presumably Match.csdnblog_author returns None for a URL
    # that does not match, which would raise AttributeError here -- confirm.
    author_id = Match.csdnblog_author(command).group('csdnblog_author_id')
    kind = 'csdnblog_author'

    task = SingleTask()
    task.author_id = author_id  # ??? possibly redundant with task.book.author_id
    task.kind = kind
    task.spider.href = 'http://blog.csdn.net/{}'.format(author_id)

    book = task.book
    book.kind = kind
    # The SQL fragments are plain strings with the author id spliced in.
    sql = book.sql
    sql.info_extra = 'creator_id = "{}"'.format(author_id)
    sql.article_extra = 'author_id = "{}"'.format(author_id)
    book.author_id = author_id
    return task
def create_work_set(self, target_url):
    u"""
    Add every article entry listed for a blog author to ``self.work_set``.

    Does nothing when ``target_url`` is already in ``self.task_complete_set``.
    Otherwise the homepage is fetched, its article list and page count are
    parsed, the URL is recorded as complete, and the entries of the homepage
    plus those of listing pages 2..N are added to ``self.work_set``.

    :param target_url: http://blog.csdn.net/dbzhang800
    :return: None
    """
    if target_url in self.task_complete_set:
        return

    author_id = Match.csdnblog_author(target_url).group('csdnblog_author_id')

    first_page_html = Http.get_content(target_url)
    # The first element of the returned pair (the article count) is not used.
    _, first_page_articles = self.get_csdnblog_question_list(first_page_html)
    last_page = int(self.parse_max_page(first_page_html))

    # The URL is recorded as complete before the remaining pages are fetched.
    self.task_complete_set.add(target_url)
    for article in first_page_articles:
        self.work_set.add(article)

    # Page 1 is the homepage that was already processed, so start from page 2.
    page_url_template = 'http://blog.csdn.net/{}/article/list/{}'
    for page_no in range(2, last_page + 1):
        page_html = Http.get_content(page_url_template.format(author_id, page_no))
        _, page_articles = self.get_csdnblog_question_list(page_html)
        for article in page_articles:
            self.work_set.add(article)