Exemplo n.º 1
0
        def parse_csdnblog_author(command):
            u"""
            Build the task for a CSDN blog author's home page.

            The author id is taken from the 'csdnblog_author_id' group of the
            match returned by Match.csdnblog_author(command).

            :param command: home page URL of a blog author,
                            e.g. http://blog.csdn.net/elton_xiao
            :return: a SingleTask whose kind, spider href and book fields are
                     filled in for that author
            """
            author_id = Match.csdnblog_author(command).group('csdnblog_author_id')
            kind = 'csdnblog_author'

            task = SingleTask()

            # Fragments keyed on the author id; they look like SQL conditions
            # consumed elsewhere -- confirm against the code that reads task.book.sql.
            creator_filter = 'creator_id = "{}"'.format(author_id)
            article_filter = 'author_id = "{}"'.format(author_id)

            # NOTE(review): the original author questioned whether this
            # top-level author_id is needed, since task.book.author_id is also set.
            task.author_id = author_id
            task.kind = kind
            task.spider.href = 'http://blog.csdn.net/{}'.format(author_id)
            task.book.kind = kind
            task.book.sql.info_extra = creator_filter
            task.book.sql.article_extra = article_filter
            task.book.author_id = author_id
            return task
Exemplo n.º 2
0
    def create_work_set(self, target_url):
        u"""
        Fill self.work_set with the items listed on every page of a blog.

        The index page at target_url is fetched once; its item list and its
        maximum page number are read from that content. Each following list
        page (2 .. max page) is then fetched and its items are added too.
        URLs already present in self.task_complete_set are skipped.

        :param target_url: blog home page, e.g. http://blog.csdn.net/dbzhang800
        :return: None
        """
        if target_url in self.task_complete_set:
            return

        author_match = Match.csdnblog_author(target_url)
        author_id = author_match.group('csdnblog_author_id')

        first_page = Http.get_content(target_url)
        # The first element of the returned pair is not used here.
        _, first_page_items = self.get_csdnblog_question_list(first_page)
        last_page = int(self.parse_max_page(first_page))
        self.task_complete_set.add(target_url)

        for entry in first_page_items:
            self.work_set.add(entry)

        # Page 1 is the index page handled above, so listing starts at page 2.
        for page_no in range(2, last_page + 1):
            page_url = 'http://blog.csdn.net/{}/article/list/{}'.format(author_id, page_no)
            page_content = Http.get_content(page_url)
            _, page_items = self.get_csdnblog_question_list(page_content)
            for entry in page_items:
                self.work_set.add(entry)