Exemplo n.º 1
0
    def create_work_set(self, target_url):
        """Record a column's metadata and queue its post-list pages.

        Does nothing when ``target_url`` is already in
        ``self.task_complete_set`` or when the column API returns no content.
        Otherwise appends an info dict to ``self.info_list``, marks
        ``target_url`` as complete, and adds one paginated post-list URL per
        block of 10 posts to ``self.work_set``.

        :param target_url: column URL; ``Match.column`` must yield a match
            with a ``column_id`` group.
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        column_id = result.group('column_id')
        content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' +
                                   column_id)
        if not content:
            return
        raw_info = json.loads(content)
        creator = raw_info['creator']
        creator_avatar = creator['avatar']

        info = {}
        info['creator_id'] = creator['slug']
        info['creator_hash'] = creator['hash']
        info['creator_sign'] = creator['bio']
        info['creator_name'] = creator['name']
        # The avatar template carries '{id}' and '_{size}' placeholders; fill
        # in the id and drop the size suffix.
        info['creator_logo'] = creator_avatar['template'].replace(
            '{id}', creator_avatar['id']).replace('_{size}', '')

        info['column_id'] = raw_info['slug']
        info['name'] = raw_info['name']
        # NOTE(review): the column logo is built from the creator's avatar
        # template combined with the column's own avatar id -- confirm this is
        # intended rather than raw_info['avatar']['template'].
        info['logo'] = creator_avatar['template'].replace(
            '{id}', raw_info['avatar']['id']).replace('_{size}', '')
        info['article'] = raw_info['postsCount']
        info['follower'] = raw_info['followersCount']
        info['description'] = raw_info['description']
        self.info_list.append(info)
        self.task_complete_set.add(target_url)

        detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(
            column_id)
        # Floor division keeps the page count an int: on Python 3 a plain '/'
        # yields a float, which range() rejects with TypeError.
        page_count = info['article'] // 10 + 1
        for page in range(page_count):
            self.work_set.add(detect_url + str(page * 10))
        return
Exemplo n.º 2
0
    def create_work_set(self, target_url):
        """Record a column's metadata and queue its post-list pages.

        Does nothing when ``target_url`` is already in
        ``self.task_complete_set``. Otherwise stores the matched column id on
        ``self.column_id`` and fetches the column from the API; if that
        returns no content the method stops there. On success it appends an
        info dict to ``self.info_list``, marks ``target_url`` as complete, and
        adds one paginated post-list URL per block of 10 posts to
        ``self.work_set``.

        :param target_url: column URL; ``Match.column`` must yield a match
            with a ``column_id`` group.
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        self.column_id = result.group('column_id')
        content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
        if not content:
            return
        raw_info = json.loads(content)
        creator = raw_info['creator']
        avatar_template = creator['avatar']['template']

        info = {}
        info['creator_id'] = creator['slug']
        info['creator_hash'] = creator['hash']
        info['creator_sign'] = creator['bio']
        info['creator_name'] = creator['name']
        # The template carries '{id}' and '_{size}' placeholders; fill in the
        # id and drop the size suffix.
        info['creator_logo'] = avatar_template.replace('{id}', creator['avatar']['id']).replace('_{size}', '')

        info['column_id'] = raw_info['slug']
        info['name'] = raw_info['name']
        # NOTE(review): the column logo is built from the creator's avatar
        # template combined with the column's own avatar id -- confirm this is
        # intended rather than raw_info['avatar']['template'].
        info['logo'] = avatar_template.replace('{id}', raw_info['avatar']['id']).replace('_{size}', '')
        info['article'] = raw_info['postsCount']
        info['follower'] = raw_info['followersCount']
        info['description'] = raw_info['description']
        self.info_list.append(info)
        self.task_complete_set.add(target_url)

        detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
        # Use floor division: on Python 3 '/' produces a float and range()
        # would raise TypeError. The result is unchanged on Python 2.
        for page in range(info['article'] // 10 + 1):
            self.work_set.add(detect_url + str(page * 10))
        return
Exemplo n.º 3
0
    def create_work_set(self, target_url):
        """Record a column's metadata and queue its post-list pages.

        Does nothing when ``target_url`` is already in
        ``self.task_complete_set``. Otherwise stores the matched column id on
        ``self.column_id`` and fetches the column from the API; if that
        returns no content the method stops there. On success it appends an
        info dict to ``self.info_list``, marks ``target_url`` as complete, and
        adds one paginated post-list URL per block of 10 posts to
        ``self.work_set``.

        :param target_url: column URL; ``Match.column`` must yield a match
            with a ``column_id`` group.
        :return: None
        """
        if target_url in self.task_complete_set:
            return
        result = Match.column(target_url)
        self.column_id = result.group("column_id")
        content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
        if not content:
            return
        raw_info = json.loads(content)
        creator = raw_info["creator"]
        creator_avatar = creator["avatar"]

        info = {}
        info["creator_id"] = creator["slug"]
        info["creator_hash"] = creator["hash"]
        info["creator_sign"] = creator["bio"]
        info["creator_name"] = creator["name"]
        # The template carries "{id}" and "_{size}" placeholders; fill in the
        # id and drop the size suffix.
        info["creator_logo"] = (
            creator_avatar["template"]
            .replace("{id}", creator_avatar["id"])
            .replace("_{size}", "")
        )

        info["column_id"] = raw_info["slug"]
        info["name"] = raw_info["name"]
        # NOTE(review): the column logo is built from the creator's avatar
        # template combined with the column's own avatar id -- confirm this is
        # intended rather than raw_info["avatar"]["template"].
        info["logo"] = (
            creator_avatar["template"]
            .replace("{id}", raw_info["avatar"]["id"])
            .replace("_{size}", "")
        )
        info["article"] = raw_info["postsCount"]
        info["follower"] = raw_info["followersCount"]
        info["description"] = raw_info["description"]
        self.info_list.append(info)
        self.task_complete_set.add(target_url)

        detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
        # Floor division keeps the page count an int; true division on
        # Python 3 would hand range() a float and raise TypeError.
        page_count = info["article"] // 10 + 1
        for page in range(page_count):
            self.work_set.add(detect_url + str(page * 10))
        return
Exemplo n.º 4
0
 def parse_column(command):
     """Build a ``SingleTask`` of kind 'column' from a column command.

     The column id is taken from the ``column_id`` group of
     ``Match.column(command)`` and interpolated into the spider URL and into
     the info/answer SQL strings stored on the task.

     :param command: text that ``Match.column`` can match.
     :return: the populated ``SingleTask``.
     """
     column_id = Match.column(command).group('column_id')

     # Prepare every derived string up front, then fill in the task.
     href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
     info_sql = 'select * from ColumnInfo where column_id = "{}" '.format(column_id)
     answer_sql = 'select * from Article where column_id = "{}" '.format(column_id)

     task = SingleTask()
     task.kind = 'column'
     task.spider.href = href
     task.book.kind = 'column'
     task.book.sql.info = info_sql
     task.book.sql.question = ''
     task.book.sql.answer = answer_sql
     return task
Exemplo n.º 5
0
 def parse_column(command):
     """Turn a column command into a ``SingleTask`` describing that column.

     Sets the task and book kind to 'column', points the spider at the
     column's page, and stores the SQL strings selecting the column's info
     row and its articles. The question SQL is left empty.

     :param command: text that ``Match.column`` can match.
     :return: the populated ``SingleTask``.
     """
     matched = Match.column(command)
     cid = matched.group('column_id')
     kind = 'column'

     task = SingleTask()
     task.kind = kind
     task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(cid)
     task.book.kind = kind
     # The column id is interpolated directly into the SQL text.
     task.book.sql.info = (
         'select * from ColumnInfo where column_id = "{}" '.format(cid))
     task.book.sql.question = ''
     task.book.sql.answer = (
         'select * from Article where column_id = "{}" '.format(cid))
     return task
 def parse_column(command):
     """Build a ``ColumnTask`` for the column id matched in ``command``.

     :param command: text that ``Match.column`` can match.
     :return: ``ColumnTask`` constructed from the ``column_id`` group.
     """
     return ColumnTask(Match.column(command).group(u'column_id'))