예제 #1
0
def answers(question_id, warehouse):
    """Save the answers of one question as individual markdown documents.

    Pages through ``net.answers_spider`` (passing ``const.SORT_BY_VOT``)
    under the control of a ``zhihu.Controller``.  Every answer the
    controller agrees to collect is wrapped in a ``document.Answer`` whose
    ``make_markdown`` is called with the value returned by
    ``question_warehouse(question_id, warehouse)``.

    Raises:
        ValueError: if the spider returns ``None`` for a page.
        JSONDecodeError: if a page body cannot be decoded as JSON.
    """
    pager = zhihu.Controller()
    warehouse = question_warehouse(question_id, warehouse)
    while True:
        if pager.is_end():
            break
        page = net.answers_spider(
            question_id, pager.next_offset(), const.SORT_BY_VOT)
        if page is None:
            raise ValueError('Response is None')
        try:
            payload = page.json()
            # Let the controller know the total and how far we have advanced.
            pager.totals = payload['paging']['totals']
            items = payload['data']
            pager.increase(len(items))
            for raw_answer in items:
                # The metadata is built before the collect check, as before.
                meta = answer_msg(raw_answer)
                if pager.to_collect(raw_answer):
                    body = BeautifulSoup(raw_answer['content'], 'lxml').body
                    answer = document.Answer(body, meta)
                    answer.set_file_name(template='%a-%v')
                    answer.make_markdown(warehouse)
                    print(answer.answer_msg())
            # Pause between pages.
            timer.sleep_for(zhihu.SLEEP)
        except JSONDecodeError as err:
            raise err
예제 #2
0
def articles_id(column_id):
    """Collect the article ids of a column, one page at a time.

    Each page is fetched with ``net.column_spider`` and scanned with regular
    expressions: the ``"totals"`` value is handed to the ``zhihu.Controller``
    and every ``"id"`` value found on the page is appended to the result.

    Args:
        column_id: identifier passed through to ``net.column_spider``.

    Returns:
        A list of the matched ids, as strings, in the order encountered.

    Raises:
        ValueError: if the spider returns ``None`` or the response text
            contains no ``"totals"`` field.
    """
    article_list = []
    offset = zhihu.Controller()
    while not offset.is_end():
        print(GET_ARTICLES_ID)
        response = net.column_spider(column_id, offset.next_offset())
        if response is None:
            raise ValueError('Response is None')
        content = response.text
        totals_match = re.search(r'"totals":\W(\d+)', content)
        if totals_match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError('"totals" not found in response')
        offset.totals = int(totals_match.group(1))
        page_ids = re.findall(r'"id":\W(\d+)', content)
        offset.increase(len(page_ids))
        article_list.extend(page_ids)
        timer.random_sleep(end=zhihu.SLEEP)
    return article_list
예제 #3
0
def user_answers(user_id, warehouse):
    """Save every answer written by a user as a markdown document.

    After calling ``init_user_msg(user_id)``, pages through
    ``net.user_answers_spider`` (passing ``const.SORT_BY_DAT``) under the
    control of a ``zhihu.Controller``.  Each answer on a page is wrapped in
    a ``document.Answer`` whose ``make_markdown`` is called with
    ``warehouse``.

    Raises:
        ValueError: if the spider returns ``None`` for a page.
        JSONDecodeError: if a page body cannot be decoded as JSON.
    """
    init_user_msg(user_id)
    pager = zhihu.Controller()
    while True:
        if pager.is_end():
            break
        page = net.user_answers_spider(
            user_id, pager.next_offset(), const.SORT_BY_DAT)
        if page is None:
            raise ValueError('Response is None')
        try:
            payload = page.json()
            # Let the controller know the total and how far we have advanced.
            pager.totals = payload['paging']['totals']
            items = payload['data']
            pager.increase(len(items))
            for raw_answer in items:
                meta = answer_msg(raw_answer)
                body = BeautifulSoup(raw_answer['content'], 'lxml').body
                answer = document.Answer(body, meta)
                answer.make_markdown(warehouse)
                print(answer.answer_msg())
            # Pause between pages.
            timer.sleep_for(zhihu.SLEEP)
        except JSONDecodeError as err:
            raise err
예제 #4
0
def topic_essence(topic_id, warehouse):
    """Fetch and parse a topic's essence content and dispatch each item.

    Items whose target type is ``'answer'`` go to ``essence_answer`` and
    items whose target type is ``'article'`` go to ``essence_article``;
    any other type is skipped.  The controller is printed once paging ends.

    Raises:
        ValueError: if the spider returns ``None`` for a page.
        JSONDecodeError: if a page body cannot be decoded as JSON.
    """
    # A dedicated controller still needs to be written for this.
    pager = zhihu.Controller(crawl_times=200, limit=10)
    warehouse = topic_warehouse(topic_id, warehouse)
    while True:
        if pager.is_end():
            break
        page = net.topic_essence_spider(topic_id, pager.next_offset())
        if page is None:
            raise ValueError('Response is None')
        try:
            payload = page.json()
            entries = payload['data']
            # The end-of-data flag is taken from the response paging block.
            pager.running_status(payload['paging']['is_end'])
            pager.increase(len(entries))
            for entry in entries:
                target = entry['target']
                kind = target['type']
                if kind == 'answer':
                    essence_answer(target, warehouse)
                elif kind == 'article':
                    essence_article(target, warehouse)
        except JSONDecodeError as err:
            raise err
        # Pause between pages.
        timer.sleep_for(zhihu.SLEEP)
    print(pager)
예제 #5
0
def make_answers_as_book(question_id, warehouse):
    """Append all collectable answers of a question to one markdown file.

    The file name is the question title (from ``net.question_msg_spider``)
    with ``\\`` and ``/`` replaced by ``、`` and the characters ``?*:<>|``
    removed, plus a ``.md`` suffix, placed under ``warehouse``.  Each
    collected answer's ``to_markdown()`` output is written followed by a
    ``---`` separator line.

    The file is opened in append mode.  If a page fails to decode as JSON
    the file is closed and removed before the error propagates; note that
    this also discards anything the file contained before this call.

    Args:
        question_id: identifier passed through to the spiders.
        warehouse: directory in which the book file is created.

    Raises:
        ValueError: if either spider returns ``None``.
        JSONDecodeError: if an answers page cannot be decoded as JSON.
    """
    offset = zhihu.Controller()
    response = net.question_msg_spider(question_id)
    if response is None:
        raise ValueError('Response is None')
    response_json = response.json()
    name = response_json['title']
    # Strip characters that are not usable in a file name.
    name = re.sub(r'[\\/]', '、', name)
    title = re.sub(r'[??*:<>|]', '', name)
    book_path = os.path.join(warehouse, title + '.md')
    # The context manager guarantees the handle is released on every exit
    # path, not only on success or JSONDecodeError.
    with open(book_path, 'a', encoding='utf8') as book:
        while not offset.is_end():
            response = net.answers_spider(question_id, offset.next_offset(),
                                          const.SORT_BY_VOT)
            if response is None:
                raise ValueError('Response is None')
            try:
                response_json = response.json()
                offset.totals = response_json['paging']['totals']
                database = response_json['data']
                offset.increase(len(database))
                for answer_content in database:
                    msg = answer_msg(answer_content)
                    if not offset.to_collect(answer_content):
                        continue
                    content = BeautifulSoup(
                        answer_content['content'], 'lxml').body
                    an = document.Answer(content, msg)
                    an.set_file_name(template='%a-%v')
                    book.write(an.to_markdown())
                    book.write('\n---\n')
                    print(an.answer_msg())
                timer.sleep_for(zhihu.SLEEP)
            except JSONDecodeError:
                # Close before removing: an open file cannot be deleted on
                # some platforms.  The later implicit close is a no-op.
                book.close()
                os.remove(book_path)
                raise