def answers(question_id, warehouse):
    """Crawl the answers of a question and render each collected one to Markdown.

    Pages are requested through ``net.answers_spider`` (sorted by
    ``const.SORT_BY_VOT``) until the ``zhihu.Controller`` reports the end.
    Every item accepted by ``Controller.to_collect`` is wrapped in a
    ``document.Answer`` and passed to ``make_markdown``.

    Args:
        question_id: Identifier handed to the spider and to
            ``question_warehouse``.
        warehouse: Base location; it is replaced by the value returned from
            ``question_warehouse(question_id, warehouse)`` before any output.

    Raises:
        ValueError: If the spider returns ``None`` for a page.
        JSONDecodeError: If a page body cannot be decoded as JSON.
    """
    pager = zhihu.Controller()
    warehouse = question_warehouse(question_id, warehouse)

    while not pager.is_end():
        resp = net.answers_spider(question_id, pager.next_offset(), const.SORT_BY_VOT)
        if resp is None:
            raise ValueError('Response is None')

        try:
            payload = resp.json()
            pager.totals = payload['paging']['totals']
            page_items: list = payload['data']
            pager.increase(len(page_items))

            for item in page_items:
                # The message is built before the collection filter is asked,
                # so answer_msg runs for skipped items as well.
                meta = answer_msg(item)
                if pager.to_collect(item):
                    soup = BeautifulSoup(item['content'], 'lxml')
                    answer_doc = document.Answer(soup.body, meta)
                    answer_doc.set_file_name(template='%a-%v')
                    answer_doc.make_markdown(warehouse)
                    print(answer_doc.answer_msg())

            # NOTE(review): the source's indentation was lost; the pause is
            # assumed to happen once per fetched page -- confirm.
            timer.sleep_for(zhihu.SLEEP)
        except JSONDecodeError:
            # Undecodable payloads are propagated to the caller unchanged.
            raise
def articles_id(column_id):
    """Collect the ids of the articles listed in a column.

    Pages are requested through ``net.column_spider`` until the
    ``zhihu.Controller`` reports the end. The ids are scraped from the raw
    response text with a regular expression rather than by decoding JSON.

    Args:
        column_id: Identifier handed to ``net.column_spider``.

    Returns:
        list: Every id matched across all pages, as digit strings (the
        captured groups returned by ``re.findall``), in the order encountered.

    Raises:
        ValueError: If the spider returns ``None`` for a page, or if a page
            does not contain a ``"totals"`` field.
    """
    article_list = []
    offset = zhihu.Controller()

    while not offset.is_end():
        print(GET_ARTICLES_ID)
        response = net.column_spider(column_id, offset.next_offset())
        if response is None:
            raise ValueError('Response is None')

        content = response.text

        # re.search returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on ``None.group``.
        totals_match = re.search(r'"totals":\W(\d+)', content)
        if totals_match is None:
            raise ValueError('"totals" not found in column response')
        offset.totals = int(totals_match.group(1))

        page_ids = re.findall(r'"id":\W(\d+)', content)
        offset.increase(len(page_ids))
        article_list.extend(page_ids)

        timer.random_sleep(end=zhihu.SLEEP)

    return article_list
def user_answers(user_id, warehouse):
    """Crawl the answers written by a user and render each one to Markdown.

    ``init_user_msg(user_id)`` is called first. Pages are then requested
    through ``net.user_answers_spider`` (sorted by ``const.SORT_BY_DAT``)
    until the ``zhihu.Controller`` reports the end. Every item on a page is
    wrapped in a ``document.Answer`` and passed to ``make_markdown``.

    Args:
        user_id: Identifier handed to ``init_user_msg`` and to the spider.
        warehouse: Passed unchanged to ``Answer.make_markdown``.

    Raises:
        ValueError: If the spider returns ``None`` for a page.
        JSONDecodeError: If a page body cannot be decoded as JSON.
    """
    init_user_msg(user_id)
    pager = zhihu.Controller()

    while not pager.is_end():
        resp = net.user_answers_spider(user_id, pager.next_offset(), const.SORT_BY_DAT)
        if resp is None:
            raise ValueError('Response is None')

        try:
            payload = resp.json()
            pager.totals = payload['paging']['totals']
            page_items: list = payload['data']
            pager.increase(len(page_items))

            for item in page_items:
                meta = answer_msg(item)
                soup = BeautifulSoup(item['content'], 'lxml')
                answer_doc = document.Answer(soup.body, meta)
                answer_doc.make_markdown(warehouse)
                print(answer_doc.answer_msg())

            # NOTE(review): the source's indentation was lost; the pause is
            # assumed to happen once per fetched page -- confirm.
            timer.sleep_for(zhihu.SLEEP)
        except JSONDecodeError:
            # Undecodable payloads are propagated to the caller unchanged.
            raise
def topic_essence(topic_id, warehouse):
    """Fetch and parse a topic's essence content, routing items by type.

    Each item whose target type is ``'answer'`` goes to ``essence_answer`` and
    each whose target type is ``'article'`` goes to ``essence_article``; items
    of any other type are ignored.

    Args:
        topic_id: Identifier handed to the spider and to ``topic_warehouse``.
        warehouse: Base location; it is replaced by the value returned from
            ``topic_warehouse(topic_id, warehouse)`` before dispatching.

    Raises:
        ValueError: If the spider returns ``None`` for a page.
        JSONDecodeError: If a page body cannot be decoded as JSON.
    """
    # TODO: a dedicated controller still needs to be written for this crawl.
    pager = zhihu.Controller(crawl_times=200, limit=10)
    warehouse = topic_warehouse(topic_id, warehouse)

    while not pager.is_end():
        resp = net.topic_essence_spider(topic_id, pager.next_offset())
        if resp is None:
            raise ValueError('Response is None')

        try:
            payload = resp.json()
            page_items: list = payload['data']
            # The API's own end-of-data flag is handed to the controller.
            pager.running_status(payload['paging']['is_end'])
            pager.increase(len(page_items))

            for entry in page_items:
                target = entry['target']
                if target['type'] == 'answer':
                    essence_answer(target, warehouse)
                elif target['type'] == 'article':
                    essence_article(target, warehouse)
        except JSONDecodeError:
            # Undecodable payloads are propagated to the caller unchanged.
            raise

        timer.sleep_for(zhihu.SLEEP)
        # NOTE(review): the source's indentation was lost; this print is
        # assumed to run once per page rather than once after the loop --
        # confirm.
        print(pager)
def make_answers_as_book(question_id, warehouse):
    """Append the collected answers of a question to one Markdown "book" file.

    The question title is fetched through ``net.question_msg_spider`` and
    sanitised into a file name. Answers are then paged through
    ``net.answers_spider`` (sorted by ``const.SORT_BY_VOT``), and every item
    accepted by ``Controller.to_collect`` is appended to
    ``<warehouse>/<title>.md`` followed by a ``---`` separator line.

    Args:
        question_id: Identifier handed to both spiders.
        warehouse: Directory in which the book file is opened (append mode).

    Raises:
        ValueError: If either spider returns ``None``.
        JSONDecodeError: If a response body cannot be decoded as JSON. When
            this happens while paging answers, the book file is deleted
            before the error propagates.
    """
    offset = zhihu.Controller()

    response = net.question_msg_spider(question_id)
    if response is None:
        raise ValueError('Response is None')

    name = response.json()['title']
    # Path separators are replaced; the remaining listed characters are dropped.
    name = re.sub(r'[\\/]', '、', name)
    # NOTE(review): the doubled '?' in this class is redundant and looks like
    # a mis-encoded full-width question mark -- confirm against the upstream.
    title = re.sub(r'[??*:<>|]', '', name)

    book_path = os.path.join(warehouse, title + '.md')

    # The context manager guarantees the handle is released on every exit
    # path, including the ValueError below and any error raised while
    # processing a page.
    with open(book_path, 'a', encoding='utf8') as book:
        while not offset.is_end():
            response = net.answers_spider(question_id, offset.next_offset(), const.SORT_BY_VOT)
            if response is None:
                raise ValueError('Response is None')

            try:
                response_json = response.json()
                offset.totals = response_json['paging']['totals']
                database: list = response_json['data']
                offset.increase(len(database))

                for answer_content in database:
                    msg = answer_msg(answer_content)
                    if not offset.to_collect(answer_content):
                        continue
                    content = BeautifulSoup(answer_content['content'], 'lxml').body
                    an = document.Answer(content, msg)
                    an.set_file_name(template='%a-%v')
                    book.write(an.to_markdown())
                    book.write('\n---\n')
                    print(an.answer_msg())

                # NOTE(review): the source's indentation was lost; the pause
                # is assumed to happen once per fetched page -- confirm.
                timer.sleep_for(zhihu.SLEEP)
            except JSONDecodeError:
                # Close explicitly before deleting (an open file cannot be
                # removed on some platforms); the context manager's later
                # close on the already-closed file is a no-op.
                book.close()
                os.remove(book_path)
                raise