def fetch_latest(params):
    """Download the latest news (images included), store it and index it.

    :param params: request parameters; not read by this function.
    :return: None
    """
    client = daily.ZhiHu()

    # Pull the latest listing and derive the id list and date string from it.
    listing = client.get_latest_news()
    listed_ids = _extract_news_ids(listing)
    day = _extract_date_str(listing)

    # Keep only the ids that are not stored yet (per the helper's name),
    # then process them in reversed order.
    missing_ids = _not_exists_news_ids(day, listed_ids)
    missing_ids.reverse()

    # Fetch news bodies plus image payloads, then persist the images.
    pending = _store_images(_get_news_list(missing_ids), day)

    # Persist the news records themselves.
    _store_news_list(pending)

    # Build the index from the bare news objects.
    news_to_index = []
    for entry in pending:
        news_to_index.append(entry['news'])
    _index_news_list(news_to_index)
def _get_news_list(news_ids):
    """Fetch every news item and its image for the given ids.

    An id whose fetch raises is logged (with traceback) and skipped, so the
    result may be shorter than ``news_ids``.

    :param news_ids: iterable of ids, each passed to ``ZhiHu.get_news``.
    :return: list of dicts with the keys ``news``, ``image_type``,
        ``image_data`` and ``image_url``.
    """
    client = daily.ZhiHu()
    collected = []
    for news_id in news_ids:
        try:
            item = client.get_news(news_id)

            # Use 'image' when present, otherwise fall back to 'theme_image'.
            if 'image' in item:
                picture_url = item['image']
            else:
                picture_url = item['theme_image']

            # Download the image.
            picture_type, picture_data = _fetch_image(item['share_url'],
                                                      picture_url)
            collected.append({
                'news': item,
                'image_type': picture_type,
                'image_data': picture_data,
                'image_url': picture_url,
            })
        except Exception as err:
            # Best effort: one failing id must not abort the whole batch.
            trace = traceback.format_exc()
            logging.error("fetch latest error %s\n%s" % (err, trace))
    return collected
def fetch(params):
    """Download news (images included) and store it.

    When ``params`` has a ``'date'`` key, the listing is requested via
    ``get_before_news`` with the first value of ``params['date']``;
    otherwise the latest listing is used.

    :param params: mapping of request parameters; ``params['date']`` is
        indexed with ``[0]`` when present.
    :return: None
    """
    client = daily.ZhiHu()
    if 'date' in params:
        listing = client.get_before_news(params['date'][0])
    else:
        listing = client.get_latest_news()

    # Derive the id list and the date string from the listing itself.
    listed_ids = _extract_news_ids(listing)
    day = _extract_date_str(listing)

    # Keep only the ids that are not stored yet (per the helper's name),
    # then process them in reversed order.
    missing_ids = _not_exists_news_ids(day, listed_ids)
    missing_ids.reverse()

    # Fetch news bodies plus image payloads.
    # NOTE(review): this calls _fetch_news_list, while fetch_latest and
    # fetch_before call _get_news_list; confirm _fetch_news_list is defined
    # elsewhere in the file and that the difference is intentional.
    pending = _fetch_news_list(missing_ids)

    # Persist the images, then the news records.
    pending = _store_images(pending, day)
    _store_news_list(pending)
def fetch_before(params):
    """Download the news of a given day, store it and index it.

    :param params: mapping of request parameters; must contain ``'date'``,
        whose first element is passed to ``ZhiHu.get_before_news``.
    :return: None
    :raises OperationException: when ``'date'`` is missing from ``params``.
    """
    if 'date' not in params:
        raise OperationException("lack of param date")
    requested_day = params['date'][0]

    client = daily.ZhiHu()

    # Pull the listing for the requested day; the date string used for
    # storage is re-derived from the listing rather than taken from params.
    listing = client.get_before_news(requested_day)
    listed_ids = _extract_news_ids(listing)
    day = _extract_date_str(listing)

    # Keep only the ids that are not stored yet (per the helper's name),
    # then process them in reversed order.
    missing_ids = _not_exists_news_ids(day, listed_ids)
    missing_ids.reverse()

    # Fetch news bodies plus image payloads, then persist the images.
    pending = _store_images(_get_news_list(missing_ids), day)

    # Persist the news records themselves.
    _store_news_list(pending)

    # Build the index from the bare news objects.
    news_to_index = []
    for entry in pending:
        news_to_index.append(entry['news'])
    _index_news_list(news_to_index)