def run(collect_config: dict): """微信公众号文章抓取爬虫 Args: collect_config (dict, optional): 采集器配置 """ s_nums = 0 wechat_list = collect_config["wechat_list"] delta_time = collect_config.get("delta_time", 3) for wechat_name in wechat_list: SGWechatSpider.wechat_name = wechat_name SGWechatSpider.request_config = { "RETRIES": 3, "DELAY": delta_time, "TIMEOUT": 5, } t_url = f"https://weixin.sogou.com/weixin?type=1&query={wechat_name}&ie=utf8&s_from=input&_sug_=n&_sug_type_=" SGWechatSpider.start_urls = [t_url] try: SGWechatSpider.start(middleware=ua_middleware) s_nums += 1 except Exception as e: err_msg = f"😿 公众号->{wechat_name} 文章更新失败! 错误信息: {e}" LOGGER.error(err_msg) msg = f"🤗 微信公众号文章更新完毕({s_nums}/{len(wechat_list)})!" LOGGER.info(msg)
def load_data_to_articlles(input_data: dict):
    """Persist fetched article data to the liuli_articles collection.

    Returns:
        bool: True when the document was upserted successfully
    """
    # Crawl/persist status
    flag = False
    doc_source_name = input_data.get("doc_source_name")
    doc_source = input_data.get("doc_source")
    doc_name = input_data.get("doc_name")
    copy_input_data = deepcopy(input_data)
    copy_input_data["doc_ts"] = int(copy_input_data.get("doc_ts", int(time.time())))
    if doc_source_name and doc_source and doc_name:
        # Fetch succeeded, persist the document
        mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
        coll_conn = mongo_base.get_collection(coll_name="liuli_articles")
        filter_dict = {"doc_id": copy_input_data["doc_id"]}
        update_data = {"$set": copy_input_data}
        db_res = mongodb_update_data(
            coll_conn=coll_conn,
            filter_dict=filter_dict,
            update_data=update_data,
            upsert=True,
        )
        if db_res["status"]:
            msg = f"Article from {doc_source} persisted! 👉 {doc_source_name}: {doc_name}"
            flag = True
        else:
            msg = f"Article from {doc_source} failed to persist! 👉 {doc_source_name} {db_res['info']}"
    else:
        msg = f"Article from {doc_source} failed to fetch! 👉 {doc_source}/{doc_source_name}/{doc_name}"
    LOGGER.info(msg)
    return flag
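
# A minimal persistence sketch, assuming MongoDB is reachable via Config.MONGODB_CONFIG;
# every field value below is a placeholder.
if __name__ == "__main__":
    demo_data = {
        "doc_id": md5_encryption("demo_title_demo_source"),  # hypothetical id
        "doc_source_name": "demo_source",
        "doc_source": "liuli_feed",
        "doc_name": "demo_title",
    }
    # Returns True on a successful upsert; doc_ts is filled in automatically
    print(load_data_to_articlles(demo_data))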
def ad_marker(
    cos_value: float = 0.6,
    is_force=False,
    basic_filter=None,
    **kwargs,
):
    """Mark subscribed articles that look like ads.

    Args:
        cos_value (float): cosine-similarity threshold, default 0.6
        is_force (bool): whether to force re-evaluation of already marked articles
        basic_filter (dict): extra query conditions
    """
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")
    if is_force:
        query = {}
    else:
        query = {"cos_model": {"$exists": False}}
    query.update(basic_filter or {})
    # Find unmarked articles and judge them with the similarity model
    for each_data in coll.find(query):
        doc_name = each_data["doc_name"]
        doc_source_name = each_data["doc_source_name"]
        doc_content = each_data["doc_content"]
        doc_keywords = each_data.get("doc_keywords")
        if not doc_keywords:
            keyword_list = extract_keyword_list(doc_content)
            doc_keywords = " ".join(keyword_list)
            each_data["doc_keywords"] = doc_keywords
        # Cosine-similarity prediction
        cos_model_resp = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={"text": doc_name + doc_keywords, "cos_value": cos_value},
        ).to_dict()
        each_data["cos_model"] = cos_model_resp
        if cos_model_resp["result"] == 1:
            LOGGER.info(
                f"[{doc_source_name}] {doc_name} flagged as ad [{cos_model_resp['probability']}], link: {each_data['doc_link']}"
            )
        coll.update_one(
            filter={"doc_id": each_data["doc_id"]},
            update={"$set": each_data},
            upsert=True,
        )
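
# A minimal marking sketch, assuming articles already sit in liuli_articles;
# the basic_filter value is a hypothetical narrowing of the scan.
if __name__ == "__main__":
    ad_marker(
        cos_value=0.6,
        is_force=False,  # only articles without a cos_model field get scanned
        basic_filter={"doc_source": "liuli_feed"},  # hypothetical filter
    )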
def update_ads_tag(is_force=False):
    """Mark subscribed articles that look like ads (2c_articles collection).

    :param is_force: whether to force re-evaluation of already marked articles
    :return:
    """
    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    if is_force:
        query = {}
    else:
        query = {"cos_model": {"$exists": False}}
    # Find unmarked articles and judge them with the pre-built similarity model
    for each_data in coll.find(query):
        doc_name = each_data["doc_name"]
        doc_link = each_data["doc_link"]
        doc_source_name = each_data["doc_source_name"]
        doc_keywords = each_data.get("doc_keywords")
        if not doc_keywords:
            keyword_list = fetch_keyword_list(doc_link)
            doc_keywords = " ".join(keyword_list)
            each_data["doc_keywords"] = doc_keywords
        # Cosine-similarity prediction
        cos_model_resp = model_predict_factory(
            model_name="cos",
            model_path="",
            input_dict={"text": doc_name + doc_keywords, "cos_value": Config.COS_VALUE},
        ).to_dict()
        each_data["cos_model"] = cos_model_resp
        if cos_model_resp["result"] == 1:
            LOGGER.info(
                f"[{doc_source_name}] {doc_name} flagged as ad [{cos_model_resp['probability']}], link: {each_data['doc_link']}"
            )
        coll.update_one(
            filter={"doc_id": each_data["doc_id"]},
            update={"$set": each_data},
            upsert=True,
        )
def run(collect_config: dict): """微信公众号文章抓取爬虫 Args: collect_config (dict, optional): 采集器配置 """ s_nums = 0 wechat_list = collect_config["wechat_list"] delta_time = collect_config.get("delta_time", 5) for name in wechat_list: time.sleep(delta_time) input_data = asyncio.run(playwright_main(name)) # 持久化,必须执行 flag = load_data_to_articlles(input_data) if flag: s_nums += 1 msg = f"🤗 微信公众号文章更新完毕({s_nums}/{len(wechat_list)})" LOGGER.info(msg)
def run(collect_config: dict): """rss解析,rss条目持久化 Args: collect_config (dict, optional): 采集器配置 """ feeds_dict: dict = collect_config.get("feeds_dict") feeds_name: list = list(feeds_dict) delta_time = collect_config.get("delta_time", 1) for name in feeds_name: LOGGER.info(f"rss源 {name}: {feeds_dict[name]}") fd = feedparser.parse(feeds_dict[name]) for entry in fd.entries: LOGGER.info(entry.link) # 休眠 time.sleep(delta_time) resp_text = get_html_by_requests( url=entry.link, headers={"User-Agent": Config.SPIDER_UA} ) _, doc_core_html = extract_core_html(resp_text) doc_core_html_lib = text_compress(doc_core_html) input_data = { "doc_date": entry.get("published", ""), "doc_image": "", "doc_name": entry.get("title", ""), "doc_ts": int(time.time()), "doc_link": entry.get("link", ""), "doc_source_meta_list": [], "doc_keywords": " ", "doc_des": entry.get("description", ""), "doc_core_html": doc_core_html_lib, "doc_type": "article", "doc_author": "", "doc_source_name": name, "doc_id": md5_encryption(f"{entry.get('title', '')}_{name}"), "doc_source": "liuli_feed", "doc_source_account_nick": "", "doc_source_account_intro": "", "doc_content": "", "doc_html": "", } load_data_to_articlles(input_data) msg = "🤗 liuli_feed 采集器执行完毕" LOGGER.info(msg)
def send_doc(sender_conf: dict):
    """Distribute articles to the configured senders.

    Args:
        sender_conf (dict): distribution configuration
    """
    sender_list = sender_conf["sender_list"]
    query_days = sender_conf.get("query_days", 2)
    delta_time = sender_conf.get("delta_time", 3)
    link_source = sender_conf.get("link_source", "self")
    basic_filter = sender_conf.get("basic_filter", {})
    ignore_doc_source_name = sender_conf.get("ignore_doc_source_name", [])
    skip_ads = sender_conf.get("skip_ads", False)
    if sender_list:
        # Senders are configured, proceed
        mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
        coll = mongo_base.get_collection(coll_name="liuli_articles")
        # Distribute to each target in turn
        for send_type in sender_list:
            # Build the query, applying per-sender overrides
            cur_ts = int(time.time())
            custom_filter = sender_conf.get("custom_filter", {}).get(send_type, {})
            query_days = custom_filter.get("query_days", query_days)
            delta_time = custom_filter.get("delta_time", delta_time)
            link_source = custom_filter.get("link_source", link_source)
            skip_ads = custom_filter.get("skip_ads", skip_ads)
            ignore_doc_source_name = custom_filter.get(
                "ignore_doc_source_name", ignore_doc_source_name
            )
            filter_dict = {
                **basic_filter,
                **{
                    # Time window; after the first run this could be dropped
                    "doc_ts": {
                        "$gte": cur_ts - (query_days * 24 * 60 * 60),
                        "$lte": cur_ts,
                    },
                    # Exclude ignored document sources
                    "doc_source_name": {"$nin": ignore_doc_source_name},
                },
            }
            if skip_ads:
                filter_dict.update(
                    {
                        # At least one model tag must be present
                        "cos_model": {"$exists": True},
                        # Judged as non-ad (result == 1 means ad, see ad_marker)
                        "cos_model.result": {"$ne": 1},
                    }
                )
            # Iterate over all distributable articles
            for each_data in coll.find(filter_dict):
                # Fixed for now, for testing
                init_config = sender_conf.get(f"{send_type}_init_config", {})
                cos_model_resp = each_data.get("cos_model", {})
                doc_cus_des = ""
                if cos_model_resp and skip_ads:
                    # The model has judged this document
                    if cos_model_resp["result"] == 1:
                        # Ad tag
                        doc_cus_des = f"👿 Ad [probability: {cos_model_resp['probability']}]"
                    else:
                        doc_cus_des = "🤓 Not an ad"
                each_data["doc_cus_des"] = doc_cus_des
                each_data["doc_link"] = get_bak_doc_link(
                    link_source=link_source, doc_data=each_data
                )
                # Sleep a while between sends
                time.sleep(delta_time)
                send_factory(
                    send_type=send_type, init_config=init_config, send_data=each_data
                )
    else:
        LOGGER.error("No sender configured!")
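
# A minimal distribution sketch, assuming a "wecom" sender type is registered
# with send_factory (an assumption; use whatever sender types your deployment has).
if __name__ == "__main__":
    send_doc(
        {
            "sender_list": ["wecom"],  # hypothetical sender type
            "query_days": 2,
            "skip_ads": True,
            # Per-sender overrides win over the top-level defaults
            "custom_filter": {"wecom": {"delta_time": 3}},
        }
    )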
async def playwright_main(wechat_name: str):
    """Fetch official-account metadata with Playwright; see the output format above.

    Args:
        wechat_name (str): official-account name
    """
    wechat_data = {}
    try:
        async with async_playwright() as p:
            # browser = await p.chromium.launch(headless=False)  # debug with a visible browser
            browser = await p.chromium.launch()
            context = await browser.new_context(user_agent=Config.SPIDER_UA)
            page = await context.new_page()
            # Search for the official account
            await page.goto("https://weixin.sogou.com/")
            await page.wait_for_load_state()
            await page.click('input[name="query"]')
            await page.fill('input[name="query"]', wechat_name)
            await asyncio.sleep(1)
            await page.click("text=搜公众号")
            await page.wait_for_load_state()
            # Grab the latest article title
            sg_html_handle = await page.query_selector("html")
            sg_html = await sg_html_handle.inner_html()
            if sg_html:
                item_list = []
                async for item in SGWechatItem.get_items(html=sg_html):
                    item_list.append(item)
                if item_list:
                    for target_item in item_list:
                        if target_item.wechat_name == wechat_name:
                            # Continue only on an exact name match
                            info = f"playwright matched account {wechat_name}({target_item.wechat_id})! Extracting latest article: {target_item.latest_title}"
                            LOGGER.info(info)
                            latest_href = target_item.latest_href
                            await page.goto(latest_href)
                            # Once the account QR image has loaded, treat the page as fully loaded
                            try:
                                await page.wait_for_selector(
                                    selector="#js_pc_qr_code_img", timeout=6000
                                )
                            except Exception as _:
                                pass
                            await page.wait_for_load_state()
                            wx_html_handle = await page.query_selector("html")
                            wx_html = await wx_html_handle.inner_html()
                            wechat_item: WechatItem = await WechatItem.get_item(html=wx_html)
                            # Current WeChat article URL
                            wechat_item.doc_link = page.url
                            doc_source_name = wechat_item.doc_source_name or wechat_name
                            wechat_data = {
                                **wechat_item.results,
                                **{
                                    "doc_id": md5_encryption(
                                        f"{wechat_item.doc_name}_{doc_source_name}"
                                    ),
                                    "doc_source_name": doc_source_name,
                                    "doc_link": wechat_item.doc_link,
                                    "doc_source": wechat_item.doc_source,
                                    "doc_source_account_nick": wechat_item.doc_source_account_nick,
                                    "doc_source_account_intro": wechat_item.doc_source_account_intro,
                                    "doc_content": html_to_text_h2t(wx_html),
                                    "doc_keywords": "",
                                    "doc_html": "",
                                },
                            }
                            break
                else:
                    info = f"playwright failed to match account {wechat_name}!"
                    LOGGER.error(info)
            else:
                info = f"playwright failed to fetch HTML: {wechat_name}"
                LOGGER.error(info)
            await browser.close()
    except Exception as e:
        info = f"playwright crawl error: {wechat_name} {e}"
        LOGGER.error(info)
    return wechat_data
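
# A minimal standalone sketch for the coroutine above; the account name is a placeholder.
if __name__ == "__main__":
    data = asyncio.run(playwright_main("some_account_name"))  # hypothetical account
    # On success the dict carries doc_id / doc_link / doc_content etc.; it is empty on failure
    print(data.get("doc_link", "fetch failed"))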
def backup_doc(backup_config: dict):
    """Back up articles to the configured targets.

    Args:
        backup_config (dict): backup configuration
    """
    backup_list = backup_config["backup_list"]
    query_days = backup_config.get("query_days", 2)
    delta_time = backup_config.get("delta_time", 3)
    basic_filter = backup_config.get("basic_filter", {})
    doc_html_dict = backup_config.get("doc_html_dict", {})
    init_config = backup_config.get("init_config", {})
    after_get_content = backup_config.get("after_get_content", [])
    if backup_list:
        mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
        coll = mongo_base.get_collection(coll_name="liuli_articles")
        cur_ts = int(time.time())
        filter_dict = {
            **basic_filter,
            **{
                # Time window; after the first run this could be dropped
                "doc_ts": {
                    "$gte": cur_ts - (query_days * 24 * 60 * 60),
                    "$lte": cur_ts,
                }
            },
        }
        db_res = mongodb_find(
            coll_conn=coll,
            filter_dict=filter_dict,
            return_dict={
                "_id": 0,
                "doc_source": 1,
                "doc_source_name": 1,
                "doc_core_html": 1,
                "doc_html": 1,
                "doc_name": 1,
                "doc_link": 1,
            },
        )
        if db_res["status"]:
            # Iterate over all articles eligible for backup
            for each_data in db_res["info"]:
                for each in backup_list:
                    # Sleep a while between backups
                    time.sleep(delta_time)
                    backup_ins = backup_factory(backup_type=each, init_config=init_config)
                    # Document source
                    doc_source = each_data["doc_source"]
                    # Resolve the final payload to store
                    doc_html = get_bak_doc_html(
                        doc_data=each_data,
                        doc_html_type=doc_html_dict.get(doc_source, "default"),
                    )
                    # Run post-fetch hook functions on the content
                    for func_dict in after_get_content:
                        cur_func_dict = deepcopy(func_dict)
                        func_name = cur_func_dict.pop("func")
                        LOGGER.info(
                            f"Processor (backup:after_get_content): {func_name} running..."
                        )
                        cur_func_dict.update({"text": doc_html})
                        doc_html = processor_dict[func_name](**cur_func_dict)
                    # Save
                    each_data["doc_html"] = doc_html
                    backup_ins.save(each_data)
        else:
            LOGGER.error(f"Backup data query failed! {db_res['info']}")
    else:
        LOGGER.error("Backup: no backup targets configured!")
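
# A minimal backup sketch, assuming a "github" backup type is registered with
# backup_factory (an assumption; use whatever backup targets your deployment has).
if __name__ == "__main__":
    backup_doc(
        {
            "backup_list": ["github"],  # hypothetical backup type
            "query_days": 2,
            "delta_time": 3,
            "init_config": {},
            "after_get_content": [],
        }
    )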
def run(collect_config: dict): """书籍目录抓取爬虫 Args: collect_config (dict, optional): 采集器配置 """ book_dict: dict = collect_config["book_dict"] delta_time = collect_config.get("delta_time", 5) latest_chapter_nums = collect_config.get("latest_chapter_nums", 1) for book_name, book_url in book_dict.items(): resp_text = get_html_by_requests(book_url) all_chapters = extract_chapters(chapter_url=book_url, html=resp_text) latest_chapter_nums = (latest_chapter_nums if len(all_chapters) > latest_chapter_nums else len(all_chapters)) latest_chapter_list = all_chapters[-latest_chapter_nums:] for latest_chapter in latest_chapter_list: doc_name = latest_chapter.get("chapter_name") doc_link = latest_chapter.get("chapter_url") # 休眠 time.sleep(delta_time) resp_text = get_html_by_requests( url=doc_link, headers={"User-Agent": Config.SPIDER_UA}) _, doc_core_html = extract_core_html(resp_text) # 压缩为二进制进行存储 doc_core_html_lib = text_compress(doc_core_html) input_data = { "doc_date": "", "doc_image": "", "doc_name": doc_name, "doc_ts": int(time.time()), "doc_link": doc_link, "doc_source_meta_list": [], "doc_keywords": " ".join(extract_keyword_list(html_to_text_h2t(resp_text))), "doc_des": "", "doc_core_html": doc_core_html_lib, "doc_type": "article", "doc_author": "", "doc_source_name": book_name, "doc_id": md5_encryption(f"{doc_name}_{book_name}"), "doc_source": "liuli_book", "doc_source_account_nick": "", "doc_source_account_intro": "", "doc_content": "", "doc_html": "", } # 持久化,必须执行 load_data_to_articlles(input_data) msg = "🤗 liuli_book 采集器执行完毕" LOGGER.info(msg)