Exemplo n.º 1
0
def run(collect_config: dict):
    """Crawl WeChat official-account articles via Sogou search.

    For each account name in the config, points ``SGWechatSpider`` at the
    Sogou search URL for that account and starts it, counting successes.

    Args:
        collect_config (dict): collector configuration; must contain
            ``wechat_list`` (list of account names) and may contain
            ``delta_time`` (per-request delay in seconds, default 3).
    """
    success_count = 0
    wechat_list = collect_config["wechat_list"]
    delay = collect_config.get("delta_time", 3)

    for name in wechat_list:
        # Configure the spider class for this account, then run it.
        SGWechatSpider.wechat_name = name
        SGWechatSpider.request_config = {
            "RETRIES": 3,
            "DELAY": delay,
            "TIMEOUT": 5,
        }
        search_url = (
            "https://weixin.sogou.com/weixin"
            f"?type=1&query={name}&ie=utf8&s_from=input&_sug_=n&_sug_type_="
        )
        SGWechatSpider.start_urls = [search_url]
        try:
            SGWechatSpider.start(middleware=ua_middleware)
            success_count += 1
        except Exception as e:
            # One failing account must not abort the whole batch.
            LOGGER.error(f"😿 公众号->{name} 文章更新失败! 错误信息: {e}")

    LOGGER.info(f"🤗 微信公众号文章更新完毕({success_count}/{len(wechat_list)})!")
Exemplo n.º 2
0
async def playwright_main(wechat_name: str):
    """Fetch WeChat official-account metadata using playwright.

    Searches for the account on weixin.sogou.com, and when the account
    name matches exactly, opens its latest article and extracts the
    article/account fields into a flat dict.

    Args:
        wechat_name (str): official-account display name to look up.

    Returns:
        dict: extracted article/account data; empty dict when the account
        could not be matched or any step failed.
    """
    wechat_data = {}
    try:
        async with async_playwright() as p:
            # browser = await p.chromium.launch(headless=False)
            browser = await p.chromium.launch()
            context = await browser.new_context(user_agent=Config.SPIDER_UA)
            page = await context.new_page()
            # Search for the official account on Sogou WeChat search
            await page.goto("https://weixin.sogou.com/")
            await page.wait_for_load_state()
            await page.click('input[name="query"]')
            await page.fill('input[name="query"]', wechat_name)
            await asyncio.sleep(1)
            await page.click("text=搜公众号")
            await page.wait_for_load_state()
            # await page.pause()
            # Grab the rendered search-result HTML
            sg_html_handle = await page.query_selector("html")
            sg_html = await sg_html_handle.inner_html()
            if sg_html:
                item_list = []
                async for item in SGWechatItem.get_items(html=sg_html):
                    item_list.append(item)

                if item_list:
                    for target_item in item_list:
                        if target_item.wechat_name == wechat_name:
                            # Only proceed on an exact name match
                            info = f"playwright 匹配公众号 {wechat_name}({target_item.wechat_id}) 成功! 正在提取最新文章: {target_item.latest_title}"
                            LOGGER.info(info)
                            latest_href = target_item.latest_href

                            await page.goto(latest_href)
                            # Wait for the account QR image; once it shows,
                            # the article page is considered fully loaded.
                            try:
                                await page.wait_for_selector(
                                    selector="#js_pc_qr_code_img", timeout=6000
                                )
                            except Exception as _:
                                # Best-effort: the QR image may never appear;
                                # fall through and parse whatever loaded.
                                pass
                            await page.wait_for_load_state()
                            wx_html_handle = await page.query_selector("html")
                            wx_html = await wx_html_handle.inner_html()
                            wechat_item: WechatItem = await WechatItem.get_item(
                                html=wx_html
                            )
                            # Record the current article URL
                            wechat_item.doc_link = page.url
                            doc_source_name = wechat_item.doc_source_name or wechat_name
                            wechat_data = {
                                **wechat_item.results,
                                **{
                                    "doc_id": md5_encryption(
                                        f"{wechat_item.doc_name}_{doc_source_name}"
                                    ),
                                    "doc_source_name": doc_source_name,
                                    "doc_link": wechat_item.doc_link,
                                    "doc_source": wechat_item.doc_source,
                                    "doc_source_account_nick": wechat_item.doc_source_account_nick,
                                    "doc_source_account_intro": wechat_item.doc_source_account_intro,
                                    "doc_content": html_to_text_h2t(wx_html),
                                    "doc_keywords": "",
                                    "doc_html": "",
                                },
                            }
                            break
                    else:
                        info = f"playwright 匹配公众号 {wechat_name} 失败! "
                        LOGGER.error(info)
            else:
                info = f"playwright 抓取 HTML 失败: {wechat_name} "
                LOGGER.error(info)
            await browser.close()
    except Exception as e:
        # Bug fix: the original f-string was "str{e}", which logged the
        # literal text "str" before the exception.
        info = f"playwright 抓取出错: {wechat_name} {e}"
        LOGGER.error(info)
    return wechat_data
Exemplo n.º 3
0
def send_doc(sender_conf: dict):
    """Distribute matching articles to each configured sender.

    Args:
        sender_conf (dict): distribution configuration. Keys:
            sender_list (list): enabled sender types (required).
            query_days (int): look-back window in days, default 2.
            delta_time (int): sleep between dispatches, default 3.
            link_source (str): backing link source, default "self".
            basic_filter (dict): extra MongoDB filter terms.
            ignore_doc_source_name (list): source names to exclude.
            skip_ads (bool): filter on the ad-classifier result.
            custom_filter (dict): per-sender overrides of the above.
            {type}_init_config (dict): per-sender init configuration.
    """
    sender_list = sender_conf["sender_list"]
    query_days = sender_conf.get("query_days", 2)
    delta_time = sender_conf.get("delta_time", 3)
    link_source = sender_conf.get("link_source", "self")
    basic_filter = sender_conf.get("basic_filter", {})
    ignore_doc_source_name = sender_conf.get("ignore_doc_source_name", [])
    skip_ads = sender_conf.get("skip_ads", False)
    if not sender_list:
        # Bug fix: the original called LOGGER.error()("...") — invoking
        # the return value of error() — which raises a TypeError.
        LOGGER.error("未配置分发器!")
        return

    mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")

    # Dispatch to each configured target
    for send_type in sender_list:
        cur_ts = int(time.time())
        custom_filter = sender_conf.get("custom_filter",
                                        {}).get(send_type, {})
        # Bug fix: resolve overrides into per-sender locals. The original
        # rebound the shared defaults (query_days = custom_filter.get(...)),
        # so one sender's custom values leaked into later senders' defaults.
        cur_query_days = custom_filter.get("query_days", query_days)
        cur_delta_time = custom_filter.get("delta_time", delta_time)
        cur_link_source = custom_filter.get("link_source", link_source)
        cur_skip_ads = custom_filter.get("skip_ads", skip_ads)
        cur_ignore_names = custom_filter.get(
            "ignore_doc_source_name", ignore_doc_source_name)
        filter_dict = {
            **basic_filter,
            **{
                # Time window; strictly needed only on the first run
                "doc_ts": {
                    "$gte": cur_ts - (cur_query_days * 24 * 60 * 60),
                    "$lte": cur_ts,
                },
                # Exclude ignored document sources
                "doc_source_name": {
                    "$nin": cur_ignore_names
                },
            },
        }
        if cur_skip_ads:
            filter_dict.update({
                # Require at least one model tag
                "cos_model": {
                    "$exists": True
                },
                # NOTE(review): filters on result == 1, yet the labeling
                # below marks result == 1 as an ad — kept as-is; verify
                # the intended classifier convention.
                "cos_model.result": 1,
            })
        # Dispatch every matching article
        for each_data in coll.find(filter_dict):
            # Fixed for now, for testing
            init_config = sender_conf.get(f"{send_type}_init_config", {})
            cos_model_resp = each_data.get("cos_model", {})
            doc_cus_des = ""
            if cos_model_resp and cur_skip_ads:
                # Label according to the model verdict
                if cos_model_resp["result"] == 1:
                    # Ad label
                    doc_cus_des = f"👿广告[概率:{cos_model_resp['probability']}]"
                else:
                    doc_cus_des = "🤓非广告"

            each_data["doc_cus_des"] = doc_cus_des
            each_data["doc_link"] = get_bak_doc_link(
                link_source=cur_link_source, doc_data=each_data)
            # Sleep between dispatches to avoid hammering targets
            time.sleep(cur_delta_time)
            send_factory(send_type=send_type,
                         init_config=init_config,
                         send_data=each_data)
Exemplo n.º 4
0
def backup_doc(backup_config: dict):
    """Back up recent articles to every configured backup target.

    Args:
        backup_config (dict): backup configuration; ``backup_list`` is
            required, the remaining keys (``query_days``, ``delta_time``,
            ``basic_filter``, ``doc_html_dict``, ``init_config``,
            ``after_get_content``) are optional with defaults.
    """
    backup_list = backup_config["backup_list"]
    query_days = backup_config.get("query_days", 2)
    delta_time = backup_config.get("delta_time", 3)
    basic_filter = backup_config.get("basic_filter", {})
    doc_html_dict = backup_config.get("doc_html_dict", {})
    init_config = backup_config.get("init_config", {})
    after_get_content = backup_config.get("after_get_content", [])
    if not backup_list:
        LOGGER.error("Backup 未配置备份源!")
        return

    mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="liuli_articles")
    now_ts = int(time.time())
    query = dict(basic_filter)
    # Time window; strictly needed only on the first run
    query["doc_ts"] = {
        "$gte": now_ts - (query_days * 24 * 60 * 60),
        "$lte": now_ts,
    }
    db_res = mongodb_find(
        coll_conn=coll,
        filter_dict=query,
        return_dict={
            "_id": 0,
            "doc_source": 1,
            "doc_source_name": 1,
            "doc_core_html": 1,
            "doc_html": 1,
            "doc_name": 1,
            "doc_link": 1,
        },
    )
    if not db_res["status"]:
        LOGGER.error(f"Backup 数据查询失败! {db_res['info']}")
        return

    # Back up every matching article to every target
    for doc in db_res["info"]:
        for backup_type in backup_list:
            # Sleep between backups to avoid hammering targets
            time.sleep(delta_time)
            backup_ins = backup_factory(backup_type=backup_type,
                                        init_config=init_config)
            source = doc["doc_source"]
            # Resolve the final HTML to store for this document source
            html = get_bak_doc_html(
                doc_data=doc,
                doc_html_type=doc_html_dict.get(source, "default"),
            )
            # Run the post-extraction hook chain over the HTML
            for hook in after_get_content:
                hook_conf = deepcopy(hook)
                func_name = hook_conf.pop("func")
                LOGGER.info(
                    f"处理器(backup:after_get_content): {func_name} 正在执行..."
                )
                hook_conf.update({"text": html})
                html = processor_dict[func_name](**hook_conf)
            # Persist the document
            doc["doc_html"] = html
            backup_ins.save(doc)