예제 #1
0
def get_idx_data(idx_type, interval=None, get_conn=False):
    """
    Fetch verified events, optionally restricted to one ``mong`` index tag.

    The query selects ``gov_id``, ``affect`` (``actual_value / 19`` rounded to
    four decimals), ``verified_type`` and ``sentiment`` from ``event_table``
    for every row whose ``more_info ->> 'verified_type'`` is not NULL, ordered
    by ``id`` ascending.

    :param idx_type: tag looked up inside ``more_info ->> 'mong'`` with the
        jsonb ``?`` operator. The literal value ``"mong"`` switches this
        filter off, so no ``mong`` condition is added at all.
    :param interval: falsy (``None`` / empty) to read every row; otherwise a
        two-item sequence ``(start, end)`` applied as
        ``start <= event_start_time < end``.
    :param get_conn: when true this function opens the connection before the
        query and closes it afterwards; when false the caller is expected to
        manage the connection on ``event_db_obj``.
    :return: whatever ``event_db_obj.read_from_table`` returns for the query.
    """
    # NOTE(review): table name, interval bounds and idx_type are interpolated
    # straight into the SQL text. Only pass trusted values; switch to bound
    # parameters if read_from_table supports them.
    sqlstr_ = "SELECT gov_id, ROUND(CAST(actual_value/19 AS NUMERIC), 4) AS affect, more_info ->> 'verified_type' AS verified_type, more_info ->> 'sentiment' AS sentiment FROM %s WHERE more_info ->> 'verified_type' IS NOT NULL" % event_table

    if interval:
        sqlstr_ += " AND event_start_time >= '%s' AND event_start_time < '%s'" % (
            interval[0], interval[1])

    if idx_type != "mong":
        sqlstr_ += " AND more_info ->> 'mong' IS NOT NULL  AND ((more_info ->> 'mong')::jsonb ? '%s')" % idx_type

    sqlstr_ += " ORDER BY id ASC;"

    if not get_conn:
        # Caller owns the connection; just run the query.
        return event_db_obj.read_from_table(sqlstr_)

    event_db_obj.get_conn()
    try:
        return event_db_obj.read_from_table(sqlstr_)
    finally:
        # Release the connection we opened even if the query raises.
        event_db_obj.disconnect()
def mark_event_verified_type(limit=None):
    """
    Fill in ``more_info["verified_type"]`` for events that do not have it yet.

    Rows are loaded with ``get_none_verified_type_data``; for each batch the
    ``verified_type`` is derived from the row's ``url_list`` through
    ``get_event_verified_type``, merged into the existing ``more_info`` dict
    and written back with ``update_db_more_info``. Timing information is
    printed for every stage and a summary line is logged at the end.

    :param limit: forwarded unchanged to ``get_none_verified_type_data``.
    :return: None
    """
    PROCESS_BATCH = 1000

    start_time = time.time()

    # The loader opens and closes its own connection.
    none_verified_type_data = get_none_verified_type_data(limit)
    df_data = pd.DataFrame(none_verified_type_data)

    print("{} - 取未标记verified_type的事件数据完毕,耗时:{}".format(
        str(datetime.now()),
        time.time() - start_time),
          flush=True)

    event_db_obj.get_conn()
    try:
        for line in range(0, df_data.shape[0], PROCESS_BATCH):
            pre_start = time.time()

            # Work on an independent copy so the column assignments below do
            # not write into a view of df_data.
            df_data_ = df_data.iloc[line:line + PROCESS_BATCH].copy()

            df_data_["verified_type"] = df_data_["url_list"].apply(
                get_event_verified_type)

            # Build a new dict per row: existing more_info keys plus the
            # freshly computed verified_type.
            df_data_["more_info"] = df_data_.apply(
                lambda row: {
                    **row["more_info"], "verified_type": row["verified_type"]
                },
                axis=1)

            print("{} - 预处理-添加verified_type数据完毕,耗时:{}".format(
                str(datetime.now()),
                time.time() - pre_start),
                  flush=True)

            into_db_start = time.time()

            update_db_more_info(df_data_)

            print("{} - 更新数据库more_info字段完毕,耗时:{}".format(
                str(datetime.now()),
                time.time() - into_db_start),
                  flush=True)
    finally:
        # Always release the connection, including when a batch fails.
        event_db_obj.disconnect()

    print("{} - 更新verified_type全程耗时:{}".format(str(datetime.now()),
                                               time.time() - start_time),
          flush=True)
    logger.info("[已完成more_info中verified_type标记] 更新数据:%d" % df_data.shape[0])
def get_none_verified_type_data(limit=None):
    """
    Fetch recent events whose ``more_info`` exists but lacks ``verified_type``.

    Only rows with ``event_start_time`` within the last 100 days (computed
    from the local clock, date precision) are considered. Rows with a NULL
    ``more_info`` are excluded so this step does not overlap with the job
    that populates ``more_info`` itself. Results are ordered by ``id``.

    The function opens its own connection on ``event_db_obj`` and closes it
    before returning.

    :param limit: optional maximum number of rows; a falsy value (``None`` or
        ``0``) means no ``LIMIT`` clause is added.
    :return: whatever ``event_db_obj.read_from_table`` returns for the query,
        with columns ``event_id``, ``event_title``, ``url_list`` and
        ``more_info`` selected.
    """
    _s_date = (datetime.now() - timedelta(days=100)).strftime("%Y-%m-%d")
    sqlstr = "SELECT event_id, event_title, url_list, more_info FROM {} WHERE event_start_time >= '{}' and more_info is not NULL AND more_info ->> 'verified_type' is NULL ORDER BY id ASC".format(
        event_table, _s_date)

    if limit:
        # %d coerces the value to an integer before it reaches the SQL text.
        sqlstr += " LIMIT %d" % limit

    event_db_obj.get_conn()
    try:
        return event_db_obj.read_from_table(sqlstr)
    finally:
        # Release the connection even if the query raises.
        event_db_obj.disconnect()
예제 #4
0
def get_law_data(interval=None, get_conn=False):
    """
    Fetch verified events whose sub-categories relate to law.

    A row qualifies when ``more_info ->> 'verified_type'`` is not NULL and the
    text of ``more_info ->> 'sub_cates'`` contains either ``法制`` or ``刑法``.
    The query returns ``gov_id``, ``affect`` (``actual_value / 19`` rounded to
    four decimals), ``verified_type`` and ``sentiment``, ordered by ``id``.

    :param interval: falsy (``None`` / empty) to read every row; otherwise a
        two-item sequence ``(start, end)`` applied as
        ``start <= event_start_time < end``.
    :param get_conn: when true this function opens the connection before the
        query and closes it afterwards; when false the caller is expected to
        manage the connection on ``event_db_obj``.
    :return: whatever ``event_db_obj.read_from_table`` returns for the query.
    """
    # NOTE(review): the interval bounds are interpolated straight into the
    # SQL text. Only pass trusted values.
    if interval:
        time_filter = " AND event_start_time >= '{}' AND event_start_time < '{}'".format(
            interval[0], interval[1])
    else:
        time_filter = ""

    # Single template for both cases; the optional time filter sits inside
    # the first parenthesised group, exactly where the original queries had it.
    sqlstr_ = "SELECT gov_id, ROUND(CAST(actual_value/19 AS NUMERIC), 4) AS affect, more_info ->> 'verified_type' AS verified_type, more_info ->> 'sentiment' AS sentiment FROM {} WHERE (more_info ->> 'verified_type' IS NOT NULL{}) AND (more_info ->> 'sub_cates' LIKE '%法制%' OR more_info ->> 'sub_cates' LIKE '%刑法%') ORDER BY id ASC;".format(
        event_table, time_filter)

    if not get_conn:
        # Caller owns the connection; just run the query.
        return event_db_obj.read_from_table(sqlstr_)

    event_db_obj.get_conn()
    try:
        return event_db_obj.read_from_table(sqlstr_)
    finally:
        # Release the connection we opened even if the query raises.
        event_db_obj.disconnect()
예제 #5
0
def mark_idxs_main(idx_types: list = UNIFORM_DISPOSAL_INDEXES,
                   mark_by: str = "update",
                   test_mode: bool = False) -> int:
    """
    Evaluate the given indexes for stored events and write the result back.

    Events are loaded with ``get_events_data_from_db``, lightly cleaned, then
    processed in chunks of ``PROCESS_BATCH`` rows: for every index type the
    ``more_info`` column is recomputed through ``dispose_more_info_mong_idx``
    and the chunk is persisted with ``update_db_more_info``. Timing
    information is printed for every stage.

    :param idx_types: index types to evaluate; every one of them is
        re-evaluated for each row.
    :param mark_by: forwarded to ``get_events_data_from_db``. Per the original
        notes, ``"update"`` targets rows that have no ``mong`` field yet and
        ``"renew"`` re-processes all rows - confirm against that helper.
    :param test_mode: when true, at most ``TEST_LIMIT`` rows are requested.
    :return: number of rows processed; ``0`` when there was nothing to update.
    """
    start_time = time.time()
    data_num = TEST_LIMIT if test_mode else None

    event_db_obj.get_conn()
    try:
        events_data = get_events_data_from_db(mark_by, limit=data_num)
        df_events = pd.DataFrame(events_data)

        if not df_events.shape[0]:
            print("没有待更新的数据!", flush=True)
            # The finally clause below still closes the connection.
            return 0

        df_events["event_title"] = df_events["event_title"].apply(
            get_event_title_str)
        # Round-trip through GBK to drop characters that encoding cannot
        # represent.
        df_events["first_content"] = df_events["first_content"].apply(
            lambda text: text.encode('gbk', 'ignore').decode('gbk'))

        print("{} - 取数据并预处理完毕!耗时:{}".format(str(datetime.now()),
                                            time.time() - start_time),
              flush=True)

        for line in range(0, df_events.shape[0], PROCESS_BATCH):
            batch_no = line // PROCESS_BATCH + 1
            # Independent copy so column assignment does not hit a view.
            data_df = df_events.iloc[line:line + PROCESS_BATCH].copy()

            mark_start = time.time()
            for idx_type_ in idx_types:
                # more_info is reassigned on every pass, so each index type
                # sees the value produced by the previous one.
                data_df["more_info"] = data_df.apply(
                    lambda row: dispose_more_info_mong_idx(row, idx_type_),
                    axis=1)

            print("{} - batch={}:指标判断完毕!耗时:{}".format(
                str(datetime.now()), batch_no,
                time.time() - mark_start),
                  flush=True)

            update_db_start = time.time()
            update_db_more_info(data_df)
            print("{} - batch={}:更新数据完毕!耗时:{}".format(
                str(datetime.now()), batch_no,
                time.time() - update_db_start),
                  flush=True)
    finally:
        # Runs on normal completion, on the early return and on errors, so
        # the connection is never left open.
        event_db_obj.disconnect()

    print("{} - 全程耗时:{}".format(str(datetime.now()),
                                time.time() - start_time),
          flush=True)
    return df_events.shape[0]
def mark_more_info(test_mode=False):
    """
    Populate the ``more_info`` column for events that are missing it.

    Rows come from ``get_none_more_info_data``. After light cleaning they are
    processed in chunks of 500: each row's title and first content are passed
    to ``nlp_process_single`` and the returned value becomes the row's
    ``more_info``, which is then persisted with ``update_db_more_info``.
    Timing information is printed for every stage and a summary line is
    logged at the end.

    :param test_mode: when true, only 10 rows are requested and each analysed
        batch is printed for inspection.
    :return: None
    """
    TEST_LIMIT = 10
    PROCESS_BATCH = 500

    start_time = time.time()
    event_db_obj.get_conn()
    try:
        if test_mode:
            events_datas = get_none_more_info_data(TEST_LIMIT)
        else:
            events_datas = get_none_more_info_data()

        print("{} - 取数据完毕!耗时:{}".format(str(datetime.now()),
                                        time.time() - start_time),
              flush=True)

        process_start = time.time()
        df_events = pd.DataFrame(events_datas)

        if not df_events.shape[0]:
            print("more_info字段无缺失!", flush=True)
            # The finally clause below still closes the connection.
            return

        # Per the original comment: turn the keyword list into one string.
        df_events["event_title"] = df_events["event_title"].apply(
            get_event_title_str)
        # Round-trip through GBK to drop characters that encoding cannot
        # represent.
        df_events["first_content"] = df_events["first_content"].apply(
            lambda text: text.encode('gbk', 'ignore').decode('gbk'))

        print("{} - 数据预处理完毕!耗时:{}".format(str(datetime.now()),
                                          time.time() - process_start),
              flush=True)

        for line in range(0, df_events.shape[0], PROCESS_BATCH):
            # Independent copy so column assignment does not hit a view.
            df_events_ = df_events.iloc[line:line + PROCESS_BATCH].copy()

            nlp_start = time.time()
            # Result handed to nlp_process_single as its default_result
            # (per the original comment, used when the input is empty).
            empty_result = {
                "positive": 0.5,
                "sentiment": "中",
                "category": "社会",
                "cate_score": 0.5,
                "sub_cates": [],
                "summary": ""
            }
            df_events_["more_info"] = df_events_.apply(
                lambda row: nlp_process_single(
                    data_dict={
                        "title": row["event_title"],
                        "content": row["first_content"]
                    },
                    default_result=empty_result),
                axis=1)

            print("{} - NLP分析完毕!耗时:{}".format(str(datetime.now()),
                                              time.time() - nlp_start),
                  flush=True)

            if test_mode:
                # Show the batch that was just analysed; the source frame
                # does not carry the new more_info values.
                print(df_events_, flush=True)

            update_start = time.time()
            update_db_more_info(df_events_)
            print("{} - 更新数据完毕!耗时:{}".format(str(datetime.now()),
                                             time.time() - update_start),
                  flush=True)
    finally:
        # Runs on normal completion, on the early return and on errors, so
        # the connection is never left open.
        event_db_obj.disconnect()

    print("{} - 全程耗时:{}".format(str(datetime.now()),
                                time.time() - start_time),
          flush=True)
    logger.info("[已完成调用BaiduNlp接口,更新xmd_basic_info中的more_info字段] 更新数据:%d" %
                df_events.shape[0])