def get_idx_data(idx_type, interval=None, get_conn=False):
    """
    Fetch event rows that carry a ``verified_type`` in ``more_info``,
    optionally restricted to events tagged with ``idx_type`` in ``mong``.

    :param idx_type: tag to look for inside ``more_info ->> 'mong'`` (tested
        with the PostgreSQL jsonb ``?`` operator). The special value
        ``"mong"`` disables the tag filter.
    :param interval: ``None``/empty selects every row; otherwise a two-item
        sequence ``(start, end)`` applied as
        ``start <= event_start_time < end``.
    :param get_conn: when true, open the connection on ``event_db_obj``
        before the query and close it afterwards; when false the connection
        is left untouched (presumably managed by the caller).
    :return: the result of ``event_db_obj.read_from_table`` for the query.
    """
    # NOTE(review): values are interpolated straight into the SQL text; this
    # is only safe while idx_type / interval come from trusted code paths.
    sql_parts = [
        "SELECT gov_id, ROUND(CAST(actual_value/19 AS NUMERIC), 4) AS affect, "
        "more_info ->> 'verified_type' AS verified_type, "
        "more_info ->> 'sentiment' AS sentiment "
        "FROM %s WHERE more_info ->> 'verified_type' IS NOT NULL" % event_table
    ]
    if interval:
        sql_parts.append(
            " AND event_start_time >= '%s' AND event_start_time < '%s'"
            % (interval[0], interval[1]))
    if idx_type != "mong":
        sql_parts.append(
            " AND more_info ->> 'mong' IS NOT NULL"
            " AND ((more_info ->> 'mong')::jsonb ? '%s')" % idx_type)
    sql_parts.append(" ORDER BY id ASC;")
    sqlstr_ = "".join(sql_parts)

    if not get_conn:
        return event_db_obj.read_from_table(sqlstr_)

    event_db_obj.get_conn()
    try:
        return event_db_obj.read_from_table(sqlstr_)
    finally:
        # Release the connection we opened even if the query raises.
        event_db_obj.disconnect()
def mark_event_verified_type(limit=None):
    """
    Compute ``verified_type`` for events that lack it and write it back into
    their ``more_info`` field.

    Rows are fetched with ``get_none_verified_type_data`` and processed in
    batches of 1000: each batch derives ``verified_type`` from ``url_list``
    via ``get_event_verified_type``, merges it into the existing
    ``more_info`` dict and persists the batch with ``update_db_more_info``.

    :param limit: passed through to ``get_none_verified_type_data``.
    :return: None
    """
    start_time = time.time()
    pending_rows = get_none_verified_type_data(limit)
    df_data = pd.DataFrame(pending_rows)
    print("{} - 取未标记verified_type的事件数据完毕,耗时:{}".format(
        str(datetime.now()), time.time() - start_time), flush=True)

    PROCESS_BATCH = 1000
    event_db_obj.get_conn()
    try:
        for line in range(0, df_data.shape[0], PROCESS_BATCH):
            pre_start = time.time()
            # Work on an independent copy so the source frame is never
            # modified through the slice.
            df_data_ = deepcopy(df_data.iloc[line:line + PROCESS_BATCH])
            df_data_["verified_type"] = df_data_["url_list"].apply(
                get_event_verified_type)
            df_data_["more_info"] = df_data_.apply(
                lambda x: {
                    **x["more_info"],
                    "verified_type": x["verified_type"]
                },
                axis=1)
            print("{} - 预处理-添加verified_type数据完毕,耗时:{}".format(
                str(datetime.now()), time.time() - pre_start), flush=True)

            into_db_start = time.time()
            update_db_more_info(df_data_)
            print("{} - 更新数据库more_info字段完毕,耗时:{}".format(
                str(datetime.now()), time.time() - into_db_start), flush=True)
    finally:
        # Always release the connection, even when a batch fails midway.
        event_db_obj.disconnect()

    print("{} - 更新verified_type全程耗时:{}".format(
        str(datetime.now()), time.time() - start_time), flush=True)
    logger.info("[已完成more_info中verified_type标记] 更新数据:%d" % df_data.shape[0])
def get_none_verified_type_data(limit=None):
    """
    Fetch recent events that have ``more_info`` but no ``verified_type`` yet.

    Only rows whose ``more_info`` is already populated are selected, so this
    step is not mixed up with the job that fills ``more_info`` itself. The
    query is restricted to events whose ``event_start_time`` falls within the
    last 100 days.

    :param limit: optional maximum number of rows; a falsy value means no
        ``LIMIT`` clause is added.
    :return: the result of ``event_db_obj.read_from_table`` for the query.
    """
    _s_date = (datetime.now() - timedelta(days=100)).strftime("%Y-%m-%d")
    sqlstr = (
        "SELECT event_id, event_title, url_list, more_info FROM {} "
        "WHERE event_start_time >= '{}' and more_info is not NULL "
        "AND more_info ->> 'verified_type' is NULL ORDER BY id ASC"
    ).format(event_table, _s_date)
    if limit:
        sqlstr += " LIMIT %d" % limit

    event_db_obj.get_conn()
    try:
        return event_db_obj.read_from_table(sqlstr)
    finally:
        # Release the connection even if the query raises.
        event_db_obj.disconnect()
def get_law_data(interval=None, get_conn=False):
    """
    Fetch law-related event rows that carry a ``verified_type``.

    A row qualifies when ``more_info ->> 'verified_type'`` is not null and
    ``more_info ->> 'sub_cates'`` contains either of the two law-related
    category strings used in the query.

    :param interval: ``None``/empty selects every row; otherwise a two-item
        sequence ``(start, end)`` applied as
        ``start <= event_start_time < end``.
    :param get_conn: when true, open the connection on ``event_db_obj``
        before the query and close it afterwards; when false the connection
        is left untouched (presumably managed by the caller).
    :return: the result of ``event_db_obj.read_from_table`` for the query.
    """
    # Build the first WHERE group once; the optional time window is the only
    # part that differs between the "all rows" and "interval" variants.
    row_filter = "more_info ->> 'verified_type' IS NOT NULL"
    if interval:
        row_filter += (
            " AND event_start_time >= '{}' AND event_start_time < '{}'"
        ).format(interval[0], interval[1])

    sqlstr_ = (
        "SELECT gov_id, ROUND(CAST(actual_value/19 AS NUMERIC), 4) AS affect, "
        "more_info ->> 'verified_type' AS verified_type, "
        "more_info ->> 'sentiment' AS sentiment "
        "FROM {} WHERE ({}) "
        "AND (more_info ->> 'sub_cates' LIKE '%法制%' "
        "OR more_info ->> 'sub_cates' LIKE '%刑法%') "
        "ORDER BY id ASC;"
    ).format(event_table, row_filter)

    if not get_conn:
        return event_db_obj.read_from_table(sqlstr_)

    event_db_obj.get_conn()
    try:
        return event_db_obj.read_from_table(sqlstr_)
    finally:
        # Release the connection we opened even if the query raises.
        event_db_obj.disconnect()
def mark_idxs_main(idx_types: list = UNIFORM_DISPOSAL_INDEXES,
                   mark_by: str = "update",
                   test_mode: bool = False):
    """
    Evaluate and mark every requested index on events, then persist the
    updated ``more_info`` batch by batch. Each index is re-marked by default.

    :param idx_types: index types to evaluate; each one is applied to every
        row through ``dispose_more_info_mong_idx``.
    :param mark_by: forwarded to ``get_events_data_from_db``. Per the original
        notes: ``"update"`` handles rows that do not have the ``mong`` field
        yet, ``"renew"`` re-runs all rows.
    :param test_mode: when true, fetch at most ``TEST_LIMIT`` rows.
    :return: number of rows processed; ``0`` when there was nothing to update.
    """
    start_time = time.time()
    event_db_obj.get_conn()
    try:
        data_num = TEST_LIMIT if test_mode else None
        events_data = get_events_data_from_db(mark_by, limit=data_num)
        df_events = pd.DataFrame(events_data)
        if not df_events.shape[0]:
            print("没有待更新的数据!", flush=True)
            # The finally clause below still closes the connection here.
            return 0

        # Turn the title value into a string via get_event_title_str.
        df_events["event_title"] = df_events["event_title"].apply(
            get_event_title_str)
        # Drop characters that cannot be represented in GBK.
        df_events["first_content"] = df_events["first_content"].apply(
            lambda x: x.encode('gbk', 'ignore').decode('gbk'))
        print("{} - 取数据并预处理完毕!耗时:{}".format(
            str(datetime.now()), time.time() - start_time), flush=True)

        for line in range(0, df_events.shape[0], PROCESS_BATCH):
            batch_no = line // PROCESS_BATCH + 1
            data_df = deepcopy(df_events.iloc[line:line + PROCESS_BATCH])

            mark_start = time.time()
            for idx_type_ in idx_types:
                # Bind the current index type explicitly to avoid relying on
                # late-binding closure semantics.
                data_df["more_info"] = data_df.apply(
                    lambda x, idx=idx_type_: dispose_more_info_mong_idx(
                        x, idx),
                    axis=1)
            print("{} - batch={}:指标判断完毕!耗时:{}".format(
                str(datetime.now()), batch_no,
                time.time() - mark_start), flush=True)

            update_db_start = time.time()
            update_db_more_info(data_df)
            print("{} - batch={}:更新数据完毕!耗时:{}".format(
                str(datetime.now()), batch_no,
                time.time() - update_db_start), flush=True)
    finally:
        # Release the connection on every exit path: normal completion,
        # the empty-data early return, and exceptions.
        event_db_obj.disconnect()

    print("{} - 全程耗时:{}".format(
        str(datetime.now()), time.time() - start_time), flush=True)
    return df_events.shape[0]
def mark_more_info(test_mode=False):
    """
    Fill the ``more_info`` field for events that do not have it yet.

    Rows come from ``get_none_more_info_data``; after light preprocessing
    they are handled in batches of 500: each row's title and first content
    are sent through ``nlp_process_single`` and the result is stored with
    ``update_db_more_info``.

    :param test_mode: when true, fetch at most 10 rows and print the frame
        after the NLP step.
    :return: None
    """
    TEST_LIMIT = 10
    PROCESS_BATCH = 500

    start_time = time.time()
    event_db_obj.get_conn()
    try:
        if test_mode:
            events_datas = get_none_more_info_data(TEST_LIMIT)
        else:
            events_datas = get_none_more_info_data()
        print("{} - 取数据完毕!耗时:{}".format(
            str(datetime.now()), time.time() - start_time), flush=True)

        process_start = time.time()
        df_events = pd.DataFrame(events_datas)
        if not df_events.shape[0]:
            print("more_info字段无缺失!", flush=True)
            # The finally clause below still closes the connection here.
            return

        # Keyword list -> keyword string.
        df_events["event_title"] = df_events["event_title"].apply(
            get_event_title_str)
        # Drop characters that cannot be represented in GBK.
        df_events["first_content"] = df_events["first_content"].apply(
            lambda x: x.encode('gbk', 'ignore').decode('gbk'))
        print("{} - 数据预处理完毕!耗时:{}".format(
            str(datetime.now()), time.time() - process_start), flush=True)

        for line in range(0, df_events.shape[0], PROCESS_BATCH):
            df_events_ = deepcopy(df_events.iloc[line:line + PROCESS_BATCH])

            nlp_start = time.time()
            # Default dict handed to nlp_process_single as ``default_result``
            # (used when the input is empty). Rebuilt for every batch in case
            # the callee mutates it.
            empty_result = {
                "positive": 0.5,
                "sentiment": "中",
                "category": "社会",
                "cate_score": 0.5,
                "sub_cates": [],
                "summary": ""
            }
            df_events_["more_info"] = df_events_.apply(
                lambda x: nlp_process_single(
                    data_dict={
                        "title": x["event_title"],
                        "content": x["first_content"]
                    },
                    default_result=empty_result),
                axis=1)
            print("{} - NLP分析完毕!耗时:{}".format(
                str(datetime.now()), time.time() - nlp_start), flush=True)

            if test_mode:
                # NOTE(review): this prints the full source frame, not the
                # batch that just received ``more_info`` - confirm intent.
                print(df_events, flush=True)

            update_start = time.time()
            update_db_more_info(df_events_)
            print("{} - 更新数据完毕!耗时:{}".format(
                str(datetime.now()), time.time() - update_start), flush=True)
    finally:
        # Release the connection on every exit path: normal completion,
        # the empty-data early return, and exceptions.
        event_db_obj.disconnect()

    print("{} - 全程耗时:{}".format(
        str(datetime.now()), time.time() - start_time), flush=True)
    logger.info("[已完成调用BaiduNlp接口,更新xmd_basic_info中的more_info字段] 更新数据:%d" %
                df_events.shape[0])