title_set.add(var) # filter_title except KeyError: continue res_list.append(temp_list) res_list = [i for i in res_list if i[0] != i[2]] df_blood = pd.DataFrame(res_list, columns=['nums', 'POLICY_ID', 'POLICY_TITLE', 'FATHER_ID', 'FATHER_TITLE']) df_blood.to_csv('./test/policy_bj_blood.txt', index=False, header=None, encoding='utf-8', sep='\t') if __name__ == '__main__': # Init LoadData Class Load = LoadData() FLAG_VERIFY = True # 是否验证跟新数据在目前库中存在 # Load Policy Content Data (after run_policy_detail) df_content = Load.load_data('./test/policy_bj_content.txt') # Extract Father Policy df_temp = df_content[['id', 'title', 'content', '政策背景', '支持内容']].copy() df_etc = extract_policy(df_temp) # Load All Policy Title from Local File To Match Father # Firstly Match Policy list, Second Match Whole Pool with No Match Policy pool_policy_1 = Load.load_data('./file/policy_content_include.txt') # Which Policy include Content # pool_policy_2 = Load.load_data('./file/policy_content_exclude.txt') # Which Policy exclude Content if FLAG_VERIFY: # 通过 FLAG_VERIFY 返回值,判断是否直接更新 policy_content_include.txt表 还是后续手动更新 df_etc, pool_policy_new = verify_title(df_etc, pool_policy_1) # verify_title(df_etc, pool_policy_2, father_dict) # 通过 FLAG_VERIFY 返回值,判断是否直接更新 policy_content_include.txt表 还是后续手动更新
# NOTE(review): this chunk begins mid-function — the enclosing def (which
# builds res_list) is above the visible region; indentation is reconstructed.
    # Persist the match results (each policy paired with its most similar
    # historical policy and the similarity score) as a headerless
    # tab-separated file; exact duplicate rows are dropped first.
    df_match = pd.DataFrame(res_list, columns=[
        'nums', 'POLICY_ID', 'POLICY_TITLE', 'SIMILARITY_ID',
        'SIMILARITY_TITLE', 'SIMILARITY_PROB'
    ])
    df_match.drop_duplicates(keep='first', inplace=True)
    df_match.to_csv('./test/policy_bj_match.txt', index=False, header=None,
                    encoding='utf-8', sep='\t')


if __name__ == '__main__':
    # Init LoadData Class
    Load = LoadData()
    # Load Policy List Data (after run_policy_list)
    df_list = Load.load_data('./test/policy_bj_list.txt')
    # Load All Policy Title from Local File To Match
    history_policy = Load.load_data(
        './file/policy_content_include.txt')  # Which Policy include Content
    df_etc = match_policy(df_list, history_policy)
    dict_policy = trans_dict(df_list, history_policy)
    # Save the parent/child ("blood") lineage relations.
    save_blood(df_etc, dict_policy)
# NOTE(review): this chunk begins mid-function — the enclosing def (a text
# cleaner iterating over `lines` with `index`/`temp` in scope) is above the
# visible region; indentation is reconstructed.
        # Strip lowercase HTML entities (e.g. &nbsp;) and surrounding
        # whitespace; remember lines that become empty so they can be dropped.
        temp = re.sub(r'&[a-z]{1,};', '', temp)
        temp = temp.strip()
        if temp == '':
            del_index.append(index)
    if del_index:
        # Drop the lines that became empty after cleaning.
        lines = [i for index, i in enumerate(lines) if index not in del_index]
    each_line = split_data(lines, sep_tag)
    return each_line


if __name__ == '__main__':
    # Init LoadData Class
    Load = LoadData()
    # Load Policy Content List Data (after run_policy_list)
    df_list = Load.load_data('./test/policy_bj_content_list.txt', 'title')
    print(df_list.shape)
    # clear Data
    df_list.loc[:, 'content'] = df_list['content'].map(clear_data)
    # predict Data
    model = Classify()
    df_list = model.paragraph_classify(_df=df_list)
    df_list = df_list[[
        'id', 'title', 'content', '政策背景', '支持内容', '申报条件', '申报材料',
        '申报方式', '其他内容', 'originalLink'
    ]]
    # Flatten each content cell (a sequence of paragraph strings) back into
    # a single string.
    df_list.loc[:, 'content'] = df_list[['content']].applymap(lambda x: ''.join(x))
    print(df_list.shape)
# coding:utf-8 import numpy as np import warnings import clear_data as cd from load_data import LoadData warnings.filterwarnings("ignore") np.random.seed(0) if __name__ == '__main__': # Init LoadData Class Load = LoadData() # Load zg_bj Data df = Load.load_data('./test/clear_zg_bj_0908.txt', 'title') # Load.save_data(df, './test/clear_zg_bj_0831.txt') # 保存清洗后的数据 # # Load All Data # df_match = Load.load_data('./clear/clear_zg_all_0831.txt', 'title') # # Load.save_data(df_match, './test/clear_zg_all_0831.txt') # 保存数据 # # nums = df_match.shape[0] cols = [ 'title', 'level', 'scope', 'source', 'province', 'city', 'county', 'power', 'funds', 'set', 'txt_id', 'originalLink', 'start_time', 'end_time', 'content' ] df_ = df[cols].copy() # Fill province, city, county df_fill = cd.fill_data(df_)