def main_CI_DI(self, p, q, data_check_name="", best_pd_name="", original_name=""):
    """Compose the CI (composite index) and DI (diffusion index) from the
    best-correlated indicators, plot them, and save the composite as CSV.

    p, q: forwarded to self.__get_best_zhibiao — presumably selection
        thresholds/counts for picking indicators; TODO confirm against
        that helper's definition.
    data_check_name: CSV/Excel file with the processed indicator data
        (output of get_best_corr_data).
    best_pd_name: file holding the best-correlation table
        (output of get_best_corr_data).
    original_name: original base-series file; passed to self.zhibiao_res
        to restore predictions onto the original level series.

    Side effects: shows matplotlib figures and writes a CSV file;
    returns None. Does nothing unless both data_check_name and
    best_pd_name are non-empty.
    """
    clean_tool_ = clean_tool.Pd_Info()
    # Both input file names are required; silently no-op otherwise.
    if len(data_check_name) and len(best_pd_name):
        pf_base = clean_tool_.data_day2month(data_check_name)
        best_pd = clean_tool_.get_data_file(best_pd_name)
        pf_cols = pf_base.columns.tolist()
        # Names of the best-correlated indicator series.
        best_name_df_list = self.__get_best_zhibiao(best_pd, pf_base, p, q)
        init_pfs = self.__get_ci(best_name_df_list)  # composite index (CI)
        DI_PF = self.__get_DI(best_name_df_list)     # diffusion index (DI)
        DI_PF.plot(title=DI_PF.name)
        # Align CI to the tail of the base index (CI is shorter than pf_base).
        init_pfs.index = pf_base.index[len(pf_base) - len(init_pfs):]
        pfs_all = pd.concat([init_pfs, pf_base[pf_cols[0]]], axis=1)
        #pfs_all = pfs_all.apply(self.hp_lb_c)
        # First three CI columns are named leading/coincident/lagging
        # ("超前"/"一致"/"滞后"); the 4th column keeps its original name.
        # NOTE(review): assumes init_pfs has exactly 3 columns — confirm
        # against self.__get_ci.
        name_list = ["超前", "一致", "滞后", pfs_all.columns.tolist()[3]]
        pfs_all.columns = name_list
        # Keep only the leading indicator and the base series.
        pfs_all = pfs_all[[pfs_all.columns.tolist()[0], pfs_all.columns.tolist()[-1]]]
        time_item = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
        # Append the prediction columns derived from the composite.
        pfs_all = self.zhibiao_res(pfs_all, original_name, time_item)
        cols = pfs_all.columns.tolist()
        print("pfs_all", pfs_all)
        #pfs_all = pfs_all.rolling(window=2).mean().dropna()  # 2-month moving average
        # Second-to-last column on a secondary y-axis for scale separation.
        ax = pfs_all.plot(use_index=True, y=cols, figsize=(10, 6),
                          secondary_y=cols[-2:-1],
                          title="CI指标对比(%s)" % data_check_name)
        file_name = "合成超前_%s_%s.csv" % (data_check_name, time_item)
        pfs_all["date"] = pfs_all.index
        pfs_all.to_csv(file_name, index=False)
        plt.show()
def hp_plot_test():
    """Plot every prediction CSV in ./predict_2 against the bond base series.

    For each CSV file: merges it with the monthly bond data, renames the
    columns, saves the merged table under a name rebuilt from the original
    file-name fields, normalizes, shifts the leading indicator 6 months,
    and saves a comparison PNG.

    Side effects only (CSV + PNG files, figures); returns None.
    Depends on module-level clean_tool and sibling function get_guiyi.
    """
    filedir_path = os.path.join(os.getcwd(), "predict_2")
    file_list = os.listdir(filedir_path)
    clean_tool_ = clean_tool.Pd_Info()
    # Base series: monthly-averaged government-bond data.
    pf_or = clean_tool_.data_day2month("国债数据_月平均 (1).xlsx")
    for file_name in file_list:
        # Only process *.csv files.
        if not re.search("csv$", file_name):
            continue
        file_path = os.path.join(filedir_path, file_name)
        print("file_path", file_path)
        split_file_name = file_name.split("_")
        pf_old = clean_tool_.data_day2month(file_path)
        pf_all = pd.concat([pf_old, pf_or], axis=1, sort=True)
        # Columns: leading indicator / HP-filtered / original value.
        pf_all.columns = ["超前指标", "hp滤波", "原始值"]
        # Reassemble 9 fields of the underscore-delimited file name;
        # NOTE(review): assumes the file names follow the exact naming
        # scheme produced elsewhere in this project — confirm.
        list_name = split_file_name[:3] + split_file_name[7:10] + \
            split_file_name[4:6] + split_file_name[-1:]
        print(list_name)
        file_name_mse = "%s_%s_%s_%s_%s_%s_(%s_%s)_MSE_%s" % tuple(list_name)
        pf_all.to_csv(file_name_mse)
        pf_all = pf_all.apply(get_guiyi)  # normalize each column
        # Plot and save.
        cols = pf_all.columns.tolist()
        # Shift the leading indicator forward 6 months for visual alignment.
        pf_all[cols[0]] = pf_all[cols[0]].shift(6)
        ax = pf_all.plot(use_index=True, y=cols, figsize=(10, 6),
                         secondary_y=cols[1:],
                         title="CI指标对比(%s)" % file_name_mse.split(".")[0])
        plt.savefig('%s.png' % file_name_mse.split(".")[0], dpi=200)
def zhibiao_res(self, pf, original_name, time_item):
    """Extend the base series with predictions derived from the composite.

    pf: DataFrame whose column 0 is the composite (leading) series and
        column 1 is the base series.
    original_name: if non-empty, file of the original level series used to
        restore the predicted year-over-year values to levels.
    time_item: timestamp string; NOTE(review): currently unused in this
        body — confirm whether it was meant for an output file name.

    Returns a DataFrame combining the input with the predicted series.
    """
    cols = pf.columns.tolist()
    # Best lead/lag step within an 8-step search window.
    a, b, best_corr_step, best_corr = self.__get_best_corr(pf, 8)
    #best_corr_step=-8
    # Convert the tail of the composite into relative changes against the
    # last observed base value.
    pd_hecheng = pf[cols[0]][best_corr_step - 1:]
    pd_hecheng_rate = (pf[cols[0]][best_corr_step:] -
                       pf[cols[0]][best_corr_step - 1]) / abs(pf[cols[1]][-1])
    print("pd_hecheng", pd_hecheng)
    print("pd_hecheng_rate", pd_hecheng_rate)
    #pd_pridict = [ i*pf[cols[1]][-1]/pd_hecheng[best_corr_step-1] for i in pd_hecheng[best_corr_step:]]
    # Project forward: scale the last base value by (1 + relative change).
    pd_pridict = [(i + 1) * pf[cols[1]][-1] for i in pd_hecheng_rate]
    print("pd_pridict", pd_pridict)
    print("best_corr_step", pf[cols[1]][-1])
    # Observed base series followed by the predicted tail, on a fresh
    # monthly index starting at the input's first date.
    list_tb = pf[cols[1]].tolist() + pd_pridict
    tb_pd = pd.Series(list_tb,
                      index=pd.date_range(start=pf.index[0],
                                          periods=len(list_tb), freq="M"))
    tb_pd.index = tb_pd.index.map(lambda x: x.strftime('%Y-%m'))
    tb_pd.name = "%s_预测" % cols[1]
    # Restore to levels using the original series (the predicted values are
    # treated as year-over-year percentages applied to a 12-month lag).
    if original_name:
        clean_tool_ = clean_tool.Pd_Info()
        pf_or = clean_tool_.data_day2month(original_name)
        pf_all = pd.concat([tb_pd, pf_or], axis=1, sort=True)
        cols = pf_all.columns.tolist()
        print(pf_all)
        pf_all["%s_预测" % cols[1]] = pf_all[cols[1]].shift(12) * (1 + pf_all[cols[0]] / 100)
        print(pf_all)
        return pf_all
    return pd.concat([pf, tb_pd], axis=1, sort=True)
def X13_cycle_main(): ccd = CCD.CI_AND_DI("dict_type11-25.csv", ) # 获取数据 # 循环: name = "宏观月频_填充.xlsx" # 宏观数据 ccd.get_best_corr_data(name, move_len=12, data_process="original") raise clean_tool_ = clean_tool.Pd_Info() pf_all = clean_tool_.data_day2month(name).dropna() print(pf_all) pf_all = pf_all.truncate(before="%s-%s" % ("2008", "2".zfill(2))) for i in range(12, len(pf_all), 12): if i + 89 <= len(pf_all): # pf_cut = pf_all[i:i+89] # print(pf_cut) # cut_name = "%s_%s_%s.xlsx"%(name.split(".")[0],pf_cut.index[0],pf_cut.index[-1]) # pf_cut.to_excel(cut_name) # X13_file_name = cycle_X13.get_x13(cut_name,pf_cut.index[0].split("-")[0],pf_cut.index[0].split("-")[1]) # #X13_file_name = "X13_宏观月频_填充_2008-02_2015-06_2019-11-24 15-59-52.csv" # #合并同比和宏观数据 base_name = "CBA02701.CS_month.xlsx" # pf_base = clean_tool_.data_day2month(base_name).dropna() # pf_base = pf_base[pf_base.columns.tolist()[0]] # pf_cut_x13 = clean_tool_.data_day2month(X13_file_name) # pf_tb = get_tongbi(pf_base) #获取同比序列 # pf_tb.name = "%s_同比(12月)"%base_name.split(".")[0] # pf_cut_x13_and_tb = pd.concat([pf_tb,pf_cut_x13],axis = 1,sort =False).dropna() # print(pf_cut_x13_and_tb) # pf_cut_x13_and_tb["date"] = pf_cut_x13_and_tb.index # cut_name_x13_and_tb = "%s_and_%s.xlsx"%(X13_file_name.split(".")[0],base_name.split(".")[0]) # pf_cut_x13_and_tb.to_excel(cut_name_x13_and_tb,index=False) # #CBA02701 # #获取最优相关 # data_check_name,best_pd_name = ccd.get_best_corr_data(cut_name_x13_and_tb,move_len=12,data_process ="original") #获取周期数据及最优相关系数 # #2017 # # data_check_name = "X13_宏观月频_填充_2010-02_2017-06_2019-11-25 08-57-57_and_CBA02701_ 原始数据处理后_original_original __2019-11-25 09-16-59.csv" # # best_pd_name = "X13_宏观月频_填充_2010-02_2017-06_2019-11-25 08-57-57_and_CBA02701_ 最优相关系数_original_original __2019-11-25 09-16-59.csv" # # #2019 # # data_check_name = "X13_宏观月频_填充_2012-02_2019-06_2019-11-25 09-39-31_and_CBA02701_ 原始数据处理后_original_original __2019-11-25 10-00-03.csv" # # best_pd_name = 
"X13_宏观月频_填充_2012-02_2019-06_2019-11-25 09-39-31_and_CBA02701_ 最优相关系数_original_original __2019-11-25 10-00-03.csv" # #2018 # # data_check_name = "X13_宏观月频_填充_2011-02_2018-06_2019-11-25 09-17-36_and_CBA02701_ 原始数据处理后_original_original __2019-11-25 09-38-27.csv" # # best_pd_name = "X13_宏观月频_填充_2011-02_2018-06_2019-11-25 09-17-36_and_CBA02701_ 最优相关系数_original_original __2019-11-25 09-38-27.csv" # # #2016 # # data_check_name = "X13_宏观月频_填充_2009-02_2016-06_2019-11-24 17-00-31_and_CBA02701_ 原始数据处理后_original_original __2019-11-24 17-20-25.csv" # # best_pd_name = "X13_宏观月频_填充_2009-02_2016-06_2019-11-24 17-00-31_and_CBA02701_ 最优相关系数_original_original __2019-11-24 17-20-25.csv" # # #2015 # data_check_name = "X13_宏观月频_填充_2008-02_2015-06_2019-11-24 15-59-52_and_CBA02701_ 原始数据处理后_original_original __2019-11-24 16-57-13.csv" # best_pd_name = "X13_宏观月频_填充_2008-02_2015-06_2019-11-24 15-59-52_and_CBA02701_ 最优相关系数_original_original __2019-11-24 16-57-13.csv" # 根据最优相关,合成CI ccd.main_CI_DI(3, 10, best_pd_name=best_pd_name, data_check_name=data_check_name, original_name=base_name) raise
def get_best_corr_data(self, base_file_name, index_file_name="", move_len=12, data_process="hp"):
    """Compute best lead/lag correlations between a base series and indicators.

    base_file_name: file whose first column is the base series; if
        index_file_name is empty, its remaining columns are the indicators.
    index_file_name: optional separate indicator file.
    move_len: maximum shift (in months) searched for the best correlation.
    data_process: preprocessing mode — 'hp_cycle' (HP filter + Gaussian
        smoothing), 'hp' (HP filter only), or 'original'/'' (raw data).

    Returns (data_check_name, best_pd_name): the CSV file names of the
    processed data and the best-correlation table (both written to disk).

    Raises ValueError on bad file-name arguments or an unknown
    data_process mode.
    """
    clean_tool_ = clean_tool.Pd_Info()
    # Single-file mode: first column is the base, the rest are indicators.
    if index_file_name == "":
        pf_all = clean_tool_.data_day2month(base_file_name)
        #print(pf_all)
        pf_all_cols = pf_all.columns.tolist()
        pf_base = pf_all[pf_all_cols[0]]
        pf_indexs = pf_all[pf_all_cols[1:]]
    elif index_file_name and base_file_name:
        pf_base = clean_tool_.data_day2month(base_file_name)
        pf_indexs = clean_tool_.data_day2month(index_file_name)
    else:
        # BUG FIX: `raise("...")` raised a plain str, which is itself a
        # TypeError in Python 3; raise a proper exception instead.
        raise ValueError("参数错误,文件名不对")
    pf_all_new = pd.concat([pf_base, pf_indexs], axis=1)
    #pf_all_new= pf_all_new[59:] #12年后
    pf_all__new_cols = pf_all_new.columns.tolist()  # column names
    data_check = ""
    if data_process == "hp_cycle":
        hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
        data_check = hp_lb_c_all.apply(self.__fft_gau)  # Gaussian smoothing
    elif data_process == "hp":
        hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
        data_check = hp_lb_c_all
    elif data_process == "original" or data_process == "":
        data_check = pf_all_new
    else:
        # BUG FIX: same string-raise defect as above.
        raise ValueError("data_process ;只支持 'hp_cycle', 'hp', 'original'3种类型")
    print("data_check", data_check)
    str_all_list = []
    print("*" * 100)
    print("开始计算相关系数")
    # Correlate the base column against every indicator column.
    for item in pf_all__new_cols[1:]:
        print("-" * 50)
        print("%s 和 %s" % (pf_all__new_cols[0], item))
        item_pf = data_check[item].dropna()
        item_list = self.__get_data_corr(data_check[pf_all__new_cols[0]], item_pf, move_len)
        str_all_list.append(item_list)
    best_pd = self.__to_pds(str_all_list, data_process)  # best-correlation table
    time_item = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
    # #加入date,并移到第一列
    # data_check["date"] =data_check.index
    # col_name = data_check.columns.tolist()
    # col_name.insert(0,"date")
    # data_check = data_check.reindex(columns=col_name)
    data_check_name = "%s_%s 原始数据处理后_%s_%s __%s.csv" % (base_file_name.split(".")[0],
        index_file_name.split(".")[0], data_process, data_process, time_item)
    best_pd_name = "%s_%s 最优相关系数_%s_%s __%s.csv" % (base_file_name.split(".")[0],
        index_file_name.split(".")[0], data_process, data_process, time_item)
    best_pd.to_csv(best_pd_name)
    data_check.to_csv(data_check_name)
    return data_check_name, best_pd_name
def Accuracy_test():
    """Compute a running MSE for every prediction CSV in ./predict_1.

    For each CSV file: truncates the data to start the month after the
    prediction date embedded in the file name, computes a cumulative
    mean-squared-error series between the predicted and actual columns,
    appends it (plus a derived year-over-year column), and saves both a
    merged CSV and a comparison PNG.

    Side effects only (CSV + PNG files); returns None. Depends on
    module-level clean_tool and sibling function get_tongbi.
    """
    filedir_path = os.path.join(os.getcwd(), "predict_1")
    file_list = os.listdir(filedir_path)
    clean_tool_ = clean_tool.Pd_Info()
    for file_name in file_list:
        # Only process *.csv files.
        if not re.search("csv$", file_name):
            continue
        file_path = os.path.join(filedir_path, file_name)
        print("file_path", file_path)
        split_file_name = file_name.split("_")
        # Prediction start date ("YYYY-MM") is the 6th underscore field.
        predict_date = split_file_name[5]
        print("predict_date", predict_date)
        # print(predict_date)
        # raise
        pf_old = clean_tool_.data_day2month(file_path)
        # Keep rows from the month after the prediction date onward.
        pf_all = pf_old.truncate(
            before="%s-%s" % (predict_date.split("-")[0],
                              str(int(predict_date.split("-")[1]) + 1).zfill(2)))
        cols = pf_all.columns.tolist()
        pf_MSE = (pf_all[cols[2]] - pf_all[cols[1]])**2
        list_mse = []
        for i in range(len(pf_MSE)):
            # BUG FIX: was `pf_MSE[i].sum() / (i + 1)`, i.e. a single
            # squared error divided by the count. A running MSE is the
            # mean of all squared errors up to and including step i.
            list_mse.append(pf_MSE[:i + 1].sum() / (i + 1))
        pf_mse = pd.Series(list_mse, pf_all.index)
        # Reassemble 9 fields of the underscore-delimited file name.
        list_name = split_file_name[:3] + split_file_name[7:10] + \
            split_file_name[4:6] + split_file_name[-1:]
        print(list_name)
        pd_all_mse = pd.concat([pf_old, pf_mse], axis=1, sort=False)
        pd_all_mse.columns = pf_old.columns.tolist() + ["MSE"]
        # Derived year-over-year column named after cols[0] minus its suffix.
        pd_all_mse[cols[0][:-3]] = get_tongbi(pd_all_mse[cols[1]])
        file_name_mse = "%s_%s_%s_%s_%s_%s_(%s_%s)_MSE_%s" % tuple(list_name)
        pd_all_mse.to_csv(file_name_mse)
        # Plot and save.
        cols = pd_all_mse.columns.tolist()
        ax = pd_all_mse.plot(use_index=True, y=cols[:3] + cols[-1:],
                             figsize=(10, 6), secondary_y=cols[1:3],
                             title="CI指标对比(%s)" % file_name_mse.split(".")[0])
        plt.savefig('%s.png' % file_name_mse.split(".")[0], dpi=200)
def get_best_indexs_by_type(self, list_indexs):
    """Select indicators from list_indexs according to per-type quotas.

    Each indicator is matched to a type via the mapping file
    self.type_dict_name; self.indexs_rate_dict gives how many indicators
    to take per type. If the quota total is not met, the shortfall is
    filled with indicators from types that have no quota.

    list_indexs: iterable of indicator (column) names, assumed ordered by
        preference (best first).
    Returns the selected list of indicator names.
    """
    clean_tool_ = clean_tool.Pd_Info()
    pf_type = clean_tool_.get_data_file(self.type_dict_name)
    indexs_type_rate_dict = self.indexs_rate_dict  # quota per type
    indexs_dict = {}  # indicators grouped by type
    for index_name in list_indexs:
        for type_i in range(len(pf_type)):
            # Match on exact name, name without extension, or substring.
            if index_name == pf_type["col_name"][type_i] or \
                    index_name.split(".")[0] == pf_type["col_name"][type_i] or \
                    index_name.split(".")[0] in pf_type["col_name"][type_i]:
                if pf_type["type"][type_i] in indexs_dict:
                    indexs_dict[pf_type["type"][type_i]].append(index_name)
                else:
                    indexs_dict[pf_type["type"][type_i]] = [index_name]
                break
    indexs_list = []  # selected indicators
    print("指标分类", indexs_dict)
    all_n = 0
    # Take up to the quota from each typed group.
    for key in indexs_type_rate_dict:
        key_n = indexs_type_rate_dict[key]
        all_n += key_n
        indexs_dict_item = indexs_dict.get(key, [])
        if len(indexs_dict_item):
            indexs_list += indexs_dict_item[:int(key_n)]
    # If quotas were not filled, top up from types without a quota.
    if len(indexs_list) < all_n:
        list_key_others = [
            item for item in indexs_dict.keys()
            if item not in indexs_type_rate_dict.keys()
        ]
        # BUG FIX: the original rebuilt `list_indexs` (shadowing the
        # parameter) and filtered it with the tautology
        # `if item in list_indexs`, which is always true. Use a separate
        # local and a plain slice for the shortfall.
        extra_indexs = []
        for item in list_key_others:
            extra_indexs += indexs_dict.get(item, [])
        indexs_list += extra_indexs[:all_n - len(indexs_list)]
    return indexs_list
def hp_lb_c(self, df):
    """Return the HP-filtered component of a series.

    df: series/column; NaNs are dropped before filtering.
    Returns element [1] of clean_tool's hp_lb output — presumably the
    trend or cycle component; confirm against Pd_Info.hp_lb.
    """
    # 14400 is the conventional HP smoothing parameter for monthly data.
    smoothing = 14400
    cleaned = df.dropna()
    helper = clean_tool.Pd_Info()
    return helper.hp_lb(cleaned, smoothing)[1]
import data_cycle_lib.data_clean_lib as clean_tool
import matplotlib.pyplot as plt
import data_cycle_lib.best_corr_CI_DI as CCD
import pandas as pd

# Shared data-cleaning helper used by the functions below.
clean_tool_ = clean_tool.Pd_Info()


# Move the designated base column to the first position.
def set_base(base_name, pd_all):
    """Return pd_all with column base_name moved to the front.

    base_name: name of an existing column in pd_all.
    pd_all: DataFrame; not modified in place.
    Raises ValueError (via list.index) if base_name is not a column.
    """
    cols = list(pd_all)
    cols.insert(0, cols.pop(cols.index(base_name)))
    pf_res = pd_all.loc[:, cols]
    #pf_res = pf_res.rolling(window=3,center=True,min_periods=2).mean().dropna()  # centered 3-month MA
    return pf_res


# Correlation driver.
def best_corr_main():
    """Load the real-estate workbook and put the base series first.

    NOTE(review): this body appears truncated — it ends after assigning
    the empty time_cut and never uses ccd or time_cut. The input path is
    a hard-coded local Windows path; confirm before use.
    """
    ccd = CCD.CI_AND_DI()
    # Indicator classification data.
    base_file_name = r"E:\微信\新建文件夹\WeChat Files\xisijialouluo\FileStorage\File\2019-12\X13_房地产数据 (2)_2019-12-04 09-49-13.xlsx"
    pf_base = clean_tool_.data_day2month(base_file_name)
    base_name = '商品房销售额:累计值'
    pf_all = set_base(base_name, pf_base)
    pf_all["date"] = pf_all.index
    # Truncate by time:
    time_cut = ''
def get_best_corr_data(self, base_file_name, index_file_name="", move_len=12, data_process="hp"):
    """Compute best lead/lag correlations (multiprocessing variant).

    Same contract as the single-process version: correlates the base
    series (first column, or base_file_name when index_file_name is
    given) against every indicator column, writes the processed data and
    the best-correlation table to CSV, and returns their file names as
    (data_check_name, best_pd_name).

    data_process: 'hp_cycle' (HP filter + Gaussian smoothing), 'hp'
        (HP filter only), or 'original'/'' (raw data; normalized to the
        string 'original' for the output file names).

    Raises ValueError on bad file-name arguments or an unknown
    data_process mode. Uses a 4-worker process pool; the correlation
    worker get_data_corr must be a module-level function so it can be
    pickled.
    """
    clean_tool_ = clean_tool.Pd_Info()
    # Single-file mode: first column is the base, the rest are indicators.
    if index_file_name == "":
        pf_all = clean_tool_.data_day2month(base_file_name)
        pf_all_cols = pf_all.columns.tolist()
        pf_base = pf_all[pf_all_cols[0]]
        pf_indexs = pf_all[pf_all_cols[1:]]
    elif index_file_name and base_file_name:
        pf_base = clean_tool_.data_day2month(base_file_name)
        pf_indexs = clean_tool_.data_day2month(index_file_name)
    else:
        # BUG FIX: `raise("...")` raised a plain str, which is itself a
        # TypeError in Python 3; raise a proper exception instead.
        raise ValueError("参数错误,文件名不对")
    pf_all_new = pd.concat([pf_base, pf_indexs], axis=1)
    pf_all__new_cols = pf_all_new.columns.tolist()  # column names
    data_check = ""
    if data_process == "hp_cycle":
        hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
        data_check = hp_lb_c_all.apply(self.__fft_gau)  # Gaussian smoothing
    elif data_process == "hp":
        hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
        data_check = hp_lb_c_all
    elif data_process == "original" or data_process == "":
        data_check = pf_all_new
        data_process = "original"
    else:
        # BUG FIX: same string-raise defect as above.
        raise ValueError("data_process ;只支持 'hp_cycle', 'hp', 'original'3种类型")
    print("data_check", data_check)
    str_all_list = []
    print("*" * 100)
    print("开始计算相关系数")
    # Fan the per-indicator correlation out over a process pool.
    pool = multiprocessing.Pool(processes=4)
    results = []
    for item in pf_all__new_cols[1:]:
        print("-" * 50)
        print("%s 和 %s" % (pf_all__new_cols[0], item))
        item_pf = data_check[item].dropna()
        results.append(pool.apply_async(
            get_data_corr,
            (data_check[pf_all__new_cols[0]], item_pf, move_len)))
    pool.close()  # no more tasks; must precede join()
    pool.join()   # wait for all workers
    print("=" * 100, "进程关闭!!!")
    for res in results:
        str_all_list.append(res.get())
    # #单进程
    # for item in pf_all__new_cols[1:]:
    #     print("-"*50)
    #     print("%s 和 %s"%(pf_all__new_cols[0],item))
    #     item_pf = data_check[item].dropna()
    #     item_list = get_data_corr(data_check[pf_all__new_cols[0]],item_pf,move_len)
    #     str_all_list.append(item_list)
    best_pd = self.__to_pds(str_all_list, data_process)  # best-correlation table
    time_item = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
    data_check_name = "%s_%s原始数据处理后_%s_%s.csv" % (base_file_name.split(".")[0],
        index_file_name.split(".")[0], data_process, time_item)
    best_pd_name = "%s_%s最优相关系数_%s_%s.csv" % (base_file_name.split(".")[0],
        index_file_name.split(".")[0], data_process, time_item)
    best_pd.to_csv(best_pd_name)
    data_check.to_csv(data_check_name)
    return data_check_name, best_pd_name