Example #1
    def main_CI_DI(self, p, q, data_check_name="", best_pd_name="", original_name=""):
        clean_tool_ = clean_tool.Pd_Info()
        if len(data_check_name) and len(best_pd_name):
            pf_base = clean_tool_.data_day2month(data_check_name)
            best_pd = clean_tool_.get_data_file(best_pd_name)

            pf_cols = pf_base.columns.tolist()
            best_name_df_list = self.__get_best_zhibiao(best_pd, pf_base, p, q)  # names of the best-correlated series

            init_pfs = self.__get_ci(best_name_df_list)  # composite index (CI)
            DI_PF = self.__get_DI(best_name_df_list)     # diffusion index (DI)

            DI_PF.plot(title=DI_PF.name)

            init_pfs.index = pf_base.index[len(pf_base) - len(init_pfs):]
            pfs_all = pd.concat([init_pfs, pf_base[pf_cols[0]]], axis=1)

            #pfs_all = pfs_all.apply(self.hp_lb_c)
            name_list = ["超前", "一致", "滞后", pfs_all.columns.tolist()[3]]  # leading / coincident / lagging
            pfs_all.columns = name_list
            pfs_all = pfs_all[[pfs_all.columns.tolist()[0], pfs_all.columns.tolist()[-1]]]  # keep only the leading index and the base series
            time_item = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
            pfs_all = self.zhibiao_res(pfs_all, original_name, time_item)

            cols = pfs_all.columns.tolist()
            print("pfs_all", pfs_all)
            #pfs_all = pfs_all.rolling(window=2).mean().dropna()  # 2-month moving average
            ax = pfs_all.plot(use_index=True, y=cols, figsize=(10, 6),
                              secondary_y=cols[-2:-1], title="CI指标对比(%s)" % data_check_name)

            file_name = "合成超前_%s_%s.csv" % (data_check_name, time_item)
            pfs_all["date"] = pfs_all.index
            pfs_all.to_csv(file_name, index=False)
            plt.show()
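
A minimal usage sketch, mirroring the call sequence in Example #4 below; the two file-name arguments are whatever get_best_corr_data produced:

ccd = CCD.CI_AND_DI("dict_type11-25.csv")
data_check_name, best_pd_name = ccd.get_best_corr_data("宏观月频_填充.xlsx", move_len=12, data_process="original")
ccd.main_CI_DI(3, 10,
               data_check_name=data_check_name,
               best_pd_name=best_pd_name,
               original_name="CBA02701.CS_month.xlsx")
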
Example #2
def hp_plot_test():
    filedir_path = os.path.join(os.getcwd(), "predict_2")
    file_list = os.listdir(filedir_path)
    clean_tool_ = clean_tool.Pd_Info()
    pf_or = clean_tool_.data_day2month("国债数据_月平均 (1).xlsx")
    for file_name in file_list:
        if not re.search("csv$", file_name):
            continue
        file_path = os.path.join(filedir_path, file_name)
        print("file_path", file_path)
        split_file_name = file_name.split("_")

        pf_old = clean_tool_.data_day2month(file_path)
        pf_all = pd.concat([pf_old, pf_or], axis=1, sort=True)
        pf_all.columns = ["超前指标", "hp滤波", "原始值"]  # leading indicator / HP filter / original values
        list_name = split_file_name[:3] + split_file_name[
            7:10] + split_file_name[4:6] + split_file_name[-1:]
        print(list_name)

        file_name_mse = "%s_%s_%s_%s_%s_%s_(%s_%s)_MSE_%s" % tuple(list_name)
        pf_all.to_csv(file_name_mse)

        pf_all = pf_all.apply(get_guiyi)  # normalize

        # plot and save
        cols = pf_all.columns.tolist()
        pf_all[cols[0]] = pf_all[cols[0]].shift(6)  # shift the leading indicator forward 6 months
        ax = pf_all.plot(use_index=True,
                         y=cols,
                         figsize=(10, 6),
                         secondary_y=cols[1:],
                         title="CI指标对比(%s)" % file_name_mse.split(".")[0])
        plt.savefig('%s.png' % file_name_mse.split(".")[0], dpi=200)
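
get_guiyi is referenced but not shown in these examples. A minimal sketch, assuming it performs min-max normalization (the name reads as "normalize"); the real helper may differ:

import pandas as pd

def get_guiyi(s: pd.Series) -> pd.Series:
    """Hypothetical stand-in: min-max normalization to [0, 1]."""
    s = s.dropna()
    return (s - s.min()) / (s.max() - s.min())
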
Example #3
    def zhibiao_res(self, pf, original_name, time_item):
        cols = pf.columns.tolist()
        a, b, best_corr_step, best_corr = self.__get_best_corr(pf, 8)  # best correlation over an 8-step window (the original note said 12 months)

        #best_corr_step=-8
        # convert using the ratio between the two series
        pd_hecheng = pf[cols[0]][best_corr_step - 1:]
        pd_hecheng_rate = (pf[cols[0]][best_corr_step:] - pf[cols[0]][best_corr_step - 1]) / abs(pf[cols[1]][-1])
        print("pd_hecheng", pd_hecheng)
        print("pd_hecheng_rate", pd_hecheng_rate)
        #pd_pridict = [i * pf[cols[1]][-1] / pd_hecheng[best_corr_step - 1] for i in pd_hecheng[best_corr_step:]]
        pd_pridict = [(i + 1) * pf[cols[1]][-1] for i in pd_hecheng_rate]
        print("pd_pridict", pd_pridict)
        print("best_corr_step", pf[cols[1]][-1])
        list_tb = pf[cols[1]].tolist() + pd_pridict

        tb_pd = pd.Series(list_tb, index=pd.date_range(start=pf.index[0], periods=len(list_tb), freq="M"))
        tb_pd.index = tb_pd.index.map(lambda x: x.strftime('%Y-%m'))
        tb_pd.name = "%s_预测" % cols[1]

        # restore levels from the original series
        if original_name:
            clean_tool_ = clean_tool.Pd_Info()
            pf_or = clean_tool_.data_day2month(original_name)
            pf_all = pd.concat([tb_pd, pf_or], axis=1, sort=True)
            cols = pf_all.columns.tolist()
            print(pf_all)
            pf_all["%s_预测" % cols[1]] = pf_all[cols[1]].shift(12) * (1 + pf_all[cols[0]] / 100)  # level_t = level_{t-12} * (1 + yoy_t / 100)
            print(pf_all)

            return pf_all

        return pd.concat([pf, tb_pd], axis=1, sort=True)
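
The level reconstruction above inverts the year-over-year definition: if yoy_t = (level_t / level_{t-12} - 1) * 100, then level_t = level_{t-12} * (1 + yoy_t / 100). A toy check:

import pandas as pd

idx = pd.date_range("2018-01", periods=24, freq="M")
level = pd.Series(range(100, 124), index=idx, dtype=float)
yoy = level.pct_change(12) * 100                       # YoY in percent
rebuilt = (level.shift(12) * (1 + yoy / 100)).dropna()  # same formula as zhibiao_res
assert ((rebuilt - level[12:]).abs() < 1e-9).all()
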
Example #4
def X13_cycle_main():
    ccd = CCD.CI_AND_DI("dict_type11-25.csv")  # load the type dictionary
    # loop:
    name = "宏观月频_填充.xlsx"  # macro data
    data_check_name, best_pd_name = ccd.get_best_corr_data(name, move_len=12, data_process="original")
    # raise  # debugging stop: uncomment to halt after generating the correlation files
    clean_tool_ = clean_tool.Pd_Info()
    pf_all = clean_tool_.data_day2month(name).dropna()
    print(pf_all)
    pf_all = pf_all.truncate(before="%s-%s" % ("2008", "2".zfill(2)))
    for i in range(12, len(pf_all), 12):
        if i + 89 <= len(pf_all):
            # pf_cut = pf_all[i:i+89]
            # print(pf_cut)
            # cut_name = "%s_%s_%s.xlsx"%(name.split(".")[0],pf_cut.index[0],pf_cut.index[-1])
            # pf_cut.to_excel(cut_name)

            # X13_file_name = cycle_X13.get_x13(cut_name,pf_cut.index[0].split("-")[0],pf_cut.index[0].split("-")[1])
            # #X13_file_name = "X13_宏观月频_填充_2008-02_2015-06_2019-11-24 15-59-52.csv"
            # # merge the YoY series with the macro data
            base_name = "CBA02701.CS_month.xlsx"
            # pf_base = clean_tool_.data_day2month(base_name).dropna()
            # pf_base = pf_base[pf_base.columns.tolist()[0]]
            # pf_cut_x13 = clean_tool_.data_day2month(X13_file_name)
            # pf_tb = get_tongbi(pf_base)  # year-over-year series
            # pf_tb.name = "%s_同比(12月)"%base_name.split(".")[0]

            # pf_cut_x13_and_tb = pd.concat([pf_tb,pf_cut_x13],axis = 1,sort =False).dropna()
            # print(pf_cut_x13_and_tb)
            # pf_cut_x13_and_tb["date"] = pf_cut_x13_and_tb.index
            # cut_name_x13_and_tb = "%s_and_%s.xlsx"%(X13_file_name.split(".")[0],base_name.split(".")[0])
            # pf_cut_x13_and_tb.to_excel(cut_name_x13_and_tb,index=False)

            # #CBA02701
            # # find the best correlations
            # data_check_name,best_pd_name = ccd.get_best_corr_data(cut_name_x13_and_tb,move_len=12,data_process ="original")  # cycle data and best correlation coefficients

            # #2017
            # # data_check_name = "X13_宏观月频_填充_2010-02_2017-06_2019-11-25 08-57-57_and_CBA02701_ 原始数据处理后_original_original __2019-11-25 09-16-59.csv"
            # # best_pd_name = "X13_宏观月频_填充_2010-02_2017-06_2019-11-25 08-57-57_and_CBA02701_  最优相关系数_original_original __2019-11-25 09-16-59.csv"
            # # #2019
            # # data_check_name = "X13_宏观月频_填充_2012-02_2019-06_2019-11-25 09-39-31_and_CBA02701_ 原始数据处理后_original_original __2019-11-25 10-00-03.csv"
            # # best_pd_name = "X13_宏观月频_填充_2012-02_2019-06_2019-11-25 09-39-31_and_CBA02701_  最优相关系数_original_original __2019-11-25 10-00-03.csv"
            # #2018
            # # data_check_name = "X13_宏观月频_填充_2011-02_2018-06_2019-11-25 09-17-36_and_CBA02701_ 原始数据处理后_original_original __2019-11-25 09-38-27.csv"
            # # best_pd_name = "X13_宏观月频_填充_2011-02_2018-06_2019-11-25 09-17-36_and_CBA02701_  最优相关系数_original_original __2019-11-25 09-38-27.csv"
            # # #2016
            # # data_check_name = "X13_宏观月频_填充_2009-02_2016-06_2019-11-24 17-00-31_and_CBA02701_ 原始数据处理后_original_original __2019-11-24 17-20-25.csv"
            # # best_pd_name = "X13_宏观月频_填充_2009-02_2016-06_2019-11-24 17-00-31_and_CBA02701_  最优相关系数_original_original __2019-11-24 17-20-25.csv"
            # # #2015
            # data_check_name = "X13_宏观月频_填充_2008-02_2015-06_2019-11-24 15-59-52_and_CBA02701_ 原始数据处理后_original_original __2019-11-24 16-57-13.csv"
            # best_pd_name = "X13_宏观月频_填充_2008-02_2015-06_2019-11-24 15-59-52_and_CBA02701_  最优相关系数_original_original __2019-11-24 16-57-13.csv"

            # compose the CI from the best correlations
            ccd.main_CI_DI(3,
                           10,
                           best_pd_name=best_pd_name,
                           data_check_name=data_check_name,
                           original_name=base_name)
            break  # stop after the first window
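
get_tongbi ("同比", year-over-year) is used here and in Accuracy_test but never shown. A minimal sketch, assuming it returns the 12-month percent change, which is consistent with the shift(12) * (1 + yoy / 100) reconstruction in zhibiao_res:

import pandas as pd

def get_tongbi(s: pd.Series, periods: int = 12) -> pd.Series:
    """Hypothetical stand-in: year-over-year change in percent."""
    return s.pct_change(periods) * 100
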
Example #5
    def get_best_corr_data(self, base_file_name, index_file_name="", move_len=12, data_process="hp"):
        clean_tool_ = clean_tool.Pd_Info()
        # single file: first column is the base, the rest are indicators
        if index_file_name == "":
            pf_all = clean_tool_.data_day2month(base_file_name)
            #print(pf_all)
            pf_all_cols = pf_all.columns.tolist()
            pf_base = pf_all[pf_all_cols[0]]
            pf_indexs = pf_all[pf_all_cols[1:]]
        elif index_file_name and base_file_name:
            pf_base = clean_tool_.data_day2month(base_file_name)
            pf_indexs = clean_tool_.data_day2month(index_file_name)
        else:
            raise ValueError("argument error: invalid file names")

        pf_all_new = pd.concat([pf_base, pf_indexs], axis=1)
        #pf_all_new = pf_all_new[59:]  # after 2012
        pf_all__new_cols = pf_all_new.columns.tolist()  # column names

        data_check = ""
        if data_process == "hp_cycle":
            hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
            data_check = hp_lb_c_all.apply(self.__fft_gau)  # Gaussian filter
        elif data_process == "hp":
            hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
            data_check = hp_lb_c_all
        elif data_process == "original" or data_process == "":
            data_check = pf_all_new
        else:
            raise ValueError("data_process only supports 'hp_cycle', 'hp' and 'original'")
        print("data_check", data_check)
        str_all_list = []
        print("*" * 100)
        print("Computing correlation coefficients")
        for item in pf_all__new_cols[1:]:
            print("-" * 50)
            print("%s and %s" % (pf_all__new_cols[0], item))
            item_pf = data_check[item].dropna()
            item_list = self.__get_data_corr(data_check[pf_all__new_cols[0]], item_pf, move_len)
            str_all_list.append(item_list)
        best_pd = self.__to_pds(str_all_list, data_process)  # DataFrame of best correlation coefficients
        time_item = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
        # # add date and move it to the first column
        # data_check["date"] = data_check.index
        # col_name = data_check.columns.tolist()
        # col_name.insert(0, "date")
        # data_check = data_check.reindex(columns=col_name)
        data_check_name = "%s_%s 原始数据处理后_%s_%s __%s.csv" % (base_file_name.split(".")[0],
                            index_file_name.split(".")[0], data_process, data_process, time_item)
        best_pd_name = "%s_%s  最优相关系数_%s_%s __%s.csv" % (base_file_name.split(".")[0],
                            index_file_name.split(".")[0], data_process, data_process, time_item)
        best_pd.to_csv(best_pd_name)
        data_check.to_csv(data_check_name)

        return data_check_name, best_pd_name
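
The private helper __get_data_corr is not shown. A minimal sketch of the idea, assuming it scans lead/lag shifts up to move_len steps and records the shift with the highest Pearson correlation (the actual helper may return richer metadata):

import pandas as pd

def get_data_corr_sketch(base: pd.Series, index: pd.Series, move_len: int = 12):
    """Hypothetical: best lead/lag correlation within +/- move_len steps."""
    best_shift, best_corr = 0, float("-inf")
    for shift in range(-move_len, move_len + 1):
        corr = base.corr(index.shift(shift))
        if pd.notna(corr) and corr > best_corr:
            best_shift, best_corr = shift, corr
    return index.name, best_shift, best_corr
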
Example #6
def Accuracy_test():
    filedir_path = os.path.join(os.getcwd(), "predict_1")
    file_list = os.listdir(filedir_path)
    clean_tool_ = clean_tool.Pd_Info()
    for file_name in file_list:
        if not re.search("csv$", file_name):
            continue
        file_path = os.path.join(filedir_path, file_name)
        print("file_path", file_path)
        split_file_name = file_name.split("_")
        predict_date = split_file_name[5]
        print("predict_date", predict_date)
        pf_old = clean_tool_.data_day2month(file_path)
        pf_all = pf_old.truncate(
            before="%s-%s" %
            (predict_date.split("-")[0],
             str(int(predict_date.split("-")[1]) + 1).zfill(2)))

        cols = pf_all.columns.tolist()
        pf_MSE = (pf_all[cols[2]] - pf_all[cols[1]])**2  # squared errors

        # running mean squared error over the first i+1 predictions
        list_mse = []
        for i in range(len(pf_MSE)):
            list_mse.append(pf_MSE.iloc[:i + 1].sum() / (i + 1))

        pf_mse = pd.Series(list_mse, pf_all.index)
        list_name = split_file_name[:3] + split_file_name[
            7:10] + split_file_name[4:6] + split_file_name[-1:]
        print(list_name)

        pd_all_mse = pd.concat([pf_old, pf_mse], axis=1, sort=False)
        pd_all_mse.columns = pf_old.columns.tolist() + ["MSE"]

        pd_all_mse[cols[0][:-3]] = get_tongbi(pd_all_mse[cols[1]])  # year-over-year of the prediction

        file_name_mse = "%s_%s_%s_%s_%s_%s_(%s_%s)_MSE_%s" % tuple(list_name)
        pd_all_mse.to_csv(file_name_mse)

        # plot and save
        cols = pd_all_mse.columns.tolist()
        ax = pd_all_mse.plot(use_index=True,
                             y=cols[:3] + cols[-1:],
                             figsize=(10, 6),
                             secondary_y=cols[1:3],
                             title="CI指标对比(%s)" % file_name_mse.split(".")[0])
        plt.savefig('%s.png' % file_name_mse.split(".")[0], dpi=200)
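
The running-MSE loop above is just an expanding mean; pandas computes it directly and avoids re-summing the prefix on every step:

pf_mse = pf_MSE.expanding().mean()  # same values as the loop, index preserved
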
Example #7
    def get_best_indexs_by_type(self, list_indexs):
        clean_tool_ = clean_tool.Pd_Info()
        pf_type = clean_tool_.get_data_file(self.type_dict_name)

        indexs_type_rate_dict = self.indexs_rate_dict  # quota of indicators per type

        indexs_dict = {}  # indicators bucketed by type
        for index_name in list_indexs:
            for type_i in range(len(pf_type)):
                if index_name == pf_type["col_name"][type_i] or \
                    index_name.split(".")[0] == pf_type["col_name"][type_i] or \
                    index_name.split(".")[0] in pf_type["col_name"][type_i]:
                    if pf_type["type"][type_i] in indexs_dict:
                        indexs_dict[pf_type["type"][type_i]].append(index_name)
                    else:
                        indexs_dict[pf_type["type"][type_i]] = [index_name]
                    break

        indexs_list = []  # selected indicators
        print("indicators by type", indexs_dict)
        all_n = 0
        for key in indexs_type_rate_dict:
            key_n = indexs_type_rate_dict[key]
            all_n += key_n
            indexs_dict_item = indexs_dict.get(key, [])
            if len(indexs_dict_item):
                indexs_list += indexs_dict_item[:int(key_n)]

        # if the quotas are not filled, top up from the remaining types
        if len(indexs_list) < all_n:
            list_key_others = [
                item for item in indexs_dict.keys()
                if item not in indexs_type_rate_dict.keys()
            ]
            list_indexs = []
            for item in list_key_others:
                list_indexs += indexs_dict.get(item, [])
            list_add = [item for item in list_indexs
                        if item not in indexs_list][:all_n - len(indexs_list)]

            indexs_list += list_add

        return indexs_list
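
indexs_rate_dict maps each indicator type to how many indicators of that type to keep, and all_n sums those quotas. A hypothetical shape, for illustration only (the real keys come from the type file passed to CI_AND_DI, e.g. dict_type11-25.csv):

# Hypothetical quotas: keep 3 of type_A, 2 of type_B, 1 of type_C;
# if fewer than 6 are found, get_best_indexs_by_type tops up from unlisted types.
indexs_rate_dict = {"type_A": 3, "type_B": 2, "type_C": 1}
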
Example #8
    def hp_lb_c(self, df):
        clean_tool_ = clean_tool.Pd_Info()
        step = 14400  # HP smoothing parameter; 14400 is the convention for monthly data
        df = df.dropna()
        return clean_tool_.hp_lb(df, step)[1]
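
hp_lb itself lives in clean_tool and is not shown. A minimal sketch, assuming it wraps a Hodrick-Prescott filter such as statsmodels'; whether index [1] then selects the cycle or the trend depends on hp_lb's actual return order:

import pandas as pd
from statsmodels.tsa.filters.hp_filter import hpfilter

def hp_lb(series: pd.Series, lamb: float = 14400):
    """Hypothetical stand-in: statsmodels' hpfilter returns (cycle, trend)."""
    cycle, trend = hpfilter(series.dropna(), lamb=lamb)
    return cycle, trend
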
Example #9
import data_cycle_lib.data_clean_lib as clean_tool
import matplotlib.pyplot as plt
import data_cycle_lib.best_corr_CI_DI as CCD
import pandas as pd

clean_tool_ = clean_tool.Pd_Info()


# move the chosen base series to the first column
def set_base(base_name, pd_all):
    cols = list(pd_all)
    cols.insert(0, cols.pop(cols.index(base_name)))
    pf_res = pd_all.loc[:, cols]
    #pf_res = pf_res.rolling(window=3, center=True, min_periods=2).mean().dropna()  # centered 3-month moving average

    return pf_res
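
A quick usage check with a toy frame:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
print(set_base("b", df).columns.tolist())  # ['b', 'a', 'c']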


# correlation
def best_corr_main():
    ccd = CCD.CI_AND_DI()  # load the indicator-type data

    base_file_name = r"E:\微信\新建文件夹\WeChat Files\xisijialouluo\FileStorage\File\2019-12\X13_房地产数据 (2)_2019-12-04 09-49-13.xlsx"
    pf_base = clean_tool_.data_day2month(base_file_name)

    base_name = '商品房销售额:累计值'  # cumulative sales of commercial housing
    pf_all = set_base(base_name, pf_base)
    pf_all["date"] = pf_all.index

    # truncate by time:
    time_cut = ''
Example #10
    def get_best_corr_data(self, base_file_name, index_file_name="", move_len=12, data_process="hp"):
        clean_tool_ = clean_tool.Pd_Info()
        # single file: first column is the base, the rest are indicators
        if index_file_name == "":
            pf_all = clean_tool_.data_day2month(base_file_name)

            pf_all_cols = pf_all.columns.tolist()
            pf_base = pf_all[pf_all_cols[0]]
            pf_indexs = pf_all[pf_all_cols[1:]]
        elif index_file_name and base_file_name:
            pf_base = clean_tool_.data_day2month(base_file_name)
            pf_indexs = clean_tool_.data_day2month(index_file_name)
        else:
            raise ValueError("argument error: invalid file names")

        pf_all_new = pd.concat([pf_base, pf_indexs], axis=1)
        pf_all__new_cols = pf_all_new.columns.tolist()  # column names

        data_check = ""
        if data_process == "hp_cycle":
            hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
            data_check = hp_lb_c_all.apply(self.__fft_gau)  # Gaussian filter
        elif data_process == "hp":
            hp_lb_c_all = pf_all_new.apply(self.hp_lb_c)  # HP filter
            data_check = hp_lb_c_all
        elif data_process == "original" or data_process == "":
            data_check = pf_all_new
            data_process = "original"
        else:
            raise ValueError("data_process only supports 'hp_cycle', 'hp' and 'original'")
        print("data_check", data_check)
        str_all_list = []
        print("*" * 100)
        print("Computing correlation coefficients")

        # multiprocessing:
        pool = multiprocessing.Pool(processes=4)
        results = []
        for item in pf_all__new_cols[1:]:
            print("-" * 50)
            print("%s and %s" % (pf_all__new_cols[0], item))
            item_pf = data_check[item].dropna()
            results.append(pool.apply_async(get_data_corr, (data_check[pf_all__new_cols[0]], item_pf, move_len)))
        pool.close()  # no more tasks may be submitted; must be called before join
        pool.join()   # wait for all worker processes to finish
        print("=" * 100, "pool closed!!!")

        for res in results:
            str_all_list.append(res.get())

        # # single process:
        # for item in pf_all__new_cols[1:]:
        #     print("-" * 50)
        #     print("%s and %s" % (pf_all__new_cols[0], item))
        #     item_pf = data_check[item].dropna()
        #     item_list = get_data_corr(data_check[pf_all__new_cols[0]], item_pf, move_len)
        #     str_all_list.append(item_list)

        best_pd = self.__to_pds(str_all_list, data_process)  # DataFrame of best correlation coefficients
        time_item = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))

        data_check_name = "%s_%s原始数据处理后_%s_%s.csv" % (base_file_name.split(".")[0],
                            index_file_name.split(".")[0], data_process, time_item)
        best_pd_name = "%s_%s最优相关系数_%s_%s.csv" % (base_file_name.split(".")[0],
                            index_file_name.split(".")[0], data_process, time_item)
        best_pd.to_csv(best_pd_name)
        data_check.to_csv(data_check_name)

        return data_check_name, best_pd_name
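
Unlike the single-process variant in Example #5, which calls the private method self.__get_data_corr, this version dispatches a module-level get_data_corr, presumably because Pool.apply_async must pickle the callable and its arguments to ship them to worker processes. A minimal sketch of the same pattern with a placeholder worker:

import multiprocessing

def worker(x):  # placeholder; must be module-level to be picklable
    return x * x

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=4)
    results = [pool.apply_async(worker, (i,)) for i in range(8)]
    pool.close()  # no more tasks; required before join
    pool.join()   # wait for workers
    print([r.get() for r in results])  # [0, 1, 4, 9, 16, 25, 36, 49]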