Exemplo n.º 1
0
def grey_auto_knn(data_cut, data_all, data_train, missing_ratio):
    """Fill the gaps of ``data_cut`` with the project KNN and score the result.

    A ``defKnn.KNNClassifier`` with 3 neighbours is fitted on the windows that
    ``create_dataset`` builds from ``data_train.values`` with a look-back of 5.
    Every position inside the ranges returned by ``get_lost_index(data_cut)``
    is predicted from the 5 preceding rows of the working copy, so values
    written earlier in the pass are part of later input windows. The filled
    frame is written to a CSV file whose name embeds ``missing_ratio``.

    Args:
        data_cut: frame with a ``'y'`` column that contains the gaps.
        data_all: frame holding the reference ``'y'`` values; it is read
            positionally (``iloc``) at the same offsets used for ``data_cut``.
        data_train: frame whose ``.values`` are turned into training windows.
        missing_ratio: only used to build the output file name.

    Returns:
        ``(mre, rmse)`` as floats. Both are NaN when no position had to be
        filled (previously this case raised ``ZeroDivisionError``).
    """
    n_neighbors = 3
    look_back = 5
    train_x, train_y = create_dataset(data_train.values, look_back)
    lost_index = get_lost_index(data_cut)

    # Project-local KNN implementation.
    knn2 = defKnn.KNNClassifier(n_neighbors)
    knn2.fit(train_x, train_y)

    abs_rel_err = 0
    sq_err = 0
    filled_count = 0  # avoids shadowing the builtin ``sum``
    data_grey_auto_knn_fill = data_cut.copy()
    for gap in lost_index:
        # Each entry is read as (first, last) with ``last`` inclusive.
        for j in range(gap[0], gap[1] + 1):
            filled_count += 1
            look_back_x = np.array(
                data_grey_auto_knn_fill.values[j - look_back:j]).reshape(1, -1)
            val = knn2.predict(look_back_x)
            # NOTE(review): ``j`` is positional for ``.values`` but a label for
            # ``.loc``, and ``.loc`` slices include the end label, so this may
            # also write label ``j + 1`` -- confirm against the real index.
            data_grey_auto_knn_fill.loc[j:j + 1, 'y'] = val
            actual = data_all.iloc[j]['y']
            abs_rel_err += abs((val - actual) / actual)
            sq_err += pow(val - actual, 2)

    if filled_count:
        mre = float(abs_rel_err / filled_count)
        rmse = float(math.sqrt(sq_err / filled_count))
    else:
        mre = rmse = float('nan')

    data_grey_auto_knn_fill.to_csv(
        r'D:\SJTU\机器学习\小论文\数据补全\data2\grey_auto_knn(ratio=' +
        str(missing_ratio) + ').csv')
    return mre, rmse
Exemplo n.º 2
0
def find_value():
    """Re-fill the recorded gaps once per candidate second KNN argument.

    Rows 5500:6400 of the gapped series are the segment to repair and rows
    0:5500 of the complete series are the training data (10-sample windows).
    For each value of ``np.arange(0.15, 0.6, 0.05)`` a
    ``defKnn.KNNClassifier(7, value)`` is fitted and every missing position of
    a working copy is overwritten with its prediction from the 10 rows that
    precede it. Nothing is returned or saved; only the gap ranges are printed.
    """
    window = 10  # look-back length shared by training and prediction

    data_complete = create_dataframe(r"file\2019.9.9-9.19(completed).csv", 'I')
    data_lost = create_dataframe(r"file\2019.9.9-9.19(lost).csv", 'I')

    # Segment of the gapped series that contains the missing readings.
    data_cut = data_lost.iloc[5500:6400]
    values = data_cut.values.flatten()  # not used further in this function
    lost_index = get_lost_index(data_cut)
    print(lost_index)

    # Training slice and the matching reference slice of the complete series.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]  # not used further here

    data_fill = data_cut.copy()  # not used further here
    data_fill_all = data_cut.copy()

    train_x, train_y = create_dataset(data_train.values, window)

    # Project-local KNN, one model per candidate value.
    for value in np.arange(0.15, 0.6, 0.05):
        knn2 = defKnn.KNNClassifier(7, value)
        knn2.fit(train_x, train_y)
        for gap in lost_index:
            first, last = gap[0], gap[1]
            for j in range(first, last + 1):
                history = np.array(data_fill_all.values[j - window:j])
                val = knn2.predict(history.reshape(1, -1))
                data_fill_all.loc[j:j + 1, 'y'] = val
Exemplo n.º 3
0
def get_k_fill():
    """Plot the filling RMSE against the neighbour count K.

    Rows 0:5500 of the complete series are the training data (13-sample
    look-back windows) and rows 5500:6400 are the test slice. For every missing
    ratio and every K in 4..20, ``create_miss_data`` produces a fresh gapped
    copy of the test slice, a ``defKnn.KNNClassifier(K)`` fills it point by
    point, and the RMSE against the untouched test slice is recorded.

    One figure is drawn per ratio (it is neither saved nor shown here), and the
    element-wise sum of the per-ratio curves is saved as the summary PNG in the
    ``picture`` directory. Returns nothing.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')

    look_back = 13
    missing_ratio = [0.03, 0.06, 0.1, 0.2, 0.3]
    # Train / test split of the complete series.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    train_x, train_y = create_dataset(data_train.values, look_back)

    k_list = list(range(4, 21))
    # ``np.str`` was removed in NumPy 1.24; the builtin ``str`` is the
    # documented replacement and yields the same string labels.
    k_labels = np.array(k_list).astype(str)

    rmse_all = []
    for ratio in missing_ratio:
        rmse_list = []
        for n_neighbors in k_list:
            data_cut, _ = create_miss_data(data_test, ratio, 10)
            lost_index = get_lost_index(data_cut)

            # Project-local KNN implementation.
            knn2 = defKnn.KNNClassifier(n_neighbors)
            knn2.fit(train_x, train_y)

            sq_err = 0
            filled_count = 0  # avoids shadowing the builtin ``sum``
            data_fill = data_cut.copy()
            for gap in lost_index:
                # Each entry is read as (first, last) with ``last`` inclusive.
                for j in range(gap[0], gap[1] + 1):
                    filled_count += 1
                    look_back_x = np.array(
                        data_fill.values[j - look_back:j]).reshape(1, -1)
                    val = knn2.predict(look_back_x)
                    # NOTE(review): ``.loc`` slices include the end label, so
                    # this may also write label ``j + 1`` -- confirm.
                    data_fill.loc[j:j + 1, 'y'] = val
                    sq_err += pow(val - data_test.iloc[j]['y'], 2)

            # NaN instead of ZeroDivisionError when nothing had to be filled.
            rmse_list.append(
                float(math.sqrt(sq_err / filled_count))
                if filled_count else float('nan'))

        if rmse_all:
            rmse_all = [total + cur for total, cur in zip(rmse_all, rmse_list)]
        else:
            rmse_all = rmse_list

        # Per-ratio RMSE curve.
        plt.figure(figsize=(15, 9))
        plt.plot(k_labels, rmse_list, 'black')
        plt.xlabel("K值", size=23)
        plt.ylabel('${E_{RMSE}}$/A', size=23)
        print('缺失率=' + str(ratio) + '_RMSE.png 已完成')

    # Summary curve: RMSE summed over all missing ratios.
    plt.figure(figsize=(15, 9))
    plt.tick_params(labelsize=23)
    plt.plot(k_labels, rmse_all, 'black')
    plt.xlabel("K值", size=23)
    plt.ylabel('${E_{RMSE}}$/A', size=23)
    plt.savefig(r'picture\(K)缺失率汇总_RMSE.png',
                format='png',
                bbox_inches='tight',
                transparent=True)
Exemplo n.º 4
0
def fill_missing_data() -> None:
    """Fill synthetic gaps with a fixed KNN and plot true vs. filled values.

    Rows 0:5500 of the complete series are used for training with 20-sample
    look-back windows; rows 5500:6400 are the test slice. For each missing
    ratio, ``create_miss_data`` produces a gapped copy of the test slice, a
    ``defKnn.KNNClassifier(6)`` fills every missing position from the 20 rows
    preceding it, and the true and filled curves are plotted, saved under
    ``picture`` with the ratio in the file name, shown and closed. Two extra
    figures (numbers 1 and 2) with the gapped and the true series are then
    drawn and shown.

    MRE and RMSE are accumulated and appended to per-ratio lists, but those
    lists are not read afterwards in this function. Returns nothing.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')

    lost_file = r"file\2019.9.9-9.19(lost).csv"
    # Loaded but not referenced again below (only in commented-out code).
    data_lost = create_dataframe(lost_file, 'I')

    look_back = 20  # 14
    missing_ratio = [0.03, 0.06, 0.1, 0.2, 0.3]
    # missing_ratio = [0.03]
    # Split off the training data.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    train_x, train_y = create_dataset(data_train.values, look_back)

    # The next three names are not used further in this function.
    df_mre = pd.DataFrame(index=range(3, 21))
    df_rmse = pd.DataFrame(index=range(3, 21))
    k_list = list(range(4, 21))
    for ratio in missing_ratio:
        rmse_list = []
        mre_list = []
        data_fill_plt = None

        data_cut, r = create_miss_data(data_test, ratio, 10)

        # Split the data to obtain the part with missing values.
        # data_cut = data_lost.iloc[5500:6400]
        values = data_cut.values.flatten()
        lost_index = get_lost_index(data_cut)
        # Use the project's own KNN implementation.
        knn2 = defKnn.KNNClassifier(6)
        knn2.fit(train_x, train_y)

        rmse = 0
        mre = 0
        sum = 0  # number of filled positions (shadows the builtin ``sum``)
        data_fill = data_cut.copy()
        for i in range(len(lost_index)):
            # Each entry is read as (first, last) with ``last`` inclusive.
            for j in range(lost_index[i][0], lost_index[i][1] + 1):
                sum += 1
                # Input window: the ``look_back`` rows before position j,
                # including values filled earlier in this pass.
                look_back_x = np.array(data_fill.values[j - look_back:j])
                look_back_x = look_back_x.reshape(1, -1)
                val = knn2.predict(look_back_x)
                # NOTE(review): ``.loc`` slices include the end label, so this
                # may also write label ``j + 1`` -- confirm this is intended.
                data_fill.loc[j:j + 1, 'y'] = val
                mre += abs(
                    (val - data_test.iloc[j]['y']) / data_test.iloc[j]['y'])
                rmse += pow(val - data_test.iloc[j]['y'], 2)

        mre_list.append(float(mre / sum))
        rmse_list.append(float(math.sqrt(rmse / sum)))

        data_fill_plt = data_fill.copy()
        plt.figure(figsize=(15, 9))
        # X labels: characters 5..15 of each index value's string form, with
        # 'T' replaced by a space.
        # NOTE(review): presumably a datetime index ("MM-DD HH:MM") -- confirm.
        plt.plot([(str(d)).replace('T', ' ')[5:16]
                  for d in list(data_test.index.values)],
                 data_test['y'].values,
                 "black",
                 linestyle='-',
                 label='真实值')
        plt.plot([(str(d)).replace('T', ' ')[5:16]
                  for d in list(data_fill_plt.index.values)],
                 data_fill_plt['y'].values,
                 "black",
                 linestyle='--',
                 label='填补值')
        plt.gca().xaxis.set_major_locator(
            ticker.MultipleLocator(200))  # set the tick density
        plt.tick_params(labelsize=23)
        plt.autoscale(enable=True, axis='x', tight=True)  # remove padding at the axis edges
        plt.autoscale(enable=True, axis='y', tight=True)  # remove padding at the axis edges
        # ax = plt.gca()
        # ax.spines['bottom'].set_linewidth(4)   # set the bottom axis line width
        # ax.spines['left'].set_linewidth(4)   # set the left axis line width
        # ax.tick_params(width=4)  # set the tick mark width (the vertical ones)
        plt.xticks(rotation=30)
        plt.xlabel("时刻", size=23)
        plt.ylabel("真实值/A,填补值/A", size=23)
        plt.legend(loc='upper center',
                   prop={'size': 23},
                   bbox_to_anchor=(0.5, 1),
                   ncol=2,
                   frameon=False)  # ncol=n lays the legend out in n columns
        plt.savefig(r'picture\fill' + '(ratio=' + str(ratio) + ').png',
                    format='png',
                    bbox_inches='tight',
                    transparent=True)
        plt.show()
        plt.close()

        # Extra figures; the explicit numbers 1 and 2 are reused on every
        # iteration and only figure 2 gets a legend.
        # NOTE(review): whether lines from earlier ratios persist depends on
        # how ``plt.show()`` leaves these figures -- confirm.
        plt.figure(1, figsize=(12, 6))
        plt.plot(data_cut.index, data_cut['y'], label='miss')
        plt.figure(2, figsize=(12, 6))
        plt.plot(data_test.index, data_test['y'], label='real')
        plt.legend()
        plt.show()
Exemplo n.º 5
0
def get_value_fill():
    """Plot the filling RMSE against the second ``KNNClassifier`` argument.

    Rows 0:5500 of the complete series are the training data (20-sample
    look-back windows) and rows 5500:6400 are the test slice. For every missing
    ratio and every value of ``np.arange(0.15, 0.51, 0.01)``,
    ``create_miss_data`` produces a fresh gapped copy of the test slice, a
    ``defKnn.KNNClassifier(13, value)`` fills it point by point, and the RMSE
    (rounded to two decimals) against the untouched test slice is recorded.

    The element-wise sum of the per-ratio curves is plotted with the x axis
    labelled "阈值" (threshold) and saved as the summary PNG in the ``picture``
    directory. Returns nothing.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')

    lost_file = r"file\2019.9.9-9.19(lost).csv"
    # NOTE(review): the loaded frame is never read below; kept because
    # ``create_dataframe`` is opaque here -- drop the load if it is pure.
    data_lost = create_dataframe(lost_file, 'I')

    look_back = 20
    missing_ratio = [0.03, 0.06, 0.1, 0.2, 0.3]
    # Train / test split of the complete series.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    train_x, train_y = create_dataset(data_train.values, look_back)

    rmse_all = []
    value_list = list(np.arange(0.15, 0.51, 0.01))

    for ratio in missing_ratio:
        rmse_list = []
        for value in value_list:
            data_cut, _ = create_miss_data(data_test, ratio, 10)
            lost_index = get_lost_index(data_cut)

            # Project-local KNN implementation.
            knn2 = defKnn.KNNClassifier(13, value)
            knn2.fit(train_x, train_y)

            sq_err = 0
            filled_count = 0  # avoids shadowing the builtin ``sum``
            data_fill = data_cut.copy()
            for gap in lost_index:
                # Each entry is read as (first, last) with ``last`` inclusive.
                for j in range(gap[0], gap[1] + 1):
                    filled_count += 1
                    look_back_x = np.array(
                        data_fill.values[j - look_back:j]).reshape(1, -1)
                    val = knn2.predict(look_back_x)
                    # NOTE(review): ``.loc`` slices include the end label, so
                    # this may also write label ``j + 1`` -- confirm.
                    data_fill.loc[j:j + 1, 'y'] = val
                    sq_err += pow(val - data_test.iloc[j]['y'], 2)

            # NaN instead of ZeroDivisionError when nothing had to be filled.
            rmse = (math.sqrt(sq_err / filled_count)
                    if filled_count else float('nan'))
            rmse_list.append(float('%.2f' % rmse))

        if rmse_all:
            rmse_all = [total + cur for total, cur in zip(rmse_all, rmse_list)]
        else:
            rmse_all = rmse_list

        print('(自适应)缺失率=' + str(ratio) + '_RMSE.png 已完成')

    # Summary curve: RMSE summed over all missing ratios.
    plt.figure(figsize=(15, 9))
    plt.tick_params(labelsize=23)
    plt.plot(np.array(value_list), rmse_all, 'black')
    plt.xlabel("阈值", size=23)
    plt.ylabel('${E_{RMSE}}$/A', size=23)
    plt.savefig(r'picture\(自适应)缺失率汇总_RMSE.png',
                format='png',
                bbox_inches='tight',
                transparent=True)
Exemplo n.º 6
0
def get_lookback_fill():
    """Plot the filling RMSE against the look-back (input window) length.

    Rows 0:5500 of the complete series are the training data and rows
    5500:6400 are the test slice. For every missing ratio and every look-back
    in 12..40, training windows are rebuilt with ``create_dataset``,
    ``create_miss_data`` produces a fresh gapped copy of the test slice, a
    ``defKnn.KNNClassifier(7)`` fills it point by point, and the RMSE (rounded
    to two decimals) against the untouched test slice is recorded.

    One PNG per ratio and one summary PNG (element-wise sum of the per-ratio
    curves) are saved in the ``picture`` directory. Returns nothing.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')

    lost_file = r"file\2019.9.9-9.19(lost).csv"
    # NOTE(review): the loaded frame is never read below; kept because
    # ``create_dataframe`` is opaque here -- drop the load if it is pure.
    data_lost = create_dataframe(lost_file, 'I')

    missing_ratio = [0.03, 0.06, 0.1, 0.2, 0.3]
    # Train / test split of the complete series.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]

    look_back_list = list(range(12, 41))
    rmse_all = []

    for ratio in missing_ratio:
        rmse_list = []
        for look_back in look_back_list:
            train_x, train_y = create_dataset(data_train.values, look_back)
            data_cut, _ = create_miss_data(data_test, ratio, 10)
            lost_index = get_lost_index(data_cut)

            # Project-local KNN implementation.
            knn2 = defKnn.KNNClassifier(7)
            knn2.fit(train_x, train_y)

            sq_err = 0
            filled_count = 0  # avoids shadowing the builtin ``sum``
            data_fill = data_cut.copy()
            for gap in lost_index:
                # Each entry is read as (first, last) with ``last`` inclusive.
                for j in range(gap[0], gap[1] + 1):
                    filled_count += 1
                    look_back_x = np.array(
                        data_fill.values[j - look_back:j]).reshape(1, -1)
                    val = knn2.predict(look_back_x)
                    # NOTE(review): ``.loc`` slices include the end label, so
                    # this may also write label ``j + 1`` -- confirm.
                    data_fill.loc[j:j + 1, 'y'] = val
                    sq_err += pow(val - data_test.iloc[j]['y'], 2)

            # NaN instead of ZeroDivisionError when nothing had to be filled.
            rmse = (math.sqrt(sq_err / filled_count)
                    if filled_count else float('nan'))
            rmse_list.append(float('%.2f' % rmse))

        if rmse_all:
            rmse_all = [total + cur for total, cur in zip(rmse_all, rmse_list)]
        else:
            rmse_all = rmse_list

        # Per-ratio RMSE curve (numeric x axis).
        plt.figure(figsize=(15, 9))
        plt.plot(np.array(look_back_list), rmse_list, 'black')
        plt.tick_params(labelsize=23)
        plt.xlabel("输入特征维度", size=23)
        plt.ylabel('${E_{RMSE}}$/A', size=23)
        plt.savefig(r'picture\(输入维度)缺失率=' + str(ratio) + '_RMSE.png',
                    format='png',
                    bbox_inches='tight',
                    transparent=True)
        print('(输入维度)缺失率=' + str(ratio) + '_RMSE.png 已完成')

    # Summary curve: RMSE summed over all missing ratios, string x labels.
    plt.figure(figsize=(15, 9))
    plt.tick_params(labelsize=23)
    # ``np.str`` was removed in NumPy 1.24; the builtin ``str`` is the
    # documented replacement and yields the same string labels.
    plt.plot(np.array(look_back_list).astype(str), rmse_all, 'black')
    plt.xlabel("输入特征维度", size=23)
    plt.ylabel('${E_{RMSE}}$/A', size=23)
    plt.savefig(r'picture\(输入维度)缺失率汇总_RMSE.png',
                format='png',
                bbox_inches='tight',
                transparent=True)
Exemplo n.º 7
0
def find_lookback_k():
    """Pick a look-back length and K per gap, then fill and plot the result.

    Rows 5500:6400 of the gapped series are the segment to repair; rows 0:5500
    of the complete series are the training data. For every gap reported by
    ``get_lost_index``, all look-backs in 5..14 and K in 3..9 are tried: a
    ``defKnn.KNNClassifier(K)`` fills the gap in a scratch copy, and the pair
    with the largest non-NaN ``knn2.get_grc_cof() / K / gap_length`` is kept.
    The gap is then filled in the final copy with the winning pair. Progress is
    printed, and three figures (real vs. auto-filled, gapped, real) are shown.
    """
    data_complete = create_dataframe(r"file\2019.9.9-9.19(completed).csv", 'I')
    data_lost = create_dataframe(r"file\2019.9.9-9.19(lost).csv", 'I')

    # Segment of the gapped series that contains the missing readings.
    data_cut = data_lost.iloc[5500:6400]
    values = data_cut.values.flatten()  # not used further in this function
    lost_index = get_lost_index(data_cut)
    print(lost_index)

    # Training slice and the matching reference slice of the complete series.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]

    data_fill = data_cut.copy()      # scratch copy used during the search
    data_fill_all = data_cut.copy()  # final result
    for gap_no, gap in enumerate(lost_index, start=1):
        print("===================No.%d missing data====================" % gap_no)
        # Entries are read as (first, last) with ``last`` inclusive.
        gap_positions = range(gap[0], gap[1] + 1)
        grc_cof_max = 0
        k_best = 0
        look_back_best = 0
        for look_back in range(5, 15):
            train_x, train_y = create_dataset(data_train.values, look_back)
            for n_neighbors in range(3, 10):
                print("------look_back = %d, k= %d-------" % (look_back, n_neighbors))
                # Project-local KNN implementation.
                knn2 = defKnn.KNNClassifier(n_neighbors)
                knn2.fit(train_x, train_y)

                for j in gap_positions:
                    history = np.array(data_fill.values[j - look_back:j])
                    val2 = knn2.predict(history.reshape(1, -1))
                    data_fill.loc[j:j + 1, 'y'] = val2

                temp = knn2.get_grc_cof() / n_neighbors / len(gap_positions)

                is_better = not np.isnan(temp) and temp > grc_cof_max
                if is_better:
                    grc_cof_max = temp
                    k_best = n_neighbors
                    look_back_best = look_back
                print("Now, No.%d: best look_back = %d, best k = %d\n" % (gap_no, look_back_best, k_best))
        print("No.%d: k_best = %d, look_back_best =%d\n" % (gap_no, k_best, look_back_best))

        # Refit with the selected pair and fill this gap in the final copy.
        train_x, train_y = create_dataset(data_train.values, look_back_best)
        knn2 = defKnn.KNNClassifier(k_best)
        knn2.fit(train_x, train_y)
        for j in gap_positions:
            history = np.array(data_fill_all.values[j - look_back_best:j])
            val = knn2.predict(history.reshape(1, -1))
            data_fill_all.loc[j:j + 1, 'y'] = val
    print("=================================end===============================")

    plt.figure(0, figsize=(12, 6))
    plt.plot(data_test.index, data_test['y'], label='real')
    plt.plot(data_fill_all.index, data_fill_all['y'], label='auto-fill')
    plt.legend()

    plt.figure(1, figsize=(12, 6))
    plt.plot(data_cut.index, data_cut['y'], label='missing')
    plt.legend()

    plt.figure(2, figsize=(12, 6))
    plt.plot(data_test.index, data_test['y'], label='real')
    plt.legend()
    plt.show()