def impute(xt, xv, strategy):
    """Impute missing values in a train/validation pair of frames.

    strategy: 'mice', 'knn', or any sklearn SimpleImputer strategy
    ('mean', 'median', ...). For 'mice'/'knn' each frame is imputed
    independently; otherwise the imputer is fit on xt only and
    applied to xv. Returns (xt_imputed, xv_imputed).
    """
    if strategy == 'mice':
        return mice(xt), mice(xv)
    if strategy == 'knn':
        return (KNN(k=K, verbose=False).fit_transform(xt),
                KNN(k=K, verbose=False).fit_transform(xv))
    # any other strategy string goes straight to SimpleImputer
    imputer = SimpleImputer(strategy=strategy)
    train_filled = pd.DataFrame(imputer.fit_transform(xt))
    valid_filled = pd.DataFrame(imputer.transform(xv))
    # fit_transform returns a bare array; restore the column labels
    train_filled.columns = xt.columns
    valid_filled.columns = xv.columns
    return train_filled, valid_filled
示例#2
0
def parse_data_for_model(df):
    """Engineer features for the fraud model.

    Adds a binary 'fraud' target, zero-fills selected columns,
    derives 'has_location', binarises 'listed', one-hot encodes
    currency/payout_type, drops columns unused by the model, and
    KNN-imputes whatever missing values remain.
    """
    # binary target: acct_type contains the word 'fraud'
    df['fraud'] = 1 * df.acct_type.str.contains('fraud')
    # treat NaN as zero in these count-like columns
    for zero_fill in ('has_header', 'delivery_method', 'org_facebook',
                      'org_twitter'):
        df[zero_fill].fillna(0, inplace=True)
    # a row "has a location" when venue coordinates are present
    df['has_location'] = 1 * df['venue_latitude'].notnull()
    df['listed'] = df['listed'].map({'y': 1, 'n': 0})
    # one-hot encode the two categoricals
    currency_dummies = pd.get_dummies(df['currency'])
    payout_dummies = pd.get_dummies(df['payout_type'])
    df = pd.concat([df, currency_dummies, payout_dummies], axis=1)
    # columns the model does not use (dropped along axis=1)
    unused_columns = [
        'acct_type', 'approx_payout_date', 'channels', 'country',
        'description', 'email_domain', 'event_created', 'event_end',
        'event_published', 'event_start', 'gts', 'name', 'num_order',
        'object_id', 'org_desc', 'org_name', 'payee_name', 'previous_payouts',
        'sale_duration2', 'ticket_types', 'user_created', 'venue_address',
        'venue_country', 'venue_name', 'venue_state', 'currency', 'USD',
        'payout_type', 'CHECK', ''
    ]
    df.drop(unused_columns, axis=1, inplace=True)
    # KNN-impute remaining gaps (mainly longitude/latitude), then restore
    # the column labels that fit_transform discards
    cols = df.columns
    df = pd.DataFrame(KNN(k=3).fit_transform(df))
    df.columns = cols
    return df
def impute_knn(df):
    """KNN-impute every feature column of df, leaving target 'y' intact."""
    target = df.y
    features = df.drop('y', axis=1)
    # fit_transform returns a bare array; rebuild the frame and its labels
    filled = pd.DataFrame(KNN(k=3, verbose=False).fit_transform(features))
    filled.columns = features.columns
    filled['y'] = target
    print('Data is imputed with using Knn Imputation')
    return filled
示例#4
0
def impute_missing_data(df):
    """Return df with missing values filled via KNN (k=3), columns preserved."""
    original_columns = df.columns

    # KNN imputation intended for missing longitude/latitude values;
    # fit_transform returns a bare ndarray
    imputed = KNN(k=3).fit_transform(df)

    # rebuild the DataFrame and restore the original column labels
    result = pd.DataFrame(imputed)
    result.columns = original_columns

    return result
def replace_missingness(df):
    """Clean *df* and impute the remaining gaps with KNN.

    Steps: mark negative values and the sentinel strings 'Missing' /
    'Other' as NaN, drop columns with more than 75% missingness, drop
    non-numeric columns, KNN-impute (k=109) and save the result to
    'imputed_background_jobTraining.csv'.

    Fixes vs. the original: Python 2 ``print`` statements converted to
    the ``print()`` calls used everywhere else in this file, and the
    removed fancyimpute ``.complete()`` API replaced by
    ``.fit_transform()`` for consistency with the other functions here.

    NOTE(review): the tail of this body (from the data_Aqua KNN block)
    references names not defined in this scope (data_Aqua, data_Terra)
    and will raise NameError if reached — it looks like unrelated
    pasted code; it is preserved unchanged pending confirmation.
    """
    # replace all negative values with NaN
    print("Replacing missing values with NaN")
    df[df < 0] = np.nan

    # replace the sentinel strings 'Missing' / 'Other' with NaN
    df[df == 'Missing'] = np.nan
    df[df == 'Other'] = np.nan

    # drop columns with over 75% missingness
    print("Dropping columns with over 75% missingness")
    include_col = []
    theta = .75
    dropped_col = 0
    for col in df.columns:
        missingness = df[col].isnull().sum() / len(df[col])
        if theta >= missingness:
            include_col.append(col)
        else:
            dropped_col += 1
    df = df[include_col]
    print("Dropped ", dropped_col, " columns")

    # find and drop columns that do not contain numeric values
    print("Dropping columns that do not contain numeric values")
    df_subset = df.select_dtypes(exclude=[np.number])
    print("Columns dropped: ", df_subset.columns)
    df = df.drop(df_subset.columns, axis=1)

    # KNN imputation; fit_transform is the current fancyimpute API
    # (.complete() was removed upstream)
    print("Running Knn imputation")
    df_imputed_columns = df.columns
    k = 109
    df_imputed = KNN(k=k).fit_transform(df)
    df_imputed = pd.DataFrame(df_imputed)
    df_imputed.columns = df_imputed_columns

    # save result to imputed_background_jobTraining.csv
    df_imputed.to_csv('imputed_background_jobTraining.csv',
                      sep=',',
                      index=False)
    # Temporal-local KNN: samples weighted by the MSE of features observed
    # in both rows; with 17 feature points, k=7 weights the 7 temporally
    # nearest rows per feature.
    # NOTE(review): data_Aqua / data_Terra are undefined in this function —
    # the block below raises NameError if executed; preserved as found.
    try:
        data_Aqua_KNN = KNN(k=7).fit_transform(data_Aqua)
        data_Aqua_KNN = pd.DataFrame(data_Aqua_KNN)  # many zeros in the result that should be NaN

        data_Terra_KNN = KNN(k=7).fit_transform(data_Terra)
        data_Terra_KNN = pd.DataFrame(data_Terra_KNN)  # many zeros in the result that should be NaN

    except Exception as e:
        data_Terra_KNN = copy.deepcopy(data_Terra)
        data_Aqua_KNN = copy.deepcopy(data_Aqua)
    data_Aqua_KNN = data_Aqua_KNN.set_index(data_Aqua.index)
    data_Aqua_KNN.columns = data_Aqua.columns
    # data_Aqua_KNN["日期合并用"] = data_Aqua_KNN.index
    data_Terra_KNN = data_Terra_KNN.set_index(data_Terra.index)
    data_Terra_KNN.columns = data_Terra.columns
    # data_Terra_KNN["日期合并用"] = data_Terra_KNN.index

    data_Aqua_KNN = data_Aqua_KNN[['NDVI_0']]
    data_Terra_KNN = data_Terra_KNN[['NDVI_0']]
    # Temporal-global: exponential smoothing, common for stock series
    data_Aqua_ewm = pd.DataFrame.ewm(self=data_Aqua,
                                     com=0.5,
                                     ignore_na=True,
                                     adjust=True).mean()
    data_Aqua_ewm = data_Aqua_ewm.set_index(data_Aqua.index)
    data_Aqua_ewm.columns = data_Aqua.columns
    # data_Aqua_ewm["日期合并用"] = data_Aqua_ewm.index
def get4method(xx152):
    """Impute missing pollution values for every monitoring station listed
    in Excel sheet *xx152*, using four methods — KNN, exponentially
    weighted smoothing (ewm), inverse-distance weighting (IDW) and
    iterative regression — and write the four results as sheets of one
    workbook per station.

    NOTE(review): depends on module-level globals not defined in this
    block (JCZ_info, input_file_path_pollution, saved_list,
    merge_output_file_path, KNN, IterativeImputer, copy, np, pd, and
    radians/sin/cos/asin/sqrt).  The statements after the station loop
    (from ``data_pollution_IDW = get_IDW(data_pollution)``) reuse loop
    variables outside the loop and appear to be leftover code;
    ``sheet_name_count`` at the end is never consumed.
    """
    # great-circle (haversine) distance between two points
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2)**2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2)**2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius in km, scaled to metres
        return dis  # result is in metres

    # Spatial: interpolation is hard here because a pollutant tends to be
    # missing at a station and at its neighbours simultaneously.
    def get_IDW(input_data):
        for darksky_weather in input_data.columns:  # pollutant columns
            for indx in input_data.index:  # row (date) index
                print(darksky_weather, indx)
                res_list = []
                weight_list = []
                if pd.isnull(input_data[darksky_weather][indx]):  # only fill missing cells
                    for item_idw in JCZ_info["监测站"]:  # pass 1: collect weights from neighbours
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2,
                                                 lat2)  # distance between the two stations
                            if dis_1 <= 200000:  # 200 km cutoff (median 345933, mean 333118 over the 152 stations)
                                data_to_add_in_1 = pd.read_excel(
                                    input_file_path_pollution + item_idw +
                                    ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index(
                                    "日期")  # index by date for the row lookup below
                                if indx in data_to_add_in_1.index and pd.notnull(
                                        data_to_add_in_1[darksky_weather]
                                    [indx]):
                                    weight_list.append((1 / dis_1))
                    weight_sum = np.sum(np.array(weight_list))  # weight denominator
                    for item_idw_2 in JCZ_info["监测站"]:  # pass 2: apply the weights
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2,
                                                 lat2)  # distance between the two stations
                            if dis_1 <= 200000:
                                data_to_add_in = pd.read_excel(
                                    input_file_path_pollution + item_idw_2 +
                                    ".xlsx")
                                data_to_add_in = data_to_add_in.set_index(
                                    "日期")  # index by date for the row lookup below
                                if indx in data_to_add_in.index and pd.notnull(
                                        data_to_add_in[darksky_weather][indx]):
                                    res = ((1/dis_1) / weight_sum) * \
                                        data_to_add_in[darksky_weather][indx]
                                    res_list.append(res)
                                    # print("added interpolated cell:", res)
                    # If the sums above come out NaN no error is raised; the
                    # final imputed value simply ends up NaN.
                    res_output = np.sum(np.array(res_list))
                    try:
                        input_data.loc[indx, darksky_weather] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # station list
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        if input_file_name in saved_list:
            print("已经完成:", input_file_name, xx152)
            # continue
        print("========正在计算%s========" % input_file_name)
        # load the source data
        data_pollution = pd.read_excel(input_file_path_pollution +
                                       input_file_name)
        data_pollution = data_pollution.set_index('日期')

        # Temporal: ewm smoothing (common for stock series); builds a new
        # frame so the original is not overwritten
        data_pollution_ewm_mid = pd.DataFrame.ewm(self=data_pollution,
                                                  com=0.5,
                                                  ignore_na=True,
                                                  adjust=True).mean()
        data_pollution_ewm = copy.deepcopy(data_pollution)  # avoid mutating the original
        for columname in data_pollution_ewm.columns:
            if data_pollution[columname].count() != len(data_pollution):
                loc = data_pollution[columname][
                    data_pollution[columname].isnull().values ==
                    True].index.tolist()
                for nub in loc:
                    data_pollution_ewm.loc[
                        nub, columname] = data_pollution_ewm_mid.loc[nub,
                                                                     columname]

        print('[ewm]Finished')

        # station longitude / latitude
        data_pollution_to_IDW = copy.deepcopy(data_pollution)
        name = str(input_file_name).replace(".xlsx", "")  # station name (file name sans extension)
        lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
        lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
        # Spatial: IDW, inverse-distance interpolation
        data_pollution_IDW = get_IDW(data_pollution_to_IDW)
        # Global: iterative regression — the missing feature is y, the others are x
        merge_list = []  # same station, different pollutants
        for darksky_weather_Iterative in data_pollution.columns:
            # merge step
            numb = 0
            data_darksky_weather_to_Iterative = copy.deepcopy(
                data_pollution[[darksky_weather_Iterative]])
            data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index(
            )
            for item in JCZ_info["监测站"]:  # differs from the aerosol interpolation method
                if item != name:
                    # neighbour file to merge in
                    data_to_add_in_to_Iterative = pd.read_excel(
                        input_file_path_pollution + item + ".xlsx")
                    # columns to merge in
                    data_to_Iterative_concat = data_to_add_in_to_Iterative[[
                        darksky_weather_Iterative, '日期'
                    ]]
                    data_to_Iterative_concat.columns = [
                        darksky_weather_Iterative + "_add%s" % numb, '日期'
                    ]  # e.g. five neighbours give NDVI1-NDVI5
                    data_darksky_weather_to_Iterative = pd.merge(
                        data_darksky_weather_to_Iterative,
                        data_to_Iterative_concat,
                        how='left',
                        on='日期')
                    numb += 1
                    # data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index('日期')
                    # which of the two placements (above or below) is right?
            data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index(
                '日期')
            # iteration step
            count_1 = 0
            for value_1 in data_darksky_weather_to_Iterative.sum():
                if value_1 != 0:
                    count_1 += 1
            if count_1 > 1:  # need at least two non-empty columns
                data_darksky_weather_Iterative_to_merge = IterativeImputer(
                    max_iter=30).fit_transform(
                        data_darksky_weather_to_Iterative)
            else:
                data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                    data_darksky_weather_to_Iterative)
            data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                data_darksky_weather_Iterative_to_merge)  # back to a DataFrame
            data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                data_darksky_weather_to_Iterative.index)  # ok
            if len(data_darksky_weather_Iterative_to_merge.columns) < len(
                    data_darksky_weather_to_Iterative.columns):
                reset_col_name_list = []  # name the non-NaN columns first
                for col_name in data_darksky_weather_to_Iterative.columns:
                    if np.max(data_darksky_weather_to_Iterative[col_name]) > 0:
                        reset_col_name_list.append(col_name)
                data_darksky_weather_Iterative_to_merge.columns = reset_col_name_list

                for col_name in data_darksky_weather_to_Iterative.columns:  # re-add the dropped all-NaN columns
                    if col_name not in data_darksky_weather_Iterative_to_merge.columns:
                        # restore a missing NaN column
                        data_darksky_weather_Iterative_to_merge[
                            col_name] = np.nan
            else:
                data_darksky_weather_Iterative_to_merge.columns = data_darksky_weather_to_Iterative.columns  # restore column names
            for numb_del in data_darksky_weather_Iterative_to_merge.columns:
                if 'add' in numb_del:
                    del data_darksky_weather_Iterative_to_merge[numb_del]

            # the imputed column for this station (a single column); the loop
            # collects the other pollutants
            merge_list.append(data_darksky_weather_Iterative_to_merge)
        data_darksky_weather_Iterative_1 = pd.concat(merge_list,
                                                     axis=1,
                                                     sort=False)
        print('[Iterative]Finished')
        # Local
        # KNN: uses the K rows with all features present, weighted by the MSE
        # of the other features, to find the nearest time points.
        merge_list2 = []  # same station, different pollutants

        for pol in data_pollution.columns:
            data_knn_raw = copy.deepcopy(data_pollution[[pol]])
            data_knn_raw = data_knn_raw.reset_index()
            numb1 = 0
            for item_idw in JCZ_info["监测站"]:  # collect distances, define weights
                if item_idw != name:
                    lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                    lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                    dis_knn = geo_distance(lng1, lat1, lng2, lat2)  # distance between the two stations
                    if dis_knn <= 200000:
                        data_knnadd = pd.read_excel(input_file_path_pollution +
                                                    item_idw + '.xlsx')
                        data_knnadd = data_knnadd[[pol, '日期']]
                        data_knnadd.columns = [pol + "add_%s" % numb1, '日期']
                        if data_knnadd[pol + "add_%s" % numb1].sum() == 0:
                            continue
                        else:
                            data_knn_raw = pd.merge(data_knn_raw,
                                                    data_knnadd,
                                                    how='left',
                                                    on='日期')
                numb1 += 1
            data_knn_raw = data_knn_raw.set_index('日期')
            if pol + 'add_0' in data_knn_raw.columns:
                print('============================================')
                data_pollution_KNN = KNN(k=30).fit_transform(data_knn_raw)
                data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
                data_pollution_KNN.columns = data_knn_raw.columns
            else:
                data_pollution_KNN = copy.deepcopy(data_knn_raw)
            for numb_del2 in data_pollution_KNN.columns:
                if 'add' in numb_del2:
                    del data_pollution_KNN[numb_del2]
            merge_list2.append(data_pollution_KNN)
        data_darksky_weather_KNN_1 = pd.concat(merge_list2, axis=1, sort=True)

        # map zeros in the results back to NaN
        data_darksky_weather_KNN_1.replace(0, np.nan, inplace=True)
        data_pollution_ewm.replace(0, np.nan, inplace=True)
        data_pollution_IDW.replace(0, np.nan, inplace=True)
        data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)

        # restore index/columns on each method's result
        data_pollution_KNN = data_darksky_weather_KNN_1.set_index(
            data_pollution.index)
        data_pollution_KNN.columns = data_pollution.columns
        data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
        data_pollution_ewm.columns = data_pollution.columns
        data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
        data_pollution_IDW.columns = data_pollution.columns
        data_pollution_Iterative = data_darksky_weather_Iterative_1.set_index(
            data_pollution.index)
        data_pollution_Iterative.columns = data_pollution.columns

        # write the four methods into one workbook
        sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
        sheet_name_count = 0
        writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                (input_file_name.replace(".xlsx", "")))
        for methods_output in [
                data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
                data_pollution_Iterative
        ]:
            methods_output.to_excel(writer,
                                    sheet_name=sheet_name[sheet_name_count])
            sheet_name_count = 1 + sheet_name_count
        writer.save()
    data_pollution_IDW = get_IDW(data_pollution)

    # Spatial-global: iterative imputation — the missing feature is y, the others are x
    data_pollution_Iterative = IterativeImputer(
        max_iter=10).fit_transform(data_pollution)
    data_pollution_Iterative = pd.DataFrame(data_pollution_Iterative)

    # map zeros in the results back to NaN
    data_pollution_KNN.replace(0, np.nan, inplace=True)
    data_pollution_ewm.replace(0, np.nan, inplace=True)
    data_pollution_IDW.replace(0, np.nan, inplace=True)
    data_pollution_Iterative.replace(0, np.nan, inplace=True)

    # restore index/columns on each method's result
    data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
    data_pollution_KNN.columns = data_pollution.columns
    # data_pollution_KNN["日期合并用"] = data_pollution_KNN.index
    data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
    data_pollution_ewm.columns = data_pollution.columns
    # data_pollution_ewm["日期合并用"] = data_pollution_ewm.index
    data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
    data_pollution_IDW.columns = data_pollution.columns
    # data_pollution_IDW["日期合并用"] = data_pollution_IDW.index
    data_pollution_Iterative = data_pollution_Iterative.set_index(
        data_pollution.index)
    data_pollution_Iterative.columns = data_pollution.columns
    # data_pollution_Iterative["日期合并用"] = data_pollution_Iterative.index

    # write the four methods into one workbook
    sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
    sheet_name_count = 0  # NOTE(review): flagged as unused — the branch above may not have run
def get4method(xx152):
    """Darksky-weather variant of the four-method imputation: for every
    station in sheet *xx152*, impute the weather columns with KNN, ewm
    smoothing, IDW and iterative regression, and write the four results
    as sheets of one workbook per station.

    NOTE(review): this redefines ``get4method`` declared earlier in the
    file — only this later definition survives at import time.  It also
    depends on module-level globals not defined in this block
    (JCZ_info, input_file_path_darksky_weather, merge_output_file_path,
    KNN, IterativeImputer, copy, np, pd, radians/sin/cos/asin/sqrt).
    Per-station errors are caught and printed; ``error_list`` is
    created but never appended to.
    """
    # great-circle (haversine) distance between two points
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2) ** 2 + cos(lat1_df) * \
            cos(lat2_df) * sin(d_lon / 2) ** 2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius in km, scaled to metres
        return dis  # result is in metres

    # Spatial-local: interpolation is hard because a variable tends to be
    # missing at a station and at its neighbours simultaneously.
    def get_IDW(input_data):
        for darksky_weather in [
            'apparentTemperatureHigh',
            'apparentTemperatureLow',
            'apparentTemperatureMax',
            'apparentTemperatureMin',
            'cloudCover',
            'dewPoint',
            'humidity',
            'moonPhase',
            'ozone',
            'precipAccumulation',
            'precipIntensity',
            'precipIntensityMax',
            'pressure',
            'sunriseTime',
            'sunsetTime',
            'temperatureHigh',
            'temperatureLow',
            'temperatureMax',
            'temperatureMin',
            'uvIndex',
            'visibility',
            'windBearing',
            'windGust',
            'windSpeed',
            'apparentTemperature',
                'temperature']:  # weather columns to fill
            for indx in input_data.index:  # row (date) index
                res_list = []
                weight_list = []
                if pd.isnull(input_data[darksky_weather][indx]):  # only fill missing cells
                    for item_idw in JCZ_info["监测站"]:  # pass 1: collect weights from neighbours
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                            dis_1 = geo_distance(
                                lng1, lat1, lng2, lat2)  # distance between the two stations
                            if dis_1 <= 50000:
                                data_to_add_in_1 = pd.read_excel(
                                    input_file_path_darksky_weather + item_idw + ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index(
                                    "日期")  # index by date for the row lookup below
                                if indx in data_to_add_in_1.index and pd.notnull(
                                        data_to_add_in_1[darksky_weather][indx]):
                                    # NOTE(review): appends dis_1 itself, so nearer stations get
                                    # SMALLER weights; the pollution variant appends 1/dis_1 —
                                    # confirm which is intended.
                                    weight_list.append(dis_1)
                    weight_sum = np.sum(np.array(weight_list))  # weight denominator
                    for item_idw_2 in JCZ_info["监测站"]:  # pass 2: apply the weights
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"]
                                            == item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"]
                                            == item_idw_2]["纬度"]
                            dis_1 = geo_distance(
                                lng1, lat1, lng2, lat2)  # distance between the two stations
                            if dis_1 <= 50000:
                                data_to_add_in = pd.read_excel(
                                    input_file_path_darksky_weather + item_idw_2 + ".xlsx")
                                data_to_add_in = data_to_add_in.set_index(
                                    "日期")  # index by date for the row lookup below
                                if indx in data_to_add_in.index and pd.notnull(
                                        data_to_add_in[darksky_weather][indx]):
                                    res = (dis_1 / weight_sum) * \
                                        data_to_add_in[darksky_weather][indx]
                                    res_list.append(res)
                                    # print("added interpolated cell:", res)
                    # If the sums above come out NaN no error is raised; the
                    # final imputed value simply ends up NaN.
                    res_output = np.sum(np.array(res_list))
                    try:
                        input_data.loc[indx, darksky_weather] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # station list
    jcz_152 = pd.read_excel(
        "D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
        sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    error_list = []
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        # if input_file_name in saved_list:
        # print("已经完成:", input_file_name, xx152)
        # continue
        print("========正在计算%s========" % input_file_name)
        try:
            # load the source data
            data_darksky_weather = pd.read_excel(
                input_file_path_darksky_weather + input_file_name)
            data_darksky_weather = data_darksky_weather.set_index('日期')
            # Temporal-local KNN: uses the K rows with all features present,
            # weighted by the MSE of the other features, to find the nearest
            # time points.
            data_darksky_weather_KNN = KNN(
                k=7).fit_transform(data_darksky_weather)
            data_darksky_weather_KNN = pd.DataFrame(data_darksky_weather_KNN)
            # Temporal-global: ewm smoothing (common for stock series); builds
            # a new frame so the original is not overwritten
            data_darksky_weather_ewm_mid = pd.DataFrame.ewm(
                self=data_darksky_weather,
                com=0.5,
                ignore_na=True,
                adjust=True).mean()
            data_darksky_weather_ewm = copy.deepcopy(
                data_darksky_weather)  # avoid mutating the original
            for columname in data_darksky_weather_ewm.columns:
                if data_darksky_weather[columname].count() != len(
                        data_darksky_weather):
                    loc = data_darksky_weather[columname][data_darksky_weather[columname].isnull(
                    ).values].index.tolist()
                    for nub in loc:
                        data_darksky_weather_ewm.loc[nub,
                                                     columname] = data_darksky_weather_ewm_mid.loc[nub,
                                                                                                   columname]

            # spatial setup
            data_darksky_weather_to_IDW = copy.deepcopy(data_darksky_weather)
            name = str(input_file_name).replace(".xlsx", "")  # station name (file name sans extension)
            lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
            lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
            # Spatial-local: IDW, inverse-distance interpolation
            data_darksky_weather_IDW = get_IDW(data_darksky_weather_to_IDW)

            # Spatial-global: iterative regression — the missing feature is y,
            # the others are x
            merge_list = []  # same station, different weather variables
            for darksky_weather_Iterative in [
                'apparentTemperatureHigh',
                'apparentTemperatureLow',
                'apparentTemperatureMax',
                'apparentTemperatureMin',
                'cloudCover',
                'dewPoint',
                'humidity',
                'moonPhase',
                'ozone',
                'precipAccumulation',
                'precipIntensity',
                'precipIntensityMax',
                'pressure',
                'sunriseTime',
                'sunsetTime',
                'temperatureHigh',
                'temperatureLow',
                'temperatureMax',
                'temperatureMin',
                'uvIndex',
                'visibility',
                'windBearing',
                'windGust',
                'windSpeed',
                'apparentTemperature',
                    'temperature']:
                # merge step
                numb = 0
                data_darksky_weather_to_Iterative = copy.deepcopy(data_darksky_weather[[darksky_weather_Iterative]])
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index()
                for item in JCZ_info["监测站"]:  # differs from the aerosol interpolation method
                    if item != name:
                        # neighbour file to merge in
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_darksky_weather + item + ".xlsx")
                        # columns to merge in
                        data_to_Iterative_concat = data_to_add_in_to_Iterative[[darksky_weather_Iterative, '日期']]
                        data_to_Iterative_concat.columns = [darksky_weather_Iterative + "_add%s" % numb,
                                                            '日期']  # e.g. five neighbours give NDVI1-NDVI5

                        data_darksky_weather_to_Iterative = pd.merge(data_darksky_weather_to_Iterative,
                                                                     data_to_Iterative_concat,
                                                                     how='left',
                                                                     on='日期')
                        data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index('日期')
                    numb += 1
                # iteration step
                count_1 = 0
                for value_1 in data_darksky_weather_to_Iterative.sum():
                    if value_1 != 0:
                        count_1 += 1
                if count_1 > 1:  # need at least two non-empty columns
                    data_darksky_weather_Iterative_to_merge = IterativeImputer(
                        max_iter=100).fit_transform(data_darksky_weather_to_Iterative)
                else:
                    data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                        data_darksky_weather_to_Iterative)
                data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                    data_darksky_weather_Iterative_to_merge)  # back to a DataFrame
                data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                    data_darksky_weather_to_Iterative.index)  # ok
                if len(data_darksky_weather_Iterative_to_merge.columns) < len(data_darksky_weather_to_Iterative.columns):
                    reset_col_name_list = []  # name the non-NaN columns first
                    for col_name in data_darksky_weather_to_Iterative.columns:
                        if np.max(data_darksky_weather_to_Iterative[col_name]) > 0:
                            reset_col_name_list.append(col_name)
                    data_darksky_weather_Iterative_to_merge.columns = reset_col_name_list

                    for col_name in data_darksky_weather_to_Iterative.columns:  # re-add the dropped all-NaN columns
                        if col_name not in data_darksky_weather_Iterative_to_merge.columns:
                            # restore a missing NaN column
                            data_darksky_weather_Iterative_to_merge[col_name] = np.nan
                else:
                    data_darksky_weather_Iterative_to_merge.columns = data_darksky_weather_to_Iterative.columns  # restore column names
                for numb_del in range(numb):
                    if darksky_weather_Iterative + "_add%s" % numb_del not in data_darksky_weather_Iterative_to_merge.columns:
                        continue
                    else:
                        del data_darksky_weather_Iterative_to_merge[darksky_weather_Iterative +
                                                                    "_add%s" %
                                                                    numb_del]
                # the imputed column for this station (a single column); the
                # loop collects the other variables
                merge_list.append(data_darksky_weather_Iterative_to_merge)
            data_darksky_weather_Iterative_1 = pd.concat(
                merge_list, axis=1, sort=False)

            # map zeros in the results back to NaN
            data_darksky_weather_KNN.replace(0, np.nan, inplace=True)
            data_darksky_weather_ewm.replace(0, np.nan, inplace=True)
            data_darksky_weather_IDW.replace(0, np.nan, inplace=True)
            data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)

            # restore index/columns on each method's result
            data_darksky_weather_KNN = data_darksky_weather_KNN.set_index(
                data_darksky_weather.index)
            data_darksky_weather_KNN.columns = data_darksky_weather.columns
            data_darksky_weather_ewm = data_darksky_weather_ewm.set_index(
                data_darksky_weather.index)
            data_darksky_weather_ewm.columns = data_darksky_weather.columns
            data_darksky_weather_IDW = data_darksky_weather_IDW.set_index(
                data_darksky_weather.index)
            data_darksky_weather_IDW.columns = data_darksky_weather.columns
            data_darksky_weather_Iterative = data_darksky_weather_Iterative_1.set_index(
                data_darksky_weather.index)
            data_darksky_weather_Iterative.columns = data_darksky_weather.columns

            # write the four methods into one workbook

            sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
            sheet_name_count = 0
            writer = pd.ExcelWriter(
                merge_output_file_path + '%s.xlsx' %
                (input_file_name.replace(
                    ".xlsx", "")))
            for methods_output in [
                    data_darksky_weather_KNN,
                    data_darksky_weather_ewm,
                    data_darksky_weather_IDW,
                    data_darksky_weather_Iterative]:
                methods_output.to_excel(
                    writer, sheet_name=sheet_name[sheet_name_count])
                sheet_name_count = 1 + sheet_name_count
            writer.save()

        except Exception as e:
            print(input_file_name, "发生错误:", e)
示例#10
0
def get4method(xx152):
    """Impute missing pollutant values for every monitoring station listed on
    Excel sheet *xx152* using four methods — KNN (temporal-local), EWM
    smoothing (temporal-global), IDW (spatial-local) and IterativeImputer
    (spatial-global) — then write the four results into one xlsx file per
    station, one sheet per method.

    NOTE(review): relies on module-level names not visible in this block
    (JCZ_info, input_file_path_pollution, merge_output_file_path, saved_list,
    KNN, IterativeImputer, copy, pd, np and radians/sin/cos/asin/sqrt) —
    confirm against the file header.
    """
    # Great-circle distance helper.
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        # Haversine formula; arguments are in degrees and converted to
        # radians in place.
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2)**2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2)**2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) scaled to metres
        return dis  # result is expressed in metres

    # Spatial-local method. Interpolation is hard here because a region and
    # its neighbours often miss the same pollutant value at the same time.
    def get_IDW(input_data):
        # Reads the closure variables `name`, `lng1`, `lat1`, which the
        # enclosing loop assigns before calling this function.
        for pollution in ["PM25", "PM10", "SO2", "NO2", "O3", "CO"]:  # pollutant columns
            for indx in input_data.index:  # date index
                res_list = []
                weight_list = []
                if pd.isnull(input_data[pollution][indx]):  # only fill missing cells
                    # Pass 1: collect distances of stations within 50 km that
                    # have an observation for this date (weight denominator).
                    for item_idw in JCZ_info["监测站"]:
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2,
                                                 lat2)  # distance between the two stations
                            if dis_1 <= 50000:
                                data_to_add_in_1 = pd.read_excel(
                                    input_file_path_pollution + item_idw +
                                    ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index(
                                    "日期")  # date index is needed for the lookups below
                                if indx in data_to_add_in_1.index and pd.notnull(
                                        data_to_add_in_1[pollution][indx]):
                                    weight_list.append(dis_1)
                    weight_sum = np.sum(np.array(weight_list))  # total distance, weight denominator
                    # Pass 2: accumulate the weighted neighbour observations.
                    # NOTE(review): the weight used is dis_1 / weight_sum,
                    # which gives *farther* stations more weight; classic IDW
                    # uses inverse distance (1/dis) as the later variant in
                    # this file does — confirm this is intentional.
                    for item_idw_2 in JCZ_info["监测站"]:
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2,
                                                 lat2)  # distance between the two stations
                            if dis_1 <= 50000:
                                data_to_add_in = pd.read_excel(
                                    input_file_path_pollution + item_idw_2 +
                                    ".xlsx")
                                data_to_add_in = data_to_add_in.set_index(
                                    "日期")  # date index is needed for the lookups below
                                if indx in data_to_add_in.index and pd.notnull(
                                        data_to_add_in[pollution][indx]):
                                    res = (dis_1 / weight_sum
                                           ) * data_to_add_in[pollution][indx]
                                    res_list.append(res)
                                    # print("已添加单元格插值:", res)
                    res_output = np.sum(
                        np.array(res_list))  # a nan term does not raise here; it makes the result nan
                    try:
                        # NOTE(review): chained assignment — pandas may write
                        # to a copy and warn; .loc would be explicit.
                        input_data[pollution][indx] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # Monitoring-station list for this sheet.
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    # Station name is "<city>-<site name>".
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        if input_file_name in saved_list:  # skip stations already processed
            print("已经完成:", input_file_name, xx152)
            continue
        print("========正在计算%s========" % input_file_name)
        # Load this station's pollutant table, indexed by date.
        data_pollution = pd.read_excel(input_file_path_pollution +
                                       input_file_name)
        data_pollution = data_pollution.set_index('日期')
        # Temporal-local: KNN uses the K rows with complete features, weighted
        # by the MSE of the other features, to find the closest time points.
        data_pollution_KNN = KNN(k=7).fit_transform(data_pollution)
        data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
        # Temporal-global: exponential smoothing (common in financial series);
        # builds a new frame so the raw data is not overwritten.
        data_pollution_ewm_mid = pd.DataFrame.ewm(self=data_pollution,
                                                  com=0.5,
                                                  ignore_na=True,
                                                  adjust=True).mean()
        data_pollution_ewm = copy.deepcopy(data_pollution)  # avoid clobbering the raw data
        for columname in data_pollution_ewm.columns:
            if data_pollution[columname].count() != len(data_pollution):
                # Index positions where this column is missing.
                loc = data_pollution[columname][
                    data_pollution[columname].isnull().values ==
                    True].index.tolist()
                for nub in loc:
                    data_pollution_ewm[columname][
                        nub] = data_pollution_ewm_mid[columname][nub]

        # Spatial methods.
        data_pollution_to_IDW = copy.deepcopy(data_pollution)
        name = str(input_file_name).replace(".xlsx", "")  # station name; read by get_IDW via closure
        lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
        lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
        # Spatial-local: inverse-distance weighting.
        data_pollution_IDW = get_IDW(data_pollution_to_IDW)
        # Spatial-global: iterative regression — the missing feature is y,
        # the other features are x.
        merge_list = []  # same station, one entry per pollutant
        for pollution_Iterative in ["PM25", "PM10", "SO2", "NO2", "O3", "CO"]:
            concat_list = []  # same pollutant, columns from neighbouring stations
            numb = 0
            for item in JCZ_info["监测站"]:  # differs from the aerosol interpolation method
                if item != name:
                    lng_2 = JCZ_info[JCZ_info["监测站"] == item]["经度"]
                    lat_2 = JCZ_info[JCZ_info["监测站"] == item]["纬度"]
                    dis_2 = geo_distance(lng1, lat1, lng_2, lat_2)  # distance between the two stations
                    if dis_2 <= 50000:  # merge neighbouring stations within 50 km
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_pollution + item + ".xlsx")
                        data_to_add_in_to_Iterative = data_to_add_in_to_Iterative.set_index(
                            "日期")
                        data_to_Iterative_concat = data_to_add_in_to_Iterative[
                            pollution_Iterative]
                        data_to_Iterative_concat = pd.DataFrame(
                            data_to_Iterative_concat)
                        data_to_Iterative_concat.columns = [
                            pollution_Iterative + "_add%s" % numb
                        ]
                        concat_list.append(data_to_Iterative_concat)
                        numb += 1
            if len(concat_list) > 0:  # combine this station with its neighbours
                data_to_Iterative = pd.concat(concat_list, axis=1, sort=False)
                data_to_Iterative = pd.concat(
                    [data_pollution[pollution_Iterative], data_to_Iterative],
                    axis=1,
                    sort=False)
            else:
                data_to_Iterative = data_pollution[pollution_Iterative].copy()
                data_to_Iterative = pd.DataFrame(data_to_Iterative)
                data_to_Iterative.columns = [pollution_Iterative]  # this station alone
            data_pollution_Iterative_to_merge = IterativeImputer(
                max_iter=10).fit_transform(data_to_Iterative)
            data_pollution_Iterative_to_merge = pd.DataFrame(
                data_pollution_Iterative_to_merge)
            data_pollution_Iterative_to_merge = data_pollution_Iterative_to_merge.set_index(
                data_to_Iterative.index)
            data_pollution_Iterative_to_merge.columns = data_to_Iterative.columns
            # Drop the helper "_addN" neighbour columns, keeping only this
            # station's own column.
            for numb_del in range(numb):
                del data_pollution_Iterative_to_merge[pollution_Iterative +
                                                      "_add%s" % numb_del]
            merge_list.append(data_pollution_Iterative_to_merge)
        data_pollution_Iterative = pd.concat(merge_list, axis=1, sort=False)

        # Turn zeros in the results back into np.nan.
        data_pollution_KNN.replace(0, np.nan, inplace=True)
        data_pollution_ewm.replace(0, np.nan, inplace=True)
        data_pollution_IDW.replace(0, np.nan, inplace=True)
        data_pollution_Iterative.replace(0, np.nan, inplace=True)

        # Re-attach the original index and column labels to every result.

        data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
        data_pollution_KNN.columns = data_pollution.columns
        data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
        data_pollution_ewm.columns = data_pollution.columns
        data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
        data_pollution_IDW.columns = data_pollution.columns
        data_pollution_Iterative = data_pollution_Iterative.set_index(
            data_pollution.index)
        data_pollution_Iterative.columns = data_pollution.columns

        # Write all four methods into one file, one sheet per method.
        sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
        sheet_name_count = 0
        writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                (input_file_name.replace(".xlsx", "")))
        for methods_output in [
                data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
                data_pollution_Iterative
        ]:
            methods_output.to_excel(writer,
                                    sheet_name=sheet_name[sheet_name_count])
            sheet_name_count = 1 + sheet_name_count
        writer.save()
示例#11
0
# Store categorical and non-categorical variables in separate dataframes
categorical = bg_[cat_cols]
continuous = bg_[cont_cols]

# Save the index and cols of the new matrix so they can be restored after
# imputation (the imputer returns a bare ndarray with no labels).
index = bg_.index
cols = bg_.columns

k = 5
print("Now running KNN imputation using ", k, " nearest neighbours...")
# fit_transform is the current fancyimpute API (and the one used elsewhere
# in this file); the old .complete() method was removed.
bg_imputed = KNN(k=k).fit_transform(bg_)

print("Converting back to dataframe")
bg_imputed = pd.DataFrame(bg_imputed)
bg_imputed.index = index
bg_imputed.columns = cols

# Just get continuous cols since categorical columns badly imputed
bg_imputed_cont = bg_imputed[cont_cols]
bg_imputed_cat = bg_imputed[cat_cols]


def convert_continuous_to_categorical(categories, orig_col, imputed_col):
    """Takes a set of categories, the original column, and a column
    that has been imputed using the mean value of the KNN.

    Returns a new column where the imputed values are transformed to the
    nearest numeric category.

    Note: This is quite a rough way to recover the imputed categories,
    consider testing and potentially improving it."""
示例#12
0
    data_input_IDW = get_IDW(data_input)

    # 空间全局: 迭代函数法,缺失特征作为y,其他特征作为x
    data_input_Iterative = IterativeImputer(
        max_iter=10).fit_transform(data_input)
    data_input_Iterative = pd.DataFrame(data_input_Iterative)

    # 对结果的0值取np.nan
    data_input_KNN.replace(0, np.nan, inplace=True)
    data_input_ewm.replace(0, np.nan, inplace=True)
    data_input_IDW.replace(0, np.nan, inplace=True)
    data_input_Iterative.replace(0, np.nan, inplace=True)

    # 合并相同方法的结果
    data_input_KNN = data_input_KNN.set_index(data_input.index)
    data_input_KNN.columns = data_input.columns
    data_input_KNN["日期合并用"] = data_input_KNN.index
    data_input_ewm = data_input_ewm.set_index(data_input.index)
    data_input_ewm.columns = data_input.columns
    data_input_ewm["日期合并用"] = data_input_ewm.index
    data_input_IDW = data_input_IDW.set_index(data_input.index)
    data_input_IDW.columns = data_input.columns
    data_input_IDW["日期合并用"] = data_input_IDW.index
    data_input_Iterative = data_input_Iterative.set_index(data_input.index)
    data_input_Iterative.columns = data_input.columns
    data_input_Iterative["日期合并用"] = data_input_Iterative.index

    # 合并不同方法下的A/T为一个文件
    sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
    sheet_name_count = 0  # 为什么显示without usage ?  因为下面如果if为false则..
    writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
示例#13
0
def get4method(xx152):
    """Variant of the four-method imputer: for each station on sheet *xx152*
    it computes EWM smoothing (temporal), per-column IterativeImputer over
    neighbouring stations (spatial-global), and a combined per-column
    IDW + KNN pass (spatial-local), then writes one xlsx per station with
    one sheet per method.

    NOTE(review): relies on module-level names not visible in this block
    (JCZ_info, input_file_path_pollution, merge_output_file_path, saved_list,
    KNN, IterativeImputer, copy, pd, np and radians/sin/cos/asin/sqrt) —
    confirm against the file header.
    """
    # Great-circle distance helper.

    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        # Haversine formula; arguments in degrees, converted to radians here.
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2)**2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2)**2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) scaled to metres
        return dis  # result is expressed in metres

    # Monitoring-station list for this sheet.
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    # Station name is "<city>-<site name>".
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        if input_file_name in saved_list:  # skip stations already processed
            print("已经完成:", input_file_name, xx152)
            continue
        #  print("========正在计算%s========" % input_file_name)
        # Load this station's table, indexed by date.
        data_pollution = pd.read_excel(input_file_path_pollution +
                                       input_file_name)
        data_pollution = data_pollution.set_index('日期')

        # Temporal: exponential smoothing (common in financial series);
        # builds a new frame so the raw data is not overwritten.
        print('======%s:开始进行时间特性捕捉======' %
              input_file_name.replace('.xlsx', ''))
        data_pollution_ewm_mid = pd.DataFrame.ewm(self=data_pollution,
                                                  com=0.5,
                                                  ignore_na=True,
                                                  adjust=True).mean()
        data_pollution_ewm = copy.deepcopy(data_pollution)  # avoid clobbering the raw data
        for columname in data_pollution_ewm.columns:
            if data_pollution[columname].count() != len(data_pollution):
                # Index positions where this column is missing.
                loc = data_pollution[columname][
                    data_pollution[columname].isnull().values ==
                    True].index.tolist()
                for nub in loc:
                    data_pollution_ewm.loc[
                        nub, columname] = data_pollution_ewm_mid.loc[nub,
                                                                     columname]

        print('[ewm]Finished')

        # Resolve this station's coordinates.
        data_pollution_IDW = copy.deepcopy(data_pollution)
        name = str(input_file_name).replace(".xlsx", "")  # station name
        lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
        lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]

        # Spatial-global: iterative regression — the missing feature is y,
        # the other features are x.
        merge_list = []  # same station, one entry per feature column
        for darksky_weather_Iterative in data_pollution.columns:
            # Merge phase: gather the same column from other stations.
            numb = 0
            data_darksky_weather_to_Iterative = copy.deepcopy(
                data_pollution[[darksky_weather_Iterative]])
            data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index(
            )
            # Column sums to zero (treated as empty) or has nothing missing:
            # keep it as-is, no imputation needed.
            if data_darksky_weather_to_Iterative[darksky_weather_Iterative].sum() == 0 \
                    or data_darksky_weather_to_Iterative[darksky_weather_Iterative].isnull().sum() == 0:
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index(
                    '日期')
                merge_list.append(data_darksky_weather_to_Iterative)
            else:
                # The feature is not entirely empty, so merge neighbours.
                for item in JCZ_info["监测站"]:  # differs from the aerosol interpolation method
                    if item != name:
                        # File to merge in.
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_pollution + item + ".xlsx")
                        # Skip the candidate column if it is entirely empty.
                        if data_to_add_in_to_Iterative[darksky_weather_Iterative].sum() == 0 \
                                or data_to_add_in_to_Iterative[darksky_weather_Iterative].isnull().sum() == \
                                len(data_to_add_in_to_Iterative.index):
                            continue
                        else:
                            data_to_Iterative_concat = data_to_add_in_to_Iterative[
                                [darksky_weather_Iterative, '日期']]
                            data_to_Iterative_concat.columns = [
                                darksky_weather_Iterative + "_add%s" % numb,
                                '日期'
                            ]  # e.g. five neighbours give NDVI1..NDVI5
                            data_darksky_weather_to_Iterative = pd.merge(
                                data_darksky_weather_to_Iterative,
                                data_to_Iterative_concat,
                                how='left',
                                on='日期')  # without padding, populated columns get dropped and widths differ
                            numb += 1  # count only actually-added columns
                            # print(len(data_darksky_weather_to_Iterative.columns))
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index(
                    '日期')
                # Imputation phase.
                if numb >= 1:  # need at least two non-empty columns to compute
                    data_darksky_weather_Iterative_to_merge = IterativeImputer(
                        max_iter=10).fit_transform(
                            data_darksky_weather_to_Iterative)
                    # pd.DataFrame(data_darksky_weather_Iterative_to_merge).to_excel('tets1.xlsx')
                    # NOTE(review): leftover debug dump — this rewrites
                    # 'test2.xlsx' on every column iteration; confirm it can
                    # be removed.
                    data_darksky_weather_to_Iterative.to_excel('test2.xlsx')
                    data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                        data_darksky_weather_Iterative_to_merge,
                        columns=data_darksky_weather_to_Iterative.columns
                    )  # convert ndarray back to a labelled frame
                    data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                        data_darksky_weather_to_Iterative.index)  # ok
                    # print(len(data_darksky_weather_Iterative_to_merge.columns))

                else:
                    data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                        data_darksky_weather_to_Iterative)
                for numb_del in data_darksky_weather_Iterative_to_merge.columns:
                    if 'add' in numb_del:
                        del data_darksky_weather_Iterative_to_merge[
                            numb_del]  # after this only the feature column remains
                # The imputed single feature column for this station; each
                # loop iteration appends the next feature.
                merge_list.append(data_darksky_weather_Iterative_to_merge)
        data_darksky_weather_Iterative_1 = pd.concat(merge_list,
                                                     axis=1,
                                                     sort=False)
        print('[Iterative]Finished')

        # Local + spatial pass.
        # KNN uses the K rows with complete features, weighted by the MSE of
        # the other features, to find the closest time points.
        print('======%s:开始进行空间特性和局部相关性捕捉======' %
              input_file_name.replace('.xlsx', ''))
        merge_list2 = []  # same station, one entry per feature column
        for pol in data_pollution_IDW.columns:
            data_knn_raw = copy.deepcopy(data_pollution_IDW[[pol]])
            data_knn_raw = data_knn_raw.reset_index()
            numb1 = 0
            weight_list = []
            null_idx = data_pollution_IDW[pol][data_pollution_IDW[pol].isnull(
            ).values == True].index.tolist()
            list_idw_out2 = []
            for item_idw in JCZ_info["监测站"]:  # gather distances / define weights
                if item_idw != name:
                    lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                    lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                    dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # distance between the two stations
                    if dis_1 <= 200000:  # alternatively: > 0
                        data_knnadd = pd.read_excel(input_file_path_pollution +
                                                    item_idw + '.xlsx')
                        data_knnadd = data_knnadd[[pol, '日期']]
                        data_knnadd.columns = [pol + "add_%s" % numb1, '日期']
                        if data_knnadd[pol + "add_%s" % numb1].sum() == 0:
                            continue
                        else:
                            weight_list.append((1 / dis_1))
                            data_knn_raw = pd.merge(data_knn_raw,
                                                    data_knnadd,
                                                    how='left',
                                                    on='日期')
                            data_knnadd = data_knnadd.set_index('日期')  # needed for the lookup below

                            list_idw_out1 = [
                                (1 / dis_1) *
                                data_knnadd[pol + "add_%s" % numb1][j]
                                for j in null_idx
                            ]
                            list_idw_out2.append(
                                list_idw_out1)  # append distance-weighted observations
                            numb1 += 1  # increment only when a column was added
                # numb1 += 1  # for the non-NDVI case
            # IDW phase.
            if numb1 >= 1:  # guard against no qualifying (added) columns
                list_idw_out3 = np.array(list_idw_out2)
                arrar01 = np.array([j / j for j in list_idw_out3])  # nan/1 mask matrix
                list_nan = np.isnan(arrar01)
                arrar01[list_nan] = 0  # 0/1 mask matrix
                arrayw = arrar01.T * weight_list  # masked weight matrix
                arrayw = arrayw.sum(1)
                list_idw_out3[np.isnan(
                    list_idw_out3)] = 0  # weight*value matrix with nan replaced by 0
                idw_output1 = list_idw_out3.T.sum(1)
                idw_output2 = idw_output1 / arrayw  # the IDW result
                idw_output2 = pd.DataFrame(idw_output2,
                                           index=null_idx,
                                           columns=[pol])
                # NOTE(review): chained assignment — pandas may write to a
                # copy and warn; .loc would be explicit.
                data_pollution_IDW[pol][
                    data_pollution_IDW[pol].isnull()] = idw_output2[pol]  # insert
            print('[IDW]Finished')

            # KNN phase.
            data_knn_raw = data_knn_raw.set_index('日期')
            if pol + 'add_0' in data_knn_raw.columns:
                print('============================================')
                data_pollution_KNN = KNN(k=30).fit_transform(data_knn_raw)
                data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
                data_pollution_KNN.columns = data_knn_raw.columns
            else:
                data_pollution_KNN = copy.deepcopy(data_knn_raw)
            # Drop the helper "add" neighbour columns again.
            for numb_del2 in data_pollution_KNN.columns:
                if 'add' in numb_del2:
                    del data_pollution_KNN[numb_del2]
            merge_list2.append(data_pollution_KNN)
        data_darksky_weather_KNN_1 = pd.concat(merge_list2, axis=1, sort=True)

        # Turn zeros in the results back into np.nan.
        data_darksky_weather_KNN_1.replace(0, np.nan, inplace=True)
        data_pollution_ewm.replace(0, np.nan, inplace=True)
        data_pollution_IDW.replace(0, np.nan, inplace=True)
        data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)

        # Re-attach the original index and column labels to every result.
        data_pollution_KNN = data_darksky_weather_KNN_1.set_index(
            data_pollution.index)
        data_pollution_KNN.columns = data_pollution.columns
        data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
        data_pollution_ewm.columns = data_pollution.columns
        data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
        data_pollution_IDW.columns = data_pollution.columns
        data_pollution_Iterative = data_darksky_weather_Iterative_1.set_index(
            data_pollution.index)
        data_pollution_Iterative.columns = data_pollution.columns

        # Write all four methods into one file, one sheet per method.
        sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
        sheet_name_count = 0
        writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                (input_file_name.replace(".xlsx", "")))
        for methods_output in [
                data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
                data_pollution_Iterative
        ]:
            methods_output.to_excel(writer,
                                    sheet_name=sheet_name[sheet_name_count])
            sheet_name_count = 1 + sheet_name_count
        writer.save()
def get4method(xx152):
    # 地理距离
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2) ** 2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2) ** 2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # 地球半径
        return dis  # 输出结果的单位为“米”

    # 空间局部: 难以插值是因为大部分地区及其临近地区同一污染物值可能会一同缺失.
    def get_IDW(input_data):
        for pollution in ["PM25"]:  # 确定污染物列
            for indx in input_data.index:  # 获取索引
                res_list = []
                weight_list = []
                if pd.isnull(input_data[pollution][indx]):  # 开始循环
                    for item_idw in JCZ_info["监测站"]:  # 获取距离,定义权重
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # 两站地理距离
                            if dis_1 <= 50000:
                                data_to_add_in_1 = pd.read_excel(input_file_path_pollution + item_idw + ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index("日期")  # 需要日期为索引,方便下面添加
                                if indx in data_to_add_in_1.index and pd.notnull(data_to_add_in_1[pollution][indx]):
                                    weight_list.append(dis_1)
                    weight_sum = np.sum(np.array(weight_list))  # 总距离,权重分母
                    for item_idw_2 in JCZ_info["监测站"]:  # 分配权重
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw_2]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # 两站地理距离
                            if dis_1 <= 50000:
                                data_to_add_in = pd.read_excel(input_file_path_pollution + item_idw_2 + ".xlsx")
                                data_to_add_in = data_to_add_in.set_index("日期")  # 需要日期为索引,方便下面添加
                                if indx in data_to_add_in.index and pd.notnull(data_to_add_in[pollution][indx]):
                                    res = (dis_1 / weight_sum) * data_to_add_in[pollution][indx]
                                    res_list.append(res)
                                    # print("已添加单元格插值:", res)
                    res_output = np.sum(np.array(res_list))  # 上下公式结果若为nan,并不会报错.会让最后的插值为nan.
                    try:
                        input_data[pollution][indx] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # 监测站
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx", sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    error_list = []
    import random
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        # if input_file_name in saved_list:
            # print("已经完成:", input_file_name, xx152)
            # continue
        print("========正在计算%s========" % input_file_name)
        try:
            # 读取数据源
            data_pollution = pd.read_excel(input_file_path_pollution + input_file_name)
            data_pollution = data_pollution.set_index('日期')

            # 处理AQUA,制造 缺失值
            saveA = list()
            for columname in data_pollution.columns:
                if columname != "日期":
                    if columname != "监测站":
                        # loc 是某列为空的行坐标
                        loc = data_pollution[columname][
                            data_pollution[columname].isnull().values == False].index.tolist()
                        # 筛选个数
                        c1 = int(len(loc) * 0.25)
                        # 筛选出样本
                        slice1 = random.sample(loc, c1)
                        # print(data_darksky_weather[columname][0])
                        # print(slice1)
                        # 保存 变空之前 的 变量位置和数值
                        exec('save_a_%s = list()' % columname)
                        for nub in slice1:
                            # print(data_darksky_weather[columname][nub])
                            # print((columname, nub, data_darksky_weather[columname][nub]))
                            exec('save_a_%s.append((columname, nub, data_pollution[columname][nub]))' % columname)
                            # exec("JCZ.append(JCZ%s)" % i)
                            # 下一行,修改成缺失值
                            data_pollution[columname][nub] = np.nan
                            # print(data_darksky_weather[columname][nub])
                        exec('saveA.append(save_a_%s)' % columname)

            # 保存编号
            sA = pd.DataFrame(saveA)
            sA.to_excel(null_output_path + "%s" % input_file_name)
            # 局部:局部局部局部局部局部局部局部局部局部局部局部局部局部局部局部局部最近邻KNN,使用其他监测点同一个特征的均方差进行加权,判断最接近的时间点.
            # 局部!合并部分!局部局部局部局部局部局部局部局部局部局部
            name2 = str(input_file_name).replace(".xlsx", "")  # 定义相关变量
            lng1 = JCZ_info[JCZ_info["监测站"] == name2]["经度"]
            lat1 = JCZ_info[JCZ_info["监测站"] == name2]["纬度"]
            merge_list_KNN = []  # 同一监测站,不同污染物
            for darksky_weather_KNN in ['PM25']:
                # 合并部分
                numb2 = 0
                data_darksky_weather_to_KNN = copy.deepcopy(data_pollution[[darksky_weather_KNN]])
                data_darksky_weather_to_KNN = data_darksky_weather_to_KNN.reset_index()
                for item in JCZ_info["监测站"]:  # 不同于气溶胶插值方法
                    if item != name2:
                        lng2 = JCZ_info[JCZ_info["监测站"] == item]["经度"]
                        lat2 = JCZ_info[JCZ_info["监测站"] == item]["纬度"]
                        dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # 两站地理距离
                        if dis_1 > 0: # <=
                            # 添加的文件
                            data_to_add_in_to_KNN = pd.read_excel(
                                input_file_path_pollution + item + ".xlsx")
                            # 添加的列名
                            data_to_KNN_concat = data_to_add_in_to_KNN[[darksky_weather_KNN, '日期']]
                            data_to_KNN_concat.columns = [darksky_weather_KNN + "_add%s" % numb2,
                                                                '日期']  # 如果有五个临近, 则NDVI1-NDVI5

                            data_darksky_weather_to_KNN = pd.merge(data_darksky_weather_to_KNN,
                                                                         data_to_KNN_concat,
                                                                         how='left',
                                                                         on='日期')
                            data_darksky_weather_to_KNN = data_darksky_weather_to_KNN.set_index('日期')
                    numb2 += 1
                # 迭代部分
                count_2 = 0
                for value_1 in data_darksky_weather_to_KNN.sum():
                    if value_1 != 0:
                        count_2 += 1
                if count_2 > 1:  # 至少两个非空列才可以计算
                    data_darksky_weather_KNN_to_merge = KNN(k=7).fit_transform(data_darksky_weather_to_KNN)
                    # data_darksky_weather_KNN_to_merge = IterativeImputer(max_iter=100).fit_transform(data_darksky_weather_to_KNN)
                else:
                    data_darksky_weather_KNN_to_merge = copy.deepcopy(
                        data_darksky_weather_to_KNN)
                data_darksky_weather_KNN_to_merge = pd.DataFrame(
                    data_darksky_weather_KNN_to_merge)  # 格式转换
                data_darksky_weather_KNN_to_merge = data_darksky_weather_KNN_to_merge.set_index(
                    data_darksky_weather_to_KNN.index)  # ok
                if len(data_darksky_weather_KNN_to_merge.columns) < len(
                        data_darksky_weather_to_KNN.columns):
                    reset_col_name_list_KNN = []  # 对非nan列先命名
                    for col_name in data_darksky_weather_to_KNN.columns:
                        if np.max(data_darksky_weather_to_KNN[col_name]) > 0:
                            reset_col_name_list_KNN.append(col_name)
                    data_darksky_weather_KNN_to_merge.columns = reset_col_name_list_KNN

                    for col_name in data_darksky_weather_to_KNN.columns:  # 对缺失的nan列补充
                        if col_name not in data_darksky_weather_KNN_to_merge.columns:
                            # 补全缺失nan列
                            data_darksky_weather_KNN_to_merge[col_name] = np.nan
                else:
                    data_darksky_weather_KNN_to_merge.columns = data_darksky_weather_to_KNN.columns  # 重设列名
                for numb_del in data_darksky_weather_KNN_to_merge.columns:
                    if 'add' in numb_del:
                        del data_darksky_weather_KNN_to_merge[numb_del]

                # 插补后的该监测点的气象特征列, 仅一列, 循环添加其他特征
                merge_list_KNN.append(data_darksky_weather_KNN_to_merge)
            data_darksky_weather_KNN_1 = pd.concat(
                merge_list_KNN, axis=1, sort=False)
            # 对结果的0值取np.nan
            # data_pollution_KNN = KNN(k=7).fit_transform(data_pollution)
            # data_pollution_KNN = pd.DataFrame(data_pollution_KNN)

            # 时间全局: 平滑,常用于股市;创建新的数据框,不会覆盖原始数据
            data_pollution_ewm_mid = pd.DataFrame.ewm(
                self=data_pollution,
                com=0.8,
                ignore_na=True,
                adjust=True).mean()
            # data_pollution_ewm_mid = data_pollution.interpolate()  # 23%[时间视图33→19]
            # 替换空白处
            data_pollution_ewm = copy.deepcopy(data_pollution)  # 避免覆盖原始数据
            for columname in data_pollution_ewm.columns:
                if data_pollution[columname].count() != len(data_pollution):
                    loc = data_pollution[columname][data_pollution[columname].isnull().values == True].index.tolist()
                    for nub in loc:
                        data_pollution_ewm[columname][nub] = data_pollution_ewm_mid[columname][nub]
            #########################################################################################################################################
            #########################################################################################################################################
            #########################################################################################################################################

            # 空间
            data_pollution_to_IDW = copy.deepcopy(data_pollution)
            name = str(input_file_name).replace(".xlsx", "")  # 定义相关变量
            lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
            lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
            # 空间局部: IDW,反距离插值
            data_pollution_IDW = get_IDW(data_pollution_to_IDW)
            # 空间全局: 迭代回归,缺失特征作为y,其他特征作为x
            merge_list = []  # 同一监测站,不同污染物
            for darksky_weather_Iterative in ['PM25']:
                # 合并部分
                numb = 0
                data_darksky_weather_to_Iterative = copy.deepcopy(data_pollution[[darksky_weather_Iterative]])
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index()
                for item in JCZ_info["监测站"]:  # 不同于气溶胶插值方法
                    if item != name:
                        # 添加的文件
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_pollution + item + ".xlsx")
                        # 添加的列名
                        data_to_Iterative_concat = data_to_add_in_to_Iterative[[darksky_weather_Iterative, '日期']]
                        data_to_Iterative_concat.columns = [darksky_weather_Iterative + "_add%s" % numb,
                                                            '日期']  # 如果有五个临近, 则NDVI1-NDVI5

                        data_darksky_weather_to_Iterative = pd.merge(data_darksky_weather_to_Iterative,
                                                                     data_to_Iterative_concat,
                                                                     how='left',
                                                                     on='日期')
                        data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index('日期')
                    numb += 1
                # 迭代部分
                count_1 = 0
                for value_1 in data_darksky_weather_to_Iterative.sum():
                    if value_1 != 0:
                        count_1 += 1
                if count_1 > 1:  # 至少两个非空列才可以计算
                    data_darksky_weather_Iterative_to_merge = IterativeImputer(
                        max_iter=100).fit_transform(data_darksky_weather_to_Iterative)
                else:
                    data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                        data_darksky_weather_to_Iterative)
                data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                    data_darksky_weather_Iterative_to_merge)  # 格式转换
                data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                    data_darksky_weather_to_Iterative.index)  # ok
                if len(data_darksky_weather_Iterative_to_merge.columns) < len(
                        data_darksky_weather_to_Iterative.columns):
                    reset_col_name_list = []  # 对非nan列先命名
                    for col_name in data_darksky_weather_to_Iterative.columns:
                        if np.max(data_darksky_weather_to_Iterative[col_name]) > 0:
                            reset_col_name_list.append(col_name)
                    data_darksky_weather_Iterative_to_merge.columns = reset_col_name_list

                    for col_name in data_darksky_weather_to_Iterative.columns:  # 对缺失的nan列补充
                        if col_name not in data_darksky_weather_Iterative_to_merge.columns:
                            # 补全缺失nan列
                            data_darksky_weather_Iterative_to_merge[col_name] = np.nan
                else:
                    data_darksky_weather_Iterative_to_merge.columns = data_darksky_weather_to_Iterative.columns  # 重设列名
                for numb_del in data_darksky_weather_Iterative_to_merge.columns:
                    if 'add' in numb_del:
                        del data_darksky_weather_Iterative_to_merge[numb_del]

                # 插补后的该监测点的气象特征列, 仅一列, 循环添加其他特征
                merge_list.append(data_darksky_weather_Iterative_to_merge)
            data_darksky_weather_Iterative_1 = pd.concat(
                merge_list, axis=1, sort=False)
            # 对结果的0值取np.nan
            # data_pollution_KNN.replace(0, np.nan, inplace=True)
            data_darksky_weather_KNN_1.replace(0, np.nan, inplace=True)  # 新
            data_pollution_ewm.replace(0, np.nan, inplace=True)
            data_pollution_IDW.replace(0, np.nan, inplace=True)
            data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)

            # 合并相同方法的结果
            # data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
            # data_pollution_KNN.columns = data_pollution.columns
            data_pollution_KNN = data_darksky_weather_KNN_1.set_index(data_pollution.index)  # 新
            data_pollution_KNN.columns = data_pollution.columns  # 新

            data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
            data_pollution_ewm.columns = data_pollution.columns
            data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
            data_pollution_IDW.columns = data_pollution.columns
            data_pollution_Iterative = data_darksky_weather_Iterative_1.set_index(data_pollution.index)
            data_pollution_Iterative.columns = data_pollution.columns

            # 合并不同方法为一个文件
            sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
            sheet_name_count = 0
            writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' % (input_file_name.replace(".xlsx", "")))
            for methods_output in [data_pollution_KNN, data_pollution_ewm, data_pollution_IDW, data_pollution_Iterative]:
                methods_output.to_excel(writer, sheet_name=sheet_name[sheet_name_count])
                sheet_name_count = 1 + sheet_name_count
            writer.save()
        except Exception as e:
            print(input_file_name, "发生错误:", e)
            error_list.append(input_file_name)

        if len(error_list) != 0:
            error_list = pd.DataFrame(error_list)
            error_list.to_excel(xx152+".xlsx")
"""

# KNN imputation of the feature matrix.
# NOTE(review): `cols` is assumed to be the feature DataFrame assembled
# earlier in this file — TODO confirm against the preceding section.
# fancyimpute returns a bare ndarray (column names are lost), so names are
# saved and restored around each imputation.
from fancyimpute import KNN  # moved above the first use of KNN

# Backfill remaining gaps before imputation.
# (`fillna(method='bfill')` is deprecated since pandas 2.1; `.bfill()` is
# the supported spelling.)
cols = cols.bfill()

cols.isnull().sum()  # NOTE(review): no-op outside a notebook; kept for parity

# fancyimpute's old `.complete()` method was removed; `fit_transform` is
# the supported call on current releases.
temp = KNN(k=5).fit_transform(cols)

# Drop the label column from the features. The original `drop(["status"])`
# used the default axis=0 and would drop a ROW labelled "status"; the
# intent is clearly the column.
cols = cols.drop(columns=["status"])

# Impute with 3 nearest neighbours. The original `KNN(cols, 3)` passed the
# DataFrame as the `k` argument — the constructor takes only the neighbour
# count; the imputation itself happens in `fit_transform`.
train_columns = list(cols)
train = pd.DataFrame(KNN(k=3).fit_transform(cols), columns=train_columns)
# train.material = train.material.astype("object")
train = pd.get_dummies(train)

train  # NOTE(review): bare expression; displays only in a notebook

# We use the train dataframe from Titanic dataset.
# fancyimpute removes column names, so restore them afterwards.
# Second pass with k=5 overwrites the k=3 result, as in the original.
# Use 5 nearest rows which have a feature to fill in each row's
# missing features.
train_cols = list(cols)
train = pd.DataFrame(KNN(k=5).fit_transform(cols), columns=train_cols)