예제 #1
0
def dataset_exps(data, y, cv, add_binary):
    """Score every imputation strategy on a single dataset.

    Each imputer from ``imputers`` is applied to ``data`` and the imputed
    table is handed to ``multi_algs_cv``. The scores are gathered into a
    table with one row per entry of ``_methods`` and one column per entry
    of ``_algs``.

    Args:
        data: Table to impute; must expose ``.shape``.
        y: Targets passed to ``multi_algs_cv`` alongside each imputed table.
        cv: Cross-validation setting forwarded to ``multi_algs_cv`` for every
            strategy except the row-dropping one, which always uses 10.
        add_binary: Forwarded unchanged to every imputer that accepts it.

    Returns:
        pandas.DataFrame indexed by ``_methods`` with columns ``_algs``. The
        ``'ignore'`` row is removed when fewer than 10 rows are left after
        ``ignore_imputer``.
    """
    # Run all imputations up front, in a fixed order, before any evaluation.
    kept_rows, kept_y = imputers.ignore_imputer(data, y)
    minus_one_filled = imputers.special_value_imputer(data, -1, add_binary=add_binary)
    zero_filled = imputers.special_value_imputer(data, 0, add_binary=add_binary)

    # Imputers whose scores fill rows 2..10 of the table, in row order.
    remaining_imputers = (
        lambda: imputers.common_value_imputer(data, add_binary=add_binary),
        lambda: imputers.mean_value_imputer(data, add_binary=add_binary),
        lambda: imputers.svd_imputer(data, rank=data.shape[1] // 2, add_binary=add_binary),
        lambda: imputers.knn_imputer(data, n_neighbors=5, add_binary=add_binary),
        lambda: imputers.rf_imputer(data, add_binary=add_binary),
        lambda: imputers.linear_imputer(data, add_binary=add_binary),
        lambda: imputers.em_imputer(data, add_binary=add_binary),
        lambda: imputers.kmean_imputer(data, add_binary=add_binary),
        lambda: imputers.zet_imputer(data, competent_row_num=6, competent_col_num=4,
                                     add_binary=add_binary),
    )
    imputed_tables = [build() for build in remaining_imputers]

    scores = np.zeros((len(_methods), len(_algs)))

    # The row-dropping baseline is only evaluated (with 10 folds) when at
    # least 10 rows survive; otherwise its row is removed at the end.
    enough_rows = kept_rows.shape[0] >= 10
    if enough_rows:
        scores[0] = multi_algs_cv(kept_rows, kept_y, 10)

    # Row 1 mixes two fills: the first algorithm's score comes from the -1
    # fill, every other algorithm's score comes from the 0 fill.
    scores[1, 0] = multi_algs_cv(minus_one_filled, y, cv)[0]
    scores[1, 1:] = multi_algs_cv(zero_filled, y, cv)[1:]

    for row, table in enumerate(imputed_tables, start=2):
        scores[row] = multi_algs_cv(table, y, cv)

    summary = pd.DataFrame(scores, columns=_algs, index=_methods)

    if not enough_rows:
        summary.drop('ignore', inplace=True)

    return summary
예제 #2
0
def make_experiments(data_real, target, clf, cv, missing_frac_range, num_iter, sp_value, add_binary, del_columns=None):
    """Compare imputation methods across a range of missing-value fractions.

    For every fraction in ``missing_frac_range`` the complete data is
    corrupted ``num_iter`` times with ``random_deletion.make_missing_value``.
    Each corrupted copy is imputed by every method, and two quantities are
    averaged over the iterations: the mean cross-validated accuracy of
    ``clf`` on the imputed data, and the reconstruction error against
    ``data_real``.

    Args:
        data_real: Complete data used both as the source for deletion and as
            the reference for the reconstruction error.
        target: Labels passed to ``cross_val_score``.
        clf: Estimator passed to ``cross_val_score``.
        cv: Cross-validation setting used for every method except
            ``'ignore'``, which always uses 10.
        missing_frac_range: Sized iterable of deletion fractions; its values
            become the column labels of both result tables.
        num_iter: Number of random deletions averaged per fraction.
        sp_value: Fill value for the ``'special'`` method.
        add_binary: Forwarded unchanged to every imputer that accepts it.
        del_columns: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(accuracy, rmse)`` of pandas.DataFrame objects indexed by
        ``_methods`` with one column per fraction. ``rmse`` holds the square
        root of the *summed* squared differences (no division by the number
        of elements). The ``'ignore'`` row of ``rmse`` is never written and
        stays at zero.
    """
    table_shape = (len(_methods), len(missing_frac_range))
    accuracy = pd.DataFrame(np.zeros(table_shape), index=_methods, columns=missing_frac_range)
    rmse = pd.DataFrame(np.zeros(table_shape), index=_methods, columns=missing_frac_range)

    # (row label, imputer) pairs, listed in the order they are evaluated.
    # Each callable takes the corrupted data and returns the imputed data.
    imputation_methods = (
        ('special', lambda d: imputers.special_value_imputer(d, value=sp_value, add_binary=add_binary)),
        ('common', lambda d: imputers.common_value_imputer(d, add_binary=add_binary)),
        ('mean', lambda d: imputers.mean_value_imputer(d, add_binary=add_binary)),
        ('svd', lambda d: imputers.svd_imputer(d, rank=d.shape[1] // 2, add_binary=add_binary)),
        ('knn', lambda d: imputers.knn_imputer(d, n_neighbors=5, add_binary=add_binary)),
        ('rf', lambda d: imputers.rf_imputer(d, add_binary=add_binary)),
        ('lr', lambda d: imputers.linear_imputer(d, add_binary=add_binary)),
        ('em', lambda d: imputers.em_imputer(d, add_binary=add_binary)),
        ('k-means', lambda d: imputers.kmean_imputer(d, add_binary=add_binary)),
        ('zet', lambda d: imputers.zet_imputer(d, competent_row_num=6, competent_col_num=4,
                                               add_binary=add_binary)),
    )

    for missing_frac in missing_frac_range:
        print('start fraction:', missing_frac)

        for iteration in range(num_iter):
            data_missing = random_deletion.make_missing_value(data_real, del_fraction=missing_frac,
                                                              del_fraction_column=0.5)

            # 'ignore': drop incomplete rows; only score it when at least a
            # tenth of the rows survive, otherwise record 0 for this run.
            data_imp, y = imputers.ignore_imputer(data_missing, target)
            if data_imp.shape[0] >= data_missing.shape[0] / 10:
                cur_accuracy = np.mean(cross_val_score(clf, data_imp, y, scoring='accuracy', cv=10))
            else:
                cur_accuracy = 0

            # A zero from any iteration zeroes the whole cell: once the cell
            # is 0 after the first iteration it is kept at 0.
            # .loc replaces the removed DataFrame.ix indexer; both axes are
            # addressed by label here.
            if iteration == 0:
                accuracy.loc['ignore', missing_frac] = cur_accuracy / num_iter
            elif cur_accuracy == 0 or accuracy.loc['ignore', missing_frac] == 0:
                accuracy.loc['ignore', missing_frac] = 0
            else:
                accuracy.loc['ignore', missing_frac] += cur_accuracy / num_iter

            # Every other method: impute, score, and add this iteration's
            # share to the running averages.
            for label, impute in imputation_methods:
                data_imp = impute(data_missing)
                cur_accuracy = np.mean(cross_val_score(clf, data_imp, target, scoring='accuracy', cv=cv))
                cur_rmse = np.sum(np.array((data_real - data_imp) ** 2)) ** 0.5
                accuracy.loc[label, missing_frac] += cur_accuracy / num_iter
                rmse.loc[label, missing_frac] += cur_rmse / num_iter

    return accuracy, rmse