Example #1
def complete_multi_task(idx):
    """Worker: simulate missingness at random_ratios[idx] and score complete_by_multi."""
    global data_complete
    # Inject random missing values into the complete dataset at the requested ratio.
    data_sim = gen_complete_random(data_complete,
                                   random_ratio=random_ratios[idx],
                                   print_all=False)
    # Evaluate the multivariate imputation method on the simulated data.
    result = test_imputation(data_sim,
                             complete_by_multi,
                             multi=True,
                             verboseID="multi_v1")
    return result
Example #2
from functools import partial

def complete_multi_v2_task(idx):
    """Worker: simulate missingness at random_ratios[idx] and score complete_by_multi_v2."""
    global data_complete, PARAMS_DATA
    # Inject random missing values into the complete dataset at the requested ratio.
    data_sim = gen_complete_random(data_complete,
                                   random_ratio=random_ratios[idx],
                                   print_all=False)
    # Bind the target feature so test_imputation can call the imputer with the dataset alone.
    result = test_imputation(data_sim,
                             partial(complete_by_multi_v2,
                                     target_feature=PARAMS_DATA["target"]),
                             multi=True,
                             verboseID="multi_v2")
    return result
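
Both workers take only an index and read the shared data from module-level globals, which suggests they are meant to be mapped over a process pool. A minimal driver sketch, assuming the surrounding module defines data_complete and random_ratios (the pool size here is only illustrative):

from multiprocessing import Pool

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        # One task per missingness ratio; each worker returns one result record.
        results_v1 = pool.map(complete_multi_task, range(len(random_ratios)))
        results_v2 = pool.map(complete_multi_v2_task, range(len(random_ratios)))

On platforms that spawn rather than fork worker processes, the globals set in the main script would not be inherited and would have to be re-initialised in each worker; the sketch assumes a fork-based pool.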
Example #3
# Library imports used by this snippet; Dataset, gen_complete_random and newBias
# come from the surrounding project.
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

def cross_val(data_original: Dataset, data_config, clf_config, complete_function=None, selected_cols=None):
    """Repeated stratified cross-validation of a LogisticRegression classifier,
    optionally corrupting the data and repairing it with complete_function.
    Returns the mean bias and mean accuracy over all repetitions."""
    if selected_cols is None:
        selected_cols = []
    bias = []
    acc = []
    smote = SMOTE()
    scaler = StandardScaler()
    for i in range(10):
        # When an imputation method is being tested, corrupt 40% of the selected columns first.
        if complete_function:
            data = gen_complete_random(data_original, random_ratio=0.4, selected_cols=selected_cols)
        else:
            data = data_original
        print(f"Running Cross Validation {i}")
        bias_cv = []
        acc_cv = []
        for train_idx, test_idx in StratifiedShuffleSplit(n_splits=20).split(data.X, data.y):
            X_train, X_test = data.X.iloc[train_idx].copy(), data.X.iloc[test_idx].copy()
            Y_train, Y_test = data.y[train_idx], data.y[test_idx]
            X_train.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

            # Impute the training split; skip the fold if imputation fails or leaves NaNs behind.
            if complete_function:
                data_incomplete = Dataset("tmp", X_train, Y_train, types=data.types,
                    protected_features=data.protected_features, categorical_features=data.categorical_features,
                    encoders=[data.X_encoders, data.y_encoder])
                try:
                    data_complete = complete_function(data_incomplete)
                except Exception as e:
                    print(f"Error: {e}. Skipped")
                    continue
                if data_complete.X.isnull().sum().sum() > 0:
                    print("Complete function error, skipped")
                    continue
                X_train = data_complete.X.copy()
                Y_train = data_complete.y.copy()
            # Protected features are excluded from the training features.
            X_train.drop(columns=data.protected_features, inplace=True)

            # Impute the test split in the same way.
            if complete_function:
                data_incomplete = Dataset("tmp", X_test, Y_test, types=data.types,
                    protected_features=data.protected_features, categorical_features=data.categorical_features,
                    encoders=[data.X_encoders, data.y_encoder])
                try:
                    data_complete = complete_function(data_incomplete)
                except Exception as e:
                    print(f"Error: {e}. Skipped")
                    continue
                if data_complete.X.isnull().sum().sum() > 0:
                    print("Complete function error, skipped")
                    continue
                X_test = data_complete.X.copy()
                Y_test = data_complete.y.copy()

            # Rebalance the training data with SMOTE, scale it and fit the classifier.
            X_train_res, Y_train_res = smote.fit_resample(X_train, Y_train)
            X_scaled = scaler.fit_transform(X_train_res)
            clf = LogisticRegression(max_iter=clf_config["max_iter"], C=clf_config["C"], tol=clf_config["tol"])
            clf.fit(X_scaled, Y_train_res)

            # Scale the test features, keeping the unscaled protected features alongside for the group split.
            X_test_unprotected = X_test.drop(columns=data.protected_features)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test_unprotected), columns=X_test_unprotected.columns)
            X_test_scaled = pd.concat([X_test_scaled, X_test[data.protected_features]], axis=1)

            # Split the test set into the two protected groups (data_config["target"] names the
            # protected column; "A" and "B" are its encoded group values) and compute per-group
            # confusion matrices for the bias metric.
            mask_A = X_test_scaled[data_config["target"]] == data_config["A"]
            mask_B = X_test_scaled[data_config["target"]] == data_config["B"]
            X_test_A = X_test_scaled[mask_A].drop(columns=data.protected_features).to_numpy()
            X_test_B = X_test_scaled[mask_B].drop(columns=data.protected_features).to_numpy()
            Y_test_A = Y_test[X_test_scaled[mask_A].index.tolist()]
            Y_test_B = Y_test[X_test_scaled[mask_B].index.tolist()]
            matrix_A = confusion_matrix(Y_test_A, clf.predict(X_test_A)).ravel().tolist()
            matrix_B = confusion_matrix(Y_test_B, clf.predict(X_test_B)).ravel().tolist()
            try:
                bias_cv.append(newBias(matrix_A + matrix_B))
            except Exception as e:
                print(f"\tError: {e}, skipped")
            acc_cv.append(accuracy_score(Y_test, clf.predict(X_test_scaled.drop(columns=data.protected_features).to_numpy())))
        bias.append(np.mean(bias_cv))
        acc.append(np.mean(acc_cv))
    return (np.mean(bias), np.mean(acc))
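
For context, a hedged sketch of a possible call site. The config keys mirror how they are read inside cross_val, but the concrete column names, group values and classifier settings below are assumptions, not values taken from the project.

# Illustrative only: all concrete values below are assumed.
data_config = {"target": "sex", "A": 1, "B": 0}           # protected column and its two encoded groups (assumed)
clf_config = {"max_iter": 1000, "C": 1.0, "tol": 1e-4}    # LogisticRegression settings (assumed)

bias_mean, acc_mean = cross_val(data_complete, data_config, clf_config,
                                complete_function=partial(complete_by_multi_v2,
                                                          target_feature=PARAMS_DATA["target"]),
                                selected_cols=["age", "hours-per-week"])  # columns to corrupt (assumed)
print(f"mean bias = {bias_mean:.4f}, mean accuracy = {acc_mean:.4f}")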