Example #1
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_german


def load_optimpreproc_german_dataset():
    """
    Load the optimized-preprocessing version of the German credit dataset.

    :return: The preprocessed German dataset as a (train, test) pair, split 70/30
    """
    dataset = load_preproc_data_german()
    train, test = dataset.split([0.7], shuffle=True)
    return train, test
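
A minimal usage sketch for the loader above, assuming AIF360 is installed and the import shown with the function; the fairness-metric check at the end is illustrative and not part of the original example.

from aif360.metrics import BinaryLabelDatasetMetric

train, test = load_optimpreproc_german_dataset()
print(train.features.shape, test.features.shape)

# With no arguments, load_preproc_data_german keeps both 'sex' and 'age' as
# protected attributes; check statistical parity on the training split for 'sex'
metric_train = BinaryLabelDatasetMetric(train,
                                        unprivileged_groups=[{'sex': 0}],
                                        privileged_groups=[{'sex': 1}])
print(metric_train.statistical_parity_difference())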
Example #2
    def load(self):
        # Load the preprocessed German credit data with 'age' as the protected attribute
        dataset_orig = load_preproc_data_german(['age'])

        # Standardize the features and train a logistic regression weighted by
        # the dataset's per-instance weights
        scale_orig = StandardScaler()
        X_train = scale_orig.fit_transform(dataset_orig.features)
        y_train = dataset_orig.labels.ravel()

        lmod = LogisticRegression()
        lmod.fit(X_train, y_train, sample_weight=dataset_orig.instance_weights)

        self.model = lmod
        self.ready = True
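
The method above relies on class state and imports that the snippet omits; a standalone sketch of the same flow is shown below, assuming AIF360 and scikit-learn, with the final training-accuracy print added purely as a sanity check.

from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_german
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

dataset_orig = load_preproc_data_german(['age'])

# Standardize the features and fit a logistic regression weighted by the
# dataset's instance weights, as in the load() method above
scaler = StandardScaler()
X_train = scaler.fit_transform(dataset_orig.features)
y_train = dataset_orig.labels.ravel()

lmod = LogisticRegression(max_iter=1000)
lmod.fit(X_train, y_train, sample_weight=dataset_orig.instance_weights)

print(lmod.score(X_train, y_train))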
Example #3
from aif360.datasets import AdultDataset, BankDataset, CompasDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_german


def get_data(dataset_used, protected_attribute_used):
    if dataset_used == "adult":
        dataset_orig = AdultDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]

    elif dataset_used == "german":
        dataset_orig = load_preproc_data_german()
        # German credit labels come as {1, 2}; shift them down to {0, 1}
        dataset_orig.labels -= 1
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]

    elif dataset_used == "compas":
        dataset_orig = CompasDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'sex': 1}]
            unprivileged_groups = [{'sex': 0}]
        else:
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]

    elif dataset_used == "bank":
        dataset_orig = BankDataset()
        if protected_attribute_used == 1:
            privileged_groups = [{'age': 1}]
            unprivileged_groups = [{'age': 0}]
        else:
            # Note: AIF360's BankDataset only defines 'age' as a protected
            # attribute by default, so this branch assumes a custom dataset build
            privileged_groups = [{'race': 1}]
            unprivileged_groups = [{'race': 0}]

    else:
        raise ValueError(f"{dataset_used} is not an available dataset.")

    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6], shuffle=True, seed=101)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True, seed=101)

    return dataset_orig_train, dataset_orig_valid, dataset_orig_test, privileged_groups, unprivileged_groups
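
A usage sketch for get_data, assuming the AIF360 imports listed with the function; the metric computation at the end is illustrative and not part of the original.

from aif360.metrics import BinaryLabelDatasetMetric

(dataset_train, dataset_valid, dataset_test,
 privileged_groups, unprivileged_groups) = get_data("adult", 1)

print(len(dataset_train.features), len(dataset_valid.features), len(dataset_test.features))

metric_train = BinaryLabelDatasetMetric(dataset_train,
                                        unprivileged_groups=unprivileged_groups,
                                        privileged_groups=privileged_groups)
print("Disparate impact (train):", metric_train.disparate_impact())
print("Mean difference (train):", metric_train.mean_difference())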
Example #4
from aif360.datasets import AdultDataset, CompasDataset, GermanDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import (
	load_preproc_data_adult, load_preproc_data_compas, load_preproc_data_german)
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions import (
	get_distortion_adult, get_distortion_compas, get_distortion_german)


def LoadData(dataset_name, protected_attribute_name, raw=True):

	optim_options = None

	if dataset_name == "adult":
		if raw:
			dataset_original = AdultDataset()
		if protected_attribute_name == "sex":
			privileged_groups = [{'sex': 1}]
			unprivileged_groups = [{'sex': 0}]
			if not raw:
				dataset_original = load_preproc_data_adult(['sex'])
			optim_options = {
				"distortion_fun": get_distortion_adult,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		elif protected_attribute_name == "race":
			privileged_groups = [{'race': 1}]
			unprivileged_groups = [{'race': 0}]
			if not raw:
				dataset_original = load_preproc_data_adult(['race'])
			optim_options = {
				"distortion_fun": get_distortion_adult,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
	elif dataset_name == "german":
		if raw:
			dataset_original = GermanDataset()
		if protected_attribute_name == "sex":
			privileged_groups = [{'sex': 1}]
			unprivileged_groups = [{'sex': 0}]
			if not raw:
				dataset_original = load_preproc_data_german(['sex'])
			optim_options = {
				"distortion_fun": get_distortion_german,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		elif protected_attribute_name == "age":
			privileged_groups = [{'age': 1}]
			unprivileged_groups = [{'age': 0}]
			if not raw:
				dataset_original = load_preproc_data_german(['age'])
			optim_options = {
				"distortion_fun": get_distortion_german,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		# Remap German credit labels from {1, 2} to {1, 0} so that 1 stays the favorable label
		dataset_original.labels = 2 - dataset_original.labels
		dataset_original.unfavorable_label = 0.
	elif dataset_name == "compas":
		if raw:
			dataset_original = CompasDataset()
		if protected_attribute_name == "sex":
			privileged_groups = [{'sex': 0}]
			unprivileged_groups = [{'sex': 1}]
			if not raw:
				dataset_original = load_preproc_data_compas(['sex'])
			optim_options = {
				"distortion_fun": get_distortion_compas,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}
		elif protected_attribute_name == "race":
			privileged_groups = [{'race': 1}]
			unprivileged_groups = [{'race': 0}]
			if not raw:
				dataset_original = load_preproc_data_compas(['race'])
			optim_options = {
				"distortion_fun": get_distortion_compas,
				"epsilon": 0.05,
				"clist": [0.99, 1.99, 2.99],
				"dlist": [.1, 0.05, 0]
			}

	protected_attribute_set={
		'sex':[[{'sex': 1}],[{'sex': 0}]],
		'age':[[{'age': 1}],[{'age': 0}]],
		'race':[[{'race': 1}],[{'race': 0}]]
	}

	if optim_options is None:
		print('No such dataset & group option:', dataset_name, protected_attribute_name)
		exit()

	return dataset_original, protected_attribute_set[protected_attribute_name][0], \
		protected_attribute_set[protected_attribute_name][1], optim_options
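
The optim_options dict that LoadData returns is meant to be handed to AIF360's OptimPreproc transformer; a sketch of that hand-off is below, using the adult/sex combination with raw=False, mirroring AIF360's own optimized pre-processing demo. The 70/30 split is illustrative.

from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools

dataset_orig, privileged_groups, unprivileged_groups, optim_options = LoadData(
    "adult", "sex", raw=False)

dataset_train, dataset_test = dataset_orig.split([0.7], shuffle=True)

# Learn the optimized pre-processing transformation on the training split and
# apply it to both splits
OP = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups=unprivileged_groups,
                  privileged_groups=privileged_groups)
OP = OP.fit(dataset_train)
dataset_transf_train = OP.transform(dataset_train, transform_Y=True)
dataset_transf_test = OP.transform(dataset_test, transform_Y=True)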
Example #5
def get_data(dataset_used="adult", protected_attribute="sex", train_size=0.7):
    if dataset_used == "adult":
        if protected_attribute == "sex":
            dataset_orig = load_preproc_data_adult(['sex'])
        else:
            dataset_orig = load_preproc_data_adult(['race'])

    elif dataset_used == "german":
        if protected_attribute == "sex":
            dataset_orig = load_preproc_data_german(['sex'])
        else:
            dataset_orig = load_preproc_data_german(['age'])

    elif dataset_used == "compas":
        if protected_attribute == "sex":
            dataset_orig = load_preproc_data_compas(['sex'])
        else:
            dataset_orig = load_preproc_data_compas(['race'])

    # Fix the random seed so the splits below are reproducible
    np.random.seed(1)

    # Split into train, validation, test, and a final held-out test (ftest) set
    dataset_orig_tvt, dataset_orig_ftest = dataset_orig.split([train_size],
                                                              shuffle=True)
    dataset_orig_train, dataset_orig_vt = dataset_orig_tvt.split([train_size],
                                                                 shuffle=True)
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5],
                                                                  shuffle=True)

    # Convert to dataframe
    df_all, _ = dataset_orig_tvt.convert_to_dataframe()
    df_all = df_all.reset_index(drop=True)
    df_train, _ = dataset_orig_train.convert_to_dataframe()
    df_train = df_train.reset_index(drop=True)
    df_valid, _ = dataset_orig_valid.convert_to_dataframe()
    df_valid = df_valid.reset_index(drop=True)
    df_test, _ = dataset_orig_test.convert_to_dataframe()
    df_test = df_test.reset_index(drop=True)
    df_ftest, _ = dataset_orig_ftest.convert_to_dataframe()
    df_ftest = df_ftest.reset_index(drop=True)

    X_all = df_all.drop(dataset_orig.label_names, axis=1)
    y_all = df_all[dataset_orig.label_names[0]]
    X_train = df_train.drop(dataset_orig.label_names, axis=1)
    y_train = df_train[dataset_orig.label_names[0]]
    X_valid = df_valid.drop(dataset_orig.label_names, axis=1)
    y_valid = df_valid[dataset_orig.label_names[0]]
    X_test = df_test.drop(dataset_orig.label_names, axis=1)
    y_test = df_test[dataset_orig.label_names[0]]
    X_ftest = df_ftest.drop(dataset_orig.label_names, axis=1)
    y_ftest = df_ftest[dataset_orig.label_names[0]]

    # Map labels to favorable=1 and unfavorable=-1
    favorable = dataset_orig.favorable_label
    unfavorable = dataset_orig.unfavorable_label
    label_map = {favorable: 1, unfavorable: -1}
    y_all = y_all.map(label_map)
    y_train = y_train.map(label_map)
    y_valid = y_valid.map(label_map)
    y_test = y_test.map(label_map)
    y_ftest = y_ftest.map(label_map)

    return X_all, y_all, X_train, y_train, X_valid, y_valid, X_test, y_test, X_ftest, y_ftest
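
A usage sketch for the pandas-style splits returned above, assuming scikit-learn alongside the imports listed with get_data; the classifier choice is illustrative and not part of the original.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

(X_all, y_all, X_train, y_train, X_valid, y_valid,
 X_test, y_test, X_ftest, y_ftest) = get_data("adult", "sex", train_size=0.7)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Labels are mapped to {1, -1}, so any binary classifier from scikit-learn works
print("Validation accuracy:", accuracy_score(y_valid, clf.predict(X_valid)))
print("Final test accuracy:", accuracy_score(y_ftest, clf.predict(X_ftest)))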