Exemplo n.º 1
0
def method3(acc_matrix):
    """Post-process the 'LR' and 'SVM' predictions of the adult data with an LP.

    For each classifier column the function loads the causal Bayesian
    network from ``../data/adult/adult.xml``, builds a linear program over a
    randomized relabelling of the predictions, solves it with
    ``solvers.lp`` and applies the solution to the training and test frames
    through ``DataSet.random_assign``.

    Layout of the decision vector (the hard-coded powers of two assume that
    all eight variables are binary):

    * entries ``[0, 2**8)``: mapping entries, addressed by ``get_loc`` over
      ``(A1, A2, N, M1, M2, S, YH, YT)``;
    * next ``2**8 // 8`` entries: upper bounds ("max"), addressed by
      ``get_loc`` over ``(A1, A2, N, S, YT)``;
    * last ``2**8 // 8`` entries: lower bounds ("min"), same addressing.

    The module-level names ``tau``, ``spos``, ``sneg``, ``yt_pos`` and
    ``src_path`` are read here but defined elsewhere.

    Args:
        acc_matrix: Object whose ``.iloc[:, 3]`` is overwritten with the four
            accuracies ``[LR train, LR test, SVM train, SVM test]``.

    Raises:
        ValueError: If the solver returns no solution vector.

    Side effects:
        Reads ``temp/adult_binary_{train,test}_prediction0.csv`` and writes
        ``temp/adult_binary_{train,test}_prediction3.csv``.
    """

    def values(var):
        # All values in the domain of a network variable.
        return var.domains.get_all()

    train = DataSet(pd.read_csv('temp/adult_binary_train_prediction0.csv'))
    df_test = pd.read_csv('temp/adult_binary_test_prediction0.csv')
    # The test frame is stacked three times before it is post-processed.
    df_test = pd.concat([df_test] * 3, ignore_index=True)
    test = DataSet(df_test)
    acc = []

    n_map = 2 ** 8                 # number of mapping entries
    n_bound = 2 ** 8 // 8          # size of each of the max / min blocks
    n_vars = n_map + 2 * n_bound   # == 2 ** 8 + 2 ** 8 // 4
    max_offset = n_map
    min_offset = n_map + n_bound

    for name in ['LR', 'SVM']:
        tmp_col = name + 'M'
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path, '../data/adult/adult.xml'))
        find_condition_prob = probabilistic_cbn.find_prob
        get_loc = probabilistic_cbn.get_loc

        A1 = probabilistic_cbn.v['age']
        A2 = probabilistic_cbn.v['education']
        S = probabilistic_cbn.v['sex']
        M1 = probabilistic_cbn.v['workclass']
        M2 = probabilistic_cbn.v['marital-status']
        N = probabilistic_cbn.v['hours']
        Y = probabilistic_cbn.v['income']

        # YH is the classifier's prediction, YT the post-processed prediction;
        # both are registered in the network right after Y.
        YH = Variable(name=name, index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH

        YT = Variable(name=tmp_col, index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        def features(a1, a2, n, m1, m2, s):
            # Builds a fresh Event per call, as every call site originally did.
            return Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})

        def cell(a1, a2, n, m1, m2, s, yh, yt):
            # Index of a mapping entry in the decision vector.
            return get_loc(Event({A1: a1, A2: a2, N: n, M1: m1, M2: m2,
                                  S: s, YH: yh, YT: yt}))

        def bound_cell(a1, a2, n, s, yt):
            # Index of a max/min entry, relative to the start of its block.
            return get_loc(Event({A1: a1, A2: a2, N: n, S: s, YT: yt}))

        def p_yh(yh, a1, a2, n, m1, m2, s):
            # Conditional probability of YH = yh given the features, from train.
            return train.get_conditional_prob(
                Event({YH: yh}), features(a1, a2, n, m1, m2, s))

        # build linear loss function
        # Keeping a prediction (YH == YT) is weighted by the 'notequal' count,
        # flipping it by the 'equal' count -- presumably the frequencies of
        # Y != YH and Y == YH in the stratum; confirm against DataSet.count.
        C_vector = np.zeros((n_vars, 1))
        for a1, a2, n, m1, m2, s in product(values(A1), values(A2), values(N),
                                            values(M1), values(M2), values(S)):
            p_x_s = train.get_marginal_prob(features(a1, a2, n, m1, m2, s))

            p_yh_1_y = p_x_s * train.count(
                Event({Y: 0, YH: 0}), features(a1, a2, n, m1, m2, s), 'notequal')
            loc = cell(a1, a2, n, m1, m2, s, 0, 0)
            C_vector[loc] = p_yh_1_y * p_yh(0, a1, a2, n, m1, m2, s)
            loc = cell(a1, a2, n, m1, m2, s, 1, 1)
            C_vector[loc] = p_yh_1_y * p_yh(1, a1, a2, n, m1, m2, s)

            p_yh__y = p_x_s * train.count(
                Event({Y: 0, YH: 0}), features(a1, a2, n, m1, m2, s), 'equal')
            loc = cell(a1, a2, n, m1, m2, s, 0, 1)
            C_vector[loc] = p_yh__y * p_yh(0, a1, a2, n, m1, m2, s)
            loc = cell(a1, a2, n, m1, m2, s, 1, 0)
            C_vector[loc] = p_yh__y * p_yh(1, a1, a2, n, m1, m2, s)

        # the inequality of max and min
        # First pass (max):  sum_yh P(yh | x) * mapping - max <= 0
        # Second pass (min): min - sum_yh P(yh | x) * mapping <= 0
        G_matrix_1 = np.zeros((n_map, n_vars))
        h_1 = np.zeros(n_map)
        row = 0
        for sign, offset, rows_after in ((1.0, max_offset, n_map // 2),
                                         (-1.0, min_offset, n_map)):
            for a1, a2, n, s, yt in product(values(A1), values(A2), values(N),
                                            values(S), values(YT)):
                for m1, m2 in product(values(M1), values(M2)):
                    for yh in values(YH):
                        loc = cell(a1, a2, n, m1, m2, s, yh, yt)
                        G_matrix_1[row, loc] = sign * p_yh(yh, a1, a2, n, m1, m2, s)
                    loc = bound_cell(a1, a2, n, s, yt)
                    G_matrix_1[row, offset + loc] = -sign
                    row += 1
            # Each pass must fill exactly half of the rows.
            assert row == rows_after

        # build counterfactual fairness constraints
        # First pass bounds (max-weighted term) - (sneg term) by tau, second
        # pass bounds (sneg term) - (min-weighted term) by tau.
        G_matrix_2 = np.zeros((2 ** 4 * 2, n_vars))
        h_2 = np.ones(2 ** 4 * 2) * tau

        row = 0
        for sign, offset, rows_after in ((1.0, max_offset, 2 ** 4),
                                         (-1.0, min_offset, 2 ** 4 * 2)):
            for a1, a2, m1, m2 in product(values(A1), values(A2),
                                          values(M1), values(M2)):
                for n in values(N):
                    loc = bound_cell(a1, a2, n, spos, yt_pos)
                    G_matrix_2[row, offset + loc] = sign * find_condition_prob(
                        Event({N: n}),
                        Event({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))

                    for yh in values(YH):
                        loc = cell(a1, a2, n, m1, m2, sneg, yh, yt_pos)
                        p_n = find_condition_prob(
                            Event({N: n}),
                            Event({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg}))
                        G_matrix_2[row, loc] = \
                            -sign * p_n * p_yh(yh, a1, a2, n, m1, m2, sneg)
                row += 1
            assert row == rows_after

        # mapping in [0, 1]:  x <= 1 (upper half) and -x <= 0 (lower half)
        G_matrix_3 = np.zeros((2 * n_vars, n_vars))
        h_3 = np.zeros(2 * n_vars)
        diag = np.arange(n_vars)
        G_matrix_3[diag, diag] = 1
        G_matrix_3[n_vars + diag, diag] = -1
        h_3[:n_vars] = 1

        # sum = 1: for fixed features and YH the mapping sums to one over YT
        A_matrix = np.zeros((n_map // 2, n_vars))
        b = np.ones(n_map // 2)

        row = 0
        for a1, a2, n, m1, m2, s, yh in product(values(A1), values(A2), values(N),
                                                values(M1), values(M2), values(S),
                                                values(YH)):
            for yt in values(YT):
                A_matrix[row, cell(a1, a2, n, m1, m2, s, yh, yt)] = 1
            row += 1

        assert row == n_map // 2

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver
        # The original call passed ``solver=solvers`` (the module object); that
        # value matches neither 'glpk' nor 'mosek', so the default solver ran.
        # The argument is dropped to request the default explicitly.
        solvers.options['show_progress'] = False
        sol = solvers.lp(c=matrix(C_vector),
                         G=matrix(G_matrix),
                         h=matrix(h),
                         A=matrix(A_matrix),
                         b=matrix(b))
        if sol['x'] is None:
            raise ValueError('LP for %r returned no solution (status: %s)'
                             % (name, sol['status']))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df.loc[:, tmp_col] = train.df[name]
        test.df[tmp_col] = test.df[name]
        for a1, a2, n, m1, m2, s, yh, yt in product(values(A1), values(A2),
                                                    values(N), values(M1),
                                                    values(M2), values(S),
                                                    values(YH), values(YT)):
            # Only the flipping entries (YH != YT) are applied.
            if yh.name != yt.name:
                p = mapping[cell(a1, a2, n, m1, m2, s, yh, yt), 0]
                train.random_assign(
                    Event({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
                    Event({YT: yt}), p)
                test.random_assign(
                    Event({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
                    Event({YT: yt}), p)

        # Copy the result back and remove the scratch column.  DataFrame.drop
        # returns a new frame unless inplace=True, so the original bare call
        # left the scratch column in the frame (and in the written CSV).
        train.df[name] = train.df[tmp_col]
        train.df.drop([tmp_col], axis=1, inplace=True)
        test.df[name] = test.df[tmp_col]
        test.df.drop([tmp_col], axis=1, inplace=True)
        acc.append(accuracy_score(train.df[name], train.df[Y.name]))
        acc.append(accuracy_score(test.df[name], test.df[Y.name]))

    acc_matrix.iloc[:, 3] = acc
    train.df.to_csv('temp/adult_binary_train_prediction3.csv', index=False)
    test.df.to_csv('temp/adult_binary_test_prediction3.csv', index=False)
def method3(acc_matrix):
    """Post-process the 'LR' and 'SVM' predictions of the synthetic data with an LP.

    For each classifier column the function loads the causal Bayesian
    network from ``../data/synthetic/ProbabilisticBayesianModel.xml``, builds
    a linear program over a randomized relabelling of the predictions,
    solves it with ``solvers.lp`` and applies the solution to the training
    and test frames through ``DataSet.random_assign``.

    Layout of the decision vector (the hard-coded powers of two assume that
    all six variables are binary):

    * entries ``[0, 2**6)``: mapping entries, addressed by ``get_loc`` over
      ``(A, M, N, S, YH, YT)``;
    * next ``2**6 // 4`` entries: upper bounds ("max"), addressed by
      ``get_loc`` over ``(A, N, S, YT)``;
    * last ``2**6 // 4`` entries: lower bounds ("min"), same addressing.

    The module-level names ``tau``, ``spos``, ``sneg``, ``yt_pos`` and
    ``src_path`` are read here but defined elsewhere.

    Args:
        acc_matrix: Object whose ``.iloc[:, 3]`` is overwritten with the four
            accuracies ``[LR train, LR test, SVM train, SVM test]``.

    Raises:
        ValueError: If the solver returns no solution vector.

    Side effects:
        Reads ``temp/synthetic_{train,test}_prediction0.csv`` and writes
        ``temp/synthetic_{train,test}_prediction3.csv``.
    """

    def values(var):
        # All values in the domain of a network variable.
        return var.domains.get_all()

    train = DataSet(pd.read_csv('temp/synthetic_train_prediction0.csv'))
    test = DataSet(pd.read_csv('temp/synthetic_test_prediction0.csv'))
    acc = []

    n_map = 2**6                   # number of mapping entries
    n_bound = 2**6 // 4            # size of each of the max / min blocks
    n_vars = n_map + 2 * n_bound   # == 2**6 + 2**6 // 2
    max_offset = n_map
    min_offset = n_map + n_bound

    for name in ['LR', 'SVM']:
        tmp_col = name + '1'
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path,
                         '../data/synthetic/ProbabilisticBayesianModel.xml'))
        find_condition_prob = probabilistic_cbn.find_prob
        get_loc = probabilistic_cbn.get_loc

        A = probabilistic_cbn.v['A']
        S = probabilistic_cbn.v['S']
        N = probabilistic_cbn.v['N']
        M = probabilistic_cbn.v['M']
        Y = probabilistic_cbn.v['Y']

        # YH is the classifier's prediction, YT the post-processed prediction;
        # both are registered in the network right after Y.  In the data
        # frames they live in the columns ``name`` and ``name + '1'``.
        YH = Variable(name='YH', index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH

        YT = Variable(name='YT', index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        def features(a, m, n, s):
            # Column-name keyed Event for queries against the data set; a
            # fresh Event per call, as every call site originally built.
            return Event({'A': a, 'M': m, 'N': n, 'S': s})

        def cell(a, m, n, s, yh, yt):
            # Index of a mapping entry in the decision vector.
            return get_loc(Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))

        def bound_cell(a, n, s, yt):
            # Index of a max/min entry, relative to the start of its block.
            return get_loc(Event({A: a, N: n, S: s, YT: yt}))

        def p_yh(yh, a, m, n, s):
            # Conditional probability of the prediction given the features.
            return train.get_conditional_prob(
                Event({name: yh}), features(a, m, n, s))

        # build linear loss function
        # Keeping a prediction (YH == YT) is weighted by the 'notequal' count,
        # flipping it by the 'equal' count -- presumably the frequencies of
        # Y != prediction and Y == prediction in the stratum; confirm against
        # DataSet.count.
        C_vector = np.zeros((n_vars, 1))
        for a, n, m, s in product(values(A), values(N), values(M), values(S)):
            p_x_s = train.get_marginal_prob(features(a, m, n, s))

            p_yh_1_y = p_x_s * train.count(
                Event({'Y': 0, name: 0}), features(a, m, n, s), 'notequal')
            loc = cell(a, m, n, s, 0, 0)
            C_vector[loc] = p_yh_1_y * p_yh(0, a, m, n, s)
            loc = cell(a, m, n, s, 1, 1)
            C_vector[loc] = p_yh_1_y * p_yh(1, a, m, n, s)

            p_yh__y = p_x_s * train.count(
                Event({'Y': 0, name: 0}), features(a, m, n, s), 'equal')
            loc = cell(a, m, n, s, 0, 1)
            C_vector[loc] = p_yh__y * p_yh(0, a, m, n, s)
            loc = cell(a, m, n, s, 1, 0)
            C_vector[loc] = p_yh__y * p_yh(1, a, m, n, s)

        # the inequality of max and min
        # First pass (max):  sum_yh P(yh | x) * mapping - max <= 0
        # Second pass (min): min - sum_yh P(yh | x) * mapping <= 0
        G_matrix_1 = np.zeros((n_map, n_vars))
        h_1 = np.zeros(n_map)
        row = 0
        for sign, offset, rows_after in ((1.0, max_offset, n_map // 2),
                                         (-1.0, min_offset, n_map)):
            for a, n, s, yt in product(values(A), values(N), values(S),
                                       values(YT)):
                for m in values(M):
                    for yh in values(YH):
                        loc = cell(a, m, n, s, yh, yt)
                        G_matrix_1[row, loc] = sign * p_yh(yh, a, m, n, s)
                    loc = bound_cell(a, n, s, yt)
                    G_matrix_1[row, offset + loc] = -sign
                    row += 1
            # Each pass must fill exactly half of the rows.
            assert row == rows_after

        # build counterfactual fairness constraints
        # First pass bounds (max-weighted term) - (sneg term) by tau, second
        # pass bounds (sneg term) - (min-weighted term) by tau.
        G_matrix_2 = np.zeros((2**2 * 2, n_vars))
        h_2 = np.ones(2**2 * 2) * tau

        row = 0
        for sign, offset, rows_after in ((1.0, max_offset, 2**2),
                                         (-1.0, min_offset, 2**2 * 2)):
            for a, m in product(values(A), values(M)):
                for n in values(N):
                    loc = bound_cell(a, n, spos, yt_pos)
                    G_matrix_2[row, offset + loc] = sign * find_condition_prob(
                        Event({N: n}), Event({A: a, S: spos}))

                    for yh in values(YH):
                        loc = cell(a, m, n, sneg, yh, yt_pos)
                        p_n = find_condition_prob(
                            Event({N: n}), Event({A: a, S: sneg}))
                        G_matrix_2[row, loc] = \
                            -sign * p_n * p_yh(yh, a, m, n, sneg)
                row += 1
            assert row == rows_after

        # mapping in [0, 1]:  x <= 1 (upper half) and -x <= 0 (lower half)
        G_matrix_3 = np.zeros((2 * n_vars, n_vars))
        h_3 = np.zeros(2 * n_vars)
        diag = np.arange(n_vars)
        G_matrix_3[diag, diag] = 1
        G_matrix_3[n_vars + diag, diag] = -1
        h_3[:n_vars] = 1

        # sum = 1: for fixed features and YH the mapping sums to one over YT
        A_matrix = np.zeros((n_map // 2, n_vars))
        b = np.ones(n_map // 2)

        row = 0
        for a, n, m, s, yh in product(values(A), values(N), values(M),
                                      values(S), values(YH)):
            for yt in values(YT):
                A_matrix[row, cell(a, m, n, s, yh, yt)] = 1
            row += 1

        assert row == n_map // 2

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver
        # The original call passed ``solver=solvers`` (the module object); that
        # value matches neither 'glpk' nor 'mosek', so the default solver ran.
        # The argument is dropped to request the default explicitly.
        solvers.options['show_progress'] = False
        sol = solvers.lp(c=matrix(C_vector),
                         G=matrix(G_matrix),
                         h=matrix(h),
                         A=matrix(A_matrix),
                         b=matrix(b))
        if sol['x'] is None:
            raise ValueError('LP for %r returned no solution (status: %s)'
                             % (name, sol['status']))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df[tmp_col] = train.df[name]
        test.df[tmp_col] = test.df[name]
        for a, n, m, s, yh, yt in product(values(A), values(N), values(M),
                                          values(S), values(YH), values(YT)):
            # Only the flipping entries (YH != YT) are applied.
            if yh.name != yt.name:
                p = mapping[cell(a, m, n, s, yh, yt), 0]
                train.random_assign(
                    Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                    Event({tmp_col: yt}), p)
                test.random_assign(
                    Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                    Event({tmp_col: yt}), p)

        # Copy the result back and remove the scratch column.  DataFrame.drop
        # returns a new frame unless inplace=True, so the original bare call
        # left the scratch column in the frame (and in the written CSV).
        train.df[name] = train.df[tmp_col]
        train.df.drop([tmp_col], axis=1, inplace=True)
        test.df[name] = test.df[tmp_col]
        test.df.drop([tmp_col], axis=1, inplace=True)
        acc.append(accuracy_score(train.df['Y'], train.df[name]))
        acc.append(accuracy_score(test.df['Y'], test.df[name]))

    acc_matrix.iloc[:, 3] = acc

    train.df.to_csv('temp/synthetic_train_prediction3.csv', index=False)
    test.df.to_csv('temp/synthetic_test_prediction3.csv', index=False)