Пример #1
0
def detect_classifier(ce_matrix):
	"""Write counterfactual-effect bounds of the LR and SVM predictions into ``ce_matrix``.

	For every prediction file ``temp/adult_binary_test_prediction<i>.csv``
	(``i`` in 0..3) and every label column, the conditional table of
	``income`` in the adult network is overwritten with the label's
	conditional probabilities measured on that file, the joint table is
	rebuilt, and ``detect_after_remove`` is evaluated for the 16 binary
	profiles in both directions.

	:param ce_matrix: object indexed through ``.iloc``; row
		``j * 32 + k`` (``+ 16`` for the reverse direction), columns
		``2 * i`` and ``2 * i + 1`` receive ``[upper - p, lower - p]``.
	"""
	network = load_xml_to_cbn (os.path.join (src_path, '../data/adult/adult.xml'))

	A1 = network.v['age']
	A2 = network.v['education']
	S = network.v['sex']
	M1 = network.v['workclass']
	M2 = network.v['marital-status']
	N = network.v['hours']
	Y = network.v['income']

	for i in range (4):  # one prediction file per index
		predictions = DataSet (pd.read_csv ('temp/adult_binary_test_prediction%d.csv' % i))
		columns = slice (2 * i, 2 * i + 2)

		for j, label in enumerate (['LR', 'SVM']):  # two classifiers
			# Replace P(income | parents) by the classifier's empirical P(label | parents).
			assignments = product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (),
								   M1.domains.get_all (), M2.domains.get_all (), S.domains.get_all (),
								   Y.domains.get_all ())
			for a1, a2, n, m1, m2, s, y in assignments:
				# Fresh Event objects for every call: events may be mutated by the callee.
				outcome = Event ({Y: y})
				parents = Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})
				estimate = predictions.get_conditional_prob (
					Event ({label: y}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
				network.set_conditional_prob (outcome, parents, estimate)

			network.build_joint_table ()

			# Rows [0, 16): intervene with spos on sneg individuals; rows [16, 32): the reverse.
			for offset, s_do, s_obs in ((0, spos, sneg), (16, sneg, spos)):
				for k, (a1prime, a2prime, m1prime, m2prime) in enumerate (product ([0, 1], repeat=4)):
					p_u, p_l = detect_after_remove (cbn=network, s=s_do, sprime=s_obs, y=1,
													a1prime=a1prime, a2prime=a2prime,
													m1prime=m1prime, m2prime=m2prime)
					p = detect_after_remove (cbn=network, s=s_obs, sprime=s_obs, y=1,
											 a1prime=a1prime, a2prime=a2prime,
											 m1prime=m1prime, m2prime=m2prime)
					ce_matrix.iloc[j * 32 + k + offset, columns] = [p_u - p, p_l - p]
def convert(event: Event):
    """Re-key ``event.dict`` in place: integer keys are replaced by names.

    For every integer key ``k`` returned by ``event.keys()``, the entry is
    removed from ``event.dict`` and stored again under
    ``event.name[k].name``. Non-integer keys are left untouched. Returns
    ``None``.
    """
    for key in event.keys():
        if not isinstance(key, int):
            continue
        label = event.name[key].name
        event.dict[label] = event.dict.pop(key)
Пример #3
0
def loosely_infer_with_partial_graph(y_val, s1_val, s0_val, ow, oa):
    """Bound a difference of two conditional probabilities on the partial model.

    The extreme values of ``P(Y = y_val | S = s1_val, W = w, A = a)`` over all
    ``(w, a)`` combinations are searched (starting from 0.0 for the maximum
    and 1.0 for the minimum), and ``P(Y = y_val | S = s0_val, W = ow, A = oa)``
    is subtracted from each.

    :return: tuple ``(upper - p2, lower - p2)``.
    """
    network = load_xml_to_cbn(partial_model)
    network.build_joint_table()

    S = network.v['S']
    W = network.v['W']
    A = network.v['A']
    Y_hat = network.v['Y']

    upper, lower = 0.0, 1.0
    for w, a in product(W.domains.get_all(), A.domains.get_all()):
        prob = network.get_prob(Event({Y_hat: y_val}),
                                Event({S: s1_val, W: w, A: a}))
        # max/min keep the running value on ties, like the strict comparisons did.
        lower = min(lower, prob)
        upper = max(upper, prob)

    baseline = network.get_prob(Event({Y_hat: y_val}),
                                Event({S: s0_val, W: ow, A: oa}))
    return upper - baseline, lower - baseline
 def get_marginal_prob(self, event: Event):
     """Return how many rows of ``self.df`` match ``event``.

     ``event`` is first normalised in place by ``convert``. The frame is
     grouped by the event's keys and the group whose key equals the event's
     values is looked up.

     Note: despite the name, the value returned is a raw row count; it is
     not divided by the total number of rows.

     :param event: assignment of values to columns of ``self.df``.
     :return: number of matching rows, or 0 when no row matches.
     """
     convert(event)
     grouped = self.df.groupby(event.keys())
     key = tuple(event.values())
     if len(key) == 1:
         # A single grouping key is looked up by its scalar value.
         key = key[0]
     try:
         return len(grouped.get_group(key))
     except KeyError:
         # get_group raises KeyError when the combination never occurs.
         return 0
Пример #5
0
 def max_w_h():
     """Return ``(max, min)`` of P(Income = y_val | Sex = s1_val, Age = oa, Edu = oe, Marital = m).

     The search runs over every value ``m`` of ``Marital`` and starts from
     0.0 for the maximum and 1.0 for the minimum.
     """
     highest, lowest = 0.0, 1.0
     for m in Marital.domains.get_all():
         evidence = Event({Sex: s1_val, Age: oa, Edu: oe, Marital: m})
         prob = partial_cbn.get_prob(Event({Income: y_val}), evidence)
         # Same tie behaviour as max(prob, highest) / min(prob, lowest).
         highest = highest if highest > prob else prob
         lowest = lowest if lowest < prob else prob
     return highest, lowest
 def random_assign(self, event: Event, target: Event, p: float):
     """Overwrite the target column on a random fraction ``p`` of matching rows.

     Both events are normalised in place by ``convert``. Rows of
     ``self.df`` matching ``event`` are located, ``round(len(rows) * p)``
     of them are drawn without replacement with ``random.sample``, and the
     first key/value pair of ``target`` is written to those rows.

     The assignment uses label-based ``.loc``; the previously used
     ``DataFrame.ix`` indexer no longer exists in pandas >= 1.0.

     :param event: selects the candidate rows.
     :param target: its first key names the column, its first value is written.
     :param p: fraction of the matching rows to overwrite.
     :return: ``None``; nothing happens when no row matches ``event`` or the
         target column is unknown.
     """
     convert(event)
     convert(target)
     grouped = self.df.groupby(event.keys())
     key = tuple(event.values())
     if len(key) == 1:
         # A single grouping key is looked up by its scalar value.
         key = key[0]
     try:
         rows = grouped.get_group(key).index
         chosen = random.sample(list(rows), int(round(len(rows) * p)))
         column = target.keys()[0]
         if column not in self.df.columns:
             # .loc would silently create the column; keep the old no-op instead.
             return
         self.df.loc[chosen, column] = target.values()[0]
     except KeyError:
         # No row matches the event: deliberately nothing to do.
         pass
Пример #7
0
 def max_w_a():
     """Return ``(max, min)`` of P(Y_hat = y_val | S = s1_val, W = w, A = a).

     The search runs over every ``(w, a)`` combination and starts from 0.0
     for the maximum and 1.0 for the minimum.
     """
     highest, lowest = 0.0, 1.0
     for w, a in product(W.domains.get_all(), A.domains.get_all()):
         evidence = Event({S: s1_val, W: w, A: a})
         prob = partial_cbn.get_prob(Event({Y_hat: y_val}), evidence)
         # max/min keep the running value on ties, like the strict comparisons did.
         lowest = min(lowest, prob)
         highest = max(highest, prob)
     return highest, lowest
 def count(self, event: Event, condition: Event, relationship: str):
     """Return the fraction of rows matching ``condition`` whose two event columns agree or differ.

     Both events are normalised in place by ``convert``. Only the first two
     keys of ``event`` are used (as column names); its values are ignored.

     :param event: its first two keys name the columns to compare.
     :param condition: selects the rows of ``self.df`` to look at.
     :param relationship: ``'equal'`` counts rows where the two columns are
         equal; any other value counts rows where they differ.
     :return: matching rows divided by selected rows, or 0 when no row
         matches ``condition``, a column is missing, or ``event`` has fewer
         than two keys.
     """
     convert(event)
     convert(condition)
     grouped = self.df.groupby(condition.keys())
     key = tuple(condition.values())
     if len(key) == 1:
         # A single grouping key is looked up by its scalar value.
         key = key[0]
     try:
         group = grouped.get_group(key)
         first = event.keys()[0]
         second = event.keys()[1]
         if relationship == 'equal':
             mask = group[first] == group[second]
         else:
             mask = group[first] != group[second]
         return len(group[mask]) / len(group)
     except (KeyError, IndexError):
         # Only the expected lookup failures are treated as "no data";
         # anything else (including KeyboardInterrupt) now propagates.
         return 0
Пример #9
0
def detect_after_remove(cbn, s, sprime, y, a1prime, m1prime, m2prime, a2prime):
	"""Evaluate P(income = y) for the profile, exactly or as bounds.

	When ``s == sprime`` the joint table of ``cbn`` is rebuilt and the
	conditional probability given the profile and ``sex = sprime`` is
	returned as a single number. Otherwise an ``(upper, lower)`` pair is
	returned: for each value ``n`` of ``hours`` the largest / smallest
	``find_prob`` of the outcome over all ``(workclass, marital-status)``
	values is weighted by ``find_prob`` of ``n`` given the profile and
	``sex = s``, and the weighted terms are summed.

	:return: a probability when ``s == sprime``, else ``(p_u, p_l)``.
	"""
	A1 = cbn.v['age']
	A2 = cbn.v['education']
	S = cbn.v['sex']
	M1 = cbn.v['workclass']
	M2 = cbn.v['marital-status']
	N = cbn.v['hours']
	Y = cbn.v['income']

	if s == sprime:
		cbn.build_joint_table ()
		logger.info ('Identifiable:')
		exact = cbn.get_conditional_prob (
			Event ({Y: y}),
			Event ({A1: a1prime, A2: a2prime, M1: m1prime, M2: m2prime, S: sprime}))
		logger.info ('Compute according the Bayesian network: %f' % exact)
		return exact

	logger.info ('Unidentifiable:')
	upper = 0.0
	lower = 0.0
	for n in N.domains.get_all ():
		candidates = [
			cbn.find_prob (Event ({Y: y}), Event ({A1: a1prime, A2: a2prime, N: n, M1: m1, M2: m2, S: s}))
			for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ())
		]
		best = max (candidates)
		worst = min (candidates)
		weight = cbn.find_prob (
			Event ({N: n}),
			Event ({A1: a1prime, A2: a2prime, M1: m1prime, M2: m2prime, S: s}))
		upper += best * weight
		lower += worst * weight

	logger.info ('Upper bound of counterfactual: %f' % upper)
	logger.info ('Lower bound of counterfactual: %f' % lower)
	return upper, lower
Пример #10
0
def loosely_infer_with_partial_graph(y_val, s1_val, s0_val, oa, oe, om):
    """Bound a difference of two conditional probabilities on the partial adult model.

    The extreme values of
    ``P(income = y_val | sex = s1_val, age = oa, education = oe, marital-status = m)``
    over all ``m`` are searched (starting from 0.0 for the maximum and 1.0
    for the minimum), and the same probability with ``sex = s0_val`` and
    ``marital-status = om`` is subtracted from each.

    :return: tuple ``(upper - p2, lower - p2)``.
    """
    network = load_xml_to_cbn(partial_model)
    network.build_joint_table()

    Age = network.v['age']
    Edu = network.v['education']
    Sex = network.v['sex']
    # The next two lookups are unused below but kept: they fail early when
    # the model lacks these variables.
    Workclass = network.v['workclass']
    Marital = network.v['marital-status']
    Hours = network.v['hours']
    Income = network.v['income']

    upper, lower = 0.0, 1.0
    for m in Marital.domains.get_all():
        prob = network.get_prob(
            Event({Income: y_val}),
            Event({Sex: s1_val, Age: oa, Edu: oe, Marital: m}))
        upper = max(prob, upper)
        lower = min(prob, lower)

    baseline = network.get_prob(
        Event({Income: y_val}),
        Event({Sex: s0_val, Age: oa, Edu: oe, Marital: om}))
    return upper - baseline, lower - baseline
def pearl_after_remove(data, cbn, s, sprime, y, aprime, mprime):
    """Sum the product of ``find_prob`` factors over exogenous and endogenous values.

    For every combination of ``UA, UN, UM, US`` the weight
    ``P(UA, UN, UM, US | A = aprime, M = mprime, S = sprime)`` is read from
    ``data``. It multiplies, for every ``(a, n, m)``, the ``cbn.find_prob``
    factors of ``A``, ``M``, ``N`` and ``Y = y`` under ``S = s``, with ``UY``
    fixed to 1. The result is logged and returned.

    :param data: object exposing ``get_conditional_prob`` keyed by column names.
    :param cbn: network exposing ``v`` and ``find_prob``.
    :return: the accumulated sum.
    """
    UA = cbn.v['UA']
    UN = cbn.v['UN']
    UM = cbn.v['UM']
    US = cbn.v['US']
    UY = cbn.v['UY']
    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    total = 0.0
    exogenous = product(UA.domains.get_all(), UN.domains.get_all(),
                        UM.domains.get_all(), US.domains.get_all())
    for ua, un, um, us in exogenous:
        # Weight of this exogenous assignment given the observed profile.
        ps = data.get_conditional_prob(
            Event({'UA': ua.index, 'UN': un.index, 'UM': um.index, 'US': us.index}),
            Event({'A': aprime, 'M': mprime, 'S': sprime}))

        for a, n, m in product(A.domains.get_all(), N.domains.get_all(),
                               M.domains.get_all()):
            p_a = cbn.find_prob(Event({A: a}), Event({UA: ua}))
            p_m = cbn.find_prob(Event({M: m}), Event({S: s, A: a, UM: um}))
            p_n = cbn.find_prob(Event({N: n}), Event({S: s, A: a, UN: un}))
            p_y = cbn.find_prob(Event({Y: y}), Event({S: s, A: a, N: n, M: m, UY: 1}))
            total += p_a * p_m * p_n * p_y * ps

    logging.info("Pearl's three steps: %f" % total)
    return total
def compute_from_observed(s, sprime, y, aprime, mprime):
    """Evaluate P(Y = y) for the profile on the probabilistic synthetic model.

    When ``s == sprime`` the conditional probability of ``Y = y`` given
    ``A = aprime, M = mprime, S = sprime`` is returned. Otherwise an
    ``(upper, lower)`` pair is returned: for each ``n`` the largest /
    smallest ``find_prob`` of the outcome over all ``m`` is weighted by
    ``find_prob`` of ``n`` given ``A = aprime, S = s`` and summed.

    :return: a probability when ``s == sprime``, else ``(p_u, p_l)``.
    """
    model = load_xml_to_cbn(
        cwd + '/../data/synthetic/ProbabilisticBayesianModel.xml')
    model.build_joint_table()

    A = model.v['A']
    S = model.v['S']
    N = model.v['N']
    M = model.v['M']
    Y = model.v['Y']

    if s == sprime:
        # Identifiable case: read the answer off the (rebuilt) joint table.
        model.build_joint_table()
        return model.get_conditional_prob(
            Event({Y: y}), Event({A: aprime, M: mprime, S: sprime}))

    # Unidentifiable case: accumulate upper and lower bounds.
    upper = 0.0
    lower = 0.0
    for n in N.domains.get_all():
        best, worst = -1, 2  # sentinels outside the [0, 1] range
        for m in M.domains.get_all():
            prob = model.find_prob(
                Event({Y: y}), Event({A: aprime, N: n, M: m, S: s}))
            best = max(prob, best)
            worst = min(prob, worst)
        weight = model.find_prob(Event({N: n}), Event({A: aprime, S: s}))
        upper += best * weight
        lower += worst * weight

    logging.info('Upper bound of counterfactual: %f' % upper)
    logging.info('Lower bound of counterfactual: %f' % lower)
    return upper, lower
def detect_after_remove(cbn, s, sprime, y, aprime, mprime):
    """Evaluate P(Y = y) for the profile on ``cbn``, exactly or as bounds.

    When ``s == sprime`` the joint table is rebuilt and the conditional
    probability of ``Y = y`` given ``A = aprime, M = mprime, S = sprime`` is
    logged and returned. Otherwise an ``(upper, lower)`` pair is returned:
    for each ``n`` the largest / smallest ``find_prob`` of the outcome over
    all ``m`` is weighted by ``find_prob`` of ``n`` given
    ``A = aprime, S = s`` and summed.

    :return: a probability when ``s == sprime``, else ``(p_u, p_l)``.
    """
    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    if s == sprime:
        cbn.build_joint_table()
        logging.info('Identifiable:')
        logging.info('Compute according the Bayesian network: ')
        exact = cbn.get_conditional_prob(
            Event({Y: y}), Event({A: aprime, M: mprime, S: sprime}))
        logging.info(exact)
        return exact

    logging.info('Unidentifiable:')
    upper = 0.0
    lower = 0.0
    for n in N.domains.get_all():
        best, worst = -1, 2  # sentinels outside the [0, 1] range
        for m in M.domains.get_all():
            prob = cbn.find_prob(
                Event({Y: y}), Event({A: aprime, N: n, M: m, S: s}))
            best = max(prob, best)
            worst = min(prob, worst)
        weight = cbn.find_prob(Event({N: n}), Event({A: aprime, S: s}))
        upper += best * weight
        lower += worst * weight

    logging.info('Upper bound of counterfactual: %f' % upper)
    logging.info('Lower bound of counterfactual: %f' % lower)
    return upper, lower
Пример #14
0
def compute_from_observed(s, sprime, y, a1prime, m1prime, m2prime, a2prime):
	"""Evaluate P(income = y) for the profile on the adult model.

	When ``s == sprime`` the conditional probability given the profile and
	``sex = sprime`` is returned. Otherwise an ``(upper, lower)`` pair is
	returned: for each value ``n`` of ``hours`` the largest / smallest
	``find_prob`` of the outcome over all ``(workclass, marital-status)``
	values is weighted by ``find_prob`` of ``n`` given the profile and
	``sex = s``, and the weighted terms are summed.

	:return: a probability when ``s == sprime``, else ``(p_u, p_l)``.
	"""
	model = load_xml_to_cbn (cwd + '/../data/adult/adult.xml')
	model.build_joint_table ()

	A1 = model.v['age']
	A2 = model.v['education']
	S = model.v['sex']
	M1 = model.v['workclass']
	M2 = model.v['marital-status']
	N = model.v['hours']
	Y = model.v['income']

	if s == sprime:
		# Identifiable case: read the answer off the (rebuilt) joint table.
		model.build_joint_table ()
		return model.get_conditional_prob (
			Event ({Y: y}),
			Event ({A1: a1prime, A2: a2prime, M1: m1prime, M2: m2prime, S: sprime}))

	# Unidentifiable case: accumulate upper and lower bounds.
	upper = 0.0
	lower = 0.0
	for n in N.domains.get_all ():
		best, worst = -1, 2  # sentinels outside the [0, 1] range
		for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
			prob = model.find_prob (
				Event ({Y: y}),
				Event ({A1: a1prime, A2: a2prime, N: n, M1: m1, M2: m2, S: s}))
			best = max (prob, best)
			worst = min (prob, worst)
		weight = model.find_prob (
			Event ({N: n}),
			Event ({A1: a1prime, A2: a2prime, M1: m1prime, M2: m2prime, S: s}))
		upper += best * weight
		lower += worst * weight

	return upper, lower
    def random_assign(self, event: Event, target: Event, p: float):
        """Overwrite the target column on a random fraction ``p`` of matching rows.

        Both events are normalised in place by ``convert``. Rows of
        ``self.df`` matching ``event`` are located, ``round(len(rows) * p)``
        of them are drawn without replacement with ``random.sample``, and
        the first key/value pair of ``target`` is written to those rows.

        The assignment uses label-based ``.loc``; the previously used
        ``DataFrame.ix`` indexer no longer exists in pandas >= 1.0.

        :param event: selects the candidate rows.
        :param target: its first key names the column, its first value is written.
        :param p: fraction of the matching rows to overwrite.
        :return: ``None``; nothing happens when no row matches ``event`` or
            the target column is unknown.
        """
        convert(event)
        convert(target)
        grouped = self.df.groupby(event.keys())
        key = tuple(event.values())
        if len(key) == 1:
            # A single grouping key is looked up by its scalar value.
            key = key[0]
        try:
            rows = grouped.get_group(key).index
            chosen = random.sample(list(rows), int(round(len(rows) * p)))
            column = target.keys()[0]
            if column not in self.df.columns:
                # .loc would silently create the column; keep the old no-op instead.
                return
            self.df.loc[chosen, column] = target.values()[0]
        except KeyError:
            # No row matches the event: deliberately nothing to do.
            pass


if __name__ == '__main__':
    # Manual smoke check: print P(Y = 1 | A = 0, N = 0, M = 0, S = 0) as
    # measured on the deterministic synthetic data file.
    frame = pd.read_csv('../../data/synthetic/DeterministicData.txt', sep='\t')
    data = DataSet(frame)
    outcome = Event({'Y': 1})
    evidence = Event({'A': 0, 'N': 0, 'M': 0, 'S': 0})
    print(data.get_conditional_prob(outcome, evidence))
def pearl_detect_classifier(ce_matrix):
    """Write counterfactual effects of the LR and SVM predictions into ``ce_matrix``.

    The deterministic synthetic model is loaded once. For every prediction
    file ``temp/synthetic_test_prediction<i>.csv`` (``i`` in 0..3) and every
    label column, the conditional table of ``Y`` (with ``UY`` fixed to 1) is
    overwritten with the label's conditional probabilities measured on that
    file. The nested helper is then evaluated for the four binary
    ``(aprime, mprime)`` profiles in both directions, and the difference to
    the ``s == sprime`` value is stored.

    :param ce_matrix: object indexed through ``.iloc``; row ``j * 8 + k``
        (``+ 4`` for the reverse direction), column ``3 * i + 2`` is written.
    """
    cbn = load_xml_to_cbn(
        os.path.join(src_path,
                     '../data/synthetic/DeterministicBayesianModel.xml'))
    UA = cbn.v['UA']
    UN = cbn.v['UN']
    UM = cbn.v['UM']
    US = cbn.v['US']
    UY = cbn.v['UY']
    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    cbn.build_joint_table()
    # Group the joint table once; the Event instances are only used to obtain
    # the grouping keys (the values 1 are placeholders).
    # NOTE(review): these groupings are built before Y's table is overwritten
    # below and are not rebuilt afterwards - confirm that is intended.
    event = cbn.jpt.groupby(
        Event({
            UA: 1,
            UN: 1,
            UM: 1,
            US: 1,
            A: 1,
            M: 1,
            S: 1
        }).keys())
    condition = cbn.jpt.groupby(Event({A: 1, M: 1, S: 1}).keys())

    def pearl_after_remove_(s, sprime, y, aprime, mprime):
        """Sum the ``find_prob`` factor products weighted by the exogenous posterior.

        The weight ``ps`` is the ratio of the summed ``prob`` column of two
        groups of the joint table: (U values + profile) over (profile).
        """
        p = 0.0
        for ua, un, um, us in product(UA.domains.get_all(),
                                      UN.domains.get_all(),
                                      UM.domains.get_all(),
                                      US.domains.get_all()):
            e = Event({
                UA: ua,
                UN: un,
                UM: um,
                US: us,
                A: aprime,
                M: mprime,
                S: sprime
            })
            c = Event({A: aprime, M: mprime, S: sprime})
            # P(ua, un, um, us | aprime, mprime, sprime) from the grouped joint table.
            ps = event.get_group(tuple(
                e.values()))['prob'].sum() / condition.get_group(
                    tuple(c.values()))['prob'].sum()

            for a, n, m in product(A.domains.get_all(), N.domains.get_all(),
                                   M.domains.get_all()):
                # UY is fixed to 1 in the outcome factor.
                p += cbn.find_prob (Event ({A: a}), Event ({UA: ua})) * \
                  cbn.find_prob (Event ({M: m}), Event ({S: s, A: a, UM: um})) * \
                  cbn.find_prob (Event ({N: n}), Event ({S: s, A: a, UN: un})) * \
                  cbn.find_prob (Event ({Y: y}), Event ({S: s, A: a, N: n, M: m, UY: 1})) * \
                  ps
        return p

    for i in [0, 1, 2, 3]:  # one prediction file per index
        test = DataSet(pd.read_csv('temp/synthetic_test_prediction%d.csv' % i))
        for j, label in enumerate(['LR', 'SVM']):  # two classifiers
            # Overwrite the table of Y with the label's empirical conditionals
            # before evaluating the effects.
            for a, n, m, s, y in product(A.domains.get_all(),
                                         N.domains.get_all(),
                                         M.domains.get_all(),
                                         S.domains.get_all(),
                                         Y.domains.get_all()):
                cbn.set_conditional_prob(
                    Event({Y: y}), Event({
                        A: a,
                        M: m,
                        N: n,
                        S: s,
                        UY: 1
                    }),
                    test.get_conditional_prob(
                        Event({label: y}),
                        Event({
                            'A': a,
                            'M': m,
                            'N': n,
                            'S': s
                        })))

            # Direction 1: value under s=spos for sprime=sneg, minus the sneg/sneg value.
            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                ce = pearl_after_remove_ (s=spos, sprime=sneg, y=1, aprime=aprime, mprime=mprime) - \
                  pearl_after_remove_ (s=sneg, sprime=sneg, y=1, aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k, 3 * i + 2] = ce

            # Direction 2: the roles of spos and sneg are swapped; rows are offset by 4.
            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                ce = pearl_after_remove_ (s=sneg, sprime=spos, y=1, aprime=aprime, mprime=mprime) - \
                  pearl_after_remove_ (s=spos, sprime=spos, y=1, aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k + 4, 3 * i + 2] = ce
def pearl_three_step(s, sprime, y, aprime, mprime):
    """Sum ``find_prob`` factor products over all exogenous and endogenous values.

    The deterministic synthetic model is loaded and its joint table built.
    For every combination of ``UA, UN, UM, US, UY`` the weight
    ``P(u | A = aprime, M = mprime, S = sprime)`` is read from the model.
    It multiplies, for every ``(a, n, m)``, the ``find_prob`` factors of
    ``A``, ``M``, ``N`` and ``Y = y`` under ``S = s``. The result is logged
    and returned.

    :return: the accumulated sum.
    """
    model = load_xml_to_cbn(
        cwd + '/../data/synthetic/DeterministicBayesianModel.xml')

    UA = model.v['UA']
    UN = model.v['UN']
    UM = model.v['UM']
    US = model.v['US']
    UY = model.v['UY']

    A = model.v['A']
    S = model.v['S']
    N = model.v['N']
    M = model.v['M']
    Y = model.v['Y']

    # Disabled validation path (was kept as a bare string literal, never executed):
    #
    # if s == sprime:
    #     print ('Identifiable:')
    #     print ("Let's validate pearl's three step by data")
    #     data = DataSet (pd.read_csv ('../data/synthetic/DeterministicData.txt', sep='\t'))
    #     print ('Read from data: ', end='')
    #     print (data.get_conditional_prob (Event ({'Y': y}), Event ({'A': aprime, 'M': mprime, 'S': sprime})))
    #
    #     p = 0.0
    #     for ua, un, um, us, uy in product (UA.domains.get_all (), UN.domains.get_all (), UM.domains.get_all (), US.domains.get_all (), UY.domains.get_all ()):
    #         ps = data.get_conditional_prob (
    #             Event ({'UA': ua.index, 'UN': un.index, 'UM': um.index, 'US': us.index, 'UY': uy.index}),
    #             Event ({'A': aprime, 'M': mprime, 'S': sprime}))
    #         for a, n, m in product (A.domains.get_all (), N.domains.get_all (), M.domains.get_all ()):
    #             p += deterministic_cbn.find_prob (Event ({A: a}), Event ({UA: ua})) * \
    #                  deterministic_cbn.find_prob (Event ({M: m}), Event ({S: s, A: a, UM: um})) * \
    #                  deterministic_cbn.find_prob (Event ({N: n}), Event ({S: s, A: a, UN: un})) * \
    #                  deterministic_cbn.find_prob (Event ({Y: y}), Event ({S: s, A: a, N: n, M: m, UY: uy})) * \
    #                  ps
    #     print ("Pearl's three steps: (U is obtarined from data) %f" % p)

    total = 0.0
    model.build_joint_table()
    exogenous = product(UA.domains.get_all(), UN.domains.get_all(),
                        UM.domains.get_all(), US.domains.get_all(),
                        UY.domains.get_all())
    for ua, un, um, us, uy in exogenous:
        # Weight of this exogenous assignment given the observed profile.
        ps = model.get_conditional_prob(
            Event({UA: ua.index, UN: un.index, UM: um.index, US: us.index, UY: uy.index}),
            Event({A: aprime, M: mprime, S: sprime}))

        for a, n, m in product(A.domains.get_all(), N.domains.get_all(),
                               M.domains.get_all()):
            p_a = model.find_prob(Event({A: a}), Event({UA: ua}))
            p_m = model.find_prob(Event({M: m}), Event({S: s, A: a, UM: um}))
            p_n = model.find_prob(Event({N: n}), Event({S: s, A: a, UN: un}))
            p_y = model.find_prob(Event({Y: y}), Event({S: s, A: a, N: n, M: m, UY: uy}))
            total += p_a * p_m * p_n * p_y * ps

    logging.info("Pearl's three steps: %f" % total)
    return total
    def pearl_after_remove_(s, sprime, y, aprime, mprime):
        """Sum the ``find_prob`` factor products weighted by the exogenous posterior.

        For every combination of ``UA, UN, UM, US`` the weight is the ratio
        of the summed ``prob`` column of two groups taken from the enclosing
        ``event`` and ``condition`` groupings: (U values + profile) over
        (profile). It multiplies, for every ``(a, n, m)``, the
        ``cbn.find_prob`` factors of ``A``, ``M``, ``N`` and ``Y = y`` under
        ``S = s``, with ``UY`` fixed to 1.

        :return: the accumulated sum.
        """
        total = 0.0
        exogenous = product(UA.domains.get_all(), UN.domains.get_all(),
                            UM.domains.get_all(), US.domains.get_all())
        for ua, un, um, us in exogenous:
            joint = Event({UA: ua, UN: un, UM: um, US: us,
                           A: aprime, M: mprime, S: sprime})
            observed = Event({A: aprime, M: mprime, S: sprime})
            numerator = event.get_group(tuple(joint.values()))['prob'].sum()
            denominator = condition.get_group(tuple(observed.values()))['prob'].sum()
            ps = numerator / denominator

            for a, n, m in product(A.domains.get_all(), N.domains.get_all(),
                                   M.domains.get_all()):
                p_a = cbn.find_prob(Event({A: a}), Event({UA: ua}))
                p_m = cbn.find_prob(Event({M: m}), Event({S: s, A: a, UM: um}))
                p_n = cbn.find_prob(Event({N: n}), Event({S: s, A: a, UN: un}))
                p_y = cbn.find_prob(Event({Y: y}), Event({S: s, A: a, N: n, M: m, UY: 1}))
                total += p_a * p_m * p_n * p_y * ps
        return total
Пример #19
0
def method3(acc_matrix):
	"""Post-process the LR and SVM predictions with a linear program and record accuracies.

	For each classifier column a linear program is assembled over
	``2 ** 8 + 2 ** 8 // 4`` variables and solved with ``solvers.lp``. The
	solution is read as per-cell probabilities used by
	``DataSet.random_assign`` to overwrite predictions in the training and
	test frames. Accuracies against the ``income`` column are appended in the
	order (LR train, LR test, SVM train, SVM test) and written to column 3 of
	``acc_matrix``; both frames are saved as
	``temp/adult_binary_{train,test}_prediction3.csv``.

	The index arithmetic (and the ``assert`` checks on row counts) assumes
	every variable has exactly two values.

	Fix relative to the previous version: the temporary ``<name>M`` column is
	now really removed (``drop`` returns a new frame unless ``inplace=True``),
	so it no longer ends up in the written CSV files.

	:param acc_matrix: object indexed through ``.iloc``; column 3 is overwritten.
	"""
	df_train = pd.read_csv ('temp/adult_binary_train_prediction0.csv')
	# df_train = pd.concat ([df_train] * 10, ignore_index=True)
	train = DataSet (df_train)
	df_test = pd.read_csv ('temp/adult_binary_test_prediction0.csv')
	# The test frame is replicated three times before post-processing.
	df_test = pd.concat ([df_test] * 3, ignore_index=True)
	test = DataSet (df_test)
	acc = []

	for name in ['LR', 'SVM']:
		probabilistic_cbn = load_xml_to_cbn (os.path.join (src_path, '../data/adult/adult.xml'))

		def find_condition_prob(e, t):
			return probabilistic_cbn.find_prob (e, t)

		def get_loc(e):
			return probabilistic_cbn.get_loc (e)

		A1 = probabilistic_cbn.v['age']
		A2 = probabilistic_cbn.v['education']
		S = probabilistic_cbn.v['sex']
		M1 = probabilistic_cbn.v['workclass']
		M2 = probabilistic_cbn.v['marital-status']
		N = probabilistic_cbn.v['hours']
		Y = probabilistic_cbn.v['income']

		# YH: the classifier's prediction column; YT: the post-processed prediction.
		YH = Variable (name=name, index=Y.index + 1, domains=Y.domains)
		probabilistic_cbn.v[(YH.index, YH.name)] = YH

		YT = Variable (name=name + "M", index=Y.index + 2, domains=Y.domains)
		probabilistic_cbn.v[(YT.index, YT.name)] = YT

		# build linear loss function
		C_vector = np.zeros ((2 ** 8 + 2 ** 8 // 4, 1))
		for a1, a2, n, m1, m2, s in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
											 S.domains.get_all ()):
			# NOTE: get_marginal_prob returns a row count here, not a normalised probability.
			p_x_s = train.get_marginal_prob (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

			# Cost of keeping the prediction (YT == YH): weighted by how often YH differs from Y.
			p_yh_1_y = p_x_s * train.count (Event ({Y: 0, YH: 0}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), 'notequal')
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 0, YT: 0}))
			C_vector[loc] = p_yh_1_y * train.get_conditional_prob (Event ({YH: 0}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 1, YT: 1}))
			C_vector[loc] = p_yh_1_y * train.get_conditional_prob (Event ({YH: 1}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

			# Cost of flipping the prediction (YT != YH): weighted by how often YH equals Y.
			p_yh__y = p_x_s * train.count (Event ({Y: 0, YH: 0}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, N: n, S: s}), 'equal')
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 0, YT: 1}))
			C_vector[loc] = p_yh__y * train.get_conditional_prob (Event ({YH: 0}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 1, YT: 0}))
			C_vector[loc] = p_yh__y * train.get_conditional_prob (Event ({YH: 1}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

		# the inequality of max and min
		G_matrix_1 = np.zeros ((2 ** 8, 2 ** 8 + 2 ** 8 // 4))
		h_1 = np.zeros (2 ** 8)
		# max: each (m1, m2) expression minus the auxiliary variable at offset 2 ** 8 is <= 0
		i = 0
		for a1, a2, n, s, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), S.domains.get_all (), YT.domains.get_all ()):
			for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))
					G_matrix_1[i, loc] = train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: s, YT: yt}))
				G_matrix_1[i, 2 ** 8 + loc] = -1
				i += 1
		# min: the auxiliary variable at offset 2 ** 8 + 2 ** 8 // 8 minus each expression is <= 0
		assert i == 2 ** 8 // 2
		for a1, a2, n, s, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), S.domains.get_all (), YT.domains.get_all ()):
			for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))
					G_matrix_1[i, loc] = -train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: s, YT: yt}))
				G_matrix_1[i, 2 ** 8 + 2 ** 8 // 8 + loc] = 1
				i += 1

		# build counterfactual fairness constraints (each row is bounded by tau)
		G_matrix_2 = np.zeros ((2 ** 4 * 2, 2 ** 8 + 2 ** 8 // 4))
		h_2 = np.ones (2 ** 4 * 2) * tau

		# First 16 rows: weighted "max" variables under spos minus the sneg expression.
		i = 0
		for a1, a2, m1, m2 in product (A1.domains.get_all (), A2.domains.get_all (), M1.domains.get_all (), M2.domains.get_all ()):
			for n in N.domains.get_all ():
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: spos, YT: yt_pos}))
				G_matrix_2[i, 2 ** 8 + loc] = find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))

				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg, YH: yh, YT: yt_pos}))
					G_matrix_2[i, loc] = -find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
										 * train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg}))
			i += 1

		# Next 16 rows: the sneg expression minus the weighted "min" variables under spos.
		assert i == 2 ** 4
		for a1, a2, m1, m2 in product (A1.domains.get_all (), A2.domains.get_all (), M1.domains.get_all (), M2.domains.get_all ()):
			for n in N.domains.get_all ():
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: spos, YT: yt_pos}))
				G_matrix_2[i, 2 ** 8 + 2 ** 8 // 8 + loc] = -find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))

				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg, YH: yh, YT: yt_pos}))
					G_matrix_2[i, loc] = find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
										 * train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg}))
			i += 1

		###########

		# mapping in [0, 1]: x_i <= 1 and -x_i <= 0 for every variable
		G_matrix_3 = np.zeros ((2 * (2 ** 8 + 2 ** 8 // 4), 2 ** 8 + 2 ** 8 // 4))
		h_3 = np.zeros (2 * (2 ** 8 + 2 ** 8 // 4))

		for i in range (2 ** 8 + 2 ** 8 // 4):
			G_matrix_3[i, i] = 1
			h_3[i] = 1

			G_matrix_3[2 ** 8 + 2 ** 8 // 4 + i, i] = -1
			h_3[2 ** 8 + 2 ** 8 // 4 + i] = 0

		# sum = 1: for every (x, s, yh) cell the variables over yt add up to one
		A_matrix = np.zeros ((2 ** 8 // 2, 2 ** 8 + 2 ** 8 // 4))
		b = np.ones (2 ** 8 // 2)

		i = 0
		for a1, a2, n, m1, m2, s, yh in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
												 S.domains.get_all (),
												 YH.domains.get_all ()):
			for yt in YT.domains.get_all ():
				A_matrix[i, get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 1
			i += 1

		assert i == 2 ** 8 // 2

		# combine the inequality constraints
		G_matrix = np.vstack ([G_matrix_1, G_matrix_2, G_matrix_3])
		h = np.hstack ([h_1, h_2, h_3])

		# Test
		# print (np.linalg.matrix_rank (A_matrix), A_matrix.shape[0])
		# print (np.linalg.matrix_rank (np.vstack ([A_matrix, G_matrix])), A_matrix.shape[1])

		# def check():
		# 	sol = np.zeros (2 ** 8 + 2 ** 8 // 4)
		# 	for a1, a2, n, m1, m2, s, yh, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
		# 												 S.domains.get_all (), YH.domains.get_all (), YT.domains.get_all ()):
		# 		if yh.name == yt.name:
		# 			sol[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 1.0
		# 		else:
		# 			sol[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 0.0
		#
		# 	for a1, a2, n, s, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), S.domains.get_all (), YT.domains.get_all ()):
		# 		p_min = 1
		# 		p_max = 0
		# 		for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
		# 			p = 0.0
		# 			for yh in YH.domains.get_all ():
		# 				p = train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})) \
		# 					* sol[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))]
		# 			if p < p_min:
		# 				p_min = p
		# 			if p > p_max:
		# 				p_max = p
		# 		loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: s, YT: yt}))
		# 		sol[2 ** 8 + loc] = p_max
		# 		sol[2 ** 8 + 2 ** 8 // 8 + loc] = p_min
		#
		# 	np.dot (G_matrix_2, sol)

		# check ()

		# solver
		solvers.options['show_progress'] = False
		# NOTE(review): `solver=solvers` passes the module object rather than a
		# solver name; left unchanged here - confirm against the LP library docs.
		sol = solvers.lp (c=matrix (C_vector),
						  G=matrix (G_matrix),
						  h=matrix (h),
						  A=matrix (A_matrix),
						  b=matrix (b),
						  solver=solvers
						  )
		mapping = np.array (sol['x'])

		# build the post-processing result in training and testing
		train.df.loc[:, name + 'M'] = train.df[name]
		test.df[name + 'M'] = test.df[name]
		for a1, a2, n, m1, m2, s, yh, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
													 S.domains.get_all (), YH.domains.get_all (), YT.domains.get_all ()):
			# Only the "flip" cells change anything: reassign that fraction of rows.
			if yh.name != yt.name:
				p = mapping[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt})), 0]
				train.random_assign (Event ({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), Event ({YT: yt}), p)
				test.random_assign (Event ({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), Event ({YT: yt}), p)

		# Copy the post-processed column back and really remove the temporary
		# one (drop without inplace=True returned a copy that was discarded).
		train.df[name] = train.df[name + 'M']
		train.df.drop ([name + 'M'], axis=1, inplace=True)
		test.df[name] = test.df[name + 'M']
		test.df.drop ([name + 'M'], axis=1, inplace=True)
		acc.append (accuracy_score (train.df[name], train.df[Y.name]))
		acc.append (accuracy_score (test.df[name], test.df[Y.name]))

	acc_matrix.iloc[:, 3] = acc
	train.df.to_csv ('temp/adult_binary_train_prediction3.csv', index=False)
	test.df.to_csv ('temp/adult_binary_test_prediction3.csv', index=False)
Пример #20
0
def infer_with_complete_graph(y_val, s1_val, s0_val, ow, oa):
    """Accumulate two probability sums over the exogenous variables.

    The model named by ``complete_model`` is loaded and its joint table is
    built. Every combination of values of ``US``, ``UW``, ``UA`` and ``UY``
    is then visited; each contribution is scaled by the probability of
    ``(us, uw, ua)`` given ``S=s0_val, W=ow, A=oa``, and combinations for
    which that probability is exactly zero are skipped.

    Args:
        y_val: value of ``Y`` whose probability is accumulated.
        s1_val: value of ``S`` used in the ``W`` and ``Y`` factors of the
            first sum.
        s0_val: value of ``S`` used in the conditioning event and in the
            second sum.
        ow: value of ``W`` in the conditioning event.
        oa: value of ``A`` in the conditioning event.

    Returns:
        The tuple ``(p1, p2)``: the sum built with ``s1_val`` and the sum
        built with ``s0_val``.

    Raises:
        AssertionError: if the second sum is not within 0.001 of the
            model's probability of ``Y=y_val`` given
            ``S=s0_val, W=ow, A=oa``.
    """
    model = load_xml_to_cbn(complete_model)

    US, UW = model.v['US'], model.v['UW']
    UA, UY = model.v['UA'], model.v['UY']

    S, W = model.v['S'], model.v['W']
    A, Y_hat = model.v['A'], model.v['Y']

    model.build_joint_table()
    prob = model.get_prob

    active_sum = 0.0
    reference_sum = 0.0
    for us, uw, ua, uy in product(US.domains.get_all(), UW.domains.get_all(),
                                  UA.domains.get_all(), UY.domains.get_all()):
        # weight of this exogenous setting given the conditioning event
        posterior = prob(Event({US: us, UW: uw, UA: ua}),
                         Event({S: s0_val, W: ow, A: oa}))
        if posterior == 0.00000:
            continue

        # first sum: S is set to s1_val in the W and Y factors
        for w1, a1 in product(W.domains.get_all(), A.domains.get_all()):
            w_factor = prob(Event({W: w1}), Event({UW: uw, S: s1_val}))
            a_factor = prob(Event({A: a1}), Event({UA: ua, W: w1}))
            y_factor = prob(Event({Y_hat: y_val}),
                            Event({UY: uy, S: s1_val, W: w1, A: a1}))
            uy_factor = prob(Event({UY: uy}), Event({}))
            active_sum += w_factor * a_factor * y_factor * uy_factor * posterior

        # second sum: S stays at s0_val; the Y factor is not conditioned on UY
        for w0, a0 in product(W.domains.get_all(), A.domains.get_all()):
            w_factor = prob(Event({W: w0}), Event({UW: uw, S: s0_val}))
            a_factor = prob(Event({A: a0}), Event({UA: ua, W: w0}))
            y_factor = prob(Event({Y_hat: y_val}),
                            Event({S: s0_val, W: w0, A: a0}))
            uy_factor = prob(Event({UY: uy}), Event({}))
            reference_sum += w_factor * a_factor * y_factor * uy_factor * posterior

    # sanity check of the second sum against the model's own conditional
    assert abs(reference_sum - prob(Event({Y_hat: y_val}),
                                    Event({S: s0_val, W: ow, A: oa}))) < 0.001
    return active_sum, reference_sum
def infer_with_complete_graph(y_val, s1_val, s0_val):
    """Accumulate two probability sums from the complete model's tables.

    The model named by ``complete_model`` is loaded (its joint table is not
    built here). For every combination of values of ``UW``, ``UA`` and
    ``UY``, products of ``get_prob`` lookups are added up, each scaled by
    the unconditional probabilities of ``uw``, ``ua`` and ``uy``.

    Args:
        y_val: value of ``Y`` whose probability is accumulated.
        s1_val: value of ``S`` used in the first sum (for ``w1`` and ``Y``).
        s0_val: value of ``S`` used for ``w0`` in the first sum and
            throughout the second sum.

    Returns:
        The tuple ``(p1, p2)``: the first (mixed ``s1_val``/``s0_val``) sum
        and the second (``s0_val`` only) sum.
    """
    model = load_xml_to_cbn(complete_model)

    # US and S are looked up exactly as before even though the sums below
    # never use them, so a model lacking them still fails at the same point.
    US, UW = model.v['US'], model.v['UW']
    UA, UY = model.v['UA'], model.v['UY']

    S, W = model.v['S'], model.v['W']
    A, Y_hat = model.v['A'], model.v['Y']

    prob = model.get_prob
    mixed_sum = 0.0
    reference_sum = 0.0

    for uw, ua, uy in product(UW.domains.get_all(), UA.domains.get_all(),
                              UY.domains.get_all()):
        # first sum: A follows w1 (drawn under s1_val) while Y sees w0
        # (drawn under s0_val) together with s1_val
        for w1, w0, a1 in product(W.domains.get_all(), W.domains.get_all(),
                                  A.domains.get_all()):
            w1_factor = prob(Event({W: w1}), Event({UW: uw, S: s1_val}))
            a_factor = prob(Event({A: a1}), Event({UA: ua, W: w1}))
            w0_factor = prob(Event({W: w0}), Event({UW: uw, S: s0_val}))
            y_factor = prob(Event({Y_hat: y_val}),
                            Event({UY: uy, S: s1_val, W: w0, A: a1}))
            uw_factor = prob(Event({UW: uw}), Event({}))
            ua_factor = prob(Event({UA: ua}), Event({}))
            uy_factor = prob(Event({UY: uy}), Event({}))
            mixed_sum += (w1_factor * a_factor * w0_factor * y_factor *
                          uw_factor * ua_factor * uy_factor)

        # second sum: everything is evaluated under s0_val
        for w0, a0 in product(W.domains.get_all(), A.domains.get_all()):
            w_factor = prob(Event({W: w0}), Event({UW: uw, S: s0_val}))
            a_factor = prob(Event({A: a0}), Event({UA: ua, W: w0}))
            y_factor = prob(Event({Y_hat: y_val}),
                            Event({UY: uy, S: s0_val, W: w0, A: a0}))
            uw_factor = prob(Event({UW: uw}), Event({}))
            ua_factor = prob(Event({UA: ua}), Event({}))
            uy_factor = prob(Event({UY: uy}), Event({}))
            reference_sum += (w_factor * a_factor * y_factor *
                              uw_factor * ua_factor * uy_factor)

    # NOTE: a consistency check of the second sum against the joint table's
    # probability of Y=y_val given S=s0_val used to live here; it is disabled.
    return mixed_sum, reference_sum
def test():
    """Print the same four probability tables from three different sources.

    The tables are: ``S``; ``W`` given ``S``; ``A`` given ``W``; and ``Y``
    given ``S, W, A``. They are printed three times so the outputs can be
    compared by eye:

    1. from the complete model's conditional tables, summing each exogenous
       variable out by hand,
    2. from the complete model after ``build_joint_table()``,
    3. from the partial model after ``build_joint_table()``.

    Nothing is returned; all output goes to stdout.
    """

    def print_conditionals(cbn, s_var, w_var, a_var, y_var):
        # Print W|S, A|W and Y|S,W,A straight from ``cbn.get_prob``.
        for s, w in product(s_var.domains.get_all(), w_var.domains.get_all()):
            print(cbn.get_prob(Event({w_var: w}), Event({s_var: s})))
        print()

        for w, a in product(w_var.domains.get_all(), a_var.domains.get_all()):
            print(cbn.get_prob(Event({a_var: a}), Event({w_var: w})))
        print()

        for s, w, a, y in product(s_var.domains.get_all(),
                                  w_var.domains.get_all(),
                                  a_var.domains.get_all(),
                                  y_var.domains.get_all()):
            print(cbn.get_prob(Event({y_var: y}),
                               Event({s_var: s, w_var: w, a_var: a})))
        print()
        print()

    # ---- complete model -------------------------------------------------
    full = load_xml_to_cbn(complete_model)

    US, UW = full.v['US'], full.v['UW']
    UA, UY = full.v['UA'], full.v['UY']

    S, W = full.v['S'], full.v['W']
    A, Y_hat = full.v['A'], full.v['Y']

    print('Generate using conditional cpts')
    for s in S.domains.get_all():
        total = 0.0
        for us in US.domains.get_all():
            given_u = full.get_prob(Event({S: s}), Event({US: us}))
            total = total + given_u * full.get_prob(Event({US: us}), Event({}))
        print(total)
    print()

    for s, w in product(S.domains.get_all(), W.domains.get_all()):
        total = 0.0
        for uw in UW.domains.get_all():
            given_u = full.get_prob(Event({W: w}), Event({UW: uw, S: s}))
            total = total + given_u * full.get_prob(Event({UW: uw}), Event({}))
        print(total)
    print()

    for w, a in product(W.domains.get_all(), A.domains.get_all()):
        total = 0.0
        for ua in UA.domains.get_all():
            given_u = full.get_prob(Event({A: a}), Event({UA: ua, W: w}))
            total = total + given_u * full.get_prob(Event({UA: ua}), Event({}))
        print(total)
    print()

    for s, w, a, y in product(S.domains.get_all(), W.domains.get_all(),
                              A.domains.get_all(), Y_hat.domains.get_all()):
        total = 0.0
        for uy in UY.domains.get_all():
            given_u = full.get_prob(Event({Y_hat: y}),
                                    Event({UY: uy, S: s, W: w, A: a}))
            total = total + given_u * full.get_prob(Event({UY: uy}), Event({}))
        print(total)

    print()
    print()

    # ---- complete model, via its joint table ----------------------------
    print('Generate using joint cpt')
    full.build_joint_table()
    for s in S.domains.get_all():
        print(full.get_prob(Event({S: s})))
    print()
    print_conditionals(full, S, W, A, Y_hat)

    # ---- partial model --------------------------------------------------
    partial = load_xml_to_cbn(partial_model)
    partial.build_joint_table()
    S = partial.v['S']
    W = partial.v['W']
    A = partial.v['A']
    Y_hat = partial.v['Y']

    print('Generate using partial SCM')
    for s in S.domains.get_all():
        print(partial.get_prob(Event({S: s}), Event({})))
    print()
    print_conditionals(partial, S, W, A, Y_hat)
Пример #23
0
def tight_infer_with_partial_graph(y_val, s1_val, s0_val, oa, oe, om):
    """Bound an effect of ``sex`` on ``income`` by optimising over P(r).

    The partial model named by ``partial_model`` is loaded and its joint
    table built. A vector ``PR`` of ``Marital.domain_size ** 8`` unknowns is
    introduced, one per entry of the response-function table built for
    ``marital-status``. A linear objective in ``PR`` is minimised and then
    maximised with ``cvx`` subject to:

    * agreement with the model's joint probability of
      ``(age, education, marital-status, sex)``,
    * every entry of ``PR`` lying in ``[0, 1]``.

    The exponent 8 is hard-coded; presumably it is the number of joint
    configurations of marital-status' parents -- TODO confirm against the
    model file.

    Args:
        y_val: value of ``income`` of interest.
        s1_val: value of ``sex`` used in the first (added) term.
        s0_val: value of ``sex`` used in the second (subtracted) term and in
            the conditioning event.
        oa: value of ``age`` in the conditioning event.
        oe: value of ``education`` in the conditioning event.
        om: value of ``marital-status`` in the conditioning event.

    Returns:
        ``(upper, lower)``: the maximised and minimised objective values as
        reported by ``problem.value``; ``(0.0, 0.0)`` when
        ``s1_val == s0_val``.
    """
    partial_cbn = load_xml_to_cbn(partial_model)
    partial_cbn.build_joint_table()

    Age = partial_cbn.v['age']
    Edu = partial_cbn.v['education']
    Sex = partial_cbn.v['sex']
    Workclass = partial_cbn.v['workclass']
    Marital = partial_cbn.v['marital-status']
    Hours = partial_cbn.v['hours']
    Income = partial_cbn.v['income']

    # identical values: nothing to compare, so the effect is zero
    if s1_val == s0_val:
        return 0.00, 0.00

    # number of unknowns in P(r)
    n_resp = Marital.domain_size**8
    PR = cvx.Variable(n_resp)

    # Response-function table for marital-status: entry r is a tuple with
    # one output value per joint configuration of its parents.
    marital_parents = partial_cbn.index_graph.pred[Marital.index].keys()
    n_parent_configs = np.prod(
        [partial_cbn.v[i].domain_size for i in marital_parents])
    g = {
        Marital.index: list(
            product(range(Marital.domain_size), repeat=int(n_parent_configs)))
    }

    def Indicator(obs, parents, response):
        # 1 when response function ``response`` maps ``parents`` to the
        # value carried by ``obs``, otherwise 0.
        par_index = 0
        for k in parents.keys():
            # mixed-radix position of this parent configuration
            par_index = par_index * partial_cbn.v[k].domain_size + parents.dict[k]
        observed = obs.first_value()
        predicted = g[obs.first_key()][response][par_index]
        return 1 if observed == predicted else 0

    def path_sum(s_val, rm):
        # Sum over marital-status, workclass and hours of the factorised
        # probability with ``sex`` set to ``s_val`` downstream; the leading
        # factors always use the conditioning values.
        total = 0.0
        for m, w, h in product(Marital.domains.get_all(),
                               Workclass.domains.get_all(),
                               Hours.domains.get_all()):
            term = partial_cbn.get_prob(Event({Sex: s0_val}), Event({})) * \
                   partial_cbn.get_prob(Event({Age: oa}), Event({})) * \
                   partial_cbn.get_prob(Event({Edu: oe}), Event({Age: oa})) * \
                   Indicator(Event({Marital: m}), Event({Sex: s_val, Age: oa, Edu: oe}), rm) * \
                   partial_cbn.get_prob(Event({Workclass: w}), Event({Age: oa, Edu: oe, Marital: m})) * \
                   partial_cbn.get_prob(Event({Hours: h}), Event({Workclass: w, Edu: oe, Marital: m, Age: oa, Sex: s_val})) * \
                   partial_cbn.get_prob(Event({Income: y_val}), Event({Sex: s_val, Edu: oe, Workclass: w, Marital: m, Hours: h, Age: oa}))
            total += term
        return total

    # ---- objective coefficients -------------------------------------------
    weights = np.zeros(shape=[n_resp])
    for rm in range(n_resp):
        # keep only response functions that reproduce the observed
        # marital-status under the conditioning values
        if Indicator(Event({Marital: om}),
                     Event({Sex: s0_val, Age: oa, Edu: oe}), rm) != 1:
            continue
        weights[rm] += path_sum(s1_val, rm)
        weights[rm] -= path_sum(s0_val, rm)

    evidence_prob = partial_cbn.get_prob(
        Event({Sex: s0_val, Age: oa, Edu: oe, Marital: om}))
    objective = weights.reshape(1, -1) @ PR / evidence_prob

    # ---- constraints --------------------------------------------------------
    # consistency with the observational distribution
    A_mat = np.zeros(
        (Age.domain_size, Edu.domain_size, Marital.domain_size,
         Sex.domain_size, n_resp))
    b_vex = np.zeros((Age.domain_size, Edu.domain_size,
                      Marital.domain_size, Sex.domain_size))

    for a, e, m, s in product(Age.domains.get_all(), Edu.domains.get_all(),
                              Marital.domains.get_all(),
                              Sex.domains.get_all()):
        # right-hand side: joint probability of this configuration
        b_vex[a.index, e.index, m.index, s.index] = partial_cbn.get_prob(
            Event({Age: a, Edu: e, Marital: m, Sex: s}))
        # left-hand side: coefficient of every P(r)
        for rm in range(n_resp):
            coeff = partial_cbn.get_prob(Event({Sex: s}), Event({})) * \
                    partial_cbn.get_prob(Event({Age: a}), Event({})) * \
                    partial_cbn.get_prob(Event({Edu: e}), Event({Age: a})) * \
                    Indicator(Event({Marital: m}), Event({Sex: s, Age: a, Edu: e}), rm)
            A_mat[a.index, e.index, m.index, s.index, rm] = coeff

    # one row per observed configuration
    A_mat = A_mat.reshape(-1, n_resp)
    b_vex = b_vex.reshape(-1, 1)

    # each probability is at most one ...
    C_mat = np.identity(n_resp)
    d_vec = np.ones(n_resp)

    # ... and non-negative
    E_mat = np.identity(n_resp)
    f_vec = np.zeros(n_resp)

    constraints = [
        A_mat @ PR == b_vex, C_mat @ PR <= d_vec, E_mat @ PR >= f_vec
    ]

    # smallest attainable value
    problem = cvx.Problem(cvx.Minimize(objective), constraints)
    problem.solve()
    lower = problem.value

    # largest attainable value
    problem = cvx.Problem(cvx.Maximize(objective), constraints)
    problem.solve()
    upper = problem.value

    return upper, lower
def detect_classifier(ce_matrix):
    """Fill ``ce_matrix`` with detection results for the stored predictions.

    For each of the four prediction files
    ``temp/synthetic_test_prediction{0..3}.csv`` and each classifier column
    (``'LR'``, ``'SVM'``), the conditional table of ``Y`` in the synthetic
    Bayesian model is overwritten with the classifier's empirical
    conditional distribution, the joint table is rebuilt, and
    ``detect_after_remove`` is evaluated for every ``(aprime, mprime)`` in
    ``{0, 1} x {0, 1}`` in both directions (``spos`` vs ``sneg`` and the
    reverse).

    Args:
        ce_matrix: object supporting ``.iloc`` assignment; modified in
            place. For dataset ``i`` and classifier ``j`` the rows
            ``j * 8 + k`` (first direction) and ``j * 8 + k + 4`` (second
            direction) receive ``[p_u - p, p_l - p]`` in columns
            ``3 * i`` and ``3 * i + 1``.

    Returns:
        None.
    """
    model_path = os.path.join(
        src_path, '../data/synthetic/ProbabilisticBayesianModel.xml')
    cbn = load_xml_to_cbn(model_path)
    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    for i in [0, 1, 2, 3]:  # one pass per stored prediction file
        test = DataSet(pd.read_csv('temp/synthetic_test_prediction%d.csv' % i))
        cols = slice(3 * i, 3 * i + 2)

        for j, label in enumerate(['LR', 'SVM']):  # one pass per classifier
            # replace the CPT of Y by the classifier's empirical conditional
            for a, n, m, s, y in product(A.domains.get_all(),
                                         N.domains.get_all(),
                                         M.domains.get_all(),
                                         S.domains.get_all(),
                                         Y.domains.get_all()):
                target = Event({Y: y})
                condition = Event({A: a, M: m, N: n, S: s})
                empirical = test.get_conditional_prob(
                    Event({label: y}),
                    Event({'A': a, 'M': m, 'N': n, 'S': s}))
                cbn.set_conditional_prob(target, condition, empirical)
            cbn.build_joint_table()

            # (active value, reference value, row offset) per direction
            directions = [(spos, sneg, 0), (sneg, spos, 4)]
            for active, reference, offset in directions:
                for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                    p_u, p_l = detect_after_remove(cbn=cbn,
                                                   s=active,
                                                   sprime=reference,
                                                   y=1,
                                                   aprime=aprime,
                                                   mprime=mprime)
                    # baseline: both arguments set to the reference value
                    p = detect_after_remove(cbn=cbn,
                                            s=reference,
                                            sprime=reference,
                                            y=1,
                                            aprime=aprime,
                                            mprime=mprime)
                    ce_matrix.iloc[j * 8 + k + offset, cols] = [p_u - p, p_l - p]
def infer_with_complete_graph(y_val, s1_val, s0_val, ow, oa):
    """Accumulate two probability sums over the single variable ``USWAY``.

    The model named by ``complete_model`` is loaded and its joint table
    built. Each value ``u`` of ``USWAY`` contributes with weight equal to
    the probability of ``u`` given ``S=s0_val, W=ow, A=oa``; values with a
    weight of exactly zero are skipped.

    Args:
        y_val: value of ``Y`` whose probability is accumulated.
        s1_val: value of ``S`` used for ``w1`` and for ``Y`` in the first sum.
        s0_val: value of ``S`` used in the conditioning event, for ``w0`` in
            the first sum, and throughout the second sum.
        ow: value of ``W`` in the conditioning event.
        oa: value of ``A`` in the conditioning event.

    Returns:
        The tuple ``(p1, p2)`` of the first and second sums.
    """
    model = load_xml_to_cbn(complete_model)

    U = model.v['USWAY']

    S, W = model.v['S'], model.v['W']
    A, Y_hat = model.v['A'], model.v['Y']

    mixed_sum = 0.0
    reference_sum = 0.0
    model.build_joint_table()
    prob = model.get_prob

    for u in U.domains.get_all():
        # weight of u given the conditioning event (queried through u.index)
        posterior = prob(Event({U: u.index}), Event({S: s0_val, W: ow, A: oa}))
        if posterior == 0.00000:
            continue

        # first sum: A follows w1 (under s1_val), Y sees w0 (under s0_val)
        for w1, w0, a1 in product(W.domains.get_all(), W.domains.get_all(),
                                  A.domains.get_all()):
            w1_factor = prob(Event({W: w1}), Event({U: u, S: s1_val}))
            a_factor = prob(Event({A: a1}), Event({U: u, W: w1}))
            w0_factor = prob(Event({W: w0}), Event({U: u, S: s0_val}))
            y_factor = prob(Event({Y_hat: y_val}),
                            Event({U: u, S: s1_val, W: w0, A: a1}))
            mixed_sum += w1_factor * a_factor * w0_factor * y_factor * posterior

        # second sum: everything under s0_val
        for w0, a0 in product(W.domains.get_all(), A.domains.get_all()):
            w_factor = prob(Event({W: w0}), Event({U: u, S: s0_val}))
            a_factor = prob(Event({A: a0}), Event({U: u, W: w0}))
            y_factor = prob(Event({Y_hat: y_val}),
                            Event({U: u, S: s0_val, W: w0, A: a0}))
            reference_sum += w_factor * a_factor * y_factor * posterior

    return mixed_sum, reference_sum
def tight_infer_with_partial_graph(y_val, s1_val, s0_val, ow, oa):
    """Bound an effect of ``S`` on ``Y`` by optimising over P(r).

    The partial model named by ``partial_model`` is loaded and its joint
    table built. A response-function table is enumerated for each of
    ``S``, ``W``, ``A`` and ``Y`` from its parents in
    ``partial_cbn.index_graph``; the unknown vector ``PR`` has one entry
    per combination ``(rs, rw, ra, ry)`` of response-function indices. A
    linear objective in ``PR`` is minimised and then maximised with
    ``cvx`` subject to:

    * agreement with the model's joint probability of ``(S, W, A, Y)``,
    * every entry of ``PR`` lying in ``[0, 1]``.

    Args:
        y_val: value of ``Y`` of interest.
        s1_val: value of ``S`` used for ``w1`` and ``Y`` in the first
            (added) term.
        s0_val: value of ``S`` used in the conditioning event and in the
            second (subtracted) term.
        ow: value of ``W`` in the conditioning event.
        oa: value of ``A`` in the conditioning event.

    Returns:
        ``(upper, lower)``: the maximised and minimised objective values as
        reported by ``problem.value``; ``(0.0, 0.0)`` when
        ``s1_val == s0_val``.
    """
    partial_cbn = load_xml_to_cbn(partial_model)
    partial_cbn.build_joint_table()

    S = partial_cbn.v['S']
    W = partial_cbn.v['W']
    A = partial_cbn.v['A']
    Y_hat = partial_cbn.v['Y']

    # identical values: nothing to compare, so the effect is zero
    if s1_val == s0_val:
        return 0.00, 0.00

    # sizes of the four response-index axes and of the flattened P(r)
    n_rs = S.domain_size
    n_rw = W.domain_size ** S.domain_size
    n_ra = A.domain_size ** W.domain_size
    n_ry = Y_hat.domain_size ** (S.domain_size * W.domain_size * A.domain_size)
    n_total = n_rs * n_rw * n_ra * n_ry
    n_observed = S.domain_size * W.domain_size * A.domain_size * Y_hat.domain_size

    PR = cvx.Variable(n_total)

    # Response-function tables: for each variable, entry r is a tuple with
    # one output value per joint configuration of that variable's parents.
    g = {}
    for var in (S, Y_hat, W, A):
        parent_ids = partial_cbn.index_graph.pred[var.index].keys()
        n_parent_configs = np.prod(
            [partial_cbn.v[i].domain_size for i in parent_ids])
        g[var.index] = list(
            product(range(var.domain_size), repeat=int(n_parent_configs)))

    def Indicator(obs, parents, response):
        # 1 when response function ``response`` maps ``parents`` to the
        # value carried by ``obs``, otherwise 0.
        par_index = 0
        for k in parents.keys():
            # mixed-radix position of this parent configuration
            par_index = par_index * partial_cbn.v[k].domain_size + parents.dict[k]
        observed = obs.first_value()
        predicted = g[obs.first_key()][response][par_index]
        return 1 if observed == predicted else 0

    def all_match(checks):
        # Product of the indicators of every (obs, parents, response) triple.
        result = 1
        for obs, parents, response in checks:
            result *= Indicator(obs, parents, response)
        return result

    def response_grid():
        # Every combination of response indices (rs, rw, ra, ry).
        return product(range(n_rs), range(n_rw), range(n_ra), range(n_ry))

    # ---- objective coefficients -------------------------------------------
    weights = np.zeros(shape=[n_rs, n_rw, n_ra, n_ry])

    for rs, rw, ra, ry in response_grid():
        # keep only responses that reproduce the conditioning event
        compatible = all_match([(Event({S: s0_val}), Event({}), rs),
                                (Event({W: ow}), Event({S: s0_val}), rw),
                                (Event({A: oa}), Event({W: ow}), ra)])
        if compatible != 1:
            continue

        # first term: A follows w1 (under s1_val), Y sees w0 (under s0_val)
        first_term = 0.0
        for w1, w0, a1 in product(W.domains.get_all(), W.domains.get_all(),
                                  A.domains.get_all()):
            first_term += all_match([
                (Event({W: w1}), Event({S: s1_val}), rw),
                (Event({A: a1}), Event({W: w1}), ra),
                (Event({W: w0}), Event({S: s0_val}), rw),
                (Event({Y_hat: y_val}), Event({S: s1_val, W: w0, A: a1}), ry),
            ])
        weights[rs, rw, ra, ry] = weights[rs, rw, ra, ry] + first_term

        # second term: everything under s0_val
        second_term = 0.0
        for w0, a0 in product(W.domains.get_all(), A.domains.get_all()):
            second_term += all_match([
                (Event({W: w0}), Event({S: s0_val}), rw),
                (Event({A: a0}), Event({W: w0}), ra),
                (Event({Y_hat: y_val}), Event({S: s0_val, W: w0, A: a0}), ry),
            ])
        weights[rs, rw, ra, ry] -= second_term

    evidence_prob = partial_cbn.get_prob(Event({S: s0_val, W: ow, A: oa}))
    objective = weights.reshape(1, -1) @ PR / evidence_prob

    # ---- constraints --------------------------------------------------------
    # consistency with the observational distribution
    A_mat = np.zeros((S.domain_size, W.domain_size, A.domain_size,
                      Y_hat.domain_size, n_rs, n_rw, n_ra, n_ry))
    b_vex = np.zeros((S.domain_size, W.domain_size, A.domain_size,
                      Y_hat.domain_size))

    for s, w, a, y in product(S.domains.get_all(),
                              W.domains.get_all(),
                              A.domains.get_all(),
                              Y_hat.domains.get_all()):
        # right-hand side: joint probability of this configuration
        b_vex[s.index, w.index, a.index, y.index] = partial_cbn.get_prob(
            Event({S: s, W: w, A: a, Y_hat: y}))
        # left-hand side: which response combinations produce it
        for rs, rw, ra, ry in response_grid():
            A_mat[s.index, w.index, a.index, y.index, rs, rw, ra, ry] = all_match([
                (Event({S: s}), Event({}), rs),
                (Event({W: w}), Event({S: s}), rw),
                (Event({A: a}), Event({W: w}), ra),
                (Event({Y_hat: y}), Event({S: s, W: w, A: a}), ry),
            ])

    # one row per observed configuration, one column per entry of P(r)
    A_mat = A_mat.reshape(n_observed, n_total)
    b_vex = b_vex.reshape(-1, 1)

    # each probability is at most one ...
    C_mat = np.identity(n_total)
    d_vec = np.ones(n_total)

    # ... and non-negative
    E_mat = np.identity(n_total)
    f_vec = np.zeros(n_total)

    constraints = [
        A_mat @ PR == b_vex,
        C_mat @ PR <= d_vec,
        E_mat @ PR >= f_vec
    ]

    # smallest attainable value
    problem = cvx.Problem(cvx.Minimize(objective), constraints)
    problem.solve()
    lower = problem.value

    # largest attainable value
    problem = cvx.Problem(cvx.Maximize(objective), constraints)
    problem.solve()
    upper = problem.value

    return upper, lower
def method3(acc_matrix):
    """Post-process the 'LR' and 'SVM' predictions of the synthetic data via an LP.

    For each classifier column (``'LR'``, then ``'SVM'``) the function builds a
    linear program over per-cell probabilities of re-assigning a prediction
    ``YH`` to a new label ``YT``, solves it with ``cvxopt.solvers.lp``, and
    applies the resulting probabilities to both the training and the test
    frame through ``DataSet.random_assign``. The accuracies after
    post-processing are collected and the modified frames are written to
    ``temp/synthetic_{train,test}_prediction3.csv``.

    Args:
        acc_matrix: Object supporting ``.iloc[:, 3] = values`` (presumably a
            pandas DataFrame -- confirm against the caller). Column 3 receives
            four accuracies in the order LR-train, LR-test, SVM-train,
            SVM-test.

    Raises:
        RuntimeError: If the LP solver returns no primal solution.

    Notes:
        Relies on names defined elsewhere in the file: ``tau``, ``spos``,
        ``sneg``, ``yt_pos`` and ``src_path``. The hard-coded powers of two
        presumably assume that A, M, N, S, YH and YT are all binary -- TODO
        confirm against the model file.
    """
    df_train = pd.read_csv('temp/synthetic_train_prediction0.csv')
    train = DataSet(df_train)
    df_test = pd.read_csv('temp/synthetic_test_prediction0.csv')
    test = DataSet(df_test)
    acc = []

    # Layout of the LP unknowns:
    #   [0, n_map)                      re-assignment probabilities, indexed
    #                                   by get_loc over (A, M, N, S, YH, YT)
    #   [max_offset, max_offset+n_aux)  auxiliary "max" variables, indexed by
    #                                   get_loc over (A, N, S, YT)
    #   [min_offset, min_offset+n_aux)  auxiliary "min" variables, same index
    n_map = 2**6
    n_aux = 2**6 // 4
    n_vars = n_map + 2 * n_aux
    max_offset = n_map
    min_offset = n_map + n_aux

    for name in ['LR', 'SVM']:
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path,
                         '../data/synthetic/ProbabilisticBayesianModel.xml'))

        find_condition_prob = probabilistic_cbn.find_prob
        get_loc = probabilistic_cbn.get_loc

        A = probabilistic_cbn.v['A']
        S = probabilistic_cbn.v['S']
        N = probabilistic_cbn.v['N']
        M = probabilistic_cbn.v['M']
        Y = probabilistic_cbn.v['Y']

        # Register the prediction (YH) and the post-processed prediction (YT)
        # as extra variables sharing Y's domain.
        YH = Variable(name='YH', index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH

        YT = Variable(name='YT', index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        def observed(a, m, n, s):
            # Column-name keyed event used to query the DataSet objects.
            return Event({'A': a, 'M': m, 'N': n, 'S': s})

        def cell(a, m, n, s, yh, yt):
            # Index of a re-assignment probability inside the LP vector.
            return get_loc(Event({A: a, M: m, N: n, S: s, YH: yh, YT: yt}))

        def aux_cell(a, n, s, yt):
            # Index (before adding max_offset/min_offset) of an auxiliary
            # max/min variable.
            return get_loc(Event({A: a, N: n, S: s, YT: yt}))

        def p_pred(yh, a, m, n, s):
            # Value of train.get_conditional_prob for (name = yh) given (a, m, n, s).
            return train.get_conditional_prob(Event({name: yh}),
                                              observed(a, m, n, s))

        # build linear loss function
        C_vector = np.zeros((n_vars, 1))
        for a, n, m, s in product(A.domains.get_all(), N.domains.get_all(),
                                  M.domains.get_all(), S.domains.get_all()):
            p_x_s = train.get_marginal_prob(observed(a, m, n, s))

            # Cost of keeping the prediction (YT == YH).
            p_yh_1_y = p_x_s * train.count(
                Event({'Y': 0, name: 0}), observed(a, m, n, s), 'notequal')
            C_vector[cell(a, m, n, s, 0, 0)] = p_yh_1_y * p_pred(0, a, m, n, s)
            C_vector[cell(a, m, n, s, 1, 1)] = p_yh_1_y * p_pred(1, a, m, n, s)

            # Cost of flipping the prediction (YT != YH).
            p_yh__y = p_x_s * train.count(
                Event({'Y': 0, name: 0}), observed(a, m, n, s), 'equal')
            C_vector[cell(a, m, n, s, 0, 1)] = p_yh__y * p_pred(0, a, m, n, s)
            C_vector[cell(a, m, n, s, 1, 0)] = p_yh__y * p_pred(1, a, m, n, s)

        # the inequality of max and min
        # sign=+1 rows:  sum_yh P(yh|x) * map - max_aux <= 0
        # sign=-1 rows: -sum_yh P(yh|x) * map + min_aux <= 0
        G_matrix_1 = np.zeros((2**6, n_vars))
        h_1 = np.zeros(2**6)
        row = 0
        for sign, aux_offset in ((1, max_offset), (-1, min_offset)):
            for a, n, s, yt in product(A.domains.get_all(),
                                       N.domains.get_all(),
                                       S.domains.get_all(),
                                       YT.domains.get_all()):
                for m in M.domains.get_all():
                    for yh in YH.domains.get_all():
                        G_matrix_1[row, cell(a, m, n, s, yh, yt)] = \
                            sign * p_pred(yh, a, m, n, s)
                    G_matrix_1[row, aux_offset + aux_cell(a, n, s, yt)] = -sign
                    row += 1
        assert row == 2**6

        # build counterfactual fairness constraints
        # The two passes are negatives of each other, using the max
        # auxiliaries in the first and the min auxiliaries in the second;
        # every row is bounded above by tau.
        G_matrix_2 = np.zeros((2**2 * 2, n_vars))
        h_2 = np.ones(2**2 * 2) * tau
        row = 0
        for sign, aux_offset in ((1, max_offset), (-1, min_offset)):
            for a, m in product(A.domains.get_all(), M.domains.get_all()):
                for n in N.domains.get_all():
                    G_matrix_2[row, aux_offset + aux_cell(a, n, spos, yt_pos)] = \
                        sign * find_condition_prob(
                            Event({N: n}), Event({A: a, S: spos}))

                    # Loop-invariant with respect to yh, so looked up once.
                    p_n_neg = find_condition_prob(
                        Event({N: n}), Event({A: a, S: sneg}))
                    for yh in YH.domains.get_all():
                        G_matrix_2[row, cell(a, m, n, sneg, yh, yt_pos)] = \
                            -sign * p_n_neg * p_pred(yh, a, m, n, sneg)
                row += 1
        assert row == 2**2 * 2

        # mapping in [0, 1]: x_i <= 1 (upper half) and -x_i <= 0 (lower half)
        diag = np.arange(n_vars)
        G_matrix_3 = np.zeros((n_vars * 2, n_vars))
        G_matrix_3[diag, diag] = 1
        G_matrix_3[n_vars + diag, diag] = -1
        h_3 = np.concatenate([np.ones(n_vars), np.zeros(n_vars)])

        # sum = 1: for every (a, n, m, s, yh) the probabilities over yt sum to 1
        A_matrix = np.zeros((2**6 // 2, n_vars))
        b = np.ones(2**6 // 2)
        row = 0
        for a, n, m, s, yh in product(A.domains.get_all(), N.domains.get_all(),
                                      M.domains.get_all(), S.domains.get_all(),
                                      YH.domains.get_all()):
            for yt in YT.domains.get_all():
                A_matrix[row, cell(a, m, n, s, yh, yt)] = 1
            row += 1
        assert row == 2**6 // 2

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver
        # The original passed the `solvers` module as `solver=`; that value
        # matches neither 'glpk' nor 'mosek', so the default solver was used.
        # Omitting the argument selects the same default explicitly.
        solvers.options['show_progress'] = False
        sol = solvers.lp(c=matrix(C_vector),
                         G=matrix(G_matrix),
                         h=matrix(h),
                         A=matrix(A_matrix),
                         b=matrix(b))
        if sol['x'] is None:
            raise RuntimeError(
                'LP for %s returned no solution (status: %s)' %
                (name, sol.get('status')))
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        scratch = name + '1'
        train.df[scratch] = train.df[name]
        test.df[scratch] = test.df[name]
        for a, n, m, s, yh, yt in product(A.domains.get_all(),
                                          N.domains.get_all(),
                                          M.domains.get_all(),
                                          S.domains.get_all(),
                                          YH.domains.get_all(),
                                          YT.domains.get_all()):
            # Only cells that change the label need a random re-assignment.
            if yh.name != yt.name:
                p = mapping[cell(a, m, n, s, yh, yt), 0]
                train.random_assign(
                    Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                    Event({scratch: yt}), p)
                test.random_assign(
                    Event({name: yh, 'A': a, 'M': m, 'N': n, 'S': s}),
                    Event({scratch: yt}), p)

        # DataFrame.drop returns a new frame unless inplace=True; the original
        # discarded that result, so the scratch column leaked into the saved
        # CSV files.
        train.df[name] = train.df[scratch]
        train.df.drop([scratch], axis=1, inplace=True)
        test.df[name] = test.df[scratch]
        test.df.drop([scratch], axis=1, inplace=True)
        acc.append(accuracy_score(train.df['Y'], train.df[name]))
        acc.append(accuracy_score(test.df['Y'], test.df[name]))

    acc_matrix.iloc[:, 3] = acc

    train.df.to_csv('temp/synthetic_train_prediction3.csv', index=False)
    test.df.to_csv('temp/synthetic_test_prediction3.csv', index=False)