def test_joint_variables__unequal_numbers_of_instances():
    """JointVariables must raise when its members differ in instance count.

    Covers both plain Variables of unequal length and an existing
    JointVariables combined with a shorter Variable.
    """

    def labelled(instances, ID, name):
        # Build a Variable and attach the ID and name used by this test.
        variable = Variable(instances)
        variable.ID = ID
        variable.name = name
        return variable

    # Six instances each.
    animals = labelled(['cat', 'dog', 'cat', 'mouse', 'dog', 'cat'], 3, 'animals')
    colors = labelled(['gray', 'yellow', 'brown', 'silver', 'white', 'gray'], 2, 'colors')

    # Only five instances, so joining with the six-instance variables fails.
    short_sizes = labelled(['small', 'small', 'large', 'small', 'normal'], 1, 'sizes')
    with pytest.raises(VariableInstancesOfUnequalCount):
        JointVariables(animals, colors, short_sizes)

    # With six instances the same join is accepted.
    sizes = labelled(['small', 'small', 'large', 'small', 'normal', 'small'], 1, 'sizes')
    fauna = JointVariables(animals, colors, sizes)

    # A five-instance variable cannot be joined with the six-instance joint.
    can_fly = labelled([False] * 5, 4, 'can_fly')
    with pytest.raises(VariableInstancesOfUnequalCount):
        JointVariables(fauna, can_fly)
def make_pmfs_from_datasetmatrix(
    self,
    X: int,
    Y: int,
    Zl: list[int],
) -> tuple[CPMF, CPMF, CPMF, PMF]:
    """Build the distributions Pr(X,Y|Z), Pr(X|Z), Pr(Y|Z) and Pr(Z).

    The variables come from ``self.load_variables(X, Y, Zl)``. When ``Zl``
    is empty, the unconditioned PMFs are wrapped in ``OmegaCPMF`` and the
    conditioning distribution is an ``OmegaPMF``. Otherwise the joint PMFs
    that include Z are conditioned on ``PMF(VarZ)``.

    Returns:
        The tuple ``(PrXYcZ, PrXcZ, PrYcZ, PrZ)``.
    """
    PrZ: PMF
    PrXcZ: CPMF
    PrYcZ: CPMF
    PrXYcZ: CPMF

    (VarX, VarY, VarZ) = self.load_variables(X, Y, Zl)

    if len(Zl) == 0:
        # Empty conditioning set: wrap the plain PMFs.
        joint_xy = PMF(JointVariables(VarX, VarY))
        marginal_x = PMF(VarX)
        marginal_y = PMF(VarY)
        PrZ = OmegaPMF()
        PrXYcZ = OmegaCPMF(joint_xy)
        PrXcZ = OmegaCPMF(marginal_x)
        PrYcZ = OmegaCPMF(marginal_y)
        return (PrXYcZ, PrXcZ, PrYcZ, PrZ)

    # Non-empty conditioning set: build joints with Z, then condition on Z.
    joint_xyz = PMF(JointVariables(VarX, VarY, VarZ))
    joint_xz = PMF(JointVariables(VarX, VarZ))
    joint_yz = PMF(JointVariables(VarY, VarZ))
    PrZ = PMF(VarZ)
    PrXcZ = joint_xz.condition_on(PrZ)
    PrYcZ = joint_yz.condition_on(PrZ)
    PrXYcZ = joint_xyz.condition_on(PrZ)
    return (PrXYcZ, PrXcZ, PrYcZ, PrZ)
def test_conditional_pmf__multiple_values():
    """Check a CPMF whose conditioned and conditioning sides are both joint.

    First compares Pr(colors, is_pet | sizes, animals) with hand-computed
    values, then checks that the conditional probabilities weighted by
    Pr(sizes, animals) add up to 1.
    """

    def labelled(instances, ID, name):
        # Build a Variable and attach the ID and name used by this test.
        variable = Variable(instances)
        variable.ID = ID
        variable.name = name
        return variable

    sizes = labelled(['small', 'small', 'large', 'small', 'normal', 'small'], 1, 'sizes')
    colors = labelled(['gray', 'yellow', 'brown', 'silver', 'white', 'gray'], 2, 'colors')
    animals = labelled(['cat', 'dog', 'cat', 'snake', 'dog', 'cat'], 3, 'animals')
    is_pet = labelled(['yes', 'yes', 'yes', 'maybe', 'yes', 'yes'], 4, 'is_pet')

    Pr = CPMF(JointVariables(colors, is_pet), JointVariables(sizes, animals))

    # (given sizes/animals, colors/is_pet outcome, expected probability)
    expectations = [
        (('small', 'cat'), ('gray', 'yes'), 2 / 2),
        (('small', 'cat'), ('yellow', 'yes'), 0 / 1),
        (('small', 'cat'), ('brown', 'maybe'), 0 / 1),
        (('small', 'dog'), ('yellow', 'yes'), 1 / 1),
        (('small', 'dog'), ('yellow', 'maybe'), 0 / 1),
        (('small', 'dog'), ('silver', 'maybe'), 0 / 1),
        (('large', 'cat'), ('brown', 'yes'), 1 / 1),
        (('large', 'cat'), ('yellow', 'yes'), 0 / 1),
        (('small', 'snake'), ('silver', 'maybe'), 1 / 1),
        (('small', 'snake'), ('silver', 'no'), 0 / 1),
        (('normal', 'dog'), ('white', 'yes'), 1 / 1),
        (('normal', 'dog'), ('silver', 'yes'), 0 / 1),
        (('normal', 'dog'), ('yellow', 'maybe'), 0 / 1),
    ]
    for (condition, outcome, expected) in expectations:
        assert Pr.given(*condition).p(*outcome) == expected

    SA = JointVariables(sizes, animals)
    PrAll = CPMF(JointVariables(colors, is_pet), SA)
    PrSA = PMF(SA)
    PrCcSA = CPMF(colors, SA)
    PrIPcSA = CPMF(is_pet, SA)

    total_all = 0.0
    total_colors = 0.0
    total_is_pet = 0.0

    # NOTE(review): total_is_pet is accumulated inside the colors loop too.
    # That presumably totals 1 only because each (size, animal) pair in this
    # fixture occurs with a single color — confirm this is intended.
    for (sa, _psa) in PrSA.items():
        for (c, p_color) in PrCcSA.given(sa).items():
            total_colors += p_color * PrSA.p(sa)
            for (ip, p_is_pet) in PrIPcSA.given(sa).items():
                p_joint = PrAll.given(sa).p(c, ip)
                total_all += p_joint * PrSA.p(sa)
                total_is_pet += p_is_pet * PrSA.p(sa)

    assert almostEqual(1, total_all)
    assert almostEqual(1, total_colors)
    assert almostEqual(1, total_is_pet)
def test_conditional_pmf__binary():
    """Check CPMF values for binary variables against hand-counted ratios.

    Each table row is ``(conditioning values, outcome, expected probability)``.
    The expected value is written as count / size of the conditioning subset.
    """
    V0 = Variable([0, 1, 0, 1, 0, 1, 0, 1])
    V1 = Variable([0, 0, 1, 1, 0, 0, 1, 1])
    V2 = Variable([0, 0, 0, 0, 1, 1, 1, 1])
    V78 = Variable([0, 0, 0, 0, 0, 0, 1, 1])

    def check(cpmf, table):
        # Query the CPMF once per row, in table order.
        for (condition, outcome, expected) in table:
            assert cpmf.given(*condition).p(outcome) == expected

    check(CPMF(V0, V78), [
        ((0,), 0, 3 / 6),
        ((0,), 1, 3 / 6),
        ((1,), 0, 1 / 2),
        ((1,), 1, 1 / 2),
    ])

    check(CPMF(V2, V78), [
        ((0,), 0, 4 / 6),
        ((0,), 1, 2 / 6),
        ((1,), 0, 0 / 2),
        ((1,), 1, 2 / 2),
    ])

    check(CPMF(V78, V1), [
        ((0,), 0, 4 / 4),
        ((0,), 1, 0 / 4),
        ((1,), 0, 2 / 4),
        ((1,), 1, 2 / 4),
    ])

    # Conditioning on a joint variable. The combination (V2=0, V78=1) never
    # occurs in the data and is expected to yield probability 0.
    check(CPMF(V1, JointVariables(V2, V78)), [
        ((0, 0), 0, 2 / 4),
        ((0, 0), 1, 2 / 4),
        ((0, 1), 0, 0 / 1),
        ((0, 1), 1, 0 / 1),
        ((1, 0), 0, 2 / 2),
        ((1, 0), 1, 0 / 2),
        ((1, 1), 0, 0 / 2),
        ((1, 1), 1, 2 / 2),
    ])
def test_joint_variables_pmf():
    """Check JointVariables bookkeeping and the PMF built on top of it."""

    def labelled(instances, ID, name):
        # Build a Variable and attach the ID and name used by this test.
        variable = Variable(instances)
        variable.ID = ID
        variable.name = name
        return variable

    animals = labelled(['cat', 'dog', 'cat', 'mouse', 'dog', 'cat'], 3, 'animals')
    colors = labelled(['gray', 'yellow', 'brown', 'silver', 'white', 'gray'], 2, 'colors')
    sizes = labelled(['small', 'small', 'large', 'small', 'normal', 'small'], 1, 'sizes')

    fauna = JointVariables(sizes, colors, animals)
    fauna.update_values()

    # Members are expected in the order they were passed in.
    assert [1, 2, 3] == fauna.variableIDs
    for (position, member) in enumerate((sizes, colors, animals)):
        assert fauna.variables[position] is member

    assert fauna.values == [
        ('large', 'brown', 'cat'),
        ('normal', 'white', 'dog'),
        ('small', 'gray', 'cat'),
        ('small', 'silver', 'mouse'),
        ('small', 'yellow', 'dog'),
    ]

    PrFauna = PMF(fauna)
    probabilities = [
        (('small', 'gray', 'cat'), 2 / 6),
        (('small', 'silver', 'mouse'), 1 / 6),
        (('small', 'silver', 'dog'), 0),  # combination absent from the data
    ]
    for (outcome, expected) in probabilities:
        assert PrFauna.p(*outcome) == expected

    # A joint of one variable exposes that variable's instances unchanged.
    singleton_joint = JointVariables(animals)
    assert ['cat', 'dog', 'cat', 'mouse', 'dog', 'cat'] == singleton_joint.instances()
def test_G_value__lungcancer(ds_lungcancer_4e4):
    """Run ``assertCITestAccurate`` on the unoptimized G-test for lungcancer.

    The ``ds_lungcancer_4e4`` fixture supplies the dataset matrix, the Omega
    variable and the source Bayesian network. Variable pairs are checked
    under Omega, under single-variable and under joint conditioning sets.
    """
    Omega = ds_lungcancer_4e4.omega
    lungcancer = ds_lungcancer_4e4.datasetmatrix
    bn = ds_lungcancer_4e4.bayesiannetwork

    # Columns 0..7 of the 'X' matrix, in this order.
    (ASIA, BRONC, DYSP, EITHER, LUNG, SMOKE, TUB, XRAY) = (
        lungcancer.get_variable('X', column) for column in range(8)
    )

    parameters = {
        'ci_test_significance': 0.95,
        'ci_test_debug': 0,
        'omega': Omega,
        'source_bayesian_network': bn,
        'ci_test_dof_calculator_class': mbtk.math.DoFCalculators.StructuralDoF,
    }

    G_test = mbtk.math.G_test__unoptimized.G_test(lungcancer, parameters)

    # ASIA against other variables, conditioned on Omega.
    for other in (SMOKE, LUNG, BRONC, TUB, EITHER, XRAY):
        assertCITestAccurate(G_test, ASIA, other, Omega)

    # EITHER conditioned on (TUB, LUNG); a fresh joint is built per call.
    for other in (ASIA, SMOKE):
        assertCITestAccurate(G_test, EITHER, other, JointVariables(TUB, LUNG))

    # DYSP conditioned on (EITHER, BRONC); a fresh joint is built per call.
    for other in (SMOKE, LUNG, TUB):
        assertCITestAccurate(G_test, DYSP, other, JointVariables(EITHER, BRONC))

    # XRAY conditioned on EITHER.
    for other in (TUB, LUNG, ASIA, SMOKE, DYSP, BRONC):
        assertCITestAccurate(G_test, XRAY, other, EITHER)

    # XRAY conditioned on Omega.
    for other in (EITHER, LUNG, SMOKE, TUB):
        assertCITestAccurate(G_test, XRAY, other, Omega)
def calculate_pmf_for_cmi(
    X: Variable,
    Y: Variable,
    Z: Union[Variable, JointVariables],
) -> tuple[CPMF, CPMF, CPMF, PMF]:
    """Build the distributions of X and Y conditioned on Z, plus Pr(Z).

    Presumably "cmi" stands for conditional mutual information; this
    function only constructs the distributions.

    Returns:
        The tuple ``(Pr(X,Y|Z), Pr(X|Z), Pr(Y|Z), Pr(Z))``.
    """
    # Tuple elements are evaluated left to right.
    return (
        CPMF(JointVariables(X, Y), Z),
        CPMF(X, Z),
        CPMF(Y, Z),
        PMF(Z),
    )
def test_make_cpmf_PrXcZ_variant_1() -> None:
    """Check IDs and probabilities of a PMF over two joined binary variables."""
    V0 = Variable([0, 1, 1, 1, 0, 1, 0, 1])
    V1 = Variable([0, 0, 1, 1, 0, 1, 1, 1])

    PrXZ = PMF(JointVariables(V0, V1))

    # IDs(...) with arguments assigns the IDs; IDs() reads them back.
    PrXZ.IDs(1000, 1111)
    assert PrXZ.IDs() == (1000, 1111)

    # Expected probability of each (V0, V1) pair: occurrences out of 8.
    expected = {
        (0, 0): 2 / 8,
        (0, 1): 1 / 8,
        (1, 0): 1 / 8,
        (1, 1): 4 / 8,
    }
    for (outcome, probability) in expected.items():
        assert PrXZ.p(outcome) == probability
def test_pmf_summing_over_variable():
    """Check PMF.sum_over by removing variables one at a time.

    After each call the probabilities must still add up to 1, the remaining
    outcomes must carry the merged mass, and the summed-out ID must be gone
    from ``IDs()``.
    """
    V0 = Variable([0, 1, 1, 1, 0, 1, 0, 1])
    V1 = Variable([0, 0, 1, 1, 0, 1, 1, 1])
    V2 = Variable([0, 0, 0, 0, 1, 0, 1, 1])
    V3 = Variable([0, 0, 0, 0, 0, 0, 1, 1])

    V0.ID = 1000
    V1.ID = 1111
    V2.ID = 1222
    V3.ID = 1333

    def check_probabilities(pmf, expected):
        # One lookup per expected (outcome, probability) pair, in order.
        for (outcome, probability) in expected:
            assert pmf.p(outcome) == probability

    Pr = PMF(JointVariables(V0, V1, V2, V3))
    assert Pr.IDs() == (1000, 1111, 1222, 1333)
    check_probabilities(Pr, [
        ((0, 0, 0, 0), 1 / 8),
        ((1, 0, 0, 0), 1 / 8),
        ((1, 1, 0, 0), 3 / 8),
        ((0, 0, 1, 0), 1 / 8),
        ((0, 1, 1, 1), 1 / 8),
        ((1, 1, 1, 1), 1 / 8),
    ])

    # Sum out V2; outcomes are now (V0, V1, V3).
    Pr = Pr.sum_over(V2.ID)
    assert sum(Pr.probabilities.values()) == 1
    check_probabilities(Pr, [
        ((0, 0, 0), 2 / 8),
        ((1, 0, 0), 1 / 8),
        ((1, 1, 0), 3 / 8),
        ((0, 1, 1), 1 / 8),
        ((1, 1, 1), 1 / 8),
    ])
    assert Pr.IDs() == (V0.ID, V1.ID, V3.ID)

    # Sum out V1; outcomes are now (V0, V3).
    Pr = Pr.sum_over(V1.ID)
    assert sum(Pr.probabilities.values()) == 1
    check_probabilities(Pr, [
        ((0, 0), 2 / 8),
        ((1, 0), 4 / 8),
        ((0, 1), 1 / 8),
        ((1, 1), 1 / 8),
    ])
    assert Pr.IDs() == (V0.ID, V3.ID)

    # Sum out V0; only V3 remains and outcomes are queried as bare values.
    Pr = Pr.sum_over(V0.ID)
    assert sum(Pr.probabilities.values()) == 1
    print(Pr.probabilities)
    check_probabilities(Pr, [
        (0, 6 / 8),
        (1, 2 / 8),
    ])
    assert Pr.IDs() == (V3.ID,)
def get_variables(self, matrix_label, columns):
    """Return the variable(s) for ``columns`` of the matrix ``matrix_label``.

    Args:
        matrix_label: Passed through to ``self.get_variable``.
        columns: ``None``, a single ``int`` column, or an iterable of columns.

    Returns:
        ``None`` when ``columns`` is ``None``. The result of
        ``self.get_variable`` when ``columns`` is an ``int`` or a list with
        exactly one element. Otherwise a ``JointVariables`` built from the
        variable of every column, in iteration order.
    """
    if columns is None:
        return None

    # A bare int or a one-element list selects a single column.
    if isinstance(columns, int):
        return self.get_variable(matrix_label, columns)
    if isinstance(columns, list) and len(columns) == 1:
        return self.get_variable(matrix_label, columns[0])

    members = [self.get_variable(matrix_label, column) for column in columns]
    return JointVariables(*members)
def G_test_conditionally_independent(
    self,
    X: int,
    Y: int,
    Z: list[int],
) -> CITestResult:
    """Run the G-test for X and Y given the conditioning set Z.

    Builds the distributions from ``self.load_variables(X, Y, Z)``, hands
    the DoF calculator the context it asks for, and computes the degrees of
    freedom. If ``self.sufficient_samples(DoF)`` is false, the result is
    marked as having insufficient samples and returned without a statistic.
    Otherwise the G statistic is evaluated with ``chi2.cdf`` and the pair is
    reported independent when that value is below ``self.significance``.

    Returns:
        A populated ``CITestResult`` whose index is
        ``self.ci_test_counter + 1``.
    """
    (VarX, VarY, VarZ) = self.load_variables(X, Y, Z)

    result = CITestResult()
    result.start_timing()

    dof_calculator = self.DoF_calculator

    PrZ: PMF
    PrXcZ: CPMF
    PrYcZ: CPMF
    PrXYcZ: CPMF

    if len(Z) == 0:
        # Empty conditioning set: wrap the unconditioned PMFs.
        PrXY = PMF(JointVariables(VarX, VarY))
        PrX = PMF(VarX)
        PrY = PMF(VarY)
        PrZ = OmegaPMF()
        PrXYcZ = OmegaCPMF(PrXY)
        PrXcZ = OmegaCPMF(PrX)
        PrYcZ = OmegaCPMF(PrY)

        if dof_calculator.requires_pmfs:
            dof_calculator.set_context_pmfs(PrXY, PrX, PrY, None)
    else:
        # Build the joints that include Z, then condition each on Pr(Z).
        PrXYZ = PMF(JointVariables(VarX, VarY, VarZ))
        PrXZ = PMF(JointVariables(VarX, VarZ))
        PrYZ = PMF(JointVariables(VarY, VarZ))
        PrZ = PMF(VarZ)
        PrXcZ = PrXZ.condition_on(PrZ)
        PrYcZ = PrYZ.condition_on(PrZ)
        PrXYcZ = PrXYZ.condition_on(PrZ)

        if dof_calculator.requires_pmfs:
            dof_calculator.set_context_pmfs(PrXYZ, PrXZ, PrYZ, PrZ)

    dof_calculator.set_context_variables(X, Y, Z)

    if dof_calculator.requires_cpmfs:
        dof_calculator.set_context_cpmfs(PrXYcZ, PrXcZ, PrYcZ, PrZ)

    dof = dof_calculator.calculate_DoF(X, Y, Z)

    if not self.sufficient_samples(dof):
        # Too few samples for this DoF: report that and skip the statistic.
        result.end_timing()
        result.index = self.ci_test_counter + 1
        result.set_insufficient_samples()
        result.set_variables(VarX, VarY, VarZ)
        result.extra_info = ' DoF {}'.format(dof)
        return result

    g_statistic = self.G_value(PrXYcZ, PrXcZ, PrYcZ, PrZ)
    cdf_value = chi2.cdf(g_statistic, dof)

    # bool() keeps a plain Python bool even if the comparison yields a
    # NumPy boolean.
    independent = bool(cdf_value < self.significance)

    result.end_timing()
    result.index = self.ci_test_counter + 1
    result.set_independent(independent, self.significance)
    result.set_variables(VarX, VarY, VarZ)
    result.set_statistic('G', g_statistic, {})
    result.set_distribution('chi2', cdf_value, {'DoF': dof})
    result.extra_info = ' DoF {}'.format(dof)

    return result
def calculate_pmf_for_mi(X: Variable, Y: Variable) -> tuple[PMF, PMF, PMF]:
    """Build the joint PMF of X and Y together with both marginals.

    Presumably "mi" stands for mutual information; this function only
    constructs the distributions.

    Returns:
        The tuple ``(Pr(X,Y), Pr(X), Pr(Y))``.
    """
    # Tuple elements are evaluated left to right.
    return (
        PMF(JointVariables(X, Y)),
        PMF(X),
        PMF(Y),
    )
def test_conditional_pmf__from_bayesian_network():
    """Compare PMFs/CPMFs estimated from samples with the source network.

    Samples the ``survey.bif`` network into a dataset matrix, builds a PMF
    or CPMF for each variable from the sampled columns, and checks each one
    against the matching node's ``probdist`` in the Bayesian network.
    """
    configuration = {
        'sourcepath': testutil.bif_folder / 'survey.bif',
        'sample_count': int(4e4),
        # Using a random seed of 42 somehow requires 2e6 samples to pass, but
        # with the seed 1984, it is sufficient to generate only 4e4. Maybe the
        # random generator is biased somehow?
        'random_seed': 1984,
        'values_as_indices': False,
        'objectives': ['R', 'TRN'],
    }

    bayesian_network = BayesianNetwork.from_bif_file(
        configuration['sourcepath'], use_cache=False)
    bayesian_network.finalize()

    sbnds = SampledBayesianNetworkDatasetSource(configuration)
    sbnds.reset_random_seed = True
    datasetmatrix = sbnds.create_dataset_matrix('test_sbnds')

    assert ['AGE', 'EDU', 'OCC', 'SEX'] == datasetmatrix.column_labels_X
    assert ['R', 'TRN'] == datasetmatrix.column_labels_Y

    def sampled(matrix_label, column_label):
        # Wrap one sampled column of the dataset matrix in a Variable.
        return Variable(datasetmatrix.get_column_by_label(matrix_label, column_label))

    def reference(node_name):
        # Distribution stored for this node in the source network.
        return bayesian_network.variable_nodes[node_name].probdist

    # AGE and SEX are compared as unconditional PMFs.
    AGE = sampled('X', 'AGE')
    PrAge = PMF(AGE)
    SEX = sampled('X', 'SEX')
    PrSex = PMF(SEX)
    assert_PMF_AlmostEquals_BNProbDist(reference('AGE'), PrAge)
    assert_PMF_AlmostEquals_BNProbDist(reference('SEX'), PrSex)

    # The remaining variables are compared as CPMFs given other variables.
    EDU = sampled('X', 'EDU')
    PrEdu = CPMF(EDU, given=JointVariables(AGE, SEX))
    assert_CPMF_AlmostEquals_BNProbDist(reference('EDU'), PrEdu)

    OCC = sampled('X', 'OCC')
    PrOcc = CPMF(OCC, given=EDU)
    assert_CPMF_AlmostEquals_BNProbDist(reference('OCC'), PrOcc)

    R = sampled('Y', 'R')
    PrR = CPMF(R, given=EDU)
    assert_CPMF_AlmostEquals_BNProbDist(reference('R'), PrR)

    TRN = sampled('Y', 'TRN')
    PrTRN = CPMF(TRN, given=JointVariables(OCC, R))
    assert_CPMF_AlmostEquals_BNProbDist(reference('TRN'), PrTRN)