def createData(): nd = NodeData() skel = GraphSkeleton() fpath = "job_interview.txt" nd.load(fpath) skel.load(fpath) skel.toporder() bn = DiscreteBayesianNetwork(skel, nd) learner = PGMLearner() data = bn.randomsample(1000) X, Y = 'Grades', 'Offer' c,p,w=learner.discrete_condind(data, X, Y, ['Interview']) print "independence between X and Y: ", c, " p-value ", p, " witness node: ", w result = learner.discrete_constraint_estimatestruct(data) print result.E
class TestPGMLearner(unittest.TestCase): def setUp(self): # instantiate learner self.l = PGMLearner() # generate graph skeleton skel = GraphSkeleton() skel.load("unittestdict.txt") skel.toporder() # generate sample sequence to try to learn from - discrete nd = NodeData() nd.load("unittestdict.txt") self.samplediscbn = DiscreteBayesianNetwork(skel, nd) self.samplediscseq = self.samplediscbn.randomsample(5000) # generate sample sequence to try to learn from - discrete nda = NodeData() nda.load("unittestlgdict.txt") self.samplelgbn = LGBayesianNetwork(skel, nda) self.samplelgseq = self.samplelgbn.randomsample(10000) self.skel = skel def test_discrete_mle_estimateparams(self): result = self.l.discrete_mle_estimateparams(self.skel, self.samplediscseq) indexa = result.Vdata['SAT']['vals'].index('lowscore') self.assertTrue(result.Vdata['SAT']['cprob']["['low']"][indexa] < 1 and result.Vdata['SAT']['cprob']["['low']"][indexa] > .9) indexb = result.Vdata['Letter']['vals'].index('weak') self.assertTrue(result.Vdata['Letter']['cprob']["['A']"][indexb] < .15 and result.Vdata['Letter']['cprob']["['A']"][indexb] > .05) def test_lg_mle_estimateparams(self): result = self.l.lg_mle_estimateparams(self.skel, self.samplelgseq) self.assertTrue(result.Vdata['SAT']['mean_base'] < 15 and result.Vdata['SAT']['mean_base'] > 5) self.assertTrue(result.Vdata['Letter']['variance'] < 15 and result.Vdata['Letter']['variance'] > 5) def test_discrete_constraint_estimatestruct(self): result = self.l.discrete_constraint_estimatestruct(self.samplediscseq) self.assertTrue(["Difficulty", "Grade"] in result.E) def test_lg_constraint_estimatestruct(self): result = self.l.lg_constraint_estimatestruct(self.samplelgseq) self.assertTrue(["Intelligence", "Grade"] in result.E) def test_discrete_condind(self): chi, pv, witness = self.l.discrete_condind(self.samplediscseq, "Difficulty", "Letter", ["Grade"]) self.assertTrue(pv > .05) self.assertTrue(witness, ["Grade"]) chia, pva, witnessa = self.l.discrete_condind(self.samplediscseq, "Difficulty", "Intelligence", []) self.assertTrue(pva < .05) def test_discrete_estimatebn(self): result = self.l.discrete_estimatebn(self.samplediscseq) self.assertTrue(result.V) self.assertTrue(result.E) self.assertTrue(result.Vdata["Difficulty"]["cprob"][0]) def test_lg_estimatebn(self): result = self.l.lg_estimatebn(self.samplelgseq) self.assertTrue(result.V) self.assertTrue(result.E) self.assertTrue(result.Vdata["Intelligence"]["mean_base"])
result_full_bn = learner_full.discrete_estimatebn(training_data) #result_full_bn.E # In[ ]: # We can also manually test and verify how independent two varaibles are # In[ ]: learner_indep = PGMLearner() learner_indep.discrete_condind(training_data,'Surv', 'Fare', ['Class']) # In this case, the result is chi, pval et variable U learner_indep = PGMLearner() print("Chi, pval, U: {}{}".format(learner_indep.discrete_condind(training_data,'Surv', 'Fare', ['Class']), "(Ho can't be rejected since Surv and Fare are cond independent)")) print("Chi, pval, U: {}{}".format(learner_indep.discrete_condind(training_data,'Surv', 'Class', ['Fare']), "(Ho is rejected: Surv and Class are not indep)")) print("Chi, pval, U: {}{}".format(learner_indep.discrete_condind(training_data,'Sex', 'Class', ['Surv']), "(Ho is rejected: Sex and Class are not indep)")) print("Chi, pval, U: {}".format(learner_indep.discrete_condind(training_data,'Fare', 'Class', ['Sex']))) print("Chi, pval, U: {}".format(learner_indep.discrete_condind(training_data,'Fare', 'Sex', ['Sex']))) chi – The result of the chi-squared test on the data (compare the actual and the expected distri of X and Y given U). The expected distribution is P(X,Y,U)=P(U)P(X|U)P(Y|U). The Chi squared is a measure of the deviance between two distributions. pval – The p-value of the test, meaning the probability of attaining a chi-square result as extreme as or more extreme than the one found, assuming that the null hypothesis is true. (e.g., a p-value of .05 means that if X and Y were independent given U, the chance of getting a chi-squared result this high or higher are .05). The Null H (independence) is rejected if the p-value is smaller than 0.05.