import glob
from os.path import join

from libpgm.pgmlearner import PGMLearner


def anomaly_libpgm():
    files = glob.glob(join('data', '*.txt'))
    for file in files[0:1]:  # only the first file for now
        print file
        # load this file's samples (helper defined elsewhere in the project)
        data = read_data_libpgm(file)
        learner = PGMLearner()
        result = learner.lg_estimatebn(data, indegree=3)
        print result.E
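read_data_libpgm is not shown in this excerpt; below is a minimal sketch of what such a loader might look like, assuming a tab-delimited file with a header row of variable names followed by numeric rows (lg_estimatebn expects a list of {variable: value} dicts). The format is an assumption, not the project's actual code.

def read_data_libpgm(path):
    # Hypothetical loader: first row is variable names, remaining rows are
    # numeric values; returns the list-of-dicts format lg_estimatebn expects.
    samples = []
    with open(path) as f:
        header = f.readline().strip().split('\t')
        for line in f:
            values = line.strip().split('\t')
            if len(values) != len(header):
                continue  # skip malformed rows
            samples.append(dict(zip(header, map(float, values))))
    return samples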
import unittest

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.pgmlearner import PGMLearner


class TestPGMLearner(unittest.TestCase):

    def setUp(self):
        # instantiate learner
        self.l = PGMLearner()

        # generate graph skeleton
        skel = GraphSkeleton()
        skel.load("unittestdict.txt")
        skel.toporder()

        # generate sample sequence to try to learn from - discrete
        nd = NodeData()
        nd.load("unittestdict.txt")
        self.samplediscbn = DiscreteBayesianNetwork(skel, nd)
        self.samplediscseq = self.samplediscbn.randomsample(5000)

        # generate sample sequence to try to learn from - linear Gaussian
        nda = NodeData()
        nda.load("unittestlgdict.txt")
        self.samplelgbn = LGBayesianNetwork(skel, nda)
        self.samplelgseq = self.samplelgbn.randomsample(10000)

        self.skel = skel

    def test_discrete_mle_estimateparams(self):
        result = self.l.discrete_mle_estimateparams(self.skel, self.samplediscseq)
        indexa = result.Vdata['SAT']['vals'].index('lowscore')
        self.assertTrue(result.Vdata['SAT']['cprob']["['low']"][indexa] < 1
                        and result.Vdata['SAT']['cprob']["['low']"][indexa] > .9)
        indexb = result.Vdata['Letter']['vals'].index('weak')
        self.assertTrue(result.Vdata['Letter']['cprob']["['A']"][indexb] < .15
                        and result.Vdata['Letter']['cprob']["['A']"][indexb] > .05)

    def test_lg_mle_estimateparams(self):
        result = self.l.lg_mle_estimateparams(self.skel, self.samplelgseq)
        self.assertTrue(result.Vdata['SAT']['mean_base'] < 15 and result.Vdata['SAT']['mean_base'] > 5)
        self.assertTrue(result.Vdata['Letter']['variance'] < 15 and result.Vdata['Letter']['variance'] > 5)

    def test_discrete_constraint_estimatestruct(self):
        result = self.l.discrete_constraint_estimatestruct(self.samplediscseq)
        self.assertTrue(["Difficulty", "Grade"] in result.E)

    def test_lg_constraint_estimatestruct(self):
        result = self.l.lg_constraint_estimatestruct(self.samplelgseq)
        self.assertTrue(["Intelligence", "Grade"] in result.E)

    def test_discrete_condind(self):
        chi, pv, witness = self.l.discrete_condind(self.samplediscseq, "Difficulty", "Letter", ["Grade"])
        self.assertTrue(pv > .05)
        self.assertEqual(witness, ["Grade"])
        chia, pva, witnessa = self.l.discrete_condind(self.samplediscseq, "Difficulty", "Intelligence", [])
        self.assertTrue(pva < .05)

    def test_discrete_estimatebn(self):
        result = self.l.discrete_estimatebn(self.samplediscseq)
        self.assertTrue(result.V)
        self.assertTrue(result.E)
        self.assertTrue(result.Vdata["Difficulty"]["cprob"][0])

    def test_lg_estimatebn(self):
        result = self.l.lg_estimatebn(self.samplelgseq)
        self.assertTrue(result.V)
        self.assertTrue(result.E)
        self.assertTrue(result.Vdata["Intelligence"]["mean_base"])
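For completeness, a conventional entry point so the suite can be run directly, assuming the unittestdict.txt and unittestlgdict.txt fixtures sit next to the test file:

if __name__ == '__main__':
    unittest.main()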
            sample[vertex] = vertexAverages[vertex]

# Testing just 4 vertices for now (takes a really, really long time to use all of them)
keysToRemove = list(vertices)[5:]
#keysToRemove.remove('HIV')
for sample in featureVectorSamples:
    for key in keysToRemove:
        del sample[key]

# instantiate learner
learner = PGMLearner()

# Voila, it makes us a bayesian network!
result = learner.lg_estimatebn(featureVectorSamples, pvalparam=0.10)

# output
print json.dumps(result.Vdata, indent=2)
print json.dumps(result.E, indent=2)

# For progress report: previous things we tried!

# Hackily removes all vertices with missing values, leaving just country name and year :P
# Instead, we should totally impute values using our linear classifier!
# commonVertices = vertices
# for sample in featureVectorSamples:
#     commonVertices2 = set([v for v in commonVertices])
#     for v in commonVertices:
#         if v not in sample.keys():
#             commonVertices2.remove(v)
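The vertexAverages dict used at the top of this excerpt is built elsewhere; here is a minimal sketch of how such per-vertex averages might be computed. The helper name and structure are assumptions for illustration, not the project's actual code.

def compute_vertex_averages(featureVectorSamples, vertices):
    # Hypothetical helper: average each vertex over the samples that have it,
    # so missing entries can be filled in before calling lg_estimatebn
    # (which needs every sample to contain every vertex).
    vertexAverages = {}
    for vertex in vertices:
        values = [s[vertex] for s in featureVectorSamples if vertex in s]
        if values:
            vertexAverages[vertex] = sum(values) / float(len(values))
    return vertexAverages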
#         if k in vertices:
#             newSample[k] = sample[k]
#     condensed_feature_vectors.append(newSample)

################################################

# import pprint
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(condensed_feature_vectors)

# instantiate learner
learner = PGMLearner()

# Voila, it makes us a bayesian network!
bayesian_networks_by_region = {}
for region in condensed_feature_vectors_by_region:
    bayesian_networks_by_region[region] = learner.lg_estimatebn(condensed_feature_vectors_by_region[region])
    print region
    print json.dumps(bayesian_networks_by_region[region].Vdata, indent=2)
    print json.dumps(bayesian_networks_by_region[region].E, indent=2)

# Evaluation:
predictions = []
test_arrs_by_region = {}
hiv_test_arrs_by_region = {}
for i, sample in enumerate(test_arr):
    region = getRegion(sample['Country'])
    if not region:
        break
    if region not in test_arrs_by_region:
        test_arrs_by_region[region] = []
        hiv_test_arrs_by_region[region] = []
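A small sketch of how the learned per-region networks might be inspected, assuming the mean_base / mean_scal / variance keys libpgm uses for linear-Gaussian nodes (as seen in the tests above); summarize_lg_network is a hypothetical helper, not part of the project.

def summarize_lg_network(bn):
    # print the learned edges and the linear-Gaussian parameters per node:
    # child = mean_base + mean_scal . parent_values + Gaussian noise (variance)
    for parent, child in bn.E:
        print '%s -> %s' % (parent, child)
    for node in bn.V:
        nd = bn.Vdata[node]
        print node, 'mean_base =', nd['mean_base'], 'mean_scal =', nd['mean_scal'], 'variance =', nd['variance']

for region, regional_bn in bayesian_networks_by_region.items():
    print region
    summarize_lg_network(regional_bn)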
result = learner.lg_constraint_estimatestruct(data)

# output - toggle comment to see
#print json.dumps(result.E, indent=2)

# (12) -----------------------------------------------------------------------
# Learn entire Bayesian networks

# say I have some data
data = lgbn.randomsample(8000)

# instantiate my learner
learner = PGMLearner()

# estimate structure and parameters together
result = learner.lg_estimatebn(data)

# output - toggle comment to see
#print json.dumps(result.E, indent=2)
#print json.dumps(result.Vdata, indent=2)

# say I have some data
data = bn.randomsample(2000)

# instantiate my learner
learner = PGMLearner()

# estimate structure and parameters together
result = learner.discrete_estimatebn(data)

# output - toggle comment to see
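One possible sanity check after discrete_estimatebn, assuming bn (the original DiscreteBayesianNetwork sampled from above) still carries its edge list in bn.E: count how many of the true edges the structure search recovered.

# sanity check: how many true edges did structure learning recover?
true_edges = [list(e) for e in bn.E]
learned_edges = [list(e) for e in result.E]
recovered = [e for e in true_edges if e in learned_edges]
print '%d of %d true edges recovered' % (len(recovered), len(true_edges))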