def timer(inputfile, trials, datalength):
    '''
    Measure the average time taken to learn discrete CPD parameters.

    The network described by *inputfile* is loaded once. Then, *trials*
    times, a fresh set of *datalength* samples is drawn from it and
    ``PGMLearner.discrete_mle_estimateparams`` is run on those samples.
    Only the learning call is timed; sampling happens outside the timed
    region.

    Arguments:

        1. *inputfile* -- Path handed to both ``NodeData.load`` and ``GraphSkeleton.load``.
        2. *trials* -- Number of timed repetitions; must be at least 1.
        3. *datalength* -- Number of samples drawn for each repetition.

    The ``Vdata`` of the network learned in the last repetition is printed
    as JSON. Returns the mean duration of one learning call, in seconds.

    Raises ``ValueError`` if *trials* is smaller than 1.
    '''
    # time.clock() no longer exists on Python 3.8+; default_timer is the
    # benchmark clock that is available on every Python version.
    from timeit import default_timer

    if trials < 1:
        raise ValueError("trials must be a positive integer")

    # load nodedata and graphskeleton
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load(inputfile)
    skel.load(inputfile)

    # topologically order graphskeleton
    skel.toporder()

    # load bayesian network
    bn = DiscreteBayesianNetwork(skel, nd)

    # instantiate pgm learner
    learner = PGMLearner()

    # drop the local reference; only the network built from it is used below
    del nd

    totaltime = 0.0
    ret = None
    for _ in range(trials):
        data = bn.randomsample(datalength)
        start = default_timer()
        ret = learner.discrete_mle_estimateparams(skel, data)
        totaltime += default_timer() - start

    # single-argument print(...) behaves the same on Python 2 and Python 3
    print(json.dumps(ret.Vdata, indent=1))
    return totaltime / trials
def timer(inputfile, trials, datalength=100):
    '''
    Measure the average time taken to draw a random sample set from a
    discrete Bayesian network.

    The network described by *inputfile* is loaded once, after which
    ``randomsample(datalength)`` is called and timed *trials* times.

    Arguments:

        1. *inputfile* -- Path handed to both ``NodeData.load`` and ``GraphSkeleton.load``.
        2. *trials* -- Number of timed repetitions; must be at least 1.
        3. *datalength* -- Number of samples requested per repetition (defaults to 100).

    Returns the mean duration of one ``randomsample`` call, in seconds.

    Raises ``ValueError`` if *trials* is smaller than 1.
    '''
    # time.clock() no longer exists on Python 3.8+; default_timer is the
    # benchmark clock that is available on every Python version.
    from timeit import default_timer

    if trials < 1:
        raise ValueError("trials must be a positive integer")

    # load nodedata and graphskeleton
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load(inputfile)
    skel.load(inputfile)

    # topologically order graphskeleton
    skel.toporder()

    # load bayesian network
    bn = DiscreteBayesianNetwork(skel, nd)

    totaltime = 0.0
    for _ in range(trials):
        start = default_timer()
        bn.randomsample(datalength)
        totaltime += default_timer() - start

    return totaltime / trials
def discrete_mle_estimateparams(self, graphskeleton, data):
    '''
    Learn the conditional probability tables of a discrete Bayesian network
    whose structure is fixed by *graphskeleton*, choosing the parameters that
    maximize the likelihood of the samples in *data*.

    Arguments:

        1. *graphskeleton* -- An instance of the :doc:`GraphSkeleton <graphskeleton>` class holding the vertices and edges. Its ``toporder()`` method is called here.
        2. *data* -- A non-empty list of dicts, one per sample, in {vertex: value} format. Example::

            [
                {
                    'Grade': 'B',
                    'SAT': 'lowscore',
                    ...
                },
                ...
            ]

    For every vertex the outcomes seen in *data* are counted separately for
    each combination of its parents' outcomes, and the counts are normalized
    into a tabular conditional probability distribution. A parent combination
    that never occurs in *data* receives a uniform distribution over the
    vertex's outcomes.

    The result is a new :doc:`DiscreteBayesianNetwork <discretebayesiannetwork>`
    instance whose *V* and *E* are taken from *graphskeleton* and whose
    *Vdata* holds the estimated CPDs, in the format seen in
    :doc:`unittestdict` and described in :doc:`discretebayesiannetwork`.

    Usage example: learning parameters from a set of 200 discrete samples::

        import json

        from libpgm.nodedata import NodeData
        from libpgm.graphskeleton import GraphSkeleton
        from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
        from libpgm.pgmlearner import PGMLearner

        # generate some data to use
        nd = NodeData()
        nd.load("../tests/unittestdict.txt")    # an input file
        skel = GraphSkeleton()
        skel.load("../tests/unittestdict.txt")
        skel.toporder()
        bn = DiscreteBayesianNetwork(skel, nd)
        data = bn.randomsample(200)

        # instantiate my learner
        learner = PGMLearner()

        # estimate parameters from data and skeleton
        result = learner.discrete_mle_estimateparams(skel, data)

        # output
        print(json.dumps(result.Vdata, indent=2))

    '''
    assert isinstance(graphskeleton, GraphSkeleton), "First arg must be a loaded GraphSkeleton class."
    assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Second arg must be a list of dicts."

    # wrap the ordered skeleton in an otherwise empty network
    network = DiscreteBayesianNetwork()
    graphskeleton.toporder()
    network.V = graphskeleton.V
    network.E = graphskeleton.E
    network.Vdata = dict()
    vdata = network.Vdata
    vertices = network.V

    # one record per vertex: structure plus empty placeholders for the CPD
    for node in vertices:
        child_names = graphskeleton.getchildren(node)
        parent_names = graphskeleton.getparents(node)
        vdata[node] = {
            "children": child_names,
            "parents": parent_names,
            "vals": [],
            # parentless vertices use a flat list; the rest a dict keyed by parent outcomes
            "cprob": [] if parent_names == [] else dict(),
            "numoutcomes": 0,
        }

    # first pass over the data: collect the distinct outcomes of every vertex
    for row in data:
        for node in vertices:
            info = vdata[node]
            observed = row[node]
            if observed in info["vals"]:
                continue
            info["vals"].append(observed)
            info["numoutcomes"] += 1

    # lay out the tables with a [numerator, denominator] pair in every cell
    for node in vertices:
        info = vdata[node]
        width = info["numoutcomes"]
        if not info["parents"]:
            info["cprob"].extend([0, 0] for _ in range(width))
            continue
        # enumerate every combination of parent outcomes, first parent varying slowest
        combos = [[]]
        for parent in info["parents"]:
            combos = [
                prefix + [str(val)]
                for prefix in combos
                for val in vdata[parent]["vals"]
            ]
        for combo in combos:
            info["cprob"][str(combo)] = [[0, 0] for _ in range(width)]

    # second pass over the data: tally each sample into the matching row
    for row in data:
        for node in vertices:
            info = vdata[node]
            outcome_idx = info["vals"].index(row[node])
            if info["parents"]:
                condition = str([str(row[p]) for p in info["parents"]])
                counts = info["cprob"][condition]
            else:
                counts = info["cprob"]
            # every cell of the row shares the denominator ...
            for pair in counts:
                pair[1] += 1
            # ... while only the observed outcome gains a numerator count
            counts[outcome_idx][0] += 1

    # turn the [numerator, denominator] pairs into probabilities
    for node in vertices:
        info = vdata[node]
        if not info["parents"]:
            info["cprob"] = [num / float(den) for num, den in info["cprob"]]
            continue
        table = info["cprob"]
        for condition in list(table.keys()):
            try:
                table[condition] = [num / float(den) for num, den in table[condition]]
            except ZeroDivisionError:
                # no sample had this parent combination: fall back to a uniform row
                table[condition] = [1 / float(info["numoutcomes"]) for _ in table[condition]]

    # hand back the network carrying the estimated distributions
    return network