def bayesNetDiscrete(textFile, quant_no, unique): cleanText(textFile, 'tempOutput.txt') ## imports textFile into pandas try: df = pd.read_csv('tempOutput.txt', sep='\s+', dtype='float64', header=None) except: print 'next file' return df.fillna(0, inplace=True) df.convert_objects(convert_numeric=True) ## set to either setUnique() or setMax() if unique is True: grouped = setUnique(df) else: grouped = setMax(df) ## quantiles is qcut(), fixed width divisions is cut grouped = quantize(quant_no, grouped) #turns into correct dictionary format for libpgm newDict = DFtoLibpgm(grouped) # instantiate my learner learner = PGMLearner() # estimate structure try: result = learner.discrete_estimatebn(newDict) except: print 'error' #result = learner.discrete_estimatebn([dict([('a',1),('b',2)])]) return # output return result
def bayesNetDiscrete(textFile,quant_no,unique): cleanText(textFile,'tempOutput.txt') ## imports textFile into pandas try: df = pd.read_csv('tempOutput.txt', sep='\s+',dtype='float64',header=None) except: print 'next file' return df.fillna(0, inplace=True) df.convert_objects(convert_numeric=True) ## set to either setUnique() or setMax() if unique is True: grouped = setUnique(df) else: grouped = setMax(df) ## quantiles is qcut(), fixed width divisions is cut grouped = quantize(quant_no,grouped) #turns into correct dictionary format for libpgm newDict = DFtoLibpgm(grouped) # instantiate my learner learner = PGMLearner() # estimate structure try: result = learner.discrete_estimatebn(newDict) except: print 'error' #result = learner.discrete_estimatebn([dict([('a',1),('b',2)])]) return # output return result
def main(): # filename features_file = './../data/features.csv' # read data into list handwriting_features = postmaster.readCSVIntoListAsDict(features_file) # learn structure # instantiate learner learner = PGMLearner() pvalue = 0.25 indegree = 1 # estimate structure #result = learner.discrete_constraint_estimatestruct( # handwriting_features, pvalue, indegree) result = learner.discrete_estimatebn(handwriting_features) #result = learner.discrete_condind(handwriting_features, 'f1', 'f2', # ['f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']) # output #print result.chi, result.pval, result.U #print json.dumps(result.E, indent=2) print json.dumps(result.Vdata, indent=2)
class TestPGMLearner(unittest.TestCase):
    """Unit tests for PGMLearner's parameter, structure and BN estimation.

    Uses random samples drawn from known discrete and linear-Gaussian
    networks (loaded from unittestdict.txt / unittestlgdict.txt) and checks
    that the learned quantities fall in the expected ranges.
    """

    def setUp(self):
        # instantiate learner
        self.l = PGMLearner()

        # generate graph skeleton shared by both sample networks
        skel = GraphSkeleton()
        skel.load("unittestdict.txt")
        skel.toporder()

        # generate sample sequence to try to learn from - discrete
        nd = NodeData()
        nd.load("unittestdict.txt")
        self.samplediscbn = DiscreteBayesianNetwork(skel, nd)
        self.samplediscseq = self.samplediscbn.randomsample(5000)

        # generate sample sequence to try to learn from - linear Gaussian
        # (comment fixed: the original said "discrete" twice)
        nda = NodeData()
        nda.load("unittestlgdict.txt")
        self.samplelgbn = LGBayesianNetwork(skel, nda)
        self.samplelgseq = self.samplelgbn.randomsample(10000)

        self.skel = skel

    def test_discrete_mle_estimateparams(self):
        result = self.l.discrete_mle_estimateparams(self.skel, self.samplediscseq)
        # P(SAT=lowscore | Intelligence=low) should be close to 1
        indexa = result.Vdata['SAT']['vals'].index('lowscore')
        self.assertTrue(result.Vdata['SAT']['cprob']["['low']"][indexa] < 1
                        and result.Vdata['SAT']['cprob']["['low']"][indexa] > .9)
        # P(Letter=weak | Grade=A) should be around .1
        indexb = result.Vdata['Letter']['vals'].index('weak')
        self.assertTrue(result.Vdata['Letter']['cprob']["['A']"][indexb] < .15
                        and result.Vdata['Letter']['cprob']["['A']"][indexb] > .05)

    def test_lg_mle_estimateparams(self):
        result = self.l.lg_mle_estimateparams(self.skel, self.samplelgseq)
        self.assertTrue(result.Vdata['SAT']['mean_base'] < 15
                        and result.Vdata['SAT']['mean_base'] > 5)
        self.assertTrue(result.Vdata['Letter']['variance'] < 15
                        and result.Vdata['Letter']['variance'] > 5)

    def test_discrete_constraint_estimatestruct(self):
        result = self.l.discrete_constraint_estimatestruct(self.samplediscseq)
        self.assertTrue(["Difficulty", "Grade"] in result.E)

    def test_lg_constraint_estimatestruct(self):
        result = self.l.lg_constraint_estimatestruct(self.samplelgseq)
        self.assertTrue(["Intelligence", "Grade"] in result.E)

    def test_discrete_condind(self):
        # Difficulty and Letter are independent given Grade
        chi, pv, witness = self.l.discrete_condind(
            self.samplediscseq, "Difficulty", "Letter", ["Grade"])
        self.assertTrue(pv > .05)
        # BUG FIX: the original wrote assertTrue(witness, ["Grade"]), which
        # passes ["Grade"] as the failure *message* and never compares;
        # assertEqual performs the intended check.
        self.assertEqual(witness, ["Grade"])
        # Difficulty and Intelligence are dependent unconditionally
        chia, pva, witnessa = self.l.discrete_condind(
            self.samplediscseq, "Difficulty", "Intelligence", [])
        self.assertTrue(pva < .05)

    def test_discrete_estimatebn(self):
        result = self.l.discrete_estimatebn(self.samplediscseq)
        self.assertTrue(result.V)
        self.assertTrue(result.E)
        self.assertTrue(result.Vdata["Difficulty"]["cprob"][0])

    def test_lg_estimatebn(self):
        result = self.l.lg_estimatebn(self.samplelgseq)
        self.assertTrue(result.V)
        self.assertTrue(result.E)
        self.assertTrue(result.Vdata["Intelligence"]["mean_base"])
def learnDiscreteBN(df, continous_columns, features_column_names, label_column='cat', draw_network=False): features_df = df.copy() features_df = features_df.drop(label_column, axis=1) labels_df = DataFrame() labels_df[label_column] = df[label_column].copy() for i in continous_columns: bins = np.arange((min(features_df[i])), (max(features_df[i])), ((max(features_df[i]) - min(features_df[i])) / 5.0)) features_df[i] = pandas.np.digitize(features_df[i], bins=bins) data = [] for index, row in features_df.iterrows(): dict = {} for i in features_column_names: dict[i] = row[i] dict[label_column] = labels_df[label_column][index] data.append(dict) print "Init done" learner = PGMLearner() test = learner.discrete_estimatebn(data=data, pvalparam=0.05, indegree=1) # print test.__dict__ f = open('heart_structure.txt', 'w') s = str(test.__dict__) f.write(s) f.flush() f.close() print "done learning" edges = test.E vertices = test.V probas = test.Vdata # print probas dot_string = 'digraph BN{\n' dot_string += 'node[fontname="Arial"];\n' dataframes = {} print "save data" for vertice in vertices: print "New vertice: " + str(vertice) dataframe = DataFrame() pp = pprint.PrettyPrinter(indent=4) # pp.pprint(probas[vertice]) dot_string += vertice.replace(" ", "_") + ' [label="' + vertice + '\n' + '" ]; \n' if len(probas[vertice]['parents']) == 0: dataframe['Outcome'] = None dataframe['Probability'] = None vertex_dict = {} for index_outcome, outcome in enumerate(probas[vertice]['vals']): vertex_dict[str(outcome)] = probas[vertice]["cprob"][index_outcome] od = collections.OrderedDict(sorted(vertex_dict.items())) # print "Vertice: " + str(vertice) # print "%-7s|%-11s" % ("Outcome", "Probability") # print "-------------------" for k, v in od.iteritems(): # print "%-7s|%-11s" % (str(k), str(round(v, 3))) dataframe.loc[len(dataframe)] = [k, v] dataframes[vertice] = dataframe else: # pp.pprint(probas[vertice]) dataframe['Outcome'] = None vertexen = {} for index_outcome, outcome in 
enumerate(probas[vertice]['vals']): temp = [] for parent_index, parent in enumerate(probas[vertice]["parents"]): # print str([str(float(index_outcome))]) temp = probas[vertice]["cprob"] dataframe[parent] = None vertexen[str(outcome)] = temp dataframe['Probability'] = None od = collections.OrderedDict(sorted(vertexen.items())) # [str(float(i)) for i in ast.literal_eval(key)] # str(v[key][int(float(k))-1]) # print "Vertice: " + str(vertice) + " with parents: " + str(probas[vertice]['parents']) # print "Outcome" + "\t\t" + '\t\t'.join(probas[vertice]['parents']) + "\t\tProbability" # print "------------" * len(probas[vertice]['parents']) *3 # pp.pprint(od.values()) counter = 0 # print number_of_cols for outcome, cprobs in od.iteritems(): for key in cprobs.keys(): array_frame = [] array_frame.append((outcome)) print_string = str(outcome) + "\t\t" for parent_value, parent in enumerate([i for i in ast.literal_eval(key)]): # print "parent-value:"+str(parent_value) # print "parten:"+str(parent) array_frame.append(int(float(parent))) # print "lengte array_frame: "+str(len(array_frame)) print_string += parent + "\t\t" array_frame.append(cprobs[key][counter]) # print "lengte array_frame (2): "+str(len(array_frame)) # print cprobs[key][counter] print_string += str(cprobs[key][counter]) + "\t" # for stront in [str(round(float(i), 3)) for i in ast.literal_eval(key)]: # print_string += stront + "\t\t" # print "print string: " + print_string # print "array_frame:" + str(array_frame) dataframe.loc[len(dataframe)] = array_frame counter += 1 print "Vertice " + str(vertice) + " done" dataframes[vertice] = dataframe for edge in edges: dot_string += edge[0].replace(" ", "_") + ' -> ' + edge[1].replace(" ", "_") + ';\n' dot_string += '}' src = Source(dot_string) if draw_network:src.render('../data/BN', view=draw_network) if draw_network:src.render('../data/BN', view=False) print "vizualisation done" return dataframes
import json
from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner

# Load node data and graph skeleton from the same JSON file, then build a
# reference network from them.
nd = NodeData()
nd.load("nodedata.json")
skel = GraphSkeleton()
skel.load("nodedata.json")
skel.toporder()
# NOTE(review): bn is constructed but never used below - presumably kept for
# interactive use or later sampling; confirm before removing.
bn = DiscreteBayesianNetwork(skel,nd)

# Read the sample records to learn from.
with open("manipulatedata.json") as fp:
    data = json.load(fp)

# Fully learn a network (structure + parameters) from the data.
learner = PGMLearner()
# result = learner.discrete_constraint_estimatestruct(data)
result = learner.discrete_estimatebn(data)

# Dump the learned edges and CPTs as JSON.
print json.dumps(result.E, indent=2)
print json.dumps(result.Vdata, indent=2)
# estimate parameters result = learner.lg_estimatebn(data) # output - toggle comment to see #print json.dumps(result.E, indent=2) #print json.dumps(result.Vdata, indent=2) # say I have some data data = bn.randomsample(2000) # instantiate my learner learner = PGMLearner() # estimate parameters result = learner.discrete_estimatebn(data) # output - toggle comment to see #print json.dumps(result.E, indent=2) #print json.dumps(result.Vdata, indent=2) # (13) ----------------------------------------------------------------------- # Forward sample on dynamic Bayesian networks # read input file path = "../tests/unittestdyndict.txt" f = open(path, 'r') g = eval(f.read()) # set up dynamic BN d = DynDiscBayesianNetwork()
# ## Learning both the structure and the parameters # In[ ]: #instatiate the learner learner_full = PGMLearner() # Learn structure and parameters. This method fully learns a BN from # discrete data given by data. This function combines the # discrete_constraint_estimatestruct method (where it passes in the # pvalparam and indegree arguments) with the discrete_mle_estimateparams method. # It returns a complete DiscreteBayesianNetwork class instance learned from the data result_full_bn = learner_full.discrete_estimatebn(training_data) #result_full_bn.E # In[ ]: # We can also manually test and verify how independent two varaibles are # In[ ]: learner_indep = PGMLearner() learner_indep.discrete_condind(training_data,'Surv', 'Fare', ['Class'])
Value=row.Value, Overall=row.Overall)) # result.append(dict(great = row.great, good = row.good, nice = row.nice, clean = row.clean, helpful = row.helpful, comfortable = row.comfortable, # beautiful = row.beautiful, wonderful = row.wonderful, friendly = row.friendly, fantastic = row.fantastic, bad = row.bad, # Cleanliness= row.Cleanliness, Location=row.Location ,Businessservice=row.Businessservice, # Checkin=row.Checkin, Service=row.Service, Rooms=row.Rooms, Value=row.Value, Overall=row.Overall )) return result #load all preprocessed training data df = pd.read_csv('./features_filtrato.csv', sep=',') #format data to let them correctly processed by libpgm functions node_data = format_data(df) skel = GraphSkeleton() #load structure of our net #skel.load("./json_skel.txt") #setting the topologic order #skel.toporder() #learner which will estimate parameters e if needed net structure learner = PGMLearner() #estismting parameters for our own model #res = learner.discrete_mle_estimateparams(skel, node_data) #estimating net structure given training data and paramenters this is an alternative to create a new model on our data net = learner.discrete_estimatebn(node_data) print json.dumps(net.V, indent=2) print json.dumps(net.E, indent=2) res = learner.discrete_mle_estimateparams(net, node_data) print(str(res))
def learnDiscreteBN(df, continous_columns, features_column_names, label_column='cat', draw_network=False): features_df = df.copy() features_df = features_df.drop(label_column, axis=1) labels_df = DataFrame() labels_df[label_column] = df[label_column].copy() for i in continous_columns: bins = np.arange((min(features_df[i])), (max(features_df[i])), ((max(features_df[i]) - min(features_df[i])) / 5.0)) features_df[i] = pandas.np.digitize(features_df[i], bins=bins) data = [] for index, row in features_df.iterrows(): dict = {} for i in features_column_names: dict[i] = row[i] dict[label_column] = labels_df[label_column][index] data.append(dict) print "Init done" learner = PGMLearner() test = learner.discrete_estimatebn(data=data, pvalparam=0.05, indegree=1) # print test.__dict__ f = open('heart_structure.txt', 'w') s = str(test.__dict__) f.write(s) f.flush() f.close() print "done learning" edges = test.E vertices = test.V probas = test.Vdata # print probas dot_string = 'digraph BN{\n' dot_string += 'node[fontname="Arial"];\n' dataframes = {} print "save data" for vertice in vertices: print "New vertice: " + str(vertice) dataframe = DataFrame() pp = pprint.PrettyPrinter(indent=4) # pp.pprint(probas[vertice]) dot_string += vertice.replace( " ", "_") + ' [label="' + vertice + '\n' + '" ]; \n' if len(probas[vertice]['parents']) == 0: dataframe['Outcome'] = None dataframe['Probability'] = None vertex_dict = {} for index_outcome, outcome in enumerate(probas[vertice]['vals']): vertex_dict[str( outcome)] = probas[vertice]["cprob"][index_outcome] od = collections.OrderedDict(sorted(vertex_dict.items())) # print "Vertice: " + str(vertice) # print "%-7s|%-11s" % ("Outcome", "Probability") # print "-------------------" for k, v in od.iteritems(): # print "%-7s|%-11s" % (str(k), str(round(v, 3))) dataframe.loc[len(dataframe)] = [k, v] dataframes[vertice] = dataframe else: # pp.pprint(probas[vertice]) dataframe['Outcome'] = None vertexen = {} for index_outcome, outcome in 
enumerate(probas[vertice]['vals']): temp = [] for parent_index, parent in enumerate( probas[vertice]["parents"]): # print str([str(float(index_outcome))]) temp = probas[vertice]["cprob"] dataframe[parent] = None vertexen[str(outcome)] = temp dataframe['Probability'] = None od = collections.OrderedDict(sorted(vertexen.items())) # [str(float(i)) for i in ast.literal_eval(key)] # str(v[key][int(float(k))-1]) # print "Vertice: " + str(vertice) + " with parents: " + str(probas[vertice]['parents']) # print "Outcome" + "\t\t" + '\t\t'.join(probas[vertice]['parents']) + "\t\tProbability" # print "------------" * len(probas[vertice]['parents']) *3 # pp.pprint(od.values()) counter = 0 # print number_of_cols for outcome, cprobs in od.iteritems(): for key in cprobs.keys(): array_frame = [] array_frame.append((outcome)) print_string = str(outcome) + "\t\t" for parent_value, parent in enumerate( [i for i in ast.literal_eval(key)]): # print "parent-value:"+str(parent_value) # print "parten:"+str(parent) array_frame.append(int(float(parent))) # print "lengte array_frame: "+str(len(array_frame)) print_string += parent + "\t\t" array_frame.append(cprobs[key][counter]) # print "lengte array_frame (2): "+str(len(array_frame)) # print cprobs[key][counter] print_string += str(cprobs[key][counter]) + "\t" # for stront in [str(round(float(i), 3)) for i in ast.literal_eval(key)]: # print_string += stront + "\t\t" # print "print string: " + print_string # print "array_frame:" + str(array_frame) dataframe.loc[len(dataframe)] = array_frame counter += 1 print "Vertice " + str(vertice) + " done" dataframes[vertice] = dataframe for edge in edges: dot_string += edge[0].replace(" ", "_") + ' -> ' + edge[1].replace( " ", "_") + ';\n' dot_string += '}' src = Source(dot_string) if draw_network: src.render('../data/BN', view=draw_network) if draw_network: src.render('../data/BN', view=False) print "vizualisation done" return dataframes