Example #1
def bayesNetDiscrete(textFile, quant_no, unique):
    cleanText(textFile, 'tempOutput.txt')

    ## import the cleaned text file into pandas
    try:
        df = pd.read_csv('tempOutput.txt',
                         sep=r'\s+',
                         dtype='float64',
                         header=None)
    except Exception:
        print('next file')
        return
    df.fillna(0, inplace=True)
    # convert_objects() is deprecated and returned a copy; assign the result
    df = df.apply(pd.to_numeric, errors='coerce')

    ## group with either setUnique() or setMax()
    if unique:
        grouped = setUnique(df)
    else:
        grouped = setMax(df)

    ## quantiles use qcut(); fixed-width divisions use cut()
    grouped = quantize(quant_no, grouped)

    # convert to the dictionary format libpgm expects
    newDict = DFtoLibpgm(grouped)

    # instantiate the learner
    learner = PGMLearner()

    # estimate structure and parameters
    try:
        result = learner.discrete_estimatebn(newDict)
    except Exception:
        print('error')
        return

    # output
    return result
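
The quantize() helper called above is not part of this example. A minimal sketch, assuming it bins every column into quant_no quantile buckets with pandas.qcut (the body is a guess, not the original helper):

import pandas as pd

def quantize(quant_no, df):
    # Hypothetical stand-in for the missing helper: qcut() for quantiles;
    # swap in pd.cut() for fixed-width divisions, as the comment suggests.
    out = df.copy()
    for col in out.columns:
        # labels=False yields integer bin indices; duplicates='drop'
        # tolerates columns with few distinct values
        out[col] = pd.qcut(out[col], quant_no, labels=False, duplicates='drop')
    return out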
Example #3
def main():
    # filename
    features_file = './../data/features.csv'

    # read data into a list of dicts
    handwriting_features = postmaster.readCSVIntoListAsDict(features_file)

    # learn structure: instantiate the learner
    learner = PGMLearner()

    pvalue = 0.25
    indegree = 1
    # estimate structure and parameters
    #result = learner.discrete_constraint_estimatestruct(
    #    handwriting_features, pvalue, indegree)
    result = learner.discrete_estimatebn(handwriting_features)

    #result = learner.discrete_condind(handwriting_features, 'f1', 'f2',
    #    ['f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9'])
    # output
    #print(result.chi, result.pval, result.U)
    #print(json.dumps(result.E, indent=2))
    print(json.dumps(result.Vdata, indent=2))
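
postmaster.readCSVIntoListAsDict is a project-local helper that is not shown here. libpgm's discrete learners only need a list of dicts, one per sample, so a standard-library stand-in could look like this (the body is an assumption, not the original helper):

import csv

def readCSVIntoListAsDict(path):
    # Hypothetical stand-in: one dict per row, keyed by the CSV header row.
    with open(path) as f:
        return [dict(row) for row in csv.DictReader(f)]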
Example #4
class TestPGMLearner(unittest.TestCase):

    def setUp(self):
        # instantiate learner
        self.l = PGMLearner()

        # generate graph skeleton
        skel = GraphSkeleton()
        skel.load("unittestdict.txt")
        skel.toporder()

        # generate sample sequence to try to learn from - discrete
        nd = NodeData()
        nd.load("unittestdict.txt")
        self.samplediscbn = DiscreteBayesianNetwork(skel, nd)
        self.samplediscseq = self.samplediscbn.randomsample(5000)

        # generate sample sequence to try to learn from - linear Gaussian
        nda = NodeData()
        nda.load("unittestlgdict.txt")
        self.samplelgbn = LGBayesianNetwork(skel, nda)
        self.samplelgseq = self.samplelgbn.randomsample(10000)

        self.skel = skel

    def test_discrete_mle_estimateparams(self):
        result = self.l.discrete_mle_estimateparams(self.skel, self.samplediscseq)
        indexa = result.Vdata['SAT']['vals'].index('lowscore')
        self.assertTrue(.9 < result.Vdata['SAT']['cprob']["['low']"][indexa] < 1)
        indexb = result.Vdata['Letter']['vals'].index('weak')
        self.assertTrue(.05 < result.Vdata['Letter']['cprob']["['A']"][indexb] < .15)

    def test_lg_mle_estimateparams(self):
        result = self.l.lg_mle_estimateparams(self.skel, self.samplelgseq)
        self.assertTrue(5 < result.Vdata['SAT']['mean_base'] < 15)
        self.assertTrue(5 < result.Vdata['Letter']['variance'] < 15)

    def test_discrete_constraint_estimatestruct(self):
        result = self.l.discrete_constraint_estimatestruct(self.samplediscseq)
        self.assertTrue(["Difficulty", "Grade"] in result.E)

    def test_lg_constraint_estimatestruct(self):
        result = self.l.lg_constraint_estimatestruct(self.samplelgseq)
        self.assertTrue(["Intelligence", "Grade"] in result.E)

    def test_discrete_condind(self):
        chi, pv, witness = self.l.discrete_condind(self.samplediscseq, "Difficulty", "Letter", ["Grade"])
        self.assertTrue(pv > .05)
        self.assertTrue(witness, ["Grade"])
        chia, pva, witnessa = self.l.discrete_condind(self.samplediscseq, "Difficulty", "Intelligence", [])
        self.assertTrue(pva < .05)

    def test_discrete_estimatebn(self):
        result = self.l.discrete_estimatebn(self.samplediscseq)
        self.assertTrue(result.V)
        self.assertTrue(result.E)
        self.assertTrue(result.Vdata["Difficulty"]["cprob"][0])

    def test_lg_estimatebn(self):
        result = self.l.lg_estimatebn(self.samplelgseq)
        self.assertTrue(result.V)
        self.assertTrue(result.E)
        self.assertTrue(result.Vdata["Intelligence"]["mean_base"])
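
To run this test case standalone, the usual unittest entry point applies:

if __name__ == "__main__":
    unittest.main()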
Example #7
import json

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner

nd = NodeData()
nd.load("nodedata.json")
skel = GraphSkeleton()
skel.load("nodedata.json")
skel.toporder()

bn = DiscreteBayesianNetwork(skel, nd)
with open("manipulatedata.json") as fp:
    data = json.load(fp)

learner = PGMLearner()

# result = learner.discrete_constraint_estimatestruct(data)
result = learner.discrete_estimatebn(data)

print(json.dumps(result.E, indent=2))
print(json.dumps(result.Vdata, indent=2))
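
Because both NodeData and GraphSkeleton are loaded from the same file, nodedata.json must carry all three top-level keys libpgm reads. A sketch of the expected shape, with illustrative node names and numbers that are not taken from this example:

# nodedata.json, illustrative contents:
# {
#     "V": ["Rain", "Sprinkler"],
#     "E": [["Rain", "Sprinkler"]],
#     "Vdata": {
#         "Rain": {
#             "ord": 0,
#             "numoutcomes": 2,
#             "vals": ["yes", "no"],
#             "parents": null,
#             "children": ["Sprinkler"],
#             "cprob": [0.2, 0.8]
#         }
#     }
# }
# (Vdata needs one entry per vertex in V; nodes with parents key their
# cprob by the stringified list of parent values.)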
Example #8
# estimate parameters
result = learner.lg_estimatebn(data)

# output - toggle comment to see
#print(json.dumps(result.E, indent=2))
#print(json.dumps(result.Vdata, indent=2))

# say I have some data
data = bn.randomsample(2000)

# instantiate my learner
learner = PGMLearner()

# estimate parameters
result = learner.discrete_estimatebn(data)

# output - toggle comment to see
#print(json.dumps(result.E, indent=2))
#print(json.dumps(result.Vdata, indent=2))

# (13) -----------------------------------------------------------------------
# Forward sample on dynamic Bayesian networks

# read input file
path = "../tests/unittestdyndict.txt"
with open(path, 'r') as f:
    g = eval(f.read())  # the file holds a Python dict literal

# set up dynamic BN
d = DynDiscBayesianNetwork()
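
The excerpt stops right after constructing the dynamic network. In libpgm's own examples the loaded dictionary g is then copied onto the instance before sampling; a sketch of that continuation, with the field names treated as an assumption here:

d.V = g["V"]
d.E = g["E"]
d.initial_Vdata = g["initial_Vdata"]
d.twotbn_Vdata = g["twotbn_Vdata"]

# forward-sample a sequence of 10 time steps
seq = d.randomsample(10)
print(seq)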
Example #9

# ## Learning both the structure and the parameters

# In[ ]:


# instantiate the learner
learner_full = PGMLearner()

# Learn structure and parameters. This call fully learns a BN from the
# discrete data in training_data. It combines the
# discrete_constraint_estimatestruct method (passing through the
# pvalparam and indegree arguments) with the discrete_mle_estimateparams
# method, and returns a complete DiscreteBayesianNetwork instance learned
# from the data.
result_full_bn = learner_full.discrete_estimatebn(training_data)

#result_full_bn.E
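
As a cross-check, the same result can be assembled from the two underlying calls described above; a sketch, relying on libpgm's documented defaults of pvalparam=0.05 and indegree=1:

learner_manual = PGMLearner()

# step 1: constraint-based structure search
skel_manual = learner_manual.discrete_constraint_estimatestruct(training_data)
skel_manual.toporder()

# step 2: maximum-likelihood parameters on the learned skeleton
result_manual = learner_manual.discrete_mle_estimateparams(skel_manual, training_data)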


# In[ ]:


# We can also manually test and verify how independent two variables are


# In[ ]:


learner_indep = PGMLearner()
learner_indep.discrete_condind(training_data,'Surv', 'Fare', ['Class'])
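
discrete_condind returns a chi-square statistic, a p-value, and the witness set, so the call reads better unpacked (the variable names here are my own):

chi, pval, witness = learner_indep.discrete_condind(training_data, 'Surv', 'Fare', ['Class'])

# a large p-value means we cannot reject independence of 'Surv' and 'Fare'
# given 'Class'
print(chi, pval, witness)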
Example #10

def format_data(df):
    # NOTE: the opening of this example is missing from the source; the
    # signature and loop header here are a reconstruction, and only the
    # last two keyword arguments of the dict survive in the fragment.
    result = []
    for row in df.itertuples():
        result.append(dict(
                 Value=row.Value,
                 Overall=row.Overall))
        #      result.append(dict(great=row.great, good=row.good, nice=row.nice, clean=row.clean, helpful=row.helpful, comfortable=row.comfortable,
        #          beautiful=row.beautiful, wonderful=row.wonderful, friendly=row.friendly, fantastic=row.fantastic, bad=row.bad,
        #          Cleanliness=row.Cleanliness, Location=row.Location, Businessservice=row.Businessservice,
        #          Checkin=row.Checkin, Service=row.Service, Rooms=row.Rooms, Value=row.Value, Overall=row.Overall))
    return result


# load all preprocessed training data
df = pd.read_csv('./features_filtrato.csv', sep=',')
# format the data so it can be processed correctly by the libpgm functions
node_data = format_data(df)

skel = GraphSkeleton()
# load the structure of our net
#skel.load("./json_skel.txt")
# set the topological order
#skel.toporder()
# learner that will estimate the parameters and, if needed, the net structure
learner = PGMLearner()

# estimating parameters for our own model
#res = learner.discrete_mle_estimateparams(skel, node_data)

# estimating the net structure from the training data; this is an
# alternative to creating a new model on our data
net = learner.discrete_estimatebn(node_data)
print(json.dumps(net.V, indent=2))
print(json.dumps(net.E, indent=2))
res = learner.discrete_mle_estimateparams(net, node_data)
print(str(res))
Example #11
import ast
import collections

import numpy as np
from pandas import DataFrame
from graphviz import Source
from libpgm.pgmlearner import PGMLearner


def learnDiscreteBN(df,
                    continous_columns,
                    features_column_names,
                    label_column='cat',
                    draw_network=False):
    features_df = df.copy()
    features_df = features_df.drop(label_column, axis=1)

    labels_df = DataFrame()
    labels_df[label_column] = df[label_column].copy()

    # discretize each continuous column into 5 fixed-width bins
    for i in continous_columns:
        bins = np.arange(min(features_df[i]), max(features_df[i]),
                         (max(features_df[i]) - min(features_df[i])) / 5.0)
        features_df[i] = np.digitize(features_df[i], bins=bins)

    # build the list-of-dicts format libpgm expects
    data = []
    for index, row in features_df.iterrows():
        row_dict = {}
        for i in features_column_names:
            row_dict[i] = row[i]
        row_dict[label_column] = labels_df[label_column][index]
        data.append(row_dict)

    print("Init done")
    learner = PGMLearner()

    test = learner.discrete_estimatebn(data=data, pvalparam=0.05, indegree=1)

    # dump the learned structure to disk
    with open('heart_structure.txt', 'w') as f:
        f.write(str(test.__dict__))

    print "done learning"
    edges = test.E
    vertices = test.V
    probas = test.Vdata

    # print probas

    dot_string = 'digraph BN{\n'
    dot_string += 'node[fontname="Arial"];\n'

    dataframes = {}

    print "save data"
    for vertice in vertices:
        print "New vertice: " + str(vertice)
        dataframe = DataFrame()

        pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(probas[vertice])
        dot_string += vertice.replace(
            " ", "_") + ' [label="' + vertice + '\n' + '" ]; \n'

        if len(probas[vertice]['parents']) == 0:
            # no parents: cprob is a flat list of outcome probabilities
            dataframe['Outcome'] = None
            dataframe['Probability'] = None
            vertex_dict = {}
            for index_outcome, outcome in enumerate(probas[vertice]['vals']):
                vertex_dict[str(
                    outcome)] = probas[vertice]["cprob"][index_outcome]

            od = collections.OrderedDict(sorted(vertex_dict.items()))
            for k, v in od.items():
                dataframe.loc[len(dataframe)] = [k, v]
            dataframes[vertice] = dataframe
        else:
            # with parents: cprob is keyed by stringified parent-value lists
            dataframe['Outcome'] = None

            vertexen = {}
            for index_outcome, outcome in enumerate(probas[vertice]['vals']):
                temp = []
                for parent_index, parent in enumerate(
                        probas[vertice]["parents"]):
                    temp = probas[vertice]["cprob"]
                    dataframe[parent] = None
                vertexen[str(outcome)] = temp

            dataframe['Probability'] = None
            od = collections.OrderedDict(sorted(vertexen.items()))

            counter = 0
            for outcome, cprobs in od.items():
                for key in cprobs.keys():
                    # one row per parent configuration: the outcome, the
                    # parent values, then the conditional probability
                    array_frame = [outcome]
                    for parent in ast.literal_eval(key):
                        array_frame.append(int(float(parent)))
                    array_frame.append(cprobs[key][counter])
                    dataframe.loc[len(dataframe)] = array_frame
                counter += 1
        print("Vertex " + str(vertice) + " done")
        dataframes[vertice] = dataframe

    for edge in edges:
        dot_string += edge[0].replace(" ", "_") + ' -> ' + edge[1].replace(
            " ", "_") + ';\n'

    dot_string += '}'
    src = Source(dot_string)
    if draw_network:
        src.render('../data/BN', view=draw_network)
    print("visualization done")
    return dataframes
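
A hypothetical call, assuming a frame with a 'cat' label column; the CSV path and column names below are illustrative only:

import pandas as pd

heart = pd.read_csv('heart.csv')
cpts = learnDiscreteBN(heart,
                       continous_columns=['age', 'chol'],
                       features_column_names=['age', 'chol', 'sex'],
                       label_column='cat',
                       draw_network=False)
for vertex, table in cpts.items():
    print(vertex)
    print(table)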