Example #1
def buildBN(trainingData, binstyleDict, numbinsDict,
            **kwargs):  # accepts a GraphSkeleton or a skeleton file path via 'skel'

    discretized_training_data, bin_ranges = discretizeTrainingData(
        trainingData, binstyleDict, numbinsDict, True)
    print 'discretized training ', discretized_training_data

    if 'skel' in kwargs:
        # load file into skeleton
        if isinstance(kwargs['skel'], basestring):
            skel = GraphSkeleton()
            skel.load(kwargs['skel'])
            skel.toporder()
        else:
            skel = kwargs['skel']

    # learn bayesian network
    learner = PGMLearner()
    # baynet = learner.discrete_mle_estimateparams(skel, discretized_training_data)
    # baynet = discrete_estimatebn(learner, discretized_training_data, skel, 0.05, 1)
    baynet = discrete_mle_estimateparams2(
        skel, discretized_training_data
    )  # using discrete_mle_estimateparams2 written as function in this file, not calling from libpgm

    return baynet
Example #2
def estimate_distrib(skel, samples, query, evidence):
    learner = PGMLearner()
    bayesnet = learner.discrete_mle_estimateparams(skel, samples)
    tablecpd = TableCPDFactorization(bayesnet)
    fac = tablecpd.condprobve(query, evidence)
    df2 = printdist(fac, bayesnet)
    return df2
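
A hedged usage sketch, assuming a skeleton file, libpgm's list-of-dicts sample format, and the printdist helper defined elsewhere in the source project:

skel = GraphSkeleton()
skel.load("skeleton.json")  # hypothetical skeleton file
skel.toporder()
samples = [{'Rain': 'yes', 'Sprinkler': 'no'},
           {'Rain': 'no', 'Sprinkler': 'yes'}]  # in practice, many samples
df = estimate_distrib(skel, samples, query=dict(Rain='yes'), evidence=dict(Sprinkler='no'))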
Example #3
def test_libpgm(df1):

    data = df1.T.to_dict().values()
    #pprint(data)
    skel = GraphSkeleton()
    skel.load("bn_struct.txt")
    
    learner = PGMLearner()
    result = learner.discrete_mle_estimateparams(skel, data)
    
    print json.dumps(result.Vdata, indent=2)
Example #4
def learnBN(fdata_array, bn_file):

    bn_path = os.path.join(experiment_dir, 'parameters', bn_file + '.txt')

    skel = GraphSkeleton()
    skel.load(bn_path)
    skel.toporder()

    learner = PGMLearner()
    bn = learner.discrete_mle_estimateparams(skel, fdata_array)

    return bn
Example #5
def getBNparams(graph, ddata, n):
    # Gets discrete BN parameters given a graph skeleton.
    # The skeleton should include t-1 and t nodes for each variable.
    nodes = range(1, (n * 2) + 1)
    nodes = map(str, nodes)
    edges = gk.edgelist(graph)
    for i in range(len(edges)):
        edges[i] = list([edges[i][0], str(n + int(edges[i][1]))])
    skel = GraphSkeleton()
    skel.V = nodes
    skel.E = edges
    learner = PGMLearner()
    result = learner.discrete_mle_estimateparams(skel, ddata)
    return result
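
A small worked illustration of the time-slice remapping above, assuming gk.edgelist returns pairs of string node ids:

n = 2
edges = [['1', '2']]  # variable 1 at t-1 influences variable 2 at t
edges = [[e[0], str(n + int(e[1]))] for e in edges]
print edges  # [['1', '4']]: the edge target is shifted into the t slice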
Example #6
def __init__(self):
    self.learner = PGMLearner()
    rospy.Service("~discrete/parameter_estimation",
                  DiscreteParameterEstimation,
                  self.discrete_parameter_estimation_cb)
    rospy.Service("~discrete/query", DiscreteQuery, self.discrete_query_cb)
    rospy.Service("~discrete/structure_estimation",
                  DiscreteStructureEstimation,
                  self.discrete_structure_estimation_cb)
    rospy.Service("~linear_gaussian/parameter_estimation",
                  LinearGaussianParameterEstimation,
                  self.lg_parameter_estimation_cb)
    rospy.Service("~linear_gaussian/structure_estimation",
                  LinearGaussianStructureEstimation,
                  self.lg_structure_estimation_cb)
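
A client-side sketch for calling one of these services, assuming the node runs under the name pgm_learner and that the service types above come from a hypothetical pgm_learner.srv module:

import rospy
from pgm_learner.srv import DiscreteQuery  # assumed location of the service type

rospy.init_node("pgm_learner_client")
rospy.wait_for_service("/pgm_learner/discrete/query")
query_srv = rospy.ServiceProxy("/pgm_learner/discrete/query", DiscreteQuery)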
Example #7
def em(data, bn, skel):
    # Alternate an E-step (complete the hidden values using the current network)
    # and an M-step (MLE refit) until the relative change in log-likelihood
    # falls below 0.1%.
    lk_last = 100
    times = 0
    while True:
        d2 = data_with_hidden(data, bn)  # E-step
        learner = PGMLearner()
        bn = learner.discrete_mle_estimateparams(skel, d2)  # M-step
        lk = likelihood(d2, bn)
        print "LogLikelihood:", lk
        times += 1

        if abs((lk - lk_last) / lk_last) < 0.001:
            break
        lk_last = lk
    print times
    return bn
Example #8
def net2():
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load("net.txt")  # an input file
    skel.load("net.txt")

    # topologically order graphskeleton
    skel.toporder()

    # build a bayesian network from the file's node data (not used further below)
    lgbn = LGBayesianNetwork(skel, nd)

    in_data = read_data.getdata2()
    learner = PGMLearner()
    bn = learner.lg_mle_estimateparams(skel, in_data)

    p = cal_prob(in_data[300:500], bn)
    print p
    return 0
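
The learned linear-Gaussian parameters can then be inspected per node; in libpgm's format each entry of bn.Vdata carries 'mean_base', 'mean_scal', and 'variance' fields (a sketch, assuming the bn fitted above):

for name, params in bn.Vdata.items():
    print name, params["mean_base"], params["mean_scal"], params["variance"]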
Example #9
def bayesNet(textFile):
    cleanText(textFile, 'tempOutput.txt')

    ## imports textFile into pandas

    try:
        df = pd.read_csv('tempOutput.txt',
                         sep='\s+',
                         dtype='float32',
                         header=None)
    except:
        print 'next file'
        return
    df.fillna(0, inplace=True)
    df = df.convert_objects(convert_numeric=True)  # returns a copy, so reassign

    ##

    for i, row in df.iterrows():
        print df.ix[0, i]
        df.ix[0, i] = df.ix[0, i] + str(i)

    grouped = df.set_index([0], verify_integrity=True)

    df2 = grouped.to_dict()

    print json.dumps(df2, indent=2)

    newDict = []

    for key in df2.keys():
        newDict.append(df2[key])

    #print json.dumps(newDict, indent=2)

    # instantiate my learner
    learner = PGMLearner()

    # estimate structure
    result = learner.lg_constraint_estimatestruct(newDict)

    # output
    return json.dumps(result.E, indent=2)
Example #10
def calc_accuracy(dff_train, dff_train_target, nb_iterations):
    
    result = np.zeros(nb_iterations)

    for itera in range(nb_iterations):
        XX_train, XX_test, yy_train, yy_test = train_test_split(
            dff_train, dff_train_target, test_size=0.33)
        data4bn = format_data(XX_train)
        learner = PGMLearner()
        # estimate parameters (skel is assumed to be defined in the enclosing scope)
        result_bn = learner.discrete_mle_estimateparams(skel, data4bn)
        #result_bn.Vdata
        result_predict = calc_BNprob(XX_test)
        BN_test_probs = pd.DataFrame()
        BN_test_probs['ground_truth'] = yy_test
        Test_prob = pd.concat([yy_test.reset_index().Surv, result_predict],
                              axis=1, ignore_index=True).rename(
                                  columns={0: 'ground_truth', 1: 'class_resu'})
        accuracy = Test_prob[Test_prob.ground_truth == Test_prob.class_resu].shape[0] \
            / (1.0 * Test_prob.shape[0])
        #print("Accuracy is {}".format(accuracy))
        result[itera] = accuracy
        
    return result
Example #11
    def setUp(self):
        # instantiate learner
        self.l = PGMLearner()

        # generate graph skeleton
        skel = GraphSkeleton()
        skel.load("unittestdict.txt")
        skel.toporder()

        # generate sample sequence to try to learn from - discrete
        nd = NodeData.load("unittestdict.txt")
        self.samplediscbn = DiscreteBayesianNetwork(nd)
        self.samplediscseq = self.samplediscbn.randomsample(5000)

        # generate sample sequence to try to learn from - linear Gaussian
        nda = NodeData.load("unittestlgdict.txt")
        self.samplelgbn = LGBayesianNetwork(nda)
        self.samplelgseq = self.samplelgbn.randomsample(10000)

        self.skel = skel
Example #12
def bayesNetDiscrete(textFile, quant_no, unique):
    cleanText(textFile, 'tempOutput.txt')

    ## imports textFile into pandas
    try:
        df = pd.read_csv('tempOutput.txt',
                         sep='\s+',
                         dtype='float64',
                         header=None)
    except:
        print 'next file'
        return
    df.fillna(0, inplace=True)
    df = df.convert_objects(convert_numeric=True)  # returns a copy, so reassign

    ## set to either setUnique() or setMax()
    if unique is True:
        grouped = setUnique(df)
    else:
        grouped = setMax(df)

    ## quantiles use qcut(); fixed-width divisions use cut()
    grouped = quantize(quant_no, grouped)

    #turns into correct dictionary format for libpgm
    newDict = DFtoLibpgm(grouped)

    # instantiate my learner
    learner = PGMLearner()

    # estimate structure
    try:
        result = learner.discrete_estimatebn(newDict)
    except:
        print 'error'
        #result = learner.discrete_estimatebn([dict([('a',1),('b',2)])])
        return


    # output
    return result
Example #13
def bayesNetCont(textFile, unique):
    cleanText(textFile, 'tempOutput.txt')

    ## imports textFile into pandas
    try:
        df = pd.read_csv('tempOutput.txt',
                         sep='\s+',
                         dtype='float64',
                         header=None)
    except:
        print 'next file'
        return
    df.fillna(0, inplace=True)
    df = df.convert_objects(convert_numeric=True)  # returns a copy, so reassign

    ## set to either setUnique() or setMax()
    if unique is True:
        grouped = setUnique(df)
    else:
        grouped = setMax(df)

    #turns into correct dictionary format for libpgm
    newDict = DFtoLibpgm(grouped)

    # instantiate my learner
    learner = PGMLearner()

    # estimate structure
    #gaussian
    try:
        result = learner.lg_constraint_estimatestruct(newDict)
    except:
        print 'error'
        return

    # output
    return result
Example #14
def main():
    # filename
    features_file = './../data/features.csv'

    # read data into list
    handwriting_features = postmaster.readCSVIntoListAsDict(features_file)

    # learn structure
    # instantiate learner
    learner = PGMLearner()

    # thresholds intended for the commented-out constraint-based search below
    pvalue = 0.25
    indegree = 1
    # estimate structure
    #result = learner.discrete_constraint_estimatestruct(
    #	handwriting_features, pvalue, indegree)
    result = learner.discrete_estimatebn(handwriting_features)

    #result = learner.discrete_condind(handwriting_features, 'f1', 'f2',
    #	['f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9'])
    # output
    #print result.chi, result.pval, result.U
    #print json.dumps(result.E, indent=2)
    print json.dumps(result.Vdata, indent=2)
Example #15
def fun(inputData):

    # helper to format the DataFrame rows into libpgm's list-of-dicts input
    def format_data(df):
        result = []
        for row in df.itertuples():
            #print(row.Pclass)
            result.append(
                dict(great=row.great,
                     good=row.good,
                     clean=row.clean,
                     comfortable=row.comfortable,
                     bad=row.bad,
                     old=row.old,
                     Cleanliness=row.Cleanliness,
                     Location=row.Location,
                     Service=row.Service,
                     Rooms=row.Rooms,
                     Value=row.Value,
                     Overall=row.Overall))
        return result

    #load all preprocessed training data
    df = pd.read_csv('features.csv', sep=',')

    # format data so it can be processed correctly by libpgm functions
    node_data = format_data(df)

    skel = GraphSkeleton()
    #load structure of our net
    skel.load("./our-skel.txt")
    # set the topological order
    skel.toporder()
    # learner that will estimate parameters and, if needed, the net structure
    learner = PGMLearner()

    # estimating parameters for our own model
    res = learner.discrete_mle_estimateparams(skel, node_data)

    # get CPT
    a = TableCPDFactorization(res)
    # build the query and evidence as dicts
    query = dict(Overall=1)
    # prepare the evidence dictionary (the values after each equals sign come from the GUI)

    evidence = dict(Value=inputData[0],
                    Location=inputData[1],
                    Cleanliness=inputData[2],
                    Service=inputData[3],
                    Rooms=inputData[4],
                    bad=inputData[5],
                    old=inputData[6],
                    good=inputData[7],
                    great=inputData[8],
                    comfortable=inputData[9],
                    clean=inputData[10])

    print(query)
    print(evidence)

    #run the query given evidence
    result = a.condprobve(query, evidence)

    print json.dumps(result.vals, indent=2)
    #res.Vdata["Overall"]["vals"][pos]
    #arr=[]
    dizionario = {}
    for i in range(1, 6):
        dizionario[res.Vdata["Overall"]["vals"][i - 1]] = result.vals[i - 1]
    #    arr.append(dizionario)
    #print(str(arr))
    return dizionario
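
A hypothetical call, assuming the eleven evidence values are supplied in the order unpacked above:

scores = fun([3, 4, 5, 4, 3, 0, 0, 1, 1, 1, 1])  # placeholder GUI readings
print scores  # maps each 'Overall' value to its conditional probability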
Example #16
def bn_learn(attr, cicli, passed_file):
    path_to_sentiments = 'sentiment_AFINN'

    print "Using AFINN sentiment dictionary"

    if attr == 0:
        print "Considering tweets' number"
    elif attr == 1:
        print "Considering averaged number of positive, negative and neutral tweets"
    elif attr == 2:
        print "Considering averaged value of positive and negative tweets"
    elif attr == 3:
        print "Considering positive and negative tweets\' increment"
    elif attr == 4:
        print "Considering bullishness index obtained from tweet sentiment counts"
    elif attr == 5:
        print "Considering bullishness index obtained from tweet sentiment values"

    print "And considering market trend"

    all_data = []
    files = [
        path_to_sentiments + "/" + file
        for file in os.listdir(path_to_sentiments) if file.endswith('.json')
    ]
    for file in files:
        with open(file) as sentiment_file:
            data = json.load(sentiment_file)

            vdata = {}
            if attr == 0:
                vdata["com"] = data["n_tweets"]
            elif attr == 1:
                vdata["pos"] = data["n_pos_ave"]
                vdata["neg"] = data["n_neg_ave"]
                vdata["neu"] = data["n_neu_ave"]
            elif attr == 2:
                vdata["pos"] = data["pos_val_ave"]
                vdata["neg"] = data["neg_val_ave"]
            elif attr == 3:
                vdata["pos"] = data["pos_inc"]
                vdata["neg"] = data["neg_inc"]
            elif attr == 4:
                vdata["com"] = data["bull_ind"]
            elif attr == 5:
                vdata["com"] = data["bull_ind_val"]

            vdata["market"] = data["market_inc"]

            all_data.append(vdata)

    skel = GraphSkeleton()
    if len(all_data[0]) == 2:
        skel.load("network_struct_1_vertex.json")
        print "Loading structure with 2 nodes"
    elif len(all_data[0]) == 3:
        skel.load("network_struct_2_vertex.json")
        print "Loading structure with 3 nodes"
    elif len(all_data[0]) == 4:
        skel.load("network_struct_3_vertex.json")
        print "Loading structure with 4 nodes"
    skel.toporder()

    learner = PGMLearner()
    result = learner.lg_mle_estimateparams(skel, all_data)
    for key in result.Vdata.keys():
        result.Vdata[key]['type'] = 'lg'

    prob_pos = prob_neg = prob_neu = 0
    for data in all_data:
        if data['market'] == 1:
            prob_pos += 1
        elif data['market'] == 0:
            prob_neu += 1
        else:
            prob_neg += 1
    prob_pos = float(prob_pos) / float(len(all_data))
    prob_neg = float(prob_neg) / float(len(all_data))
    prob_neu = float(prob_neu) / float(len(all_data))

    tmp = {}
    tmp['numoutcomes'] = 3  # three discrete outcomes, matching 'vals' below
    tmp['cprob'] = [prob_pos, prob_neg, prob_neu]
    tmp['parents'] = result.Vdata['market']['parents']
    tmp['vals'] = ['positive', 'negative', 'neutral']
    tmp['type'] = 'discrete'
    tmp['children'] = result.Vdata['market']['children']
    result.Vdata['market'] = tmp

    node = Discrete(result.Vdata["market"])
    print "Loading node as Discrete"

    estimated, real = mcmc_json(passed_file, attr, cicli, node)

    return estimated, real
Example #17
The training data is a list of dicts, e.g.:

[{'Class': 3, 'Fare': 0, 'Sex': 1, 'Surv': 0},
 {'Class': 1, 'Fare': 1, 'Sex': 0, 'Surv': 1},
 {'Class': 3, 'Fare': 0, 'Sex': 0, 'Surv': 1},
 {'Class': 1, 'Fare': 1, 'Sex': 0, 'Surv': 1},...]


nd       = NodeData()
skel     = GraphSkeleton()

#The structure is defined in the file titanic_skel
jsonpath ="titanic_skel.json"
skel.load(jsonpath)

# instantiate the learner
learner = PGMLearner()

# The method estimates the parameters for a discrete Bayesian network with
# a structure given by the graph skeleton, maximizing the probability of
# the given training data
result_params = learner.discrete_mle_estimateparams(skel, training_data)

result_params.Vdata['Class']  # inspect the learned network


# Check the prediction accuracy



#results = calc_accuracy(dff_train, dff_train_target, 100)
Example #18
import json
import string

from libpgm.graphskeleton import GraphSkeleton
from libpgm.tablecpdfactorization import TableCPDFactorization
from libpgm.pgmlearner import PGMLearner

text = open("../unifiedMLData2.json")
data = text.read()
printable = set(string.printable)
asciiData = filter(lambda x: x in printable, data)

#listofDicts=json.dumps(data)
listofDicts = json.loads(asciiData)

skel = GraphSkeleton()
skel.load("../skeleton.json")

learner = PGMLearner()

result = learner.discrete_mle_estimateparams(skel, listofDicts)

tcf = TableCPDFactorization(result)

#Rating 1 Given Occupation is student
myquery = dict(rating=[1])
myevidence = dict(occupation='student')
result = tcf.specificquery(query=myquery, evidence=myevidence)
print result

tcf.refresh()

#Rating 2 Given Occupation is student
myquery = dict(rating=[2])
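
The snippet is truncated in the source; the remaining lines presumably mirror the rating-1 query above:

myevidence = dict(occupation='student')
result = tcf.specificquery(query=myquery, evidence=myevidence)
print result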
Example #19
def learnDiscreteBN_with_structure(df,
                                   continuous_columns,
                                   features_column_names,
                                   label_column='cat',
                                   draw_network=False):
    features_df = df.copy()
    features_df = features_df.drop(label_column, axis=1)

    labels_df = DataFrame()
    labels_df[label_column] = df[label_column].copy()

    # discretize each continuous column into five equal-width bins
    for i in continuous_columns:
        bins = np.arange((min(features_df[i])), (max(features_df[i])),
                         ((max(features_df[i]) - min(features_df[i])) / 5.0))
        features_df[i] = pandas.np.digitize(features_df[i], bins=bins)

    data = []
    for index, row in features_df.iterrows():
        row_dict = {}  # renamed from 'dict' to avoid shadowing the builtin
        for i in features_column_names:
            row_dict[i] = row[i]
        row_dict[label_column] = labels_df[label_column][index]
        data.append(row_dict)

    print "Init done"
    learner = PGMLearner()

    graph = GraphSkeleton()

    graph.V = []
    graph.E = []

    graph.V.append(label_column)

    for vertice in features_column_names:
        graph.V.append(vertice)
        graph.E.append([vertice, label_column])

    test = learner.discrete_mle_estimateparams(graphskeleton=graph, data=data)

    print "done learning"

    edges = test.E
    vertices = test.V
    probas = test.Vdata

    # print probas

    dot_string = 'digraph BN{\n'
    dot_string += 'node[fontname="Arial"];\n'

    dataframes = {}

    print "save data"
    for vertice in vertices:
        print "New vertice: " + str(vertice)
        dataframe = DataFrame()

        pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(probas[vertice])
        dot_string += vertice.replace(
            " ", "_") + ' [label="' + vertice + '\n' + '" ]; \n'

        if len(probas[vertice]['parents']) == 0:
            dataframe['Outcome'] = None
            dataframe['Probability'] = None
            vertex_dict = {}
            for index_outcome, outcome in enumerate(probas[vertice]['vals']):
                vertex_dict[str(
                    outcome)] = probas[vertice]["cprob"][index_outcome]

            od = collections.OrderedDict(sorted(vertex_dict.items()))
            # print "Vertice: " + str(vertice)
            # print "%-7s|%-11s" % ("Outcome", "Probability")
            # print "-------------------"
            for k, v in od.iteritems():
                # print "%-7s|%-11s" % (str(k), str(round(v, 3)))
                dataframe.loc[len(dataframe)] = [k, v]
            dataframes[vertice] = dataframe
        else:
            # pp.pprint(probas[vertice])
            dataframe['Outcome'] = None

            vertexen = {}
            for index_outcome, outcome in enumerate(probas[vertice]['vals']):
                temp = []
                for parent_index, parent in enumerate(
                        probas[vertice]["parents"]):
                    # print str([str(float(index_outcome))])
                    temp = probas[vertice]["cprob"]
                    dataframe[parent] = None
                vertexen[str(outcome)] = temp

            dataframe['Probability'] = None
            od = collections.OrderedDict(sorted(vertexen.items()))

            # [str(float(i)) for i in ast.literal_eval(key)]

            # str(v[key][int(float(k))-1])

            # print "Vertice: " + str(vertice) + " with parents: " + str(probas[vertice]['parents'])
            # print "Outcome" + "\t\t" + '\t\t'.join(probas[vertice]['parents']) + "\t\tProbability"
            # print "------------" * len(probas[vertice]['parents']) *3
            # pp.pprint(od.values())

            counter = 0
            # print number_of_cols
            for outcome, cprobs in od.iteritems():
                for key in cprobs.keys():
                    array_frame = []
                    array_frame.append((outcome))
                    print_string = str(outcome) + "\t\t"
                    for parent_value, parent in enumerate(
                        [i for i in ast.literal_eval(key)]):
                        # print "parent-value:"+str(parent_value)
                        # print "parten:"+str(parent)
                        array_frame.append(int(float(parent)))
                        # print "lengte array_frame: "+str(len(array_frame))
                        print_string += parent + "\t\t"
                    array_frame.append(cprobs[key][counter])
                    # print "lengte array_frame (2): "+str(len(array_frame))
                    # print  cprobs[key][counter]
                    print_string += str(cprobs[key][counter]) + "\t"
                    # for stront in [str(round(float(i), 3)) for i in ast.literal_eval(key)]:
                    #     print_string += stront + "\t\t"
                    # print "print string: " + print_string
                    # print "array_frame:" + str(array_frame)
                    dataframe.loc[len(dataframe)] = array_frame
                counter += 1
        print "Vertice " + str(vertice) + " done"
        dataframes[vertice] = dataframe

    for edge in edges:
        dot_string += edge[0].replace(" ", "_") + ' -> ' + edge[1].replace(
            " ", "_") + ';\n'

    dot_string += '}'
    # src = Source(dot_string)
    # src.render('../data/BN', view=draw_network)
    # src.render('../data/BN', view=False)
    print "vizualisation done"
    return dataframes