def setUp(self):
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data, scoring_method=K2Score(self.rand_data))
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)
Example #2
def task4():
	global andRawData, task4_bms, task4_best_bm
	k2Scores = []
	andRawData_temp = pd.DataFrame(andRawData.values, columns=['f1','f2','f3','f4','f5','f6','f7','f8','f9'])
	#Model 1
	est = HillClimbSearch(andRawData_temp, scoring_method=K2Score(andRawData_temp))
	model_temp = est.estimate()
	estimator = BayesianEstimator(model_temp, andRawData_temp)
	for fx in ['f1','f2','f3','f4','f5','f6','f7','f8','f9']:
		cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
		model_temp.add_cpds(cpd_fx)
	task4_bms.append(model_temp)
	print("	Model 1: Model through HillClimbSearch is : "+str(model_temp.edges()))
	k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
	k2Scores_temp = k2Score.score(model_temp)
	k2Scores.append(k2Scores_temp)
	print("	Model 1: K2 Accuracy Score is "+str(k2Scores_temp))
	#Model 2: Manual Model based on HillClimbSearch
	model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f1', 'f7'), ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f6'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f2')])
	estimator = BayesianEstimator(model_temp, andRawData_temp)
	for fx in ['f1','f2','f3','f4','f5','f6','f7','f8','f9']:
		cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
		model_temp.add_cpds(cpd_fx)
	task4_bms.append(model_temp)
	print("	Model 2: Manual Model based on HillClimbSearch is : "+str(model_temp.edges()))
	k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
	k2Scores_temp = k2Score.score(model_temp)
	k2Scores.append(k2Scores_temp)
	print("	Model 2: K2 Accuracy Score is "+str(k2Scores_temp))
	#Model 3: Manual Model based on HillClimbSearch
	model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f5', 'f7'), ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f2')])
	estimator = BayesianEstimator(model_temp, andRawData_temp)
	for fx in ['f1','f2','f3','f4','f5','f6','f7','f8','f9']:
		cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
		model_temp.add_cpds(cpd_fx)
	task4_bms.append(model_temp)
	print("	Model 3: Manual Model based on HillClimbSearch is : "+str(model_temp.edges()))
	k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
	k2Scores_temp = k2Score.score(model_temp)
	k2Scores.append(k2Scores_temp)
	print("	Model 3: K2 Accuracy Score is "+str(k2Scores_temp))
	#Model 4: Manual Model based on HillClimbSearch
	model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f5', 'f7'), ('f5', 'f3'), ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f8'),])
	estimator = BayesianEstimator(model_temp, andRawData_temp)
	for fx in ['f1','f2','f3','f4','f5','f6','f7','f8','f9']:
		cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
		model_temp.add_cpds(cpd_fx)
	task4_bms.append(model_temp)
	print("	Model 4: Manual Model based on HillClimbSearch is : "+str(model_temp.edges()))
	k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
	k2Scores_temp = k2Score.score(model_temp)
	k2Scores.append(k2Scores_temp)
	print("	Model 4: K2 Accuracy Score is "+str(k2Scores_temp))
	#Model 5: Manual Model based on Intuition
	model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f4', 'f7'), ('f1', 'f2'), ('f8', 'f5'), ('f9', 'f6'), ('f9', 'f8')])
	estimator = BayesianEstimator(model_temp, andRawData_temp)
	for fx in ['f1','f2','f3','f4','f5','f6','f7','f8','f9']:
		cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
		model_temp.add_cpds(cpd_fx)
	task4_bms.append(model_temp)
	print("	Model 5: Manual Model based on HillClimbSearch is : "+str(model_temp.edges()))
	k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
	k2Scores_temp = k2Score.score(model_temp)
	k2Scores.append(k2Scores_temp)
	print("	Model 5: K2 Accuracy Score is "+str(k2Scores_temp))
	task4_best_bm = task4_bms[k2Scores.index(max(k2Scores))]
	print("	Best Bayesian Model with the highest accuracy score is thus Model "+str(1+k2Scores.index(max(k2Scores))))
Example #3
import pandas as pd
from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import BDeuScore, BicScore, K2Score
## Structure learning
data = pd.read_csv('data.csv', encoding='gb18030')
df = pd.DataFrame(data)
bic = BicScore(df)
k2 = K2Score(df)
hc = HillClimbSearch(df, scoring_method=bic)
#hc = ExhaustiveSearch(df, k2)
model = hc.estimate()
for ee in model.edges():
    print(ee)



## Parameter learning
from pgmpy.models import BayesianModel
mod = BayesianModel(model.edges())
mod.fit(df)
for cpd in mod.get_cpds():
    print(cpd)

#print(mod.local_independencies('HA'))

## Model inference
from pgmpy.inference import VariableElimination, BeliefPropagation
cancer_infer = VariableElimination(mod)
q = cancer_infer.query(variables=['HA'])
print(q)
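
# A small extension sketch (not in the original): a MAP query over the same
# variable, reusing the VariableElimination object built above.
print(cancer_infer.map_query(variables=['HA']))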
    def learn_structure(self,
                        file_path,
                        algorithm="hc",
                        significance_level=0.05):
        """
        Employs `pgmpy` package's Bayesian Network structure learning algorithms to learn
        structure from a dataset. Saves a tabular version of the result as a CSV file.

        Arguments:
            algorithm: str, optional (default = 'hc')
                Determines whether the hill-climbing or Peter-Clark are employed.
                Two possible values include: 'hc', 'pc'. Note, I found a bug in pgmpy implementation
                halfway through this project. Don't use the 'pc' method.
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/BN_structure.csv")
            significance_level: float, option (default = 0.05)
                Statistical significance cutoff for use in pruning the network when using the PC
                algorithm. Lower values produce sparser networks.

        Returns:
            None
        """
        self.structure_algorithm = algorithm

        if self.verbose:
            print(
                "Depending on the number of variables in your dataset, this might take some time..."
            )

        # Learn structure, using one of the algorithms
        np.random.seed(self.random_seed)

        if algorithm == "hc":

            # Filter out columns with zero correlation with target variable
            self.filtered_df = self._initial_filter()

            # Run HC algorithm
            self.structure_model = HillClimbSearch(
                self.filtered_df,
                scoring_method=BicScore(self.filtered_df)).estimate()

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            # Eliminate isolated subgraphs
            G = self.structure_model.to_undirected()

            connected_nodes = list(
                nx.algorithms.components.node_connected_component(
                    G, self.target_variable))

            disconnected_nodes = list(
                set(list(self.structure_model.nodes)) - set(connected_nodes))

            for node in disconnected_nodes:
                self.structure_model.remove_node(node)
                self.filtered_df.drop([node], axis=1, inplace=True)

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

        elif algorithm == "pc":
            self.filtered_df = self.df
            self.structure_model = ConstraintBasedEstimator(
                self.filtered_df).estimate(
                    significance_level=significance_level)

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)
Example #5
@author: Adele
'''
from pgmpy.estimators import HillClimbSearch
import pandas as pd
import numpy as np
from pgmpy.estimators import BdeuScore, K2Score, BicScore

# create some data with dependencies
data = pd.DataFrame(np.random.randint(0, 3, size=(2500, 8)),
                    columns=list('ABCDEFGH'))
data['A'] += data['B'] + data['C']
data['H'] = data['G'] - data['A']
#print(data)

hc = HillClimbSearch(data, scoring_method=BicScore(data))

best_model = hc.estimate()
print(hc.scoring_method)
print(best_model.edges())

hc = HillClimbSearch(data, scoring_method=BdeuScore(data))
best_model = hc.estimate()
print(hc.scoring_method)
print(best_model.edges())

hc = HillClimbSearch(data, scoring_method=K2Score(data))
best_model = hc.estimate()
print(hc.scoring_method)
print(best_model.edges())
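
# A small comparison sketch (not in the original): score the last learned
# structure under each of the three criteria to see how far they disagree.
for score_class in (BicScore, BdeuScore, K2Score):
    print(score_class.__name__, score_class(data).score(best_model))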
Example #6
class TestBaseEstimator(unittest.TestCase):
    def setUp(self):
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[[
            "Survived", "Sex", "Pclass", "Age", "Embarked"
        ]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        model2_legal_ops_ref = [(('+', ('C', 'A')), -28.15602208305154),
                                (('+', ('A', 'C')), -28.155467430966382),
                                (('+', ('C', 'B')), 7636.947544933631),
                                (('+', ('B', 'C')), 7937.805375579936),
                                (('-', ('A', 'B')), 28.155467430966382),
                                (('flip', ('A', 'B')), -0.0005546520851567038)]
        self.assertSetEqual(set([op for op, score in model2_legal_ops]),
                            set([op for op, score in model2_legal_ops_ref]))

    def test_legal_operations_titanic(self):
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"), ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])

        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        tabu_list = [('-', ("Survived", "Sex")), ('-', ("Survived", "Pclass")),
                     ('flip', ("Age", "Pclass"))]
        legal_ops_tabu = est._legal_operations(start_model,
                                               tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        legal_ops_both = est._legal_operations(start_model,
                                               tabu_list=tabu_list,
                                               max_indegree=1)
        legal_ops_both_ref = [
            (('+', ('Embarked', 'Survived')), 10.050632580087608),
            (('+', ('Survived', 'Pclass')), 41.88868046549101),
            (('+', ('Age', 'Survived')), -23.635716036430836),
            (('+', ('Pclass', 'Survived')), 41.81314459373226),
            (('+', ('Sex', 'Pclass')), 4.772261678792802),
            (('-', ('Pclass', 'Age')), 11.546515590731815),
            (('-', ('Pclass', 'Embarked')), -32.171482832532774),
            (('flip', ('Pclass', 'Embarked')), 3.3563814191281836),
            (('flip', ('Survived', 'Sex')), 0.039737027979640516)
        ]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(['A', 'B', 'C']))
        self.assertTrue(est1.edges() == [('B', 'C')]
                        or est1.edges() == [('C', 'B')])

        est2 = self.est_rand.estimate(start=BayesianModel([('A',
                                                            'B'), ('A', 'C')]))
        self.assertTrue(est2.edges() == [('B', 'C')]
                        or est2.edges() == [('C', 'B')])

    def test_estimate_titanic(self):
        self.assertSetEqual(
            set(self.est_titanic2.estimate().edges()),
            set([('Survived', 'Pclass'), ('Sex', 'Pclass'),
                 ('Sex', 'Survived')]))

    def tearDown(self):
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
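
# Hedged addition so the test file can be run directly (assumes the usual
# `import unittest` at the top of the file, elided in this snippet):
if __name__ == '__main__':
    unittest.main()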
Example #7
import time
t0 = time.time()
# Uncomment below to perform exhaustive search
#searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
#search = searcher.all_scores()
print('time:', time.time() - t0)
#print(len(search))
# Uncomment for printout:
# for score, model in search:
#     print("{0}        {1}".format(score, model.edges()))

# Hill climb search with K2Score and BicScore
separator()
print('\n\nHillClimb search based on structure scores:')
est = HillClimbSearch(data2, scoring_method=K2Score(data2))
t0 = time.time()  # start the timer before the search, so it is actually measured
best_model = est.estimate()
print("Best model nodes:", sorted(best_model.nodes()))
print("Best model edges:", best_model.edges())
print('time:', time.time() - t0)

separator()
print('\n\nHillClimb search based on structure scores:')
est = HillClimbSearch(data2, scoring_method=BicScore(data2))
t0 = time.time()
best_model = est.estimate()
print("Best model nodes:", sorted(best_model.nodes()))
print("Best model edges:", best_model.edges())
print('time:', time.time() - t0)
# End of Task 6
Example #8
def bic(train, test, scoring_function, resultlist):
    #print(set(train['Person'].values))
    #print(set(train['c0'].values))
    #print(set(train['c1'].values))

    #print(len(test))
    #print('################')
    array = ['Person']
    trainstart = time.time()
    #bic=BicScore(train)
    sc = scoring_function(train)
    hc = HillClimbSearch(train, scoring_method=sc)
    best_model = hc.estimate()
    #print("best_model.edges:" , best_model.edges())

    #edges=[('c3', 'c2'), ('c3', 'c5'), ('c3', 'c1'), ('c3', 'Person'), ('Person', 'c2'), ('Person', 'c5'), ('Person', 'c1')]
    edges = best_model.edges()
    model = BayesianModel(edges)
    model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
    trainend = time.time() - trainstart

    #for n in model.nodes():
    #    print(model.get_cpds(n))

    #print("nodes:", model.nodes())
    #print("test column:", test.columns)

    flag = 0
    # 'flag' marks the degenerate case where the target 'Person' never made it
    # into the learned structure, so prediction is impossible
    if (set(model.nodes()) - set(array) == set(model.nodes())):
        flag = 1
    elif (set(model.nodes()) - set(array) == set(test.columns)):
        teststart = time.time()
        #print(test)
        result = model.predict(test).values.ravel()
        testend = time.time() - teststart
        pred = list(result)
        #print("y_true: \n" , resultlist , "\ny_predicted:\n" , pred)
    else:
        indicator = list(set(test.columns) - set(model.nodes()))
        #print("indicator:\n" , indicator)
        #print("come in testchange***********************")
        #print("before cahnge:" , len(test))
        testchange = test.copy()
        #print(testchange)

        for f in range(len(indicator)):
            #print(f)
            del testchange[indicator[f]]
        #print(testchange)
        #print("after cahnge:" , len(testchange))

        teststart = time.time()
        result = model.predict(testchange).values.ravel()
        testend = time.time() - teststart
        pred = list(result)
        #print("y_true: \n" , resultlist , "\ny_predicted:\n" , pred)

    #model_data = XMLBIFWriter(model)
    #model_data.write_xmlbif(address+name+'_bic.bif')
    if flag == 1:
        print('##############flag:', flag)
    if (flag == 0):
        #fscore,accuracy,precision,recall=calscore(resultlist,pred)
        scores = calculate_different_metrics(y_true=resultlist,
                                             y_predicted=pred)
        #draw(model.edges(),name,"bic",folder)
        #WriteData(address+"bicpred\\",name+".xlsx",name,pred)
    else:
        pred = []  # avoid a NameError at return when no prediction was made
        fscore = accuracy = precision = recall = trainend = testend = 0
        scores = {
            'f1_score_micro': 0,
            'f1_score_macro': 0,
            'f1_score_binary': 0,
            'precision': 0,
            'recall': 0,
            'accuracy': 0
        }

    #print("set(pred)", set(pred))
    #print("set(resultlist):", set(resultlist))
    #print("fscore:" , fscore,"accuracy:" ,accuracy,"precision:" ,precision, "recall: ",recall)
    #print("scores:", scores)
    return (model, scores, trainend, testend, pred)
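
# A hedged usage sketch (toy data; real column names are placeholders, and the
# module's own imports/helpers such as pd, np, BicScore and
# calculate_different_metrics are assumed). 'Person' is the prediction target,
# so it is held out of the test frame:
toy = pd.DataFrame(np.random.randint(0, 2, size=(300, 4)),
                   columns=['c0', 'c1', 'c2', 'Person'])
toy['Person'] = toy['c0']  # inject a dependency so the search links 'Person'
train_df = toy.iloc[:200]
test_df = toy.iloc[200:].drop(columns=['Person'])
y_true = toy['Person'].iloc[200:].tolist()
model, scores, train_t, test_t, pred = bic(train_df, test_df, BicScore, y_true)
print(scores)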
Example #9
    def opt(self, file1, file2):
        f1 = open(file1, encoding="utf8")
        lines = f1.readlines()
        nodes = self.getegdes(lines[0])
        edges = self.getegdes(lines[1])
        data = pd.read_csv(file2)

        G = BayesianModel()
        G.add_nodes_from(nodes)
        for i in range(int(len(edges) / 2)):
            G.add_edge(edges[2 * i], edges[2 * i + 1])
        # nx.draw(G)
        # plt.show()
        k2 = K2Score(data).score(G)
        bic = BicScore(data).score(G)
        bdeu = BDeuScore(data).score(G)
        print(k2, ",", bic, ",", bdeu)

        est = HillClimbSearch(data, scoring_method=K2Score(data))
        model = est.estimate()
        model_edges = model.edges()
        G_ = nx.DiGraph()
        G_.add_edges_from(model_edges)
        G_copy = nx.DiGraph()
        G_copy.add_edges_from(G.edges)
        add = []
        add_mut = []
        delete = []
        delete_mut = []
        # a = list(G.edges._adjdict.key())
        for edge in model_edges:
            node1 = edge[0]
            node2 = edge[1]
            if not nx.has_path(G, node2, node1):
                if not G.has_edge(node1, node2):
                    this = (node1, node2)
                    # this = '('+node1+','+node2+')'
                    add.append(this)
                    mut = mr.mutual_info_score(data[node1], data[node2])
                    add_mut.append(mut)
        seq = list(zip(add_mut, add))
        seq = sorted(seq, key=lambda s: s[0], reverse=True)
        alpha = 0.015
        # if seq[0][0] > alpha:
        #     add = seq[0:1]

        add = seq[0:1]

        data_edges = []
        for edge in G.edges:
            node1 = edge[0]
            node2 = edge[1]
            mut = mr.mutual_info_score(data[node1], data[node2])
            delete_mut.append(mut)
            data_edges.append(edge)
            # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
            #     this = '('+node1+','+node2+')'
            #     delete.append(this)
        seq = list(zip(delete_mut, data_edges))
        seq = sorted(seq, key=lambda s: s[0])

        # if seq[0][0] < alpha:
        #     delete = seq[0:1]
        if len(edges) > 2:
            delete = seq[0:1]
            if len(add) > 0:
                if delete[0][0] > add[0][0]:
                    delete = []

        print('add')
        for i in add:
            print(str(i[1]) + "," + str(i[0]))

        print('delete')
        for j in delete:
            print(str(j[1]) + "," + str(j[0]))
            # print(j[0])

        print('cpt')
        estimator = BayesianEstimator(G, data)
        for i in G.nodes:
            cpd = estimator.estimate_cpd(i, prior_type="K2")
            nodeName = i
            values = dict(data[i].value_counts())
            valueNum = len(values)
            CPT = np.transpose(cpd.values)
            # CPT = cpd.values
            sequence = cpd.variables[1::]
            card = []
            for x in sequence:
                s = len(dict(data[x].value_counts()))
                card.append(s)
            output = nodeName + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)

        print('mutual')
        output1 = []
        for i in range(int(len(edges) / 2)):
            mut = mr.mutual_info_score(data[edges[2 * i]],
                                       data[edges[2 * i + 1]])
            output1.append(mut)
        output2 = {}
        for node1 in G.nodes():
            d = {}
            for node2 in G.nodes():
                if node1 == node2:
                    continue
                mut = mr.mutual_info_score(data[node1], data[node2])

                d[node2] = mut
            output2[node1] = d
        print(output1)
        print(output2)
# In[292]:


# removing the image ids from FeatureMatrix
featureMatrixImageIdRem = np.delete(FeatureMatrix,[0],axis=1)
print(featureMatrixImageIdRem.shape)


# # hill climb search for best model estimate

# In[306]:


col = list('abcdefghijklmno')
dataset = pd.DataFrame(featureMatrixImageIdRem, columns=col)
est = HillClimbSearch(dataset, scoring_method=K2Score(dataset))
bestModel = est.estimate()  # optionally pass max_indegree=2
print(bestModel.edges())


# In[340]:


model = BayesianModel([('a', 'e'), ('a', 'b'), ('c', 'g'), ('c', 'a'), 
                      ('c', 'l'), ('c', 'b'), ('c', 'm'), ('c', 'i'), 
                      ('d', 'c'), ('d', 'f'), ('d', 'g'), ('d', 'a'), 
                      ('e', 'j'), ('e', 'm'), ('f', 'm'), ('f', 'b'),
                      ('f', 'i'), ('f', 'j'), ('f', 'e'), ('g', 'f'), 
                      ('g', 'h'), ('i', 'a'), ('k', 'o'), ('k', 'n'),
                      ('k', 'd'), ('k', 'l'), ('k', 'f'), ('k', 'c'), 
                      ('k', 'j'), ('l', 'f'), ('l', 'm'), ('l', 'e'), 
Example #11
evidence = []
# The target list is for checking accuracy in the future
targets = []

for idx, row in testing.iterrows():
    row_dict = row.to_dict()
    evidence.append({col: row_dict[col] for col in df.columns if col != 'class'})
    targets.append(row_dict['class'])

# ******* The task is to infer the "class" node from the data. *******

# Test out hill climb search. Hill climb is one of the various structure learning
# algorithms in pgmpy.
est = HillClimbSearch(data=training)

# Remove the paths that go from "class" to another node
blacklisted = [("class", i) for i in df.columns if i != 'class']
estimated_model = est.estimate(black_list=blacklisted)

# Make a Bayesian Model with the edges of the graph
edges = estimated_model.edges()
model = BayesianModel(edges)

# Bayes networks work off conditional probabilities; an MLE estimator is built
# here, though the fit below actually uses BayesianEstimator with a BDeu prior
# (the default from the docs).
mle = MaximumLikelihoodEstimator(model, df)
model.fit(df, estimator=BayesianEstimator, prior_type="BDeu")
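
# Hedged completion sketch (not in the original): query the fitted network for
# "class" on each saved evidence row and compare against the targets. Evidence
# is filtered to the nodes the learned structure actually kept.
from pgmpy.inference import VariableElimination
infer = VariableElimination(model)
hits = 0
for ev, target in zip(evidence, targets):
    ev = {k: v for k, v in ev.items() if k in model.nodes()}
    hits += infer.map_query(variables=['class'], evidence=ev)['class'] == target
print('accuracy:', hits / len(targets))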

# Visualize the network
Example #12
def create_BN_model_using_BayesianEstimator(data):
    #data = pd.DataFrame(sensor_data)#, columns= feature_names)#['X', 'Y'])
    #print(data)
    data = pd.DataFrame(data)
    # alternative source from the original:
    # read_data_from_file_remove_date_and_time(r"E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\sensor+PCA_n=5.csv", data_type='float')
    #print(data)

    #start_time = time.time()
    # 2 hours running, without output
    #hc = HillClimbSearch(data, scoring_method=BicScore(data))
    #best_model = hc.estimate()
    #print(hc.scoring_method)
    #print(best_model.edges())
    #end_time = time.time()
    #print("execution time in seconds:")
    #print(end_time-start_time)

    #start_time = time.time()
    #hc = HillClimbSearch(data, scoring_method=BdeuScore(data))
    #best_model = hc.estimate()
    #print(hc.scoring_method)
    #print(best_model.edges())
    #end_time = time.time()
    #print("execution time in seconds:")
    #print(end_time-start_time)

    #structure learning
    print("structure learning")
    start_time = time.time()
    hc = HillClimbSearch(data, scoring_method=K2Score(data))  # alternatives: BicScore(data), BdeuScore(data)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:{}".format(end_time - start_time))

    #parameter learning
    #model = BayesianModel([('A', 'C'), ('B', 'C')])
    #model.fit(data)
    #model.get_cpds()

    ######
    #best_model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D')])

    casas7_model = BayesianModel(best_model.edges())
    print("*******************")
    #BayesianEstimator.get_parameters(self, prior_type, equivalent_sample_size, pseudo_counts)
    #####estimator = BayesianEstimator(best_model, data)
    #####print(estimator.get_parameters(prior_type='K2'))#, equivalent_sample_size=5)

    estimator = BayesianEstimator(casas7_model, data)

    #casas7_model.fit(data, estimator=BayesianEstimator, prior_type="K2")  # or MaximumLikelihoodEstimator
    #print(casas7_model.get_cpds())
    #casas7_model.predict(data)
    #print("casas7_model.node:{}".format(casas7_model.node))

    return estimator
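

# A hedged usage sketch (toy data, not from the original; the pandas/numpy
# imports elided from this snippet are assumed):
if __name__ == '__main__':
    toy = pd.DataFrame(np.random.randint(0, 2, size=(200, 3)), columns=list('XYZ'))
    toy['Z'] = toy['X']  # inject a dependency so the search can find an edge
    est = create_BN_model_using_BayesianEstimator(toy)
    for cpd in est.get_parameters(prior_type='K2'):
        print(cpd)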
Example #14
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch
from pgmpy.models import BayesianModel
from pgmpy.estimators import K2Score
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from pgmpy.inference import VariableElimination

feature_val1 = pd.read_csv('15features_f.csv')
'''Columns: 1 pen_pressure, 2 letter_spacing, 3 size, 4 dimension,
5 is_lowercase, 6 is_continuous, 7 slantness, 8 tilt, 9 entry_stroke_a,
10 staff_of_a, 11 formation_n, 12 staff_of_d, 13 exit_stroke_d,
14 word_formation, 15 constancy'''

hill = HillClimbSearch(feature_val1, scoring_method=K2Score(feature_val1))
f_model = hill.estimate()
print(f_model.edges())

feature_val2 = pd.read_csv('15features_g.csv')

hill1 = HillClimbSearch(feature_val2, scoring_method=K2Score(feature_val2))
g_model = hill1.estimate()
print(g_model.edges())
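
# A small comparison sketch (not in the original): edges the search found in
# one feature set but not in the other.
print(set(f_model.edges()) - set(g_model.edges()))
print(set(g_model.edges()) - set(f_model.edges()))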

corr_mat = feature_val1.corr()
print(corr_mat)
corr_feature = set()
for i in range(len(corr_mat.columns)):
    for j in range(i):
        if abs(corr_mat.iloc[i, j]) > 0.2:
            # completion of the truncated snippet: collect the name of the
            # highly correlated feature (the standard pattern for this loop)
            corr_feature.add(corr_mat.columns[i])
Example #15
(y_pred["Survived"] == test["Survived"]).sum() / len(test)  # 测试集精度


model_infer = VariableElimination(model)
q = model_infer.query(variables=["Survived"], evidence={"Fare": 0})
print(q["Survived"])
q = model_infer.map_query(
    variables=["Fare", "Age", "Sex", "Pclass", "Cabin"], evidence={"Survived": 1}
)
print(q)


# # Build the model via structure learning


hc = HillClimbSearch(train, scoring_method=BicScore(train))
best_model = hc.estimate()
print(best_model.edges())

best_model.fit(
    train, estimator=BayesianEstimator, prior_type="BDeu"
)  # default equivalent_sample_size=5
predict_data = test.drop(columns=["Survived"], axis=1)
y_pred = best_model.predict(predict_data)
(y_pred["Survived"] == test["Survived"]).sum() / len(test)  # 测试集精度


# # Predict on the original test set and save as CSV


kaggle_test = full.drop(
# %% markdown [markdown]
# #### Heuristic Search
# Once more nodes are involved, we need to switch to heuristic search. `HillClimbSearch` implements a greedy local search that starts from the DAG `start` (default: a disconnected DAG) and proceeds by iteratively performing single-edge manipulations that maximally increase the score. The search terminates once a local maximum is found.
#
# **Example 1:** $Z = X + Y$
# %% codecell
from pgmpy.estimators import HillClimbSearch

# Create data with dependencies:
data: DataFrame = DataFrame(np.random.randint(low=0, high=3, size=(2500, 8)),
                            columns=list('ABCDEFGH'))
data['A'] += data['B'] + data['C']
data['H'] = data['G'] - data['A']

hc = HillClimbSearch(data=data, scoring_method=BicScore(data))

bestModel = hc.estimate()
# %% codecell
bestModel.edges()
# %% codecell
drawGraph(bestModel)

# %% markdown [markdown]
# The search correctly identifies that $B$ and $C$ do not influence $H$ directly, only through $A$, and of course that $D$, $E$, $F$ are independent.
#
# To enforce a wider exploration of the search space, the search can be enhanced with a tabu list. The list keeps track of the last `n` modifications; those are then not allowed to be reversed, regardless of the score. Additionally, a `white_list` or `black_list` can be supplied to restrict the search to a particular subset of edges or to exclude certain edges. The parameter `max_indegree` restricts the maximum number of parents for each node; a short sketch of these options follows below.
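# Below is a hedged sketch (not from the original) of those restrictions; parameter names follow the pgmpy version used in these snippets, where `black_list` and `max_indegree` are passed to `estimate()`.
# %% codecell
restrictedModel = hc.estimate(max_indegree=2, black_list=[('H', 'A')])
restrictedModel.edges()
# %% markdown [markdown]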
#
# **Example 2:** Fruit data
# %% codecell
hc = HillClimbSearch(fruitData, scoring_method=BicScore(fruitData))