def task4():
    global andRawData, task4_best_bm
    features = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
    andRawData_temp = pd.DataFrame(andRawData.values, columns=features)
    k2Scores = []

    def fit_and_score(model_temp, num, label):
        # Fit CPDs with a K2 prior, then score the model against 1000
        # samples forward-sampled from the fitted model itself.
        estimator = BayesianEstimator(model_temp, andRawData_temp)
        for fx in features:
            model_temp.add_cpds(estimator.estimate_cpd(fx, prior_type="K2"))
        task4_bms.append(model_temp)
        print(" Model " + str(num) + ": " + label + " is : " + str(model_temp.edges()))
        k2Score = K2Score(BayesianModelSampling(model_temp).forward_sample(size=1000))
        k2Scores_temp = k2Score.score(model_temp)
        k2Scores.append(k2Scores_temp)
        print(" Model " + str(num) + ": K2 Accuracy Score is " + str(k2Scores_temp))

    # Model 1: learned automatically by hill climbing with a K2 score
    est = HillClimbSearch(andRawData_temp, scoring_method=K2Score(andRawData_temp))
    fit_and_score(est.estimate(), 1, "Model through HillClimbSearch")

    # Models 2-4: manual variations based on the HillClimbSearch result
    fit_and_score(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f1', 'f7'),
                                 ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f6'), ('f9', 'f1'),
                                 ('f9', 'f6'), ('f9', 'f2')]),
                  2, "Manual Model based on HillClimbSearch")
    fit_and_score(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f5', 'f7'),
                                 ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f2'), ('f9', 'f1'),
                                 ('f9', 'f6'), ('f9', 'f2')]),
                  3, "Manual Model based on HillClimbSearch")
    fit_and_score(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f5', 'f7'), ('f5', 'f3'),
                                 ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f8')]),
                  4, "Manual Model based on HillClimbSearch")

    # Model 5: manual model based on intuition
    fit_and_score(BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f4', 'f7'), ('f1', 'f2'),
                                 ('f8', 'f5'), ('f9', 'f6'), ('f9', 'f8')]),
                  5, "Manual Model based on Intuition")

    best = k2Scores.index(max(k2Scores))
    task4_best_bm = task4_bms[best]
    print(" Best Bayesian Model with the highest accuracy score is thus Model " + str(best + 1))
import pandas as pd
from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import BDeuScore, BicScore, K2Score

## Structure learning
data = pd.read_csv('data.csv', encoding='gb18030')
df = pd.DataFrame(data)
bic = BicScore(df)
k2 = K2Score(df)
hc = HillClimbSearch(df, scoring_method=bic)
# hc = ExhaustiveSearch(df, k2)
model = hc.estimate()
for ee in model.edges():
    print(ee)

## Parameter learning
from pgmpy.models import BayesianModel
mod = BayesianModel(model.edges())
mod.fit(df)
for cpd in mod.get_cpds():
    print(cpd)
# print(mod.local_independencies('HA'))

## Model inference
from pgmpy.inference import VariableElimination, BeliefPropagation
cancer_infer = VariableElimination(mod)
q = cancer_infer.query(variables=['HA'])
print(q)
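# A short follow-up sketch: VariableElimination also supports conditioning on
# evidence. The evidence column 'HB' and state 0 below are hypothetical
# placeholders (not taken from data.csv) -- substitute a real column and state.
q_cond = cancer_infer.query(variables=['HA'], evidence={'HB': 0})
print(q_cond)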
def learn_structure(self, file_path, algorithm="hc", significance_level=0.05):
    """
    Employs the `pgmpy` package's Bayesian network structure learning
    algorithms to learn a structure from a dataset. Saves a tabular
    version of the result as a CSV file.

    Arguments:
        file_path: str,
            The absolute path to save the file to
            (e.g. "~/Desktop/BN_structure.csv")
        algorithm: str, optional (default = 'hc')
            Determines whether hill climbing ('hc') or Peter-Clark ('pc')
            is employed. Note: a bug was found in the pgmpy implementation
            halfway through this project, so avoid the 'pc' method.
        significance_level: float, optional (default = 0.05)
            Statistical significance cutoff for pruning the network when
            using the PC algorithm. Lower values produce sparser networks.

    Returns:
        None
    """
    self.structure_algorithm = algorithm

    if self.verbose:
        print("Depending on the number of variables in your dataset, "
              "this might take some time...")

    # Learn structure, using one of the algorithms
    np.random.seed(self.random_seed)

    if algorithm == "hc":
        # Filter out columns with zero correlation with the target variable
        self.filtered_df = self._initial_filter()

        # Run the hill-climbing algorithm
        self.structure_model = HillClimbSearch(
            self.filtered_df,
            scoring_method=BicScore(self.filtered_df)).estimate()

        if self.verbose:
            print(f"Structure learned! Saving structure to the following CSV: {file_path}")

        # Eliminate isolated subgraphs
        G = self.structure_model.to_undirected()
        connected_nodes = list(
            nx.algorithms.components.node_connected_component(
                G, self.target_variable))
        disconnected_nodes = list(
            set(list(self.structure_model.nodes)) - set(connected_nodes))

        for node in disconnected_nodes:
            self.structure_model.remove_node(node)
            self.filtered_df.drop([node], axis=1, inplace=True)

        pd.DataFrame(
            list(self.structure_model.edges),
            columns=["from_variable", "to_variable"],
        ).to_csv(file_path, index=False)

    elif algorithm == "pc":
        self.filtered_df = self.df
        self.structure_model = ConstraintBasedEstimator(
            self.filtered_df).estimate(significance_level=significance_level)

        if self.verbose:
            print(f"Structure learned! Saving structure to the following CSV: {file_path}")

        pd.DataFrame(
            list(self.structure_model.edges),
            columns=["from_variable", "to_variable"],
        ).to_csv(file_path, index=False)
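# A minimal usage sketch (hypothetical: `learner` stands for an instance of the
# surrounding class, constructed elsewhere with a DataFrame, a target variable,
# a random seed, and a verbosity flag):
learner.learn_structure(
    file_path="~/Desktop/BN_structure.csv",  # where the edge table is written
    algorithm="hc",                          # hill climbing; 'pc' is discouraged above
)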
@author: Adele
'''
from pgmpy.estimators import HillClimbSearch
import pandas as pd
import numpy as np
from pgmpy.estimators import BdeuScore, K2Score, BicScore

# Create some data with dependencies
data = pd.DataFrame(np.random.randint(0, 3, size=(2500, 8)), columns=list('ABCDEFGH'))
data['A'] += data['B'] + data['C']
data['H'] = data['G'] - data['A']
# print(data)

hc = HillClimbSearch(data, scoring_method=BicScore(data))
best_model = hc.estimate()
print(hc.scoring_method)
print(best_model.edges())

hc = HillClimbSearch(data, scoring_method=BdeuScore(data))
best_model = hc.estimate()
print(hc.scoring_method)
print(best_model.edges())

hc = HillClimbSearch(data, scoring_method=K2Score(data))
best_model = hc.estimate()
print(hc.scoring_method)
print(best_model.edges())
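# Added sketch (not in the original script): the three runs above can yield
# different structures. One illustrative way to compare them is to score the
# final `best_model` under each criterion on the same data (log scores; higher
# is better). Reuses the variables defined above.
for scorer in (BicScore(data), BdeuScore(data), K2Score(data)):
    print(type(scorer).__name__, scorer.score(best_model))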
class TestBaseEstimator(unittest.TestCase):
    def setUp(self):
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = HillClimbSearch(self.rand_data,
                                        scoring_method=K2Score(self.rand_data))
        self.model1 = BayesianModel()
        self.model1.add_nodes_from(['A', 'B', 'C'])
        self.model2 = self.model1.copy()
        self.model2.add_edge('A', 'B')

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data1 = self.titanic_data[
            ["Survived", "Sex", "Pclass", "Age", "Embarked"]]
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic1 = HillClimbSearch(self.titanic_data1)
        self.est_titanic2 = HillClimbSearch(self.titanic_data2)

    def test_legal_operations(self):
        model2_legal_ops = list(self.est_rand._legal_operations(self.model2))
        model2_legal_ops_ref = [(('+', ('C', 'A')), -28.15602208305154),
                                (('+', ('A', 'C')), -28.155467430966382),
                                (('+', ('C', 'B')), 7636.947544933631),
                                (('+', ('B', 'C')), 7937.805375579936),
                                (('-', ('A', 'B')), 28.155467430966382),
                                (('flip', ('A', 'B')), -0.0005546520851567038)]
        self.assertSetEqual(set([op for op, score in model2_legal_ops]),
                            set([op for op, score in model2_legal_ops_ref]))

    def test_legal_operations_titanic(self):
        est = self.est_titanic1
        start_model = BayesianModel([("Survived", "Sex"),
                                     ("Pclass", "Age"),
                                     ("Pclass", "Embarked")])

        legal_ops = est._legal_operations(start_model)
        self.assertEqual(len(list(legal_ops)), 20)

        tabu_list = [('-', ("Survived", "Sex")),
                     ('-', ("Survived", "Pclass")),
                     ('flip', ("Age", "Pclass"))]
        legal_ops_tabu = est._legal_operations(start_model, tabu_list=tabu_list)
        self.assertEqual(len(list(legal_ops_tabu)), 18)

        legal_ops_indegree = est._legal_operations(start_model, max_indegree=1)
        self.assertEqual(len(list(legal_ops_indegree)), 11)

        legal_ops_both = est._legal_operations(start_model,
                                               tabu_list=tabu_list,
                                               max_indegree=1)
        legal_ops_both_ref = [
            (('+', ('Embarked', 'Survived')), 10.050632580087608),
            (('+', ('Survived', 'Pclass')), 41.88868046549101),
            (('+', ('Age', 'Survived')), -23.635716036430836),
            (('+', ('Pclass', 'Survived')), 41.81314459373226),
            (('+', ('Sex', 'Pclass')), 4.772261678792802),
            (('-', ('Pclass', 'Age')), 11.546515590731815),
            (('-', ('Pclass', 'Embarked')), -32.171482832532774),
            (('flip', ('Pclass', 'Embarked')), 3.3563814191281836),
            (('flip', ('Survived', 'Sex')), 0.039737027979640516)
        ]
        self.assertSetEqual(set(legal_ops_both), set(legal_ops_both_ref))

    def test_estimate_rand(self):
        est1 = self.est_rand.estimate()
        self.assertSetEqual(set(est1.nodes()), set(['A', 'B', 'C']))
        self.assertTrue(est1.edges() == [('B', 'C')] or
                        est1.edges() == [('C', 'B')])

        est2 = self.est_rand.estimate(start=BayesianModel([('A', 'B'), ('A', 'C')]))
        self.assertTrue(est2.edges() == [('B', 'C')] or
                        est2.edges() == [('C', 'B')])

    def test_estimate_titanic(self):
        self.assertSetEqual(set(self.est_titanic2.estimate().edges()),
                            set([('Survived', 'Pclass'),
                                 ('Sex', 'Pclass'),
                                 ('Sex', 'Survived')]))

    def tearDown(self):
        del self.rand_data
        del self.est_rand
        del self.model1
        del self.titanic_data
        del self.titanic_data1
        del self.titanic_data2
        del self.est_titanic1
        del self.est_titanic2
import time

# Uncomment below to perform exhaustive search
t0 = time.time()
# searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
# search = searcher.all_scores()
print('time:', time.time() - t0)
# print(len(search))
# Uncomment for printout:
# for score, model in search:
#     print("{0} {1}".format(score, model.edges()))

# Hill-climb search with K2Score and BicScore
separator()
print('\n\nHillClimb search based on structure scores:')
t0 = time.time()  # start the timer before estimate() so it covers the search
est = HillClimbSearch(data2, scoring_method=K2Score(data2))
best_model = est.estimate()
print("Best model nodes:", sorted(best_model.nodes()))
print("Best model edges:", best_model.edges())
print('time:', time.time() - t0)

separator()
print('\n\nHillClimb search based on structure scores:')
t0 = time.time()
est = HillClimbSearch(data2, scoring_method=BicScore(data2))
best_model = est.estimate()
print("Best model nodes:", sorted(best_model.nodes()))
print("Best model edges:", best_model.edges())
print('time:', time.time() - t0)

# End of Task 6
def bic(train, test, scoring_function, resultlist):
    # print(set(train['Person'].values))
    # print(set(train['c0'].values))
    # print(set(train['c1'].values))
    # print(len(test))
    array = ['Person']

    trainstart = time.time()
    sc = scoring_function(train)
    hc = HillClimbSearch(train, scoring_method=sc)
    best_model = hc.estimate()
    # print("best_model.edges:", best_model.edges())
    # edges = [('c3', 'c2'), ('c3', 'c5'), ('c3', 'c1'), ('c3', 'Person'),
    #          ('Person', 'c2'), ('Person', 'c5'), ('Person', 'c1')]
    edges = best_model.edges()
    model = BayesianModel(edges)
    model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
    trainend = time.time() - trainstart
    # for n in model.nodes():
    #     print(model.get_cpds(n))
    # print("nodes:", model.nodes())
    # print("test columns:", test.columns)

    flag = 0
    pred = []    # defaults, so the return below is safe when flag == 1
    testend = 0
    if set(model.nodes()) - set(array) == set(model.nodes()):
        # 'Person' did not make it into the learned structure; nothing to predict
        flag = 1
    elif set(model.nodes()) - set(array) == set(test.columns):
        teststart = time.time()
        result = model.predict(test).values.ravel()
        testend = time.time() - teststart
        pred = list(result)
        # print("y_true:\n", resultlist, "\ny_predicted:\n", pred)
    else:
        # Drop test columns that are absent from the learned model
        indicator = list(set(test.columns) - set(model.nodes()))
        # print("indicator:\n", indicator)
        # print("before change:", len(test))
        testchange = test.copy()
        for f in range(len(indicator)):
            del testchange[indicator[f]]
        # print("after change:", len(testchange))
        teststart = time.time()
        result = model.predict(testchange).values.ravel()
        testend = time.time() - teststart
        pred = list(result)
        # print("y_true:\n", resultlist, "\ny_predicted:\n", pred)

    # model_data = XMLBIFWriter(model)
    # model_data.write_xmlbif(address + name + '_bic.bif')

    if flag == 1:
        print('##############flag:', flag)
        trainend = testend = 0
        scores = {'f1_score_micro': 0, 'f1_score_macro': 0, 'f1_score_binary': 0,
                  'precision': 0, 'recall': 0, 'accuracy': 0}
    else:
        scores = calculate_different_metrics(y_true=resultlist, y_predicted=pred)
        # draw(model.edges(), name, "bic", folder)
        # WriteData(address + "bicpred\\", name + ".xlsx", name, pred)
    # print("set(pred)", set(pred))
    # print("set(resultlist):", set(resultlist))
    # print("scores:", scores)
    return (model, scores, trainend, testend, pred)
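# A minimal usage sketch (hypothetical: `df` is a discrete DataFrame containing
# the 'Person' target column; calculate_different_metrics is assumed to be
# defined elsewhere in this module):
train, test = df.iloc[:800], df.iloc[800:]
y_true = list(test['Person'].values)
model, scores, train_time, test_time, pred = bic(
    train, test.drop(columns=['Person']), BicScore, y_true)
print(scores, train_time, test_time)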
def opt(self, file1, file2):
    # file1: first line = node list, second line = flattened edge list
    # (both parsed by self.getegdes); file2: CSV of discrete observations
    f1 = open(file1, encoding="utf8")
    lines = f1.readlines()
    nodes = self.getegdes(lines[0])
    edges = self.getegdes(lines[1])
    data = pd.read_csv(file2)

    G = BayesianModel()
    G.add_nodes_from(nodes)
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])
    # nx.draw(G)
    # plt.show()

    # Score the given structure under three criteria
    k2 = K2Score(data).score(G)
    bic = BicScore(data).score(G)
    bdeu = BDeuScore(data).score(G)
    print(k2, ",", bic, ",", bdeu)

    est = HillClimbSearch(data, scoring_method=K2Score(data))
    model = est.estimate()
    model_edges = model.edges()
    G_ = nx.DiGraph()
    G_.add_edges_from(model_edges)
    G_copy = nx.DiGraph()
    G_copy.add_edges_from(G.edges)

    add = []
    add_mut = []
    delete = []
    delete_mut = []

    # Candidate edges to add: edges found by hill climbing that are absent
    # from G and would not create a cycle
    for edge in model_edges:
        node1, node2 = edge
        if not nx.has_path(G, node2, node1):
            if not G.has_edge(node1, node2):
                add.append((node1, node2))
                mut = mr.mutual_info_score(data[node1], data[node2])
                add_mut.append(mut)
    seq = sorted(zip(add_mut, add), key=lambda s: s[0], reverse=True)
    alpha = 0.015
    # if seq[0][0] > alpha:
    #     add = seq[0:1]
    add = seq[0:1]

    # Candidate edges to delete: existing edges, ranked by mutual information
    data_edges = []
    for edge in G.edges:
        node1, node2 = edge
        mut = mr.mutual_info_score(data[node1], data[node2])
        delete_mut.append(mut)
        data_edges.append(edge)
        # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
        #     delete.append((node1, node2))
    seq = sorted(zip(delete_mut, data_edges), key=lambda s: s[0])
    # if seq[0][0] < alpha:
    #     delete = seq[0:1]
    if len(edges) > 2:
        delete = seq[0:1]

    if len(add) > 0 and len(delete) > 0:  # guard: delete may be empty here
        if delete[0][0] > add[0][0]:
            delete = []

    print('add')
    for i in add:
        print(str(i[1]) + "," + str(i[0]))
    print('delete')
    for j in delete:
        print(str(j[1]) + "," + str(j[0]))

    print('cpt')
    estimator = BayesianEstimator(G, data)
    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        sequence = cpd.variables[1::]
        card = []
        for x in sequence:
            card.append(len(dict(data[x].value_counts())))
        output = (nodeName + '\t' + str(valueNum) + '\t' + str(CPT.tolist())
                  + '\t' + str(sequence) + '\t' + str(card))
        print(output)

    print('mutual')
    output1 = []
    for i in range(int(len(edges) / 2)):
        mut = mr.mutual_info_score(data[edges[2 * i]], data[edges[2 * i + 1]])
        output1.append(mut)
    output2 = {}
    for node1 in G.nodes():
        d = {}
        for node2 in G.nodes():
            if node1 == node2:
                continue
            d[node2] = mr.mutual_info_score(data[node1], data[node2])
        output2[node1] = d
    print(output1)
    print(output2)
# In[292]:

# Remove the image ids from FeatureMatrix
featureMatrixImageIdRem = np.delete(FeatureMatrix, [0], axis=1)
print(featureMatrixImageIdRem.shape)

# # Hill climb search for best model estimate

# In[306]:

col = list('abcdefghijklmno')
dataset = pd.DataFrame(featureMatrixImageIdRem, columns=col)
est = HillClimbSearch(dataset, scoring_method=K2Score(dataset))
bestModel = est.estimate()  # max_indegree=2
print(bestModel.edges())

# In[340]:

model = BayesianModel([('a', 'e'), ('a', 'b'), ('c', 'g'), ('c', 'a'), ('c', 'l'),
                       ('c', 'b'), ('c', 'm'), ('c', 'i'), ('d', 'c'), ('d', 'f'),
                       ('d', 'g'), ('d', 'a'), ('e', 'j'), ('e', 'm'), ('f', 'm'),
                       ('f', 'b'), ('f', 'i'), ('f', 'j'), ('f', 'e'), ('g', 'f'),
                       ('g', 'h'), ('i', 'a'), ('k', 'o'), ('k', 'n'), ('k', 'd'),
                       ('k', 'l'), ('k', 'f'), ('k', 'c'), ('k', 'j'), ('l', 'f'),
                       ('l', 'm'), ('l', 'e'),
evidence = []
# The target list is for checking accuracy in the future
targets = []
for row in testing.iterrows():
    blank_row = {i: None for i in df.columns if i != 'class'}
    for cls in blank_row:
        blank_row[cls] = row[1].to_dict()[cls]
    targets.append(row[1].to_dict()['class'])
    evidence.append(blank_row)

# ******* The task is to infer the "class" node from the data. *******

# Test out hill climb search. Hill climb is one of the various structure
# learning algorithms in pgmpy
est = HillClimbSearch(data=training)

# Remove the paths that go from "class" to another node
blacklisted = [("class", i) for i in df.columns if i != 'class']
estimated_model = est.estimate(black_list=blacklisted)

# Make a Bayesian model with the edges of the graph
edges = estimated_model.edges()
model = BayesianModel(edges)

# Bayesian networks work off conditional probabilities ... estimate with MLE
mle = MaximumLikelihoodEstimator(model, df)

# Prior type BDeu? Used default from docs
model.fit(df, estimator=BayesianEstimator, prior_type="BDeu")

# Visualize the network (see the sketch below)
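# A minimal sketch of the visualization step, assuming networkx and matplotlib
# are available (neither is imported by the original snippet):
import networkx as nx
import matplotlib.pyplot as plt

nx_graph = nx.DiGraph(model.edges())
nx.draw(nx_graph, with_labels=True, node_color="lightblue", arrows=True)
plt.show()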
def create_BN_model_using_BayesianEstimator(data):
    # data = pd.DataFrame(sensor_data)  # , columns=feature_names)  # ['X', 'Y']
    # print(data)
    data = pd.DataFrame(data)
    # Previously also loaded via:
    # read_data_from_file_remove_date_and_time(
    #     r"E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\sensor+PCA_n=5.csv",
    #     data_type='float')

    # Earlier timed experiments with other scores (2 hours running, no output):
    # hc = HillClimbSearch(data, scoring_method=BicScore(data))
    # hc = HillClimbSearch(data, scoring_method=BdeuScore(data))
    # best_model = hc.estimate()
    # print(hc.scoring_method)
    # print(best_model.edges())

    # Structure learning
    print("structure learning")
    start_time = time.time()
    hc = HillClimbSearch(data, scoring_method=K2Score(data))  # or BicScore(data), BdeuScore(data)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:{}".format(end_time - start_time))

    # Parameter learning
    # model = BayesianModel([('A', 'C'), ('B', 'C')])
    # model.fit(data)
    # model.get_cpds()
    # best_model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D')])
    casas7_model = BayesianModel(best_model.edges())
    print("*******************")
    # estimator = BayesianEstimator(best_model, data)
    # print(estimator.get_parameters(prior_type='K2'))  # , equivalent_sample_size=5
    estimator = BayesianEstimator(casas7_model, data)
    # casas7_model.fit(data, estimator=BayesianEstimator, prior_type="K2")  # MaximumLikelihoodEstimator
    # print(casas7_model.get_cpds())
    # casas7_model.predict(data)
    return estimator
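# Illustrative follow-up (assumption: `sensor_data` is a discrete DataFrame).
# The returned BayesianEstimator can produce CPDs on demand, as done elsewhere
# in this codebase:
estimator = create_BN_model_using_BayesianEstimator(sensor_data)
for node in estimator.model.nodes():
    print(estimator.estimate_cpd(node, prior_type="K2"))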
import numpy as np
import pandas as pd
from pgmpy.estimators import HillClimbSearch
from pgmpy.models import BayesianModel
from pgmpy.estimators import K2Score
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling
from pgmpy.inference import VariableElimination

feature_val1 = pd.read_csv('15features_f.csv')
# The 15 features:
#  1 pen_pressure      2 letter_spacing    3 size
#  4 dimension         5 is_lowercase      6 is_continuous
#  7 slantness         8 tilt              9 entry_stroke_a
# 10 staff_of_a       11 formation_n      12 staff_of_d
# 13 exit_stroke_d    14 word_formation   15 constancy

hill = HillClimbSearch(feature_val1, scoring_method=K2Score(feature_val1))
f_model = hill.estimate()
print(f_model.edges())

feature_val2 = pd.read_csv('15features_g.csv')
hill1 = HillClimbSearch(feature_val2, scoring_method=K2Score(feature_val2))
g_model = hill1.estimate()
print(g_model.edges())

# Collect features whose absolute pairwise correlation exceeds 0.2
corr_mat = feature_val1.corr()
print(corr_mat)
corr_feature = set()
for i in range(len(corr_mat.columns)):
    for j in range(i):
        if abs(corr_mat.iloc[i, j]) > 0.2:
            # Loop body assumed; the source is truncated at this point
            corr_feature.add(corr_mat.columns[i])
(y_pred["Survived"] == test["Survived"]).sum() / len(test) # 测试集精度 model_infer = VariableElimination(model) q = model_infer.query(variables=["Survived"], evidence={"Fare": 0}) print(q["Survived"]) q = model_infer.map_query( variables=["Fare", "Age", "Sex", "Pclass", "Cabin"], evidence={"Survived": 1} ) print(q) # # 用结构学习建立模型 hc = HillClimbSearch(train, scoring_method=BicScore(train)) best_model = hc.estimate() print(best_model.edges()) best_model.fit( train, estimator=BayesianEstimator, prior_type="BDeu" ) # default equivalent_sample_size=5 predict_data = test.drop(columns=["Survived"], axis=1) y_pred = best_model.predict(predict_data) (y_pred["Survived"] == test["Survived"]).sum() / len(test) # 测试集精度 # # 预测原test集并保存csv kaggle_test = full.drop(
# %% markdown [markdown]
# #### Heuristic Search
# Once more nodes are involved we need to switch to heuristic search. The `HillClimbSearch` implements a greedy local search that starts from the DAG `start` (default: disconnected DAG) and proceeds by iteratively performing single-edge manipulations that maximally increase the score. The search terminates once a local maximum is found.
#
# **Example 1:** $Z = X + Y$
# %% codecell
from pgmpy.estimators import HillClimbSearch

# Create data with dependencies:
data: DataFrame = DataFrame(np.random.randint(low=0, high=3, size=(2500, 8)),
                            columns=list('ABCDEFGH'))
data['A'] += data['B'] + data['C']
data['H'] = data['G'] - data['A']

hc = HillClimbSearch(data=data, scoring_method=BicScore(data))
bestModel = hc.estimate()
# %% codecell
bestModel.edges()
# %% codecell
drawGraph(bestModel)
# %% markdown [markdown]
# The search correctly identifies that $B$ and $C$ do not influence $H$ directly, only through $A$, and of course that $D$, $E$, $F$ are independent.
#
# To enforce a wider exploration of the search space, the search can be enhanced with a tabu list. The list keeps track of the last $n$ modifications; those are then not allowed to be reversed, regardless of the score. Additionally, a `white_list` or `black_list` can be supplied to restrict the search to a particular subset of edges or to exclude certain edges. The parameter `max_indegree` allows restricting the maximum number of parents for each node (a sketch of these restrictions follows the next example).
#
# **Example 2:** Fruit data
# %% codecell
hc = HillClimbSearch(fruitData, scoring_method=BicScore(fruitData))
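# %% markdown [markdown]
# A minimal sketch of the restrictions mentioned above, reusing `data` from Example 1. Keyword availability (`tabu_length`, `black_list`, `max_indegree`) varies across pgmpy versions, so treat this as illustrative rather than canonical:
# %% codecell
est = HillClimbSearch(data, scoring_method=BicScore(data))
constrainedModel = est.estimate(tabu_length=10,           # do not reverse the last 10 modifications
                                black_list=[('H', 'A')],  # never add the edge H -> A
                                max_indegree=3)           # at most 3 parents per node
constrainedModel.edges()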