def test_score_titanic(self):
    """Check the BIC of a reference Titanic model and a model ranking."""
    bic = BicScore(self.titanic_data2)

    # Reference structure: Sex and Pclass are both parents of Survived.
    reference = BayesianModel([("Sex", "Survived"), ("Pclass", "Survived")])
    self.assertAlmostEqual(bic.score(reference), -1896.7250012840179)

    # Alternative structure: only Pclass -> Sex, Survived has no edges.
    alternative = BayesianModel([("Pclass", "Sex")])
    alternative.add_nodes_from(["Sex", "Survived", "Pclass"])
    self.assertLess(bic.score(alternative), bic.score(reference))
def pgm_generate(self, target, data, pgm_stats, subnodes, child=None):
    """Learn and fit a Bayesian network over ``subnodes``.

    Args:
        target: Identifier of the target node; normalised to ``str(int(target))``.
        data: DataFrame with one column per node. NOTE: its column labels
            are converted to ``str`` in place.
        pgm_stats: Unused by this method; kept for interface compatibility.
        subnodes: Node identifiers to include; each is normalised to
            ``str(int(node))``.
        child: When ``None`` the structure is learned on the raw non-target
            columns and every node returned by ``self.search_MK`` is linked
            to the target. Otherwise the structure is learned directly on
            the generalised data, target included.

    Returns:
        A ``BayesianModel`` copying the learned structure and fitted on the
        generalised data.
    """
    subnodes = [str(int(node)) for node in subnodes]
    target = str(int(target))
    subnodes_no_target = [node for node in subnodes if node != target]
    data.columns = data.columns.astype(str)
    # Called unconditionally, as before, although only the `child is None`
    # branch consumes the result.
    MK_blanket = self.search_MK(data, target, subnodes_no_target.copy())

    def generalized_frame():
        # Copy of the sub-frame with every column passed through the
        # matching generalisation hook.
        frame = data[subnodes].copy()
        frame[target] = data[target].apply(self.generalize_target)
        for node in subnodes_no_target:
            frame[node] = data[node].apply(self.generalize_others)
        return frame

    if child is None:
        est = HillClimbSearch(data[subnodes_no_target],
                              scoring_method=BicScore(data))
        learned = est.estimate()
        for node in MK_blanket:
            if node != target:
                learned.add_edge(node, target)
        data_ex = generalized_frame()
    else:
        data_ex = generalized_frame()
        est = HillClimbSearch(data_ex, scoring_method=BicScore(data_ex))
        learned = est.estimate()

    # Create the pgm as a fresh BayesianModel mirroring the learned graph.
    pgm_explanation = BayesianModel()
    for node in learned.nodes():
        pgm_explanation.add_node(node)
    for edge in learned.edges():
        pgm_explanation.add_edge(edge[0], edge[1])

    # Fit the pgm
    pgm_explanation.fit(data_ex)
    return pgm_explanation
def model_change(dag, data):
    """Return the BIC score of the network described by adjacency ``dag``.

    Every non-zero ``dag[i][j]`` becomes an edge ``str(i) -> str(j)``. The
    data is wrapped in a DataFrame (and printed) before scoring.
    """
    data = pd.DataFrame(data)
    print(data)
    edge_list = [
        (str(row), str(col))
        for row in range(len(dag))
        for col in range(np.size(data, 1))
        if dag[row][col] != 0
    ]
    network = BayesianModel(edge_list)
    scorer = BicScore(data)
    return scorer.score(network)
def train_joke_type_selection():
    """Learn a structure over joke preferences with an exhaustive BIC search.

    The per-type counters of every ``Jokes`` row are summed and expanded
    into one ``joke_preference`` observation per counted vote.

    Returns:
        The model returned by ``ExhaustiveSearch.estimate()``.
    """
    # one table
    jokes = Jokes.query.all()

    # (counter attribute on a Jokes row, label stored in the training data)
    counters = (
        ("nerd_joke", "nerd joke"),
        ("weird_joke", "weird joke"),
        ("cat_meme", "cat meme"),
        ("dog_meme", "dog meme"),
        ("dad_joke", "dad joke"),
    )
    joke_preferences = []
    for attribute, label in counters:
        votes = sum(getattr(joke, attribute) for joke in jokes)
        joke_preferences.extend(label for _ in range(votes))

    # Build the frame in one step: row-by-row DataFrame.append was quadratic
    # and no longer exists in pandas >= 2.0.
    data = pd.DataFrame({"joke_preference": joke_preferences})

    # The leftover `code.interact(...)` debugging console was removed; it
    # blocked every call until someone closed the interactive session.
    bic = BicScore(data)
    es = ExhaustiveSearch(data, scoring_method=bic)
    best_model = es.estimate()
    return best_model
def pgm_generate(self, target, data, stats, subnodes):
    """Build and fit an explanation network around ``target``.

    Nodes of ``subnodes`` whose entry in ``stats`` is below 0.05 are linked
    to the target on top of a hill-climbing structure learned over the
    remaining sub-nodes.
    """
    p_values = pd.Series(stats, name='p-values')
    significant = p_values[p_values < 0.05]
    MK_blanket = [name for name in significant.index if name in subnodes]
    subnodes_no_target = [name for name in subnodes if name != target]

    search = HillClimbSearch(data[subnodes_no_target],
                             scoring_method=BicScore(data))
    learned = search.estimate()
    for name in MK_blanket:
        if name == target:
            continue
        learned.add_edge(name, target)

    # Create the pgm
    pgm_explanation = BayesianModel()
    for name in learned.nodes():
        pgm_explanation.add_node(name)
    for source, sink in learned.edges():
        pgm_explanation.add_edge(source, sink)

    # Fit the pgm
    data_ex = data[subnodes].copy()
    data_ex[target] = data[target].apply(self.generalize_target)
    for name in subnodes_no_target:
        data_ex[name] = data[name].apply(self.generalize_others)
    pgm_explanation.fit(data_ex)
    return pgm_explanation
def mutacao(x, fitness_aux, prob, max_v, min_v):
    """Mutate the individuals in ``x`` in place; nothing is returned.

    ``x`` is indexed as ``x[individual][gene]``. When the expected number of
    mutations (``len(x) * len(x[0]) * prob``) is below one, every gene is
    mutated independently with probability ``prob``. Otherwise that many
    mutations (rounded) are applied to randomly chosen genes, and each is
    kept only if ``vetor_Rede`` returns a truthy network, in which case
    ``fitness_aux`` is refreshed with the absolute BIC score.

    ``max_v`` is not used. ``min_valor``, ``max_valor``, ``nao_dag``,
    ``nodes``, ``data`` and ``vetor_Rede`` are not parameters; they are
    resolved from an enclosing scope that is not visible here.
    """
    if len(x) * len(x[0]) * prob < 1:
        print("entando")
        for i in range(len(x)):
            for j in range(len(x[i])):
                r = random.random()
                if r <= prob:
                    valor_mut = x[i][j]
                    # Redraw until the gene actually changes.
                    while (valor_mut == x[i][j]):
                        valor_mut = min_v + random.randint(
                            min_valor, max_valor)
                    x[i][j] = valor_mut
    else:
        numero_mutacao = round(len(x) * len(x[0]) * prob)
        # Only accepted mutations decrement the counter, so rejected
        # candidates are retried.
        while (numero_mutacao > 0):
            ind_escolhido = round(random.random() * (len(x) - 1))
            val = round(random.random() * (len(x[ind_escolhido]) - 1))
            valor_mut = deepcopy(x[ind_escolhido][val])
            valor_mut_antigo = deepcopy(x[ind_escolhido][val])
            while (valor_mut == x[ind_escolhido][val]):
                valor_mut = min_v + random.randint(min_valor, max_valor)
            x[ind_escolhido][val] = valor_mut
            # NOTE(review): this tests a single gene value against nao_dag,
            # while the append below stores whole individuals - confirm
            # whether `x[ind_escolhido] not in nao_dag` was intended.
            if x[ind_escolhido][val] not in nao_dag:
                G = vetor_Rede(x[ind_escolhido], nodes)
                if G:
                    fitness_aux[ind_escolhido] = abs(BicScore(data).score(G))
                    numero_mutacao = numero_mutacao - 1
                else:
                    # NOTE(review): the individual is appended by reference
                    # and then reverted on the next line, so nao_dag ends up
                    # holding the reverted values - verify.
                    nao_dag.append(x[ind_escolhido])
                    x[ind_escolhido][val] = valor_mut_antigo
            else:
                x[ind_escolhido][val] = valor_mut_antigo
def learnedStructureModel():
    """Learn a structure by hill climbing, fit its CPDs and evaluate it.

    Training and testing pairs come from
    ``gtd.formSameWriterDiffWriterInputOutputFeaturePairs``; the hypothesis
    column is ``'h'`` and all eighteen feature columns serve as evidence.
    """
    # trainingData, testingData = differenceBetweenFeatures(True)
    trainingInputs, trainingOutputs, testingInputs, testingOutputs = \
        gtd.formSameWriterDiffWriterInputOutputFeaturePairs(5, True)

    # Feature columns in input order (there is no 'f10'); also the evidence.
    evidenceNodes = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
                     'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18',
                     'f19']
    column_names = evidenceNodes + ['h']

    trainingData = pd.DataFrame(
        data=np.concatenate((trainingInputs, trainingOutputs), axis=1),
        columns=list(column_names))
    testingData = pd.DataFrame(
        data=np.concatenate((testingInputs, testingOutputs), axis=1),
        columns=list(column_names))
    #trainingData = trainingData.drop(['f9', 'f18'], axis=1)
    #testingData = testingData.drop(['f9', 'f18'], axis=1)

    search = HillClimbSearch(trainingData,
                             scoring_method=BicScore(trainingData))
    model = search.estimate(max_indegree=20)

    state_names = {
        'f1': [0, 1, 2, 3],
        'f2': [0, 1, 2, 3, 4],
        'f3': [0, 1, 2],
        'f4': [0, 1, 2, 3, 4],
        'f5': [0, 1, 2, 3],
        'f6': [0, 1, 2, 3],
        'f7': [0, 1, 2, 3],
        'f8': [0, 1, 2, 3, 4],
        'f9': [0, 1, 2],
        'f11': [0, 1, 2, 3],
        'f12': [0, 1, 2, 3, 4],
        'f13': [0, 1, 2],
        'f14': [0, 1, 2, 3, 4],
        'f15': [0, 1, 2, 3],
        'f16': [0, 1, 2, 3],
        'f17': [0, 1, 2, 3],
        'f18': [0, 1, 2, 3, 4],
        'f19': [0, 1, 2],
        'h': [0, 1]
    }

    # Fit the model to the data, i.e. compute the CPDs.
    model.fit(trainingData,
              estimator=BayesianEstimator,
              prior_type='BDeu',
              state_names=state_names)
    print(model.edges())

    # Compute the probability of the hypothesis given the evidence.
    evaluateModel(model, testingData, 'h', evidenceNodes)
def build_structure(data):
    """Learn a DAG from ``data`` and return it as a 0/1 adjacency matrix.

    The matrix is square with side ``data.shape[1]``, has a 1 at
    ``[parent, child]`` for every learned edge, and is also saved to
    ``dataset/DAG.npy``.
    """
    frame = pd.DataFrame(data)
    learned = HillClimbSearch(frame, scoring_method=BicScore(frame)).estimate()

    width = data.shape[1]
    DAG = np.zeros((width, width), np.int64)
    for parent, child in learned.edges():
        DAG[parent, child] = 1

    np.save('dataset/DAG.npy', DAG)
    return DAG
def _SetScoringType(df, scoretype, verbose=3):
    """Return the pgmpy scoring object selected by ``scoretype``.

    Args:
        df: Data passed to the scoring class constructor.
        scoretype: One of ``'bic'``, ``'k2'`` or ``'bdeu'``.
        verbose: The selection is printed when ``verbose >= 3``.

    Returns:
        ``BicScore(df)``, ``K2Score(df)`` or
        ``BDeuScore(df, equivalent_sample_size=5)``.

    Raises:
        ValueError: If ``scoretype`` is not one of the supported names
            (this used to surface as an ``UnboundLocalError``).
    """
    if verbose >= 3:
        print('[bnlearn] >Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BDeuScore(df, equivalent_sample_size=5)
    else:
        raise ValueError(
            "Unknown scoretype %r; expected 'bic', 'k2' or 'bdeu'."
            % (scoretype,))

    return scoring_method
def main(): data, string = readData() genes = np.array(data.columns[1:]) labels = np.array(data.columns) bayesianModel = BayesianModel() transitionModel = DBN() bayesianModel.add_nodes_from(genes) transitionModel.add_nodes_from(genes) bData, tData = getData(data, labels) print "\nDynamic Bayesian Network inference", print "\nB_0 network relations: " hcb = HillClimbSearch(bData, genes, scoring_method=BicScore(bData, labels, bk1=string, weight=4)) best_model_b = hcb.estimate(start=bayesianModel, tabu_length=15, max_indegree=2) print(best_model_b.edges()) printOutputB(best_model_b) print "\nLocal Probability Model: " best_model_b.fit(bData, BayesianEstimator) for cpd in best_model_b.get_cpds(): print(cpd) print "\nB_transition network relations: " hct = HillClimbSearch(tData, genes, scoring_method=BicScore(tData, labels, bk1=string, weight=4)) best_model_t = hct.estimate_dynamic(start=transitionModel, tabu_length=15, max_indegree=2) print(best_model_t.edges()) printOutputT(best_model_t) print "\nLocal Probability Model: " best_model_t.fit(tData, BayesianEstimator) for cpd in best_model_t.get_cpds(): print(cpd)
def SetScoringType(df, scoretype, verbose=3):
    """Return the pgmpy scoring object selected by ``scoretype``.

    Args:
        df: Data passed to the scoring class constructor.
        scoretype: One of ``'bic'``, ``'k2'`` or ``'bdeu'``.
        verbose: The selection is printed when ``verbose >= 3``.

    Returns:
        ``BicScore(df)``, ``K2Score(df)`` or
        ``BdeuScore(df, equivalent_sample_size=5)``.

    Raises:
        ValueError: If ``scoretype`` is not one of the supported names
            (this used to surface as an ``UnboundLocalError``).
    """
    if verbose >= 3:
        print('[BNLEARN][STRUCTURE LEARNING] Set scoring type at [%s]' %
              (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BdeuScore(df, equivalent_sample_size=5)
    else:
        raise ValueError(
            "Unknown scoretype %r; expected 'bic', 'k2' or 'bdeu'."
            % (scoretype,))

    return scoring_method
def bei_ye_si():
    """Train a Bayesian network on the Titanic data and print its accuracy.

    The first 800 rows of the whitespace-delimited file are used for
    structure and parameter learning, the remaining rows for testing.
    Warnings are silenced globally as a side effect.
    """
    warnings.filterwarnings("ignore")
    print('现在进行的算法是贝叶斯网络')

    # Use a context manager so the file handle is closed once the table is
    # loaded (it was previously left open).
    with open('泰坦尼克号.txt') as f:
        dataset = pd.read_table(f, delim_whitespace=True)

    train = dataset[:800]
    test = dataset[800:]

    hc = HillClimbSearch(train, scoring_method=BicScore(train))
    best_model = hc.estimate()
    best_model.fit(train, estimator=BayesianEstimator,
                   prior_type="BDeu")  # default equivalent_sample_size=5

    predict_data = test.drop(columns=['Survived'], axis=1)
    y_pred = best_model.predict(predict_data)
    # Accuracy on the test set.
    print((y_pred['Survived'] == test['Survived']).sum() / len(test))
def setUp(self):
    """Create the random and Titanic fixtures and their estimators."""
    # Two random columns plus 'C', an exact copy of 'B'.
    random_frame = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                columns=list('AB'))
    random_frame['C'] = random_frame['B']
    self.rand_data = random_frame

    self.est_rand = ExhaustiveSearch(self.rand_data)
    bdeu = BdeuScore(self.rand_data)
    self.est_rand_bdeu = ExhaustiveSearch(self.rand_data, scoring_method=bdeu)
    bic = BicScore(self.rand_data)
    self.est_rand_bic = ExhaustiveSearch(self.rand_data, scoring_method=bic)

    # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
    self.titanic_data = pd.read_csv(
        'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
    selected_columns = ["Survived", "Sex", "Pclass"]
    self.titanic_data2 = self.titanic_data[selected_columns]
    self.est_titanic = ExhaustiveSearch(self.titanic_data2)
def scoreStructureLearn(data,
                        search='HillClimbSearch',
                        scoring_method='BicScore'):
    """Score-and-search based structure learning.

    Args:
        data: Data handed to the scoring class and to the search class.
        search: ``'HillClimbSearch'`` selects hill climbing; any other
            value selects ``ExhaustiveSearch``.
        scoring_method: ``'BicScore'``, ``'K2Score'`` or ``'BdeuScore'``.

    Returns:
        The model returned by the selected search's ``estimate()``.

    Raises:
        ValueError: If ``scoring_method`` is not a supported name (this
            used to surface as an ``UnboundLocalError``).
    """
    if scoring_method == 'BicScore':
        scoring_method_tmp = BicScore(data)
    elif scoring_method == 'K2Score':
        scoring_method_tmp = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scoring_method_tmp = BdeuScore(data, equivalent_sample_size=5)
    else:
        raise ValueError(
            "Unknown scoring_method %r; expected 'BicScore', 'K2Score' "
            "or 'BdeuScore'." % (scoring_method,))

    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scoring_method_tmp)
    else:
        es = ExhaustiveSearch(data, scoring_method=scoring_method_tmp)

    best_model = es.estimate()
    return best_model
def learn_structure(self, method, scoring_method, log=True):
    ''' (4) Method that builds the structure of the data
    -----------------
    Parameters:
    method          : The technique used to search for the structure
        -> scoring_approx     - To use an approximated search with scoring method
        -> scoring_exhaustive - To use an exhaustive search with scoring method
        -> constraint         - To use the constraint based technique
    scoring_method : K2, bic, bdeu (ignored when method is "constraint")
    log            : "True" if you want to print debug information in the console

    Raises:
    ValueError - if method is unknown, or if a scoring based method is
                 requested with an unknown scoring_method (both cases used
                 to fail later with an UnboundLocalError)

    The result is stored in self.best_model; every edge is written through
    self.file_writer.
    '''
    if method not in ("scoring_approx", "scoring_exhaustive", "constraint"):
        raise ValueError(
            "Unknown method %r; expected 'scoring_approx', "
            "'scoring_exhaustive' or 'constraint'." % (method,))

    if method == "constraint":
        est = ConstraintBasedEstimator(self.data)
    else:
        # Select the scoring method for the local search of the structure
        if scoring_method == "K2":
            scores = K2Score(self.data)
        elif scoring_method == "bic":
            scores = BicScore(self.data)
        elif scoring_method == "bdeu":
            scores = BdeuScore(self.data)
        else:
            raise ValueError(
                "Unknown scoring_method %r; expected 'K2', 'bic' or 'bdeu'."
                % (scoring_method,))

        # Select the actual method
        if method == "scoring_approx":
            est = HillClimbSearch(self.data, scores)
        else:
            est = ExhaustiveSearch(self.data, scores)

    self.best_model = est.estimate()
    self.eliminate_isolated_nodes()  # REMOVE all nodes not connected to anything else

    # edges() works on both networkx 1.x and 2.x; edges_iter() was removed
    # in networkx 2.0.
    for edge in self.best_model.edges():
        self.file_writer.write_txt(str(edge))

    self.log("Method used for structural learning: " + method, log)
    #self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
    self.log("Search terminated", log)
def scoreModels(h0Diff, h0Rarity):
    """Print the K2 and BIC scores of three fixed structures per dataset.

    ``h0Diff`` is scored against the 'difference' structures (nodes d1..d9)
    and ``h0Rarity`` against the 'rarity' structures (nodes r1..r9).
    """
    line_templates = (
        'k2score model0: {0} BicScore model0: {1}',
        'k2score model1: {0} BicScore model1: {1}',
        'k2score model2: {0} BicScore model2: {1}',
    )

    def report(samples, structures):
        # One output line per candidate structure, K2 first, then BIC.
        for template, edge_list in zip(line_templates, structures):
            k2_value = K2Score(samples).score(BayesianModel(edge_list))
            bic_value = BicScore(samples).score(BayesianModel(edge_list))
            print(template.format(k2_value, bic_value))

    diff_structures = (
        [('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'), ('d3', 'd8'),
         ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'), ('d9', 'd8')],
        [('d2', 'd5'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
         ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
         ('d9', 'd8')],
        [('d1', 'd2'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
         ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
         ('d9', 'd8')],
    )
    print(' \nestimating K2/BIC score of difference structures\n')
    report(h0Diff, diff_structures)

    rarity_structures = (
        [('r5', 'r9'), ('r5', 'r3'), ('r9', 'r1'), ('r8', 'r3'),
         ('r6', 'r9'), ('r6', 'r3')],
        [('r6', 'r9'), ('r7', 'r9'), ('r3', 'r4'), ('r3', 'r5'),
         ('r3', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
         ('r9', 'r1')],
        [('r7', 'r9'), ('r4', 'r3'), ('r4', 'r9'), ('r1', 'r2'),
         ('r1', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
         ('r9', 'r6')],
    )
    print(' \nestimating K2/BIC score of rarity structures\n')
    report(h0Rarity, rarity_structures)
'Cancer': cancer, 'Age': age, 'Tuberculose': tuberculosis,
    'TbOuCa': tbOrCa, 'VisiteAsie': visitAsia, 'Radiographie': xray,
    'Bronchite': bronchitis, 'Dyspnea': dyspnea, 'Geographie': geographical
})
print(data)

# Structure learning: hill climbing scored with BIC.
from pgmpy.estimators import HillClimbSearch, BicScore
bic = BicScore(data)
hc = HillClimbSearch(data, scoring_method=bic)
best_model = hc.estimate()
print(best_model.edges())

# Reviewing the learned structure shows that the program recovers the links
# but not their direction.
# The model with the correct orientation would therefore be:
bon_model = BayesianModel([('Cancer', 'TbOuCa'), ('TbOuCa', 'Dyspnea'),
                           ('TbOuCa', 'Bronchite'),
                           ('TbOuCa', 'Radiographie'),
                           ('Fumeur', 'Bronchite'),
                           ('Radiographie', 'Dyspnea'),
                           ('Tuberculose', 'TbOuCa'),
                           ('Bronchite', 'Dyspnea')])

# Parameter learning
#print("estimation des cpds :")
from pgmpy.estimators import BayesianEstimator
data2 = pd.DataFrame(data=raw_data2)

import time
t0 = time.time()
# Exhaustive search over data2, scored with K2; the elapsed time is printed.
searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
search = searcher.all_scores()
print('time:', time.time() - t0)
# Uncomment for printout:
#for score, model in search:
#    print("{0} {1}".format(score, model.edges()))
separator()

# Hill climbing with K2, once per dataset. The first search previously ran
# on data2 while scoring on data; search and score must use the same frame,
# matching the BIC pair below.
hcs = HillClimbSearch(data, scoring_method=K2Score(data))
model = hcs.estimate()
hcs2 = HillClimbSearch(data2, scoring_method=K2Score(data2))
model2 = hcs2.estimate()

# Hill climbing with BIC, once per dataset.
hcs_bic = HillClimbSearch(data, scoring_method=BicScore(data))
model_bic = hcs_bic.estimate()
hcs_bic2 = HillClimbSearch(data2, scoring_method=BicScore(data2))
model_bic2 = hcs_bic2.estimate()
# End of Task 6
# Time: 2020/12/21 15:38
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore

data = pd.read_csv(
    r'C:\Users\haomiaowu\Desktop\BN-Cheminformatics\Train-clear.csv')

# Build the scorer once and reuse it; a second, identical BicScore used to
# be constructed for the search while `bic` was left unused.
bic = BicScore(data)
hs = HillClimbSearch(data, scoring_method=bic)
best_model = hs.estimate()
print(best_model.edges())

# Draw the learned network.
nx.draw(
    best_model,
    with_labels=True,
    node_size=1000,
    font_weight='bold',
    node_color='y',
)
plt.show()
def annealing(maxsteps=1000, debug=True): """ Optimize the black-box function 'cost_function' with the simulated annealing algorithm.""" #Ler data with open('Asia.csv') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') aux = 0 data = [] data1 = [[] for i in range(8)] for row in csv_reader: data.append(row) for i in range(len(row)): data1[i].append(row[i]) aux = aux + 1 if aux == 50001: break data = {} for i in range(len(data1)): data[data1[i][0]] = [data1[i][j] for j in range(1, len(data1[i]))] data = pd.DataFrame(data) print("Data: ") print(data) #Dados Retirandos do arquivo prob = 0.5 min_valor = 0 max_valor = 2 nao_dag = [] nodes = ['Pollution', 'Smoker', 'Cancer', 'Xray', 'Dyspnoea'] nodes = ['asia', 'tub', 'smoke', 'lung', 'bronc', 'either', 'xray', 'dysp'] ind_size = round((len(nodes) * len(nodes) - len(nodes)) / 2) ind = False while ind == False: aux = [random.randint(min_valor, max_valor) for i in range(ind_size)] if aux not in nao_dag: G = vetor_Rede(aux, nodes) if G: state = deep_copy(aux) ind = True else: nao_dag.append(aux) print('state') print(state) bic_score = BicScore(data) print(vetor_Rede(state, nodes)) cost = cost_function(state, bic_score, nodes) states, costs = [state], [cost] for step in range(maxsteps): print(step) fraction = step / float(maxsteps) T = temperature(fraction) #[new_state,new_cost]=pertubacao(deep_copy(state),deep_copy(cost),prob,max_valor,min_valor,bic_score,nodes,nao_dag) [new_state, new_cost] = mutacao(deep_copy(state), deep_copy(cost), prob, max_valor, min_valor, bic_score, nodes, nao_dag) #new_cost = cost_function(new_state,bic_score,nodes) #if debug: print("Step #{:>2}/{:>2} : T = {:>4.3g}, state = {:>4.3g}, cost = {:>4.3g}, new_state = {:>4.3g}, new_cost = {:>4.3g} ...".format(step, maxsteps, T, state, cost, new_state, new_cost)) if acceptance_probability(cost, new_cost, T) > random.random(): state1 = new_state.copy() cost = deep_copy(new_cost) states.append(state1) costs.append(cost) state = deep_copy(state1) 
# print(" ==> Accept it!") # else: # print(" ==> Reject it...") return state, cost_function(state, bic_score, nodes), states, costs
# Run the annealing search, draw the resulting network and print the final
# state with its cost.
state, c, states, costs = annealing(maxsteps=3000, debug=True)
nodes = ['asia', 'tub', 'smoke', 'lung', 'bronc', 'either', 'xray', 'dysp']
G = vetor_Rede(state, nodes)
nx.draw(G, with_labels=True)
print(state)
print(c)
# Re-read up to 50001 rows of the CSV (the first row supplies the column
# names) so the reference network can be scored on the same data.
with open('Asia.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    aux = 0
    data = []
    # One list per CSV column; exactly 8 columns are provided for.
    data1 = [[] for i in range(8)]
    for row in csv_reader:
        data.append(row)
        for i in range(len(row)):
            data1[i].append(row[i])
        aux = aux + 1
        if aux == 50001:
            break
# NOTE(review): unexplained number kept from the original; presumably a
# previously observed score - unverified.
#22376.39851240954
data = {}
for i in range(len(data1)):
    data[data1[i][0]] = [data1[i][j] for j in range(1, len(data1[i]))]
data = pd.DataFrame(data)
print("Data: ")
print(data)
# Data loaded from the file
reader = BIFReader('asia.bif')  # best Asia network, as published on bnlearn.com
asia_model = reader.get_model()  # read that model
print("Score BIC")
print(abs(BicScore(data).score(asia_model)))
#see_annealing(states, costs)
#print(dataPreparation.get_work_lists()) feature_names = dataPreparation.get_work_lists() feature_names.append("Person") print(feature_names) #mydata = np.random.randint(low=0, high=2,size=(100, 6)) mydata = np.genfromtxt( r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\activities+time_ordered_withoutdatetime.csv', delimiter=",") #pd.read_csv(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\data.csv') #print(mydata) data = pd.DataFrame(mydata, columns=feature_names) #['X', 'Y']) print(data) list_of_scoring_methods = [ BicScore(data), #BdeuScore(data), #K2Score(data) ] for scoreMethod in list_of_scoring_methods: start_time = time.time() hc = HillClimbSearch(data, scoreMethod) best_model = hc.estimate() print(hc.scoring_method) print(best_model.edges()) end_time = time.time() print("execution time in seconds:") print(end_time - start_time) estimator = BayesianEstimator(best_model, data)
def learn_structure(self, file_path, algorithm="hc", significance_level=0.05):
    """
    Employs `pgmpy` package's Bayesian Network structure learning algorithms
    to learn structure from a dataset. Saves a tabular version of the result
    as a CSV file.

    Arguments:
        algorithm: str, optional (default = 'hc')
            Determines whether the hill-climbing or Peter-Clark are employed.
            Two possible values include: 'hc', 'pc'. Note, I found a bug in
            pgmpy implementation halfway through this project. Don't use the
            'pc' method.
        file_path: str, the absolute path to save the file to
            (e.g. "~/Desktop/BN_structure.csv")
        significance_level: float, option (default = 0.05)
            Statistical significance cutoff for use in pruning the network
            when using the PC algorithm. Lower values produce sparser
            networks.

    Returns:
        None

    Raises:
        ValueError: if `algorithm` is neither 'hc' nor 'pc'. Such a value
            used to be accepted silently without learning or saving
            anything.
    """
    if algorithm not in ("hc", "pc"):
        raise ValueError(
            f"Unsupported algorithm {algorithm!r}; expected 'hc' or 'pc'.")

    self.structure_algorithm = algorithm

    if self.verbose:
        print(
            "Depending on the number of variables in your dataset, this might take some time..."
        )

    # Learn structure, using one of the algorithms
    np.random.seed(self.random_seed)

    if algorithm == "hc":
        # Filter out columns with zero correlation with target variable
        self.filtered_df = self._initial_filter()

        # Run HC algorithm
        self.structure_model = HillClimbSearch(
            self.filtered_df,
            scoring_method=BicScore(self.filtered_df)).estimate()

        if self.verbose:
            print(
                f"Structure learned! Saving structure to the following CSV: {file_path}"
            )

        # Eliminate isolated subgraphs: keep only the component that
        # contains the target variable, in the model and in the data.
        G = self.structure_model.to_undirected()
        connected_nodes = list(
            nx.algorithms.components.node_connected_component(
                G, self.target_variable))
        disconnected_nodes = list(
            set(list(self.structure_model.nodes)) - set(connected_nodes))
        for node in disconnected_nodes:
            self.structure_model.remove_node(node)
            self.filtered_df.drop([node], axis=1, inplace=True)
    else:
        self.filtered_df = self.df
        self.structure_model = ConstraintBasedEstimator(
            self.filtered_df).estimate(
                significance_level=significance_level)

        if self.verbose:
            print(
                f"Structure learned! Saving structure to the following CSV: {file_path}"
            )

    # Both algorithms export the edge list in the same tabular format.
    pd.DataFrame(
        list(self.structure_model.edges),
        columns=["from_variable", "to_variable"],
    ).to_csv(file_path, index=False)
from pgmpy.estimators import HillClimbSearch, BicScore, BayesianEstimator
from pgmpy.models import BayesianModel
from pgmpy.readwrite.BIF import BIFWriter
import pandas as pd
import numpy as np
from time import time
import graphviz as gv
import os

train = pd.read_csv('../msnbcWithHeader.csv', sep=',')
# Keep rows whose values sum to less than 200, then clip every value above
# 1 down to 1.
train = train[train.sum(axis=1) < 200]
train[train > 1] = 1
train_start = time()
# Structure learning: hill climbing scored with BIC.
bic = BicScore(train)
hc = HillClimbSearch(train, scoring_method=bic)
best_model = hc.estimate(prog_bar=True)
edges = best_model.edges()
# NOTE(review): the model is rebuilt from the edge list only, so a node
# without any edge would presumably be missing from `model` - confirm that
# this is intended.
model = BayesianModel(edges)
model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
variables = model.nodes()
print(model.edges())
# Despite its name, train_end holds the elapsed training time in seconds.
train_end = time() - train_start
print("train time " + str(train_end))
# Mirror the learned network as a graphviz digraph.
my_graph = gv.Digraph(format='png')
for node in variables:
    my_graph.node(node)
for edge in edges:
    my_graph.edge(edge[0], edge[1])
def learn(self, file1, file2):
    """Merge a given edge list with a learned structure and print the CPDs.

    Args:
        file1: Path of a UTF-8 text file whose first line is parsed by
            ``self.getegdes`` into a flat sequence
            ``[from_0, to_0, from_1, to_1, ...]``.
        file2: Path of the CSV file holding the training data.

    Prints the merged edge list, then one tab-separated line per node:
    name, number of observed values, transposed CPT, parent sequence and
    parent cardinalities. Nothing is returned.
    """
    # Close the file as soon as its content is read (the handle used to be
    # left open).
    with open(file1, encoding="utf8") as f1:
        lines = f1.readlines()
    edges = self.getegdes(lines[0])
    data = pd.read_csv(file2)

    # Graph described by the input file.
    G = nx.DiGraph()
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])

    # Graph learned from the data.
    est = HillClimbSearch(data, scoring_method=BicScore(data))
    model = est.estimate()
    G_ = nx.DiGraph()
    G_.add_edges_from(model.edges())

    # Add a learned edge when it brings in a new node, or when the given
    # graph has no path in the opposite direction (which would close a
    # cycle).
    for i, j in G_.edges():
        if i not in G.nodes() or j not in G.nodes():
            G.add_edge(i, j)
        elif not nx.has_path(G, j, i):
            G.add_edge(i, j)

    new_model = BayesianModel()
    new_model.add_edges_from(G.edges)
    G = new_model.copy()

    estimator = BayesianEstimator(G, data)
    edges = [str(edge) for edge in G.edges]
    print(edges)

    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        # cpd.variables lists the node itself first; the rest is kept as
        # the parent sequence.
        sequence = cpd.variables[1::]
        card = [len(dict(data[x].value_counts())) for x in sequence]
        output = nodeName + '\t' + str(valueNum) + '\t' + str(
            CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
        print(output)
import pandas as pd
from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import BDeuScore, BicScore, K2Score

## Structure learning
data = pd.read_csv('data.csv', encoding='gb18030')
df = pd.DataFrame(data)
bic = BicScore(df)
# k2 is only referenced by the commented-out exhaustive search below.
k2 = K2Score(df)
hc = HillClimbSearch(df, scoring_method=bic)
#hc = ExhaustiveSearch(df, k2)
model = hc.estimate()
for ee in model.edges():
    print(ee)

## Parameter learning
from pgmpy.models import BayesianModel
mod = BayesianModel(model.edges())
mod.fit(df)
for cpd in mod.get_cpds():
    print(cpd)
#print(mod.local_independencies('HA'))

## Model inference
from pgmpy.inference import VariableElimination, BeliefPropagation
cancer_infer = VariableElimination(mod)
# Distribution of 'HA' with no evidence supplied.
q = cancer_infer.query(variables=['HA'])
print(q)
col_names = pd.read_csv('data/names.csv')  # 'data/names.csv'
data = pd.read_csv('data/breast-cancer-wisconsin.data',
                   names=col_names.columns)
# Drop the rows whose bare_nuclei value is the '?' placeholder.
data = data[data["bare_nuclei"] != '?']
data.set_index('id', inplace=True)  #stop the model from using id as a node

train, test = train_test_split(data, test_size=0.2, random_state=0)
Y_test = test['class']
test = test.drop(['class'], axis=1)

#convert labels to something that can be handled be sklearn's eval functions
labelencoder = LabelEncoder()
Y_test = labelencoder.fit_transform(Y_test.values.ravel())

### Greedy Structure Learning with Hill Climbing
# Search on the training split only. The search previously received the
# full `data` frame (test rows included) while the score was computed on
# `train`, which leaked the test set into structure learning.
hc = HillClimbSearch(train, scoring_method=BicScore(train))
hc_model = hc.estimate()

### Parameter Learning with Bayesian Estimation
hc_model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")

### If the following for loop is un-commented the terminal will be flooded with CPDs
"""
for cpd in best_model.get_cpds():
    print(cpd)
"""
print()

### Another Method (it will throw errors about sample size - but it still runs and shouldn't be too messed up)
###Constraint Based Structure Learning
est = ConstraintBasedEstimator(train)
def __init__(self, dataframe):
    """Wrap a BIC scorer for ``dataframe`` and start with an empty cache."""
    # pgmpy scorer built on the given data.
    self.estimator = BicScore(dataframe)
    # Cache of local scores; starts out empty.
    self.hashed_local_scores = dict()
model.add_nodes_from(data.columns.values) # Learn temporal relations from data model.learn_temporal_relationships(data) # Delete columns with temporal information data.fillna(0, inplace=True) for col in list(data.columns.values): if col.endswith(ITBN.start_time_marker) or col.endswith( ITBN.end_time_marker): data.drop(col, axis=1, inplace=True) elif not col.startswith(ITBN.temporal_node_marker): data[col] = data[col].map({1: 'Y', -1: 'N'}) # Learn model structure from data and temporal relations hc = HillClimbSearchITBN(data, scoring_method=BicScore(data)) model = hc.estimate(start=model) # model.add_edge('response', 'command') # model.add_edge('response', 'tm_response_command') # model.add_edge('command', 'tm_response_command') # Learn model parameters model.fit(data) # Add observation nodes and cpds obs_edges = list() obs_cpds = list() state_names = { 'command': ['N', 'Y'], 'prompt': ['N', 'Y'], 'reward': ['N', 'Y'],
class MDL_Scorer:
    """Score networks with the absolute value of pgmpy's BIC local score.

    Lower totals are treated as better. Local scores are memoised per
    (node, parents) combination.
    """

    def __init__(self, dataframe):
        """Create the BIC estimator for ``dataframe`` and an empty cache."""
        self.estimator = BicScore(dataframe)
        # Maps `node_name + str(parent_names)` to the cached local score.
        self.hashed_local_scores = {}

    def local_score(self, node_name, parent_names):
        """Return ``abs`` of the estimator's local score, cached per key."""
        key = node_name + str(parent_names)
        if key not in self.hashed_local_scores:
            self.hashed_local_scores[key] = abs(
                self.estimator.local_score(node_name, parent_names))
        return self.hashed_local_scores[key]

    def score(self, network, verbose=0):
        """Return the sum of the local scores of every node of ``network``.

        ``network`` must provide ``num_nodes()``, ``get_parents(i)`` and
        ``node_names(index_or_indices)``. Progress is printed when
        ``verbose`` is greater than 2 (per node when greater than 3).
        """
        total = 0
        if verbose > 2:
            print("starting scoring")
        for i in range(network.num_nodes()):
            if verbose > 3:
                print("node", i)
            parents = network.get_parents(i)
            node_name = network.node_names(i)
            parent_names = network.node_names(parents)
            if verbose > 3:
                print("starting local score")
            total += self.local_score(node_name, parent_names)
            if verbose > 3:
                print("ended local score")
        if verbose > 2:
            print("ended scoring")
        return total

    # this performance can be improved
    def n_lowest_score(self,
                       n,
                       networks,
                       score_history_list,
                       network_history_list,
                       verbose=False):
        """Return the ``n`` lowest scoring networks as a list.

        All scores and networks, in sorted order, are appended to
        ``score_history_list`` and ``network_history_list`` in place. An
        empty ``networks`` yields ``[]`` and leaves both history lists
        untouched (it used to fail while unpacking the sort result).
        """
        if len(networks) == 0:
            return []
        networks_sorted = networks.copy()
        scores = [self.score(network, verbose=verbose) for network in networks]
        sorted_scores, sorted_networks = sort_together(
            [scores, networks_sorted])
        score_history_list += list(sorted_scores)
        network_history_list += list(sorted_networks)
        return list(sorted_networks[:n])

    def lowest_score(self, networks, verbose=False):
        """Return a dict describing the lowest scoring network.

        Keys are ``best_index``, ``best_score`` and ``best_network``; for
        an empty ``networks`` they stay at ``-1``, ``inf`` and ``None``.
        The first network wins ties. ``verbose`` is forwarded to
        ``score`` (it was previously accepted but ignored).
        """
        result = {
            'best_index': -1,
            'best_score': float('inf'),
            'best_network': None
        }
        for index, network in enumerate(networks):
            local = self.score(network, verbose=verbose)
            if local < result['best_score']:
                result['best_index'] = index
                result['best_score'] = local
                result['best_network'] = network
        return result