def kFold_cross_validation_bayesian(X, y, splits=10):
    """
    Cross-validation for the Bayesian network.

    For every fold a fresh model with the fixed structure below is fitted on
    that fold's training partition only and scored on the held-out partition.
    The caller's ``X`` is not modified.

    :param X: X dataframe - known values
    :param y: y column(s) - values to predict
    :param splits: number of folds to use
    :return: mean accuracy over the folds
    """
    edges = [('fat_value', 'saturated-fat_value'),
             ('carbohydrates_value', 'sugars_value'),
             ('proteins_value', 'salt_value'),
             ('fat_value', 'energy_value'),
             ('carbohydrates_value', 'energy_value'),
             ('salt_value', 'nutri_value'),
             ('energy_value', 'nutri_value'),
             ('saturated-fat_value', 'nutri_value'),
             ('sugars_value', 'nutri_value')]

    folds = KFold_splitting(X, y, splits)
    scores = []
    for fold in folds:
        # NOTE(review): each fold is assumed to be laid out as
        # (X_train, X_test, y_train, y_test); indices 1 and 3 were already
        # used as the test partition - confirm against KFold_splitting.
        train_data = fold[0].copy()
        train_data['nutri_value'] = fold[2]
        predict_data = fold[1].copy()
        real_data = fold[3].copy()

        model = BayesianModel(edges)
        # Fit on the training partition only; fitting on the full X would
        # let the model see the rows it is evaluated on.
        model.fit(train_data, estimator=BayesianEstimator, prior_type="BDeu")
        y_pred = model.predict(predict_data)
        scores.append(accuracy_score(y_pred, real_data))

    avg_scores = statistics.mean(scores)
    # statistics.stdev needs at least two data points.
    std_scores = statistics.stdev(scores) if len(scores) > 1 else 0.0
    print('Accuracy: %.3f (Standard Dev: %.3f)' % (avg_scores, std_scores))
    return avg_scores
def test_predict(self):
    """Fit Sex/Pclass -> Survived on rows 500+ and check the prediction of
    each variable when it is the one left out of the first 30 rows."""
    titanic = BayesianModel()
    titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
    titanic.fit(self.titanic_data2[500:])

    # Each column pair is the evidence; the remaining variable is predicted.
    evidence_columns = (
        ["Sex", "Pclass"],
        ["Survived", "Pclass"],
        ["Survived", "Sex"],
    )
    predictions = [
        titanic.predict(self.titanic_data2[columns][:30])
        for columns in evidence_columns
    ]

    expected = [
        np.array([
            '0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
            '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
            '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
        ]),
        np.array([
            'male', 'female', 'female', 'female', 'male',
            'male', 'male', 'male', 'female', 'female',
            'female', 'female', 'male', 'male', 'male',
            'female', 'male', 'female', 'male', 'female',
            'male', 'female', 'female', 'female', 'male',
            'female', 'male', 'male', 'female', 'male',
        ]),
        np.array([
            '3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
            '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
            '3', '1', '1', '1', '3', '1', '3', '3', '1', '3',
        ]),
    ]

    for predicted, wanted in zip(predictions, expected):
        np_test.assert_array_equal(predicted.values.ravel(), wanted)
def create_BN_model(data):
    """
    Learn a network structure with hill climbing, fit its CPDs on ``data``
    and write the result to ``model_pgmpy.bif``.

    :param data: data passed both to HillClimbSearch and to ``model.fit``
    :return: tuple ``(model, elapsed)`` where ``elapsed`` is the time between
        the start of structure learning and the end of fitting (writing the
        BIF file is not included)
    """
    print("Structure learning")
    start_time = datetime.now()
    print("Start time: ", start_time)

    # Structure learning with the hill climb algorithm
    learned_structure = HillClimbSearch(data).estimate()
    print(learned_structure.edges())
    model = BayesianModel(learned_structure.edges())

    print('Fitting the model...')
    # fit() is called without an explicit estimator; the original note
    # describes this step as Maximum Likelihood Estimation.
    model.fit(data)
    end_time = datetime.now()
    print("End time: ", end_time)

    BIFWriter(model).write_bif('model_pgmpy.bif')

    if not model.check_model():
        print("not good")
    else:
        print(
            "Your network structure and CPD's are correctly defined. The probabilities in the columns sum to 1. Hill Climb worked fine!"
        )
    return (model, end_time - start_time)
def naiveModel2():
    """
    Build and evaluate a network in which 'f19' is the single parent of
    'f1' .. 'f18'.

    Training and testing frames are assembled by concatenating the input and
    output arrays column-wise; the model is fitted with a BDeu prior and then
    handed to ``evaluateModel``.
    """
    (trainingInputs, trainingOutputs,
     testingInputs, testingOutputs) = gtd.formSameWriterDiffWriterInputOutputFeaturePairs(5, True)

    # 'f1' .. 'f19'; the last column is the parent of every other column.
    column_names = ['f%d' % index for index in range(1, 20)]
    parent = column_names[-1]

    trainingData = pd.DataFrame(
        data=np.concatenate((trainingInputs, trainingOutputs), axis=1),
        columns=column_names)
    testingData = pd.DataFrame(
        data=np.concatenate((testingInputs, testingOutputs), axis=1),
        columns=column_names)

    # create model
    model = BayesianModel([(parent, feature) for feature in column_names[:-1]])

    # fit model and data, compute CPDs
    model.fit(trainingData, estimator=BayesianEstimator, prior_type='BDeu')

    # compute the probability of the hypothesis given the evidence
    evaluateModel(model, testingData, 'f19', featuresLabelList2)
def _train_bn(self):
    """Build a BayesianModel from the stored DAG (edges plus all nodes) and
    fit it on the training data with BayesianEstimator and a BDeu prior."""
    dag = self._dag
    network = BayesianModel(dag.edges)
    # Add the nodes explicitly so nodes without any edge are part of the model.
    network.add_nodes_from(dag.nodes)
    training_data = self._get_training_data()
    network.fit(training_data, BayesianEstimator, prior_type='BDeu')
    return network
def configure_network(self, X, root_node, estimator_type, class_node=None, draw_dag=True):
    """
    Learn the structure of the data with TreeSearch and fit a Bayesian
    network model on it.

    The fitted model and the learned DAG are stored on ``self.model`` and
    ``self.dag``.

    :param X: pandas DataFrame, shape (n_samples, n_features)
    :param root_node: str, int. Root node of the tree structure.
    :param estimator_type: str (chow-liu | tan). The algorithm to use for estimating the DAG.
    :param class_node: str, int. Required if estimator_type = 'tan'.
    :param draw_dag: when true, the learned DAG is passed to ``self.draw_network``.
    :return: self : object
    """
    searcher = TreeSearch(X, root_node)
    learned_dag = searcher.estimate(estimator_type=estimator_type,
                                    class_node=class_node)

    fitted_model = BayesianModel(learned_dag.edges())
    fitted_model.fit(X,
                     estimator=BayesianEstimator,
                     prior_type='dirichlet',
                     pseudo_counts=0.1)

    self.dag, self.model = learned_dag, fitted_model

    if draw_dag:
        self.draw_network(self.dag)
    return self
def pgm_generate(self, target, data, stats, subnodes):
    """
    Build and fit an explanatory Bayesian network for ``target``.

    The structure among the non-target subnodes is learned with hill
    climbing; every subnode whose p-value in ``stats`` is below 0.05 then gets
    an edge into ``target``. The network is fitted on a copy of the data in
    which the target column is mapped through ``self.generalize_target`` and
    the other columns through ``self.generalize_others``.

    :param target: name of the target column
    :param data: dataframe holding the subnode and target columns
    :param stats: mapping node -> p-value
    :param subnodes: nodes to include in the explanation
    :return: the fitted BayesianModel
    """
    p_values = pd.Series(stats, name='p-values')
    significant = p_values[p_values < 0.05]
    MK_blanket = [name for name in significant.index if name in subnodes]
    subnodes_no_target = [name for name in subnodes if name != target]

    searcher = HillClimbSearch(data[subnodes_no_target],
                               scoring_method=BicScore(data))
    pgm_no_target = searcher.estimate()
    for parent in MK_blanket:
        if parent == target:
            continue
        pgm_no_target.add_edge(parent, target)

    # Create the pgm
    pgm_explanation = BayesianModel()
    for name in pgm_no_target.nodes():
        pgm_explanation.add_node(name)
    for source, sink in pgm_no_target.edges():
        pgm_explanation.add_edge(source, sink)

    # Fit the pgm
    data_ex = data[subnodes].copy()
    data_ex[target] = data[target].apply(self.generalize_target)
    for name in subnodes_no_target:
        data_ex[name] = data[name].apply(self.generalize_others)
    pgm_explanation.fit(data_ex)

    return pgm_explanation
def single_bayes_net(df, independent_vars, dependent_vars):
    """
    Fit a network in which every independent variable is a parent of every
    dependent variable.

    :param df: data used to fit the model
    :param independent_vars: names added as nodes and used as edge sources
    :param dependent_vars: names used as edge targets
    :return: the fitted BayesianModel
    """
    network = BayesianModel()
    network.add_nodes_from(independent_vars)

    edges = [(source, sink)
             for source in independent_vars
             for sink in dependent_vars]
    for source, sink in edges:
        network.add_edge(source, sink)

    network.fit(df)
    return network
def train_model(df, lst_relations: list = default_list) -> BayesianModel:
    """
    Build a BayesianModel from ``lst_relations`` and fit it on ``df``.

    Fitting uses BayesianEstimator with a "k2" prior, an equivalent sample
    size of 10, and ``complete_samples_only=False``.

    :param df: training data
    :param lst_relations: edge list defining the network structure
    :return: the fitted model
    """
    fit_options = {
        "estimator": BayesianEstimator,
        "prior_type": "k2",
        "equivalent_sample_size": 10,
        "complete_samples_only": False,
    }
    network = BayesianModel(lst_relations)
    network.fit(df, **fit_options)
    return network
def pgm_generate(self, target, data, pgm_stats, subnodes, child=None):
    """
    Build and fit an explanatory Bayesian network for ``target``.

    Node identifiers are normalised to strings (``str(int(node))``) and the
    columns of ``data`` are converted to strings in place. The network is
    fitted on a copy of the data in which the target column is mapped through
    ``self.generalize_target`` and the other subnodes through
    ``self.generalize_others``.

    * ``child is None``: the structure among the non-target subnodes is
      learned with hill climbing on the raw data, then every node returned by
      ``self.search_MK`` gets an edge into ``target``.
    * otherwise: the structure, target included, is learned with hill
      climbing on the generalized data.

    :param target: target node identifier
    :param data: dataframe; NOTE its ``columns`` attribute is reassigned
    :param pgm_stats: unused, kept for interface compatibility
    :param subnodes: node identifiers to include
    :param child: selects the structure-learning variant (see above)
    :return: the fitted BayesianModel
    """
    subnodes = [str(int(node)) for node in subnodes]
    target = str(int(target))
    subnodes_no_target = [node for node in subnodes if node != target]
    data.columns = data.columns.astype(str)

    MK_blanket = self.search_MK(data, target, subnodes_no_target.copy())

    # Generalized data: used for fitting in both variants, and for structure
    # learning when a child is given.
    data_ex = data[subnodes].copy()
    data_ex[target] = data[target].apply(self.generalize_target)
    for node in subnodes_no_target:
        data_ex[node] = data[node].apply(self.generalize_others)

    if child is None:
        est = HillClimbSearch(data[subnodes_no_target],
                              scoring_method=BicScore(data))
        structure = est.estimate()
        for node in MK_blanket:
            if node != target:
                structure.add_edge(node, target)
    else:
        est = HillClimbSearch(data_ex, scoring_method=BicScore(data_ex))
        structure = est.estimate()

    # Create the pgm
    pgm_explanation = BayesianModel()
    for node in structure.nodes():
        pgm_explanation.add_node(node)
    for edge in structure.edges():
        pgm_explanation.add_edge(edge[0], edge[1])

    # Fit the pgm
    pgm_explanation.fit(data_ex)

    return pgm_explanation
class BayesNetwork:
    """Wrapper that builds, fits, queries and visualises a BayesianModel for
    a dataset whose graph structure is given as pairs of column indices."""

    def __init__(self, dataset, graph_structure_index):
        """
        :param dataset: object exposing the data as ``dataset.dataframe``
        :param graph_structure_index: iterable of (parent_idx, child_idx)
            pairs indexing into the dataframe columns
        """
        self.dataset = dataset
        self.columns = dataset.dataframe.columns
        self.graph_structure_index = graph_structure_index

    def build_graph(self):
        """Translate the index pairs into column names and create the model."""
        graph_structure_name = [
            (self.columns[edge[0]], self.columns[edge[1]])
            for edge in self.graph_structure_index
        ]
        self.model = BayesianModel(graph_structure_name)

    def draw_graph(self):
        """Draw the model structure via ``Drawer.draw_graph``."""
        Drawer.draw_graph(self.model)

    def fit_model(self, prior=False, prior_data=None):
        """
        Fit the model with MaximumLikelihoodEstimator on all but the last
        three rows of the dataframe.

        :param prior: fitting with priors is not implemented; passing a true
            value raises NotImplementedError
        :param prior_data: unused
        :raises NotImplementedError: when ``prior`` is true
        """
        if prior:
            # Placeholder pseudo counts for a prior-based fit (not used yet).
            pseudo_counts = {
                'D': [300, 700],
                'I': [500, 500],
                'G': [800, 200],
                'L': [500, 500],
                'S': [400, 600]
            }
            raise NotImplementedError
        else:
            self.model.fit(self.dataset.dataframe[0:-3],
                           estimator=MaximumLikelihoodEstimator)

    def inference(self, name):
        """Run a VariableElimination query for ``name`` and print the result."""
        from pgmpy.inference import VariableElimination
        self.infer = VariableElimination(self.model)
        q = self.infer.query(variables=[name])
        print(q[name])

    def evaluate_result(self):
        """Print every CPD; CPDs whose value array has exactly two axes are
        also handed to ``Drawer.draw_matrix``."""
        for cpd in self.model.get_cpds():
            print("CPD of {variable}:".format(variable=cpd.variable))
            print(cpd)
            accept_node = cpd.variables[0]
            # Only 2-D tables are drawn; drawing of higher-dimensional
            # tables is currently disabled.
            if len(cpd.values.shape) == 2:
                title = cpd.variables[1] + '----->' + accept_node
                Drawer(title=title, is_show=False, is_save=False,
                       save_path='img/' + title + '.jpg').draw_matrix(
                    cpd.values)
class BN:
    """Bayesian network over a fixed DAG, used to predict 'song_popularity'."""

    def __init__(self, DAG):
        """
        :param DAG: list of (parent, child) column-name pairs
        """
        self.data = []
        self.model = BayesianModel(DAG)

    def take_only_relevant_features(self, DAG, db_file):
        """
        Read ``db_file`` as CSV and keep only the columns that occur in ``DAG``.

        :param DAG: list of (parent, child) column-name pairs
        :param db_file: path of the CSV file
        :return: dataframe with the relevant columns, in file order
        """
        all_data = pd.read_csv(db_file)
        relevant_features = {name for edge in DAG for name in edge}
        selected = [column for column in all_data
                    if column in relevant_features]
        return all_data[selected].copy()

    def BNLearning(self, DAG, db_file):
        """Load the relevant columns of ``db_file`` and fit a fresh model on
        them with BayesianEstimator."""
        self.data = self.take_only_relevant_features(DAG, db_file)
        self.model = BayesianModel(DAG)
        self.model.fit(self.data, BayesianEstimator)

    def BNTesting(self, results_file):
        """Predict 'song_popularity' for the last 20% of the loaded rows and
        write the predictions to ``results_file`` as CSV."""
        # separate data for test
        training_part = int(0.8 * len(self.data))
        predict_data = self.data[training_part:].drop('song_popularity', axis=1)
        # predict
        y_pred = self.model.predict(predict_data)
        with open(results_file, 'w', newline='') as file:
            y_pred.to_csv(file)

    def BNForOneSong(self, DAG, db_file, results_file, songFile):
        """
        Fit a model on ``db_file``, predict for the rows of ``songFile`` and
        write the predictions to ``results_file``.

        :return: the 'song_popularity' prediction at index label 0
        """
        data = self.take_only_relevant_features(DAG, db_file)
        dataToPredictRF = self.take_only_relevant_features(DAG, songFile)
        model = BayesianModel(DAG)
        model.fit(data, BayesianEstimator)
        y_pred = model.predict(dataToPredictRF)
        with open(results_file, 'w', newline='') as file:
            y_pred.to_csv(file)
        return y_pred['song_popularity'][0]
class BaseEliminationTest(TestCase):
    """Fixture providing a five-variable student network fitted on random
    binary data."""

    def setUp(self):
        structure = [('diff', 'grade'), ('intel', 'grade'),
                     ('intel', 'sat'), ('grade', 'reco')]
        self.model = BayesianModel(structure)

        column_names = ['diff', 'grade', 'intel', 'sat', 'reco']
        samples = np.random.randint(low=0, high=2, size=(1000, 5))
        self.model.fit(pd.DataFrame(samples, columns=column_names))

    def tearDown(self):
        # elimination_order is not assigned in this setUp; deleting it here
        # raises AttributeError unless something else has set it.
        for attribute in ('model', 'elimination_order'):
            delattr(self, attribute)
class BaseEliminationTest(TestCase):
    """Fixture providing a five-variable student network fitted on random
    binary data."""

    def setUp(self):
        structure = [('diff', 'grade'), ('intel', 'grade'),
                     ('intel', 'sat'), ('grade', 'reco')]
        self.model = BayesianModel(structure)

        column_names = ['diff', 'grade', 'intel', 'sat', 'reco']
        samples = np.random.randint(low=0, high=2, size=(1000, 5))
        self.model.fit(pd.DataFrame(samples, columns=column_names))

    def tearDown(self):
        # elimination_order is not assigned in this setUp; deleting it here
        # raises AttributeError unless something else has set it.
        for attribute in ('model', 'elimination_order'):
            delattr(self, attribute)
def bayesnet_examples():
    """
    Walk through basic pgmpy usage on the student network
    (D -> G <- I, G -> L, I -> S).

    First the model is fitted on 75% of a random binary dataset and used to
    predict 'D' for the remaining rows; then hand-written TabularCPDs are
    added to the same model. Nothing is returned.
    """
    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import pandas as pd

    student_model = BayesianModel([('D', 'G'),
                                   ('I', 'G'),
                                   ('G', 'L'),
                                   ('I', 'S')])

    # we can generate some random data.
    # NOTE(review): np is not imported in this function; it is expected to
    # be available at module level.
    raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S'])

    split_at = int(data.shape[0] * 0.75)
    data_train = data[:split_at]
    student_model.fit(data_train)
    student_model.get_cpds()

    # Drop 'D' into a new frame instead of mutating a slice of `data`
    # in place, which pandas flags as a chained-assignment hazard.
    data_test = data[split_at:].drop('D', axis=1)
    student_model.predict(data_test)

    grade_cpd = TabularCPD(
        variable='G',
        variable_card=3,
        values=[[0.3, 0.05, 0.9, 0.5],
                [0.4, 0.25, 0.08, 0.3],
                [0.3, 0.7, 0.02, 0.2]],
        evidence=['I', 'D'],
        evidence_card=[2, 2])
    difficulty_cpd = TabularCPD(
        variable='D',
        variable_card=2,
        values=[[0.6, 0.4]])
    intel_cpd = TabularCPD(
        variable='I',
        variable_card=2,
        values=[[0.7, 0.3]])
    letter_cpd = TabularCPD(
        variable='L',
        variable_card=2,
        values=[[0.1, 0.4, 0.99],
                [0.9, 0.6, 0.01]],
        evidence=['G'],
        evidence_card=[3])
    sat_cpd = TabularCPD(
        variable='S',
        variable_card=2,
        values=[[0.95, 0.2],
                [0.05, 0.8]],
        evidence=['I'],
        evidence_card=[2])
    student_model.add_cpds(grade_cpd, difficulty_cpd, intel_cpd,
                           letter_cpd, sat_cpd)
class BaseEliminationTest(TestCase):
    """Fixture providing a five-variable student network fitted on random
    binary data."""

    def setUp(self):
        structure = [("diff", "grade"), ("intel", "grade"),
                     ("intel", "sat"), ("grade", "reco")]
        self.model = BayesianModel(structure)

        column_names = ["diff", "grade", "intel", "sat", "reco"]
        samples = np.random.randint(low=0, high=2, size=(1000, 5))
        self.model.fit(pd.DataFrame(samples, columns=column_names))

    def tearDown(self):
        # elimination_order is not assigned in this setUp; deleting it here
        # raises AttributeError unless something else has set it.
        for attribute in ("model", "elimination_order"):
            delattr(self, attribute)
def bayesnet_examples():
    """
    Walk through basic pgmpy usage on the student network
    (D -> G <- I, G -> L, I -> S).

    First the model is fitted on 75% of a random binary dataset and used to
    predict 'D' for the remaining rows; then hand-written TabularCPDs are
    added to the same model. Nothing is returned.
    """
    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import pandas as pd

    student_model = BayesianModel([('D', 'G'),
                                   ('I', 'G'),
                                   ('G', 'L'),
                                   ('I', 'S')])

    # we can generate some random data.
    # NOTE(review): np is not imported in this function; it is expected to
    # be available at module level.
    raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S'])

    split_at = int(data.shape[0] * 0.75)
    data_train = data[:split_at]
    student_model.fit(data_train)
    student_model.get_cpds()

    # Drop 'D' into a new frame instead of mutating a slice of `data`
    # in place, which pandas flags as a chained-assignment hazard.
    data_test = data[split_at:].drop('D', axis=1)
    student_model.predict(data_test)

    grade_cpd = TabularCPD(
        variable='G',
        variable_card=3,
        values=[[0.3, 0.05, 0.9, 0.5],
                [0.4, 0.25, 0.08, 0.3],
                [0.3, 0.7, 0.02, 0.2]],
        evidence=['I', 'D'],
        evidence_card=[2, 2])
    difficulty_cpd = TabularCPD(
        variable='D',
        variable_card=2,
        values=[[0.6, 0.4]])
    intel_cpd = TabularCPD(
        variable='I',
        variable_card=2,
        values=[[0.7, 0.3]])
    letter_cpd = TabularCPD(
        variable='L',
        variable_card=2,
        values=[[0.1, 0.4, 0.99],
                [0.9, 0.6, 0.01]],
        evidence=['G'],
        evidence_card=[3])
    sat_cpd = TabularCPD(
        variable='S',
        variable_card=2,
        values=[[0.95, 0.2],
                [0.05, 0.8]],
        evidence=['I'],
        evidence_card=[2])
    student_model.add_cpds(grade_cpd, difficulty_cpd, intel_cpd,
                           letter_cpd, sat_cpd)
def pgmpy_test():
    """Fit a single-node model on 30 zeros and 70 ones and print the data
    and the learned CPD of 'coin'."""
    # Representing heads by 0 and tails by 1
    coin_flips = np.array([0] * 30 + [1] * 70)
    data = pd.DataFrame(coin_flips, columns=['coin'])
    print(data)

    coin_model = BayesianModel()
    coin_model.add_node('coin')

    # Fitting the data to the model using Maximum Likelihood Estimator
    coin_model.fit(data, estimator=MaximumLikelihoodEstimator)
    learned_cpd = coin_model.get_cpds('coin')
    print(learned_cpd)
def create_bayes_net(file, keep_atts, edges):
    """
    Fit a Bayesian network on selected columns of a CSV file.

    :param file: path of the CSV file
    :param keep_atts: columns to keep; each becomes a node
    :param edges: edge list defining the structure
    :return: the fitted BayesianModel
    """
    attributes = pd.read_csv(file)[keep_atts]

    network = BayesianModel()
    network.add_nodes_from(attributes.columns)
    # defining the structure of edges
    network.add_edges_from(edges)
    # fit estimates the CPD tables for the given structure
    network.fit(attributes)
    return network
class TestBayesianModelFitPredict(unittest.TestCase):
    """Fit/predict tests on a disconnected and a connected five-node model."""

    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])

        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'),
                                              ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        """Each node's CPD must equal the empirical state frequencies."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # Plain column selection replaces DataFrame.ix, which was
            # removed in pandas 1.0.
            counts = values[node].value_counts()
            value = counts / counts.sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        """predict() rejects data with no missing variable and otherwise
        returns the expected values for 'E' under seed 42."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([
                1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
            ]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
class TestBayesianModelFitPredict(unittest.TestCase):
    """Fit/predict tests on a disconnected and a connected five-node model."""

    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])

        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'),
                                              ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        """Each node's CPD must equal the empirical state frequencies."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # Plain column selection replaces DataFrame.ix, which was
            # removed in pandas 1.0.
            counts = values[node].value_counts()
            value = counts / counts.sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        """predict() rejects data with no missing variable and otherwise
        returns the expected values for 'E' under seed 42."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([
                1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
            ]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def pgmpy_test2():
    """Fit the student network on random binary data with
    MaximumLikelihoodEstimator and print every learned CPD."""
    # example from https://github.com/pgmpy/pgmpy/blob/dev/examples/Learning%20from%20data.ipynb
    # Generating random data where each variable has 2 states with equal
    # probability for each state
    column_names = ['D', 'I', 'G', 'L', 'S']
    samples = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(samples, columns=column_names)

    structure = [('D', 'G'), ('I', 'G'), ('I', 'S'), ('G', 'L')]
    student = BayesianModel(structure)

    # Learning CPDs using Maximum Likelihood Estimators
    student.fit(data, estimator=MaximumLikelihoodEstimator)
    for learned in student.get_cpds():
        print("CPD of {variable}:".format(variable=learned.variable))
        print(learned)
def create_model_and_inference():
    """
    Build a Bayesian network from 'dependencies.csv', fit it on random
    binary data and return it together with an inference engine.

    Edges are collected by walking the dependency table from
    'myproximus-usage' and are then reversed. Each node gets 11 random 0/1
    observations. Edges rejected by the model are reported and skipped.

    :return: tuple ``(model, VariableElimination(model))``
    """
    dep_df = pd.read_csv('dependencies.csv', sep=';')

    def connect(df, source, edgelist):
        """Recursively append (source, target) edges reachable from ``source``."""
        source_df = df[df['Column2'] == source]
        # Values from the fourth column onwards of the first matching row are
        # looked up in 'Column1' to find the target's name.
        for col in source_df.iloc[0, 3:]:
            target_df = df[df['Column1'] == col]['Column2']
            if not target_df.empty:
                target = target_df.item()
                # Skip the edge if its reverse is already present.
                if (target, source) not in edgelist:
                    edgelist.append((source, target))
                    connect(df, target, edgelist)

    edges = []
    connect(dep_df, 'myproximus-usage', edges)
    # Reverse every collected edge.
    edges = [(t[1], t[0]) for t in edges]
    nodes = set(itertools.chain.from_iterable(edges))

    nodes_df = dep_df.iloc[:, 1].to_frame()
    nodes_df = nodes_df[nodes_df['Column2'].isin(nodes)]
    # Eleven columns '0' .. '10' of random binary observations.
    # NOTE(review): size=64 is hard-coded; presumably the row count of the
    # dependency table - confirm.
    for sample_id in range(11):
        nodes_df[str(sample_id)] = pd.DataFrame(
            data=np.random.randint(0, 2, size=64).T)
    # After the transpose, rows are samples and columns are node names.
    nodes_df = nodes_df.set_index('Column2').transpose()

    model = BayesianModel()
    model.add_nodes_from(nodes)
    for edge in edges:
        try:
            model.add_edge(edge[0], edge[1])
        except ValueError:
            # pgmpy rejects edges that would create a loop with ValueError.
            print('WARNING: tried to add edge which forms loop: ' + str(edge))

    model.fit(nodes_df, estimator=BayesianEstimator, prior_type="BDeu")

    draw_network(model.nodes(), model.edges(), {}, [])
    return model, VariableElimination(model)
def fully_connected_model(nodes=None):
    """
    Fit a network that connects every hypothesis node to every observation
    or motor node.

    A node counts as a hypothesis when its name contains 'hypo', and as an
    edge target when its name contains 'obs' or 'motor'.

    :param nodes: node names; a default set is used when empty or None
    :return: the BayesianModel fitted on TRAINING_DATA with a BDeu prior
    """
    if not nodes:
        nodes = [BOREDOM, DESIRE, MOBILE, MOTOR_HYPO, LEFT_ARM]

    network = BayesianModel()
    network.add_nodes_from(nodes)

    sources = [name for name in nodes if 'hypo' in name]
    sinks = [name for name in nodes if 'obs' in name or 'motor' in name]
    for hypo in sources:
        for obs in sinks:
            network.add_edge(u=hypo, v=obs)

    network.fit(TRAINING_DATA, estimator=BayesianEstimator, prior_type="BDeu")
    return network
def create_bayes_net():
    """
    Fit a hand-specified Bayesian network on the KEEP_ATTS columns of the
    CelebA attribute list.

    :return: the fitted BayesianModel
    """
    structure = [
        ('Young', 'Eyeglasses'),
        ('Young', 'Bald'),
        ('Young', 'Mustache'),
        ('Male', 'Mustache'),
        ('Male', 'Smiling'),
        ('Male', 'Wearing_Lipstick'),
        ('Young', 'Mouth_Slightly_Open'),
        ('Young', 'Narrow_Eyes'),
        ('Male', 'Narrow_Eyes'),
        ('Smiling', 'Narrow_Eyes'),
        ('Smiling', 'Mouth_Slightly_Open'),
        ('Young', 'Smiling'),
    ]

    attributes = pd.read_csv('../../data/list_attr_celeba.csv')[KEEP_ATTS]

    network = BayesianModel()
    network.add_nodes_from(attributes.columns)
    network.add_edges_from(structure)
    network.fit(attributes)
    return network
def generate_approx_model_from_graph(ebunch, nodes, df):
    """
    Learn a pgmpy Bayesian model from the data in a pandas dataframe.

    The rows are shuffled first. Every variable appearing in ``ebunch`` or
    ``nodes`` is declared with the states [0, 1].

    :param ebunch: edge list of the graph
    :param nodes: additional nodes to add to the model
    :param df: training data
    :return: the model fitted with SmoothedMaximumLikelihoodEstimator
    """
    shuffled = df.sample(frac=1)

    approx_model = BayesianModel(ebunch)
    approx_model.add_nodes_from(nodes)

    state_names = {}
    for edge in ebunch:
        for endpoint in (edge[0], edge[1]):
            state_names[endpoint] = [0, 1]
    state_names.update((name, [0, 1]) for name in nodes)

    approx_model.fit(shuffled,
                     state_names=state_names,
                     estimator=SmoothedMaximumLikelihoodEstimator)
    return approx_model
def kNN(k):
    """
    For every test sample, fit a Bayesian model on its k nearest training
    samples, predict a label, and print the fraction of correct predictions.

    NOTE(review): this block is Python 2 code (print statement, list-returning
    map, DataFrame.ix) and contains empty subscripts and empty column/edge
    lists that look like removed placeholders; it does not parse as written.

    :param k: number of nearest training samples used per test sample
    """
    fileName = '';
    dataMat, dataLab = file2matrix(fileName, 9);
    # Train/test split; the slice bounds are missing in the source.
    trainMat = dataMat[];
    trainLab = np.array(dataLab[]);
    testMat = dataMat[];
    testLab = np.array(dataLab[]);
    # Weight of the first distance; with coef = 1 the second distance has
    # weight 0.
    coef = 1;
    distPos = np.zeros((testMat.shape[0], trainMat.shape[0]));
    distTim = np.zeros((testMat.shape[0], trainMat.shape[0]));
    # Pairwise distances between every test and every training sample.
    for i in range(testMat.shape[0]):
        for j in range(trainMat.shape[0]):
            distPos[i,j] = distSLC(testMat[i], trainMat[j]);
            distTim[i,j] = disTim(testMat[i], trainMat[j]);
    distPosNor = dataNorm(distPos);
    distTimNor = dataNorm(distTim);
    distAll = distPosNor*coef + distTimNor*(1-coef);
    # Per test row: training indices ordered by increasing combined distance.
    distIndex = distAll.argsort();
    # Columns 2..5 of the test matrix, converted to int, are the model input.
    testI = np.zeros((testMat.shape[0], 4), dtype='int32');
    count = 0;
    for i in testMat[:, 2:6]:
        testI[count,:] = map(int, i);
        count += 1;
    testInput = pd.DataFrame(testI, columns=[]);
    # The k nearest training rows for each test sample.
    trainMatK = trainMat[distIndex[:,0:k]];
    labelPre = [];
    for i in range(len(trainMatK)):
        num = 0;
        # Columns 2..5 and 8 of the neighbours, converted to int.
        trainI = np.zeros((trainMatK[0].shape[0], 5), dtype='int32');
        for j in trainMatK[i][:, [2,3,4,5,8]]:
            trainI[num, :] = map(int, j);
            num += 1;
        trainFraK = pd.DataFrame(trainI,columns=[]);
        trainInput = trainFraK[[]];
        # One model per test sample, fitted on that sample's neighbours only.
        model = BayesianModel([(),(),(), ()]);
        model.fit(trainInput);
        a = pd.DataFrame([testInput.ix[i].values.tolist()], columns=[]);
        labelPre.append(model.predict(a).values[0][0]);
    # for i in range(len(testLakK)):
    #     labels = testLakK[i];
    #     labelPre.append(getLabel(labels));
    # Count the predictions that match the true test labels.
    count = 0;
    #print labelPre;
    for i in range(len(labelPre)):
        if labelPre[i]==testLab[i]:
            count += 1;
    print '准确度:', float(count)/len(testLab);
def BNForOneSong(self, DAG, db_file, results_file, songFile):
    """
    Fit a model on ``db_file``, predict for the rows of ``songFile`` and
    write the predictions to ``results_file`` as CSV.

    Both files are reduced to the DAG's columns through
    ``self.take_only_relevant_features``.

    :param DAG: list of (parent, child) column-name pairs
    :param db_file: training data file
    :param results_file: path the predictions are written to
    :param songFile: file with the song(s) to predict
    :return: the 'song_popularity' prediction at index label 0
    """
    data = self.take_only_relevant_features(DAG, db_file)
    dataToPredictRF = self.take_only_relevant_features(DAG, songFile)

    model = BayesianModel(DAG)
    model.fit(data, BayesianEstimator)

    y_pred = model.predict(dataToPredictRF)
    with open(results_file, 'w', newline='') as file:
        y_pred.to_csv(file)
    return y_pred['song_popularity'][0]
def BN(DAG):
    """
    Fit a Bayesian network on the first 15068 rows and predict
    'song_popularity' for rows 15068..16951.

    The predictions are printed and written to 'predicted_results.csv'.

    :param DAG: list of (parent, child) column-name pairs
    """
    data = take_only_relevant_features(DAG)
    training_data = data[:15068]
    # Drop the target into a new frame; the slice of `data` is left untouched.
    predict_data = data[15068:16952].drop('song_popularity', axis=1)

    model = BayesianModel(DAG)
    # Fit on the training rows only, so the rows being predicted are not
    # part of the training data.
    model.fit(training_data, BayesianEstimator)

    y_pred = model.predict(predict_data)
    print(y_pred)
    with open('predicted_results.csv', 'w', newline='') as file:
        y_pred.to_csv(file)
def init(df, miss_node):
    """
    Fill the missing values of ``miss_node`` with random guesses and fit the
    network on the completed data.

    Missing entries are replaced by values drawn uniformly from {0, 1, 2}.
    The input frame is not modified; when nothing is missing it is used
    as is.

    :param df: dataframe with columns for D, I, E, G and L
    :param miss_node: name of the column that may contain missing values
    :return: BayesianModel fitted with MaximumLikelihoodEstimator
    """
    # rows where the node is missing
    missing_mask = df[miss_node].isnull()
    miss_size = int(missing_mask.sum())

    # random guess missing values
    if miss_size == 0:
        df_complete = df
    else:
        init_vals = np.random.choice(3, size=miss_size)
        df_complete = df.copy()
        # Single .loc assignment; chained df[col][rows] = ... may write to a
        # temporary copy instead of the frame.
        df_complete.loc[missing_mask, miss_node] = init_vals

    # assume complete data, estimate parameters using MLE
    bn_model = BayesianModel([('D', 'G'), ('I', 'G'), ('E', 'L'), ('G', 'L')])
    bn_model.fit(df_complete, estimator=MaximumLikelihoodEstimator)
    return bn_model
def naiveModel():
    """
    Build and evaluate a network in which 'f1' .. 'f9' are all parents of
    the hypothesis node 'h'.

    The model is fitted on the training split with a BDeu prior and handed to
    ``evaluateModel`` together with the testing split.
    """
    trainingData, testingData = differenceBetweenFeatures(True)

    # create model: every feature points at the hypothesis node.
    # (An earlier variant used 'f10' as the parent of f1..f9 instead.)
    feature_names = ['f%d' % index for index in range(1, 10)]
    model = BayesianModel([(feature, 'h') for feature in feature_names])

    # fit model and data, compute CPDs
    model.fit(trainingData, estimator=BayesianEstimator, prior_type='BDeu')

    # compute the probability of the hypothesis given the evidence
    evaluateModel(model, testingData, 'h', featuresLabelList)
def createBayesGraph(graph_list,mapping,data):
    '''
    Create the Bayesian network graph and its CPD tables.

    graph_list - edges as "parent,child" strings
    mapping    - node names; mapping[i] names column i of data
    data       - 2-D array indexed as data[:, i]

    Returns:
    bayes_model - the fitted bayes model
    cpds_array - one dict per CPD, mapping str(list(cpd.variables)) to the
                 rows of numbers parsed from the CPD's printed table
    categories_each_element - state names of each variable
    '''
    bayes_model = BayesianModel()
    bayes_model.add_nodes_from(list(mapping))
    for edge_spec in graph_list:
        endpoints = edge_spec.split(',')
        bayes_model.add_edge(endpoints[0], endpoints[1])

    columns = {mapping[col]: data[:, col] for col in range(0, len(mapping))}
    bayes_model.fit(pandas.DataFrame(data=columns))

    cpds_array = []
    categories_each_element = {}
    # Build the array that is returned to the client
    for cpd in bayes_model.get_cpds():
        for name in cpd.state_names:
            categories_each_element[name] = cpd.state_names[name]

        # The numbers are recovered from the printed table: cells are split
        # on '|', and a cell containing "-+" closes the current row if that
        # row collected at least one number.
        table_rows = []
        current_row = []
        row_has_numbers = False
        for cell in str(cpd).split('|'):
            if is_number(cell):
                current_row.append(float(cell.strip()))
                row_has_numbers = True
            elif "-+" in cell and row_has_numbers:
                table_rows.append(current_row)
                current_row = []
                row_has_numbers = False

        cpds_array.append({str(list(cpd.variables)): table_rows})

    return(bayes_model,cpds_array,categories_each_element)
def create_bayes_net():
    """
    Fit a hand-specified Bayesian network on the KEEP_ATTS columns of the
    CelebA attribute list.

    :return: the fitted BayesianModel
    """
    # can't automate this part
    # defining the structure of edges
    structure = [
        ('Young', 'Eyeglasses'),
        ('Young', 'Bald'),
        ('Young', 'Mustache'),
        ('Male', 'Mustache'),
        ('Male', 'Smiling'),
        ('Male', 'Wearing_Lipstick'),
        ('Young', 'Mouth_Slightly_Open'),
        ('Young', 'Narrow_Eyes'),
        ('Male', 'Narrow_Eyes'),
        ('Smiling', 'Narrow_Eyes'),
        ('Smiling', 'Mouth_Slightly_Open'),
        ('Young', 'Smiling'),
    ]

    attributes = pd.read_csv('./data/list_attr_celeba.csv')[KEEP_ATTS]

    network = BayesianModel()
    network.add_nodes_from(attributes.columns)
    network.add_edges_from(structure)
    # fit estimates the CPD tables for the given structure
    network.fit(attributes)
    return network
def test_predict(self):
    """Fit Sex/Pclass -> Survived on rows 500+ and check the prediction of
    each variable when it is the one left out of the first 30 rows."""
    titanic = BayesianModel()
    titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
    titanic.fit(self.titanic_data2[500:])

    # Each column pair is the evidence; the remaining variable is predicted.
    evidence_columns = (
        ["Sex", "Pclass"],
        ["Survived", "Pclass"],
        ["Survived", "Sex"],
    )
    predictions = [
        titanic.predict(self.titanic_data2[columns][:30])
        for columns in evidence_columns
    ]

    expected = [
        np.array([
            '0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
            '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
            '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
        ]),
        np.array([
            'male', 'female', 'female', 'female', 'male',
            'male', 'male', 'male', 'female', 'female',
            'female', 'female', 'male', 'male', 'male',
            'female', 'male', 'female', 'male', 'female',
            'male', 'female', 'female', 'female', 'male',
            'female', 'male', 'male', 'female', 'male',
        ]),
        np.array([
            '3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
            '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
            '3', '1', '1', '1', '3', '1', '3', '3', '1', '3',
        ]),
    ]

    for predicted, wanted in zip(predictions, expected):
        np_test.assert_array_equal(predicted.values.ravel(), wanted)
def setup(self):
    """Fit a five-node network on random binary data and store a
    VariableElimination engine for it on ``self.inference``."""
    column_names = ['A', 'B', 'C', 'D', 'E']
    samples = np.random.randint(low=0, high=2, size=(1000, 5))
    values = pd.DataFrame(samples, columns=column_names)

    structure = [('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')]
    network = BayesianModel(structure)
    network.fit(values)

    self.inference = VariableElimination(network)
class TestBayesianModelFitPredict(unittest.TestCase):
    """Tests for ``BayesianModel.fit``, ``predict`` and ``predict_probability``."""

    def setUp(self):
        """Create the models and data frames shared by the tests."""
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
        # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
        self.data2 = pd.DataFrame(data={'A': [0, np.nan, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.nan],
                                        'D': [np.nan, 'Y', np.nan]})

        # data_link - "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv', dtype=str)
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]

    def test_bayesian_fit(self):
        """Fitting with a dirichlet prior yields the expected CPD for 'B'."""
        self.model2.fit(self.data1, estimator=BayesianEstimator, prior_type="dirichlet", pseudo_counts=[9, 3])
        self.assertEqual(self.model2.get_cpds('B'), TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        """Fitting on data containing NaN uses the incomplete rows as well."""
        self.model2.fit(self.data2, state_names={'C': [0, 1]}, complete_samples_only=False)
        cpds = {
            TabularCPD('A', 2, [[0.5], [0.5]]),
            TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
            TabularCPD('C', 2, [[0, 0.5, 0.5, 0.5], [1, 0.5, 0.5, 0.5]],
                       evidence=['A', 'B'], evidence_card=[2, 2]),
        }
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        """Each CPD of the edge-less model equals the column's relative frequencies."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
            counts = values.loc[:, node].value_counts()
            value = counts / counts.sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_predict(self):
        """Predict each Titanic variable from the other two for the first 30 rows."""
        titanic = BayesianModel()
        titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        titanic.fit(self.titanic_data2[500:])

        p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])

        p1_res = np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
                           '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
                           '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'])
        p2_res = np.array(['male', 'female', 'female', 'female', 'male',
                           'male', 'male', 'male', 'female', 'female',
                           'female', 'female', 'male', 'male', 'male',
                           'female', 'male', 'female', 'male', 'female',
                           'male', 'female', 'female', 'female', 'male',
                           'female', 'male', 'male', 'female', 'male'])
        p3_res = np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
                           '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
                           '3', '1', '1', '1', '3', '1', '3', '3', '1', '3'])

        np_test.assert_array_equal(p1.values.ravel(), p1_res)
        np_test.assert_array_equal(p2.values.ravel(), p2_res)
        np_test.assert_array_equal(p3.values.ravel(), p3_res)

    def test_connected_predict(self):
        """predict rejects fully observed rows and fills in 'E' once it is dropped."""
        np.random.seed(42)
        values = pd.DataFrame(np.array(np.random.randint(low=0, high=2, size=(1000, 5)), dtype=str),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)

        # Nothing is left to predict while every model variable is present.
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)

        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                      0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                      0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                      0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                      1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                      1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                      1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                      1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                      1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0],
                     dtype=str))

    def test_connected_predict_probability(self):
        """predict_probability returns the expected distribution over 'E'."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:80]
        predict_data = values[80:].copy()
        self.model_connected.fit(fit_data)

        predict_data.drop('E', axis=1, inplace=True)
        e_prob = self.model_connected.predict_probability(predict_data)
        np_test.assert_allclose(
            e_prob.values.ravel(),
            np.array([0.57894737, 0.42105263, 0.57894737, 0.42105263,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.57894737, 0.42105263,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.57894737, 0.42105263,
                      0.5, 0.5, 0.57894737, 0.42105263,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.5, 0.5, 0.5, 0.5]),
            atol=0)

    def test_predict_probability_errors(self):
        """predict_probability raises ValueError for unusable input frames."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(2, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:1]
        predict_data = values[1:].copy()
        self.model_connected.fit(fit_data)

        # All model variables are present, so there is nothing to predict.
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)

        # 'F' is not a variable of the fitted model.
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1, 5)),
                                    columns=['A', 'B', 'C', 'F', 'E'])[:]
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# Generating some random data
raw_data = np.random.randint(low=0, high=2, size=(1000, 6))
print(raw_data)
data = pd.DataFrame(raw_data, columns=['A', 'R', 'J', 'G', 'L', 'Q'])

# Creating the network structure and learning its parameters
student_model = BayesianModel([('A', 'J'), ('R', 'J'), ('J', 'Q'), ('J', 'L'), ('G', 'L')])
student_model.fit(data, estimator=BayesianEstimator)

# get_cpds(node) only accepts nodes of the network (A, R, J, G, L, Q);
# asking for any other name such as 'D' raises ValueError. Print the CPD
# learned for every node instead.
for cpd in student_model.get_cpds():
    print(cpd)
import numpy as np
import pandas as pd
from pgmpy.inference import VariableElimination
from pgmpy.models import BayesianModel

data = pd.read_csv('~/Documents/unifiedMLData.csv')
# print data

# Only occupation -> rating is modelled. Candidate edges currently disabled:
#   ('gender', 'rating'), ('age', 'rating'), ('age', 'occupation'),
#   ('gender', 'occupation'), ('genre', 'movie_title'), ('movie_title', 'rating')
movie_model = BayesianModel([
    ('occupation', 'rating'),
])
movie_model.fit(data)

model_infer = VariableElimination(movie_model)
# query takes a list of variable names, not a bare string.
results = model_infer.query(['rating'])
# NOTE(review): indexing the result by variable name assumes the pgmpy
# release in use returns a dict from query - confirm against the installed
# version.
print(results['rating'])
# print(movie_model.get_cpds('rating'))
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# Generating random data for two coin tossing examples
raw_data = np.random.randint(low=0, high=2, size=(1000, 2))
data = pd.DataFrame(raw_data, columns=['X', 'Y'])
print(data)

# Two independent coins: both variables are nodes, with no edge between them.
# The nodes are registered explicitly so that fit has variables to estimate
# CPDs for; a model without nodes has nothing to learn.
coin_model = BayesianModel()
coin_model.add_nodes_from(['X', 'Y'])
coin_model.fit(data, estimator=BayesianEstimator)

# These bare expressions only display a value in an interactive session.
coin_model.get_cpds()
coin_model.nodes()
coin_model.edges()
# NOTE(review): the statements down to `counter += 1` presumably belong to a
# plotting loop whose header is outside this excerpt - re-indent to match the
# enclosing loop when merging.
# A single colour code is used for all bars; the repeated string 'rrrr' is not
# a valid colour specification in recent Matplotlib releases.
ax_temp.bar(x, z, zs=y, zdir='y', alpha=0.6, color='r')
ax_temp.set_xlabel('X')
ax_temp.set_ylabel('Y')
ax_temp.set_zlabel('Z')
ax_temp.title.set_text(('Feature ' + str(mean_indices[counter])))
counter += 1
plt.show()

# Learning naive bayes model from various subsets of data
naive_bayes_with_some_features(all_city_data, all_city_label,
                               feature_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 4])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5])

# Splitting train and test data for PGM model
temp_data = pd.concat([all_city_data, pd.DataFrame(all_city_label, columns=[13])], axis=1)
# Positional slices keep the two sets disjoint. Label-based .loc slices include
# both end points, so .loc[0:700] and .loc[700:] would share row 700.
# NOTE(review): assumes the first 700 rows are the intended training set.
pgm_train_set = temp_data.iloc[:700]
pgm_test_set = temp_data.iloc[700:]
print(pgm_train_set)

# Implementing PGM model on data
# Using these features: 0: (age) 1: (sex) 2: (cp)
pgm_model = BayesianModel()
pgm_model.add_nodes_from([0, 1, 2, 13])
pgm_model.add_edges_from([(1, 13)])
pgm_model.fit(pgm_train_set.loc[:, [0, 1, 2, 13]])

# Column 13 holds the label, so it is removed from the test features.
pgm_test_set = pgm_test_set.loc[:, [0, 1, 2, 13]].drop(13, axis=1)
print(pgm_test_set)
print(pgm_model.get_cpds(13))
# In most machine learning problems it does not matter which column of the
# array holds which variable, as long as training and prediction use the same
# column order. In a graphical model each variable is connected to the others
# in its own way, so every column of the data has to be tied to a named
# variable. pandas is used for that labelling.
import pandas as pd

data = pd.DataFrame(
    data=data,
    columns=['cost', 'quality', 'location', 'no_of_people'],
)
data

# The first 750 rows are used for training.
train = data.iloc[:750]

# no_of_people is the value to predict, so that column is removed from the
# remaining rows and predicted from the model later on.
test = data.iloc[750:].drop('no_of_people', axis=1)
test

# Base network structure for the model.
restaurant_model = BayesianModel([
    ('location', 'cost'),
    ('quality', 'cost'),
    ('location', 'no_of_people'),
    ('cost', 'no_of_people'),
])

# fit computes the CPD of every variable from the training data.
restaurant_model.fit(train)
restaurant_model.get_cpds()

# Predict no_of_people for the test rows by calling predict on them.
restaurant_model.predict(test).values.ravel()
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator

# Random binary data: 100 rows, one column per coin
raw_data = np.random.randint(low=0, high=2, size=(100, 2))
print(raw_data)

data = pd.DataFrame(data=raw_data, columns=['X', 'Y'])
print(data)

# Two-coin model in which Y depends on X
coin_model = BayesianModel([('X', 'Y')])
coin_model.fit(data, estimator=MaximumLikelihoodEstimator)

# Show the CPD learned for X
cpd_x = coin_model.get_cpds('X')
print(cpd_x)