def bayesnet_examples():
    """Demonstrate two ways of parameterising the "student" network.

    The model is first fitted to random binary data and asked to predict
    the column withheld from a held-out split; afterwards hand-written
    ``TabularCPD`` tables are attached to the same model.

    The function returns nothing.  ``np`` is expected to be importable at
    module level; the pgmpy and pandas names are imported locally.
    """
    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import pandas as pd

    student_model = BayesianModel([('D', 'G'), ('I', 'G'), ('G', 'L'), ('I', 'S')])

    # we can generate some random data.
    raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S'])

    # First 75% of the rows train the model, the remainder is used to predict.
    split_at = int(data.shape[0] * 0.75)
    data_train = data[:split_at]
    student_model.fit(data_train)
    student_model.get_cpds()

    # Use the non-mutating drop(): dropping in place on a slice of ``data``
    # is the chained-assignment pattern pandas warns about
    # (SettingWithCopyWarning).
    data_test = data[split_at:].drop('D', axis=1)
    student_model.predict(data_test)

    # Hand-specified CPDs for the same structure.
    grade_cpd = TabularCPD(
        variable='G',
        variable_card=3,
        values=[[0.3, 0.05, 0.9, 0.5],
                [0.4, 0.25, 0.08, 0.3],
                [0.3, 0.7, 0.02, 0.2]],
        evidence=['I', 'D'],
        evidence_card=[2, 2])
    difficulty_cpd = TabularCPD(
        variable='D',
        variable_card=2,
        values=[[0.6, 0.4]])
    intel_cpd = TabularCPD(
        variable='I',
        variable_card=2,
        values=[[0.7, 0.3]])
    letter_cpd = TabularCPD(
        variable='L',
        variable_card=2,
        values=[[0.1, 0.4, 0.99],
                [0.9, 0.6, 0.01]],
        evidence=['G'],
        evidence_card=[3])
    sat_cpd = TabularCPD(
        variable='S',
        variable_card=2,
        values=[[0.95, 0.2],
                [0.05, 0.8]],
        evidence=['I'],
        evidence_card=[2])
    student_model.add_cpds(grade_cpd, difficulty_cpd, intel_cpd, letter_cpd, sat_cpd)
def parents_instantiated(self, cid: BayesianModel) -> bool:
    """Tell whether every parent in ``self.evidence`` has been instantiated.

    A parent counts as instantiated when ``cid.get_cpds(parent)`` is truthy
    and the returned object carries a ``state_names`` attribute.  This is a
    pre-condition for instantiating ``self``.
    """
    def _is_ready(parent):
        parent_cpd = cid.get_cpds(parent)
        return bool(parent_cpd) and hasattr(parent_cpd, 'state_names')

    # all() stops at the first parent that is not ready.
    return all(_is_ready(parent) for parent in self.evidence)
def to_dynamic_cpd(
    static_model: BayesianModel,
    stat_to_dyn_map: typing.Dict[str, str],
    next_to_curr_map: typing.Dict[str, str],
) -> typing.List[TabularCPD]:
    """Rebuild every CPD of ``static_model`` with time-sliced node names.

    Args:
        static_model: model whose CPDs are converted.
        stat_to_dyn_map: maps a static node name to its dynamic node name.
        next_to_curr_map: maps a static node name that does not end in
            ``"_T"`` to the static name used for the ``stat_to_dyn_map``
            lookup.

    Returns:
        One ``TabularCPD`` per CPD of ``static_model``, in the order
        returned by ``static_model.get_cpds()``.  (The annotation used to
        claim a single ``TabularCPD`` although a list was always returned.)
    """
    def get_dynamic_node(node):
        # Names ending in "_T" become slice 0; every other name is first
        # translated through next_to_curr_map and placed in slice 1.
        if node.endswith("_T"):
            return (stat_to_dyn_map[node], 0)
        return (stat_to_dyn_map[next_to_curr_map[node]], 1)

    # Extract information about CPDs of the static model
    cpds_info = []
    for cpd in static_model.get_cpds():
        evidence = cpd.get_evidence()
        has_evidence = len(evidence) > 0
        cpds_info.append({
            "variable": get_dynamic_node(cpd.variable),
            # NOTE(review): cardinality is hard-coded to 4 for the variable
            # and for every evidence node -- confirm all nodes have 4 states.
            "variable_card": 4,
            "values": cpd.get_values(),
            # Evidence nodes always live in slice 0 and are listed reversed.
            "evidence": ([(stat_to_dyn_map[e], 0) for e in evidence][::-1]
                         if has_evidence else None),
            "evidence_card": [4] * len(evidence) if has_evidence else None,
            "state_names": {get_dynamic_node(k): v
                            for k, v in cpd.state_names.items()},
        })

    return [TabularCPD(**cpd_info) for cpd_info in cpds_info]
def pgmpyToGrid(model: BayesianModel, queryNode: Name, shorten: bool = True) -> Grid:
    '''
    Build a list-of-lists (grid) from the CPD of ``queryNode``.

    Each row holds one combination of states of the conditioning variables
    followed by the probabilities of the query variable for that
    combination.  With ``shorten`` set and more than 15 rows, only the first
    10 rows, a row of '...' and the last 5 rows are kept.
    '''
    queryCpd = model.get_cpds(queryNode)

    # state_names maps 'var' -> [states]; the first entry is skipped so that
    # only the conditioning (evidence) variables remain.
    evidenceStates = list(queryCpd.state_names.values())[1:]

    # Cartesian product of the evidence states, e.g. (Dumb, Easy),
    # (Dumb, Hard), (Intelligent, Easy), (Intelligent, Hard) ...
    rowLabels = itertools.product(*evidenceStates)

    # Transposed so every row of the array matches one evidence combination.
    probabilityRows = queryCpd.get_values().T

    grid = [[*labels, *probs] for labels, probs in zip(rowLabels, probabilityRows)]

    # Never insert the dots when there are 15 rows or fewer.
    if not shorten or len(grid) <= 15:
        return grid

    TOP_ROWS, BOTTOM_ROWS = 10, 5
    filler = ['...'] * len(grid[0])
    return grid[:TOP_ROWS] + [filler] + grid[-BOTTOM_ROWS:]
def parent_values(self, cid: BayesianModel) -> List[List]:
    """Collect, per parent, the list of values that parent can take.

    The values are read from each parent CPD's ``state_names`` entry for
    that parent.  All parents must already be instantiated.
    """
    assert self.parents_instantiated(cid)

    # Lazily pair each parent with its CPD so lookups happen one at a time.
    parents_with_cpds = ((parent, cid.get_cpds(parent)) for parent in self.evidence)
    return [
        parent_cpd.state_names[parent]
        for parent, parent_cpd in parents_with_cpds
        if parent_cpd and hasattr(parent_cpd, 'state_names')
    ]
def conditionalDistDf(model: BayesianModel, query: RandomVariable) -> DataFrame:
    '''
    Look up the CPD stored in ``model`` for ``query.var`` and hand it to
    ``tabularDf`` to obtain its DataFrame form.
    '''
    return tabularDf(cpd=model.get_cpds(query.var))
class BayesNetwork:
    """Wrapper that builds, fits and inspects a ``BayesianModel`` for a dataset.

    ``dataset`` must expose a ``dataframe`` attribute; its ``columns`` are
    used to translate the index pairs in ``graph_structure_index`` into
    named edges.
    """

    def __init__(self, dataset, graph_structure_index):
        self.dataset = dataset
        self.columns = dataset.dataframe.columns
        # Iterable of (parent_index, child_index) pairs into self.columns.
        self.graph_structure_index = graph_structure_index

    def build_graph(self):
        """Create ``self.model`` from the index pairs given at construction."""
        graph_structure_name = [
            (self.columns[edge[0]], self.columns[edge[1]])
            for edge in self.graph_structure_index
        ]
        self.model = BayesianModel(graph_structure_name)

    def draw_graph(self):
        """Delegate drawing of ``self.model`` to ``Drawer.draw_graph``."""
        Drawer.draw_graph(self.model)

    def fit_model(self, prior=False, prior_data=None):
        """Fit the CPDs of ``self.model``.

        Args:
            prior: when true, fitting with prior pseudo counts is requested;
                this path is not implemented.
            prior_data: currently unused.

        Raises:
            NotImplementedError: if ``prior`` is true.
        """
        if prior:
            # Pseudo counts sketched for a future prior-based fit:
            #   'D': [300, 700], 'I': [500, 500], 'G': [800, 200],
            #   'L': [500, 500], 'S': [400, 600]
            # The original built them as ``{{...}}`` (a set containing a
            # dict), which raised TypeError before NotImplementedError
            # could be reached.
            raise NotImplementedError
        else:
            # The slice [0:-3] is passed to fit(), i.e. the last three
            # entries of the dataframe are left out.
            self.model.fit(self.dataset.dataframe[0:-3],
                           estimator=MaximumLikelihoodEstimator)

    def inference(self, name):
        """Run variable elimination for ``name`` and print the result."""
        from pgmpy.inference import VariableElimination
        self.infer = VariableElimination(self.model)
        q = self.infer.query(variables=[name])
        print(q[name])

    def evaluate_result(self):
        """Print every CPD and hand two-dimensional tables to ``Drawer``."""
        for cpd in self.model.get_cpds():
            print("CPD of {variable}:".format(variable=cpd.variable))
            print(cpd)
            accept_node = cpd.variables[0]
            # More than three dimensions: nothing is drawn.
            if len(cpd.values.shape) > 3:
                pass
                # Drawer.draw_3D(cpd.values, x_label=cpd.variables[1],
                #                y_label=cpd.variables[2], z_label=cpd.variables[3])
            # Two dimensions: one conditioning variable and the node itself.
            elif len(cpd.values.shape) == 2:
                title = cpd.variables[1] + '----->' + accept_node
                Drawer(title=title, is_show=False, is_save=False,
                       save_path='img/' + title + '.jpg').draw_matrix(cpd.values)
def pgmpy_test():
    """Fit a one-node model to 100 coin flips and print the learned CPD."""
    # Heads are encoded as 0 (30 flips) and tails as 1 (70 flips).
    heads, tails = [0] * 30, [1] * 70
    flips = pd.DataFrame(np.array(heads + tails), columns=['coin'])
    print(flips)

    coin_model = BayesianModel()
    coin_model.add_node('coin')

    # Maximum likelihood estimation of the single CPD from the data.
    coin_model.fit(flips, estimator=MaximumLikelihoodEstimator)
    print(coin_model.get_cpds('coin'))
def create_network(models, processes, files):
    """Build one failure-tree ``BayesianModel`` per process.

    For each of the first ``files`` entries of ``processes`` a model is
    built from the process' errors, their causes and their effects, then
    appended to ``models``, rendered to a PNG and its CPDs are printed.
    Each cause's occurrence probability is overwritten with its share of
    the error's total cause probability.
    """
    for proc_idx in range(files):
        process = processes[proc_idx]
        network = BayesianModel()

        for err_idx in range(len(process.get_errors())):
            error = process.get_error(err_idx)

            # Causes: cause -> error edge plus a two-state prior per cause.
            for cause_idx in range(len(error.get_causes())):
                cause = error.get_cause(cause_idx)
                share = cause.get_occ_prob() / error.get_total_cause_prob()
                cause.set_occ_prob(share)
                network.add_nodes_from([cause, error])
                network.add_edge(cause, error)
                network.add_cpds(
                    TabularCPD(variable=cause, variable_card=2,
                               values=[[share, 1 - share]]))

            # The error itself is conditioned on all of its causes.
            network.add_cpds(
                TabularCPD(
                    variable=error,
                    variable_card=2,
                    values=get_initial_error_cpd(len(error.get_causes())),
                    evidence=error.get_causes(),
                    evidence_card=[2] * (len(error.get_causes()))))

            # Effects: error -> effect edges only, no CPDs.
            for effect_idx in range(len(error.get_effects())):
                effect = error.get_effect(effect_idx)
                network.add_nodes_from([error, effect])
                network.add_edge(error, effect)

        models.append(network)

        # plotting Failure Tree
        # NOTE(review): reading models[proc_idx] back presumes ``models``
        # was empty on entry -- confirm with callers.
        dot = to_pydot(models[proc_idx])
        with open('failure_tree_graph_%s.png' % process, 'wb') as png_file:
            png_file.write(dot.create_png())

        # Sample output of CPDs for causes and errors
        for err_idx in range(len(process.get_errors())):
            error = process.get_error(err_idx)
            for cause_idx in range(len(error.get_causes())):
                print(network.get_cpds(error.get_cause(cause_idx)))
            print(network.get_cpds(error))
class TestBayesianModelFitPredict(unittest.TestCase):
    """Checks ``BayesianModel.fit`` and ``BayesianModel.predict`` on random binary data."""

    def setUp(self):
        # Five isolated nodes: every learned CPD must equal the marginal.
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
            counts = values.loc[:, node].value_counts()
            value = counts / counts.sum()
            # Order the relative frequencies by state before comparing.
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        # Fixed seed: the expected predictions below depend on it.
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        # Predicting with no column withheld must be rejected.
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([
                1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
            ]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def pgmpy_test2():
    """Learn the CPDs of the five-node student network from random data and print them."""
    # example from https://github.com/pgmpy/pgmpy/blob/dev/examples/Learning%20from%20data.ipynb
    # Random data: five variables with two states each, drawn uniformly.
    column_names = ['D', 'I', 'G', 'L', 'S']
    samples = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(samples, columns=column_names)

    edges = [('D', 'G'), ('I', 'G'), ('I', 'S'), ('G', 'L')]
    model = BayesianModel(edges)

    # Learning CPDs using Maximum Likelihood Estimators
    model.fit(data, estimator=MaximumLikelihoodEstimator)
    for learned in model.get_cpds():
        print("CPD of {variable}:".format(variable=learned.variable))
        print(learned)
def jointDistribution(model: BayesianModel) -> JointProbabilityDistribution:
    ''' Multiply all CPDs of the network into one joint probability distribution. '''
    # Each CPD is converted to a factor before multiplying; the variables,
    # cardinality and values could be read from the CPDs directly, but this
    # mirrors the mini-example in the API docs (imap() implementation).
    product = reduce(mul, (cpd.to_factor() for cpd in model.get_cpds()))

    # TODO need to assert that probabilities sum to 1? Always true? or to normalize here?
    return JointProbabilityDistribution(
        variables=product.variables,
        cardinality=product.cardinality,
        values=product.values,
    )
def make_bayes_net(load=False, subtree=True, modelsdir=MODEL_CPDS_DIR):
    """Build the label network, or load a previously pickled one.

    Args:
        load: when true and the pickled graph file exists, the graph is
            loaded from it instead of being rebuilt.
        subtree: when true the labels come from ``_subtree_labels()``,
            otherwise from the keys of the loaded GO dictionary.
        modelsdir: directory passed to ``get_model_cpds``.  (It used to be
            ignored in favour of the module constant.)

    Returns:
        The checked ``BayesianModel``.  A freshly built model is also
        pickled to ``RUNNING_MODEL_DIR + '/graph.p'``.
    """
    print('Making bayes net')
    graph_file = RUNNING_MODEL_DIR + '/' + 'graph.p'

    if load and os.path.isfile(graph_file):
        print('Loading saved graph from file...')
        # NOTE: pickle executes arbitrary code on load; graph_file must only
        # ever be a file written by this function.
        with open(graph_file, 'rb') as fh:
            G = pickle.load(fh)
        G.check_model()
        return G

    print('loading data...')
    training_labels, go_dict = load_label_data()
    if subtree:
        labels_list = _subtree_labels()
        print(labels_list)
    else:
        labels_list = go_dict.keys()

    print('adding nodes and edges...')
    G = BayesianModel()
    # Every true label points at its predicted ("_hat") counterpart.
    G.add_edges_from([(label, label + '_hat') for label in labels_list])

    obo_graph = obonet.read_obo(OBODB_FILE)
    known_labels = set(labels_list)  # O(1) membership tests below
    for label in labels_list:
        children = [
            c for c in networkx.ancestors(obo_graph, label)
            if c in known_labels
        ]
        for child in children:
            G.add_edge(child, label)

    predicted_cpds = get_model_cpds(labels_list=labels_list,
                                    modelsdir=modelsdir)
    for cpd in predicted_cpds:
        G.add_cpds(cpd)

    true_label_cpds = get_true_label_cpds(training_labels, go_dict,
                                          labels_list=labels_list)
    for cpd in true_label_cpds:
        G.add_cpds(cpd)

    # Nodes that ended up without a CPD are removed so that check_model()
    # can succeed; collect first, then remove, to avoid mutating the graph
    # while iterating over it.
    remove_list = [node for node in G.nodes() if G.get_cpds(node) is None]
    for node in remove_list:
        if node in G:
            G.remove_node(node)

    G.check_model()
    with open(graph_file, 'wb') as fh:
        pickle.dump(G, fh)
    return G
def initialize_tabular_cpd(self, cid: BayesianModel) -> bool:
    """Initialise the TabularCPD with a uniform distribution matrix.

    Every column of the matrix assigns probability ``1 / variable_card`` to
    each state of ``self.variable``; there is one column per combination of
    parent states.

    Args:
        cid: model providing the parents, their CPDs and cardinalities.

    Returns:
        ``False`` (without initialising anything) if some parent has no CPD
        in ``cid`` yet, ``True`` once the parent constructor has been
        called.
    """
    parents = cid.get_parents(self.variable)

    # check that parents are initialized
    if not all(cid.get_cpds(parent) for parent in parents):
        return False

    parents_card = [cid.get_cardinality(p) for p in parents]
    # np.prod replaces np.product, which was removed in NumPy 2.0.  For an
    # empty parent list the product is 1.0, hence the int() conversion.
    n_columns = int(np.prod(parents_card))
    transition_matrix = np.ones((self.variable_card, n_columns)) / self.variable_card

    super().__init__(self.variable, self.variable_card, transition_matrix,
                     parents, parents_card, state_names=self.state_names)
    return True
def probnet():
    """Build, validate and return a four-node network in which H, B and D are parents of S."""
    # The structure is given as a plain list of edges.
    parents = ['H', 'B', 'D']
    model = BayesianModel([(parent, 'S') for parent in parents])

    # Priors of the three root nodes.
    cpd_h = TabularCPD(variable='H', variable_card=2, values=[[0.2, 0.8]])
    cpd_b = TabularCPD(variable='B', variable_card=2, values=[[0.1, 0.9]])
    cpd_d = TabularCPD(variable='D', variable_card=2, values=[[0.5, 0.5]])

    # S conditioned on every combination of its three binary parents.
    s_table = [
        [0.1, 0.2, 0.1, 0.15, 0.4, 0.35, 0.45, 0.43],
        [0.9, 0.8, 0.9, 0.85, 0.6, 0.65, 0.55, 0.57],
    ]
    cpd_s = TabularCPD(variable='S', variable_card=2, values=s_table,
                       evidence=parents, evidence_card=[2, 2, 2])

    # Attach the CPDs to the network.
    model.add_cpds(cpd_h, cpd_b, cpd_d, cpd_s)

    # check_model verifies the structure and that the CPDs are correctly
    # defined and sum to 1.
    model.check_model()
    print(model.get_cpds('S'))
    # infer = VariableElimination(model)
    # infer.map_query('S', evidence={'H': 1, 'B': 0, 'D': 1})
    return model
def Bayesian_estimate(data, dependency_structure, graph_edges):
    """Fit a BDeu-smoothed network and read one probability per dependency.

    The columns of ``data`` are renamed in place to 1..n.  For every
    ``(parent, child)`` pair in ``dependency_structure`` the child's CPD is
    indexed with one entry per parent literal: a negative literal selects
    index 0 of its axis and any other literal selects index 1.  The result
    maps ``(parent, child)`` to the value that remains.
    """
    data.columns = list(range(1, data.shape[1] + 1))
    print(data)

    model = BayesianModel(graph_edges)
    # default equivalent_sample_size=5
    model.fit(data, estimator=BayesianEstimator, prior_type="BDeu")
    for column in data.columns:
        print(column, data[column].unique())

    probs = {}
    for parent, child in dependency_structure:
        cpd = model.get_cpds(node=child)
        print()
        print(cpd)
        print(cpd.variable_card)

        # Position of each literal's variable among the CPD's evidence axes
        # (the sign only encodes polarity, so it is stripped for the lookup).
        axis_of = [cpd.variables.index(abs(literal)) - 1 for literal in parent]
        ordered_parent = [literal for _, literal in sorted(zip(axis_of, parent))]
        print(cpd.values)

        if cpd.variable_card != 1:
            assert cpd.variable_card == 2
            value = cpd.values[1]
        elif data[child].unique()[0] == 0:
            # Only state 0 was observed, so the complement is taken.
            value = 1 - cpd.values[0]
        else:
            value = cpd.values[0]
        print(value)

        # Walk down one axis per literal.
        for literal in ordered_parent:
            value = value[0] if literal < 0 else value[1]

        key = (parent, child)
        probs[key] = value
        print(key, probs[key])
    return probs
class TestBayesianModelMethods(unittest.TestCase):
    """Structural queries on ``BayesianModel``: moralisation, independencies, copying and node removal."""

    def setUp(self):
        # G and G2 are bare structures; G1 additionally carries CPDs.
        self.G = BayesianModel([('a', 'd'), ('b', 'd'), ('d', 'e'), ('b', 'c')])

        self.G1 = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
        self.G1.add_cpds(
            TabularCPD('diff', 2, values=[[0.2], [0.8]]),
            TabularCPD('intel', 3, values=[[0.5], [0.3], [0.2]]),
            TabularCPD('grade', 3,
                       values=[[0.1] * 6, [0.1] * 6, [0.8] * 6],
                       evidence=['diff', 'intel'],
                       evidence_card=[2, 3]),
        )

        self.G2 = BayesianModel([('d', 'g'), ('g', 'l'), ('i', 'g'), ('i', 'l')])

    def test_moral_graph(self):
        moralized = self.G.moralize()
        self.assertListEqual(sorted(moralized.nodes()), ['a', 'b', 'c', 'd', 'e'])
        # Undirected result: an edge may be reported in either orientation.
        allowed = [('a', 'b'), ('a', 'd'), ('b', 'c'), ('d', 'b'), ('e', 'd')]
        for edge in moralized.edges():
            flipped = (edge[1], edge[0])
            self.assertTrue(edge in allowed or flipped in allowed)

    def test_moral_graph_with_edge_present_over_parents(self):
        G = BayesianModel([('a', 'd'), ('d', 'e'), ('b', 'd'), ('b', 'c'), ('a', 'b')])
        moralized = G.moralize()
        self.assertListEqual(sorted(moralized.nodes()), ['a', 'b', 'c', 'd', 'e'])
        allowed = [('a', 'b'), ('c', 'b'), ('d', 'a'), ('d', 'b'), ('d', 'e')]
        for edge in moralized.edges():
            flipped = (edge[1], edge[0])
            self.assertTrue(edge in allowed or flipped in allowed)

    def test_get_ancestors_of_success(self):
        lookup = self.G2._get_ancestors_of
        of_g = lookup('g')
        of_d = lookup('d')
        of_i_and_l = lookup(['i', 'l'])
        self.assertEqual(of_g, {'d', 'i', 'g'})
        self.assertEqual(of_d, {'d'})
        self.assertEqual(of_i_and_l, {'g', 'i', 'l', 'd'})

    def test_get_ancestors_of_failure(self):
        with self.assertRaises(ValueError):
            self.G2._get_ancestors_of('h')

    def test_local_independencies(self):
        expected_for_G = [
            ('a', Independencies(['a', ['b', 'c']])),
            ('c', Independencies(['c', ['a', 'd', 'e'], 'b'])),
            ('d', Independencies(['d', 'c', ['b', 'a']])),
            ('e', Independencies(['e', ['c', 'b', 'a'], 'd'])),
            ('b', Independencies(['b', 'a'])),
        ]
        for node, expected in expected_for_G:
            self.assertEqual(self.G.local_independencies(node), expected)
        self.assertEqual(self.G1.local_independencies('grade'), Independencies())

    def test_get_independencies(self):
        conditional_on_y = Independencies(('X', 'Z', 'Y'), ('Z', 'X', 'Y'))

        chain = BayesianModel([('X', 'Y'), ('Y', 'Z')])
        self.assertEqual(chain.get_independencies(), conditional_on_y)

        fork = BayesianModel([('Y', 'X'), ('Y', 'Z')])
        self.assertEqual(fork.get_independencies(), conditional_on_y)

        collider = BayesianModel([('X', 'Y'), ('Z', 'Y')])
        self.assertEqual(collider.get_independencies(),
                         Independencies(('X', 'Z'), ('Z', 'X')))

    def test_is_imap(self):
        scope, cards = ['diff', 'intel', 'grade'], [2, 3, 3]
        val = [0.01, 0.01, 0.08, 0.006, 0.006, 0.048, 0.004, 0.004, 0.032,
               0.04, 0.04, 0.32, 0.024, 0.024, 0.192, 0.016, 0.016, 0.128]
        JPD = JointProbabilityDistribution(scope, cards, val)
        fac = DiscreteFactor(['diff', 'intel', 'grade'], [2, 3, 3], val)
        self.assertTrue(self.G1.is_imap(JPD))
        # A plain factor is not accepted in place of a joint distribution.
        with self.assertRaises(TypeError):
            self.G1.is_imap(fac)

    def test_get_immoralities(self):
        G = BayesianModel([('x', 'y'), ('z', 'y'), ('x', 'z'), ('w', 'y')])
        self.assertEqual(G.get_immoralities(), {('w', 'x'), ('w', 'z')})

        G1 = BayesianModel([('x', 'y'), ('z', 'y'), ('z', 'x'), ('w', 'y')])
        self.assertEqual(G1.get_immoralities(), {('w', 'x'), ('w', 'z')})

        G2 = BayesianModel([('x', 'y'), ('z', 'y'), ('x', 'z'), ('w', 'y'), ('w', 'x')])
        self.assertEqual(G2.get_immoralities(), {('w', 'z')})

    def test_is_iequivalent(self):
        G = BayesianModel([('x', 'y'), ('z', 'y'), ('x', 'z'), ('w', 'y')])
        with self.assertRaises(TypeError):
            G.is_iequivalent(MarkovModel())

        G1 = BayesianModel([('V', 'W'), ('W', 'X'), ('X', 'Y'), ('Z', 'Y')])
        G2 = BayesianModel([('W', 'V'), ('X', 'W'), ('X', 'Y'), ('Z', 'Y')])
        self.assertTrue(G1.is_iequivalent(G2))

        G3 = BayesianModel([('W', 'V'), ('W', 'X'), ('Y', 'X'), ('Z', 'Y')])
        self.assertFalse(G3.is_iequivalent(G2))

    def test_copy(self):
        model_copy = self.G1.copy()
        self.assertEqual(sorted(self.G1.nodes()), sorted(model_copy.nodes()))
        self.assertEqual(sorted(self.G1.edges()), sorted(model_copy.edges()))
        # The copy must own its CPD objects rather than share them.
        self.assertIsNot(self.G1.get_cpds('diff'), model_copy.get_cpds('diff'))

        # Replacing a CPD on the original must not leak into the copy.
        self.G1.remove_cpds('diff')
        self.G1.add_cpds(TabularCPD('diff', 2, values=[[0.3], [0.7]]))
        self.assertNotEqual(self.G1.get_cpds('diff'), model_copy.get_cpds('diff'))

        # Neither must a structural change.
        self.G1.remove_node('intel')
        self.assertNotEqual(sorted(self.G1.nodes()), sorted(model_copy.nodes()))
        self.assertNotEqual(sorted(self.G1.edges()), sorted(model_copy.edges()))

    def test_remove_node(self):
        self.G1.remove_node('diff')
        self.assertEqual(sorted(self.G1.nodes()), sorted(['grade', 'intel']))
        with self.assertRaises(ValueError):
            self.G1.get_cpds('diff')

    def test_remove_nodes_from(self):
        self.G1.remove_nodes_from(['diff', 'grade'])
        self.assertEqual(sorted(self.G1.nodes()), sorted(['intel']))
        for removed in ('diff', 'grade'):
            with self.assertRaises(ValueError):
                self.G1.get_cpds(removed)

    def tearDown(self):
        del self.G
        del self.G1
ax_temp.bar(x, z, zs=y, zdir='y', alpha=0.6, color='r' * 4) ax_temp.set_xlabel('X') ax_temp.set_ylabel('Y') ax_temp.set_zlabel('Z') ax_temp.title.set_text(('Feature ' + str(mean_indices[counter]))) counter += 1 plt.show() # Learning naive bayes model from various subsets of data naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2]) naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 4]) naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5]) # Splitting train and test data for PGM model temp_data = pd.concat([all_city_data, pd.DataFrame(all_city_label, columns=[13])], axis=1) pgm_train_set = temp_data.loc[0:700] pgm_test_set = temp_data.loc[700:] print(pgm_train_set) # Implementing PGM model on data # Using these features: 0: (age) 1: (sex) 2: (cp) pgm_model = BayesianModel() pgm_model.add_nodes_from([0, 1, 2, 13]) pgm_model.add_edges_from([(1, 13)]) pgm_model.fit(pgm_train_set.loc[:, [0, 1, 2, 13]]) pgm_test_set = pgm_test_set.loc[:, [0, 1, 2, 13]].drop(13, axis=1) print(pgm_test_set) print(pgm_model.get_cpds(13))
def bayesnet():
    """
    Build a small name/match/score Bayesian network, print its CPDs, run a
    series of variable-elimination queries and draw the graph to 'foo.png'.

    The function takes no arguments and returns nothing.  Besides printing,
    it writes several of its locals into ``globals()`` and opens the saved
    image via ``ut.startfile``.  It relies on ``np`` and ``ut`` being
    available at module level.

    References:
        https://class.coursera.org/pgm-003/lecture/17
        http://www.cs.ubc.ca/~murphyk/Bayes/bnintro.html
        http://www3.cs.stonybrook.edu/~sael/teaching/cse537/Slides/chapter14d_BP.pdf
        http://www.cse.unsw.edu.au/~cs9417ml/Bayes/Pages/PearlPropagation.html
        https://github.com/pgmpy/pgmpy.git
        http://pgmpy.readthedocs.org/en/latest/
        http://nipy.bic.berkeley.edu:5000/download/11
    """
    # A commented-out experiment that enumerated the full event space and
    # carved "same"/"diff" special cases out of it used to sit here; it was
    # dead code and has been condensed into this note.

    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import pandas as pd
    from pgmpy.inference import BeliefPropagation  # NOQA
    from pgmpy.inference import VariableElimination  # NOQA

    # Human-readable state labels for the three kinds of variables.
    name_nice = ['n1', 'n2', 'n3']
    score_nice = ['low', 'high']
    match_nice = ['diff', 'same']
    num_names = len(name_nice)
    num_scores = len(score_nice)
    nid_basis = list(range(num_names))
    score_basis = list(range(num_scores))

    # semantic type -> state labels; every CPD below is tagged with one of
    # these keys through an ad-hoc ``semtype`` attribute.
    semtype2_nice = {
        'score': score_nice,
        'name': name_nice,
        'match': match_nice,
    }
    # variable name -> CPD, filled in as the CPDs are created.
    var2_cpd = {
    }
    # Exported to module globals (presumably for interactive inspection).
    globals()['semtype2_nice'] = semtype2_nice
    globals()['var2_cpd'] = var2_cpd

    # All ordered pairs of name ids and a mask marking the pairs whose two
    # ids are equal.
    name_combo = np.array(list(ut.iprod(nid_basis, nid_basis)))
    combo_is_same = name_combo.T[0] == name_combo.T[1]

    def get_expected_scores_prob(level1, level2):
        # level1 where the pair is equal, (1 - level2) where it differs.
        part1 = combo_is_same * level1
        part2 = (1 - combo_is_same) * (1 - (level2))
        expected_scores_level = part1 + part2
        return expected_scores_level

    # def make_cpd():

    def name_cpd(aid):
        # Uniform prior over the names for annotation ``aid``.
        from pgmpy.factors import TabularCPD
        cpd = TabularCPD(
            variable='N' + aid,
            variable_card=num_names,
            values=[[1.0 / num_names] * num_names])
        cpd.semtype = 'name'
        return cpd

    name_cpds = [name_cpd('i'), name_cpd('j'), name_cpd('k')]
    var2_cpd.update(dict(zip([cpd.variable for cpd in name_cpds], name_cpds)))

    # ``if True`` acts as a hand-flipped switch between two model variants;
    # only this first branch ever runs.
    if True:
        num_same_diff = 2
        samediff_measure = np.array([
            # get_expected_scores_prob(.12, .2),
            # get_expected_scores_prob(.88, .8),
            get_expected_scores_prob(0, 0),
            get_expected_scores_prob(1, 1),
        ])
        # Normalise each column so it sums to one.
        samediff_vals = (samediff_measure / samediff_measure.sum(axis=0)).tolist()

        def samediff_cpd(aid1, aid2):
            # Match variable 'A<aid1><aid2>' conditioned on the two names.
            cpd = TabularCPD(
                variable='A' + aid1 + aid2,
                variable_card=num_same_diff,
                values=samediff_vals,
                evidence=['N' + aid1, 'N' + aid2],  # [::-1],
                evidence_card=[num_names, num_names])  # [::-1])
            cpd.semtype = 'match'
            return cpd

        samediff_cpds = [samediff_cpd('i', 'j'), samediff_cpd('j', 'k'), samediff_cpd('k', 'i')]
        var2_cpd.update(dict(zip([cpd.variable for cpd in samediff_cpds], samediff_cpds)))

        # Second hand-flipped switch: how the score CPDs are built.
        if True:
            def score_cpd(aid1, aid2):
                # Score variable conditioned on the match variable and both
                # names; the table is filled state by state.
                semtype = 'score'
                evidence = ['A' + aid1 + aid2, 'N' + aid1, 'N' + aid2]
                evidence_cpds = [var2_cpd[key] for key in evidence]
                evidence_nice = [semtype2_nice[cpd.semtype] for cpd in evidence_cpds]
                evidence_card = list(map(len, evidence_nice))
                evidence_states = list(ut.iprod(*evidence_nice))
                variable_basis = semtype2_nice[semtype]

                variable_values = []
                for mystate in variable_basis:
                    row = []
                    for state in evidence_states:
                        # NOTE(review): evidence is ordered [A, N, N], so
                        # state[0] is a match label and state[1]/state[2]
                        # are name labels.  With the label lists above,
                        # ``state[0] == state[1]`` and
                        # ``state[2] == 'same'`` can never hold, so every
                        # entry ends up as 1 -- confirm the intended
                        # indices.
                        if state[0] == state[1]:
                            if state[2] == 'same':
                                val = .2 if mystate == 'low' else .8
                            else:
                                val = 1
                                # val = .5 if mystate == 'low' else .5
                        elif state[0] != state[1]:
                            if state[2] == 'same':
                                val = .5 if mystate == 'low' else .5
                            else:
                                val = 1
                                # val = .9 if mystate == 'low' else .1
                        row.append(val)
                    variable_values.append(row)

                cpd = TabularCPD(
                    variable='S' + aid1 + aid2,
                    variable_card=len(variable_basis),
                    values=variable_values,
                    evidence=evidence,  # [::-1],
                    evidence_card=evidence_card)  # [::-1])
                cpd.semtype = semtype
                return cpd
        else:
            # Disabled variant: score depends on the match variable only.
            score_values = [
                [.8, .1],
                [.2, .9],
            ]

            def score_cpd(aid1, aid2):
                cpd = TabularCPD(
                    variable='S' + aid1 + aid2,
                    variable_card=num_scores,
                    values=score_values,
                    evidence=['A' + aid1 + aid2],  # [::-1],
                    evidence_card=[num_same_diff])  # [::-1])
                cpd.semtype = 'score'
                return cpd

        score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')]
        cpd_list = name_cpds + score_cpds + samediff_cpds
    else:
        # Disabled variant: no match variables, scores depend on the names.
        score_measure = np.array([get_expected_scores_prob(level1, level2)
                                  for level1, level2 in
                                  zip(np.linspace(.1, .9, num_scores),
                                      np.linspace(.2, .8, num_scores))])
        score_values = (score_measure / score_measure.sum(axis=0)).tolist()

        def score_cpd(aid1, aid2):
            cpd = TabularCPD(
                variable='S' + aid1 + aid2,
                variable_card=num_scores,
                values=score_values,
                evidence=['N' + aid1, 'N' + aid2],
                evidence_card=[num_names, num_names])
            cpd.semtype = 'score'
            return cpd
        score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')]
        cpd_list = name_cpds + score_cpds
        pass

    # One directed edge per (evidence variable -> CPD variable) pair.
    input_graph = []
    for cpd in cpd_list:
        if cpd.evidence is not None:
            for evar in cpd.evidence:
                input_graph.append((evar, cpd.variable))
    name_model = BayesianModel(input_graph)
    name_model.add_cpds(*cpd_list)

    var2_cpd.update(dict(zip([cpd.variable for cpd in cpd_list], cpd_list)))
    globals()['var2_cpd'] = var2_cpd

    varnames = [cpd.variable for cpd in cpd_list]

    # --- PRINT CPDS ---

    cpd = score_cpds[0]

    def print_cpd(cpd):
        # Show the table as a DataFrame labelled with the nice state names.
        print('CPT: %r' % (cpd,))
        index = semtype2_nice[cpd.semtype]
        if cpd.evidence is None:
            columns = ['None']
        else:
            basis_lists = [semtype2_nice[var2_cpd[ename].semtype] for ename in cpd.evidence]
            columns = [','.join(x) for x in ut.iprod(*basis_lists)]
        data = cpd.get_cpd()
        print(pd.DataFrame(data, index=index, columns=columns))

    for cpd in name_model.get_cpds():
        print('----')
        print(cpd._str('phi'))
        print_cpd(cpd)

    # --- INFERENCE ---

    Ni = name_cpds[0]

    # Ni is pinned to state 0 while every score variable ranges over all of
    # its states; ut.all_dict_combinations turns this into the evidence
    # assignments iterated over below.
    event_space_combos = {}
    event_space_combos[Ni.variable] = 0  # Set ni to always be Fred
    for cpd in cpd_list:
        if cpd.semtype == 'score':
            event_space_combos[cpd.variable] = list(range(cpd.variable_card))
    evidence_dict = ut.all_dict_combinations(event_space_combos)

    # Query about name of annotation k given different event space params

    def pretty_evidence(evidence):
        # 'var=label' strings for an evidence assignment.
        return [key + '=' + str(semtype2_nice[var2_cpd[key].semtype][val])
                for key, val in evidence.items()]

    def print_factor(factor):
        # Flatten the factor and print value per labelled state combination.
        # Only referenced from a commented-out call in try_query.
        row_cards = factor.cardinality
        row_vars = factor.variables
        values = factor.values.reshape(np.prod(row_cards), 1).flatten()
        # col_cards = 1
        # col_vars = ['']
        basis_lists = list(zip(*list(ut.iprod(*[range(c) for c in row_cards]))))
        nice_basis_lists = []
        for varname, basis in zip(row_vars, basis_lists):
            cpd = var2_cpd[varname]
            _nice_basis = ut.take(semtype2_nice[cpd.semtype], basis)
            nice_basis = ['%s=%s' % (varname, val) for val in _nice_basis]
            nice_basis_lists.append(nice_basis)
        row_lbls = [', '.join(sorted(x)) for x in zip(*nice_basis_lists)]
        print(ut.repr3(dict(zip(row_lbls, values)), precision=3, align=True, key_order_metric='-val'))

    # name_belief = BeliefPropagation(name_model)
    name_belief = VariableElimination(name_model)
    import pgmpy
    import six  # NOQA

    def try_query(evidence):
        # Query every variable not fixed by ``evidence`` and print the
        # resulting factors side by side.
        print('--------')
        query_vars = ut.setdiff_ordered(varnames, list(evidence.keys()))
        evidence_str = ', '.join(pretty_evidence(evidence))
        probs = name_belief.query(query_vars, evidence)
        # assumes query() returns a mapping of variable -> factor -- TODO confirm
        factor_list = probs.values()
        joint_factor = pgmpy.factors.factor_product(*factor_list)
        print('P(' + ', '.join(query_vars) + ' | ' + evidence_str + ')')
        # print(six.text_type(joint_factor))
        factor = joint_factor  # NOQA
        # print_factor(factor)
        # import utool as ut
        print(ut.hz_str([(f._str(phi_or_p='phi')) for f in factor_list]))

    for evidence in evidence_dict:
        try_query(evidence)

    # Two hand-picked cases: every match variable set to 1, then to 0.
    evidence = {'Aij': 1, 'Ajk': 1, 'Aki': 1, 'Ni': 0}
    try_query(evidence)

    evidence = {'Aij': 0, 'Ajk': 0, 'Aki': 0, 'Ni': 0}
    try_query(evidence)

    globals()['score_nice'] = score_nice
    globals()['name_nice'] = name_nice
    globals()['score_basis'] = score_basis
    globals()['nid_basis'] = nid_basis

    print('Independencies')
    print(name_model.get_independencies())
    print(name_model.local_independencies([Ni.variable]))

    # Two commented-out loops that queried 'Lk' (and 'Lk'/'Lj') for the
    # special cases mentioned at the top used to sit here; they were dead
    # code and have been condensed into this note.

    # _ draw model

    import plottool as pt
    import networkx as netx
    fig = pt.figure()  # NOQA
    fig.clf()
    ax = pt.gca()

    # Mirror the model's nodes and edges into a plain networkx DiGraph.
    netx_nodes = [(node, {}) for node in name_model.nodes()]
    netx_edges = [(etup[0], etup[1], {}) for etup in name_model.edges()]
    netx_graph = netx.DiGraph()
    netx_graph.add_nodes_from(netx_nodes)
    netx_graph.add_edges_from(netx_edges)

    # pos = netx.graphviz_layout(netx_graph)
    pos = netx.pydot_layout(netx_graph, prog='dot')
    netx.draw(netx_graph, pos=pos, ax=ax, with_labels=True)

    pt.plt.savefig('foo.png')
    ut.startfile('foo.png')
class TestInferenceBase(unittest.TestCase):
    """Checks the bookkeeping ``Inference`` builds from a model.

    Both a chain-shaped Bayesian network and a small Markov network are
    wrapped in ``Inference``; the tests then inspect the ``variables``,
    ``cardinality`` and ``factors`` attributes of the resulting object.
    """

    def setUp(self):
        # Bayesian network: the chain a -> b -> c -> d -> e, all binary.
        chain = ['a', 'b', 'c', 'd', 'e']
        self.bayesian = BayesianModel(list(zip(chain, chain[1:])))

        cpds = [TabularCPD('a', 2, [[0.4, 0.6]])]
        # (variable, parent, table) for every non-root node of the chain.
        conditional_tables = [
            ('b', 'a', [[0.2, 0.4], [0.3, 0.4]]),
            ('c', 'b', [[0.1, 0.2], [0.3, 0.4]]),
            ('d', 'c', [[0.4, 0.3], [0.2, 0.1]]),
            ('e', 'd', [[0.3, 0.2], [0.4, 0.1]]),
        ]
        for variable, parent, table in conditional_tables:
            cpds.append(TabularCPD(variable, 2, table,
                                   evidence=parent, evidence_card=[2]))
        self.bayesian.add_cpds(*cpds)

        # Markov network: the four-cycle a - b - d - c - a.
        self.markov = MarkovModel([('a', 'b'), ('b', 'd'),
                                   ('a', 'c'), ('c', 'd')])
        potentials = [
            (['a', 'b'], [100, 1, 1, 100]),
            (['a', 'c'], [40, 30, 100, 20]),
            (['b', 'd'], [1, 100, 100, 1]),
            (['c', 'd'], [60, 60, 40, 40]),
        ]
        self.markov.add_factors(
            *[Factor(scope, [2, 2], np.array(values))
              for scope, values in potentials])

    def test_bayesian_inference_init(self):
        infer_bayesian = Inference(self.bayesian)
        self.assertEqual(set(infer_bayesian.variables),
                         {'a', 'b', 'c', 'd', 'e'})
        self.assertEqual(infer_bayesian.cardinality,
                         {'a': 2, 'b': 2, 'c': 2, 'd': 2, 'e': 2})
        self.assertIsInstance(infer_bayesian.factors, defaultdict)

        # Each variable is expected to be indexed under the factor of its
        # own CPD and under the factor of its child's CPD (if it has one).
        owners_by_variable = [
            ('a', ['a', 'b']),
            ('b', ['b', 'c']),
            ('c', ['c', 'd']),
            ('d', ['d', 'e']),
            ('e', ['e']),
        ]
        for variable, owners in owners_by_variable:
            expected = {self.bayesian.get_cpds(owner).to_factor()
                        for owner in owners}
            self.assertEqual(set(infer_bayesian.factors[variable]), expected)

    def test_markov_inference_init(self):
        def pairwise(first, second, values):
            # A fresh binary pairwise factor for every expected entry.
            return Factor([first, second], [2, 2], np.array(values))

        infer_markov = Inference(self.markov)
        self.assertEqual(set(infer_markov.variables), {'a', 'b', 'c', 'd'})
        self.assertEqual(infer_markov.cardinality,
                         {'a': 2, 'b': 2, 'c': 2, 'd': 2})

        expected_factors = {
            'a': [pairwise('a', 'b', [100, 1, 1, 100]),
                  pairwise('a', 'c', [40, 30, 100, 20])],
            'b': [pairwise('a', 'b', [100, 1, 1, 100]),
                  pairwise('b', 'd', [1, 100, 100, 1])],
            'c': [pairwise('a', 'c', [40, 30, 100, 20]),
                  pairwise('c', 'd', [60, 60, 40, 40])],
            'd': [pairwise('b', 'd', [1, 100, 100, 1]),
                  pairwise('c', 'd', [60, 60, 40, 40])],
        }
        self.assertEqual(infer_markov.factors, expected_factors)
[0.3, 0.05, 0.9, 0.5], # 该节点的概率表 [0.4, 0.25, 0.08, 0.3], [0.3, 0.7, 0.02, 0.2] ], evidence=["I", "D"], # 该节点的依赖节点 evidence_card=[2, 2] # 依赖节点的取值个数 ) drug_cpd = TabularCPD(variable="L", variable_card=2, values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]], evidence=["G"], evidence_card=[3]) toxicity_cpd = TabularCPD(variable="S", variable_card=2, values=[[0.95, 0.2], [0.05, 0.8]], evidence=["I"], evidence_card=[2]) Chemoinformatics_model.add_cpds(active_cpd, amino_cpd, benzene_cpd, drug_cpd, toxicity_cpd) Chemoinformatics_model.get_cpds() Chemoinformatics_infer = VariableElimination(Chemoinformatics_model) prob = Chemoinformatics_infer.query(variables=["L"], evidence={ "D": 1, "I": 1, "G": 0 }) print(prob)
class TestDirectedGraphCPDOperations(unittest.TestCase):
    """Exercises add/remove/get of CPDs on an initially empty BayesianModel."""

    # Edges of the small "student" graph shared by every test.
    _EDGES = [('diff', 'grade'), ('intel', 'grade')]

    def setUp(self):
        self.graph = BayesianModel()

    def tearDown(self):
        del self.graph

    def _add_three_cpds(self):
        """Create the diff/intel/grade CPDs, register them, return them.

        The CPDs are built before the edges are added and in the order
        diff, intel, grade, so the random draws happen in a fixed sequence.
        """
        diff_cpd = TabularCPD('diff', 2, np.random.rand(2, 1))
        intel_cpd = TabularCPD('intel', 2, np.random.rand(2, 1))
        grade_cpd = TabularCPD('grade', 2, np.random.rand(2, 4),
                               ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from(self._EDGES)
        self.graph.add_cpds(diff_cpd, intel_cpd, grade_cpd)
        return diff_cpd, intel_cpd, grade_cpd

    def test_add_single_cpd(self):
        grade_cpd = TabularCPD('grade', 2, np.random.rand(2, 4),
                               ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from(self._EDGES)
        self.graph.add_cpds(grade_cpd)
        self.assertListEqual(self.graph.get_cpds(), [grade_cpd])

    def test_add_multiple_cpds(self):
        diff_cpd, intel_cpd, grade_cpd = self._add_three_cpds()
        self.assertListEqual(self.graph.get_cpds(),
                             [diff_cpd, intel_cpd, grade_cpd])

    def test_remove_single_cpd(self):
        diff_cpd, intel_cpd, grade_cpd = self._add_three_cpds()
        self.graph.remove_cpds(diff_cpd)
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd, grade_cpd])

    def test_remove_multiple_cpds(self):
        diff_cpd, intel_cpd, grade_cpd = self._add_three_cpds()
        self.graph.remove_cpds(diff_cpd, grade_cpd)
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd])

    def test_remove_single_cpd_string(self):
        _, intel_cpd, grade_cpd = self._add_three_cpds()
        self.graph.remove_cpds('diff')
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd, grade_cpd])

    def test_remove_multiple_cpds_string(self):
        _, intel_cpd, _ = self._add_three_cpds()
        self.graph.remove_cpds('diff', 'grade')
        self.assertListEqual(self.graph.get_cpds(), [intel_cpd])

    def test_get_cpd_for_node(self):
        diff_cpd, intel_cpd, grade_cpd = self._add_three_cpds()
        self.assertEqual(self.graph.get_cpds('diff'), diff_cpd)
        self.assertEqual(self.graph.get_cpds('intel'), intel_cpd)
        self.assertEqual(self.graph.get_cpds('grade'), grade_cpd)

    def test_get_cpd_raises_error(self):
        self._add_three_cpds()
        # 'sat' was never added to the graph.
        self.assertRaises(ValueError, self.graph.get_cpds, 'sat')
class AwareEnv(object):
    """Interactive environment around a Bayesian network rooted at 'Consciente'.

    The network has one parent node ('Consciente') and four children.  Each
    ``step`` shifts the probabilities of the network by an ``adjustment``,
    queries the posterior of 'Consciente' for the current episode and asks
    the user for a reward on standard input.

    Relies on module-level names defined elsewhere in this file:
    ``BayesianModel``, ``BayesianEstimator``, ``BeliefPropagation``,
    ``TRAIN``, ``TEST``, ``replacer`` and ``approximate``.
    """

    # Structure of the network: 'Consciente' is the parent of every node.
    _EDGES = (
        ('Consciente', 'DistracaoApp'),
        ('Consciente', 'DirecaoCarro'),
        ('Consciente', 'SomCarro'),
        ('Consciente', 'Percepcao'),
    )

    def __init__(self):
        # Discrete set of adjustments an agent can pick from.
        self.actions = [
            0.0, 0.01, 0.02, 0.03, 0.04, 0.05,
            -0.01, -0.02, -0.03, -0.04, -0.05,
        ]
        self.model = BayesianModel(list(self._EDGES))
        # Episodes are the test rows without the target column.
        self.episodes = TEST.copy().drop('Consciente', axis=1)

    def _aware_cpd(self):
        """Return the CPD of the 'Consciente' node (IndexError if absent)."""
        return [
            node for node in self.model.get_cpds()
            if node.variable == 'Consciente'
        ].pop()

    def reset(self):
        """Rebuild and refit the model on ``TRAIN``; return the new state."""
        self.model = BayesianModel(list(self._EDGES))
        self.model.fit(TRAIN, estimator=BayesianEstimator)
        # NOTE(review): here the state is a one-element list wrapping the
        # rounded values, whereas render() stores the rounded values
        # directly.  step() indexes self.state[0] and self.state[1], which
        # only matches the render() layout -- confirm which is intended.
        self.state = [np.round(self._aware_cpd().values, 2)]
        self.cpds = self._tabular_cpds_to_dict(self.model)
        for node in self.model.get_cpds():
            print(node)
        return self.state

    def render(self):
        """Refresh ``self.state`` and ``self.cpds`` from the current model."""
        self.state = np.round(self._aware_cpd().values, 2)
        self.cpds = self._tabular_cpds_to_dict(self.model)

    def _tabular_cpds_to_dict(self, model):
        """Map each CPD variable to ``{state name: row of cpd.values}``."""
        return {
            node.variable: dict(
                zip(node.state_names[node.variable], node.values))
            for node in model.get_cpds()
        }

    def _get_cpd_values(self, node_values):
        """Turn ``{state: params}`` back into an array, one row per state.

        A mapping value contributes the list of its values; anything else is
        used as the row unchanged.
        """
        rows = []
        for param in node_values.values():
            # isinstance (not type ==) so dict subclasses are unpacked too.
            if isinstance(param, dict):
                rows.append(list(param.values()))
            else:
                rows.append(param)
        return np.array(rows)

    def step(self, adjustment, episode):
        """Apply ``adjustment``, query 'Consciente' and ask for a reward.

        :param adjustment: amount added to ``self.state[0]`` and subtracted
            from ``self.state[1]``.
        :param episode: mapping (dict or pandas Series) from variable name
            to raw value; raw values are translated through ``replacer``.
        :return: ``(next_state, reward)`` where ``next_state`` is the rounded
            awareness state followed by the translated episode values.
        """
        print('######## Ajustes ########')
        print(adjustment)
        print('######## Episódio atual ########')
        print(episode)

        # .items() works for dicts and for pandas Series on every supported
        # version; .iteritems() does not exist on Python 3 dicts and was
        # removed from pandas in 2.0.
        replaced_episode = {k: replacer[k][v] for k, v in episode.items()}

        upper_bound = self.state[0] + adjustment
        lower_bound = self.state[1] - adjustment
        if not (upper_bound > 1 or upper_bound < 0):
            state_aware = [upper_bound, lower_bound]
            cpds = self._tabular_cpds_to_dict(self.model)
            adjustments = self.fit_probabilities(cpds, adjustment)
            for node in self.model.get_cpds():
                if node.variable != 'Consciente':
                    node.values = self._get_cpd_values(
                        adjustments[node.variable])
                    node.normalize()
                else:
                    node.values = np.array(state_aware)
            for node in self.model.get_cpds():
                print(node)
        else:
            # Adjustment would leave [0, 1]: keep the model untouched.
            state_aware = [self.state]

        print('######## Consciente ########')
        # Inference engine is built after the CPDs have been updated.
        bp = BeliefPropagation(self.model)
        print(
            bp.query(['Consciente'], evidence=replaced_episode)['Consciente'])
        reward = float(input('Recompensa entre -1 e 1: '))

        next_state = [np.round(state_aware, 2)]
        next_state.extend(list(replaced_episode.values()))
        return next_state, reward

    def fit_probabilities(self, cpds, adjustment):
        """Shift every child CPD by ``approximate[...] * adjustment * 100``.

        :param cpds: mapping as produced by ``_tabular_cpds_to_dict``; it
            must contain a 'Consciente' entry (KeyError otherwise).  The
            caller's mapping is not modified.
        :param adjustment: signed adjustment applied to the probabilities.
        :return: ``{variable: {state name: np.array}}`` with values clipped
            to [0, 1] and renormalised.
        """
        # Work on a copy without the parent node instead of deleting the
        # key from the caller's dict.
        cpds = dict(cpds)
        del cpds['Consciente']

        adjusted_probabilities = {}
        for state, param in cpds.items():
            params = list(param.keys())
            columns = np.transpose(list(param.values()))
            new_columns = []
            for cpd_list, name in zip(columns, params):
                fitting = approximate[state][name] * (adjustment * 100)
                # Clip each shifted probability into [0, 1].
                values = [min(max(cpd + fitting, 0), 1) for cpd in cpd_list]
                new_columns.append(self.normalize(values))
            rows = np.transpose(new_columns)
            adjusted_probabilities[state] = {
                name: np.array(rows[i]) for i, name in enumerate(params)
            }
        return adjusted_probabilities

    def normalize(self, lst):
        """Return ``lst`` scaled so that its elements sum to one.

        NOTE(review): a list summing to zero is not handled; with plain
        Python numbers this raises ZeroDivisionError.
        """
        total = sum(lst)
        return [float(x) / total for x in lst]
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# Generate 1000 random binary samples, one column per network variable.
raw_data = np.random.randint(low=0, high=2, size=(1000, 6))
print(raw_data)
data = pd.DataFrame(raw_data, columns=['A', 'R', 'J', 'G', 'L', 'Q'])

# Network structure: A and R are parents of J; J is a parent of Q and L;
# G is the second parent of L.
student_model = BayesianModel([('A', 'J'), ('R', 'J'), ('J', 'Q'),
                               ('J', 'L'), ('G', 'L')])

# Estimate every CPD from the data with the Bayesian estimator.
student_model.fit(data, estimator=BayesianEstimator)
student_model.get_cpds()

# The original asked for node 'D', which is not part of this network, so
# get_cpds raised ValueError.  Print the CPD of 'J' (the node with parents
# A and R) instead.
# NOTE(review): 'J' is a guess at the intended node -- adjust if another
# CPD was meant.
print(student_model.get_cpds('J'))
for a, b in edges: dot.edge(a, b) if save: dot.view(cleanup=True) return dot predict_data=test.drop(columns=['scene'],axis='1') # re=pd.read_csv('./re.txt') # print(re.info()) # print(predict_data.info()) print("预测数据集") print(predict_data) y_pred = model.predict(predict_data) showBN(model) print("预测结果") print(y_pred) # 预测结果 print("节点条件概率情况") print(model.get_cpds()) # 各个节点条件概率情况 # re['doors'] = re['doors'].astype('object') # print(model.predict_probability(re)) # 预测概率 print("预测准确率") print((y_pred['scene']==test['scene']).sum()/len(test)) end=time.process_time() print("总运行时间:") print('Running time: %s Seconds'%(end-start)) # 准确率
values=[[0.900, 0.200], [0.100, 0.800]], evidence=['HO'], evidence_card=[2]) cpd_posxray = TabularCPD(variable='PX', variable_card=2, values=[[0.990, 0.020], [0.010, 0.980]], evidence=['BT'], evidence_card=[2]) cpd_headache = TabularCPD(variable='HA', variable_card=2, values=[[0.980, 0.100, 0.300, 0.010], [0.020, 0.900, 0.700, 0.990] ], evidence=['HO', 'BT'], evidence_card=[2, 2]) cancer_model.add_cpds(cpd_party, cpd_braintumor, cpd_hangover, cpd_smellalcohol, cpd_posxray, cpd_headache) for cp in cancer_model.get_cpds(): print(cp) #进行预测HA发生的概率 from pgmpy.inference import VariableElimination cancer_infer = VariableElimination(cancer_model) q = cancer_infer.query(variables=['HA']) print(q) #诊断在某些证据下的概率 #from pgmpy.inference import VariableElimination cancer_infer = VariableElimination(cancer_model) q = cancer_infer.query(variables=['BT'], evidence={'PX': 1}) print(q)
class Utilities(object):
    """Load, save and convert Bayesian networks stored as JSON files.

    An instance holds the same network in two forms, a pgmpy
    ``BayesianModel`` (``pgmpy_object``) and a networkx ``DiGraph``
    (``networkx_object``), together with the JSON ``header`` and a
    ``dictionary``: a list of ``(user name, standardized name)`` tuples
    used by :meth:`translation`.
    """

    def __init__(self, file):
        """Resolve the JSON path for ``file`` and create empty networks.

        :param file: file name without path or extension
        """
        self.keywords = ['BENS', 'MEMS', 'LANS', 'MOTOR', 'WORLD']
        self.standard_nodes = {
            'RONS': {'BENS': [], 'MEMS': []},
            'LANS': {'LANS': []},
            'LENS': {'MOTOR': [], 'WORLD': []},
        }
        self.file = file
        # Overwrites self.file with the full path inside project_repository.
        self.get_json_path(file)
        self.pgmpy_object = BayesianModel()
        self.networkx_object = nx.DiGraph()
        self.header = ''
        self.dictionary = []

    def get_nodes_in_family(self, family, attributes=False):
        """Return the networkx nodes whose name contains ``family``.

        :param family: substring looked for in each node name
        :param attributes: currently unused
        :return: list of matching nodes
        """
        # list() so the result can be fed to numpy on every networkx version.
        nw_nodes = list(self.networkx_object.nodes())
        nw_dim = np.asarray(nw_nodes).ndim
        nodes = []
        for node in nw_nodes:
            if nw_dim > 1:
                node = node[0]
            if family in node:
                nodes.append(node)
        return nodes

    @staticmethod
    def check_json_path(directory):
        """Create ``<directory>/project_repository`` if it does not exist.

        :param directory: the mother directory to search from downwards
        """
        os.makedirs(os.path.join(directory, 'project_repository'),
                    exist_ok=True)

    def get_json_path(self, file):
        """Set ``self.file`` to the full path of ``file`` in project_repository.

        Starting from the directory of this module, walks upwards (at most
        ``levels + 1`` times) until the path no longer contains
        ``peepo/peepo``, makes sure the ``project_repository`` directory
        exists there and stores ``<that dir>/project_repository/<file>.json``.

        :param file: file name without path or extension
        """
        levels = 5
        # Built with os.path.join so the check uses the platform separator
        # instead of a hard-coded backslash.
        package_marker = os.path.join('peepo', 'peepo')
        common = os.path.dirname(os.path.realpath(__file__))
        for _ in range(levels + 1):
            common = os.path.dirname(common)
            if package_marker not in common:
                break
        Utilities.check_json_path(common)
        self.file = os.path.join(common, 'project_repository', file + '.json')
        print('in get_json_path :', self.file)

    def save_json(self, astring):
        """Write ``astring`` to ``self.file`` with line breaks before keys.

        ``json.dumps`` offers little control over layout, so the dumped
        string is post-processed: existing line feeds are removed, then a
        line feed (and tabs) is inserted before each known key and before
        every opening brace.

        :param astring: the JSON document as a string
        """
        # Applied in this exact order; every entry is a literal text match.
        layout = (
            ('\n', ''),
            ('"Identification', '\n"Identification'),
            ('"Date', '\n"Date'),
            ('"Description', '\n"Description'),
            ('"Train_from', '\n"Train_from'),
            ('"Frozen', '\n"Frozen'),
            ('"Nodes', '\n\n"Nodes'),
            ('"RONS', '\n\t\t"RONS'),
            ('"BENS', '\n\t\t\t"BENS'),
            ('"MEMS', '\n\t\t\t"MEMS'),
            ('"LANS', '\n\t\t"LANS'),
            ('"LENS', '\n\t\t"LENS'),
            ('"MOTOR', '\n\t\t\t"MOTOR'),
            ('"WORLD', '\n\t\t\t"WORLD'),
            ('"Edges', '\n\n"Edges'),
            ('"CPDs', '\n\n"CPDs'),
            ('{', '\n\t\t{'),
        )
        for old, new in layout:
            astring = astring.replace(old, new)
        # Context manager closes the file even if a write fails.
        with open(str(self.file), "w") as text_file:
            text_file.write(astring)
            text_file.write('\n')

    def translation(self, astring, from_man_to_machine):
        """Translate a node name through ``self.dictionary``.

        :param astring: the node name to translate
        :param from_man_to_machine: 0 to translate a user name into the
            standardized name, 1 for the other way around
        :return: the matching name from the other column

        NOTE(review): when ``astring`` is not found, the last dictionary
        entry is returned, and an empty dictionary raises
        UnboundLocalError.  This mirrors the existing behaviour; confirm
        whether callers rely on it before tightening.
        """
        source, target = (1, 0) if from_man_to_machine == 1 else (0, 1)
        for item in self.dictionary:
            if item[source] == astring:
                break
        return item[target]

    def clean_edge_list(self, edge_array, parent):
        """Return ``edge_array`` without the entries equal to ``parent``."""
        return [a for a in edge_array if a != parent]

    def clean_parent_list(self, parent_array, child):
        """Return the first element of each pair, skipping those equal to ``child``."""
        return [a[0] for a in parent_array if a[0] != child]

    def get_edges(self):
        """Group the pgmpy edges by parent.

        :return: dictionary mapping each parent (as str) to the list of its
            children (as str)
        """
        edges = {}
        for parent, child in self.pgmpy_object.edges():
            # Keys are always the stringified parent, so children of the
            # same parent accumulate even when nodes are not strings.
            edges.setdefault(str(parent), []).append(str(child))
        return edges

    def get_nodes_and_attributes(self):
        """Describe every pgmpy node as ``(name, attributes)``.

        :return: list of tuples ``(str(node), {'cardinality': int,
            'cpd': nested list})``; a CPD with more than two dimensions is
            flattened to two (rows = states of the node) before conversion
            to a list
        """
        nod_and_attributes = []
        for node in self.pgmpy_object.nodes():
            cardinality = int(self.pgmpy_object.get_cardinality(node))
            cpd = self.pgmpy_object.get_cpds(node).values.astype(float)
            if cpd.ndim > 2:
                # More than one parent: collapse the parent axes.
                cpd = cpd.reshape(cpd.shape[0], -1)
            nod_and_attributes.append(
                (str(node), {'cardinality': cardinality,
                             'cpd': cpd.tolist()}))
        return nod_and_attributes

    def translate_pgmpy_to_digraph(self):
        """Rebuild ``self.networkx_object`` from ``self.pgmpy_object``."""
        self.networkx_object = nx.DiGraph()
        edges = self.pgmpy_object.edges()
        nodes_and_attributes = self.get_nodes_and_attributes()
        self.networkx_object.add_nodes_from(nodes_and_attributes)
        self.networkx_object.add_edges_from(edges)

    def update_networkx(self, networkx, dic, header):
        """Replace the stored networkx graph, dictionary and header."""
        self.header = header
        self.dictionary = dic
        self.networkx_object = networkx

    def update_pgmpy(self, pgmpy, dic, header):
        """Replace the stored pgmpy model, dictionary and header."""
        self.header = header
        self.dictionary = dic
        self.pgmpy_object = pgmpy

    def save_pgmpy_network(self):
        """Save ``self.pgmpy_object`` in the JSON file."""
        self.translate_pgmpy_to_digraph()
        self.save_network()

    def translate_digraph_to_pgmpy(self, digraf):
        """Convert a networkx DiGraph into a pgmpy network.

        :param digraf: networkx graph whose nodes carry 'cardinality' and
            'cpd' attributes
        :return: the resulting pgmpy model (also stored on the instance)
        """
        self.pgmpy_object, _, _ = self.get_pgmpy_network(
            from_object=True, digraph=digraf)
        return self.pgmpy_object

    def save_network(self):
        """Save ``self.networkx_object`` in the JSON file.

        Node names are translated back to the user's names.
        ``self.header`` must be a dictionary: its 'Date' entry is updated.
        """
        data = self.get_empty_canvas()
        data["header"] = self.header
        nw_nodes = list(self.networkx_object.nodes(data=True))
        nw_edges = list(self.networkx_object.edges())
        # Which top-level family each keyword is filed under.
        family_of = {'BENS': 'RONS', 'MEMS': 'RONS', 'LANS': 'LANS',
                     'MOTOR': 'LENS', 'WORLD': 'LENS'}
        nodes = copy.deepcopy(self.standard_nodes)
        edges = []
        cpds = []

        # Edges: one {parent: [children]} entry per node that has children.
        for node_name, _ in nw_nodes:
            childs = [self.translation(edge[1], 1)
                      for edge in nw_edges if edge[0] == node_name]
            if childs:
                edges.append({self.translation(node_name, 1): childs})

        # Nodes and CPDs.
        for node_name, attributes in nw_nodes:
            cardinality = attributes['cardinality']
            cpd = attributes['cpd']
            for pseudonym in self.keywords:
                if pseudonym in node_name:
                    node_name_ = self.translation(node_name, 1)
                    nodes[family_of[pseudonym]][pseudonym].append(
                        [node_name_, cardinality])
            cpds.append({self.translation(node_name, 1): cpd})

        data['Nodes'] = nodes
        data['Edges'] = edges
        data['CPDs'] = cpds
        data['header']['Date'] = datetime.datetime.now().strftime("%c")
        self.save_json(json.dumps(data))

    def get_pgmpy_network(self, from_object=False, digraph=None):
        """Build ``self.pgmpy_object`` from the JSON file or from a DiGraph.

        :param from_object: when False the network is read from the JSON
            file through :meth:`get_network`; when True ``digraph`` is used
        :param digraph: networkx graph used when ``from_object`` is True
        :return: ``(pgmpy model, dictionary, header)``

        CAUTION: the method does not perform a check() on the constructed
        DAG; this has to be done in the calling module.
        """
        self.pgmpy_object = BayesianModel()
        if not from_object:
            # get_network() also refreshes self.dictionary and self.header.
            network, _, _ = self.get_network()
        else:
            network = digraph
        nw_nodes = list(network.nodes(data=True))
        nw_edges = list(network.edges())
        # Attribute lookup that works on every networkx version
        # (Graph.node was removed in networkx 2.4).
        attributes_of = dict(nw_nodes)

        # Nodes and edges.
        for node_name, _ in nw_nodes:
            self.pgmpy_object.add_node(node_name)
            for edge in nw_edges:
                if edge[0] == node_name:
                    self.pgmpy_object.add_edge(node_name, edge[1])

        # CPDs.
        for node_name, attributes in nw_nodes:
            parent_nodes = self.clean_parent_list(
                network.in_edges(node_name), node_name)
            cpd = attributes['cpd']
            cardinality_node = attributes['cardinality']
            cardinality_parents = [attributes_of[parent]['cardinality']
                                   for parent in parent_nodes]
            # A root node has no evidence and its values are wrapped in a
            # list; any other node passes its parents as evidence.
            if len(cardinality_parents) == 0:
                self.pgmpy_object.add_cpds(
                    TabularCPD(variable=node_name,
                               variable_card=cardinality_node,
                               values=[cpd]))
                continue
            table = TabularCPD(variable=node_name,
                               variable_card=cardinality_node,
                               values=cpd,
                               evidence=parent_nodes,
                               evidence_card=np.asarray(cardinality_parents))
            self.pgmpy_object.add_cpds(table)
        return self.pgmpy_object, self.dictionary, self.header

    def get_network(self):
        """Read the JSON file into ``self.networkx_object``.

        Nodes are renamed to a standardized ``<KEYWORD>_<index>`` form and
        ``self.dictionary`` is rebuilt as a list of
        ``(user name, standardized name)`` tuples.

        :return: ``(networkx graph, dictionary, header)``
        """
        self.dictionary = []
        self.networkx_object = nx.DiGraph()
        with open(self.file) as f:
            data = f.read()
        # Remove the layout characters added by save_json.
        data = data.replace('\n', '').replace('\t', '')
        data = json.loads(data)
        self.header = data['header']

        # Nodes.  Attributes are passed as keywords, which is accepted by
        # networkx 1.x as well as 2.x and later.
        for key in data['Nodes'].keys():
            for secondkey in data['Nodes'][key].keys():
                for c, n in enumerate(data['Nodes'][key][secondkey]):
                    node = secondkey + "_" + str(c)
                    self.networkx_object.add_node(
                        node, cardinality=n[1], cpd=[])
                    self.dictionary.append((n[0], node))

        # Edges.
        edges = []
        for pair in data['Edges']:
            for parent, children in pair.items():
                for child in children:
                    parent_ = self.translation(parent, 0)
                    child_ = self.translation(child, 0)
                    edges.append((parent_, child_))
        self.networkx_object.add_edges_from(edges)

        # CPDs, stored as node attributes.  The dictionaries obtained from
        # nodes(data=True) are the graph's own attribute dictionaries, so
        # assigning into them updates the graph.
        attributes_of = dict(self.networkx_object.nodes(data=True))
        for node in data['CPDs']:
            for parent, cpd in node.items():
                node_ = self.translation(parent, 0)
                attributes_of[node_]['cpd'] = cpd
        return self.networkx_object, self.dictionary, self.header

    def create_json_file(self, **kwargs):
        """EXAMPLE: write a hard-coded network to the JSON file.

        A helping method for users who prefer to create the BN within the
        code.  Only the header is taken from ``kwargs``:

        - ``description``: free text (default ``''``)
        - ``train_from``: file containing possible training data
          (default ``''``)
        - ``frozen``: whether the model can be considered final
          (default ``False``)

        NOTE(review): the nodes, edges and CPDs written are the example
        values below; the ``BENS``, ``MEMS``, ``LANS``, ``MOTORS``,
        ``WORLD``, ``Edges`` and ``CPDs`` keyword arguments are ignored,
        as they were overwritten before use in the previous version too.
        """
        description = kwargs.get('description', '')
        train_from = kwargs.get('train_from', '')
        frozen = kwargs.get('frozen', False)

        data = self.get_empty_canvas()
        # Identification, Date and Description are for tracking only.
        data["header"]['Identification'] = self.file
        data["header"]['Date'] = datetime.datetime.now().strftime("%c")
        data["header"]['Description'] = description
        data["header"]['Train_from'] = train_from
        data["header"]['Frozen'] = frozen

        # Node names (any valid python string) and their cardinality.
        bens = [['pooping', 2], ['peeing', 2], ['constipated', 2]]
        mems = [['havenotoiletpaper', 2]]
        lans = [['diarhea', 2], ['happypoop', 2]]
        motors = [['asshole1', 2], ['asshole2', 2]]
        world = [['toilet1', 2], ['toilet2', 2], ['garden1', 2],
                 ['garden2', 2], ['doctor', 2]]

        # Edges: the key is the parent, the list holds its children.
        # Leave this list empty to let peepo search for the best BN.
        edges = [
            {'pooping': ['toilet1', 'diarhea', 'happypoop']},
            {'peeing': ['toilet2', 'garden1', 'garden2']},
            {'constipated': ['doctor']},
            {'havenotoiletpaper': ['garden1', 'garden2']},
            {'diarhea': ['toilet1', 'doctor', 'asshole1', 'asshole2']},
            {'happypoop': ['garden1', 'garden2', 'asshole1', 'asshole2']},
        ]

        # CPDs: the key is the node the table belongs to.
        cpds = [
            {'pooping': [0.5, 0.5]},
            {'peeing': [0.2, 0.8]},
            {'constipated': [0.9, 0.1]},
            {'havenotoiletpaper': [0.6, 0.4]},
            {'happypoop': [[0.3, 0.8], [0.7, 0.2]]},
            {'diarhea': [[0.8, 0.3], [0.2, 0.7]]},
            {'toilet1': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]},
            {'asshole1': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]},
            {'asshole2': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]},
            {'toilet2': [[0.5, 0.5], [0.5, 0.5]]},
            {'doctor': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]},
            {'garden1': [[0.3, 0.8, 0.8, 0.7, 0.8, 0.2, 0.5, 0.5],
                         [0.7, 0.2, 0.2, 0.3, 0.2, 0.8, 0.5, 0.5]]},
            {'garden2': [[0.3, 0.8, 0.8, 0.7, 0.8, 0.2, 0.5, 0.5],
                         [0.7, 0.2, 0.2, 0.3, 0.2, 0.8, 0.5, 0.5]]},
        ]

        data["Nodes"]['RONS']['BENS'] = bens
        data["Nodes"]['RONS']['MEMS'] = mems
        data["Nodes"]['LANS']['LANS'] = lans
        data["Nodes"]['LENS']['MOTOR'] = motors
        data["Nodes"]['LENS']['WORLD'] = world
        data["Edges"] = edges
        data["CPDs"] = cpds

        self.save_json(json.dumps(data))
        print("Json file for - ", self.file, " - created")

    def create_json_template(self):
        """Write a placeholder ``Template.json`` into project_repository.

        A helping method if the JSON template has been deleted or
        corrupted.  Note that ``self.file`` is repointed to the template
        path as a side effect.
        """
        self.get_json_path("Template")
        data = self.get_empty_canvas()
        data['header']['Identification'] = self.file

        # Dummy entries that show the user the expected layout.
        a_node = ['*', 0]
        an_edge = {'*': ['&', '&', '&']}
        a_cpd = {'*': [[0, 0, 0], [0, 0, 0]]}
        nodes = [a_node] * 3
        edges = [an_edge] * 3
        cpds = [a_cpd] * 3

        data['Nodes']['RONS']['BENS'] = nodes
        data['Nodes']['RONS']['MEMS'] = nodes
        data['Nodes']['LANS']['LANS'] = nodes
        data['Nodes']['LENS']['MOTOR'] = nodes
        data['Nodes']['LENS']['WORLD'] = nodes
        data['Edges'] = edges
        data['CPDs'] = cpds

        self.save_json(json.dumps(data))
        print("Empty template created")

    def get_empty_canvas(self):
        """Return a fresh, empty skeleton of the JSON document.

        :return: dictionary with an empty 'header', the node families with
            empty lists, and empty 'Edges' and 'CPDs' lists
        """
        return {
            'header': {
                'Identification': '',
                'Date': '',
                'Description': '',
                'Frozen': '',
                'Train_from': '',
            },
            'Nodes': {
                'RONS': {'BENS': [], 'MEMS': []},
                'LANS': {'LANS': []},
                'LENS': {'MOTOR': [], 'WORLD': []},
            },
            'Edges': [],
            'CPDs': [],
        }
class TestBayesianModelFitPredict(unittest.TestCase):
    """Tests for ``BayesianModel.fit``, ``predict`` and ``predict_probability``."""

    def setUp(self):
        """Build the models and data frames shared by the tests."""
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
        # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
        # in NumPy 2.0, while ``np.nan`` exists in every release.
        self.data2 = pd.DataFrame(data={'A': [0, np.nan, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.nan],
                                        'D': [np.nan, 'Y', np.nan]})

        # data_link - "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv', dtype=str)
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]

    def test_bayesian_fit(self):
        """Fit with a Dirichlet prior and compare the CPD learned for 'B'."""
        self.model2.fit(self.data1, estimator=BayesianEstimator,
                        prior_type="dirichlet", pseudo_counts=[9, 3])
        self.assertEqual(self.model2.get_cpds('B'),
                         TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        """Fit on data containing NaNs with ``complete_samples_only=False``."""
        self.model2.fit(self.data2, state_names={'C': [0, 1]}, complete_samples_only=False)
        cpds = {TabularCPD('A', 2, [[0.5], [0.5]]),
                TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
                TabularCPD('C', 2, [[0, 0.5, 0.5, 0.5], [1, 0.5, 0.5, 0.5]],
                           evidence=['A', 'B'], evidence_card=[2, 2])}
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        """Each node of an edgeless model gets the empirical marginal as CPD."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # ``.loc`` replaces the ``.ix`` indexer, which pandas removed in 1.0.
            counts = values.loc[:, node].value_counts()
            value = counts / counts.sum()
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_predict(self):
        """Predict each titanic column in turn from the other two."""
        titanic = BayesianModel()
        titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        titanic.fit(self.titanic_data2[500:])

        p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])

        p1_res = np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
                           '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
                           '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'])
        p2_res = np.array(['male', 'female', 'female', 'female', 'male', 'male',
                           'male', 'male', 'female', 'female', 'female', 'female',
                           'male', 'male', 'male', 'female', 'male', 'female',
                           'male', 'female', 'male', 'female', 'female', 'female',
                           'male', 'female', 'male', 'male', 'female', 'male'])
        p3_res = np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
                           '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
                           '3', '1', '1', '1', '3', '1', '3', '3', '1', '3'])

        np_test.assert_array_equal(p1.values.ravel(), p1_res)
        np_test.assert_array_equal(p2.values.ravel(), p2_res)
        np_test.assert_array_equal(p3.values.ravel(), p3_res)

    def test_connected_predict(self):
        """``predict`` rejects data that has every column, then predicts 'E'."""
        np.random.seed(42)
        values = pd.DataFrame(np.array(np.random.randint(low=0, high=2, size=(1000, 5)), dtype=str),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        # With all five columns present the call is expected to raise.
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                      0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                      0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                      0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                      1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                      1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                      1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                      1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                      1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0],
                     dtype=str))

    def test_connected_predict_probability(self):
        """Compare the predicted distribution of 'E' against fixed values."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:80]
        predict_data = values[80:].copy()
        self.model_connected.fit(fit_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_prob = self.model_connected.predict_probability(predict_data)
        np_test.assert_allclose(
            e_prob.values.ravel(),
            np.array([0.57894737, 0.42105263,
                      0.57894737, 0.42105263,
                      0.57894737, 0.42105263,
                      0.5, 0.5,
                      0.57894737, 0.42105263,
                      0.5, 0.5,
                      0.57894737, 0.42105263,
                      0.57894737, 0.42105263,
                      0.57894737, 0.42105263,
                      0.5, 0.5,
                      0.57894737, 0.42105263,
                      0.57894737, 0.42105263,
                      0.5, 0.5,
                      0.57894737, 0.42105263,
                      0.57894737, 0.42105263,
                      0.5, 0.5,
                      0.57894737, 0.42105263,
                      0.5, 0.5,
                      0.5, 0.5,
                      0.5, 0.5]),
            atol=0)

    def test_predict_probability_errors(self):
        """``predict_probability`` raises ValueError for the two inputs below."""
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(2, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:1]
        predict_data = values[1:].copy()
        self.model_connected.fit(fit_data)
        # First case: every model column is present in the input.
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)
        # Second case: the input carries a column 'F' that is not a model node.
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1, 5)),
                                    columns=['A', 'B', 'C', 'F', 'E'])[:]
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def main():
    """Learn, fit and evaluate a two-copy feature network joined by 'same'.

    Steps, as performed below:

    1. Load the train/test split through ``PGM_t.load_features``.
    2. Run a hill-climb structure search over columns 1..9 of
       ``andPGM.img_features``, starting from a hand-wired graph.
    3. Clone the learned edges, replacing the last character of every node
       name with ``'b'``.
    4. Merge both edge sets and attach ``f5_a``, ``f9_a``, ``f5_b`` and
       ``f9_b`` as parents of the node ``'same'``.
    5. Fit the merged model on the training set and print its CPDs.
    6. Print the ``chk_accuracy`` result separately for the rows with
       ``same == 0`` and the rows with ``same == 1``.
    """
    andPGM = PGM_t()
    print('loading features..')
    train_set, test_set = andPGM.load_features()
    print('loading features.. Done')

    # Bayesian network of 19 nodes, 9*2 variables of network given
    # Initial incomplete Bayesian model connected manually based on intuition
    print('Generating model.. ')
    initialModel = BayesianModel({})
    initialModel.add_nodes_from(andPGM.img_features.columns[1:10].tolist())
    initialModel.add_edges_from([('f6_a', 'f2_a'),
                                 ('f3_a', 'f4_a'),
                                 ('f5_a', 'f9_a'),
                                 ('f4_a', 'f7_a')])

    # Use hill climb search algorithm to find network structure of initial 9 nodes
    features_9 = andPGM.img_features.iloc[0:, 1:10]
    hc = HillClimbSearch(
        data=features_9,
        scoring_method=BdeuScore(
            features_9,
            equivalent_sample_size=0.1 * len(andPGM.img_features)),
        state_names=andPGM.states_9)

    # Get best estimated structure
    best_model = hc.estimate(start=initialModel)
    # Edges in the acquired graph
    print('model of 9 var: ', best_model.edges())

    # Create a Clone of generated Bayesian network structure
    clone_model = BayesianModel({})
    for edge in best_model.edges():
        new_edge = [edge[0][:-1] + 'b', edge[1][:-1] + 'b']
        clone_model.add_edges_from([new_edge])

    # Join together the Original and clone network through node 'same'.
    # ``edges()`` returns a view on NetworkX >= 2, and views do not support
    # ``+``; converting to lists works on both 1.x and 2.x.
    multinetModel = BayesianModel({})
    multinetModel.add_edges_from(list(best_model.edges()) + list(clone_model.edges()))
    multinetModel.add_node('same')
    multinetModel.add_edge('f5_a', 'same')
    multinetModel.add_edge('f9_a', 'same')
    multinetModel.add_edge('f5_b', 'same')
    multinetModel.add_edge('f9_b', 'same')
    print('Generating model.. Done')
    # Edges in the final structure
    print('Final model: ', multinetModel.edges())

    print('Fit data into model..')
    # fit the data to model to generate CPDs using maximum likelihood estimation
    multinetModel.fit(data=train_set, state_names=andPGM.states_all)
    print('Fit data into model.. Done')
    print('CPDs generated: ')
    cpds = multinetModel.get_cpds()
    for cpd in cpds:
        print(cpd)

    # Inference using Variable Elimination
    print('Start inference..')
    inference = VariableElimination(multinetModel)
    train_set_same = train_set[train_set['same'] == 0]
    train_set_not_same = train_set[train_set['same'] == 1]

    # Accuracy of positive inferences
    acc_same = andPGM.chk_accuracy(
        train_set_same,
        inference,
        variables=train_set_same.columns[0:9].tolist(),
        evidence=train_set_same.columns[9:19].tolist())
    print('accuracy of positives ', acc_same)

    # Accuracy of negative inferences
    acc_nt_same = andPGM.chk_accuracy(
        train_set_not_same,
        inference,
        variables=train_set_not_same.columns[0:9].tolist(),
        evidence=train_set_not_same.columns[9:19].tolist())
    print('accuracy of negatives', acc_nt_same)
import numpy as np
import pandas as pd

from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel

# Draw 100 random samples of two binary (0/1) variables.
raw_data = np.random.randint(0, 2, size=(100, 2))
print(raw_data)

data = pd.DataFrame(raw_data, columns=['X', 'Y'])
print(data)

# Two-coin model in which the tosses are assumed dependent: X -> Y.
coin_model = BayesianModel([('X', 'Y')])
coin_model.fit(data, estimator=MaximumLikelihoodEstimator)

# Show the CPD that was estimated for X.
cpd_x = coin_model.get_cpds('X')
print(cpd_x)
def bayesian_net():
    """Query a five-node musicianship network before and after re-learning it.

    The first half defines the network with hand-written CPDs and prints
    ``Letter`` queries from ``SimpleInference`` and ``VariableElimination``.
    The second half draws 2000 likelihood-weighted samples from that network,
    fits a second network with the same edges on the samples, prints its
    CPDs and repeats the ``VariableElimination`` queries.
    """
    thick_rule = '------------------------'
    thin_rule = '--------------------'
    all_observed = {'Difficulty': 0, 'Musicianship': 1, 'Rating': 1, 'Exam': 1}
    all_observed_title = (
        ' QUERY Letter with evidence Difficulty: 0, Musicianship: 1, Rating: 1, Exam:1 NORMALIZED'
    )

    def banner(title, rule=thin_rule):
        # A title line framed by a rule above and below.
        for text in (rule, title, rule):
            print(text)

    def show_letter(engine, **query_kwargs):
        # Print the 'Letter' entry of a query result.
        print(engine.query(['Letter'], **query_kwargs)['Letter'])

    def show_partial_queries(engine):
        # The two queries that follow the fully-observed one in both halves.
        banner(' QUERY Letter with no evidence')
        show_letter(engine)
        banner(' QUERY Letter with evidence Musicianship: 0 NORMALIZED')
        show_letter(engine, evidence={'Musicianship': 0})

    # ---- hand-specified network -------------------------------------------
    model = BayesianModel([('Difficulty', 'Rating'),
                           ('Musicianship', 'Rating'),
                           ('Musicianship', 'Exam'),
                           ('Rating', 'Letter')])

    # 0->Low, 1->High
    cpd_diff = TabularCPD(variable='Difficulty', variable_card=2,
                          values=[[0.6], [0.4]])
    # 0->Weak 1->Strong
    cpd_music = TabularCPD(variable='Musicianship', variable_card=2,
                           values=[[0.7], [0.3]])
    # 0->* 1->** 2-->***
    cpd_rating = TabularCPD(variable='Rating', variable_card=3,
                            values=[[0.3, 0.05, 0.9, 0.5],
                                    [0.4, 0.25, 0.08, 0.3],
                                    [0.3, 0.7, 0.02, 0.2]],
                            evidence=['Difficulty', 'Musicianship'],
                            evidence_card=[2, 2])
    # 0-->Low 1-->High
    cpd_exam = TabularCPD(variable='Exam', variable_card=2,
                          values=[[0.95, 0.2], [0.05, 0.8]],
                          evidence=['Musicianship'], evidence_card=[2])
    # 0-->Weak 1-->Strong
    cpd_letter = TabularCPD(variable='Letter', variable_card=2,
                            values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]],
                            evidence=['Rating'], evidence_card=[3])

    model.add_cpds(cpd_diff, cpd_music, cpd_rating, cpd_exam, cpd_letter)
    model.check_model()

    # ---- exact inference --------------------------------------------------
    engine = SimpleInference(model)  # query without normalization
    banner(' EXACT INFERENCE', thick_rule)
    banner(' QUERY Letter with evidence Difficulty: 0, Musicianship: 1, Rating: 1, Exam:1 NOT NORMALIZED')
    # SimpleInference is handed the evidence as a set of (variable, state) pairs.
    print(engine.query(['Letter'], evidence=set(all_observed.items())))

    banner(all_observed_title)
    engine = VariableElimination(model)  # query normalized
    show_letter(engine, evidence=dict(all_observed))
    show_partial_queries(engine)

    # ---- re-learn the network from samples --------------------------------
    sampler = BayesianModelSampling(model)
    samples = sampler.likelihood_weighted_sample(evidence={}, size=2000,
                                                 return_type='dataframe')
    relearned = BayesianModel([('Difficulty', 'Rating'),
                               ('Musicianship', 'Rating'),
                               ('Rating', 'Letter'),
                               ('Musicianship', 'Exam')])
    relearned.fit(samples, estimator=BayesianEstimator)
    relearned.check_model()
    engine = VariableElimination(relearned)  # query normalized

    for learned_cpd in relearned.get_cpds():
        print("CPD of {variable}:".format(variable=learned_cpd.variable))
        print(learned_cpd)

    # ---- approximate inference --------------------------------------------
    banner(' APPROXIMATE INFERENCE', thick_rule)
    banner(all_observed_title)
    show_letter(engine, evidence=dict(all_observed))
    show_partial_queries(engine)
class TestBayesianModelCPD(unittest.TestCase):
    """Active-trail and CPD add/lookup tests on the d, i, g, l, s network."""

    def setUp(self):
        """Create the graph d -> g <- i, g -> l, i -> s."""
        self.G = BayesianModel([('d', 'g'), ('i', 'g'), ('g', 'l'), ('i', 's')])

    def test_active_trail_nodes(self):
        """Nodes reachable by an active trail when nothing is observed."""
        self.assertEqual(sorted(self.G.active_trail_nodes('d')), ['d', 'g', 'l'])
        self.assertEqual(sorted(self.G.active_trail_nodes('i')), ['g', 'i', 'l', 's'])

    def test_active_trail_nodes_args(self):
        """Reachable nodes with one observed node or a list of them."""
        self.assertEqual(sorted(self.G.active_trail_nodes('d', observed='g')),
                         ['d', 'i', 's'])
        self.assertEqual(sorted(self.G.active_trail_nodes('l', observed='g')),
                         ['l'])
        self.assertEqual(sorted(self.G.active_trail_nodes('s', observed=['i', 'l'])),
                         ['s'])
        self.assertEqual(sorted(self.G.active_trail_nodes('s', observed=['d', 'l'])),
                         ['g', 'i', 's'])

    def test_is_active_trail_triplets(self):
        """Pairwise active-trail checks, with and without an observed node."""
        self.assertTrue(self.G.is_active_trail('d', 'l'))
        self.assertTrue(self.G.is_active_trail('g', 's'))
        self.assertFalse(self.G.is_active_trail('d', 'i'))
        self.assertTrue(self.G.is_active_trail('d', 'i', observed='g'))
        self.assertFalse(self.G.is_active_trail('d', 'l', observed='g'))
        self.assertFalse(self.G.is_active_trail('i', 'l', observed='g'))
        self.assertTrue(self.G.is_active_trail('d', 'i', observed='l'))
        self.assertFalse(self.G.is_active_trail('g', 's', observed='i'))

    def test_is_active_trail(self):
        """Active-trail checks between nodes that are further apart."""
        self.assertFalse(self.G.is_active_trail('d', 's'))
        self.assertTrue(self.G.is_active_trail('s', 'l'))
        self.assertTrue(self.G.is_active_trail('d', 's', observed='g'))
        self.assertFalse(self.G.is_active_trail('s', 'l', observed='g'))

    def test_is_active_trail_args(self):
        """Same checks with the observed nodes passed positionally."""
        self.assertFalse(self.G.is_active_trail('s', 'l', 'i'))
        self.assertFalse(self.G.is_active_trail('s', 'l', 'g'))
        self.assertTrue(self.G.is_active_trail('d', 's', 'l'))
        self.assertFalse(self.G.is_active_trail('d', 's', ['i', 'l']))

    def test_get_cpds(self):
        """``get_cpds(node)`` returns the CPD for that node.

        The tables are random; only the ``variable`` of the returned CPD is
        asserted.
        """
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

        self.assertEqual(self.G.get_cpds('d').variable, 'd')

    def test_get_cpds1(self):
        """Lookup works when one node name ('A') is a prefix of another ('AB')."""
        self.model = BayesianModel([('A', 'AB')])
        cpd_a = TabularCPD('A', 2, np.random.rand(2, 1))
        cpd_ab = TabularCPD('AB', 2, np.random.rand(2, 2),
                            evidence=['A'], evidence_card=[2])

        self.model.add_cpds(cpd_a, cpd_ab)
        self.assertEqual(self.model.get_cpds('A').variable, 'A')
        self.assertEqual(self.model.get_cpds('AB').variable, 'AB')

    def test_add_single_cpd(self):
        """A single added CPD is the only one ``get_cpds()`` returns."""
        # TabularCPD is taken from module scope, like in the sibling tests;
        # the former function-local ``from pgmpy.factors import TabularCPD``
        # was redundant and that import path is not available in newer pgmpy.
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_s)
        self.assertListEqual(self.G.get_cpds(), [cpd_s])

    def test_add_multiple_cpds(self):
        """Every CPD added in one call can be fetched back by node name."""
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)

        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)
        self.assertEqual(self.G.get_cpds('d'), cpd_d)
        self.assertEqual(self.G.get_cpds('i'), cpd_i)
        self.assertEqual(self.G.get_cpds('g'), cpd_g)
        self.assertEqual(self.G.get_cpds('l'), cpd_l)
        self.assertEqual(self.G.get_cpds('s'), cpd_s)

    def tearDown(self):
        del self.G
'''
Created on Oct 27, 2017

@author: Adele
'''
import numpy as np
import pandas
from pgmpy.models import BayesianModel

# Keep only the three columns the model uses.
data = pandas.read_csv("kaggle.csv")
data2 = data[["Survived", "Sex", "Pclass"]]
# Alternative numeric encoding, currently disabled:
#data2 = data[["Survived", "Sex", "Pclass"]].replace(["female", "male"], [0, 1]).replace({"Pclass": {3: 0}})

# Random row-wise split: each row goes to the training set with probability
# 0.8. No seed is set, so the split differs between runs.
intrain = np.random.rand(len(data2)) < 0.8
dtrain = data2[intrain]
dtest = data2[~intrain]
##print(len(dtrain), len(dtest))

# "Survived" has the two parents "Sex" and "Pclass".
titanic = BayesianModel()
titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
titanic.fit(dtrain)
for cpd in titanic.get_cpds():
    print(cpd)

# Predict the missing "Survived" column for the held-out rows.
# NOTE(review): the prediction result is not stored or printed.
print(dtest[["Sex", "Pclass"]])
titanic.predict(dtest[["Sex", "Pclass"]])
class TestBayesianModelCPD(unittest.TestCase):
    """Active-trail, CPD bookkeeping and ``check_model`` tests on one graph."""

    def setUp(self):
        # d -> g <- i, g -> l, i -> s
        edges = [('d', 'g'), ('i', 'g'), ('g', 'l'), ('i', 's')]
        self.G = BayesianModel(edges)

    # -- helpers ------------------------------------------------------------

    def _reachable(self, start, **kwargs):
        """Sorted result of ``active_trail_nodes`` for ``start``."""
        return sorted(self.G.active_trail_nodes(start, **kwargs))

    def _expect_trail(self, expected, *args, **kwargs):
        """Assert that ``is_active_trail(*args, **kwargs)`` is truthy/falsy."""
        check = self.assertTrue if expected else self.assertFalse
        check(self.G.is_active_trail(*args, **kwargs))

    def _assert_rejected(self, cpd):
        """Add ``cpd``, expect ``check_model`` to raise, then remove it again."""
        self.G.add_cpds(cpd)
        with self.assertRaises(ValueError):
            self.G.check_model()
        self.G.remove_cpds(cpd)

    # -- active trails ------------------------------------------------------

    def test_active_trail_nodes(self):
        self.assertEqual(self._reachable('d'), ['d', 'g', 'l'])
        self.assertEqual(self._reachable('i'), ['g', 'i', 'l', 's'])

    def test_active_trail_nodes_args(self):
        cases = [
            ('d', 'g', ['d', 'i', 's']),
            ('l', 'g', ['l']),
            ('s', ['i', 'l'], ['s']),
            ('s', ['d', 'l'], ['g', 'i', 's']),
        ]
        for start, observed, expected in cases:
            self.assertEqual(self._reachable(start, observed=observed), expected)

    def test_is_active_trail_triplets(self):
        self._expect_trail(True, 'd', 'l')
        self._expect_trail(True, 'g', 's')
        self._expect_trail(False, 'd', 'i')
        self._expect_trail(True, 'd', 'i', observed='g')
        self._expect_trail(False, 'd', 'l', observed='g')
        self._expect_trail(False, 'i', 'l', observed='g')
        self._expect_trail(True, 'd', 'i', observed='l')
        self._expect_trail(False, 'g', 's', observed='i')

    def test_is_active_trail(self):
        self._expect_trail(False, 'd', 's')
        self._expect_trail(True, 's', 'l')
        self._expect_trail(True, 'd', 's', observed='g')
        self._expect_trail(False, 's', 'l', observed='g')

    def test_is_active_trail_args(self):
        # Observed nodes are passed positionally here.
        self._expect_trail(False, 's', 'l', 'i')
        self._expect_trail(False, 's', 'l', 'g')
        self._expect_trail(True, 'd', 's', 'l')
        self._expect_trail(False, 'd', 's', ['i', 'l'])

    # -- adding and fetching CPDs -------------------------------------------

    def test_get_cpds(self):
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

        fetched = self.G.get_cpds('d')
        self.assertEqual(fetched.variable, 'd')

    def test_get_cpds1(self):
        # Node name 'A' is a prefix of node name 'AB'.
        self.model = BayesianModel([('A', 'AB')])
        cpd_a = TabularCPD('A', 2, np.random.rand(2, 1))
        cpd_ab = TabularCPD('AB', 2, np.random.rand(2, 2),
                            evidence=['A'], evidence_card=[2])
        self.model.add_cpds(cpd_a, cpd_ab)

        for name in ('A', 'AB'):
            self.assertEqual(self.model.get_cpds(name).variable, name)

    def test_add_single_cpd(self):
        only_cpd = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(only_cpd)
        self.assertListEqual(self.G.get_cpds(), [only_cpd])

    def test_add_multiple_cpds(self):
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

        added = [('d', cpd_d), ('i', cpd_i), ('g', cpd_g), ('l', cpd_l), ('s', cpd_s)]
        for name, cpd in added:
            self.assertEqual(self.G.get_cpds(name), cpd)

    # -- check_model --------------------------------------------------------

    def test_check_model(self):
        cpd_g = TabularCPD('g', 2,
                           np.array([[0.2, 0.3, 0.4, 0.6],
                                     [0.8, 0.7, 0.6, 0.4]]),
                           ['d', 'i'], [2, 2])
        cpd_s = TabularCPD('s', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['i'], 2)
        cpd_l = TabularCPD('l', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['g'], 2)
        self.G.add_cpds(cpd_g, cpd_s, cpd_l)

        self.assertTrue(self.G.check_model())

    def test_check_model1(self):
        # Each CPD lists evidence that differs from the node's parents in the graph.
        self._assert_rejected(
            TabularCPD('g', 2,
                       np.array([[0.2, 0.3],
                                 [0.8, 0.7]]),
                       ['i'], 2))
        self._assert_rejected(
            TabularCPD('g', 2,
                       np.array([[0.2, 0.3, 0.4, 0.6],
                                 [0.8, 0.7, 0.6, 0.4]]),
                       ['d', 's'], [2, 2]))
        self._assert_rejected(
            TabularCPD('g', 2,
                       np.array([[0.2, 0.3],
                                 [0.8, 0.7]]),
                       ['l'], 2))
        self._assert_rejected(
            TabularCPD('l', 2,
                       np.array([[0.2, 0.3],
                                 [0.8, 0.7]]),
                       ['d'], 2))
        self._assert_rejected(
            TabularCPD('l', 2,
                       np.array([[0.2, 0.3, 0.4, 0.6],
                                 [0.8, 0.7, 0.6, 0.4]]),
                       ['d', 'i'], [2, 2]))
        self._assert_rejected(
            TabularCPD('l', 2,
                       np.array([[0.2, 0.3, 0.4, 0.6, 0.2, 0.3, 0.4, 0.6],
                                 [0.8, 0.7, 0.6, 0.4, 0.8, 0.7, 0.6, 0.4]]),
                       ['g', 'd', 'i'], [2, 2, 2]))

    def test_check_model2(self):
        # Each table has at least one column that does not sum to 1.
        self._assert_rejected(
            TabularCPD('s', 2,
                       np.array([[0.5, 0.3],
                                 [0.8, 0.7]]),
                       ['i'], 2))
        self._assert_rejected(
            TabularCPD('g', 2,
                       np.array([[0.2, 0.3, 0.4, 0.6],
                                 [0.3, 0.7, 0.6, 0.4]]),
                       ['d', 'i'], [2, 2]))
        self._assert_rejected(
            TabularCPD('l', 2,
                       np.array([[0.2, 0.3],
                                 [0.1, 0.7]]),
                       ['g'], 2))

    def tearDown(self):
        del self.G
import numpy as np
import pandas as pd

from pgmpy.estimators import BayesianEstimator
from pgmpy.models import BayesianModel

# Random data for the two-coin example: 1000 samples of two binary variables.
raw_data = np.random.randint(0, 2, size=(1000, 2))
data = pd.DataFrame(raw_data, columns=['X', 'Y'])
print(data)

# The model is created without any nodes or edges before being fitted.
coin_model = BayesianModel()
coin_model.fit(data, estimator=BayesianEstimator)

# Inspect the fitted model; the three results below are not stored or printed.
coin_model.get_cpds()
coin_model.nodes()
coin_model.edges()
# In most machine-learning problems it does not matter which column of the
# array holds which variable, as long as training and prediction use the
# same order. In a graphical model every variable plays a different role
# (it is connected to the others in its own way), so each column of the
# data has to be tied to a variable name. pandas is used for that.
import pandas as pd

data = pd.DataFrame(
    data,
    columns=['cost', 'quality', 'location', 'no_of_people'],
)
data

# The first 750 rows are the training set.
train = data[:750]

# The model will predict no_of_people, so that column is removed from the
# remaining rows and predicted later.
test = data[750:].drop('no_of_people', axis=1)
test

# Base network structure of the model.
restaurant_model = BayesianModel([
    ('location', 'cost'),
    ('quality', 'cost'),
    ('location', 'no_of_people'),
    ('cost', 'no_of_people'),
])

# fit computes the CPDs of all variables from the training data.
restaurant_model.fit(train)
restaurant_model.get_cpds()

# Calling predict on the test data yields the values of no_of_people.
restaurant_model.predict(test).values.ravel()