def test_get_independencies(self): chain = BayesianModel([('X', 'Y'), ('Y', 'Z')]) self.assertEqual(chain.get_independencies(), Independencies(('X', 'Z', 'Y'), ('Z', 'X', 'Y'))) fork = BayesianModel([('Y', 'X'), ('Y', 'Z')]) self.assertEqual(fork.get_independencies(), Independencies(('X', 'Z', 'Y'), ('Z', 'X', 'Y'))) collider = BayesianModel([('X', 'Y'), ('Z', 'Y')]) self.assertEqual(collider.get_independencies(), Independencies(('X', 'Z'), ('Z', 'X')))
def test_estimate_from_independencies(self): ind = Independencies(['B', 'C'], ['A', ['B', 'C'], 'D']) ind = ind.closure() model = ConstraintBasedEstimator.estimate_from_independencies("ABCD", ind) self.assertSetEqual(set(model.edges()), set([('B', 'D'), ('A', 'D'), ('C', 'D')])) model1 = BayesianModel([('A', 'C'), ('B', 'C'), ('B', 'D'), ('C', 'E')]) model2 = ConstraintBasedEstimator.estimate_from_independencies( model1.nodes(), model1.get_independencies()) self.assertTrue(set(model2.edges()) == set(model1.edges()) or set(model2.edges()) == set([('B', 'C'), ('A', 'C'), ('C', 'E'), ('D', 'B')]))
def test_build_skeleton(self): ind = Independencies(['B', 'C'], ['A', ['B', 'C'], 'D']) ind = ind.closure() skel1, sep_sets1 = ConstraintBasedEstimator.build_skeleton("ABCD", ind) self.assertTrue(self._edge_list_equal(skel1.edges(), [('A', 'D'), ('B', 'D'), ('C', 'D')])) sep_sets_ref1 = {frozenset({'A', 'C'}): (), frozenset({'A', 'B'}): (), frozenset({'C', 'B'}): ()} self.assertEqual(sep_sets1, sep_sets_ref1) model = BayesianModel([('A', 'C'), ('B', 'C'), ('B', 'D'), ('C', 'E')]) skel2, sep_sets2 = ConstraintBasedEstimator.build_skeleton(model.nodes(), model.get_independencies()) self.assertTrue(self._edge_list_equal(skel2, [('D', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'E')])) sep_sets_ref2 = {frozenset({'D', 'C'}): ('B',), frozenset({'E', 'B'}): ('C',), frozenset({'A', 'D'}): (), frozenset({'E', 'D'}): ('C',), frozenset({'E', 'A'}): ('C',), frozenset({'A', 'B'}): ()} # witnesses/seperators might change on each run, so we cannot compare directly self.assertEqual(sep_sets2.keys(), sep_sets_ref2.keys()) self.assertEqual([len(v) for v in sorted(sep_sets2.values())], [len(v) for v in sorted(sep_sets_ref2.values())])
def bayesnet(): """ References: https://class.coursera.org/pgm-003/lecture/17 http://www.cs.ubc.ca/~murphyk/Bayes/bnintro.html http://www3.cs.stonybrook.edu/~sael/teaching/cse537/Slides/chapter14d_BP.pdf http://www.cse.unsw.edu.au/~cs9417ml/Bayes/Pages/PearlPropagation.html https://github.com/pgmpy/pgmpy.git http://pgmpy.readthedocs.org/en/latest/ http://nipy.bic.berkeley.edu:5000/download/11 """ # import operator as op # # Enumerate all possible events # varcard_list = list(map(op.attrgetter('variable_card'), cpd_list)) # _esdat = list(ut.iprod(*map(range, varcard_list))) # _escol = list(map(op.attrgetter('variable'), cpd_list)) # event_space = pd.DataFrame(_esdat, columns=_escol) # # Custom compression of event space to inspect a specific graph # def compress_space_flags(event_space, var1, var2, var3, cmp12_): # """ # var1, var2, cmp_ = 'Lj', 'Lk', op.eq # """ # import vtool as vt # data = event_space # other_cols = ut.setdiff_ordered(data.columns.tolist(), [var1, var2, var3]) # case_flags12 = cmp12_(data[var1], data[var2]).values # # case_flags23 = cmp23_(data[var2], data[var3]).values # # case_flags = np.logical_and(case_flags12, case_flags23) # case_flags = case_flags12 # case_flags = case_flags.astype(np.int64) # subspace = np.hstack((case_flags[:, None], data[other_cols].values)) # sel_ = vt.unique_row_indexes(subspace) # flags = np.logical_and(mask, case_flags) # return flags # # Build special cases # case_same = event_space.loc[compress_space_flags(event_space, 'Li', 'Lj', 'Lk', op.eq)] # case_diff = event_space.loc[compress_space_flags(event_space, 'Li', 'Lj', 'Lk', op.ne)] # special_cases = [ # case_same, # case_diff, # ] from pgmpy.factors import TabularCPD from pgmpy.models import BayesianModel import pandas as pd from pgmpy.inference import BeliefPropagation # NOQA from pgmpy.inference import VariableElimination # NOQA name_nice = ['n1', 'n2', 'n3'] score_nice = ['low', 'high'] match_nice = ['diff', 'same'] num_names = len(name_nice) num_scores = len(score_nice) nid_basis = list(range(num_names)) score_basis = list(range(num_scores)) semtype2_nice = { 'score': score_nice, 'name': name_nice, 'match': match_nice, } var2_cpd = { } globals()['semtype2_nice'] = semtype2_nice globals()['var2_cpd'] = var2_cpd name_combo = np.array(list(ut.iprod(nid_basis, nid_basis))) combo_is_same = name_combo.T[0] == name_combo.T[1] def get_expected_scores_prob(level1, level2): part1 = combo_is_same * level1 part2 = (1 - combo_is_same) * (1 - (level2)) expected_scores_level = part1 + part2 return expected_scores_level # def make_cpd(): def name_cpd(aid): from pgmpy.factors import TabularCPD cpd = TabularCPD( variable='N' + aid, variable_card=num_names, values=[[1.0 / num_names] * num_names]) cpd.semtype = 'name' return cpd name_cpds = [name_cpd('i'), name_cpd('j'), name_cpd('k')] var2_cpd.update(dict(zip([cpd.variable for cpd in name_cpds], name_cpds))) if True: num_same_diff = 2 samediff_measure = np.array([ # get_expected_scores_prob(.12, .2), # get_expected_scores_prob(.88, .8), get_expected_scores_prob(0, 0), get_expected_scores_prob(1, 1), ]) samediff_vals = (samediff_measure / samediff_measure.sum(axis=0)).tolist() def samediff_cpd(aid1, aid2): cpd = TabularCPD( variable='A' + aid1 + aid2, variable_card=num_same_diff, values=samediff_vals, evidence=['N' + aid1, 'N' + aid2], # [::-1], evidence_card=[num_names, num_names]) # [::-1]) cpd.semtype = 'match' return cpd samediff_cpds = [samediff_cpd('i', 'j'), samediff_cpd('j', 'k'), samediff_cpd('k', 'i')] var2_cpd.update(dict(zip([cpd.variable for cpd in samediff_cpds], samediff_cpds))) if True: def score_cpd(aid1, aid2): semtype = 'score' evidence = ['A' + aid1 + aid2, 'N' + aid1, 'N' + aid2] evidence_cpds = [var2_cpd[key] for key in evidence] evidence_nice = [semtype2_nice[cpd.semtype] for cpd in evidence_cpds] evidence_card = list(map(len, evidence_nice)) evidence_states = list(ut.iprod(*evidence_nice)) variable_basis = semtype2_nice[semtype] variable_values = [] for mystate in variable_basis: row = [] for state in evidence_states: if state[0] == state[1]: if state[2] == 'same': val = .2 if mystate == 'low' else .8 else: val = 1 # val = .5 if mystate == 'low' else .5 elif state[0] != state[1]: if state[2] == 'same': val = .5 if mystate == 'low' else .5 else: val = 1 # val = .9 if mystate == 'low' else .1 row.append(val) variable_values.append(row) cpd = TabularCPD( variable='S' + aid1 + aid2, variable_card=len(variable_basis), values=variable_values, evidence=evidence, # [::-1], evidence_card=evidence_card) # [::-1]) cpd.semtype = semtype return cpd else: score_values = [ [.8, .1], [.2, .9], ] def score_cpd(aid1, aid2): cpd = TabularCPD( variable='S' + aid1 + aid2, variable_card=num_scores, values=score_values, evidence=['A' + aid1 + aid2], # [::-1], evidence_card=[num_same_diff]) # [::-1]) cpd.semtype = 'score' return cpd score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')] cpd_list = name_cpds + score_cpds + samediff_cpds else: score_measure = np.array([get_expected_scores_prob(level1, level2) for level1, level2 in zip(np.linspace(.1, .9, num_scores), np.linspace(.2, .8, num_scores))]) score_values = (score_measure / score_measure.sum(axis=0)).tolist() def score_cpd(aid1, aid2): cpd = TabularCPD( variable='S' + aid1 + aid2, variable_card=num_scores, values=score_values, evidence=['N' + aid1, 'N' + aid2], evidence_card=[num_names, num_names]) cpd.semtype = 'score' return cpd score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')] cpd_list = name_cpds + score_cpds pass input_graph = [] for cpd in cpd_list: if cpd.evidence is not None: for evar in cpd.evidence: input_graph.append((evar, cpd.variable)) name_model = BayesianModel(input_graph) name_model.add_cpds(*cpd_list) var2_cpd.update(dict(zip([cpd.variable for cpd in cpd_list], cpd_list))) globals()['var2_cpd'] = var2_cpd varnames = [cpd.variable for cpd in cpd_list] # --- PRINT CPDS --- cpd = score_cpds[0] def print_cpd(cpd): print('CPT: %r' % (cpd,)) index = semtype2_nice[cpd.semtype] if cpd.evidence is None: columns = ['None'] else: basis_lists = [semtype2_nice[var2_cpd[ename].semtype] for ename in cpd.evidence] columns = [','.join(x) for x in ut.iprod(*basis_lists)] data = cpd.get_cpd() print(pd.DataFrame(data, index=index, columns=columns)) for cpd in name_model.get_cpds(): print('----') print(cpd._str('phi')) print_cpd(cpd) # --- INFERENCE --- Ni = name_cpds[0] event_space_combos = {} event_space_combos[Ni.variable] = 0 # Set ni to always be Fred for cpd in cpd_list: if cpd.semtype == 'score': event_space_combos[cpd.variable] = list(range(cpd.variable_card)) evidence_dict = ut.all_dict_combinations(event_space_combos) # Query about name of annotation k given different event space params def pretty_evidence(evidence): return [key + '=' + str(semtype2_nice[var2_cpd[key].semtype][val]) for key, val in evidence.items()] def print_factor(factor): row_cards = factor.cardinality row_vars = factor.variables values = factor.values.reshape(np.prod(row_cards), 1).flatten() # col_cards = 1 # col_vars = [''] basis_lists = list(zip(*list(ut.iprod(*[range(c) for c in row_cards])))) nice_basis_lists = [] for varname, basis in zip(row_vars, basis_lists): cpd = var2_cpd[varname] _nice_basis = ut.take(semtype2_nice[cpd.semtype], basis) nice_basis = ['%s=%s' % (varname, val) for val in _nice_basis] nice_basis_lists.append(nice_basis) row_lbls = [', '.join(sorted(x)) for x in zip(*nice_basis_lists)] print(ut.repr3(dict(zip(row_lbls, values)), precision=3, align=True, key_order_metric='-val')) # name_belief = BeliefPropagation(name_model) name_belief = VariableElimination(name_model) import pgmpy import six # NOQA def try_query(evidence): print('--------') query_vars = ut.setdiff_ordered(varnames, list(evidence.keys())) evidence_str = ', '.join(pretty_evidence(evidence)) probs = name_belief.query(query_vars, evidence) factor_list = probs.values() joint_factor = pgmpy.factors.factor_product(*factor_list) print('P(' + ', '.join(query_vars) + ' | ' + evidence_str + ')') # print(six.text_type(joint_factor)) factor = joint_factor # NOQA # print_factor(factor) # import utool as ut print(ut.hz_str([(f._str(phi_or_p='phi')) for f in factor_list])) for evidence in evidence_dict: try_query(evidence) evidence = {'Aij': 1, 'Ajk': 1, 'Aki': 1, 'Ni': 0} try_query(evidence) evidence = {'Aij': 0, 'Ajk': 0, 'Aki': 0, 'Ni': 0} try_query(evidence) globals()['score_nice'] = score_nice globals()['name_nice'] = name_nice globals()['score_basis'] = score_basis globals()['nid_basis'] = nid_basis print('Independencies') print(name_model.get_independencies()) print(name_model.local_independencies([Ni.variable])) # name_belief = BeliefPropagation(name_model) # # name_belief = VariableElimination(name_model) # for case in special_cases: # test_data = case.drop('Lk', axis=1) # test_data = test_data.reset_index(drop=True) # print('----') # for i in range(test_data.shape[0]): # evidence = test_data.loc[i].to_dict() # probs = name_belief.query(['Lk'], evidence) # factor = probs['Lk'] # probs = factor.values # evidence_ = evidence.copy() # evidence_['Li'] = name_nice[evidence['Li']] # evidence_['Lj'] = name_nice[evidence['Lj']] # evidence_['Sij'] = score_nice[evidence['Sij']] # evidence_['Sjk'] = score_nice[evidence['Sjk']] # nice2_prob = ut.odict(zip(name_nice, probs.tolist())) # ut.print_python_code('P(Lk | {evidence}) = {cpt}'.format( # evidence=(ut.repr2(evidence_, explicit=True, nobraces=True, strvals=True)), # cpt=ut.repr3(nice2_prob, precision=3, align=True, key_order_metric='-val') # )) # for case in special_cases: # test_data = case.drop('Lk', axis=1) # test_data = test_data.drop('Lj', axis=1) # test_data = test_data.reset_index(drop=True) # print('----') # for i in range(test_data.shape[0]): # evidence = test_data.loc[i].to_dict() # query_vars = ['Lk', 'Lj'] # probs = name_belief.query(query_vars, evidence) # for queryvar in query_vars: # factor = probs[queryvar] # print(factor._str('phi')) # probs = factor.values # evidence_ = evidence.copy() # evidence_['Li'] = name_nice[evidence['Li']] # evidence_['Sij'] = score_nice[evidence['Sij']] # evidence_['Sjk'] = score_nice[evidence['Sjk']] # nice2_prob = ut.odict(zip([queryvar + '=' + x for x in name_nice], probs.tolist())) # ut.print_python_code('P({queryvar} | {evidence}) = {cpt}'.format( # query_var=query_var, # evidence=(ut.repr2(evidence_, explicit=True, nobraces=True, strvals=True)), # cpt=ut.repr3(nice2_prob, precision=3, align=True, key_order_metric='-val') # )) # _ draw model import plottool as pt import networkx as netx fig = pt.figure() # NOQA fig.clf() ax = pt.gca() netx_nodes = [(node, {}) for node in name_model.nodes()] netx_edges = [(etup[0], etup[1], {}) for etup in name_model.edges()] netx_graph = netx.DiGraph() netx_graph.add_nodes_from(netx_nodes) netx_graph.add_edges_from(netx_edges) # pos = netx.graphviz_layout(netx_graph) pos = netx.pydot_layout(netx_graph, prog='dot') netx.draw(netx_graph, pos=pos, ax=ax, with_labels=True) pt.plt.savefig('foo.png') ut.startfile('foo.png')
variable_card=2, values=[[0.65, 0.3], [0.35, 0.7]], evidence=['Cancer'], evidence_card=[2]) # Associating the parameters with the model structure. cancer_model.add_cpds(cpd_poll, cpd_smoke, cpd_cancer, cpd_xray, cpd_dysp) # Checking if the cpds are valid for the model. print(cancer_model.check_model()) # Check d-separations. This is only meant for those interested. You do not need to understand this to do the project. print(cancer_model.is_active_trail('Pollution', 'Smoker')) print(cancer_model.is_active_trail('Pollution', 'Smoker', observed=['Cancer'])) print(cancer_model.local_independencies('Xray')) print(cancer_model.get_independencies()) # Print model information print(cancer_model.edges()) print(cancer_model.nodes()) print(cancer_model.get_cpds()) # Doing exact inference using Variable Elimination from pgmpy.inference import VariableElimination cancer_infer = VariableElimination(cancer_model) # Query print( cancer_infer.query(variables=['Dyspnoea'], joint=False, evidence={'Cancer': 0})['Dyspnoea'])
heartDisease = heartDisease.replace('?', np.nan) heartDisease.dtypes print(heartDisease.dtypes) heartDisease.columns from pgmpy.models import BayesianModel from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator model = BayesianModel([('age', 'trestbps'), ('age', 'fbs'), ('sex', 'trestbps'), ('sex', 'trestbps'), ('exang', 'trestbps'), ('trestbps', 'heartdisease'), ('fbs', 'heartdisease'), ('heartdisease', 'restecg'), ('heartdisease', 'thalach'), ('heartdisease', 'chol')]) # Learning CPDs using Maximum Likelihood Estimators model.fit(heartDisease, estimator=MaximumLikelihoodEstimator) print(model.get_cpds('age')) print(model.get_cpds('chol')) print(model.get_cpds('sex')) model.get_independencies() # Doing exact inference using Variable Elimination from pgmpy.inference import VariableElimination HeartDisease_infer = VariableElimination(model) # Computing the probability of bronc given smoke. q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'age': 28}) print(q['heartdisease']) q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'chol': 100}) print(q['heartdisease'])
cpd_dysp = TabularCPD(variable='Dyspnoea', variable_card=2, values=[[0.65, 0.3], [0.35, 0.7]], evidence=['Cancer'], evidence_card=[2]) # Associating the parameters with the model structure. cancer_model.add_cpds(cpd_poll, cpd_smoke, cpd_cancer, cpd_xray, cpd_dysp) print('Model generated by adding conditional probability disttributions(cpds)') # Checking if the cpds are valid for the model. print('Checking for Correctness of model : ', end='') print(cancer_model.check_model()) print('All local idependencies are as follows') cancer_model.get_independencies() print('Displaying CPDs') print(cancer_model.get_cpds('Pollution')) print(cancer_model.get_cpds('Smoker')) print(cancer_model.get_cpds('Cancer')) print(cancer_model.get_cpds('Xray')) print(cancer_model.get_cpds('Dyspnoea')) ##Inferencing with Bayesian Network # Computing the probability of Cancer given smoke. cancer_infer = VariableElimination(cancer_model) print('\nInferencing with Bayesian Network')