def load_total_cpds():
    # All the nodes in the graph (157 nodes)
    gnodes = total_G.nodes
    data = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, len(gnodes))),
                        columns=gnodes)

    estimator = BayesianEstimator(total_G, data)

    # Option 1 of fitting cpds: estimate every CPD in a single call
    p = estimator.get_parameters(prior_type='BDeu', equivalent_sample_size=5)
    for i, cpd in enumerate(p):
        total_G.add_cpds(cpd)

    # Option 2 of fitting cpds: estimate node by node
    for i in range(1, num_sub_symptoms + 1):
        cpd_sub = estimator.estimate_cpd('sub_sympt_' + str(i), prior_type="BDeu")
        total_G.add_cpds(cpd_sub)
        if i <= num_symptoms:
            cpd_symp = estimator.estimate_cpd('sympt_' + str(i), prior_type="BDeu")
            total_G.add_cpds(cpd_symp)

    # this is the time cruncher
    for i in range(1, num_conditions + 1):
        cpd_cond = estimator.estimate_cpd('cond_' + str(i), prior_type="BDeu")
        total_G.add_cpds(cpd_cond)
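# The two fitting options above trade convenience for control:
# get_parameters() estimates every CPD in one pass, while per-node
# estimate_cpd() calls let you isolate the slow nodes (the 'cond_*' loop
# flagged as the time cruncher). A minimal, self-contained sketch of the
# same contrast on an invented two-symptom toy model (names here are
# illustrative assumptions, not the real 157-node graph):
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

toy_G = BayesianModel([('sympt_1', 'cond_1'), ('sympt_2', 'cond_1')])
toy_data = pd.DataFrame(np.random.randint(0, 2, size=(100, 3)),
                        columns=['sympt_1', 'sympt_2', 'cond_1'])
toy_est = BayesianEstimator(toy_G, toy_data)

# Option 1: all CPDs in a single call.
toy_G.add_cpds(*toy_est.get_parameters(prior_type='BDeu',
                                       equivalent_sample_size=5))
# Option 2: one node at a time (useful for timing individual nodes).
print(toy_est.estimate_cpd('cond_1', prior_type='BDeu'))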
def estimate_parameters(self):
    data = pd.DataFrame(data=self.learning_data)
    sample_size = len(self.learning_data)
    estimator = BayesianEstimator(self.pgmpy, data)
    # print('pgmpy nodes: ', self.pgmpy.nodes())
    # print(self.learning_data)
    # print(data)

    # NOTE: this dict of per-node pseudo-counts is immediately overwritten
    # by the flat list below; it is kept only for reference.
    pseudocount = {
        'BENS_0': [1, 2], 'BENS_1': [1, 2], 'BENS_2': [1, 2], 'BENS_3': [1, 2],
        'WORLD_0': [1, 2], 'WORLD_1': [1, 2], 'WORLD_2': [1, 2]
    }
    pseudocount = [0.9, 0.9]
    if 'BENS_1' not in self.pgmpy.nodes() \
            or 'BENS_2' not in self.pgmpy.nodes() \
            or 'BENS_3' not in self.pgmpy.nodes():
        pseudocount = [0.9, 0.9, 0.9]
    # print('pseudocount :', pseudocount)

    for i, node in enumerate(self.nodes):
        if 'LAN' in node[0] or 'MOTOR' in node[0] or 'WORLD' in node[0]:
            # print('cardinality of node ', node[0], ' : ', self.pgmpy.get_cardinality(node[0]))
            # A dirichlet prior was tried before settling on BDeu:
            # self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd(
            #     node[0], prior_type='dirichlet', pseudo_counts=pseudocount).values
            self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd(
                node[0], prior_type='BDeu',
                equivalent_sample_size=sample_size).values
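# With a BDeu prior, equivalent_sample_size sets how strongly the uniform
# prior is weighted against the data; choosing it equal to len(data), as
# estimate_parameters() does above, pulls the estimates roughly halfway
# toward uniform. A hedged sketch of that effect on a toy one-node model
# (the node name is borrowed from the snippet, the data is invented):
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

demo = pd.DataFrame({'WORLD_0': [0] * 9 + [1]})  # a 9:1 split, n = 10
m = BayesianModel()
m.add_node('WORLD_0')
est = BayesianEstimator(m, demo)

# Small ESS: estimates stay close to the empirical 0.9 / 0.1.
print(est.estimate_cpd('WORLD_0', prior_type='BDeu',
                       equivalent_sample_size=1).values)   # ~[0.86, 0.14]
# ESS equal to the sample size: halfway toward uniform.
print(est.estimate_cpd('WORLD_0', prior_type='BDeu',
                       equivalent_sample_size=10).values)  # [0.7, 0.3]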
def distribution(excel_rows, item_name, items, file_name):
    # Using a 95% confidence interval: (1 - 0.95) / 2 in each tail
    Z_score = abs(st.norm.ppf(0.025))
    alpha = 1 - 0.95
    data_files = {}

    # create a dataframe per item and estimate the Category CPD from it
    for item in items:
        if item_name == "Monkey":
            df = (monkey_df[(monkey_df.Monkey == item)])
        elif item_name == "gender":
            df = (gender_df[(gender_df.gender == item)])
        z = BayesianEstimator(model, df)
        cat_cpd = z.estimate_cpd('Category', prior_type="bdeu",
                                 equivalent_sample_size=0)  # .to_factor()
        for condition in conditions:
            for category in categories:
                try:
                    count = list(
                        z.state_counts('Category')
                        [condition].to_dict().values())[0][category]
                    # count = z.state_counts('Category')[condition][category][category]
                    prob = cat_cpd.get_value(**{
                        'Condition': condition,
                        'Category': category
                    })
                    # p_hat and q_hat set to the conservative 0.5 each,
                    # since we have no previous data.
                    # Since it is a probability, the lower bound is clipped at 0.
                    lower_ci = max(
                        prob - Z_score * math.sqrt((0.5 * 0.5) / df.shape[0]), 0)
                    upper_ci = prob + Z_score * math.sqrt(
                        (0.5 * 0.5) / df.shape[0])
                    if not isNaN(prob) and prob > 0:
                        excel_rows.append([
                            item, condition, category, count, prob,
                            lower_ci, upper_ci, alpha
                        ])
                    else:
                        pass
                        # excel_rows.append([item, left, right, cat, count, prob, 0, 0, 0])
                except KeyError:
                    pass
                    # excel_rows.append([item, left, right, cat, count, 0, 0, 0, 0])

    prob_df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    writer = pd.ExcelWriter(file_name + ".xlsx")
    prob_df.to_excel(writer, sheet_name='Distribution')
    prob_df.sort_values('Probability', ascending=True).drop_duplicates(
        [item_name]).to_excel(writer, sheet_name='preference')
    writer.save()
    return prob_df
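# The interval computed above is the conservative Wald interval: with
# p_hat * q_hat replaced by its maximum value 0.25, the half-width reduces
# to z * sqrt(0.25 / n). A standalone check of that arithmetic (the sample
# size and point estimate are made up for illustration):
import math
import scipy.stats as st

z = abs(st.norm.ppf(0.025))          # ~1.96 for 95% coverage
n, prob = 50, 0.30                   # hypothetical n and estimated probability
half_width = z * math.sqrt(0.25 / n)
print(max(prob - half_width, 0.0), prob + half_width)  # lower bound clipped at 0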
def estimate_parameters(self, log=True):
    '''
    (5) Estimates the parameters of the found network
    '''
    estimator = BayesianEstimator(self.best_model, self.data)
    self.file_writer.write_txt("Number of nodes: " + str(len(self.variables_names)))
    self.file_writer.write_txt("Complete list: " + str(self.variables_names))
    for node in self.best_model.nodes():
        cpd = estimator.estimate_cpd(node, prior_type='K2')
        self.best_model.add_cpds(cpd)
        self.log(cpd, log)
        self.file_writer.write_txt(cpd.__str__())
def learn(self, file1, file2):
    f1 = open(file1, encoding="utf8")
    lines = f1.readlines()
    edges = self.getegdes(lines[0])
    data = pd.read_csv(file2)

    G = nx.DiGraph()
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])

    est = HillClimbSearch(data, scoring_method=BicScore(data))
    model = est.estimate()
    G_ = nx.DiGraph()
    G_.add_edges_from(model.edges())
    # Merge learned edges into G, skipping any edge that would close a cycle.
    for i, j in G_.edges():
        if i not in G.nodes() or j not in G.nodes():
            G.add_edge(i, j)
        elif not nx.has_path(G, j, i):
            G.add_edge(i, j)

    new_model = BayesianModel()
    new_model.add_edges_from(G.edges)
    G = new_model.copy()

    # (Unused experiment: derive a node ordering from path constraints.)
    # N = G.number_of_nodes()
    # B = np.zeros((N*(N-1)//2, N))
    # i = 0
    # y = []
    # k = 0
    # nodes = list(G.nodes._nodes.keys())
    # for i in range(len(nodes)):
    #     for j in range(i+1, len(nodes)):
    #         if nx.has_path(G, nodes[i], nodes[j]):
    #             y.append(1)
    #             B[k, i] = 1
    #             B[k, j] = -1
    #         elif nx.has_path(G, nodes[j], nodes[i]):
    #             y.append(-1)
    #             B[k, i] = 1
    #             B[k, j] = -1
    #         else:
    #             y.append(0)
    #         k += 1
    #
    # W = np.eye(N, N)
    # est = HillClimbSearch(data, scoring_method=BicScore(data))
    # model = est.estimate()
    # G_ = nx.DiGraph()
    # G_.add_edges_from(model.edges())
    # queue = []
    # for node in G_.nodes():
    #     if G_.in_degree(node) == 0:
    #         queue.append(node)
    #         G.node[node]['s'] = N
    #     else:
    #         G.node[node]['s'] = N//2
    # while len(queue) > 0:
    #     now = queue[0]
    #     l = list(G_._succ[now].keys())
    #     for i in l:
    #         G.node[i]['s'] = G.node[now]['s'] - 1
    #     queue += l
    #     queue.pop(0)
    #
    # phai = []
    # for node in G.nodes():
    #     phai.append(G.node[node]['s'])
    # miu1 = np.dot(np.transpose(B), B)
    # miu1 = np.linalg.pinv(miu1)
    # miu2 = np.dot(np.transpose(B), y)
    # miu2 = miu2 + phai
    # miu = np.dot(miu1, miu2)
    #
    # seq = miu.tolist()
    # seq = list(zip(seq, nodes))
    # seq = sorted(seq, key=lambda s: s[0])
    # seq = [x[1] for x in seq]
    # nx.draw(G)
    # plt.show()

    estimator = BayesianEstimator(G, data)
    edges = []
    for i in G.edges:
        edges.append(str(i))
    print(edges)
    # Print one line per node: name, cardinality, CPT, parents, parent cardinalities.
    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        # CPT = cpd.values
        sequence = cpd.variables[1::]
        card = []
        for x in sequence:
            s = len(dict(data[x].value_counts()))
            card.append(s)
        output = nodeName + '\t' + str(valueNum) + '\t' + str(
            CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
        print(output)
class Bayes_Net(Core):
    """
    Methods to read in data and learn the structure and conditional
    probability tables for a Bayesian Network, as well as assessing the
    strength of the causal influence of endogenous variables on the target
    variable of interest.

    Parameters
    ----------
    target_variable: str, name of the column containing the outcome variable.

    verbose: bool, optional (default = False). Determines whether the user
        will get verbose status updates.

    random_seed: int, optional.

    Attributes
    ----------
    verbose: boolean
        Whether verbose mode is activated

    target_variable: string
        Name of the target variable in the dataset

    df: pd.DataFrame
        pandas dataframe of input dataset

    structure_algorithm: string
        Name of the structure-learning algorithm that was chosen

    structure_model: pgmpy.base.DAG.DAG
        Learned DAG, but without conditional probability tables estimated

    bn_model: pgmpy.models.BayesianModel
        Proper, learned Bayesian Network with conditional probability
        tables estimated

    odds_ratios: pd.DataFrame
        DataFrame containing odds ratios for all interventions and levels

    Methods
    ----------
    read_data: (self, file_path, **kwargs)
        Reads in a dataset. Essentially a wrapper for pandas' `read_csv`
        function.

    learn_structure: (self, file_path, algorithm = 'hc')
        Learns the structure of a DAG from data. Saves structure as a CSV
        to disk. Note: this is technically not a Bayesian network yet, as
        the conditional probability tables have not been estimated.

    plot_network: (self, file_path, **kwargs)
        Plots the Bayesian Network (highlighting the target variable) and
        saves a PNG to disk.

    plot_causal_influence: (self, file_path)
        Uses belief propagation to perform inference and calculates odds
        ratios for how changes in intervention evidence will impact the
        target variable. A forest plot is produced from this.
    """

    def __init__(self, target_variable, random_seed=0, verbose=False):
        self.verbose = verbose
        self.target_variable = target_variable
        self.random_seed = random_seed

        # Validate the params
        self._validate_init_params()

        if self.verbose:
            print("Using the following params for Bayesian Network model:")
            pprint(self.get_params(), indent=4)

    def _validate_init_params(self):
        """
        Very basic checks that the params used when instantiating Bayes_Net
        look okay.
        """
        # Checks for target_variable
        if not isinstance(self.target_variable, str):
            raise TypeError(
                f"target_variable parameter must be a string type, but found type {type(self.target_variable)}"
            )

        # Checks for verbose
        if not isinstance(self.verbose, bool):
            raise TypeError(
                f"verbose parameter must be a boolean type, but found type {type(self.verbose)}"
            )

        # Checks for random_seed
        if not isinstance(self.random_seed, (int, type(None))):
            raise TypeError(
                f"random_seed parameter must be an int, but found type {type(self.random_seed)}"
            )
        if (isinstance(self.random_seed, int)) and self.random_seed < 0:
            raise ValueError("random_seed parameter must be >= 0")

    def read_data(self, file_path, **kwargs):
        """
        Wrapper for pandas `read_csv` function. Assumes the file is a CSV
        with a header row.

        Arguments:
            file_path: str, the absolute file path to the CSV file
            **kwargs: any additional keywords for pandas' `read_csv` function

        Returns:
            None
        """
        self.df = pd.read_csv(filepath_or_buffer=file_path, **kwargs)

        # Check that the target variable is in the dataset
        if self.target_variable not in self.df:
            raise ValueError(
                "The target variable you specified isn't in the dataset!")

        if self.verbose:
            print("Successfully read in CSV")

        return None

    def _cramers_v(self, x, y):
        """
        Calculates Cramer's V correlation between two categorical variables.
        """
        confusion_matrix = pd.crosstab(x, y)
        chi2 = ss.chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        phi2 = chi2 / n
        r, k = confusion_matrix.shape
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)
        return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

    def _initial_filter(self):
        """
        Filters out nodes with zero correlation with the target variable.
        """
        relevant_vars = []
        for node in self.df.columns:
            if self._cramers_v(self.df[self.target_variable], self.df[node]) > 0:
                relevant_vars.append(node)
        return self.df[relevant_vars]

    def learn_structure(self, file_path, algorithm="hc", significance_level=0.05):
        """
        Employs the `pgmpy` package's Bayesian Network structure learning
        algorithms to learn structure from a dataset. Saves a tabular
        version of the result as a CSV file.

        Arguments:
            algorithm: str, optional (default = 'hc')
                Determines whether the hill-climbing or Peter-Clark (PC)
                algorithm is employed. Two possible values: 'hc', 'pc'.
                Note: I found a bug in the pgmpy implementation halfway
                through this project; don't use the 'pc' method.
            file_path: str, the absolute path to save the file to
                (e.g. "~/Desktop/BN_structure.csv")
            significance_level: float, optional (default = 0.05)
                Statistical significance cutoff for use in pruning the
                network when using the PC algorithm. Lower values produce
                sparser networks.

        Returns:
            None
        """
        self.structure_algorithm = algorithm

        if self.verbose:
            print(
                "Depending on the number of variables in your dataset, this might take some time..."
            )

        # Learn structure, using one of the algorithms
        np.random.seed(self.random_seed)

        if algorithm == "hc":
            # Filter out columns with zero correlation with target variable
            self.filtered_df = self._initial_filter()

            # Run HC algorithm
            self.structure_model = HillClimbSearch(
                self.filtered_df,
                scoring_method=BicScore(self.filtered_df)).estimate()

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            # Eliminate isolated subgraphs
            G = self.structure_model.to_undirected()
            connected_nodes = list(
                nx.algorithms.components.node_connected_component(
                    G, self.target_variable))
            disconnected_nodes = list(
                set(list(self.structure_model.nodes)) - set(connected_nodes))

            for node in disconnected_nodes:
                self.structure_model.remove_node(node)
                self.filtered_df.drop([node], axis=1, inplace=True)

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

        elif algorithm == "pc":
            self.filtered_df = self.df
            self.structure_model = ConstraintBasedEstimator(
                self.filtered_df).estimate(
                    significance_level=significance_level)

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

    def plot_network(self, file_path, **kwargs):
        """
        Plots the learned structure, highlighting the target variable.

        Arguments:
            file_path: str, the absolute path to save the file to
                (e.g. "~/Desktop/plot.png")
            **kwargs: additional keyword arguments for networkx's draw function

        Returns:
            None
        """
        if self.verbose:
            print(
                f"Saving Bayesian Network plot to the following PNG file: {file_path}"
            )

        # Identify the target variable so we can highlight it in the plot
        target_index = list(self.structure_model).index(self.target_variable)
        node_size_list = [300] * len(list(self.structure_model.nodes))
        node_color_list = ["#95ABDF"] * len(list(self.structure_model.nodes))
        node_size_list[target_index] = 1500
        node_color_list[target_index] = "#F09A9A"

        # Clear any existing pyplot fig, create plot, and save to disk
        plt.clf()
        nx.draw(
            self.structure_model,
            node_size=node_size_list,
            node_color=node_color_list,
            with_labels=True,
            **kwargs,
        )
        plt.savefig(expanduser(file_path), format="PNG", dpi=300)

    def _estimate_CPT(self):
        """
        Estimates the conditional probability tables associated with each
        node in the Bayesian Network.
        """
        self.bn_model = BayesianModel(list(self.structure_model.edges))
        self.cpt_model = BayesianEstimator(self.bn_model, self.filtered_df)

        for node in list(self.bn_model.nodes):
            self.bn_model.add_cpds(self.cpt_model.estimate_cpd(node))

    def plot_causal_influence(self, file_path):
        """
        Computes the odds of the target variable being value 1 over value 0
        (i.e. the odds ratio) by iterating through all other network
        variables/nodes, changing their values, and observing how the
        probability of the target variable changes. Belief propagation is
        used for inference. A forest plot is produced from this and saved
        to disk.

        Arguments:
            file_path: str, the absolute path to save the file to
                (e.g. "~/Desktop/forest_plot.png")

        Returns:
            None
        """
        # Estimate CPTs
        self._estimate_CPT()

        if self.verbose:
            print("Calculating influence of all nodes on target node")

        if not self.bn_model.check_model():
            print("""
                There is a problem with your network structure. You have
                disconnected nodes or separated sub-networks. Please examine
                your network plot and re-learn your network structure with
                tweaked settings.
                """)
            return None

        if self.target_variable not in self.bn_model.nodes:
            print("""
                Your target variable has no parent nodes! Can't perform
                inference! Please examine your network plot and re-learn
                your network structure with tweaked settings.
                """)
            return None

        # Prep for belief propagation
        belief_propagation = BeliefPropagation(self.bn_model)
        belief_propagation.calibrate()

        # Iterate over all intervention nodes and values, calculating odds
        # ratios w.r.t. the target variable.
        overall_dict = {}

        # Bug fix: the original subtracted set(list(self.target_variable)),
        # which removes the target's *characters*; use a one-element set.
        variables_to_test = list(
            set(self.bn_model.nodes) - {self.target_variable})

        for node in variables_to_test:
            results = []
            for value in self.filtered_df[node].unique():
                prob = belief_propagation.query(
                    variables=[self.target_variable],
                    evidence={node: value},
                    show_progress=False,
                ).values
                results.append([node, value, prob[0], prob[1]])

            results_df = pd.DataFrame(
                results,
                columns=["node", "value", "probability_0", "probability_1"])
            results_df["odds_1"] = (results_df["probability_1"] /
                                    results_df["probability_0"])
            results_df = results_df.sort_values(
                "value", ascending=True, inplace=False).reset_index(drop=True)

            overall_dict[node] = results_df

        final_df_list = []
        for node, temp_df in overall_dict.items():
            first_value = temp_df["odds_1"].iloc[0]
            temp_df["odds_ratio"] = (temp_df["odds_1"] / first_value).round(3)
            final_df_list.append(temp_df)

        final_df = pd.concat(final_df_list)[["node", "value", "odds_ratio"]]
        self.odds_ratios = final_df

        if self.verbose:
            print(f"Saving forest plot to the following PNG file: {file_path}")

        # Clean up the dataframe of odds ratios so the plot can have nice labels
        final_df2 = (pd.concat([
            final_df,
            final_df.groupby("node")["value"].apply(
                lambda x: x.shift(-1).iloc[-1]).reset_index(),
        ]).sort_values(by=["node", "value"],
                       ascending=True).reset_index(drop=True))

        final_df2.loc[final_df2["value"].isnull(), "node"] = np.nan
        final_df2["value"] = final_df2["value"].astype("Int32").astype(str)
        final_df2["value"].replace({np.nan: ""}, inplace=True)

        final_df3 = final_df2.reset_index(drop=True).reset_index()
        final_df3.rename(columns={"index": "vertical_index"}, inplace=True)
        final_df3["y_label"] = final_df3["node"] + " = " + final_df3["value"]
        final_df3.loc[final_df3["odds_ratio"] == 1.0, "y_label"] = (
            final_df3["y_label"] + " (ref)")
        final_df3["y_label"].fillna("", inplace=True)

        # Produce large plot
        plt.clf()
        plt.title(
            "Strength of Associations Between Interventions and Target Variable"
        )
        plt.scatter(
            x=final_df3["odds_ratio"],
            y=final_df3["vertical_index"],
            s=70,
            color="b",
            alpha=0.5,
        )
        plt.xlabel("Odds Ratio")
        plt.axvline(x=1.0, color="red", linewidth=1.5, linestyle="--")
        plt.yticks(final_df3["vertical_index"], final_df3["y_label"])

        for _, row in final_df3.iterrows():
            if not np.isnan(row["odds_ratio"]):
                plt.plot(
                    [0, row["odds_ratio"]],
                    [row["vertical_index"], row["vertical_index"]],
                    color="black",
                    linewidth=0.4,
                )

        plt.xlim([0, final_df3["odds_ratio"].max() + 1])
        figure = plt.gcf()
        figure.set_size_inches(12, 7)
        plt.savefig(expanduser(file_path),
                    bbox_inches="tight",
                    format="PNG",
                    dpi=300)
class TestBayesianEstimator(unittest.TestCase):
    def setUp(self):
        self.m1 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.d1 = pd.DataFrame(data={'A': [0, 0, 1],
                                     'B': [0, 1, 0],
                                     'C': [1, 1, 0]})
        self.d2 = pd.DataFrame(data={'A': [0, 0, 1, 0, 2, 0, 2, 1, 0, 2],
                                     'B': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y'],
                                     'C': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]})
        self.est1 = BayesianEstimator(self.m1, self.d1)
        self.est2 = BayesianEstimator(
            self.m1, self.d1,
            state_names={'A': [0, 1, 2], 'B': [0, 1], 'C': [0, 1, 23]})
        self.est3 = BayesianEstimator(self.m1, self.d2)

    def test_estimate_cpd_dirichlet(self):
        cpd_A = self.est1.estimate_cpd('A', prior_type="dirichlet",
                                       pseudo_counts=[0, 1])
        self.assertEqual(cpd_A, TabularCPD('A', 2, [[0.5], [0.5]]))

        cpd_B = self.est1.estimate_cpd('B', prior_type="dirichlet",
                                       pseudo_counts=[9, 3])
        self.assertEqual(cpd_B, TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

        cpd_C = self.est1.estimate_cpd('C', prior_type="dirichlet",
                                       pseudo_counts=[0.4, 0.6])
        self.assertEqual(cpd_C,
                         TabularCPD('C', 2,
                                    [[0.2, 0.2, 0.7, 0.4],
                                     [0.8, 0.8, 0.3, 0.6]],
                                    evidence=['A', 'B'], evidence_card=[2, 2]))

    def test_estimate_cpd_improper_prior(self):
        cpd_C = self.est1.estimate_cpd('C', prior_type="dirichlet",
                                       pseudo_counts=[0, 0])
        cpd_C_correct = TabularCPD('C', 2,
                                   [[0.0, 0.0, 1.0, np.NaN],
                                    [1.0, 1.0, 0.0, np.NaN]],
                                   evidence=['A', 'B'], evidence_card=[2, 2],
                                   state_names={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
        # manual comparison because np.NaN != np.NaN
        self.assertTrue(((cpd_C.values == cpd_C_correct.values) |
                         np.isnan(cpd_C.values) &
                         np.isnan(cpd_C_correct.values)).all())

    def test_estimate_cpd_shortcuts(self):
        cpd_C1 = self.est2.estimate_cpd('C', prior_type='BDeu',
                                        equivalent_sample_size=9)
        cpd_C1_correct = TabularCPD('C', 3,
                                    [[0.2, 0.2, 0.6, 1. / 3, 1. / 3, 1. / 3],
                                     [0.6, 0.6, 0.2, 1. / 3, 1. / 3, 1. / 3],
                                     [0.2, 0.2, 0.2, 1. / 3, 1. / 3, 1. / 3]],
                                    evidence=['A', 'B'], evidence_card=[3, 2])
        self.assertEqual(cpd_C1, cpd_C1_correct)

        cpd_C2 = self.est3.estimate_cpd('C', prior_type='K2')
        cpd_C2_correct = TabularCPD('C', 2,
                                    [[0.5, 0.6, 1. / 3, 2. / 3, 0.75, 2. / 3],
                                     [0.5, 0.4, 2. / 3, 1. / 3, 0.25, 1. / 3]],
                                    evidence=['A', 'B'], evidence_card=[3, 2])
        self.assertEqual(cpd_C2, cpd_C2_correct)

    def test_get_parameters(self):
        cpds = set([self.est3.estimate_cpd('A'),
                    self.est3.estimate_cpd('B'),
                    self.est3.estimate_cpd('C')])
        self.assertSetEqual(set(self.est3.get_parameters()), cpds)

    def test_get_parameters2(self):
        pseudo_counts = {'A': [1, 2, 3], 'B': [4, 5], 'C': [6, 7]}
        cpds = set([self.est3.estimate_cpd('A', prior_type="dirichlet",
                                           pseudo_counts=pseudo_counts['A']),
                    self.est3.estimate_cpd('B', prior_type="dirichlet",
                                           pseudo_counts=pseudo_counts['B']),
                    self.est3.estimate_cpd('C', prior_type="dirichlet",
                                           pseudo_counts=pseudo_counts['C'])])
        self.assertSetEqual(
            set(self.est3.get_parameters(prior_type="dirichlet",
                                         pseudo_counts=pseudo_counts)),
            cpds)

    def tearDown(self):
        del self.m1
        del self.d1
        del self.d2
        del self.est1
        del self.est2
        del self.est3  # fix: est3 was created in setUp but never deleted
from pgmpy.estimators import MaximumLikelihoodEstimator

mle = MaximumLikelihoodEstimator(model, data)
print(mle.estimate_cpd("fruit"))  # unconditional
print("----------------------")
print(mle.estimate_cpd("tasty"))  # conditional
print("----------------------")

# Calibrate all CPDs of `model` using MLE:
model.fit(data, estimator=MaximumLikelihoodEstimator)

# Bayesian Parameter Estimation
print("----- Bayesian Parameter Estimation -----")
from pgmpy.estimators import BayesianEstimator

est = BayesianEstimator(model, data)
print(est.estimate_cpd("tasty", prior_type="BDeu", equivalent_sample_size=10))
print("----------------------")

# BayesianEstimator, too, can be used via the fit() method. Full example:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# generate data
data = pd.DataFrame(np.random.randint(low=0, high=2, size=(5000, 4)),
                    columns=["A", "B", "C", "D"])
model = BayesianModel([("A", "B"), ("A", "C"), ("D", "C"), ("B", "D")])
model.fit(data, estimator=BayesianEstimator,
          prior_type="BDeu")  # default equivalent_sample_size=5
for cpd in model.get_cpds():
    print(cpd)
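# For intuition about what the BDeu prior used above does numerically: a
# node with cardinality r and q parent configurations receives a
# pseudo-count of equivalent_sample_size / (r * q) in every cell, and each
# CPD column is the normalized smoothed counts. A small self-contained
# check (toy single-node data, values chosen by hand):
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

df = pd.DataFrame({"A": [0, 0, 0, 1]})  # counts: 3 zeros, 1 one
m = BayesianModel()
m.add_node("A")
cpd = BayesianEstimator(m, df).estimate_cpd("A", prior_type="BDeu",
                                            equivalent_sample_size=10)
# r = 2, q = 1, so each state gets a pseudo-count of 10 / 2 = 5:
print(cpd.values)                          # -> [0.5714..., 0.4285...]
print((np.array([3, 1]) + 5) / (4 + 10))   # the same numbers by hand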
class TestBayesianEstimator(unittest.TestCase):
    def setUp(self):
        self.m1 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.d1 = pd.DataFrame(data={
            'A': [0, 0, 1],
            'B': [0, 1, 0],
            'C': [1, 1, 0]
        })
        self.d2 = pd.DataFrame(
            data={
                'A': [0, 0, 1, 0, 2, 0, 2, 1, 0, 2],
                'B': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y'],
                'C': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
            })
        self.est1 = BayesianEstimator(self.m1, self.d1)
        self.est2 = BayesianEstimator(self.m1, self.d1,
                                      state_names={
                                          'A': [0, 1, 2],
                                          'B': [0, 1],
                                          'C': [0, 1, 23]
                                      })
        self.est3 = BayesianEstimator(self.m1, self.d2)

    def test_estimate_cpd_dirichlet(self):
        cpd_A = self.est1.estimate_cpd('A', prior_type="dirichlet",
                                       pseudo_counts=[[0], [1]])
        self.assertEqual(cpd_A, TabularCPD('A', 2, [[0.5], [0.5]]))

        cpd_B = self.est1.estimate_cpd('B', prior_type="dirichlet",
                                       pseudo_counts=[[9], [3]])
        self.assertEqual(cpd_B, TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

        cpd_C = self.est1.estimate_cpd('C', prior_type="dirichlet",
                                       pseudo_counts=[[0.4, 0.4, 0.4, 0.4],
                                                      [0.6, 0.6, 0.6, 0.6]])
        self.assertEqual(
            cpd_C,
            TabularCPD('C', 2, [[0.2, 0.2, 0.7, 0.4], [0.8, 0.8, 0.3, 0.6]],
                       evidence=['A', 'B'], evidence_card=[2, 2]))

    def test_estimate_cpd_improper_prior(self):
        cpd_C = self.est1.estimate_cpd('C', prior_type="dirichlet",
                                       pseudo_counts=[[0, 0, 0, 0],
                                                      [0, 0, 0, 0]])
        cpd_C_correct = TabularCPD(
            'C', 2, [[0.0, 0.0, 1.0, np.NaN], [1.0, 1.0, 0.0, np.NaN]],
            evidence=['A', 'B'], evidence_card=[2, 2],
            state_names={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]})
        # manual comparison because np.NaN != np.NaN
        self.assertTrue(
            ((cpd_C.values == cpd_C_correct.values) |
             np.isnan(cpd_C.values) & np.isnan(cpd_C_correct.values)).all())

    def test_estimate_cpd_shortcuts(self):
        cpd_C1 = self.est2.estimate_cpd('C', prior_type='BDeu',
                                        equivalent_sample_size=9)
        cpd_C1_correct = TabularCPD(
            'C', 3, [[0.2, 0.2, 0.6, 1. / 3, 1. / 3, 1. / 3],
                     [0.6, 0.6, 0.2, 1. / 3, 1. / 3, 1. / 3],
                     [0.2, 0.2, 0.2, 1. / 3, 1. / 3, 1. / 3]],
            evidence=['A', 'B'], evidence_card=[3, 2])
        self.assertEqual(cpd_C1, cpd_C1_correct)

        cpd_C2 = self.est3.estimate_cpd('C', prior_type='K2')
        cpd_C2_correct = TabularCPD(
            'C', 2, [[0.5, 0.6, 1. / 3, 2. / 3, 0.75, 2. / 3],
                     [0.5, 0.4, 2. / 3, 1. / 3, 0.25, 1. / 3]],
            evidence=['A', 'B'], evidence_card=[3, 2])
        self.assertEqual(cpd_C2, cpd_C2_correct)

    def test_get_parameters(self):
        cpds = set([
            self.est3.estimate_cpd('A'),
            self.est3.estimate_cpd('B'),
            self.est3.estimate_cpd('C')
        ])
        self.assertSetEqual(set(self.est3.get_parameters()), cpds)

    def test_get_parameters2(self):
        pseudo_counts = {
            'A': [[1], [2], [3]],
            'B': [[4], [5]],
            'C': [[6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7]]
        }
        cpds = set([
            self.est3.estimate_cpd('A', prior_type="dirichlet",
                                   pseudo_counts=pseudo_counts['A']),
            self.est3.estimate_cpd('B', prior_type="dirichlet",
                                   pseudo_counts=pseudo_counts['B']),
            self.est3.estimate_cpd('C', prior_type="dirichlet",
                                   pseudo_counts=pseudo_counts['C'])
        ])
        self.assertSetEqual(
            set(self.est3.get_parameters(prior_type="dirichlet",
                                         pseudo_counts=pseudo_counts)),
            cpds)

    def tearDown(self):
        del self.m1
        del self.d1
        del self.d2
        del self.est1
        del self.est2
        del self.est3  # fix: est3 was created in setUp but never deleted
    return links

links = CreateLinks(data_columns)
model = BayesianModel(links)

pe = ParameterEstimator(model, data)
# Print ParameterEstimator state counts, unconditional
pe_symptom1 = pe.state_counts('Symptom_1')
print(pe_symptom1)
# Print ParameterEstimator state counts, conditional (Disease)
pe_disease = pe.state_counts('Disease')
print(pe_disease)

mle = MaximumLikelihoodEstimator(model, data)
# Print MaximumLikelihoodEstimator unconditional
mle_symptom1 = mle.estimate_cpd('Symptom_1')
print(mle_symptom1)
# Print MaximumLikelihoodEstimator conditional
# mle_disease = mle.estimate_cpd('Disease')
# print(mle_disease)

# Calibrate all CPDs of `model` using MLE:
model.fit(data, estimator=MaximumLikelihoodEstimator)

est = BayesianEstimator(model, data)
est_disease = est.estimate_cpd('Disease', prior_type='BDeu',
                               equivalent_sample_size=10)
print(est_disease)
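# The relationship the two printouts above illustrate: the MLE CPD is just
# state_counts normalized column by column. A hedged sketch on a toy
# Disease -> Symptom_1 model (the real `links` come from CreateLinks,
# which is not shown here):
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import ParameterEstimator, MaximumLikelihoodEstimator

toy = pd.DataFrame({"Disease": [0, 0, 1, 1], "Symptom_1": [0, 1, 1, 1]})
m = BayesianModel([("Disease", "Symptom_1")])
counts = ParameterEstimator(m, toy).state_counts("Symptom_1")
print(counts / counts.sum(axis=0))  # column-normalized counts...
print(MaximumLikelihoodEstimator(m, toy).estimate_cpd("Symptom_1"))  # ...match the MLE CPD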
def task4():
    global andRawData, task4_best_bm
    k2Scores = []
    features = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
    andRawData_temp = pd.DataFrame(andRawData.values, columns=features)

    # Model 1: learned through HillClimbSearch
    est = HillClimbSearch(andRawData_temp,
                          scoring_method=K2Score(andRawData_temp))
    model_temp = est.estimate()
    estimator = BayesianEstimator(model_temp, andRawData_temp)
    for fx in features:
        cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
        model_temp.add_cpds(cpd_fx)
    task4_bms.append(model_temp)
    print(" Model 1: Model through HillClimbSearch is: "
          + str(model_temp.edges()))
    k2Score = K2Score(
        (BayesianModelSampling(model_temp)).forward_sample(size=1000))
    k2Scores_temp = k2Score.score(model_temp)
    k2Scores.append(k2Scores_temp)
    print(" Model 1: K2 score is " + str(k2Scores_temp))

    # Model 2: manual model based on HillClimbSearch
    model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'),
                                ('f1', 'f7'), ('f5', 'f3'), ('f9', 'f8'),
                                ('f1', 'f6'), ('f9', 'f1'), ('f9', 'f6'),
                                ('f9', 'f2')])
    estimator = BayesianEstimator(model_temp, andRawData_temp)
    for fx in features:
        cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
        model_temp.add_cpds(cpd_fx)
    task4_bms.append(model_temp)
    print(" Model 2: Manual Model based on HillClimbSearch is: "
          + str(model_temp.edges()))
    k2Score = K2Score(
        (BayesianModelSampling(model_temp)).forward_sample(size=1000))
    k2Scores_temp = k2Score.score(model_temp)
    k2Scores.append(k2Scores_temp)
    print(" Model 2: K2 score is " + str(k2Scores_temp))

    # Model 3: manual model based on HillClimbSearch
    model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'),
                                ('f5', 'f7'), ('f5', 'f3'), ('f9', 'f8'),
                                ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'),
                                ('f9', 'f2')])
    estimator = BayesianEstimator(model_temp, andRawData_temp)
    for fx in features:
        cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
        model_temp.add_cpds(cpd_fx)
    task4_bms.append(model_temp)
    print(" Model 3: Manual Model based on HillClimbSearch is: "
          + str(model_temp.edges()))
    k2Score = K2Score(
        (BayesianModelSampling(model_temp)).forward_sample(size=1000))
    k2Scores_temp = k2Score.score(model_temp)
    k2Scores.append(k2Scores_temp)
    print(" Model 3: K2 score is " + str(k2Scores_temp))

    # Model 4: manual model based on HillClimbSearch
    model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f5', 'f7'),
                                ('f5', 'f3'), ('f1', 'f2'), ('f9', 'f1'),
                                ('f9', 'f6'), ('f9', 'f8')])
    estimator = BayesianEstimator(model_temp, andRawData_temp)
    for fx in features:
        cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
        model_temp.add_cpds(cpd_fx)
    task4_bms.append(model_temp)
    print(" Model 4: Manual Model based on HillClimbSearch is: "
          + str(model_temp.edges()))
    k2Score = K2Score(
        (BayesianModelSampling(model_temp)).forward_sample(size=1000))
    k2Scores_temp = k2Score.score(model_temp)
    k2Scores.append(k2Scores_temp)
    print(" Model 4: K2 score is " + str(k2Scores_temp))

    # Model 5: manual model based on intuition
    model_temp = BayesianModel([('f3', 'f4'), ('f4', 'f9'), ('f4', 'f7'),
                                ('f1', 'f2'), ('f8', 'f5'), ('f9', 'f6'),
                                ('f9', 'f8')])
    estimator = BayesianEstimator(model_temp, andRawData_temp)
    for fx in features:
        cpd_fx = estimator.estimate_cpd(fx, prior_type="K2")
        model_temp.add_cpds(cpd_fx)
    task4_bms.append(model_temp)
    print(" Model 5: Manual Model based on Intuition is: "
          + str(model_temp.edges()))
    k2Score = K2Score(
        (BayesianModelSampling(model_temp)).forward_sample(size=1000))
    k2Scores_temp = k2Score.score(model_temp)
    k2Scores.append(k2Scores_temp)
    print(" Model 5: K2 score is " + str(k2Scores_temp))

    task4_best_bm = task4_bms[k2Scores.index(max(k2Scores))]
    print(" Best Bayesian Model with the highest K2 score is thus Model "
          + str(1 + k2Scores.index(max(k2Scores))))
print(best_model.edges())
# Re-reading the learned structure shows that the program returns the links
# but not their directions.
# The model with the correct edge directions would therefore be:
bon_model = BayesianModel([('Cancer', 'TbOuCa'), ('TbOuCa', 'Dyspnea'),
                           ('TbOuCa', 'Bronchite'), ('TbOuCa', 'Radiographie'),
                           ('Fumeur', 'Bronchite'), ('Radiographie', 'Dyspnea'),
                           ('Tuberculose', 'TbOuCa'), ('Bronchite', 'Dyspnea')])

# Parameter learning
# print("CPD estimation:")
from pgmpy.estimators import BayesianEstimator

est = BayesianEstimator(best_model, data)
print(est.estimate_cpd('Cancer', prior_type='BDeu', equivalent_sample_size=10))

best_model.fit(data, estimator=BayesianEstimator, prior_type='BDeu')
# for cpd in best_model.get_cpds():
#     print(cpd)

# Characteristics of people who have cancer
model_infer = VariableElimination(best_model)
q = model_infer.query(variables=[
    'Age', 'Fumeur', 'Tuberculose', 'VisiteAsie', 'Radiographie',
    'Bronchite', 'Dyspnea', 'Geographie', 'TbOuCa'
], evidence={'Cancer': 2})  # 0 = ?, 1 = False, 2 = True
print("Characteristics of people who have cancer:")
# print(q['Age'])
print(q['Fumeur'])
def main():
    # Fetching features data
    features_data = pd.read_csv(fileloc_features)
    features_data_f = features_data.add_prefix('f')
    features_data_g = features_data.add_prefix('g')

    featureNamesList = ["pen_pressure", "letter_spacing", "size", "dimension",
                        "is_lowercase", "is_continuous", "slantness", "tilt",
                        "entry_stroke_a", "staff_of_a", "formation_n",
                        "staff_of_d", "exit_stroke_d", "word_formation",
                        "constancy"]

    def prepare(file_loc):
        """Join the image-pair CSV with the f- and g-prefixed feature tables,
        drop the join keys, coerce to int, and return (DataFrame, ndarray).
        Replaces six copy-pasted blocks from the original; the operations are
        unchanged, except for a bug fix: the original called .replace(...)
        without assigning the result, so infinities were never removed."""
        pairs = pd.read_csv(file_loc, usecols=['left', 'right', 'label'])
        merged_f = pairs.merge(features_data_f, left_on='left',
                               right_on='fimagename')
        merged_g = pairs.merge(features_data_g, left_on='right',
                               right_on='gimagename')
        merged_f = merged_f.drop(['left', 'right', 'fimagename', 'label'], axis=1)
        merged_g = merged_g.drop(['left', 'right', 'gimagename', 'label'], axis=1)
        final = pd.concat([merged_f, merged_g, pairs.loc[:, 'label']], axis=1)
        final = final.replace([np.inf, -np.inf], np.nan)  # bug fix: assign result
        final.dropna(inplace=True)
        final = final.astype(int)
        return final, final.values

    seen_traindata_final, seen_traindata_final_NDArray = prepare(fileloc_seen_training)
    seen_validationdata_final, seen_validationdata_final_NDArray = prepare(fileloc_seen_validation)
    shuffled_traindata_final, shuffled_traindata_final_NDArray = prepare(fileloc_shuffled_training)
    shuffled_validationdata_final, shuffled_validationdata_final_NDArray = prepare(fileloc_shuffled_validation)
    unseen_traindata_final, unseen_traindata_final_NDArray = prepare(fileloc_unseen_training)
    unseen_validationdata_final, unseen_validationdata_final_NDArray = prepare(fileloc_unseen_validation)

    # Creating base models
    features_only_data = features_data[featureNamesList]
    initial_hcs = HillClimbSearch(features_only_data)
    initial_model = initial_hcs.estimate()
    # print(initial_model.edges())
    print("Hill Climb Done")

    # Hard-coded structure: the same intra-feature DAG is used for both
    # images of the pair (f- and g-prefixed copies), plus four edges into
    # the shared 'label' node.
    base_edges = [('pen_pressure', 'is_lowercase'), ('pen_pressure', 'letter_spacing'),
                  ('size', 'slantness'), ('size', 'pen_pressure'),
                  ('size', 'staff_of_d'), ('size', 'letter_spacing'),
                  ('size', 'exit_stroke_d'), ('size', 'entry_stroke_a'),
                  ('dimension', 'size'), ('dimension', 'is_continuous'),
                  ('dimension', 'slantness'), ('dimension', 'pen_pressure'),
                  ('is_lowercase', 'staff_of_a'), ('is_lowercase', 'exit_stroke_d'),
                  ('is_continuous', 'exit_stroke_d'), ('is_continuous', 'letter_spacing'),
                  ('is_continuous', 'entry_stroke_a'), ('is_continuous', 'staff_of_a'),
                  ('is_continuous', 'is_lowercase'), ('slantness', 'is_continuous'),
                  ('slantness', 'tilt'), ('entry_stroke_a', 'pen_pressure'),
                  ('formation_n', 'constancy'), ('formation_n', 'word_formation'),
                  ('formation_n', 'dimension'), ('formation_n', 'staff_of_d'),
                  ('formation_n', 'is_continuous'), ('formation_n', 'size'),
                  ('formation_n', 'staff_of_a'), ('staff_of_d', 'is_continuous'),
                  ('staff_of_d', 'exit_stroke_d'), ('staff_of_d', 'is_lowercase'),
                  ('staff_of_d', 'slantness'), ('staff_of_d', 'entry_stroke_a'),
                  ('word_formation', 'dimension'), ('word_formation', 'staff_of_a'),
                  ('word_formation', 'size'), ('word_formation', 'staff_of_d'),
                  ('word_formation', 'constancy'), ('constancy', 'staff_of_a'),
                  ('constancy', 'letter_spacing'), ('constancy', 'dimension')]
    basemodel = BayesianModel(
        [(p + a, p + b) for p in ('f', 'g') for a, b in base_edges] +
        [('fis_continuous', 'label'), ('fword_formation', 'label'),
         ('gis_continuous', 'label'), ('gword_formation', 'label')])

    model_seen = basemodel.copy()
    model_shuffled = basemodel.copy()
    model_unseen = basemodel.copy()
    accuracies = {}

    def train(model, traindata):
        """Fit the structure, then re-estimate every CPD with a
        BayesianEstimator (default BDeu prior) and attach the results."""
        model.fit(traindata)
        estimator = BayesianEstimator(model, traindata)
        cpds = []
        for featureName in featureNamesList:
            cpds.append(estimator.estimate_cpd('f' + featureName))
            cpds.append(estimator.estimate_cpd('g' + featureName))
        cpds.append(estimator.estimate_cpd('label'))
        model.add_cpds(*cpds)

    def evaluate(model_ve, ndarray):
        """MAP-predict 'label' row by row and return the accuracy in %.
        Bug fix: the g-features occupy columns 15-29 of the *same* row, so
        the evidence index is [i, index + 15]; the original read row i + 15
        instead."""
        predictions = []
        for i in range(ndarray.shape[0]):
            evidenceDic = {}
            for index, featureName in enumerate(featureNamesList):
                evidenceDic['f' + featureName] = ndarray[i, index] - 1
                evidenceDic['g' + featureName] = ndarray[i, index + 15] - 1
            temp = model_ve.map_query(variables=['label'], evidence=evidenceDic)
            predictions.append(temp['label'])
        correctCnt = sum(int(p) == int(ndarray[i, 30])
                         for i, p in enumerate(predictions))
        return correctCnt / len(predictions) * 100

    # Training and testing the Seen model
    train(model_seen, seen_traindata_final)
    print("CPDs Calculated")
    model_seen_ve = VariableElimination(model_seen)
    accuracies["seen_train"] = evaluate(model_seen_ve, seen_traindata_final_NDArray)
    print("Bayesian Model Accuracy for Seen Training Data = " + str(accuracies["seen_train"]))
    accuracies["seen_validation"] = evaluate(model_seen_ve, seen_validationdata_final_NDArray)
    print("Bayesian Model Accuracy for Seen Validation Data = " + str(accuracies["seen_validation"]))

    # Training and testing the Shuffled model
    train(model_shuffled, shuffled_traindata_final)
    model_shuffled_ve = VariableElimination(model_shuffled)
    accuracies["shuffled_train"] = evaluate(model_shuffled_ve, shuffled_traindata_final_NDArray)
    print("Bayesian Model Accuracy for Shuffled Training Data = " + str(accuracies["shuffled_train"]))
    accuracies["shuffled_validation"] = evaluate(model_shuffled_ve, shuffled_validationdata_final_NDArray)
    print("Bayesian Model Accuracy for Shuffled Validation Data = " + str(accuracies["shuffled_validation"]))

    # Training and testing the Unseen model
    train(model_unseen, unseen_traindata_final)
    model_unseen_ve = VariableElimination(model_unseen)
    accuracies["unseen_train"] = evaluate(model_unseen_ve, unseen_traindata_final_NDArray)
    print("Bayesian Model Accuracy for Unseen Training Data = " + str(accuracies["unseen_train"]))
    accuracies["unseen_validation"] = evaluate(model_unseen_ve, unseen_validationdata_final_NDArray)
    print("Bayesian Model Accuracy for Unseen Validation Data = " + str(accuracies["unseen_validation"]))
print(mle.estimate_cpd(node='FIO2'))
print(mle.estimate_cpd(node='CVP'))

# Estimating CPDs for all the nodes in the model
print(mle.get_parameters()[:10])  # Show just the first 10 CPDs in the output

# Verifying that the learned parameters are almost equal.
import numpy as np
print(np.allclose(alarm_model.get_cpds('FIO2').values,
                  mle.estimate_cpd('FIO2').values,
                  atol=0.01))

# Fitting using the Bayesian Estimator
from pgmpy.estimators import BayesianEstimator

best = BayesianEstimator(model=model_struct, data=samples)
print(best.estimate_cpd(node='FIO2', prior_type="BDeu",
                        equivalent_sample_size=1000))
# Uniform pseudo count for each state. Can also accept an array of the
# size of the CPD.
print(best.estimate_cpd(node='CVP', prior_type="dirichlet", pseudo_counts=100))

# Learning CPDs for all the nodes in the model. With the BDeu prior only an
# equivalent sample size is needed; a dirichlet prior would instead require
# a dict of per-node pseudo_counts.
print(best.get_parameters(prior_type="BDeu", equivalent_sample_size=1000)[:10])

# Shortcut for learning all the parameters and adding the CPDs to the model.
model_struct = BayesianModel(ebunch=alarm_model.edges())
model_struct.fit(data=samples, estimator=MaximumLikelihoodEstimator)
print(model_struct.get_cpds('FIO2'))

model_struct = BayesianModel(ebunch=alarm_model.edges())
model_struct.fit(data=samples, estimator=BayesianEstimator,
                 prior_type='BDeu', equivalent_sample_size=1000)
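# Once the CPDs are attached (via fit() or add_cpds), the fitted model
# supports inference. A brief sketch continuing from `model_struct` above;
# VariableElimination is pgmpy's standard exact-inference engine.
from pgmpy.inference import VariableElimination

infer = VariableElimination(model_struct)
print(infer.query(variables=['FIO2']))  # marginal of FIO2 under the fitted model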
def opt(self, file1, file2):
    f1 = open(file1, encoding="utf8")
    lines = f1.readlines()
    nodes = self.getegdes(lines[0])
    edges = self.getegdes(lines[1])
    data = pd.read_csv(file2)

    G = BayesianModel()
    G.add_nodes_from(nodes)
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])
    # nx.draw(G)
    # plt.show()

    # Score the supplied structure under three criteria.
    k2 = K2Score(data).score(G)
    bic = BicScore(data).score(G)
    bdeu = BDeuScore(data).score(G)
    print(k2, ",", bic, ",", bdeu)

    est = HillClimbSearch(data, scoring_method=K2Score(data))
    model = est.estimate()
    model_edges = model.edges()
    G_ = nx.DiGraph()
    G_.add_edges_from(model_edges)
    G_copy = nx.DiGraph()
    G_copy.add_edges_from(G.edges)

    add = []
    add_mut = []
    delete = []
    delete_mut = []

    # Candidate edges to add: learned edges absent from G that would not
    # create a cycle, ranked by mutual information.
    for edge in model_edges:
        node1, node2 = edge
        if not nx.has_path(G, node2, node1):
            if not G.has_edge(node1, node2):
                add.append((node1, node2))
                mut = mr.mutual_info_score(data[node1], data[node2])
                add_mut.append(mut)
    seq = sorted(zip(add_mut, add), key=lambda s: s[0], reverse=True)
    alpha = 0.015
    # if seq[0][0] > alpha:
    #     add = seq[0:1]
    add = seq[0:1]

    # Candidate edges to delete: existing edges ranked by (low) mutual
    # information.
    data_edges = []
    for edge in G.edges:
        node1, node2 = edge
        mut = mr.mutual_info_score(data[node1], data[node2])
        delete_mut.append(mut)
        data_edges.append(edge)
        # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
        #     delete.append((node1, node2))
    seq = sorted(zip(delete_mut, data_edges), key=lambda s: s[0])
    # if seq[0][0] < alpha:
    #     delete = seq[0:1]
    if len(edges) > 2:
        delete = seq[0:1]
    # Guard added: `delete` may be empty when len(edges) <= 2, which would
    # have crashed the original comparison.
    if add and delete and delete[0][0] > add[0][0]:
        delete = []

    print('add')
    for i in add:
        print(str(i[1]) + "," + str(i[0]))
    print('delete')
    for j in delete:
        print(str(j[1]) + "," + str(j[0]))

    print('cpt')
    estimator = BayesianEstimator(G, data)
    # Print one line per node: name, cardinality, CPT, parents, parent
    # cardinalities.
    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        # CPT = cpd.values
        sequence = cpd.variables[1::]
        card = []
        for x in sequence:
            s = len(dict(data[x].value_counts()))
            card.append(s)
        output = nodeName + '\t' + str(valueNum) + '\t' + str(
            CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
        print(output)

    print('mutual')
    output1 = []
    for i in range(int(len(edges) / 2)):
        mut = mr.mutual_info_score(data[edges[2 * i]], data[edges[2 * i + 1]])
        output1.append(mut)
    output2 = {}
    for node1 in G.nodes():
        d = {}
        for node2 in G.nodes():
            if node1 == node2:
                continue
            mut = mr.mutual_info_score(data[node1], data[node2])
            d[node2] = mut
        output2[node1] = d
    print(output1)
    print(output2)
hc = HillClimbSearch(df, scoring_method=K2Score(df))
best_model = hc.estimate()
print(best_model.edges())

# Bayesian Model and parameter estimation
model1 = BayesianModel([('f3', 'f4'), ('f3', 'f9'), ('f3', 'f8'), ('f5', 'f9'),
                        ('f5', 'f3'), ('f9', 'f8'), ('f9', 'f7'), ('f9', 'f1'),
                        ('f9', 'f6'), ('f9', 'f2'), ('f9', 'f4')])

# Bayesian Parameter Estimation. Note: equivalent_sample_size only affects
# the BDeu prior; it is ignored when prior_type='K2'.
est = BayesianEstimator(model1, df)
cpd_f1 = est.estimate_cpd('f1', prior_type='K2', equivalent_sample_size=50)
cpd_f2 = est.estimate_cpd('f2', prior_type='K2', equivalent_sample_size=50)
cpd_f3 = est.estimate_cpd('f3', prior_type='K2', equivalent_sample_size=50)
cpd_f4 = est.estimate_cpd('f4', prior_type='K2', equivalent_sample_size=50)
cpd_f5 = est.estimate_cpd('f5', prior_type='K2', equivalent_sample_size=50)
cpd_f6 = est.estimate_cpd('f6', prior_type='K2', equivalent_sample_size=50)
cpd_f7 = est.estimate_cpd('f7', prior_type='K2', equivalent_sample_size=50)
cpd_f8 = est.estimate_cpd('f8', prior_type='K2', equivalent_sample_size=50)
cpd_f9 = est.estimate_cpd('f9', prior_type='K2', equivalent_sample_size=50)

# Associating the CPDs with the network
model1.add_cpds(cpd_f1, cpd_f2, cpd_f3, cpd_f4, cpd_f5, cpd_f6, cpd_f7,
                cpd_f8, cpd_f9)

# check_model checks the network structure and CPDs and verifies that the
# CPDs are correctly defined and sum to 1.
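# As the comment above says, check_model is a cheap sanity check after
# add_cpds; it returns True or raises an error describing the problem:
print(model1.check_model())  # expect True if all nine CPDs are consistent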
def estimate_parameters(self):
    data = pd.DataFrame(data=self.learning_data)
    estimator = BayesianEstimator(self.pgmpy, data)
    for i, node in enumerate(self.nodes):
        if 'LAN' in node[0] or 'MOTOR' in node[0] or 'WORLD' in node[0]:
            # Note: this always estimates the CPD of 'WORLD_0' rather than
            # of node[0], so every matching node receives WORLD_0's values.
            self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd(
                'WORLD_0', prior_type='dirichlet',
                pseudo_counts=[2, 3]).values
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# test 1
data = pd.DataFrame(data={
    'A': [0.0, 0.0, 1.0],
    'B': [0.0, 1.0, 0.0],
    'C': [1.0, 1.0, 0.0]
})
# data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
print(data)
model = BayesianModel([('A', 'C'), ('B', 'C')])
estimator = BayesianEstimator(model, data)
cpd_C = estimator.estimate_cpd('C', prior_type="dirichlet",
                               pseudo_counts=[1, 2])
print(cpd_C)

# test 2
values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 4)),
                      columns=['A', 'B', 'C', 'D'])
model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D')])
estimator = BayesianEstimator(model, values)
a = estimator.get_parameters(prior_type='BDeu', equivalent_sample_size=5)
for i in a:
    print(i)
# print(a)
# print(type(a))
# print(len(a))
# print(a[0])
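# A hedged compatibility note: newer pgmpy releases expect 2-D dirichlet
# pseudo_counts shaped (cardinality, number of parent configurations), so
# the 1-D [1, 2] in test 1 may be rejected there. The equivalent 2-D call,
# reusing `data` from test 1 above:
model_2d = BayesianModel([('A', 'C'), ('B', 'C')])
estimator_2d = BayesianEstimator(model_2d, data)
cpd_C_2d = estimator_2d.estimate_cpd(
    'C', prior_type="dirichlet",
    pseudo_counts=[[1, 1, 1, 1],
                   [2, 2, 2, 2]])  # C has 2 states, A x B has 4 configurations
print(cpd_C_2d)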
passive_users = "passive, "*24 active_users = [elem for elem in active_users.strip().split(",") if elem != ''] passive_users = [elem for elem in passive_users.strip().split(",") if elem != ''] data = pd.DataFrame(data = {'last_activity' : high + medium + low, 'duration': dhigh + dmedium + dlow, 'pages_viewed': pvhigh + pvmedium + pvlow, 'user_type' : active_users + passive_users }) model = BayesianModel([ ('last_activity', 'duration'), ('duration', 'pages_viewed'), ('pages_viewed', 'user_type')]) pe = ParameterEstimator(model, data) #print("\n", pe.state_counts('last_activity')) # unconditional #print("\n", pe.state_counts('user_type')) # conditional on fruit and size mle = MaximumLikelihoodEstimator(model, data) #print(mle.estimate_cpd('last_activity')) # unconditional #print(mle.estimate_cpd('user_type')) # conditional # Calibrate all CPDs of `model` using MLE: model.fit(data) est = BayesianEstimator(model, data) result = est.estimate_cpd('user_type', prior_type='BDeu', equivalent_sample_size=10) import code code.interact(local=locals())
def distribution(excel_rows, item_name, items, file_name, df_cols,
                 groupby_cols, bp_group):
    # Using a 95% confidence interval: (1 - 0.95) / 2 in each tail
    Z_score = abs(st.norm.ppf(0.025))
    alpha = 1 - 0.95
    data_files = {}
    Orientations = ["left", "right"]

    # create a dataframe per item and estimate the Orientation CPD from it
    for item in items:
        if item_name == "Monkey":
            df = (monkey_df[(monkey_df.Monkey == item)])
        elif item_name == "gender":
            df = (gender_df[(gender_df.gender == item)])
        z = BayesianEstimator(model, df)
        cat_cpd = z.estimate_cpd('Orientation', prior_type="bdeu",
                                 equivalent_sample_size=6)  # .to_factor()
        for left in categories:
            for right in categories:
                for cat in Orientations:
                    try:
                        count = z.state_counts('Orientation')[left][right][cat]
                        prob = cat_cpd.get_value(**{
                            'Left_categ': left,
                            'Right_categ': right,
                            'Orientation': cat
                        })
                        # p_hat and q_hat set to the conservative 0.5 each,
                        # since we have no previous data.
                        # Since it is a probability, the lower bound is
                        # clipped at 0.
                        lower_ci = max(
                            prob - Z_score * math.sqrt((0.5 * 0.5) / df.shape[0]), 0)
                        upper_ci = prob + Z_score * math.sqrt(
                            (0.5 * 0.5) / df.shape[0])
                        if not isNaN(prob) and prob > 0:
                            excel_rows.append([
                                item, left, right, cat, count, prob,
                                lower_ci, upper_ci, alpha
                            ])
                        else:
                            pass
                            # excel_rows.append([item, left, right, cat, count, prob, 0, 0, 0])
                    except KeyError:
                        pass
                        # excel_rows.append([item, left, right, cat, count, 0, 0, 0, 0])

    prob_df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    gen_df = prob_df[df_cols].groupby(groupby_cols)['Count'].agg(['sum'])  # .reset_index()
    ax, bp = gen_df.boxplot(rot=90, fontsize=12, figsize=(16, 10),
                            column=['sum'], by=bp_group,
                            return_type="both")[0]
    plt.title(item_name.capitalize() + " Box plot grouped by: " + str(bp_group))
    plt.suptitle('')
    plt.ylabel("sum")
    # Alternative groupings that were tried with the same boxplot call:
    # group = ['Left-Category', 'Category']
    # group = ['Right-Category', 'Category']

    writer = pd.ExcelWriter(file_name + ".xlsx")
    prob_df.to_excel(writer, sheet_name='Distribution')
    prob_df.sort_values('Probability', ascending=False).drop_duplicates(
        [item_name]).to_excel(writer, sheet_name='preference')
    writer.save()
    plt.savefig(file_name + ".png", dpi=100)
    plt.show()
    plt.clf()