import numpy as np
from copy import copy

# The helpers below are assumed to come from the surrounding package
# (independence tests, data utilities, and the BayesNet class):
#   mi_test, are_independent, replace_strings, unique_bins,
#   resolve_markov_blanket, orient_edges_MB, BayesNet


def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink ("GS") algorithm over a dataset to learn
    the structure of a Bayesian network.

    This algorithm is a good candidate for numba JIT compilation...

    STEPS
    -----
    1. Compute Markov Blanket
    2. Compute Graph Structure
    3. Orient Edges
    4. Remove Cycles
    5. Reverse Edges
    6. Propagate Directions

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure

    *alpha* : a float
        Type I error rate for the independence test

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
        *** 5 variables, 624 observations ***
        - 63.7 ms
    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1: COMPUTE MARKOV BLANKETS
    Mb = dict([(rv, []) for rv in range(n_rv)])

    for X in _T:
        S = []

        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    # if there exists some Y such that Y is dependent
                    # on X given S, add Y to S
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent -> continue searching
                        grow_condition = True
                        S.append(Y)

        shrink_condition = True
        while shrink_condition:
            TEMP_S = []
            shrink_condition = False
            for Y in S:
                s_copy = copy(S)
                s_copy.remove(Y)  # condition on S - {Y}
                # if X is independent of Y given S - {Y}, leave Y out;
                # if X is dependent on Y given S - {Y}, keep it in
                cols = (X, Y) + tuple(s_copy)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent
                    TEMP_S.append(Y)
                else:  # independent -> keep shrinking
                    shrink_condition = True
            # shrink S before re-checking; without this the loop never
            # terminates once an independent variable is found
            S = TEMP_S

        Mb[X] = TEMP_S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(TEMP_S)))

    if feature_selection is None:
        # STEP 2: COMPUTE GRAPH STRUCTURE
        # i.e. resolve the Markov blankets into an undirected graph
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))

        # STEP 3: ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # feature selection: return only the requested Markov blanket
        return Mb[feature_selection]
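# A minimal usage sketch for gs(), assuming a small synthetic discrete
# dataset; `_demo_gs` and the variable layout below are illustrative
# additions, not part of the original API.
def _demo_gs():
    rng = np.random.RandomState(0)
    # three binary variables: X1 is a noisy copy of X0, X2 is independent
    x0 = rng.randint(0, 2, 1000)
    x1 = (x0 + rng.randint(0, 2, 1000)) % 2
    x2 = rng.randint(0, 2, 1000)
    data = np.column_stack([x0, x1, x2])
    bn = gs(data, alpha=0.05, debug=True)  # full structure -> BayesNet
    mb0 = gs(data, feature_selection=0)    # Markov blanket of X0 only
    return bn, mb0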
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of Markov blankets from data,
        called Fast-IAMB, that employs a heuristic to quickly recover the
        Markov blanket. Empirical results show that Fast-IAMB performs in
        many cases faster and more reliably than existing algorithms without
        adversely affecting the accuracy of the recovered Markov blankets."

    Arguments
    ---------
    *data* : a nested numpy array

    *k* : an integer
        The minimum average number of observations per cell of the
        conditional contingency table required before another variable
        may be added during the grow phase (the data-sufficiency
        threshold of the Fast-IAMB heuristic).

    *alpha* : a float
        Probability of Type I error

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    - Currently does not work. I think it's stuck in an infinite loop...
    """
    # get values
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings with integer codes
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        # candidate set: every variable that is marginally dependent on T
        # (iterate over a copy -- removing from a set while iterating
        # over it raises a RuntimeError)
        S = set(range(n_rv)) - {T}
        for A in list(S):
            if are_independent(data[:, (A, T)]):
                S.remove(A)

        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # mi_test returns a p-value, so the strongest dependencies
            # have the *smallest* values -- sort ascending
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                            for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get):
                # add the most-dependent variables as long as the data
                # supports, on average, at least k observations per cell
                # of the contingency table over {x_i, T} and Mb[T]
                if N / (card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            for A in list(Mb[T]):  # iterate over a copy while removing
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T] - {A}, remove A
                if are_independent(data[:, cols]):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break
            else:
                # rebuild the candidate set from the variables that are
                # still dependent on T given the current blanket
                A = set(range(n_rv)) - {T} - set(Mb[T])
                S = set()
                for a in A:
                    cols = (a, T) + tuple(Mb[T])
                    if not are_independent(data[:, cols]):
                        S.add(a)

        if debug:
            print('Done with %s' % T)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        return Mb[feature_selection]
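# A minimal usage sketch for fast_iamb(), mirroring _demo_gs above; the
# dataset and the choice of k are illustrative assumptions (the docstring
# notes the function is still considered unreliable). With
# feature_selection set, only the Markov blanket of that column is
# returned rather than a full BayesNet.
def _demo_fast_iamb():
    rng = np.random.RandomState(1)
    # three discrete variables: X1 depends on X0, X2 is independent
    x0 = rng.randint(0, 3, 2000)
    x1 = (x0 + rng.randint(0, 2, 2000)) % 3
    x2 = rng.randint(0, 2, 2000)
    data = np.column_stack([x0, x1, x2])
    return fast_iamb(data, k=5, alpha=0.05, feature_selection=0, debug=True)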