# Imports assumed throughout this listing. The standard-library and numpy
# imports are exact; the pyBN paths are indicative (the docstrings below
# reference the "pyBN.structure_learn.orient_edges" module) and may need
# adjusting to the local package layout.
import itertools
import operator
from copy import copy

import numpy as np

from pyBN.classes.bayesnet import BayesNet
from pyBN.utils.independence_tests import mi_test, are_independent
from pyBN.utils.data import replace_strings, unique_bins
from pyBN.structure_learn.orient_edges import orient_edges_CS


def chow_liu(data, edges_only=False):
    """
    Perform the Chow-Liu structure learning algorithm over
    an entire dataset and return the BN-tree.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn. It should be
        the entire dataset.

    *edges_only* : a boolean
        Whether to return just the tree edges and the value
        dictionary instead of a BayesNet object.

    Returns
    -------
    *bn* : a BayesNet object
        The structure-learned BN.

    Effects
    -------
    None

    Notes
    -----
    The maximum-weight spanning tree is built with Prim's algorithm
    (Kruskal's would work equally well). A single pass over the sorted
    edge list is NOT sufficient: an edge (i, j) with neither endpoint in
    vertex_cache would be skipped and never reconsidered, even after one
    endpoint is later added to vertex_cache, which can produce a
    lower-weight spanning tree. The edge list is therefore rescanned
    until the tree spans every vertex.
    """
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))

    n_rv = data.shape[1]

    # pairwise mutual information for every pair (i, j) with i < j
    edge_list = [(i, j, mi_test(data[:, (i, j)], chi2_test=False))
                 for i in range(n_rv) for j in range(i + 1, n_rv)]
    edge_list.sort(key=operator.itemgetter(2), reverse=True)  # sort by weight

    vertex_cache = {edge_list[0][0]}  # start with first vertex..
    mst = dict((rv, []) for rv in range(n_rv))

    # Prim's algorithm: at each step add the largest-weight edge crossing
    # the cut between the current tree and the remaining vertices.
    while len(vertex_cache) < n_rv:
        # since the graph is undirected, (i, j) and (j, i) are the same
        # edge and edge_list only stores i < j; because edge_list is
        # sorted, the first edge found crossing the cut is a safe edge
        for i, j, w in edge_list:
            if i in vertex_cache and j not in vertex_cache:
                mst[i].append(j)
                vertex_cache.add(j)
                break
            elif j in vertex_cache and i not in vertex_cache:
                mst[j].append(i)
                vertex_cache.add(i)
                break
        else:
            break  # disconnected graph: no edge crosses the cut

    if edges_only:
        return mst, value_dict

    bn = BayesNet(mst, value_dict)
    return bn
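# Minimal usage sketch for chow_liu, not a definitive test. The dataset is
# synthetic and hypothetical; it assumes the module-level imports above.
# Column 1 is a noisy copy of column 0, so the learned tree should link them.
rng = np.random.RandomState(0)
demo = rng.randint(0, 2, size=(500, 3))
demo[:, 1] = (demo[:, 0] + (rng.rand(500) < 0.1).astype(int)) % 2
mst, values = chow_liu(demo, edges_only=True)
print(mst)  # e.g. {0: [1, 2], 1: [], 2: []}, up to tie-breaking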
def mb_fitness(data, Mb, target=None):
    """
    Evaluate the fitness of a Markov blanket dictionary learned from a
    given dataset, based on the distance metric provided in [1] and [2].

    From [2]:
        A distance measure that indicates the "fitness" of the
        discovered blanket... is the average, over all attributes X
        outside the blanket, of the expected KL-divergence between
        Pr(T | B(T)) and Pr(T | B(T) u {X}). We can expect this measure
        to be close to zero when B(T) is an approximate blanket.

    My note: T is the target variable. If the KL-divergence between the
    two distributions above is zero, then {X} provides no new
    information about T and can thus be excluded from Mb(T) -- this is
    exactly the definition of conditional independence.

    Notes
    -----
    - Find Pr(T|B(T)).
    - For each variable X outside of B(T), calculate
      D( Pr(T|B(T)), Pr(T|B(T) u {X}) ).
    - Take the average (closer to zero is better).

    This essentially tests whether T is independent of each X given
    B(T), i.e. the sum over all X not in B(T) of
    mi_test(data[:,(T,X,B(T))]), divided by the number of such X.
    """
    if target is None:
        nodes = set(Mb.keys())
    else:
        try:
            nodes = set(target)
        except TypeError:
            nodes = {target}

    fitness_dict = dict([(rv, 0) for rv in nodes])
    for T in nodes:
        non_blanket = nodes - set(Mb[T]) - {T}
        for X in non_blanket:
            pval = mi_test(data[:, (T, X) + tuple(Mb[T])])
            fitness_dict[T] += 1 / pval
        # average over the variables outside the blanket, per [2]
        if non_blanket:
            fitness_dict[T] /= len(non_blanket)
    return fitness_dict
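# Fitness sketch on hand-built blankets (hypothetical data; assumes the
# module-level imports above). Column 1 depends on column 0 and column 2
# is independent noise, so the blankets below are a reasonable candidate;
# scores closer to zero indicate a better blanket.
rng = np.random.RandomState(1)
demo = rng.randint(0, 2, size=(500, 3))
demo[:, 1] = (demo[:, 0] + (rng.rand(500) < 0.1).astype(int)) % 2
good_mb = {0: [1], 1: [0], 2: []}
print(mb_fitness(demo, good_mb))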
def orient_edges_gs2(edge_dict, Mb, data, alpha):
    """
    Similar algorithm to orient_edges_MB, but slightly modified
    for speed. Needs testing.
    """
    d_edge_dict = dict([(rv, []) for rv in edge_dict])
    for X in edge_dict.keys():
        for Y in edge_dict[X]:
            # Z ranges over neighbors of X that are not neighbors of Y
            nxy = set(edge_dict[X]) - set(edge_dict[Y]) - {Y}
            for Z in nxy:
                if Y not in d_edge_dict[X]:
                    d_edge_dict[X].append(Y)  # SET Y -> X
                # B is the smaller of B(Y)-{X,Z} and B(Z)-{X,Y};
                # note key=len: plain min() would compare the sets by
                # the subset relation rather than by size
                B = min(set(Mb[Y]) - {X} - {Z},
                        set(Mb[Z]) - {X} - {Y},
                        key=len)
                for i in range(len(B)):
                    for S in itertools.combinations(B, i):
                        cols = (Y, Z, X) + tuple(S)
                        pval = mi_test(data[:, cols])
                        if pval < alpha and X in d_edge_dict[Y]:
                            # Y IS dependent on Z given S+{X}
                            d_edge_dict[Y].remove(X)
                    if X in d_edge_dict[Y]:
                        break
    return d_edge_dict
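# Why key=len matters above: plain min() on two sets uses the sets' "<"
# operator, which is the proper-subset relation, so for two incomparable
# sets it just returns the first argument. A standalone check:
a, b = {1, 2, 3}, {4, 5}
print(min(a, b))           # {1, 2, 3} -- not the smaller set
print(min(a, b, key=len))  # {4, 5} -- the smaller conditioning set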
def pc(data, alpha=0.05):
    """
    Path Condition algorithm for structure learning. This is a good
    test, but it has some issues with test reliability when the size
    of the dataset is small. The Necessary Path Condition (NPC)
    algorithm can solve these problems.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn -> will support a pandas
        dataframe once the numpy path works

    *alpha* : a float
        Probability of Type I error

    Returns
    -------
    *bn* : a BayesNet object
        The network created from the learning procedure, with the
        nodes/edges initialized/changed

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
        *** 5 vars, 624 obs ***
        - 90.9 ms
    """
    n_rv = data.shape[1]

    ##### FIND EDGES #####
    value_dict = dict(zip(range(n_rv),
                          [list(np.unique(col)) for col in data.T]))

    # start from a fully connected undirected graph
    edge_dict = dict([(i, [j for j in range(n_rv) if i != j])
                      for i in range(n_rv)])
    block_dict = dict([(i, []) for i in range(n_rv)])
    stop = False
    i = 0  # begin with marginal (order-0) independence tests
    while not stop:
        for x in range(n_rv):
            for y in edge_dict[x]:
                if i == 0:
                    pval_xy_z = mi_test(data[:, (x, y)])
                    if pval_xy_z > alpha:
                        if y in edge_dict[x]:
                            edge_dict[x].remove(y)
                            edge_dict[y].remove(x)
                else:
                    for z in itertools.combinations(edge_dict[x], i):
                        if y not in z:
                            cols = (x, y) + z
                            pval_xy_z = mi_test(data[:, cols])
                            # if I(X,Y | Z) = TRUE
                            if pval_xy_z > alpha:
                                block_dict[x] = {y: z}
                                block_dict[y] = {x: z}
                                if y in edge_dict[x]:
                                    edge_dict[x].remove(y)
                                    edge_dict[y].remove(x)
        i += 1

        stop = True
        for x in range(n_rv):
            if len(edge_dict[x]) > i - 1:
                stop = False
                break

    # ORIENT EDGES (from collider set)
    directed_edge_dict = orient_edges_CS(edge_dict, block_dict)

    # CREATE BAYESNET OBJECT
    bn = BayesNet(directed_edge_dict, value_dict)

    return bn
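# Usage sketch for pc (synthetic, hypothetical data; assumes the
# module-level imports above). With mutually independent columns, most
# edges should be pruned by the order-0 tests; the attribute access is
# guarded because the BayesNet edge attribute name is an assumption here.
rng = np.random.RandomState(2)
indep = rng.randint(0, 2, size=(500, 3))
bn = pc(indep, alpha=0.05)
print(bn.E if hasattr(bn, 'E') else bn)  # expect few or no edges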
def resolve_markov_blanket(Mb, data, alpha=0.05):
    """
    Resolving the Markov blanket is the process by which a PDAG is
    constructed from the collection of Markov blankets for each node.
    Since an undirected graph is returned, the edges still need to be
    oriented by calling some version of the "orient_edges" function in
    the "pyBN.structure_learn.orient_edges" module.

    This algorithm is adapted from Margaritis, but also see [3] for
    good pseudocode.

    Arguments
    ---------
    *Mb* : a dictionary, where
        key = rv and value = list of vars in rv's markov blanket

    *data* : a nested numpy array
        The dataset used to learn the Mb

    Returns
    -------
    *edge_dict* : a dictionary, where
        key = rv and value = list of rv's children

    Effects
    -------
    None

    Notes
    -----
    """
    n_rv = data.shape[1]
    edge_dict = dict([(rv, []) for rv in range(n_rv)])
    for X in range(n_rv):
        for Y in Mb[X]:
            # X and Y are direct neighbors if X and Y are dependent
            # given S for all S in T, where T is the smaller of
            # B(X)-{Y} and B(Y)-{X}
            if len(Mb[X]) < len(Mb[Y]):
                T = copy(Mb[X])  # shallow copy is sufficient
                if Y in T:
                    T.remove(Y)
            else:
                T = copy(Mb[Y])  # shallow copy is sufficient
                if X in T:
                    T.remove(X)

            # X and Y must be dependent conditioned upon
            # EVERY POSSIBLE COMBINATION of T
            direct_neighbors = True
            for i in range(len(T)):
                for S in itertools.combinations(T, i):
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval > alpha:
                        direct_neighbors = False
            if direct_neighbors:
                if Y not in edge_dict[X] and X not in edge_dict[Y]:
                    edge_dict[X].append(Y)
                if X not in edge_dict[Y]:
                    edge_dict[Y].append(X)
    return edge_dict
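# Sketch: resolving hand-built blankets for a chain 0 - 1 - 2. The data
# below is hypothetical; with blankets this small the conditioning sets
# are empty, so the blanket structure alone determines the result.
# Assumes the module-level imports above.
rng = np.random.RandomState(3)
demo_data = rng.randint(0, 2, size=(200, 3))
mb = {0: [1], 1: [0, 2], 2: [1]}
print(resolve_markov_blanket(mb, demo_data))
# -> {0: [1], 1: [0, 2], 2: [1]} : each undirected edge listed from both ends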
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of Markov blankets from
        data, called Fast-IAMB, that employs a heuristic to quickly
        recover the Markov blanket. Empirical results show that
        Fast-IAMB performs in many cases faster and more reliably than
        existing algorithms without adversely affecting the accuracy of
        the recovered Markov blankets."

    Arguments
    ---------
    *data* : a nested numpy array

    *k* : an integer
        The max number of edges to add at each iteration of
        the algorithm.

    *alpha* : a float
        Probability of Type I error

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    - Currently does not work. I think it's stuck in an infinite loop...
    """
    # get values
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))
    # card = dict(zip(range(data.shape[1]),np.amax(data,axis=0)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        S = set(range(n_rv)) - {T}
        # drop candidates that are marginally independent of T;
        # iterate over a copy so the set is not mutated mid-iteration
        for A in list(S):
            if are_independent(data[:, (A, T)]):
                S.remove(A)
        s_h_dict = dict([(s, 0) for s in S])
        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Calculate mutual information for all variables
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                            for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top MI-score variables while there is still enough
                # data for the bins: at least k samples per cell of the
                # contingency table over {x_i, T} u Mb[T], on average
                if N / (card[x_i] * card[T] *
                        np.prod([card[b] for b in Mb[T]])) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            for A in list(Mb[T]):  # copy: Mb[T] is mutated in the loop
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T]-{A}, remove A
                if are_independent(data[:, cols]):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break
            else:
                A = set(range(n_rv)) - {T} - set(Mb[T])
                # A = set(nodes) - {T} - set(Mb[T])
                S = set()
                for a in A:
                    cols = (a, T) + tuple(Mb[T])
                    # keep only candidates still dependent on T given Mb[T]
                    if not are_independent(data[:, cols]):
                        S.add(a)
        if debug:
            print('Done with %s' % T)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        return Mb[_T[0]]
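# The grow-phase guard above asks whether the contingency table over
# {x_i, T} u Mb[T] still has, on average, at least k samples per cell.
# Standalone arithmetic sketch (the card values are hypothetical):
N, k = 624, 5
cards = [3, 2, 2]  # card[x_i], card[T], then cards of the current Mb[T]
cells = 1
for c in cards:
    cells *= c
print(N / cells >= k)  # True: 52 samples per cell on average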
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a Discrete Bayesian
    Network from data.

    Arguments
    ---------
    *data* : a nested numpy array

    *alpha* : a float
        Probability of Type I error.

    *feature_selection* : None or a string
        Whether to use IAMB as a structure learning or feature
        selection algorithm.

    Returns
    -------
    *bn* : a BayesNet object or
    *mb* : the markov blanket of a node

    Effects
    -------
    None

    Notes
    -----
    - Works but there are definitely some bugs.

    Speed Test:
        *** 5 vars, 624 obs ***
        - 196 ms
    """
    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T)-{T} that maximizes
            # mutual information of X,T|Mb(T)
            # i.e. max of mi_test(data[:,(X,T,Mb(T))])
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]) - {T}:
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            # if X_max is dependent on T given Mb(T), add it
            if max_x is not None:
                cols = (max_x, T) + tuple(Mb[T])
                if not are_independent(data[:, cols]):
                    Mb[T].append(max_x)
                    Mb_change = True
                    if debug:
                        print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        for X in list(Mb[T]):  # copy: Mb[T] is mutated in the loop
            # if X is independent of T given Mb(T) - {X}, remove X
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))
        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(data.shape[1]),
                              [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        return Mb[_T[0]]
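# Feature-selection sketch: with a single target index, iamb returns just
# that node's Markov blanket instead of a BayesNet. The data is
# hypothetical; assumes the module-level imports above.
rng = np.random.RandomState(4)
fs_data = rng.randint(0, 2, size=(500, 4))
fs_data[:, 1] = (fs_data[:, 0] + (rng.rand(500) < 0.1).astype(int)) % 2
mb_of_0 = iamb(fs_data, alpha=0.05, feature_selection=0)
print(mb_of_0)  # e.g. [1] -- column 1 is the only variable tied to column 0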
def orient_edges_MB(edge_dict, Mb, data, alpha):
    """
    Orient edges from a Markov blanket based on the rules presented in
    Margaritis' thesis, pg. 35. This method is used for structure
    learning algorithms that return/resolve a markov blanket -
    i.e. growshrink and iamb.

    Also, see [2] for good full pseudocode.

    The rule: orient Y -> X if there exists a variable Z in
    N(X)-N(Y)-{Y} such that Y and Z are dependent given S+{X} for all
    S subset of T, where T is the smaller of B(Y)-{X,Z} and B(Z)-{X,Y}.

    Arguments
    ---------
    *edge_dict* : a dictionary, where
        key = node and value = list of neighbors for key. Note: there
        MUST BE duplicates in edge_dict -> i.e. each edge should be in
        edge_dict twice, since Y is in edge_dict[X] and X is in
        edge_dict[Y].

    *Mb* : a dictionary, where
        key = node and value = list of nodes in the markov blanket
        of the key node

    *data* : a nested numpy array

    *alpha* : a float
        Probability of Type I error.

    Returns
    -------
    *edge_dict* : a dictionary
        Dictionary of directed edges, so there are no duplicates

    Effects
    -------
    None

    Notes
    -----
    """
    for X in edge_dict.keys():
        for Y in edge_dict[X]:
            nxy = set(edge_dict[X]) - set(edge_dict[Y]) - {Y}
            for Z in nxy:
                by = set(Mb[Y]) - {X} - {Z}
                bz = set(Mb[Z]) - {X} - {Y}
                # T is the smaller of the two sets by size, hence
                # key=len; plain min() would compare the sets by the
                # subset relation
                T = min(by, bz, key=len)
                if len(T) > 0:
                    for i in range(len(T)):
                        for S in itertools.combinations(T, i):
                            cols = (Y, Z, X) + tuple(S)
                            pval = mi_test(data[:, cols])
                            if pval < alpha:
                                # dependent: orient Y -> X
                                if Y in edge_dict[X]:
                                    edge_dict[X].remove(Y)
                            else:
                                if X in edge_dict[Y]:
                                    edge_dict[Y].remove(X)
                else:
                    cols = (Y, Z, X)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:
                        if Y in edge_dict[X]:
                            edge_dict[X].remove(Y)
                    else:
                        if X in edge_dict[Y]:
                            edge_dict[Y].remove(X)
    return edge_dict
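# The orientation routines above expect an undirected edge_dict in which
# every edge appears from both endpoints. A minimal sketch of building
# one from a list of pairs (the names here are illustrative only):
pairs = [(0, 1), (1, 2)]
sym_edges = {rv: [] for rv in range(3)}
for u, v in pairs:
    sym_edges[u].append(v)
    sym_edges[v].append(u)
print(sym_edges)  # {0: [1], 1: [0, 2], 2: [1]}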
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink algorithm over a dataset to learn the
    Bayesian network structure.

    This algorithm is clearly a good candidate for numba JIT
    compilation...

    STEPS
    -----
    1. Compute Markov Blanket
    2. Compute Graph Structure
    3. Orient Edges
    4. Remove Cycles
    5. Reverse Edges
    6. Propagate Directions

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure

    *alpha* : a float
        Type I error rate for independence test

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
        *** 5 variables, 624 observations ***
        - 63.7 ms
    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = dict([(rv, []) for rv in range(n_rv)])

    for X in _T:
        S = []

        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    # if there exists some Y such that Y is dependent
                    # on X given S, add Y to S
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent
                        grow_condition = True  # dependent -> continue searching
                        S.append(Y)

        shrink_condition = True
        while shrink_condition:
            TEMP_S = []
            shrink_condition = False
            for Y in S:
                s_copy = copy(S)
                s_copy.remove(Y)  # condition on S-{Y}
                # if X is independent of Y given S-{Y}, leave Y out
                # if X is dependent on Y given S-{Y}, keep it in
                cols = (X, Y) + tuple(s_copy)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent
                    TEMP_S.append(Y)
                else:  # independent -> continue searching
                    shrink_condition = True
            # shrink S; if anything was dropped, scan again
            S = TEMP_S

        Mb[X] = TEMP_S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(TEMP_S)))

    if feature_selection is None:
        # STEP 2: COMPUTE GRAPH STRUCTURE
        # i.e. Resolve Markov Blanket
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))

        # STEP 3: ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        return Mb[_T[0]]
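# Usage sketch for gs (hypothetical data; assumes the module-level imports
# above, and that replace_strings accepts an integer-valued array):
rng = np.random.RandomState(5)
gs_data = rng.randint(0, 2, size=(624, 5))
bn = gs(gs_data, alpha=0.05, debug=False)
# or, as pure feature selection for node 2:
print(gs(gs_data, feature_selection=2))  # Markov blanket of column 2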