Пример #1
0
def chow_liu(data, edges_only=False):
    """
    Learn a tree-structured Bayesian network with the Chow-Liu algorithm.

    Every pair of columns is weighted with the value returned by
    ``mi_test(data[:, (i, j)], chi2_test=False)`` and a maximum-weight
    spanning tree over those weights is grown with Prim's algorithm,
    rooted at the first endpoint of the heaviest edge.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn. It should be the entire
        dataset, one column per random variable.

    *edges_only* : a boolean
        If truthy, return the raw tree and value dictionary instead of
        a BayesNet object.

    Returns
    -------
    *bn* : a BayesNet object
        ``BayesNet(mst, value_dict)``, returned when *edges_only* is falsy.

    *(mst, value_dict)* : a tuple of two dictionaries
        Returned when *edges_only* is truthy. ``mst`` maps each column
        index to the list of its children in the tree; ``value_dict``
        maps each column index to the list of its unique values.

    Effects
    -------
    None

    Notes
    -----
    A single sweep over the sorted edge list is not enough: an edge that
    is skipped while both of its endpoints are still outside the tree may
    be the heaviest crossing edge later on. The edge list is therefore
    rescanned from the top after every insertion.
    """
    n_rv = data.shape[1]
    value_dict = dict(zip(range(n_rv),
                          [list(np.unique(col)) for col in data.T]))

    edge_list = [(i, j, mi_test(data[:, (i, j)], chi2_test=False))
                 for i in range(n_rv) for j in range(i + 1, n_rv)]
    edge_list.sort(key=operator.itemgetter(2), reverse=True)  # heaviest first

    mst = dict((rv, []) for rv in range(n_rv))

    # With fewer than two variables there are no edges and nothing to grow.
    if edge_list:
        in_tree = {edge_list[0][0]}
        while len(in_tree) < n_rv:
            for pos, (i, j, _weight) in enumerate(edge_list):
                i_in_tree = i in in_tree
                if i_in_tree == (j in in_tree):
                    # Both endpoints inside or both outside: not a crossing
                    # edge right now, but it may become one later.
                    continue
                parent, child = (i, j) if i_in_tree else (j, i)
                mst[parent].append(child)
                in_tree.add(child)
                del edge_list[pos]
                break
            else:
                # No crossing edge left (cannot happen on a complete graph).
                break

    if edges_only:
        return mst, value_dict

    return BayesNet(mst, value_dict)
Пример #2
0
def chow_liu(data, edges_only=False):
    """
    Learn a tree-structured Bayesian network with the Chow-Liu algorithm.

    Every pair of columns is weighted with the value returned by
    ``mi_test(data[:, (i, j)], chi2_test=False)`` and a maximum-weight
    spanning tree over those weights is grown with Prim's algorithm,
    rooted at the first endpoint of the heaviest edge.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn. It should be the entire
        dataset, one column per random variable.

    *edges_only* : a boolean
        If truthy, return the raw tree and value dictionary instead of
        a BayesNet object.

    Returns
    -------
    *bn* : a BayesNet object
        ``BayesNet(mst, value_dict)``, returned when *edges_only* is falsy.

    *(mst, value_dict)* : a tuple of two dictionaries
        Returned when *edges_only* is truthy. ``mst`` maps each column
        index to the list of its children in the tree; ``value_dict``
        maps each column index to the list of its unique values.

    Effects
    -------
    None

    Notes
    -----
    The sorted edge list is rescanned from the top after every insertion.
    Sweeping it only once would permanently discard any edge whose two
    endpoints were both outside the tree when it was visited, which can
    yield a spanning tree of lower total weight.
    """
    n_rv = data.shape[1]
    value_dict = dict(zip(range(n_rv),
                          [list(np.unique(col)) for col in data.T]))

    edge_list = [(i, j, mi_test(data[:, (i, j)], chi2_test=False))
                 for i in range(n_rv) for j in range(i + 1, n_rv)]
    edge_list.sort(key=operator.itemgetter(2), reverse=True)  # heaviest first

    mst = dict((rv, []) for rv in range(n_rv))

    # With fewer than two variables there are no edges and nothing to grow.
    if edge_list:
        in_tree = {edge_list[0][0]}
        while len(in_tree) < n_rv:
            for pos, (i, j, _weight) in enumerate(edge_list):
                i_in_tree = i in in_tree
                if i_in_tree == (j in in_tree):
                    # Not a crossing edge right now; keep it for later scans.
                    continue
                parent, child = (i, j) if i_in_tree else (j, i)
                mst[parent].append(child)
                in_tree.add(child)
                del edge_list[pos]
                break
            else:
                # No crossing edge left (cannot happen on a complete graph).
                break

    if edges_only:
        return mst, value_dict

    return BayesNet(mst, value_dict)
Пример #3
0
def chow_liu(data, edges_only=False):
    """
    Learn a tree-structured Bayesian network with the Chow-Liu algorithm.

    Every pair of columns is weighted with the value returned by
    ``mi_test(data[:, (i, j)], chi2_test=False)`` and a maximum-weight
    spanning tree over those weights is grown with Prim's algorithm,
    rooted at the first endpoint of the heaviest edge.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn. It should be the entire
        dataset, one column per random variable.

    *edges_only* : a boolean
        If truthy, return the raw tree and value dictionary instead of
        a BayesNet object.

    Returns
    -------
    *bn* : a BayesNet object
        ``BayesNet(mst, value_dict)``, returned when *edges_only* is falsy.

    *(mst, value_dict)* : a tuple of two dictionaries
        Returned when *edges_only* is truthy. ``mst`` maps each column
        index to the list of its children in the tree; ``value_dict``
        maps each column index to the list of its unique values.

    Effects
    -------
    None

    Notes
    -----
    The sorted edge list is rescanned from the top after every insertion.
    Sweeping it only once would permanently discard any edge whose two
    endpoints were both outside the tree when it was visited, which can
    yield a spanning tree of lower total weight.
    """
    n_rv = data.shape[1]
    value_dict = dict(
        zip(range(n_rv), [list(np.unique(col)) for col in data.T]))

    edge_list = [(i, j, mi_test(data[:, (i, j)], chi2_test=False))
                 for i in range(n_rv) for j in range(i + 1, n_rv)]
    edge_list.sort(key=operator.itemgetter(2), reverse=True)  # heaviest first

    mst = dict((rv, []) for rv in range(n_rv))

    # With fewer than two variables there are no edges and nothing to grow.
    if edge_list:
        in_tree = {edge_list[0][0]}
        while len(in_tree) < n_rv:
            for pos, (i, j, _weight) in enumerate(edge_list):
                i_in_tree = i in in_tree
                if i_in_tree == (j in in_tree):
                    # Not a crossing edge right now; keep it for later scans.
                    continue
                parent, child = (i, j) if i_in_tree else (j, i)
                mst[parent].append(child)
                in_tree.add(child)
                del edge_list[pos]
                break
            else:
                # No crossing edge left (cannot happen on a complete graph).
                break

    if edges_only:
        return mst, value_dict

    return BayesNet(mst, value_dict)
Пример #4
0
def mb_fitness(data, Mb, target=None):
    """
    Score how well each learned Markov blanket shields its target
    variable, following the distance idea described in [1] and [2].

    From [2]:
        A distance measure that indicates the "fitness"
        of the discovered blanket... to be the average, over all attributes
        X outside the blanket, of the expected KL-divergence between
        Pr(T | B(T)) and Pr(T | B(T) u {X}). We can expect this
        measure to be close to zero when B(T) is an approximate
        blanket.

    If that divergence is zero, X adds no information about T beyond
    B(T) and can be left out of the blanket, i.e. T is conditionally
    independent of X given B(T).

    Arguments
    ---------
    *data* : a nested numpy array
        The dataset; columns are indexed by the keys/values of *Mb*.

    *Mb* : a dictionary, where
        key = rv and value = list of vars in rv's markov blanket

    *target* : None, a single rv, or an iterable of rvs
        The variables to score. None scores every key of *Mb*.

    Returns
    -------
    *fitness_dict* : a dictionary, where
        key = scored rv and value = the sum of ``1 / pval`` over every
        variable X outside that rv's blanket, with
        ``pval = mi_test(data[:, (T, X) + blanket])``.

    Effects
    -------
    None

    Notes
    -----
    - The score is a sum of reciprocal values returned by ``mi_test``;
      it is NOT divided by the number of outside variables, so it is
      not the average described in [2].
    - A ``pval`` of zero contributes ``inf`` instead of raising.
    - The outside variables are always taken from all keys of *Mb*,
      also when *target* restricts which variables are scored.
    """
    if target is None:
        nodes = set(Mb)
    else:
        try:
            nodes = set(target)
        except TypeError:
            # A single, non-iterable variable was passed.
            nodes = {target}

    # Candidates for "outside the blanket" come from every known variable,
    # not just from the variables being scored.
    universe = set(Mb)

    fitness_dict = {rv: 0 for rv in nodes}
    for T in nodes:
        blanket = tuple(Mb[T])
        for X in universe - set(blanket) - {T}:
            pval = mi_test(data[:, (T, X) + blanket])
            fitness_dict[T] += 1 / pval if pval else float("inf")
    return fitness_dict
Пример #5
0
def mb_fitness(data, Mb, target=None):
    """
    Score how well each learned Markov blanket shields its target
    variable, following the distance idea described in [1] and [2].

    From [2]:
        A distance measure that indicates the "fitness"
        of the discovered blanket... to be the average, over all attributes
        X outside the blanket, of the expected KL-divergence between
        Pr(T | B(T)) and Pr(T | B(T) u {X}). We can expect this
        measure to be close to zero when B(T) is an approximate
        blanket.

    If that divergence is zero, X adds no information about T beyond
    B(T) and can be left out of the blanket, i.e. T is conditionally
    independent of X given B(T).

    Arguments
    ---------
    *data* : a nested numpy array
        The dataset; columns are indexed by the keys/values of *Mb*.

    *Mb* : a dictionary, where
        key = rv and value = list of vars in rv's markov blanket

    *target* : None, a single rv, or an iterable of rvs
        The variables to score. None scores every key of *Mb*.

    Returns
    -------
    *fitness_dict* : a dictionary, where
        key = scored rv and value = the sum of ``1 / pval`` over every
        variable X outside that rv's blanket, with
        ``pval = mi_test(data[:, (T, X) + blanket])``.

    Effects
    -------
    None

    Notes
    -----
    - The score is a sum of reciprocal values returned by ``mi_test``;
      it is NOT divided by the number of outside variables, so it is
      not the average described in [2].
    - A ``pval`` of zero contributes ``inf`` instead of raising.
    - The outside variables are always taken from all keys of *Mb*,
      also when *target* restricts which variables are scored.
    """
    if target is None:
        nodes = set(Mb)
    else:
        try:
            nodes = set(target)
        except TypeError:
            # A single, non-iterable variable was passed.
            nodes = {target}

    # Candidates for "outside the blanket" come from every known variable,
    # not just from the variables being scored.
    universe = set(Mb)

    fitness_dict = {rv: 0 for rv in nodes}
    for T in nodes:
        blanket = tuple(Mb[T])
        for X in universe - set(blanket) - {T}:
            pval = mi_test(data[:, (T, X) + blanket])
            fitness_dict[T] += 1 / pval if pval else float("inf")
    return fitness_dict
Пример #6
0
def orient_edges_gs2(edge_dict, Mb, data, alpha):
    """
    Experimental edge-orientation pass over an undirected neighbour
    dictionary (the original author marked it "Need to test").

    Arguments
    ---------
    *edge_dict* : a dictionary, where
        key = node and value = list of neighbours of that node

    *Mb* : a dictionary, where
        key = node and value = list of nodes in its markov blanket

    *data* : a nested numpy array

    *alpha* : a float
        Threshold compared against the value returned by ``mi_test``.

    Returns
    -------
    *d_edge_dict* : a dictionary
        A new dictionary with the same keys as *edge_dict*; for every X
        it lists each neighbour Y for which X has some other neighbour Z
        that is not a neighbour of Y.

    Effects
    -------
    None (*edge_dict* is only read).

    Notes
    -----
    - NOTE(review): conditioning subsets are drawn with sizes
      0 .. len(B) - 1, so the full set B is never used - confirm
      whether that is intended.
    - NOTE(review): the removal of X from ``d_edge_dict[Y]`` is guarded
      by ``X in d_edge_dict[Y]``; whether that entry can exist at this
      point depends on iteration order - verify against the intended
      algorithm.
    """
    d_edge_dict = {rv: [] for rv in edge_dict}
    for X in edge_dict:
        for Y in edge_dict[X]:
            # Neighbours of X that are neither Y nor neighbours of Y.
            nxy = set(edge_dict[X]) - set(edge_dict[Y]) - {Y}
            for Z in nxy:
                if Y not in d_edge_dict[X]:
                    d_edge_dict[X].append(Y)
                # The smaller (by size) of the two reduced blankets. Plain
                # min() on sets would compare by subset relation instead.
                B = min(set(Mb[Y]) - {X} - {Z},
                        set(Mb[Z]) - {X} - {Y},
                        key=len)
                for i in range(len(B)):
                    for S in itertools.combinations(B, i):
                        cols = (Y, Z, X) + tuple(S)
                        pval = mi_test(data[:, cols])
                        if pval < alpha and X in d_edge_dict[Y]:
                            d_edge_dict[Y].remove(X)
                if X in d_edge_dict[Y]:
                    break
    return d_edge_dict
Пример #7
0
def orient_edges_gs2(edge_dict, Mb, data, alpha):
    """
    Experimental edge-orientation pass over an undirected neighbour
    dictionary (the original author marked it "Need to test").

    Arguments
    ---------
    *edge_dict* : a dictionary, where
        key = node and value = list of neighbours of that node

    *Mb* : a dictionary, where
        key = node and value = list of nodes in its markov blanket

    *data* : a nested numpy array

    *alpha* : a float
        Threshold compared against the value returned by ``mi_test``.

    Returns
    -------
    *d_edge_dict* : a dictionary
        A new dictionary with the same keys as *edge_dict*; for every X
        it lists each neighbour Y for which X has some other neighbour Z
        that is not a neighbour of Y.

    Effects
    -------
    None (*edge_dict* is only read).

    Notes
    -----
    - NOTE(review): conditioning subsets are drawn with sizes
      0 .. len(B) - 1, so the full set B is never used - confirm
      whether that is intended.
    - NOTE(review): the removal of X from ``d_edge_dict[Y]`` is guarded
      by ``X in d_edge_dict[Y]``; whether that entry can exist at this
      point depends on iteration order - verify against the intended
      algorithm.
    """
    d_edge_dict = {rv: [] for rv in edge_dict}
    for X in edge_dict:
        for Y in edge_dict[X]:
            # Neighbours of X that are neither Y nor neighbours of Y.
            nxy = set(edge_dict[X]) - set(edge_dict[Y]) - {Y}
            for Z in nxy:
                if Y not in d_edge_dict[X]:
                    d_edge_dict[X].append(Y)
                # The smaller (by size) of the two reduced blankets. Plain
                # min() on sets would compare by subset relation instead.
                B = min(set(Mb[Y]) - {X} - {Z},
                        set(Mb[Z]) - {X} - {Y},
                        key=len)
                for i in range(len(B)):
                    for S in itertools.combinations(B, i):
                        cols = (Y, Z, X) + tuple(S)
                        pval = mi_test(data[:, cols])
                        if pval < alpha and X in d_edge_dict[Y]:
                            d_edge_dict[Y].remove(X)
                if X in d_edge_dict[Y]:
                    break
    return d_edge_dict
Пример #8
0
def pc(data, alpha=0.05):
    """
    Learn a Bayesian network structure with a PC-style constraint search.

    Starting from the complete undirected graph, the edge x - y is
    removed as soon as some set z of i other neighbours of x gives
    ``mi_test(data[:, (x, y) + z]) > alpha``; i grows from 0 upwards.
    The remaining edges are oriented with ``orient_edges_CS``.

    The author notes that the test reliability suffers when the dataset
    is small, and that the Necessary Path Condition (NPC) algorithm can
    address that.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn, one column per variable.

    *alpha* : a float
        Threshold on the value returned by ``mi_test``; a value above
        it is treated as "independent".

    Returns
    -------
    *bn* : a BayesNet object
        ``BayesNet(directed_edge_dict, value_dict)``.

    Effects
    -------
    None

    Notes
    -----
    - The search starts with the empty conditioning set (i = 0), so
      marginal tests are run first.
    - Neighbour lists are iterated over a snapshot because edges are
      removed during the scan, and the search for a separating set
      stops at the first one found.
    - ``block_dict[x]`` is overwritten with ``{y: z}`` on every removal,
      so only the most recent separating set per node is kept.
      NOTE(review): confirm that this is what ``orient_edges_CS`` expects.

    Speed Test (reported by the original author):
        ** 5 vars, 624 obs ***
            - 90.9 ms
    """
    n_rv = data.shape[1]

    ##### FIND EDGES #####
    value_dict = dict(zip(range(n_rv),
                          [list(np.unique(col)) for col in data.T]))

    # Complete undirected graph: every node is adjacent to every other node.
    edge_dict = {x: [y for y in range(n_rv) if y != x] for x in range(n_rv)}
    block_dict = {x: [] for x in range(n_rv)}

    i = 0  # size of the conditioning set
    stop = False
    while not stop:
        for x in range(n_rv):
            # Snapshot: edge_dict[x] shrinks while we walk over it.
            for y in list(edge_dict[x]):
                for z in itertools.combinations(edge_dict[x], i):
                    if y in z:
                        continue
                    pval_xy_z = mi_test(data[:, (x, y) + z])
                    # if I(X,Y | Z) = TRUE
                    if pval_xy_z > alpha:
                        block_dict[x] = {y: z}
                        block_dict[y] = {x: z}
                        edge_dict[x].remove(y)
                        edge_dict[y].remove(x)
                        break

        i += 1
        # Go on only while some node still has at least i neighbours.
        stop = not any(len(edge_dict[x]) > i - 1 for x in range(n_rv))

    # ORIENT EDGES (from collider set)
    directed_edge_dict = orient_edges_CS(edge_dict, block_dict)

    # CREATE BAYESNET OBJECT
    return BayesNet(directed_edge_dict, value_dict)
Пример #9
0
def resolve_markov_blanket(Mb, data, alpha=0.05):
    """
    Turn a collection of Markov blankets into an undirected neighbour
    graph. The result still has to be oriented by one of the
    "orient_edges" functions in the "pyBN.structure_learn.orient_edges"
    module.

    This algorithm is adapted from Margaritis, but also see [3]
    for good pseudocode.

    Arguments
    ---------
    *Mb* : a dictionary, where
        key = rv and value = list of vars in rv's markov blanket

    *data* : a nested numpy array
        The dataset used to learn the Mb

    *alpha* : a float
        Threshold on the value returned by ``mi_test``; a value above
        it is treated as "independent".

    Returns
    -------
    *edge_dict* : a dictionary, where
        key = rv and value = list of rv's direct neighbours. Every edge
        is stored in both directions.

    Effects
    -------
    None (nothing is printed and *Mb* is not modified).

    Notes
    -----
    X and Y (with Y in B(X)) are direct neighbours if they are dependent
    given S for EVERY subset S of T, where T is the smaller of B(X)-{Y}
    and B(Y)-{X}. Subsets of every size from 0 up to and including
    len(T) are tested, and the search stops at the first subset that
    makes X and Y independent.
    """
    n_rv = data.shape[1]
    edge_dict = {rv: [] for rv in range(n_rv)}
    for X in range(n_rv):
        for Y in Mb[X]:
            # T is the smaller of B(X)-{Y} and B(Y)-{X}; work on a copy so
            # the caller's blankets stay untouched.
            if len(Mb[X]) < len(Mb[Y]):
                T = list(Mb[X])
                if Y in T:
                    T.remove(Y)
            else:
                T = list(Mb[Y])
                if X in T:
                    T.remove(X)

            # any() short-circuits, so no further tests are run once a
            # separating subset has been found.
            separated = any(
                mi_test(data[:, (X, Y) + S]) > alpha
                for size in range(len(T) + 1)
                for S in itertools.combinations(T, size)
            )
            if not separated:
                if Y not in edge_dict[X]:
                    edge_dict[X].append(Y)
                if X not in edge_dict[Y]:
                    edge_dict[Y].append(X)
    return edge_dict
Пример #10
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of
        Markov blankets from data, called Fast-IAMB, that employs
        a heuristic to quickly recover the Markov blanket. Empirical
        results show that Fast-IAMB performs in many cases
        faster and more reliably than existing algorithms without
        adversely affecting the accuracy of the recovered Markov
        blankets."

    Arguments
    ---------
    *data* : a nested numpy array

    *k* : an integer
        Minimum value of N / (product of the cardinalities of the
        candidate, the target and the current blanket) required before
        another candidate is added in the grow phase.

    *alpha* : a float
        Probability of Type I error

    *feature_selection* : None or a single variable index
        None learns a blanket for every variable and builds a network;
        otherwise only the blanket of that variable is learned.

    *debug* : a boolean
        Print progress messages.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list
        The markov blanket of *feature_selection* otherwise.

    Effects
    -------
    None

    Notes
    -----
    - The original author reports: "Currently does not work. I think
      it's stuck in an infinite loop..."
    - NOTE(review): candidates for the grow phase are the variables for
      which ``are_independent`` returns True, and the shrink phase
      removes the variables for which it returns True. Both cannot be
      right at once; adding a candidate and immediately shrinking it
      away would explain the reported infinite loop. The polarity is
      left exactly as the author wrote it - confirm against the
      definition of ``are_independent``.
    """
    # get values
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    N = data.shape[0]
    Mb = {rv: [] for rv in range(n_rv)}
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), "feature_selection must be only one value"
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        # Built as a new set: removing from a set while iterating over it
        # raises RuntimeError.
        S = {A for A in set(range(n_rv)) - {T}
             if are_independent(data[:, (A, T)], alpha)}

        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Score every candidate against the blanket as it stands now.
            mi_dict = {s: mi_test(data[:, (s, T) + tuple(Mb[T])]) for s in S}
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top-scoring variables until there isn't enough data
                # per cell of the joint table.
                n_cells = card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])
                if N / float(n_cells) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            for A in list(Mb[T]):  # snapshot: Mb[T] is edited in the loop
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T], remove A
                if are_independent(data[:, cols], alpha):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print("Breaking..")
                break

            outside = set(range(n_rv)) - {T} - set(Mb[T])
            S = {a for a in outside
                 if are_independent(data[:, (a, T) + tuple(Mb[T])], alpha)}

        if debug:
            print("Done with %s" % T)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data, alpha)

        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

        # CREATE BAYESNET OBJECT
        return BayesNet(oriented_edge_dict, value_dict)

    return Mb[feature_selection]
Пример #11
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data.

    Arguments
    ---------
    *data* : a nested numpy array

    *alpha* : a float
        Significance threshold forwarded to every independence test.

    *feature_selection* : None or a single variable index
        None learns a blanket for every variable and builds a network;
        otherwise only the blanket of that variable is learned.

    *debug* : a boolean
        Print progress messages.

    Returns
    -------
    *bn* : a BayesNet object or
    *mb* : the markov blanket of a node

    Effects
    -------
    None

    Notes
    -----
    - The original author reports: "Works but there are definitely
      some bugs."
    - NOTE(review): the growing phase adds the best candidate when
      ``are_independent`` returns True, although the accompanying
      comment speaks of the candidate being *dependent* on T. The
      shrinking phase removes a variable on the same True result. The
      polarity is left as written - confirm against the definition of
      ``are_independent``.

    Speed Test (reported by the original author):
        *** 5 vars, 624 obs ***
            - 196 ms
    """
    n_rv = data.shape[1]
    Mb = {rv: [] for rv in range(n_rv)}

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}

        # GROWING PHASE
        Mb_change = True
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T)-{T} that maximizes the score
            # mi_test(data[:,(X,T,Mb(T))], test=False)
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]):
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            if max_x is None:
                # No candidate left (or none scored above -1).
                break

            cols = (max_x, T) + tuple(Mb[T])
            if are_independent(data[:, cols], alpha):
                # Add the best-scoring candidate itself, not whichever
                # variable the scan above happened to visit last.
                Mb[T].append(max_x)
                Mb_change = True
                if debug:
                    print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        for X in list(Mb[T]):  # snapshot: Mb[T] is edited in the loop
            # if x is independent of t given Mb(T) - {x}
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data, alpha)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        value_dict = dict(
            zip(range(data.shape[1]),
                [list(np.unique(col)) for col in data.T]))
        return BayesNet(oriented_edge_dict, value_dict)

    return Mb[feature_selection]
Пример #12
0
def orient_edges_MB(edge_dict, Mb, data, alpha):
    """
    Orient edges from a Markov Blanket based on the rules presented
    in Margaritis' Thesis pg. 35. This method is used
    for structure learning algorithms that return/resolve
    a markov blanket - i.e. growshrink and iamb.

    Also, see [2] for good full pseudocode.

    The rule looks for a variable Z in N(X)-N(Y)-{Y} such that Y and Z
    are dependent given S+{X} for all S subset of T, where T is the
    smaller of B(Y)-{X,Z} and B(Z)-{X,Y}.

    Arguments
    ---------
    *edge_dict* : a dictionary, where
        key = node and value = list
        of neighbors for key. Note: there
        MUST BE duplicates in edge_dict ->
        i.e. each edge should be in edge_dict
        twice since Y in edge_dict[X] and
        X in edge_dict[Y]

    *Mb* : a dictionary, where
        key = node and value = list of
        nodes in the markov blanket of node

    *data* : a nested numpy array

    *alpha* : a float
        Threshold compared against the value returned by ``mi_test``.

    Returns
    -------
    *edge_dict* : a dictionary
        The SAME dictionary object that was passed in, after edge
        directions have been removed from it.

    Effects
    -------
    *edge_dict* is modified in place.

    Notes
    -----
    - For every test, ``pval < alpha`` removes Y from ``edge_dict[X]``
      and any other result removes X from ``edge_dict[Y]``.
    - NOTE(review): a removal happens after each individual test rather
      than after all subsets S have been checked, and for a non-empty T
      the subsets have sizes 0 .. len(T) - 1 only. Confirm both against
      the rule quoted above.
    """
    for X in edge_dict:
        # Snapshot: edge_dict[X] may lose Y while we walk over it.
        for Y in list(edge_dict[X]):
            nxy = set(edge_dict[X]) - set(edge_dict[Y]) - {Y}

            for Z in nxy:
                by = set(Mb[Y]) - {X} - {Z}
                bz = set(Mb[Z]) - {X} - {Y}
                # Smaller by size; plain min() on sets would compare by
                # subset relation instead.
                T = min(by, bz, key=len)

                # An empty T still gets exactly one test, with S = ().
                for i in range(max(len(T), 1)):
                    for S in itertools.combinations(T, i):
                        cols = (Y, Z, X) + tuple(S)
                        pval = mi_test(data[:, cols])
                        if pval < alpha:
                            if Y in edge_dict[X]:
                                edge_dict[X].remove(Y)
                        elif X in edge_dict[Y]:
                            edge_dict[Y].remove(X)
    return edge_dict
Пример #13
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink algorithm over a dataset to learn
    Bayesian network structure.

    The author notes that this algorithm is a good candidate for
    numba JIT compilation.

    STEPS
    -----
    1. Compute Markov Blanket
    2. Compute Graph Structure
    3. Orient Edges

    (The author's plan also lists "Remove Cycles", "Reverse Edges" and
    "Propagate Directions"; this function does not perform them.)

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure

    *alpha* : a float
        Type I error rate for independence test

    *feature_selection* : None or a single variable index
        None learns a blanket for every variable and builds a network;
        otherwise only the blanket of that variable is learned.

    *debug* : a boolean
        Print progress messages.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list
        The markov blanket of *feature_selection* otherwise.

    Effects
    -------
    None

    Notes
    -----
    In the shrink phase a variable is dropped from the blanket as soon
    as it tests independent of X given the rest of the current blanket,
    and passes are repeated until a full pass drops nothing.

    Speed Test (reported by the original author):
        *** 5 variables, 624 observations ***
        - 63.7 ms

    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = {rv: [] for rv in range(n_rv)}

    for X in _T:
        S = []

        # GROW: keep adding any Y that is dependent on X given S.
        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent -> continue searching
                        grow_condition = True
                        S.append(Y)

        # SHRINK: drop any Y that is independent of X given S-{Y}.
        shrink_condition = True
        while shrink_condition:
            shrink_condition = False
            for Y in list(S):  # snapshot: S is edited in the loop
                rest = [s for s in S if s != Y]
                cols = (X, Y) + tuple(rest)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent -> keep Y
                    continue
                # S must really shrink here, otherwise the next pass would
                # repeat the same tests forever.
                S.remove(Y)
                shrink_condition = True

        Mb[X] = S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(S)))

    if feature_selection is None:
        # STEP 2: COMPUTE GRAPH STRUCTURE
        # i.e. Resolve Markov Blanket
        edge_dict = resolve_markov_blanket(Mb, data, alpha)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))

        # STEP 3: ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        return BayesNet(oriented_edge_dict, value_dict)

    return Mb[feature_selection]
Пример #14
0
def resolve_markov_blanket(Mb, data, alpha=0.05):
    """
    Turn a collection of Markov blankets into an undirected neighbour
    graph. The result still has to be oriented by one of the
    "orient_edges" functions in the "pyBN.structure_learn.orient_edges"
    module.

    This algorithm is adapted from Margaritis, but also see [3]
    for good pseudocode.

    Arguments
    ---------
    *Mb* : a dictionary, where
        key = rv and value = list of vars in rv's markov blanket

    *data* : a nested numpy array
        The dataset used to learn the Mb

    *alpha* : a float
        Threshold on the value returned by ``mi_test``; a value above
        it is treated as "independent".

    Returns
    -------
    *edge_dict* : a dictionary, where
        key = rv and value = list of rv's direct neighbours. Every edge
        is stored in both directions.

    Effects
    -------
    None (*Mb* is not modified).

    Notes
    -----
    X and Y (with Y in B(X)) are direct neighbours if they are dependent
    given S for EVERY subset S of T, where T is the smaller of B(X)-{Y}
    and B(Y)-{X}. Subsets of every size from 0 up to and including
    len(T) are tested, and the search stops at the first subset that
    makes X and Y independent.
    """
    n_rv = data.shape[1]
    edge_dict = {rv: [] for rv in range(n_rv)}
    for X in range(n_rv):
        for Y in Mb[X]:
            # T is the smaller of B(X)-{Y} and B(Y)-{X}; work on a copy so
            # the caller's blankets stay untouched.
            if len(Mb[X]) < len(Mb[Y]):
                T = list(Mb[X])
                if Y in T:
                    T.remove(Y)
            else:
                T = list(Mb[Y])
                if X in T:
                    T.remove(X)

            # any() short-circuits, so no further tests are run once a
            # separating subset has been found.
            separated = any(
                mi_test(data[:, (X, Y) + S]) > alpha
                for size in range(len(T) + 1)
                for S in itertools.combinations(T, size)
            )
            if not separated:
                if Y not in edge_dict[X]:
                    edge_dict[X].append(Y)
                if X not in edge_dict[Y]:
                    edge_dict[Y].append(X)
    return edge_dict
Пример #15
0
def pc(data, alpha=0.05):
    """
    Learn a Bayesian network structure with a PC-style constraint search.

    Starting from the complete undirected graph, the edge x - y is
    removed as soon as some set z of i other neighbours of x gives
    ``mi_test(data[:, (x, y) + z]) > alpha``; i grows from 0 upwards.
    The remaining edges are oriented with ``orient_edges_CS``.

    The author notes that the test reliability suffers when the dataset
    is small, and that the Necessary Path Condition (NPC) algorithm can
    address that.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn, one column per variable.

    *alpha* : a float
        Threshold on the value returned by ``mi_test``; a value above
        it is treated as "independent".

    Returns
    -------
    *bn* : a BayesNet object
        ``BayesNet(directed_edge_dict, value_dict)``.

    Effects
    -------
    None

    Notes
    -----
    - The search starts with the empty conditioning set (i = 0), so
      marginal tests are run first.
    - Neighbour lists are iterated over a snapshot because edges are
      removed during the scan, and the search for a separating set
      stops at the first one found.
    - ``block_dict[x]`` is overwritten with ``{y: z}`` on every removal,
      so only the most recent separating set per node is kept.
      NOTE(review): confirm that this is what ``orient_edges_CS`` expects.

    Speed Test (reported by the original author):
        ** 5 vars, 624 obs ***
            - 90.9 ms
    """
    n_rv = data.shape[1]

    ##### FIND EDGES #####
    value_dict = dict(
        zip(range(n_rv), [list(np.unique(col)) for col in data.T]))

    # Complete undirected graph: every node is adjacent to every other node.
    edge_dict = {x: [y for y in range(n_rv) if y != x] for x in range(n_rv)}
    block_dict = {x: [] for x in range(n_rv)}

    i = 0  # size of the conditioning set
    stop = False
    while not stop:
        for x in range(n_rv):
            # Snapshot: edge_dict[x] shrinks while we walk over it.
            for y in list(edge_dict[x]):
                for z in itertools.combinations(edge_dict[x], i):
                    if y in z:
                        continue
                    pval_xy_z = mi_test(data[:, (x, y) + z])
                    # if I(X,Y | Z) = TRUE
                    if pval_xy_z > alpha:
                        block_dict[x] = {y: z}
                        block_dict[y] = {x: z}
                        edge_dict[x].remove(y)
                        edge_dict[y].remove(x)
                        break

        i += 1
        # Go on only while some node still has at least i neighbours.
        stop = not any(len(edge_dict[x]) > i - 1 for x in range(n_rv))

    # ORIENT EDGES (from collider set)
    directed_edge_dict = orient_edges_CS(edge_dict, block_dict)

    # CREATE BAYESNET OBJECT
    return BayesNet(directed_edge_dict, value_dict)
Пример #16
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink algorithm over a dataset to learn
    Bayesian network structure.

    The author notes that this algorithm is a good candidate for
    numba JIT compilation.

    STEPS
    -----
    1. Compute Markov Blanket
    2. Compute Graph Structure
    3. Orient Edges

    (The author's plan also lists "Remove Cycles", "Reverse Edges" and
    "Propagate Directions"; this function does not perform them.)

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure

    *alpha* : a float
        Type I error rate for independence test

    *feature_selection* : None or a single variable index
        None learns a blanket for every variable and builds a network;
        otherwise only the blanket of that variable is learned.

    *debug* : a boolean
        Print progress messages.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list
        The markov blanket of *feature_selection* otherwise.

    Effects
    -------
    None

    Notes
    -----
    In the shrink phase a variable is dropped from the blanket as soon
    as it tests independent of X given the rest of the current blanket,
    and passes are repeated until a full pass drops nothing.

    Speed Test (reported by the original author):
        *** 5 variables, 624 observations ***
        - 63.7 ms

    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = {rv: [] for rv in range(n_rv)}

    for X in _T:
        S = []

        # GROW: keep adding any Y that is dependent on X given S.
        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent -> continue searching
                        grow_condition = True
                        S.append(Y)

        # SHRINK: drop any Y that is independent of X given S-{Y}.
        shrink_condition = True
        while shrink_condition:
            shrink_condition = False
            for Y in list(S):  # snapshot: S is edited in the loop
                rest = [s for s in S if s != Y]
                cols = (X, Y) + tuple(rest)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent -> keep Y
                    continue
                # S must really shrink here, otherwise the next pass would
                # repeat the same tests forever.
                S.remove(Y)
                shrink_condition = True

        Mb[X] = S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(S)))

    if feature_selection is None:
        # STEP 2: COMPUTE GRAPH STRUCTURE
        # i.e. Resolve Markov Blanket
        edge_dict = resolve_markov_blanket(Mb, data, alpha)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))

        # STEP 3: ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        return BayesNet(oriented_edge_dict, value_dict)

    return Mb[feature_selection]
Пример #17
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
	"""
	Fast-IAMB algorithm for learning Markov blankets (and from them
	a Bayesian network structure) from discrete data.

	From [1]:
		"A novel algorithm for the induction of
		Markov blankets from data, called Fast-IAMB, that employs
		a heuristic to quickly recover the Markov blanket. Empirical
		results show that Fast-IAMB performs in many cases
		faster and more reliably than existing algorithms without
		adversely affecting the accuracy of the recovered Markov
		blankets."

	Arguments
	---------
	*data* : a nested numpy array
		Rows are observations, columns are variables.

	*k* : an integer
		Minimum ratio of observations to contingency-table cells
		required before another variable is added in the grow phase.

	*alpha* : a float
		Forwarded to orient_edges_MB. The blanket-learning tests call
		are_independent without it, so they use that helper's default.

	*feature_selection* : None or a column index
		If None, learn the blanket of every variable and build a
		BayesNet. Otherwise learn only the blanket of that column.

	*debug* : a boolean
		Print progress messages.

	Returns
	-------
	*bn* : a BayesNet object (feature_selection is None), or
	*mb* : a list, the Markov blanket of column *feature_selection*

	Effects
	-------
	None

	Notes
	-----
	- NOTE(review): assumes are_independent returns True when the
	  first two columns are independent given the rest, as its name
	  implies -- confirm against the helper.
	- NOTE(review): the outer loop ends only when no candidates remain
	  or data is insufficient and nothing was removed. If the grow
	  phase keeps re-adding what the shrink phase removes it will not
	  terminate; the original author suspected an infinite loop.

	"""
	# Record the observed values before strings are replaced.
	value_dict = dict(zip(range(data.shape[1]),
			[list(np.unique(col)) for col in data.T]))
	data = replace_strings(data)

	n_rv = data.shape[1]
	N = data.shape[0]
	Mb = dict([(rv, []) for rv in range(n_rv)])
	card = dict(zip(range(n_rv), unique_bins(data)))

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# LEARN MARKOV BLANKET
	for T in _T:
		others = set(range(n_rv)) - {T}
		# Initial candidates: variables marginally dependent on T.
		# Built as a new set; removing from a set while iterating it raises.
		S = {A for A in others if not are_independent(data[:, (A, T)])}

		while S:
			insufficient_data = False

			#### GROW PHASE ####
			# NOTE(review): candidates are ranked by mi_test's default
			# return value -- confirm that is the MI score, not a p-value.
			mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])])) for s in S])
			for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
				# Cells in the table over x_i, T and the current blanket.
				n_cells = card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])
				# Keep adding while there are at least k observations per cell.
				if float(N) / n_cells >= k:
					Mb[T].append(x_i)
				else:
					insufficient_data = True
					break

			#### SHRINK PHASE ####
			removed_vars = False
			# Iterate over a copy so removals do not skip elements.
			for A in list(Mb[T]):
				cols = (A, T) + tuple(set(Mb[T]) - {A})
				# if A is independent of T given Mb[T] - {A}, remove A
				if are_independent(data[:, cols]):
					Mb[T].remove(A)
					removed_vars = True

			#### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
			if insufficient_data and not removed_vars:
				if debug:
					print('Breaking..')
				break

			# Next candidates: variables still dependent on T given Mb[T].
			S = set()
			for a in others - set(Mb[T]):
				cols = (a, T) + tuple(Mb[T])
				if not are_independent(data[:, cols]):
					S.add(a)
		if debug:
			print('Done with %s' % T)

	if feature_selection is not None:
		return Mb[feature_selection]

	# RESOLVE GRAPH STRUCTURE
	edge_dict = resolve_markov_blanket(Mb, data)

	# ORIENT EDGES
	oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

	# CREATE BAYESNET OBJECT
	bn = BayesNet(oriented_edge_dict, value_dict)

	return bn
Пример #18
0
def orient_edges_MB(edge_dict, Mb, data, alpha):
    """
    Orient the edges of an undirected skeleton using Markov blankets.

    Follows the rule in Margaritis' thesis (pg. 35), used by the
    blanket-based structure learners (grow-shrink, iamb):

        Orient Y -> X if there exists a variable Z in N(X)-N(Y)-{Y}
        such that Y and Z are dependent given S+{X} for all S subset
        of T, where T is the smaller of B(Y)-{X,Z} and B(Z)-{X,Y}.

    Arguments
    ---------
    *edge_dict* : a dictionary, where
        key = node and value = list of neighbors of key. Each
        undirected edge MUST appear twice: Y in edge_dict[X] and
        X in edge_dict[Y].

    *Mb* : a dictionary, where
        key = node and value = list of nodes in the Markov
        blanket of key.

    *data* : a nested numpy array

    *alpha* : a float
        Significance threshold; a p-value from mi_test below
        alpha is treated as dependence.

    Returns
    -------
    *edge_dict* : a dictionary
        The same dictionary object. A directed edge Y -> X is
        represented by X in edge_dict[Y] with Y absent from
        edge_dict[X].

    Effects
    -------
    Modifies *edge_dict* in place.

    Notes
    -----
    - Neighbor sets are taken from a snapshot of the skeleton made
      before any edge is oriented.
    - An edge is only oriented while both directions are present,
      so an edge is never deleted outright.
    - NOTE(review): when candidates Z exist but none satisfies the
      rule, the edge is oriented X -> Y. This fallback is kept from
      the original implementation; it is not part of the quoted rule.

    """
    # Snapshot the skeleton; the lists in edge_dict are edited below.
    skeleton = dict((node, list(adj)) for node, adj in edge_dict.items())

    for X in skeleton:
        n_x = set(skeleton[X])
        for Y in skeleton[X]:
            # Skip edges already oriented, e.g. while handling (Y, X).
            if Y not in edge_dict[X] or X not in edge_dict[Y]:
                continue

            candidates = n_x - set(skeleton[Y]) - {Y}
            if not candidates:
                continue

            y_into_x = False
            for Z in candidates:
                by = set(Mb[Y]) - {X, Z}
                bz = set(Mb[Z]) - {X, Y}
                # Smaller by size; plain min() on sets compares by subset.
                T = min(by, bz, key=len)
                # Every subset of T, from the empty set up to T itself.
                if all(mi_test(data[:, (Y, Z, X) + tuple(S)]) < alpha
                       for size in range(len(T) + 1)
                       for S in itertools.combinations(T, size)):
                    y_into_x = True
                    break

            if y_into_x:
                edge_dict[X].remove(Y)  # keeps Y -> X
            else:
                edge_dict[Y].remove(X)  # keeps X -> Y
    return edge_dict
Пример #19
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
	"""
	IAMB Algorithm for learning the structure of a
	Discrete Bayesian Network from data.

	Arguments
	---------
	*data* : a nested numpy array
		Rows are observations, columns are variables.

	*alpha* : a float
		Significance threshold passed to the independence tests
		and to the edge-orientation step.

	*feature_selection* : None or a column index
		If None, use IAMB for structure learning over all
		variables. Otherwise learn only the Markov blanket of
		that column (feature selection).

	*debug* : a boolean
		Print progress messages.

	Returns
	-------
	*bn* : a BayesNet object (feature_selection is None), or
	*mb* : a list, the Markov blanket of column *feature_selection*

	Effects
	-------
	None

	Notes
	-----
	- NOTE(review): assumes are_independent returns True when the
	  first two columns are independent given the rest, as its name
	  implies -- confirm against the helper.

	Speed Test:
		*** 5 vars, 624 obs ***
			- 196 ms
	"""
	n_rv = data.shape[1]
	Mb = dict([(rv, []) for rv in range(n_rv)])

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# LEARN MARKOV BLANKET
	for T in _T:

		V = set(range(n_rv)) - {T}

		# GROWING PHASE
		Mb_change = True
		while Mb_change:
			Mb_change = False
			# find X_max in V-Mb(T) that maximizes the
			# mutual information of X,T|Mb(T)
			max_val = -1
			max_x = None
			for X in V - set(Mb[T]):
				cols = (X, T) + tuple(Mb[T])
				mi_val = mi_test(data[:, cols], test=False)
				if mi_val > max_val:
					max_val = mi_val
					max_x = X
			if max_x is None:
				# no candidates left
				break
			# add X_max only if it is dependent on T given Mb(T)
			cols = (max_x, T) + tuple(Mb[T])
			if not are_independent(data[:, cols], alpha):
				# append the maximizer, not the last loop variable
				Mb[T].append(max_x)
				Mb_change = True
				if debug:
					print('Adding %s to MB of %s' % (str(max_x), str(T)))

		# SHRINKING PHASE
		# Iterate over a copy so removals do not skip elements.
		for X in list(Mb[T]):
			# if X is independent of T given Mb(T) - {X}, drop it
			cols = (X, T) + tuple(set(Mb[T]) - {X})
			if are_independent(data[:, cols], alpha):
				Mb[T].remove(X)
				if debug:
					print('Removing %s from MB of %s' % (str(X), str(T)))

	if feature_selection is not None:
		return Mb[feature_selection]

	# RESOLVE GRAPH STRUCTURE
	edge_dict = resolve_markov_blanket(Mb, data)
	if debug:
		print('Unoriented edge dict:\n %s' % str(edge_dict))
		print('MB: %s' % str(Mb))
	# ORIENT EDGES
	oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
	if debug:
		print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

	# CREATE BAYESNET OBJECT
	value_dict = dict(zip(range(data.shape[1]),
		[list(np.unique(col)) for col in data.T]))
	bn = BayesNet(oriented_edge_dict, value_dict)

	return bn