Exemplo n.º 1
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
	"""
	Perform growshink algorithm over dataset to learn
	Bayesian network structure.

	This algorithm is clearly a good candidate for
	numba JIT compilation...

	STEPS
	-----
	1. Compute Markov Blanket
	2. Compute Graph Structure
	3. Orient Edges
	4. Remove Cycles
	5. Reverse Edges
	6. Propagate Directions

	Arguments
	---------
	*data* : a nested numpy array
		Data from which you wish to learn structure

	*alpha* : a float
		Type I error rate for independence test

	Returns
	-------
	*bn* : a BayesNet object

	Effects
	-------
	None

	Notes
	-----

	Speed Test:
		*** 5 variables, 624 observations ***
		- 63.7 ms

	"""
	n_rv = data.shape[1]
	data, value_dict = replace_strings(data, return_values=True)
	

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# STEP 1 : COMPUTE MARKOV BLANKETS
	Mb = dict([(rv,[]) for rv in range(n_rv)])

	for X in _T:
		S = []

		grow_condition = True
		while grow_condition:

			grow_condition=False
			for Y in range(n_rv):
				if X!=Y and Y not in S:
					# if there exists some Y such that Y is dependent on X given S,
					# add Y to S
					cols = (X,Y) + tuple(S)
					pval = mi_test(data[:,cols])
					if pval < alpha: # dependent
						grow_condition=True # dependent -> continue searching
						S.append(Y)
		
		shrink_condition = True
		while shrink_condition:

			TEMP_S = []
			shrink_condition=False
			for Y in S:
				s_copy = copy(S)
				s_copy.remove(Y) # condition on S-{Y}
				# if X independent of Y given S-{Y}, leave Y out
				# if X dependent of Y given S-{Y}, keep it in
				cols = (X,Y) + tuple(s_copy)
				pval = mi_test(data[:,cols])
				if pval < alpha: # dependent
					TEMP_S.append(Y)
				else: # independent -> condition searching
					shrink_condition=True
		
		Mb[X] = TEMP_S
		if debug:
			print('Markov Blanket for %s : %s' % (X, str(TEMP_S)))
	
	if feature_selection is None:
		# STEP 2: COMPUTE GRAPH STRUCTURE
		# i.e. Resolve Markov Blanket
		edge_dict = resolve_markov_blanket(Mb,data)
		if debug:
			print('Unoriented edge dict:\n %s' % str(edge_dict))
		
		# STEP 3: ORIENT EDGES
		oriented_edge_dict = orient_edges_MB(edge_dict,Mb,data,alpha)
		if debug:
			print('Oriented edge dict:\n %s' % str(oriented_edge_dict))
		

		# CREATE BAYESNET OBJECT
		bn=BayesNet(oriented_edge_dict,value_dict)
		
		return bn
	else:
		return Mb[_T]
Exemplo n.º 2
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
	"""
	Perform growshink algorithm over dataset to learn
	Bayesian network structure.

	This algorithm is clearly a good candidate for
	numba JIT compilation...

	STEPS
	-----
	1. Compute Markov Blanket
	2. Compute Graph Structure
	3. Orient Edges
	4. Remove Cycles
	5. Reverse Edges
	6. Propagate Directions

	Arguments
	---------
	*data* : a nested numpy array
		Data from which you wish to learn structure

	*alpha* : a float
		Type I error rate for independence test

	Returns
	-------
	*bn* : a BayesNet object

	Effects
	-------
	None

	Notes
	-----

	Speed Test:
		*** 5 variables, 624 observations ***
		- 63.7 ms

	"""
	n_rv = data.shape[1]
	data, value_dict = replace_strings(data, return_values=True)
	

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# STEP 1 : COMPUTE MARKOV BLANKETS
	Mb = dict([(rv,[]) for rv in range(n_rv)])

	for X in _T:
		S = []

		grow_condition = True
		while grow_condition:

			grow_condition=False
			for Y in range(n_rv):
				if X!=Y and Y not in S:
					# if there exists some Y such that Y is dependent on X given S,
					# add Y to S
					cols = (X,Y) + tuple(S)
					pval = mi_test(data[:,cols])
					if pval < alpha: # dependent
						grow_condition=True # dependent -> continue searching
						S.append(Y)
		
		shrink_condition = True
		while shrink_condition:

			TEMP_S = []
			shrink_condition=False
			for Y in S:
				s_copy = copy(S)
				s_copy.remove(Y) # condition on S-{Y}
				# if X independent of Y given S-{Y}, leave Y out
				# if X dependent of Y given S-{Y}, keep it in
				cols = (X,Y) + tuple(s_copy)
				pval = mi_test(data[:,cols])
				if pval < alpha: # dependent
					TEMP_S.append(Y)
				else: # independent -> condition searching
					shrink_condition=True
		
		Mb[X] = TEMP_S
		if debug:
			print 'Markov Blanket for %s : %s' % (X, str(TEMP_S))
	
	if feature_selection is None:
		# STEP 2: COMPUTE GRAPH STRUCTURE
		# i.e. Resolve Markov Blanket
		edge_dict = resolve_markov_blanket(Mb,data)
		if debug:
			print 'Unoriented edge dict:\n %s' % str(edge_dict)
		
		# STEP 3: ORIENT EDGES
		oriented_edge_dict = orient_edges_MB(edge_dict,Mb,data,alpha)
		if debug:
			print 'Oriented edge dict:\n %s' % str(oriented_edge_dict)
		

		# CREATE BAYESNET OBJECT
		bn=BayesNet(oriented_edge_dict,value_dict)
		
		return bn
	else:
		return Mb[_T]
Exemplo n.º 3
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
	"""
	From [1]:
		"A novel algorithm for the induction of
		Markov blankets from data, called Fast-IAMB, that employs
		a heuristic to quickly recover the Markov blanket. Empirical
		results show that Fast-IAMB performs in many cases
		faster and more reliably than existing algorithms without
		adversely affecting the accuracy of the recovered Markov
		blankets."

	Arguments
	---------
	*data* : a nested numpy array

	*k* : an integer
		The max number of edges to add at each iteration of 
		the algorithm.

	*alpha* : a float
		Probability of Type I error

	Returns
	-------
	*bn* : a BayesNet object

	Effects
	-------
	None

	Notes
	-----
	- Currently does not work. I think it's stuck in an infinite loop...

	"""
	# get values
	value_dict = dict(zip(range(data.shape[1]),
			[list(np.unique(col)) for col in data.T]))
	# replace strings
	data = replace_strings(data)

	n_rv = data.shape[1]
	Mb = dict([(rv,[]) for rv in range(n_rv)])
	N = data.shape[0]
	card = dict(zip(range(n_rv),unique_bins(data)))
	#card = dict(zip(range(data.shape[1]),np.amax(data,axis=0)))

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]
	# LEARN MARKOV BLANKET
	for T in _T:
		S = set(range(n_rv)) - {T}
		for A in S:
			if not are_independent(data[:,(A,T)]):
				S.remove(A)
		s_h_dict = dict([(s,0) for s in S])
		while S:
			insufficient_data = False
			break_grow_phase = False
			
			#### GROW PHASE ####
			# Calculate mutual information for all variables
			mi_dict = dict([(s,mi_test(data[:,(s,T)+tuple(Mb[T])])) for s in S])
			for x_i in sorted(mi_dict, key=mi_dict.get,reverse=True):
				# Add top MI-score variables until there isn't enough data for bins
				if (N / card[x_i]*card[T]*np.prod([card[b] for b in Mb[T]])) >= k:
					Mb[T].append(x_i)
				else:
					insufficient_data = True
					break

			#### SHRINK PHASE ####
			removed_vars = False
			for A in Mb[T]:
				cols = (A,T) + tuple(set(Mb[T]) - {A})
				# if A is independent of T given Mb[T], remove A
				if are_independent(data[:,cols]):
					Mb[T].remove(A)
					removed_vars=True

			#### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
			if insufficient_data and not removed_vars:
				if debug:
					print 'Breaking..'
				break
			else:
				A = set(range(n_rv)) - {T} - set(Mb[T])
				#A = set(nodes) - {T} - set(Mb[T])
				S = set()
				for a in A:
					cols = (a,T) + tuple(Mb[T])
					if are_independent(data[:,cols]):
						S.add(a)
		if debug:
			print 'Done with %s' % T
	
	if feature_selection is None:
		# RESOLVE GRAPH STRUCTURE
		edge_dict = resolve_markov_blanket(Mb, data)

		# ORIENT EDGES
		oriented_edge_dict = orient_edges_MB(edge_dict,Mb,data,alpha)

		# CREATE BAYESNET OBJECT
		bn=BayesNet(oriented_edge_dict,value_dict)

		return BN
	else:
		return Mb[_T]
Exemplo n.º 4
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
	From [1]:
		"A novel algorithm for the induction of
		Markov blankets from data, called Fast-IAMB, that employs
		a heuristic to quickly recover the Markov blanket. Empirical
		results show that Fast-IAMB performs in many cases
		faster and more reliably than existing algorithms without
		adversely affecting the accuracy of the recovered Markov
		blankets."

	Arguments
	---------
	*data* : a nested numpy array

	*k* : an integer
		The max number of edges to add at each iteration of 
		the algorithm.

	*alpha* : a float
		Probability of Type I error

	Returns
	-------
	*bn* : a BayesNet object

	Effects
	-------
	None

	Notes
	-----
	- Currently does not work. I think it's stuck in an infinite loop...

	"""
    # get values
    value_dict = dict(zip(range(data.shape[1]), [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))
    # card = dict(zip(range(data.shape[1]),np.amax(data,axis=0)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), "feature_selection must be only one value"
        _T = [feature_selection]
        # LEARN MARKOV BLANKET
    for T in _T:
        S = set(range(n_rv)) - {T}
        for A in S:
            if not are_independent(data[:, (A, T)]):
                S.remove(A)
        s_h_dict = dict([(s, 0) for s in S])
        while S:
            insufficient_data = False
            break_grow_phase = False

            #### GROW PHASE ####
            # Calculate mutual information for all variables
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])])) for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top MI-score variables until there isn't enough data for bins
                if (N / card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

                    #### SHRINK PHASE ####
            removed_vars = False
            for A in Mb[T]:
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T], remove A
                if are_independent(data[:, cols]):
                    Mb[T].remove(A)
                    removed_vars = True

                    #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print "Breaking.."
                break
            else:
                A = set(range(n_rv)) - {T} - set(Mb[T])
                # A = set(nodes) - {T} - set(Mb[T])
                S = set()
                for a in A:
                    cols = (a, T) + tuple(Mb[T])
                    if are_independent(data[:, cols]):
                        S.add(a)
        if debug:
            print "Done with %s" % T

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)

        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)

        return BN
    else:
        return Mb[_T]