import numpy as np
from numpy.testing import assert_array_almost_equal
from scipy.sparse.csgraph import (csgraph_from_dense, csgraph_to_dense,
                                  depth_first_tree)


def test_graph_depth_first_trivial_graph():
    csgraph = np.array([[0]])
    csgraph = csgraph_from_dense(csgraph, null_value=0)

    dfirst = np.array([[0]])

    for directed in [True, False]:
        dfirst_test = depth_first_tree(csgraph, 0, directed)
        assert_array_almost_equal(csgraph_to_dense(dfirst_test), dfirst)
def test_graph_depth_first():
    csgraph = np.array([[0, 1, 2, 0, 0],
                        [1, 0, 0, 0, 3],
                        [2, 0, 0, 7, 0],
                        [0, 0, 7, 0, 1],
                        [0, 3, 0, 1, 0]])
    csgraph = csgraph_from_dense(csgraph, null_value=0)

    dfirst = np.array([[0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 3],
                       [0, 0, 0, 0, 0],
                       [0, 0, 7, 0, 0],
                       [0, 0, 0, 1, 0]])

    for directed in [True, False]:
        dfirst_test = depth_first_tree(csgraph, 0, directed)
        assert_array_almost_equal(csgraph_to_dense(dfirst_test), dfirst)
def test_graph_depth_first():
    if csgraph_from_dense is None:
        raise SkipTest("Old version of scipy, doesn't have csgraph.")
    csgraph = np.array([[0, 1, 2, 0, 0],
                        [1, 0, 0, 0, 3],
                        [2, 0, 0, 7, 0],
                        [0, 0, 7, 0, 1],
                        [0, 3, 0, 1, 0]])
    csgraph = csgraph_from_dense(csgraph, null_value=0)

    dfirst = np.array([[0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 3],
                       [0, 0, 0, 0, 0],
                       [0, 0, 7, 0, 0],
                       [0, 0, 0, 1, 0]])

    for directed in [True, False]:
        dfirst_test = depth_first_tree(csgraph, 0, directed)
        assert_array_almost_equal(csgraph_to_dense(dfirst_test), dfirst)
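A quick standalone run of what these tests assert; this sketch only assumes the public scipy.sparse.csgraph API and prints the depth-first tree rooted at node 0 for the same 5-node graph.

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import depth_first_tree

graph = csr_matrix(np.array([[0, 1, 2, 0, 0],
                             [1, 0, 0, 0, 3],
                             [2, 0, 0, 7, 0],
                             [0, 0, 7, 0, 1],
                             [0, 3, 0, 1, 0]]))
# The returned tree keeps exactly one incoming edge per reachable node
print(depth_first_tree(graph, 0, directed=False).toarray())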
def eval_node_probs(self):
    """Update probability density estimates.

    Args:
    None

    Returns:
    None
    """
    # Create mutual info matrix
    mutual_info = np.zeros([self.length, self.length])
    for i in range(self.length - 1):
        for j in range(i + 1, self.length):
            mutual_info[i, j] = -1 * mutual_info_score(
                self.keep_sample[:, i],
                self.keep_sample[:, j])

    # Find minimum spanning tree of mutual info matrix
    mst = minimum_spanning_tree(csr_matrix(mutual_info))

    # Convert minimum spanning tree to depth first tree with node 0 as root
    dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
    dft = dft.toarray()

    # Determine parent of each node
    parent = np.argmin(dft[:, 1:], axis=0)

    # Get probs
    probs = np.zeros([self.length, self.max_val, self.max_val])

    if not len(self.keep_sample):
        probs[0] = 0
        probs[0, :, 0] = 1
    else:
        probs[0, :] = np.histogram(self.keep_sample[:, 0],
                                   np.arange(self.max_val + 1),
                                   density=True)[0]

    for i in range(1, self.length):
        for j in range(self.max_val):
            subset = self.keep_sample[np.where(
                self.keep_sample[:, parent[i - 1]] == j)[0]]

            if not len(subset):
                probs[i, j] = 0
                probs[i, j, 0] = 1
            else:
                # histogram over column i only (a histogram over the whole
                # subset array would mix all features together)
                probs[i, j] = np.histogram(subset[:, i],
                                           np.arange(self.max_val + 1),
                                           density=True)[0]

    # Update probs and parent
    self.node_probs = probs
    self.parent_nodes = parent
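Reading parents via np.argmin over the dense tree can silently mis-assign a node whose incoming edge weight is 0. A sketch of a more direct alternative (an assumption, not from the source, using only scipy's public depth_first_order): in the method above one would pass the spanning tree instead of this toy matrix.

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import depth_first_order

# Toy stand-in for the spanning tree of the negated mutual-info matrix
tree = csr_matrix(np.array([[0.0, -0.5,  0.0],
                            [0.0,  0.0, -0.7],
                            [0.0,  0.0,  0.0]]))
# depth_first_order hands back the predecessor of every visited node directly
order, predecessors = depth_first_order(tree, 0, directed=False,
                                        return_predecessors=True)
parent = predecessors[1:]   # parent[i - 1] is the parent of node i, as above
print(order, parent)        # e.g. [0 1 2] [0 1]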
def get_depth_first_tree(csr_adj_matrix, node_index):
    original_adj_graph = nx.from_scipy_sparse_matrix(csr_adj_matrix)
    degree_list = sorted([(degree, node)
                          for node, degree in original_adj_graph.degree()],
                         reverse=True)
    start_node = degree_list[node_index][1]
    # print("start node is:", start_node)
    depth_tree = csgraph.depth_first_tree(
        csr_adj_matrix, start_node)  # makes a tree from depth first search
    return depth_tree
def getCycles(edges, vD, connLimit=np.inf, lengthLimit=np.inf):
    if len(edges) > 0:
        N = len(vD.vertices)
        dists = [np.linalg.norm(vD.vertices[e[0], :] - vD.vertices[e[1], :])
                 for e in edges]
        graph = getUndirAdjMatrix(edges, N, dists)
        dfsTree = depth_first_tree(graph, edges[0][0], False)
        # Edges absent from the depth-first tree each close one cycle
        cycleHints = makeSet(edges).difference(
            makeSet(np.transpose(dfsTree.nonzero())))
        paths = []
        for e in cycleHints:
            dlsRec(graph, np.ones((N,)) * -1, e[0], e[0], paths,
                   connLimit, connLimit, lengthLimit)
        return repulseCycles(paths), cycleHints
        # map(lambda x: x[:-1], repulseCycles(paths)), cycleEdges
    else:
        return [], []
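A minimal illustration (assumed, not from the source) of the cycle-hint idea used above: in a connected undirected graph, every edge missing from a depth-first tree closes exactly one cycle.

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import depth_first_tree

# 4-cycle 0-1-2-3-0: four edges, the DFS tree keeps three of them
dense = np.array([[0, 1, 0, 1],
                  [1, 0, 1, 0],
                  [0, 1, 0, 1],
                  [1, 0, 1, 0]])
tree = depth_first_tree(csr_matrix(dense), 0, directed=False)

all_edges = {tuple(sorted(map(int, e))) for e in zip(*np.triu(dense).nonzero())}
tree_edges = {tuple(sorted(map(int, e))) for e in zip(*tree.nonzero())}
print(all_edges - tree_edges)   # the one edge that closes the cycle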
def eval_node_probs(self):
    """Update probability density estimates.
    """
    # Create mutual info matrix
    mutual_info = self._get_mutual_info_impl()

    # Find minimum spanning tree of mutual info matrix
    csr_mx = csr_matrix(mutual_info)
    mst = minimum_spanning_tree(csr_mx)

    # Convert minimum spanning tree to depth first tree with node 0 as root
    dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
    dft = np.round(dft.toarray(), 10)

    # Determine parent of each node
    parent = np.argmin(dft[:, 1:], axis=0)

    # Get probs
    probs = np.zeros([self.length, self.max_val, self.max_val])
    probs[0, :] = np.histogram(self.keep_sample[:, 0],
                               np.arange(self.max_val + 1),
                               density=True)[0]

    for i in range(1, self.length):
        for j in range(self.max_val):
            subset = self.keep_sample[np.where(
                self.keep_sample[:, parent[i - 1]] == j)[0]]

            if not len(subset):
                probs[i, j] = 1 / self.max_val
            else:
                temp_probs = np.histogram(subset[:, i],
                                          np.arange(self.max_val + 1),
                                          density=True)[0]

                # Add noise if the noise argument is non-zero
                if self.noise > 0:
                    temp_probs = temp_probs + self.noise
                    # Renormalize so the probabilities sum to one
                    temp_probs = np.divide(temp_probs, np.sum(temp_probs))

                # Handle floating point error so probability sums to 1
                if sum(temp_probs) != 1.0:
                    temp_probs = np.divide(temp_probs, np.sum(temp_probs))

                # Set probability
                probs[i, j] = temp_probs

    # Update probs and parent
    self.node_probs = probs
    self.parent_nodes = parent
def chow_liu_tree(A):
    # Getting the parameters of the data
    # the number of training examples:
    m = A.shape[0]
    # the number of variables in the Bayes net
    n = A.shape[1]
    # initialize the mutual information MI, an n x n square matrix of zeros
    MI = np.zeros((n, n))
    # get the indices of the upper-triangular matrix with offset 1
    index_of_tri = np.triu_indices(n, 1)
    # the parameters theta are:
    p_1 = (A.sum(axis=0) + 1) / (m + 2)
    # if p_1 > 1:
    #     print('Error p_1 > 1 in chow_liu_tree routine')
    p_0 = 1 - p_1
    # Now we build our complete graph with mutual information
    mut_info_list = []
    for row_index in range(n - 1):
        for column_index in range(row_index + 1, n):
            # We store the negative mutual information because we need
            # the maximum spanning tree
            mut_info_list.append(
                -mutual_info(A, row_index, column_index, p_1[row_index],
                             p_1[column_index], m))
    MI[index_of_tri] = mut_info_list
    # the algorithm treats the triangular matrix as an undirected graph
    Tcsr = minimum_spanning_tree(MI)
    # Set the starting point of the maximum spanning tree as variable 0
    DFS_tree = depth_first_tree(-Tcsr, 0, directed=False)
    # We extract the dependencies
    a = DFS_tree.todok().items()
    # initialize the Bayes net
    BN = {}
    BN[0] = np.array([p_0[0], p_1[0]])
    for arrow in a:
        # indices of the parent and the child
        parent = arrow[0][0]
        child = arrow[0][1]
        p0c1 = (np.logical_and(np.logical_not(A[:, parent]),
                               A[:, child]).sum() + 1) / (m + 4)
        p1c0 = (np.logical_and(A[:, parent],
                               np.logical_not(A[:, child])).sum() + 1) / (m + 4)
        p1c1 = (np.logical_and(A[:, parent], A[:, child]).sum() + 1) / (m + 4)
        p0c0 = 1 - p0c1 - p1c0 - p1c1
        theta_c_given_p = [
            p0c0 / p_0[parent], p0c1 / p_0[parent],
            p1c0 / p_1[parent], p1c1 / p_1[parent]
        ]
        BN[arrow[0]] = np.array(theta_c_given_p)
    return BN
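The mutual_info helper is not shown in this snippet. The following is a hypothetical reconstruction for binary data (an assumption, not the original helper), using add-one smoothed joint counts consistent with the (m + 4) denominators above.

def mutual_info(A, i, j, p1_i, p1_j, m):
    # Hypothetical sketch of the missing helper: pairwise mutual information
    # of two binary columns, with add-one smoothed joint counts to match the
    # (m + 4) denominators used in chow_liu_tree
    p11 = (np.logical_and(A[:, i], A[:, j]).sum() + 1) / (m + 4)
    p10 = (np.logical_and(A[:, i], np.logical_not(A[:, j])).sum() + 1) / (m + 4)
    p01 = (np.logical_and(np.logical_not(A[:, i]), A[:, j]).sum() + 1) / (m + 4)
    p00 = 1 - p11 - p10 - p01
    p0_i, p0_j = 1 - p1_i, 1 - p1_j
    return (p00 * np.log(p00 / (p0_i * p0_j))
            + p01 * np.log(p01 / (p0_i * p1_j))
            + p10 * np.log(p10 / (p1_i * p0_j))
            + p11 * np.log(p11 / (p1_i * p1_j)))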
def eval_node_probs(self):
    """Update probability density estimates.
    """
    # Create mutual info matrix
    mutual_info = np.zeros([self.length, self.length])
    for i in range(self.length - 1):
        for j in range(i + 1, self.length):
            # DEBUGGING CODE
            try:
                mutual_info[i, j] = -1 * mutual_info_score(
                    self.keep_sample[:, i],
                    self.keep_sample[:, j])
            except ValueError:
                print(f'self.keep_sample[:, i] = {self.keep_sample[:, i]}')
                print(f'self.keep_sample[:, j] = {self.keep_sample[:, j]}')
                raise Exception("Caught value error")

    # Find minimum spanning tree of mutual info matrix
    mst = minimum_spanning_tree(csr_matrix(mutual_info))

    # Convert minimum spanning tree to depth first tree with node 0 as root
    dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
    dft = np.round(dft.toarray(), 10)

    # Determine parent of each node
    parent = np.argmin(dft[:, 1:], axis=0)

    # Get probs
    probs = np.zeros([self.length, self.max_val, self.max_val])
    probs[0, :] = np.histogram(self.keep_sample[:, 0],
                               np.arange(self.max_val + 1),
                               density=True)[0]

    for i in range(1, self.length):
        for j in range(self.max_val):
            subset = self.keep_sample[np.where(
                self.keep_sample[:, parent[i - 1]] == j)[0]]

            if not len(subset):
                probs[i, j] = 1 / self.max_val
            else:
                probs[i, j] = np.histogram(subset[:, i],
                                           np.arange(self.max_val + 1),
                                           density=True)[0]

    # Update probs and parent
    self.node_probs = probs
    self.parent_nodes = parent
p_1 = (training_data.sum(axis=0) + 1) / (m + 2)
p_0 = 1 - p_1
# Now we build our complete graph with mutual information
mut_info_list = []
for row_index in range(n - 1):
    for column_index in range(row_index + 1, n):
        # We store the negative mutual information because we need
        # the maximum spanning tree
        mut_info_list.append(
            -mutual_info(training_data, row_index, column_index,
                         p_1[row_index], p_1[column_index], m))
MI[index_of_tri] = mut_info_list
# the algorithm treats the triangular matrix as an undirected graph
Tcsr = minimum_spanning_tree(MI)
# Set the starting point of the maximum spanning tree as variable 0
DFS_tree = depth_first_tree(-Tcsr, 0, directed=False)
# We extract the dependencies
a = DFS_tree.todok().items()
# initialize the Bayes net
BN = {}
BN[0] = np.array([p_0[0], p_1[0]])
for arrow in a:
    # indices of the parent and the child
    parent = arrow[0][0]
    child = arrow[0][1]
    p0c1 = (np.logical_and(np.logical_not(training_data[:, parent]),
                           training_data[:, child]).sum() + 1) / (m + 4)
    p1c0 = (np.logical_and(training_data[:, parent],
                           np.logical_not(training_data[:, child])).sum()
            + 1) / (m + 4)
    p1c1 = (np.logical_and(training_data[:, parent],
                           training_data[:, child]).sum() + 1) / (m + 4)
def eval_node_probs(self):
    """Update probability density estimates.
    """
    if not self.mimic_speed:
        # Create mutual info matrix
        mutual_info = np.zeros([self.length, self.length])
        for i in range(self.length - 1):
            for j in range(i + 1, self.length):
                mutual_info[i, j] = -1 * mutual_info_score(
                    self.keep_sample[:, i],
                    self.keep_sample[:, j])
    else:
        # Ignore divide-by-zero and invalid-value warnings from np.log
        np.seterr(divide='ignore', invalid='ignore')

        # get the number of samples which survived the mimic iteration
        len_sample_kept = self.keep_sample.shape[0]
        # get the length of the bit sequence / problem size
        len_prob = self.keep_sample.shape[1]

        # Expand the matrices so that each column corresponds to one
        # feature-by-feature combination of the list of samples
        b = np.repeat(self.keep_sample, self.length).reshape(
            len_sample_kept, len_prob * len_prob)
        d = np.hstack(([self.keep_sample] * len_prob))

        # Compute the mutual information matrix in bulk, by iterating
        # through the max_val**2 possible feature-value pairs. For example,
        # a binary string goes through 00 01 10 11, for 4 iterations.

        # First initialize the mutual info matrix.
        mut_inf = np.zeros([self.length * self.length])

        # Pre-compute U and V, which get used repeatedly in the inner loop.
        U = {}
        V = {}
        U_sum = {}
        V_sum = {}
        for i in range(0, self.max_val):
            U[i] = (d == i)
            V[i] = (b == i)
            U_sum[i] = np.sum(d == i, axis=0)
            V_sum[i] = np.sum(b == i, axis=0)

        # Compute the mutual information for all feature-to-feature
        # combinations, for each of the max_val**2 feature-value pairs
        for i in range(0, self.max_val):
            for j in range(0, self.max_val):
                # |U_i AND V_j|: co-occurrence counts for this value pair
                coeff = np.sum(U[i] * V[j], axis=0)

                # Compute |U_i||V_j| for the particular value pair
                UV_length = (U_sum[i] * V_sum[j])

                # compute the second term of the MI matrix
                temp = np.log(coeff) - np.log(UV_length) + np.log(
                    len_sample_kept)

                # remove the nans and negative infinities
                temp[np.isnan(temp)] = 0
                temp[np.isneginf(temp)] = 0

                # combine the first and second term, divide by the length N,
                # and add this value pair's contribution to the running total
                mut_inf = mut_inf + temp * np.divide(coeff, len_sample_kept)

        # Multiply by -1 to match the negated-MI convention used above
        # (needed for the minimum spanning tree), and reshape
        mut_inf = -mut_inf.reshape(self.length, self.length)

        # Only keep the upper triangle above the diagonal.
        # Possible enhancement: currently we are doing double the
        # computation required. Pre-set the matrix so the computation is
        # only done for the rows that are needed. To do for the future.
        mutual_info = np.triu(mut_inf, k=1)

    # Find minimum spanning tree of mutual info matrix
    mst = minimum_spanning_tree(csr_matrix(mutual_info))

    # Convert minimum spanning tree to depth first tree with node 0 as root
    dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
    dft = np.round(dft.toarray(), 10)

    # Determine parent of each node
    parent = np.argmin(dft[:, 1:], axis=0)

    # Get probs
    probs = np.zeros([self.length, self.max_val, self.max_val])
    probs[0, :] = np.histogram(self.keep_sample[:, 0],
                               np.arange(self.max_val + 1),
                               density=True)[0]

    for i in range(1, self.length):
        for j in range(self.max_val):
            subset = self.keep_sample[np.where(
                self.keep_sample[:, parent[i - 1]] == j)[0]]

            if not len(subset):
                probs[i, j] = 1 / self.max_val
            else:
                probs[i, j] = np.histogram(subset[:, i],
                                           np.arange(self.max_val + 1),
                                           density=True)[0]

    # Update probs and parent
    self.node_probs = probs
    self.parent_nodes = parent
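The bulk computation above is easy to get subtly wrong. The following self-contained sketch (an assumption, not part of the source) re-implements the same vectorization trick on a toy sample and checks it against sklearn's mutual_info_score.

import numpy as np
from sklearn.metrics import mutual_info_score

rng = np.random.default_rng(0)
samples = rng.integers(0, 2, size=(100, 3))   # 100 kept samples, 3 features
n, length, max_val = samples.shape[0], samples.shape[1], 2

# column k of b holds feature k // length, column k of d holds feature k % length
b = np.repeat(samples, length).reshape(n, length * length)
d = np.hstack([samples] * length)

# standalone re-implementation of the bulk MI trick above (sanity check)
mi = np.zeros(length * length)
for u in range(max_val):
    for v in range(max_val):
        joint = np.sum((d == u) * (b == v), axis=0)              # |U AND V|
        outer = np.sum(d == u, axis=0) * np.sum(b == v, axis=0)  # |U||V|
        with np.errstate(divide='ignore', invalid='ignore'):
            term = np.log(joint) - np.log(outer) + np.log(n)
        term[~np.isfinite(term)] = 0   # empty cells contribute nothing
        mi += term * joint / n
mi = mi.reshape(length, length)

for i in range(length):
    for j in range(length):
        assert np.isclose(mi[i, j],
                          mutual_info_score(samples[:, i], samples[:, j]))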
def quick_split(G, train_frac=0.51):
    r"""
    Computes one train/test split of edges from an input graph and returns
    the results. The train set will be (weakly) connected and span all nodes
    of the input graph (digraph). This implementation uses a depth first
    tree to obtain edges covering all nodes for the train graph. The input
    graph (digraph) cannot contain more than one (weakly) connected
    component.

    Parameters
    ----------
    G : graph
        A NetworkX graph
    train_frac : float, optional
        The relative size (in range (0.0, 1.0]) of the train set with
        respect to the total number of edges in the graph. Default is 0.51.

    Returns
    -------
    train_E : array
        Column array of train edges as pairs src, dst
    test_E : array
        Column array of test edges as pairs src, dst

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.
    """
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError(
            'The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges()), set()

    # Get the adjacency matrix (upper triangle only, if undirected)
    if nx.is_directed(G):
        a = nx.adj_matrix(G)
    else:
        a = triu(nx.adj_matrix(G), k=1)

    # Compute initial statistics and linear indices of nonzeros
    n = a.shape[0]
    num_tr_e = int(a.nnz * train_frac)
    nz_lin_ind = np.ravel_multi_index(a.nonzero(), (n, n))

    # Build a dft starting at a random node. If directed is False, only the
    # upper triangle is returned
    dft = depth_first_tree(a, np.random.randint(0, a.shape[0]),
                           directed=nx.is_directed(G))
    if nx.is_directed(G):
        dft_lin_ind = np.ravel_multi_index(dft.nonzero(), (n, n))
    else:
        dft_lin_ind = np.ravel_multi_index(
            triu(tril(dft).T + dft, k=1).nonzero(), (n, n))

    # From all nonzero indices remove those in the dft. From the rest, take
    # enough to fill the train quota; the remainder are test edges
    rest_lin_ind = np.setdiff1d(nz_lin_ind, dft_lin_ind)
    aux = np.random.choice(rest_lin_ind, num_tr_e - len(dft_lin_ind),
                           replace=False)
    lin_tr_e = np.union1d(dft_lin_ind, aux)
    lin_te_e = np.setdiff1d(rest_lin_ind, aux)

    # Unravel the linear indices to obtain src, dst pairs
    tr_e = np.array(np.unravel_index(np.array(lin_tr_e), (n, n))).T
    te_e = np.array(np.unravel_index(np.array(lin_te_e), (n, n))).T

    return tr_e, te_e
def quick_split(G, train_frac=0.51):
    """
    Splits the edges of the input graph into sets of train and test and
    returns the results. The split is performed using the quick split
    approach (see Notes). The resulting train edge set spans a graph
    (digraph) with a single connected (weakly connected) component and the
    same nodes as G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph with a single connected (weakly
        connected) component.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in
        the input graph (range (0.0, 1.0]). Default is 0.51.

    Returns
    -------
    train_E : ndarray
        Column vector of train edges as pairs src, dst.
    test_E : ndarray
        Column vector of test edges as pairs src, dst.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    Notes
    -----
    The method proceeds as follows: (1) a spanning tree of the input graph
    is generated using a depth first tree approach starting at a random
    node, (2) randomly selected edges are added to those of the spanning
    tree until train_frac is reached, (3) the remaining edges, not used in
    previous steps, form the test set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError(
            'The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges()), set()

    # Get adj matrix (upper triangle only, if undirected)
    if nx.is_directed(G):
        a = nx.adj_matrix(G)
    else:
        a = triu(nx.adj_matrix(G), k=1)

    # Compute initial statistics and linear indices of nonzeros
    n = a.shape[0]
    num_tr_e = int(a.nnz * train_frac)
    nz_lin_ind = np.ravel_multi_index(a.nonzero(), (n, n))

    # Build a dft starting at a random node. If directed is False, only the
    # upper triangle is returned
    dft = depth_first_tree(a, np.random.randint(0, a.shape[0]),
                           directed=nx.is_directed(G))
    if nx.is_directed(G):
        dft_lin_ind = np.ravel_multi_index(dft.nonzero(), (n, n))
    else:
        dft_lin_ind = np.ravel_multi_index(
            triu(tril(dft).T + dft, k=1).nonzero(), (n, n))

    # From all nonzero indices remove those in the dft. From the rest, take
    # enough to fill the train quota; the remainder are test edges
    rest_lin_ind = np.setdiff1d(nz_lin_ind, dft_lin_ind)
    aux = np.random.choice(rest_lin_ind, num_tr_e - len(dft_lin_ind),
                           replace=False)
    lin_tr_e = np.union1d(dft_lin_ind, aux)
    lin_te_e = np.setdiff1d(rest_lin_ind, aux)

    # Unravel the linear indices to obtain src, dst pairs
    tr_e = np.array(np.unravel_index(np.array(lin_tr_e), (n, n))).T
    te_e = np.array(np.unravel_index(np.array(lin_te_e), (n, n))).T

    # Return the sets of edges
    return tr_e, te_e
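A possible usage sketch (assumptions: quick_split and its helpers are importable, and the installed networkx still provides nx.adj_matrix, which newer releases renamed to nx.adjacency_matrix).

import networkx as nx

G = nx.karate_club_graph()          # 34 nodes, 78 edges, one component
train_E, test_E = quick_split(G, train_frac=0.8)
print(train_E.shape, test_E.shape)  # (62, 2) and (16, 2) for the 78 edges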
def eval_node_probs(self):
    """Update probability density estimates.
    """
    if not self.mimic_speed:
        # Create mutual info matrix
        mutual_info = np.zeros([self.length, self.length])
        for i in range(self.length - 1):
            for j in range(i + 1, self.length):
                mutual_info[i, j] = -1 * mutual_info_score(
                    self.keep_sample[:, i],
                    self.keep_sample[:, j])
    else:
        # Ignore divide-by-zero and invalid-value warnings from np.log
        np.seterr(divide='ignore', invalid='ignore')

        # get the number of samples which survived the mimic iteration
        len_sample_kept = self.keep_sample.shape[0]
        # get the length of the bit sequence / problem size
        len_prob = self.keep_sample.shape[1]

        # Expand the matrices so that each column corresponds to one
        # feature-by-feature combination of the list of samples
        permuted_rows = np.repeat(self.keep_sample, self.length).reshape(
            len_sample_kept, len_prob * len_prob)
        duplicated_rows = np.hstack(([self.keep_sample] * len_prob))

        # Compute the mutual information matrix in bulk, by iterating
        # through the max_val**2 possible feature-value pairs. For example,
        # a binary string goes through 00 01 10 11, for 4 iterations.

        # First initialize the mutual info matrix.
        mutual_info_vectorized = np.zeros([self.length * self.length])

        # Pre-compute the clusters U and V, which get used repeatedly in
        # the inner loop.
        cluster_U = {}
        cluster_V = {}
        cluster_U_sum = {}
        cluster_V_sum = {}
        for i in range(0, self.max_val):
            cluster_U[i] = (duplicated_rows == i)
            cluster_V[i] = (permuted_rows == i)
            cluster_U_sum[i] = np.sum(duplicated_rows == i, axis=0)
            cluster_V_sum[i] = np.sum(permuted_rows == i, axis=0)

        # Compute the mutual information for all feature-to-feature
        # combinations, done for each value pair i & j (max_val**2 of them)
        for i in range(0, self.max_val):
            for j in range(0, self.max_val):
                # |U_i AND V_j|/N: fraction of rows matching the value pair
                # i, j over sample length N.
                # This is the first term in the MI computation
                MI_first_term = np.sum(cluster_U[i] * cluster_V[j], axis=0)
                MI_first_term = np.divide(MI_first_term, len_sample_kept)

                # compute the second term of the MI matrix:
                # log(N^2 |U_i AND V_j| / (N |U_i||V_j|)).
                # Note the 2 * log(N): MI_first_term is already divided by
                # N, so a single log(N) would leave the term short by log(N)
                UV_length = (cluster_U_sum[i] * cluster_V_sum[j])
                MI_second_term = np.log(MI_first_term) - np.log(
                    UV_length) + 2 * np.log(len_sample_kept)

                # remove the nans and negative infinities, if any
                MI_second_term[np.isnan(MI_second_term)] = 0
                MI_second_term[np.isneginf(MI_second_term)] = 0

                # Combine the first and second term, and add this value
                # pair's contribution to the running total
                mutual_info_vectorized = (mutual_info_vectorized
                                          + MI_first_term * MI_second_term)

        # Multiply by -1 to match the negated-MI convention used above
        # (needed for the minimum spanning tree), and reshape (full matrix)
        mutual_info_full = -mutual_info_vectorized.reshape(
            self.length, self.length)

        # Only keep the upper triangle above the diagonal.
        # Possible enhancement: currently we are doing double the
        # computation required. Pre-set the matrix so the computation is
        # only done for the rows that are needed. To do for the future.
        mutual_info = np.triu(mutual_info_full, k=1)

    # Find minimum spanning tree of mutual info matrix
    mst = minimum_spanning_tree(csr_matrix(mutual_info))

    # Convert minimum spanning tree to depth first tree with node 0 as root
    dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
    dft = np.round(dft.toarray(), 10)

    # Determine parent of each node
    parent = np.argmin(dft[:, 1:], axis=0)

    # Get probs
    probs = np.zeros([self.length, self.max_val, self.max_val])
    probs[0, :] = np.histogram(self.keep_sample[:, 0],
                               np.arange(self.max_val + 1),
                               density=True)[0]

    for i in range(1, self.length):
        for j in range(self.max_val):
            subset = self.keep_sample[np.where(
                self.keep_sample[:, parent[i - 1]] == j)[0]]

            if not len(subset):
                probs[i, j] = 1 / self.max_val
            else:
                probs[i, j] = np.histogram(subset[:, i],
                                           np.arange(self.max_val + 1),
                                           density=True)[0]

    # Update probs and parent
    self.node_probs = probs
    self.parent_nodes = parent
for i in dataset.columns:
    print(i)
    for j in dataset.columns:
        M_info[i][j] = mutual_info_score(dataset[i].values, dataset[j].values)

from scipy.sparse import csr_matrix, find
from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_tree

X = csr_matrix(M_info)
# Negate twice to obtain the maximum spanning tree of the MI matrix
Tcsr = -minimum_spanning_tree(-X)
print(Tcsr)
Array1 = Tcsr.toarray().astype(float)
# Y = csr_matrix(A)
Tcsr_depth = depth_first_tree(Array1, 1, directed=False)
print(Tcsr_depth)
Array2 = Tcsr_depth.toarray().astype(float)
really = np.column_stack((find(Array2)[0], find(Array2)[1]))

pred = np.zeros(len(test_data))
CPD = np.apply_along_axis(gettingRow, 1, really,
                          dataset=dataset, p_1=prob_x_1, p_0=prob_x_0,
                          test=test_data, prediction=pred)