Пример #1
0
def test_graph_depth_first_trivial_graph():
    """Depth-first tree of a single node without edges has no edges.

    The check is repeated for the directed and the undirected traversal.
    """
    # One node, no edges: zero is declared as the "no edge" marker.
    graph = csgraph_from_dense(np.array([[0]]), null_value=0)
    expected = np.array([[0]])

    for directed in (True, False):
        tree = depth_first_tree(graph, 0, directed)
        assert_array_almost_equal(csgraph_to_dense(tree), expected)
Пример #2
0
def test_graph_depth_first_trivial_graph():
    """Run ``depth_first_tree`` on a 1x1 graph and expect an all-zero tree."""
    dense_input = np.array([[0]])
    sparse_input = csgraph_from_dense(dense_input, null_value=0)

    expected_tree = np.array([[0]])

    # Both traversal modes must give the same (empty) tree.
    for is_directed in (True, False):
        found = depth_first_tree(sparse_input, 0, is_directed)
        found_dense = csgraph_to_dense(found)
        assert_array_almost_equal(found_dense, expected_tree)
Пример #3
0
def test_graph_depth_first():
    """Depth-first tree rooted at node 0 of a weighted 5-node graph.

    The same expected tree is asserted for directed and undirected traversal.
    """
    # Symmetric weight matrix; zero means "no edge".
    weights = np.array([
        [0, 1, 2, 0, 0],
        [1, 0, 0, 0, 3],
        [2, 0, 0, 7, 0],
        [0, 0, 7, 0, 1],
        [0, 3, 0, 1, 0],
    ])
    graph = csgraph_from_dense(weights, null_value=0)

    # Expected tree edges: 0->1, 1->4, 4->3, 3->2.
    expected = np.array([
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 3],
        [0, 0, 0, 0, 0],
        [0, 0, 7, 0, 0],
        [0, 0, 0, 1, 0],
    ])

    for directed in (True, False):
        tree = depth_first_tree(graph, 0, directed)
        assert_array_almost_equal(csgraph_to_dense(tree), expected)
Пример #4
0
def test_graph_depth_first():
    """Depth-first tree rooted at node 0 of a weighted 5-node graph.

    Raises ``SkipTest`` when ``csgraph_from_dense`` is ``None``; otherwise the
    same expected tree is asserted for directed and undirected traversal.
    """
    if csgraph_from_dense is None:
        raise SkipTest("Old version of scipy, doesn't have csgraph.")

    # Symmetric weight matrix; zero means "no edge".
    weight_rows = [
        [0, 1, 2, 0, 0],
        [1, 0, 0, 0, 3],
        [2, 0, 0, 7, 0],
        [0, 0, 7, 0, 1],
        [0, 3, 0, 1, 0],
    ]
    graph = csgraph_from_dense(np.array(weight_rows), null_value=0)

    # Expected tree edges: 0->1, 1->4, 4->3, 3->2.
    expected_rows = [
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 3],
        [0, 0, 0, 0, 0],
        [0, 0, 7, 0, 0],
        [0, 0, 0, 1, 0],
    ]
    expected = np.array(expected_rows)

    for directed in (True, False):
        result = depth_first_tree(graph, 0, directed)
        assert_array_almost_equal(csgraph_to_dense(result), expected)
Пример #5
0
    def eval_node_probs(self):
        """Update probability density estimates.

        Builds a dependency tree over the ``self.length`` variables from the
        pairwise mutual information of the columns of ``self.keep_sample``
        and estimates, for every variable, the distribution of its values
        conditioned on each value of its parent in that tree.

        Args:
        None

        Returns:
        None

        Side effects:
        Sets ``self.node_probs`` (array of shape
        ``(length, max_val, max_val)``) and ``self.parent_nodes`` (array of
        ``length - 1`` parent indices; entry ``k`` belongs to node ``k + 1``).
        """
        # Create mutual info matrix (upper triangle only). Scores are negated
        # so that a *minimum* spanning tree keeps the most informative pairs.
        mutual_info = np.zeros([self.length, self.length])
        for i in range(self.length - 1):
            for j in range(i + 1, self.length):
                mutual_info[i, j] = -1 * mutual_info_score(
                    self.keep_sample[:, i], self.keep_sample[:, j])

        # Find minimum spanning tree of mutual info matrix
        mst = minimum_spanning_tree(csr_matrix(mutual_info))

        # Convert minimum spanning tree to depth first tree with node 0 as root
        dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
        dft = dft.toarray()

        # Determine parent of each node: in column k + 1 the row holding the
        # smallest entry is taken as the parent of node k + 1.
        parent = np.argmin(dft[:, 1:], axis=0)

        # Get probs; rows that are never overwritten below stay all-zero.
        probs = np.zeros([self.length, self.max_val, self.max_val])
        bin_edges = np.arange(self.max_val + 1)

        if not len(self.keep_sample):
            # No samples kept: put all mass on value 0 for the root node.
            probs[0, :, 0] = 1
        else:
            # The root has no parent, so every row gets its marginal.
            probs[0, :] = np.histogram(self.keep_sample[:, 0],
                                       bin_edges,
                                       density=True)[0]

        for i in range(1, self.length):
            parent_values = self.keep_sample[:, parent[i - 1]]
            for j in range(self.max_val):
                # Samples in which the parent of node i takes value j.
                subset = self.keep_sample[np.where(parent_values == j)[0]]

                if not len(subset):
                    # Parent value never observed: all mass on value 0.
                    probs[i, j, 0] = 1
                else:
                    # Histogram only column i. Histogramming the whole
                    # subset would mix the values of every variable into
                    # the distribution of node i.
                    probs[i, j] = np.histogram(subset[:, i],
                                               bin_edges,
                                               density=True)[0]

        # Update probs and parent
        self.node_probs = probs
        self.parent_nodes = parent
Пример #6
0
def get_depth_first_tree(csr_adj_matrix, node_index):
    """Return a depth-first tree rooted at a node chosen by degree rank.

    Nodes of the graph built from ``csr_adj_matrix`` are ranked by
    ``(degree, node)`` in descending order; the node at position
    ``node_index`` of that ranking becomes the root of the tree.

    Parameters
    ----------
    csr_adj_matrix : sparse matrix
        Adjacency matrix, passed to both networkx and scipy.
    node_index : int
        Position in the descending degree ranking (0 = highest degree).

    Returns
    -------
    The result of ``csgraph.depth_first_tree`` for the chosen root.
    """
    graph = nx.from_scipy_sparse_matrix(csr_adj_matrix)

    # Rank nodes from highest to lowest degree (ties broken by node id).
    ranked = [(degree, node) for node, degree in graph.degree()]
    ranked.sort(reverse=True)

    _, root = ranked[node_index]
    return csgraph.depth_first_tree(csr_adj_matrix, root)
Пример #7
0
def getCycles(edges, vD, connLimit=np.inf, lengthLimit=np.inf):
    """Search for cycles in the graph described by ``edges``.

    A depth-first tree is built starting at the first endpoint of the first
    edge. Every edge that is not part of that tree is used as a "cycle
    hint", and ``dlsRec`` is started from the first endpoint of each hint to
    collect paths.

    Parameters
    ----------
    edges : sequence of index pairs
        Each entry ``e`` is indexed as ``e[0]`` / ``e[1]`` into
        ``vD.vertices``.
    vD : object exposing ``vertices``
        ``vertices`` is indexed as a 2-D array of coordinates (presumably a
        Voronoi diagram -- confirm against callers).
    connLimit, lengthLimit : number, optional
        Forwarded unchanged to ``dlsRec``; ``connLimit`` is passed twice.

    Returns
    -------
    tuple
        ``(repulseCycles(paths), cycleHints)``, or ``([], [])`` when
        ``edges`` is empty.
    """
    if len(edges) == 0:
        return [], []

    N = len(vD.vertices)
    # Build a real list: on Python 3 ``map`` would hand a one-shot iterator
    # to getUndirAdjMatrix, which cannot be indexed, sized or re-iterated.
    dists = [np.linalg.norm(vD.vertices[e[0], :] - vD.vertices[e[1], :])
             for e in edges]
    graph = getUndirAdjMatrix(edges, N, dists)
    dfsTree = depth_first_tree(graph, edges[0][0], False)

    # Edges missing from the depth-first tree are the cycle hints.
    treeEdges = makeSet(np.transpose(dfsTree.nonzero()))
    cycleHints = makeSet(edges).difference(treeEdges)

    paths = []
    for e in cycleHints:
        dlsRec(graph, np.ones((N,)) * -1, e[0], e[0], paths,
               connLimit, connLimit, lengthLimit)
    return repulseCycles(paths), cycleHints
    def eval_node_probs(self):
        """Update probability density estimates.

        Takes the matrix returned by ``self._get_mutual_info_impl()``, builds
        its minimum spanning tree, roots it at node 0, and estimates for each
        node the distribution of its values given each value of its parent.
        When ``self.noise`` is positive it is added to every estimated
        conditional distribution, which is then re-normalised.

        Sets ``self.node_probs`` (shape ``(length, max_val, max_val)``) and
        ``self.parent_nodes`` (entry ``k`` is the parent of node ``k + 1``).
        """
        # Mutual info matrix -> minimum spanning tree.
        info_matrix = self._get_mutual_info_impl()
        spanning_tree = minimum_spanning_tree(csr_matrix(info_matrix))

        # Root the spanning tree at node 0; rounding removes float residue.
        rooted = depth_first_tree(csr_matrix(spanning_tree.toarray()), 0,
                                  directed=False)
        tree_weights = np.round(rooted.toarray(), 10)

        # In column k + 1 the row with the smallest entry is node k + 1's
        # parent.
        parent = np.argmin(tree_weights[:, 1:], axis=0)

        bin_edges = np.arange(self.max_val + 1)
        probs = np.zeros([self.length, self.max_val, self.max_val])

        # The root node has no parent: every row holds its marginal.
        probs[0, :] = np.histogram(self.keep_sample[:, 0], bin_edges,
                                   density=True)[0]

        for node in range(1, self.length):
            parent_column = self.keep_sample[:, parent[node - 1]]
            for value in range(self.max_val):
                # Samples whose parent variable equals ``value``.
                subset = self.keep_sample[parent_column == value]

                if len(subset) == 0:
                    # Parent value never observed: fall back to uniform.
                    probs[node, value] = 1 / self.max_val
                    continue

                cond_probs = np.histogram(subset[:, node], bin_edges,
                                          density=True)[0]

                if self.noise > 0:
                    # Smooth with the configured noise and re-normalise.
                    cond_probs = cond_probs + self.noise
                    cond_probs = cond_probs / np.sum(cond_probs)
                    # Second pass in case rounding left the total off 1.
                    if sum(cond_probs) != 1.0:
                        cond_probs = cond_probs / np.sum(cond_probs)

                probs[node, value] = cond_probs

        # Publish the new estimates.
        self.node_probs = probs
        self.parent_nodes = parent
Пример #9
0
def chow_liu_tree(A):
    """Build a tree-structured Bayes net from the data matrix ``A``.

    ``A`` is indexed as a 2-D array with one row per training example and
    one column per variable (presumably binary 0/1 data -- confirm against
    callers and ``mutual_info``).

    Returns a dict in which

    * key ``0`` maps to ``[p_0[0], p_1[0]]``, the smoothed marginal of
      variable 0 (the root), and
    * every key ``(parent, child)`` taken from the depth-first tree maps to
      the four smoothed values
      ``[p0c0 / p_0, p0c1 / p_0, p1c0 / p_1, p1c1 / p_1]`` of that edge.
    """
    n_examples = A.shape[0]
    n_vars = A.shape[1]

    # Smoothed marginal probability of a 1 (and of a 0) for every variable.
    p_1 = (A.sum(axis=0) + 1) / (n_examples + 2)
    p_0 = 1 - p_1

    # Negated mutual information for every pair above the diagonal, in
    # row-major order. Negating turns the maximum-spanning-tree problem
    # into a minimum one.
    neg_mutual_info = [
        -mutual_info(A, row, col, p_1[row], p_1[col], n_examples)
        for row in range(n_vars - 1)
        for col in range(row + 1, n_vars)
    ]
    MI = np.zeros((n_vars, n_vars))
    MI[np.triu_indices(n_vars, 1)] = neg_mutual_info

    # Only the upper triangle is filled in; the spanning-tree routine is
    # relied on to treat the graph as undirected.
    spanning_tree = minimum_spanning_tree(MI)

    # Root the tree at variable 0 (weights flipped back to positive).
    rooted_tree = depth_first_tree(-spanning_tree, 0, directed=False)

    BN = {0: np.array([p_0[0], p_1[0]])}
    smoothing_total = n_examples + 4
    for edge, _weight in rooted_tree.todok().items():
        parent, child = edge[0], edge[1]
        parent_col = A[:, parent]
        child_col = A[:, child]

        # Smoothed joint probabilities; the (0, 0) cell is the remainder.
        p0c1 = (np.logical_and(np.logical_not(parent_col),
                               child_col).sum() + 1) / smoothing_total
        p1c0 = (np.logical_and(parent_col,
                               np.logical_not(child_col)).sum()
                + 1) / smoothing_total
        p1c1 = (np.logical_and(parent_col, child_col).sum()
                + 1) / smoothing_total
        p0c0 = 1 - p0c1 - p1c0 - p1c1

        BN[edge] = np.array([
            p0c0 / p_0[parent],
            p0c1 / p_0[parent],
            p1c0 / p_1[parent],
            p1c1 / p_1[parent],
        ])
    return BN
Пример #10
0
def test_graph_depth_first():
    """Check ``depth_first_tree`` from node 0 on a weighted 5-node graph.

    Raises ``SkipTest`` when ``csgraph_from_dense`` is ``None``. The expected
    tree is identical for the directed and the undirected traversal.
    """
    if csgraph_from_dense is None:
        raise SkipTest("Old version of scipy, doesn't have csgraph.")

    # Symmetric weights, zero meaning "no edge".
    dense_graph = np.array([[0, 1, 2, 0, 0],
                            [1, 0, 0, 0, 3],
                            [2, 0, 0, 7, 0],
                            [0, 0, 7, 0, 1],
                            [0, 3, 0, 1, 0]])
    sparse_graph = csgraph_from_dense(dense_graph, null_value=0)

    # Expected tree edges: 0->1, 1->4, 4->3, 3->2.
    expected_tree = np.array([[0, 1, 0, 0, 0],
                              [0, 0, 0, 0, 3],
                              [0, 0, 0, 0, 0],
                              [0, 0, 7, 0, 0],
                              [0, 0, 0, 1, 0]])

    for use_direction in (True, False):
        found = depth_first_tree(sparse_graph, 0, use_direction)
        assert_array_almost_equal(csgraph_to_dense(found), expected_tree)
    def eval_node_probs(self):
        """Update probability density estimates.

        Builds the negated pairwise mutual-information matrix of the columns
        of ``self.keep_sample``, derives a tree rooted at node 0 from its
        minimum spanning tree, and estimates each node's value distribution
        conditioned on every value of its parent.

        Sets ``self.node_probs`` (shape ``(length, max_val, max_val)``) and
        ``self.parent_nodes`` (entry ``k`` is the parent of node ``k + 1``).

        Raises:
            Exception: when ``mutual_info_score`` raises ``ValueError``; the
                two offending columns are printed first (debugging aid).
        """
        n_vars = self.length

        # Upper-triangular matrix of negated mutual information scores.
        mutual_info = np.zeros([n_vars, n_vars])
        for first in range(n_vars - 1):
            for second in range(first + 1, n_vars):
                # DEBUGGING CODE: dump the inputs when scoring fails.
                try:
                    mutual_info[first, second] = -1 * mutual_info_score(
                        self.keep_sample[:, first],
                        self.keep_sample[:, second])
                except ValueError:
                    print(f'self.keep_sample[:, i] = {self.keep_sample[:, first]}')
                    print(f'self.keep_sample[:, j] = {self.keep_sample[:, second]}')
                    raise Exception("Caught value error")

        # Minimum spanning tree, then re-rooted at node 0.
        spanning_tree = minimum_spanning_tree(csr_matrix(mutual_info))
        rooted = depth_first_tree(csr_matrix(spanning_tree.toarray()), 0,
                                  directed=False)
        tree_weights = np.round(rooted.toarray(), 10)

        # In column k + 1 the row with the smallest entry is node k + 1's
        # parent.
        parent = np.argmin(tree_weights[:, 1:], axis=0)

        bin_edges = np.arange(self.max_val + 1)
        probs = np.zeros([self.length, self.max_val, self.max_val])

        # The root node has no parent: every row holds its marginal.
        probs[0, :] = np.histogram(self.keep_sample[:, 0], bin_edges,
                                   density=True)[0]

        for node in range(1, self.length):
            parent_column = self.keep_sample[:, parent[node - 1]]
            for value in range(self.max_val):
                # Samples whose parent variable equals ``value``.
                subset = self.keep_sample[parent_column == value]

                if len(subset):
                    probs[node, value] = np.histogram(subset[:, node],
                                                      bin_edges,
                                                      density=True)[0]
                else:
                    # Parent value never observed: fall back to uniform.
                    probs[node, value] = 1 / self.max_val

        # Publish the new estimates.
        self.node_probs = probs
        self.parent_nodes = parent
Пример #12
0
def test_graph_depth_first():
    """Verify the depth-first tree of a small weighted graph.

    Starting from node 0, the directed and the undirected traversal must
    both produce the same expected tree.
    """
    # Symmetric weights, zero meaning "no edge".
    input_rows = [[0, 1, 2, 0, 0],
                  [1, 0, 0, 0, 3],
                  [2, 0, 0, 7, 0],
                  [0, 0, 7, 0, 1],
                  [0, 3, 0, 1, 0]]
    graph = csgraph_from_dense(np.array(input_rows), null_value=0)

    # Expected tree edges: 0->1, 1->4, 4->3, 3->2.
    tree_rows = [[0, 1, 0, 0, 0],
                 [0, 0, 0, 0, 3],
                 [0, 0, 0, 0, 0],
                 [0, 0, 7, 0, 0],
                 [0, 0, 0, 1, 0]]
    expected = np.array(tree_rows)

    for directed in (True, False):
        computed = csgraph_to_dense(depth_first_tree(graph, 0, directed))
        assert_array_almost_equal(computed, expected)
Пример #13
0
    p_1 = (training_data.sum(axis=0) + 1) / (m + 2)
    p_0 = 1 - p_1
    #Now we build our complete graph with mutual information
    mut_info_list = []
    for row_index in range(n - 1):
        for column_index in range(row_index + 1, n):
            #We get the mutual information but store the negative because we need the max spanning tree
            mut_info_list.append(
                -mutual_info(training_data, row_index, column_index,
                             p_1[row_index], p_1[column_index], m))

    MI[index_of_tri] = mut_info_list
    #the algorithm will understand the triangle is undirected
    Tcsr = minimum_spanning_tree(MI)
    #Set the starting point of the max spanning tree as variable 0
    DFS_tree = depth_first_tree(-Tcsr, 0, directed=False)
    #We extract the dependencies
    a = DFS_tree.todok().items()
    #initialize the Bayes Net
    BN = {}
    BN[0] = np.array([p_0[0], p_1[0]])
    for arrow in a:
        #specify the index of the parent and the child
        parent = arrow[0][0]
        child = arrow[0][1]
        p0c1 = (np.logical_and(np.logical_not(training_data[:, parent]),
                               training_data[:, child]).sum() + 1) / (m + 4)
        p1c0 = (np.logical_and(training_data[:, parent],
                               np.logical_not(training_data[:, child])).sum() +
                1) / (m + 4)
        p1c1 = (np.logical_and(training_data[:, parent],
Пример #14
0
    def eval_node_probs(self):
        """Update probability density estimates.

        Builds the negated pairwise mutual-information matrix of the columns
        of ``self.keep_sample``: one ``mutual_info_score`` call per pair when
        ``self.mimic_speed`` is falsy, otherwise a bulk NumPy computation
        intended to give the same values. The minimum spanning tree of that
        matrix is rooted at node 0, and each node's value distribution is
        estimated conditioned on every value of its parent.

        Sets ``self.node_probs`` (shape ``(length, max_val, max_val)``) and
        ``self.parent_nodes`` (entry ``k`` is the parent of node ``k + 1``).
        The process-wide NumPy floating-point error settings are left as
        they were found.
        """
        if not self.mimic_speed:
            # Create mutual info matrix, one pair of columns at a time.
            mutual_info = np.zeros([self.length, self.length])
            for i in range(self.length - 1):
                for j in range(i + 1, self.length):
                    mutual_info[i, j] = -1 * mutual_info_score(
                        self.keep_sample[:, i], self.keep_sample[:, j])
        else:
            # Number of samples that survived the current iteration.
            len_sample_kept = self.keep_sample.shape[0]
            # Length of the bit sequence / problem size.
            len_prob = self.keep_sample.shape[1]

            # Lay out every ordered pair of variables side by side. For
            # column p * len_prob + q, ``b`` holds variable p and ``d``
            # holds variable q.
            b = np.repeat(self.keep_sample,
                          self.length).reshape(len_sample_kept,
                                               len_prob * len_prob)
            d = np.hstack(([self.keep_sample] * len_prob))

            # Pre-compute the membership masks and their sizes, which are
            # reused for every value pair in the loop below.
            U = {}
            V = {}
            U_sum = {}
            V_sum = {}
            for i in range(0, self.max_val):
                U[i] = (d == i)
                V[i] = (b == i)
                U_sum[i] = np.sum(U[i], axis=0)
                V_sum[i] = np.sum(V[i], axis=0)

            # Accumulate the mutual information of all variable pairs at
            # once, one (value, value) combination per iteration -- e.g.
            # 00, 01, 10, 11 for a binary string.
            mut_inf = np.zeros([self.length * self.length])

            # log(0) and inf - inf are expected for empty cells. Silence
            # those warnings only for this computation, instead of
            # np.seterr, which would change the setting for the whole
            # process.
            with np.errstate(divide='ignore', invalid='ignore'):
                for i in range(0, self.max_val):
                    for j in range(0, self.max_val):
                        # Samples where the pair takes the values (i, j).
                        coeff = np.sum(U[i] * V[j], axis=0)
                        # Product of the two marginal counts.
                        UV_length = (U_sum[i] * V_sum[j])

                        # log(N * n_ij / (|U_i| * |V_j|))
                        temp = np.log(coeff) - np.log(UV_length) + np.log(
                            len_sample_kept)
                        # Empty cells contribute nothing.
                        temp[np.isnan(temp)] = 0
                        temp[np.isneginf(temp)] = 0

                        # Weight by the joint frequency n_ij / N.
                        mut_inf = mut_inf + temp * np.divide(
                            coeff, len_sample_kept)

            # Negate, as in the slow path, and keep only the strict upper
            # triangle. NOTE: the full matrix is computed, i.e. roughly
            # twice the required work.
            mut_inf = -mut_inf.reshape(self.length, self.length)
            mutual_info = np.triu(mut_inf, k=1)

        # Find minimum spanning tree of mutual info matrix
        mst = minimum_spanning_tree(csr_matrix(mutual_info))

        # Convert minimum spanning tree to depth first tree with node 0 as root
        dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
        dft = np.round(dft.toarray(), 10)

        # Determine parent of each node: in column k + 1 the row holding the
        # smallest entry is taken as the parent of node k + 1.
        parent = np.argmin(dft[:, 1:], axis=0)

        # Get probs
        probs = np.zeros([self.length, self.max_val, self.max_val])

        # The root node has no parent: every row holds its marginal.
        probs[0, :] = np.histogram(self.keep_sample[:, 0],
                                   np.arange(self.max_val + 1),
                                   density=True)[0]

        for i in range(1, self.length):
            for j in range(self.max_val):
                # Samples in which the parent of node i takes value j.
                subset = self.keep_sample[np.where(
                    self.keep_sample[:, parent[i - 1]] == j)[0]]

                if not len(subset):
                    # Parent value never observed: fall back to uniform.
                    probs[i, j] = 1 / self.max_val
                else:
                    probs[i, j] = np.histogram(subset[:, i],
                                               np.arange(self.max_val + 1),
                                               density=True)[0]

        # Update probs and parent
        self.node_probs = probs
        self.parent_nodes = parent
Пример #15
0
def quick_split(G, train_frac=0.51):
    r"""
    Compute one train/test split of the edges of ``G``.

    A depth-first tree started at a random node supplies the first train
    edges; further randomly chosen edges are added until the requested
    number of train edges is reached. All remaining edges form the test set.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    train_frac : float, optional
        Size of the train set relative to the total number of edges, in the
        range (0.0, 1.0]. Default is 0.51.

    Returns
    -------
    train_E : array
       Array with one ``src, dst`` pair per row. When ``train_frac`` is
       exactly 1.0, the set of all edges of ``G`` is returned instead.
    test_E : array
       Array with one ``src, dst`` pair per row. When ``train_frac`` is
       exactly 1.0, an empty set is returned instead.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        Any further validation is delegated to ``_sanity_check`` (presumably
        it rejects graphs with more than one (weakly) connected component --
        confirm there).
    """
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError(
            'The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges()), set()

    directed = nx.is_directed(G)

    # Adjacency matrix; for undirected graphs only the strict upper triangle
    # is kept so that every edge is counted once.
    adj = nx.adj_matrix(G) if directed else triu(nx.adj_matrix(G), k=1)

    # Basic statistics and linear indices of all edges.
    n = adj.shape[0]
    shape = (n, n)
    n_train = int(adj.nnz * train_frac)
    edge_idx = np.ravel_multi_index(adj.nonzero(), shape)

    # Depth-first tree from a random start node.
    start = np.random.randint(0, adj.shape[0])
    dft = depth_first_tree(adj, start, directed=directed)
    if directed:
        tree_coords = dft.nonzero()
    else:
        # Fold lower-triangle tree edges back onto the upper triangle so
        # they line up with the indices taken from ``adj``.
        tree_coords = triu(tril(dft).T + dft, k=1).nonzero()
    tree_idx = np.ravel_multi_index(tree_coords, shape)

    # Tree edges always go to train; top up with random non-tree edges.
    non_tree_idx = np.setdiff1d(edge_idx, tree_idx)
    extra_idx = np.random.choice(non_tree_idx,
                                 n_train - len(tree_idx),
                                 replace=False)
    train_idx = np.union1d(tree_idx, extra_idx)
    test_idx = np.setdiff1d(non_tree_idx, extra_idx)

    # Convert the linear indices back to (src, dst) rows.
    tr_e = np.array(np.unravel_index(np.array(train_idx), shape)).T
    te_e = np.array(np.unravel_index(np.array(test_idx), shape)).T
    return tr_e, te_e
Пример #16
0
def quick_split(G, train_frac=0.51):
    """
    Split the edges of ``G`` into a train set and a test set (quick split).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    train_frac : float, optional
        Proportion of train edges w.r.t. the total number of edges in the
        input graph, in the range (0.0, 1.0]. Default is 0.51.

    Returns
    -------
    train_E : ndarray
       One ``src, dst`` pair per row. For ``train_frac == 1.0`` the set of
       all edges of ``G`` is returned instead.
    test_E : ndarray
       One ``src, dst`` pair per row. For ``train_frac == 1.0`` an empty set
       is returned instead.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        Further input validation happens in ``_sanity_check`` (presumably a
        check for a single (weakly) connected component -- confirm there).

    Notes
    -----
    The method proceeds as follows: (1) a depth-first tree of the input
    graph is built from a random start node, (2) randomly selected edges are
    added to the tree edges until the train quota is reached, (3) the edges
    not used in the previous steps form the test set.
    """
    # Validate the inputs first.
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges()), set()

    is_digraph = nx.is_directed(G)

    # Adjacency matrix (strict upper triangle only for undirected graphs).
    if is_digraph:
        adjacency = nx.adj_matrix(G)
    else:
        adjacency = triu(nx.adj_matrix(G), k=1)

    # Initial statistics and linear indices of the nonzero entries.
    num_nodes = adjacency.shape[0]
    dims = (num_nodes, num_nodes)
    train_quota = int(adjacency.nnz * train_frac)
    all_lin = np.ravel_multi_index(adjacency.nonzero(), dims)

    # Depth-first tree grown from a randomly chosen node.
    root = np.random.randint(0, adjacency.shape[0])
    tree = depth_first_tree(adjacency, root, directed=is_digraph)
    if not is_digraph:
        # Mirror lower-triangle tree edges into the upper triangle so they
        # match the indices of ``adjacency``.
        tree = triu(tril(tree).T + tree, k=1)
    tree_lin = np.ravel_multi_index(tree.nonzero(), dims)

    # Edges outside the tree: sample enough to fill the train quota, the
    # rest become test edges.
    outside_lin = np.setdiff1d(all_lin, tree_lin)
    sampled_lin = np.random.choice(outside_lin, train_quota - len(tree_lin), replace=False)
    train_lin = np.union1d(tree_lin, sampled_lin)
    test_lin = np.setdiff1d(outside_lin, sampled_lin)

    # Unravel the linear indices into src, dst pairs.
    tr_e = np.array(np.unravel_index(np.array(train_lin), dims)).T
    te_e = np.array(np.unravel_index(np.array(test_lin), dims)).T

    return tr_e, te_e
Пример #17
0
    def eval_node_probs(self):
        """Update probability density estimates.

        Builds the negated pairwise mutual-information matrix of the columns
        of ``self.keep_sample``: one ``mutual_info_score`` call per pair when
        ``self.mimic_speed`` is falsy, otherwise a bulk NumPy computation
        intended to give the same values. The minimum spanning tree of that
        matrix is rooted at node 0, and each node's value distribution is
        estimated conditioned on every value of its parent.

        Sets ``self.node_probs`` (shape ``(length, max_val, max_val)``) and
        ``self.parent_nodes`` (entry ``k`` is the parent of node ``k + 1``).
        The process-wide NumPy floating-point error settings are left as
        they were found.
        """
        if not self.mimic_speed:
            # Create mutual info matrix, one pair of columns at a time.
            mutual_info = np.zeros([self.length, self.length])
            for i in range(self.length - 1):
                for j in range(i + 1, self.length):
                    mutual_info[i, j] = -1 * mutual_info_score(
                        self.keep_sample[:, i], self.keep_sample[:, j])
        else:
            # Number of samples that survived the current iteration.
            len_sample_kept = self.keep_sample.shape[0]
            # Length of the bit sequence / problem size.
            len_prob = self.keep_sample.shape[1]

            # Lay out every ordered pair of variables side by side. For
            # column p * len_prob + q, ``permuted_rows`` holds variable p
            # and ``duplicated_rows`` holds variable q.
            permuted_rows = np.repeat(self.keep_sample, self.length).reshape(
                len_sample_kept, len_prob * len_prob)
            duplicated_rows = np.hstack(([self.keep_sample] * len_prob))

            # Pre-compute the cluster masks and their sizes, which are
            # reused for every value pair in the loop below.
            cluster_U = {}
            cluster_V = {}
            cluster_U_sum = {}
            cluster_V_sum = {}
            for i in range(0, self.max_val):
                cluster_U[i] = (duplicated_rows == i)
                cluster_V[i] = (permuted_rows == i)
                cluster_U_sum[i] = np.sum(cluster_U[i], axis=0)
                cluster_V_sum[i] = np.sum(cluster_V[i], axis=0)

            # Accumulate the mutual information of all variable pairs at
            # once, one (value, value) combination per iteration -- e.g.
            # 00, 01, 10, 11 for a binary string.
            mutual_info_vectorized = np.zeros([self.length * self.length])

            # log(0) and inf - inf are expected for empty cells. Silence
            # those warnings only for this computation, instead of
            # np.seterr, which would change the setting for the whole
            # process.
            with np.errstate(divide='ignore', invalid='ignore'):
                for i in range(0, self.max_val):
                    for j in range(0, self.max_val):
                        # n_ij = |U_i AND V_j|: samples where the pair takes
                        # the values (i, j).
                        joint_count = np.sum(cluster_U[i] * cluster_V[j],
                                             axis=0)
                        # First term of the MI sum: joint frequency n_ij / N.
                        MI_first_term = np.divide(joint_count,
                                                  len_sample_kept)

                        # Second term: log(N * n_ij / (|U_i| * |V_j|)).
                        # The logarithm must be taken of the raw count.
                        # Taking it of n_ij / N drops one log(N), shifts
                        # every score by -log(N), and makes the negated
                        # weights positive, so the argmin-based parent
                        # lookup below picks non-edges.
                        UV_length = (cluster_U_sum[i] * cluster_V_sum[j])
                        MI_second_term = (np.log(joint_count)
                                          - np.log(UV_length)
                                          + np.log(len_sample_kept))
                        # Empty cells contribute nothing.
                        MI_second_term[np.isnan(MI_second_term)] = 0
                        MI_second_term[np.isneginf(MI_second_term)] = 0

                        # Add this value pair's contribution.
                        mutual_info_vectorized = (
                            mutual_info_vectorized
                            + MI_first_term * MI_second_term)

            # Negate, as in the slow path, and reshape to the full matrix.
            mutual_info_full = -mutual_info_vectorized.reshape(
                self.length, self.length)
            # Keep only the strict upper triangle. NOTE: the full matrix is
            # computed, i.e. roughly twice the required work.
            mutual_info = np.triu(mutual_info_full, k=1)

        # Find minimum spanning tree of mutual info matrix
        mst = minimum_spanning_tree(csr_matrix(mutual_info))

        # Convert minimum spanning tree to depth first tree with node 0 as root
        dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False)
        dft = np.round(dft.toarray(), 10)

        # Determine parent of each node: in column k + 1 the row holding the
        # smallest entry is taken as the parent of node k + 1.
        parent = np.argmin(dft[:, 1:], axis=0)

        # Get probs
        probs = np.zeros([self.length, self.max_val, self.max_val])

        # The root node has no parent: every row holds its marginal.
        probs[0, :] = np.histogram(self.keep_sample[:, 0],
                                   np.arange(self.max_val + 1),
                                   density=True)[0]

        for i in range(1, self.length):
            for j in range(self.max_val):
                # Samples in which the parent of node i takes value j.
                subset = self.keep_sample[np.where(
                    self.keep_sample[:, parent[i - 1]] == j)[0]]

                if not len(subset):
                    # Parent value never observed: fall back to uniform.
                    probs[i, j] = 1 / self.max_val
                else:
                    probs[i, j] = np.histogram(subset[:, i],
                                               np.arange(self.max_val + 1),
                                               density=True)[0]

        # Update probs and parent
        self.node_probs = probs
        self.parent_nodes = parent
Пример #18
0
        print(i)
        for j in dataset.columns:

            M_info[i][j] = mutual_info_score(dataset[i].values,
                                             dataset[j].values)

    from scipy.sparse import csr_matrix, find
    from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_tree

    X = csr_matrix(M_info)
    Tcsr = -minimum_spanning_tree(-X)
    print(Tcsr)
    Array1 = Tcsr.toarray().astype(float)

    #Y = csr_matrix(A)
    Tcsr_depth = depth_first_tree(Array1, 1, directed=False)
    print(Tcsr_depth)
    Array2 = Tcsr_depth.toarray().astype(float)

    really = np.column_stack(((find(Array2))[0], (find(Array2))[1]))

    pred = np.zeros(len(test_data))
    CPD = np.apply_along_axis(gettingRow,
                              1,
                              really,
                              dataset=dataset,
                              p_1=prob_x_1,
                              p_0=prob_x_0,
                              test=test_data,
                              prediction=pred)