Пример #1
0
def _build_naive_divisive_tree_helper(boxed_Z, ordered_labels, early_termination=False):
    """
    Try to be somewhat memory efficient because Z can be huge.
    @param boxed_Z: a standardized data matrix, boxed so it can be deleted
    @param ordered_labels: integer labels conformant to rows of Z
    @param early_termination: True iff clustering stops when a split is degenerate
    @return: the root of a tree
    """
    if len(boxed_Z) != 1:
        raise ValueError('expected the input matrix to be boxed for deletion')
    Z = boxed_Z[0]
    if len(Z) != len(ordered_labels):
        raise ValueError('the input labels are incompatible with the input matrix')
    p = len(ordered_labels)
    # define the root
    root = mtree.Node()
    # deal with a degenerate split
    if p == 1:
        root.label = ordered_labels[0]
        return root
    # get the eigenvector whose loadings will be used to split the matrix
    Z = util.get_column_centered_matrix(Z)
    U, S, VT = np.linalg.svd(Z, full_matrices=0)
    v = khorr.get_dominant_vector(U, S)
    del U
    del VT
    # split the matrix
    stack = []
    index_split = splitbuilder.eigenvector_to_split(v)
    # if we are doing early termination and the split is degenerate then we are done
    if early_termination and min(len(x) for x in index_split) < 2:
        for loading, row_index in sorted((x, i) for i, x in enumerate(v)):
            child = mtree.Node()
            child.label = ordered_labels[row_index]
            root.add_child(child)
        return root
    for selection_set in index_split:
        selection = list(sorted(selection_set))
        # define the next standardized (but not column centered) matrix
        next_matrix = np.vstack(row for i, row in enumerate(Z) if i in selection_set)
        # define the next ordered labels
        next_ordered_labels = [ordered_labels[i] for i in selection]
        # add to the stack
        stack.append([next_matrix, next_ordered_labels])
    # we no longer need the Z matrix
    del boxed_Z[0]
    del Z
    # build the tree
    while stack:
        next_matrix, next_ordered_labels = stack.pop()
        next_boxed_Z = [next_matrix]
        del next_matrix
        child = _build_naive_divisive_tree_helper(next_boxed_Z, next_ordered_labels, early_termination)
        root.add_child(child)
    return root
Пример #2
0
def data_to_laplacian_sqrt(X):
    """
    Factor the transformed data matrix by singular value decomposition.

    The data is standardized, augmented, and column centered before the
    decomposition. According to the original note, if the output is U, S
    then (U*S)(U*S)' is like a Laplacian. The second returned array holds
    the reciprocals of the singular values, with values that util.is_small
    reports as small replaced by zero.
    @param X: a data matrix
    @return: U, S
    """
    logging.debug('data_to_laplacian_sqrt: creating the standardized matrix')
    standardized = get_standardized_matrix(X)
    logging.debug('data_to_laplacian_sqrt: creating the augmented matrix')
    augmented = standardized_to_augmented_C(standardized)
    logging.debug('data_to_laplacian_sqrt: creating the column centered matrix')
    centered = util.get_column_centered_matrix(augmented)
    logging.debug('data_to_laplacian_sqrt: manually cleaning up old matrices')
    # release the intermediate matrices before the decomposition allocates more
    del standardized, augmented
    logging.debug('data_to_laplacian_sqrt: doing a singular value decomposition')
    U, singular_values, _ = np.linalg.svd(centered, full_matrices=0)
    # invert each singular value, mapping the small ones to zero
    reciprocals = []
    for value in singular_values:
        if util.is_small(value):
            reciprocals.append(0)
        else:
            reciprocals.append(1/value)
    return U, np.array(reciprocals)
Пример #3
0
def build_single_split_tree(X, use_squared_correlation=True):
    """
    Build a two-level mtree that splits the rows of X by the sign of a loading.

    The root gets a negative child and a positive child. Each row becomes a
    grandchild labeled with its row index, attached in order of increasing
    loading on the dominant vector. Loadings that are zero (after snapping
    near-zero values) go to the negative child.
    Only the dominant singular vector is required, which may be faster
    to get than the entire SVD.
    This method is naive compared to build_tree; with the
    use_squared_correlation option disabled it is even more naive.
    @param X: a data matrix, preferably with more rows than columns
    @param use_squared_correlation: True for squared correlation, False for correlation
    @return: the root of a tree
    """
    # prepare the matrix whose dominant vector will split and order the rows
    logging.debug('creating the standardized matrix')
    prepared = khorr.get_standardized_matrix(X)
    if use_squared_correlation:
        logging.debug('creating the augmented matrix')
        prepared = khorr.standardized_to_augmented_C(prepared)
    logging.debug('creating the column centered matrix')
    centered = util.get_column_centered_matrix(prepared)
    logging.debug('manually cleaning up old matrices')
    del prepared
    logging.debug('doing a singular value decomposition')
    U, S, _ = np.linalg.svd(centered, full_matrices=0)
    logging.debug('getting the dominant eigenvector')
    dominant = khorr.get_dominant_vector(U, S)
    # snap values near zero to exactly zero, using the same criterion as in splitbuilder
    epsilon = 1e-14
    loadings = []
    for x in dominant:
        loadings.append(0.0 if abs(x) < epsilon else x)
    # create the skeleton of the tree: a root with one child per sign
    root = mtree.Node()
    neg_child = mtree.Node()
    pos_child = mtree.Node()
    root.add_child(neg_child)
    root.add_child(pos_child)
    # visit the rows by increasing loading, breaking ties by row index
    ranked = [(x, i) for i, x in enumerate(loadings)]
    ranked.sort()
    for loading, row_index in ranked:
        leaf = mtree.Node()
        leaf.label = row_index
        parent = pos_child if loading > 0 else neg_child
        parent.add_child(leaf)
    return root