Example #1
def test_tikhonov_regularization_vs_graph_net():
    # Test for one of the extreme cases of Graph-Net: That is, with
    # l1_ratio = 0 (pure Smooth), we compare Graph-Net's performance
    # with the analytical solution for Tikhonov Regularization

    # XXX A small dataset here (this test is very lengthy)
    G = get_gradient_matrix(w.size, mask)
    optimal_model = np.dot(
        sp.linalg.pinv(np.dot(X.T, X) + y.size * np.dot(G.T, G)),
        np.dot(X.T, y))
    graph_net = BaseSpaceNet(mask=mask_,
                             alphas=1. * X.shape[0],
                             l1_ratios=0.,
                             max_iter=400,
                             fit_intercept=False,
                             screening_percentile=100.,
                             standardize=False)
    graph_net.fit(X_, y.copy())
    coef_ = graph_net.coef_[0]
    graph_net_perf = 0.5 / y.size * extmath.norm(
        np.dot(X, coef_) - y) ** 2\
        + 0.5 * extmath.norm(np.dot(G, coef_)) ** 2
    optimal_model_perf = 0.5 / y.size * extmath.norm(
        np.dot(X, optimal_model) - y) ** 2\
        + 0.5 * extmath.norm(np.dot(G, optimal_model)) ** 2
    assert_almost_equal(graph_net_perf, optimal_model_perf, decimal=1)
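The quantity both fits are scored on is the smooth (no l1 term) Graph-Net objective; setting its gradient to zero gives exactly the pseudo-inverse expression used for optimal_model above:

\min_w \; \frac{1}{2n}\lVert Xw - y\rVert_2^2 + \frac{1}{2}\lVert Gw\rVert_2^2
\;\Longrightarrow\; \frac{1}{n}X^\top(Xw - y) + G^\top G\,w = 0
\;\Longrightarrow\; w^\ast = \bigl(X^\top X + n\,G^\top G\bigr)^{+}X^\top y, \qquad n = \texttt{y.size}.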
Example #2
def test_norm_squared_norm():
    X = np.random.RandomState(42).randn(50, 63)
    X *= 100        # check stability
    X += 200

    assert_almost_equal(np.linalg.norm(X.ravel()), norm(X))
    assert_almost_equal(norm(X) ** 2, squared_norm(X), decimal=6)
    assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6)
Example #3
def test_norm_squared_norm():
    X = np.random.RandomState(42).randn(50, 63)
    X *= 100  # check stability
    X += 200

    assert_almost_equal(np.linalg.norm(X.ravel()), norm(X))
    assert_almost_equal(norm(X)**2, squared_norm(X), decimal=6)
    assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6)
Example #4
def test_norm_squared_norm():
    X = np.random.RandomState(42).randn(50, 63)
    X *= 100        # check stability
    X += 200

    assert_almost_equal(np.linalg.norm(X.ravel()), norm(X))
    assert_almost_equal(norm(X) ** 2, squared_norm(X), decimal=6)
    assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6)
    # Check the warning with an int array and np.dot potential overflow
    assert_warns_message(
                    UserWarning, 'Array type is integer, np.dot may '
                    'overflow. Data should be float type to avoid this issue',
                    squared_norm, X.astype(int))
Example #5
def test_norm_squared_norm():
    X = np.random.RandomState(42).randn(50, 63)
    X *= 100  # check stability
    X += 200

    assert_almost_equal(np.linalg.norm(X.ravel()), norm(X))
    assert_almost_equal(norm(X)**2, squared_norm(X), decimal=6)
    assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6)
    # Check the warning with an int array and np.dot potential overflow
    assert_warns_message(
        UserWarning, 'Array type is integer, np.dot may '
        'overflow. Data should be float type to avoid this issue',
        squared_norm, X.astype(int))
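All four variants pin down the same semantics: extmath.norm returns the Euclidean (Frobenius) norm of the raveled array, and squared_norm returns its square without taking the square root (the later variants also assert a warning when an integer array is passed, since np.dot can overflow there). A NumPy-only sketch of the equivalent computations, not the sklearn implementation itself:

import numpy as np

X = np.random.RandomState(0).randn(4, 3)
frobenius = np.sqrt((X ** 2).sum())            # what norm(X) computes
squared = float(np.dot(X.ravel(), X.ravel()))  # what squared_norm(X) computes
assert np.isclose(frobenius, np.linalg.norm(X))
assert np.isclose(squared, frobenius ** 2)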
Example #6
def mean_shift(X, bandwidth, n_seeds, kernel_function='gaussian', max_iterations=100, proximity_thresh=5):
    '''
    ---Parameters---
    X : data in form (samples, dims)
    bandwidth : radius of nearest neighbors
    n_seeds : number of seed points, drawn at random from X
    kernel_function : can be "gaussian" or "flat" or your own kernel function
    proximity_thresh : minimum distance (in pixels) a new cluster must be away from previous ones

    ---Returns---
    cluster_centers : list of the cluster centers found
    cluster_counts : how many pixels are within the neighborhood of each cluster
    '''

    import numpy as np
    from sklearn.neighbors import BallTree, NearestNeighbors
    from sklearn.utils import extmath
    from sklearn.metrics.pairwise import euclidean_distances
    from collections import defaultdict 

    if kernel_function == 'gaussian':
        kernel_update_function = gaussian_kernel
    elif kernel_function == 'flat':
        kernel_update_function = flat_kernel
    else:
        kernel_update_function = kernel_function


    n_points, n_features = X.shape
    stop_thresh = 1e-2 * bandwidth # when mean has converged                                                                                                               
    cluster_centers = []
    cluster_counts = [] 
    # ball_tree = BallTree(X)# to efficiently look up nearby points
    neighbors = NearestNeighbors(radius=bandwidth).fit(X)

    seeds = X[(np.random.uniform(0, X.shape[0], n_seeds)).astype(int)]
 
    # For each seed, climb gradient until convergence or max_iterations                                                                                                     
    for weighted_mean in seeds:
         completed_iterations = 0
         while True:
             points_within = X[neighbors.radius_neighbors([weighted_mean], bandwidth, return_distance=False)[0]]
             old_mean = weighted_mean  # save the old mean                                                                                                                  
             weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
             converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
             if converged or completed_iterations == max_iterations:
                # Only add cluster if it's different enough from other centers
                if len(cluster_centers) > 0:
                    diff_from_prev = [np.linalg.norm(weighted_mean-cluster_centers[i], 2) for i in range(len(cluster_centers))]
                    if np.min(diff_from_prev) > proximity_thresh:
                        cluster_centers.append(weighted_mean)
                        cluster_counts.append(points_within.shape[0])
                else:
                    cluster_centers.append(weighted_mean)
                    cluster_counts.append(points_within.shape[0])
                break
             completed_iterations += 1
 
    return cluster_centers, cluster_counts
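gaussian_kernel and flat_kernel are referenced above but not defined on this page. A plausible sketch of such helpers with the (mean, neighborhood, bandwidth) signature used by the examples, modeled on the inline gaussian_kernel from Example #30 below (an assumption, not the original code):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def gaussian_kernel(x, points, bandwidth):
    # Gaussian-weighted mean of the neighborhood, weights decaying with distance to x.
    distances = euclidean_distances(points, x.reshape(1, -1))
    weights = np.exp(-(distances ** 2) / bandwidth ** 2)
    return np.sum(points * weights, axis=0) / np.sum(weights)

def flat_kernel(x, points, bandwidth):
    # Unweighted mean of the neighborhood (classic flat-kernel mean-shift step).
    return np.mean(points, axis=0)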
Example #7
def test_logistic_derivative_lipschitz_constant():
    # Tests Lipschitz-continuity of the derivative of logistic loss
    rng = check_random_state(42)
    grad_weight = 2.08e-1
    lipschitz_constant = _logistic_derivative_lipschitz_constant(
        X, mask, grad_weight)
    for _ in range(20):
        x_1 = rng.rand((w.shape[0] + 1)) * rng.randint(1000)
        x_2 = rng.rand((w.shape[0] + 1)) * rng.randint(1000)
        gradient_difference = extmath.norm(
            _logistic_data_loss_and_spatial_grad_derivative(
                X, y, x_1, mask, grad_weight) -
            _logistic_data_loss_and_spatial_grad_derivative(
                X, y, x_2, mask, grad_weight))
        point_difference = extmath.norm(x_1 - x_2)
        assert_true(
            gradient_difference <= lipschitz_constant * point_difference)
Example #8
def test_logistic_derivative_lipschitz_constant():
    # Tests Lipschitz-continuity of the derivative of logistic loss
    rng = check_random_state(42)
    grad_weight = 2.08e-1
    lipschitz_constant = _logistic_derivative_lipschitz_constant(
        X, mask, grad_weight)
    for _ in range(20):
        x_1 = rng.rand((w.shape[0] + 1)) * rng.randint(1000)
        x_2 = rng.rand((w.shape[0] + 1)) * rng.randint(1000)
        gradient_difference = extmath.norm(
            _logistic_data_loss_and_spatial_grad_derivative(
                X, y, x_1, mask, grad_weight)
            - _logistic_data_loss_and_spatial_grad_derivative(
                X, y, x_2, mask, grad_weight))
        point_difference = extmath.norm(x_1 - x_2)
        assert_true(
            gradient_difference <= lipschitz_constant * point_difference)
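Both copies of this test (and the squared-loss variants below) check the defining inequality for the returned constant L: for any two random points x_1, x_2,

\lVert \nabla f(x_1) - \nabla f(x_2)\rVert_2 \;\le\; L\,\lVert x_1 - x_2\rVert_2,

with the gradients evaluated by _logistic_data_loss_and_spatial_grad_derivative.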
Example #9
    def _pa(self, loss_t, x_t):
        denom = extmath.norm(x_t) ** 2.0
        # special case when L_2 norm of x_t is zero (followed libol
        # implementation)
        if denom == 0:
            return 1

        d = loss_t / denom
        return d
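For context, this is the classic passive-aggressive step size: given the loss \ell_t suffered on example x_t, the update magnitude is

\tau_t = \frac{\ell_t}{\lVert x_t\rVert_2^2},

with the zero-norm special case handled as in the libol implementation mentioned in the comment.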
Example #10
def test__squared_loss_derivative_lipschitz_constant():
    # Tests Lipschitz-continuity of the derivative of _squared_loss loss
    # function
    rng = check_random_state(42)
    grad_weight = 2.08e-1
    lipschitz_constant = _squared_loss_derivative_lipschitz_constant(
        X, mask, grad_weight)
    for _ in range(20):
        x_1 = rng.rand(*w.shape) * rng.randint(1000)
        x_2 = rng.rand(*w.shape) * rng.randint(1000)
        gradient_difference = extmath.norm(
            _squared_loss_and_spatial_grad_derivative(X, y, x_1, mask,
                                                      grad_weight)
            - _squared_loss_and_spatial_grad_derivative(X, y, x_2, mask,
                                                        grad_weight))
        point_difference = extmath.norm(x_1 - x_2)
        assert_true(
            gradient_difference <= lipschitz_constant * point_difference)
Example #11
def test__squared_loss_derivative_lipschitz_constant():
    # Tests Lipschitz-continuity of the derivative of _squared_loss loss
    # function
    rng = check_random_state(42)
    grad_weight = 2.08e-1
    lipschitz_constant = _squared_loss_derivative_lipschitz_constant(
        X, mask, grad_weight)
    for _ in range(20):
        x_1 = rng.rand(*w.shape) * rng.randint(1000)
        x_2 = rng.rand(*w.shape) * rng.randint(1000)
        gradient_difference = extmath.norm(
            _squared_loss_and_spatial_grad_derivative(X, y, x_1, mask,
                                                      grad_weight) -
            _squared_loss_and_spatial_grad_derivative(X, y, x_2, mask,
                                                      grad_weight))
        point_difference = extmath.norm(x_1 - x_2)
        assert_true(
            gradient_difference <= lipschitz_constant * point_difference)
Example #12
def _reorth(basis, target, rows=None, alpha=0.5):
    """Reorthogonalize a vector using iterated Gram-Schmidt

    Parameters
    ----------
    basis: ndarray, shape (n_features, n_basis)
        The matrix whose rows are a set of basis to reorthogonalize against

    target: ndarray, shape (n_features,)
        The target vector to be reorthogonalized

    rows: {array-like, None}, default None
        Indices of rows from basis to use. Use all if None

    alpha: float, default 0.5
        Parameter for determining whether to do a second reorthogonalization.

    Returns
    -------
    reorthed_target: ndarray, shape (n_features,)
        The reorthogonalized vector
    """
    if rows is not None:
        basis = basis[rows]
    norm_target = norm(target)

    norm_target_old = 0
    n_reorth = 0

    while norm_target < alpha * norm_target_old or n_reorth == 0:
        for row in basis:
            t = fast_dot(row, target)
            target = target - t * row

        norm_target_old = norm_target
        norm_target = norm(target)
        n_reorth += 1

        if n_reorth > 4:
            # target in span(basis) => accept target = 0
            target = np.zeros(basis.shape[0])
            break

    return target
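A small usage sketch (an assumption: norm and fast_dot are the older sklearn.utils.extmath helpers; binding them to their NumPy equivalents in the same module lets the function above run on current versions):

import numpy as np

norm = np.linalg.norm   # extmath.norm equivalent for vectors
fast_dot = np.dot       # fast_dot was an alias for np.dot in old scikit-learn

basis = np.eye(5)[:2]                          # two orthonormal rows of R^5
target = np.array([1.0, 2.0, 3.0, 0.0, -1.0])
reorthed = _reorth(basis, target)              # _reorth as defined above
assert np.allclose(basis.dot(reorthed), 0.0)   # orthogonal to every basis row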
Example #13
File: tga.py  Project: vyraun/tga
def _reorth(basis, target, rows=None, alpha=0.5):
    """Reorthogonalize a vector using iterated Gram-Schmidt

    Parameters
    ----------
    basis: ndarray, shape (n_features, n_basis)
        The matrix whose rows are a set of basis to reorthogonalize against

    target: ndarray, shape (n_features,)
        The target vector to be reorthogonalized

    rows: {array-like, None}, default None
        Indices of rows from basis to use. Use all if None

    alpha: float, default 0.5
        Parameter for determining whether to do a second reorthogonalization.

    Returns
    -------
    reorthed_target: ndarray, shape (n_features,)
        The reorthogonalized vector
    """
    if rows is not None:
        basis = basis[rows]
    norm_target = norm(target)

    norm_target_old = 0
    n_reorth = 0

    while norm_target < alpha * norm_target_old or n_reorth == 0:
        for row in basis:
            t = fast_dot(row, target)
            target = target - t * row

        norm_target_old = norm_target
        norm_target = norm(target)
        n_reorth += 1

        if n_reorth > 4:
            # target in span(basis) => accept target = 0
            target = np.zeros(basis.shape[0])
            break

    return target
Example #14
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
    """Normalize rows and columns of ``X`` simultaneously so that all
    rows sum to one constant and all columns sum to a different
    constant.

    """
    # According to paper, this can also be done more efficiently with
    # deviation reduction and balancing algorithms.
    X = make_nonnegative(X)
    X_scaled = X
    dist = None
    for _ in range(max_iter):
        X_new, _, _ = _scale_normalize(X_scaled)
        if issparse(X):
            dist = norm(X_scaled.data - X.data)
        else:
            dist = norm(X_scaled - X_new)
        X_scaled = X_new
        if dist is not None and dist < tol:
            break
    return X_scaled
Example #15
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
    """Normalize rows and columns of ``X`` simultaneously so that all
    rows sum to one constant and all columns sum to a different
    constant.

    """
    # According to paper, this can also be done more efficiently with
    # deviation reduction and balancing algorithms.
    X = make_nonnegative(X)
    X_scaled = X
    dist = None
    for _ in range(max_iter):
        X_new, _, _ = _scale_normalize(X_scaled)
        if issparse(X):
            dist = norm(X_scaled.data - X.data)
        else:
            dist = norm(X_scaled - X_new)
        X_scaled = X_new
        if dist is not None and dist < tol:
            break
    return X_scaled
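_scale_normalize is not shown on this page; in scikit-learn's spectral biclustering code it rescales the matrix by the inverse square roots of its row and column sums, i.e. R^{-1/2} X C^{-1/2}. A rough dense-only sketch of that step (an approximation of the helper, not its actual implementation, and assuming X has already been made nonnegative as above):

import numpy as np

def scale_normalize_dense(X):
    # Divide each row and column by the square root of its sum,
    # i.e. compute R^{-1/2} X C^{-1/2}.
    row = 1.0 / np.sqrt(X.sum(axis=1))
    col = 1.0 / np.sqrt(X.sum(axis=0))
    return X * row[:, np.newaxis] * col[np.newaxis, :], row, col

_bistochastic_normalize then repeats this until the change between successive iterates, measured with norm, falls below tol.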
Example #16
def test_tikhonov_regularization_vs_graph_net():
    # Test for one of the extreme cases of Graph-Net: That is, with
    # l1_ratio = 0 (pure Smooth), we compare Graph-Net's performance
    # with the analytical solution for Tikhonov Regularization

    # XXX A small dataset here (this test is very lengthy)
    G = get_gradient_matrix(w.size, mask)
    optimal_model = np.dot(sp.linalg.pinv(
        np.dot(X.T, X) + y.size * np.dot(G.T, G)), np.dot(X.T, y))
    graph_net = BaseSpaceNet(
        mask=mask_, alphas=1. * X.shape[0], l1_ratios=0., max_iter=400,
        fit_intercept=False,
        screening_percentile=100., standardize=False)
    graph_net.fit(X_, y.copy())
    coef_ = graph_net.coef_[0]
    graph_net_perf = 0.5 / y.size * extmath.norm(
        np.dot(X, coef_) - y) ** 2\
        + 0.5 * extmath.norm(np.dot(G, coef_)) ** 2
    optimal_model_perf = 0.5 / y.size * extmath.norm(
        np.dot(X, optimal_model) - y) ** 2\
        + 0.5 * extmath.norm(np.dot(G, optimal_model)) ** 2
    assert_almost_equal(graph_net_perf, optimal_model_perf, decimal=1)
Example #17
def test_lasso_vs_graph_net():
    # Test for one of the extreme cases of Graph-Net: That is, with
    # l1_ratio = 1 (pure Lasso), we compare Graph-Net's performance with
    # Scikit-Learn lasso
    lasso = Lasso(max_iter=100, tol=1e-8, normalize=False)
    graph_net = BaseSpaceNet(mask=mask, alphas=1. * X_.shape[0],
                             l1_ratios=1, is_classif=False,
                             penalty="graph-net", max_iter=100)
    lasso.fit(X_, y)
    graph_net.fit(X, y)
    lasso_perf = 0.5 / y.size * extmath.norm(np.dot(
        X_, lasso.coef_) - y) ** 2 + np.sum(np.abs(lasso.coef_))
    graph_net_perf = 0.5 * ((graph_net.predict(X) - y) ** 2).mean()
    np.testing.assert_almost_equal(graph_net_perf, lasso_perf, decimal=3)
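In scikit-learn's parameterization, lasso_perf above is the Lasso objective with alpha = 1 (the estimator's default):

\frac{1}{2n}\,\lVert X w - y\rVert_2^2 \;+\; \alpha\,\lVert w\rVert_1, \qquad \alpha = 1,\ n = \texttt{y.size},

while graph_net_perf keeps only the data-fit half, evaluated on the Graph-Net predictions.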
def f_regression_nosparse(X, y, center=True):
    """Univariate linear regression tests

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
       with respect to constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector.

    center : True, bool,
        If true, X and y will be centered.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    X, y = check_arrays(X, y, dtype=np.float)
    y = y.ravel()
    if center:
        y = y - np.mean(y)
        X = X.copy('F')  # faster in fortran
        X -= X.mean(axis=0)

    # compute the correlation
    corr = np.dot(y, X)
    # XXX could use corr /= row_norms(X.T) here, but the test doesn't pass
    corr /= np.asarray(np.sqrt((X ** 2).sum(axis=0))).ravel()
    corr /= norm(y)

    # convert to p-value
    degrees_of_freedom = y.size - (2 if center else 1)
    F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
    pv = stats.f.sf(F, 1, degrees_of_freedom)
    return F, pv
def compute_bench(samples_range,
                  features_range,
                  iter_range=[3],
                  rank_range=[50]):

    it = 0

    results = []
    results1 = []
    results2 = []

    max_it = len(samples_range) * len(features_range) * len(rank_range) * len(
        iter_range)
    for n_samples in samples_range:
        for n_features in features_range:
            for rank in rank_range:
                for n_iter in iter_range:
                    it += 1
                    print('====================')
                    print('Iteration %03d of %03d' % (it, max_it))
                    print('====================')
                    X = make_low_rank_matrix(n_samples,
                                             n_features,
                                             effective_rank=rank,
                                             tail_strength=0.2)
                    value = norm(X)
                    gc.collect()
                    print("benchmarking scipy svd: ")
                    tstart = time()
                    s1, v1, d1 = svd(X, full_matrices=False)
                    value1 = v1[0]
                    results.append(value1 / value1)

                    gc.collect()
                    print("benchmarking randomized_svd: n_iter=1")
                    tstart = time()
                    s2, v2, d2 = randomized_svd(X, rank, n_iter=1)
                    value2 = v2[0]
                    results1.append(value2 / value1)

                    gc.collect()
                    print("benchmarking randomized_svd: n_iter=%d " %
                          iter_range[0])
                    tstart = time()
                    s3, v3, d3 = randomized_svd(X, rank, n_iter=n_iter)
                    value3 = v3[0]
                    results2.append(value3 / value1)

    return results, results1, results2
Example #20
def _iter(X, weighted_mean, kernel_update_function, bandwidth, ball_tree,
          stop_thresh, max_iter):
    """Return the cluster center and the within points visited while iterated from the seed
    to the centroid. This code has been isolated to be executed in parallel using JobLib."""
    visited_points = set()
    completed_iterations = 0
    while True:
        within_idx = ball_tree.query_radius([weighted_mean], bandwidth * 3)[0]
        visited_points.update(within_idx)
        points_within = X[within_idx]
        old_mean = weighted_mean  # save the old mean
        weighted_mean = kernel_update_function(old_mean, points_within,
                                               bandwidth)
        converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
        if converged or completed_iterations == max_iter:
            return weighted_mean, visited_points
        completed_iterations += 1
Example #21
def mean_shift(X, bandwidth, seeds, kernel_update_function, max_iterations=10):
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged                                                                                                               
    cluster_centers = []
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    # For each seed, climb gradient until convergence or max_iterations                                                                                                     
    for weighted_mean in seeds:
         completed_iterations = 0
         while True:
             points_within = X[ball_tree.query_radius([weighted_mean], bandwidth*3)[0]]
             old_mean = weighted_mean  # save the old mean                                                                                                                  
             weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
             converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
             if converged or completed_iterations == max_iterations:
                 cluster_centers.append(weighted_mean)
                 break
             completed_iterations += 1

    return cluster_centers
Example #22
def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
    # For each seed, climb gradient until convergence or max_iter
    bandwidth = nbrs.get_params()['radius']
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    completed_iterations = 0
    while True:
        # Find mean of points within bandwidth
        i_nbrs = nbrs.radius_neighbors([my_mean],
                                       bandwidth,
                                       return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break  # Depending on seeding strategy this condition may occur
        my_old_mean = my_mean  # save the old mean
        my_mean = np.mean(points_within, axis=0)
        # If converged or at max_iter, adds the cluster
        if (extmath.norm(my_mean - my_old_mean) < stop_thresh
                or completed_iterations == max_iter):
            return tuple(my_mean), len(points_within)
        completed_iterations += 1
Example #23
def _iter(X,
          weighted_mean,
          kernel_update_function,
          bandwidth,
          ball_tree,
          stop_thresh,
          max_iter):
    """Return the cluster center and the within points visited while iterated from the seed
    to the centroid. This code has been isolated to be executed in parallel using JobLib."""
    visited_points = set()
    completed_iterations = 0
    while True:
        within_idx = ball_tree.query_radius([weighted_mean], bandwidth*3)[0]
        visited_points.update(within_idx)
        points_within = X[within_idx]
        old_mean = weighted_mean  # save the old mean
        weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
        converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
        if converged or completed_iterations == max_iter:
            return weighted_mean, visited_points
        completed_iterations += 1
Example #24
def mean_shift(X,
               bandwidth=None,
               seeds=None,
               bin_seeding=False,
               min_bin_freq=1,
               cluster_all=True,
               max_iter=300,
               max_iterations=None,
               kernel='flat',
               gamma=1.0):
    """Perform MeanShift Clustering of data using a flat kernel

    MeanShift clustering aims to discover *blobs* in a smooth density of
    samples. It is a centroid based algorithm, which works by updating
    candidates for centroids to be the mean of the points within a given
    region. These candidates are then filtered in a post-processing stage
    to eliminate near-duplicates to form the final set of centroids.

    Seeding is performed using a binning technique for scalability.

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    bandwidth : float, optional
        Kernel bandwidth.

        If bandwidth is not given, it is determined using a heuristic based on
        the median of all pairwise distances. This will take quadratic time in
        the number of samples. The sklearn.cluster.estimate_bandwidth function
        can be used to do this more efficiently.
        The only exception is when an rbf kernel is used. In this case
        the default value of bandwidth is 3 / sqrt(gamma).

    seeds : array-like, shape=[n_seeds, n_features] or None
        Point used as initial kernel locations. If None and bin_seeding=False,
        each data point is used as a seed. If None and bin_seeding=True,
        see bin_seeding.

    bin_seeding : boolean, default=False
        If true, initial kernel locations are not locations of all
        points, but rather the location of the discretized version of
        points, where points are binned onto a grid whose coarseness
        corresponds to the bandwidth. Setting this option to True will speed
        up the algorithm because fewer seeds will be initialized.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
       To speed up the algorithm, accept only those bins with at least
       min_bin_freq points as seeds.

    cluster_all : boolean, default True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    max_iter : int, default 300
        Maximum number of iterations, per seed point before the clustering
        operation terminates (for that seed point), if has not converged yet.
    
    kernel : string, optional, default 'flat'
        The kernel used to update the centroid candidates. Implemented kernels
        are 'flat', 'rbf', 'epanechnikov' and 'biweight'.

    gamma : float, optional, default 1.0
        Kernel coefficient for 'rbf'. Higher values make it prefer more
        datapoints in the center of the cluster, lower values make the kernel
        behave more like the flat kernel.

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    Notes
    -----
    See examples/cluster/plot_meanshift.py for an example.

    """
    # FIXME To be removed in 0.18
    if max_iterations is not None:
        warnings.warn(
            "The `max_iterations` parameter has been renamed to "
            "`max_iter` from version 0.16. The `max_iterations` "
            "parameter will be removed in 0.18", DeprecationWarning)
        max_iter = max_iterations

    if bandwidth is None:
        if kernel == 'rbf':
            bandwidth = 3 / np.sqrt(gamma)
        else:
            bandwidth = estimate_bandwidth(X)
    elif bandwidth <= 0:
        raise ValueError(
            "bandwidth needs to be greater than zero or None, got %f" %
            bandwidth)
    if seeds is None:
        if bin_seeding:
            seeds = get_bin_seeds(X, bandwidth, min_bin_freq)
        else:
            seeds = X
    n_samples, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_intensity_dict = {}
    nbrs = NearestNeighbors(radius=bandwidth).fit(X)

    # For each seed, climb gradient until convergence or max_iter
    for my_mean in seeds:
        completed_iterations = 0
        while True:
            # Find mean of points within bandwidth
            i_nbrs = nbrs.radius_neighbors([my_mean],
                                           bandwidth,
                                           return_distance=False)[0]
            points_within = X[i_nbrs]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            my_old_mean = my_mean  # save the old mean
            my_mean = _kernel_update(my_old_mean, points_within, bandwidth,
                                     kernel, gamma)
            # If converged or at max_iter, add the cluster
            if (extmath.norm(my_mean - my_old_mean) < stop_thresh
                    or completed_iterations == max_iter):
                center_intensity_dict[tuple(my_mean)] = len(points_within)
                break
            completed_iterations += 1

    if not center_intensity_dict:
        # nothing near seeds
        raise ValueError(
            "No point was within bandwidth=%f of any seed."
            " Try a different seeding strategy or increase the bandwidth." %
            bandwidth)

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: tup[1],
                                 reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
    labels = np.zeros(n_samples, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels.fill(-1)
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels
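A minimal end-to-end usage sketch on synthetic data. It uses the stock sklearn.cluster.MeanShift estimator, which covers the flat-kernel case of the function above (the rbf/epanechnikov/biweight kernels and gamma are specific to this variant); the bandwidth value is illustrative only:

import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
ms = MeanShift(bandwidth=1.5, bin_seeding=True).fit(X)
print(ms.cluster_centers_.shape, np.bincount(ms.labels_))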
Example #25
    def _fit(self, X):
        """Fit the model on X

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        """
        self.nnzs = []
        X_orig = X.copy()
        if self.trim_proportion < 0 or self.trim_proportion > 0.5:
            raise ValueError('`trim_proportion` must be between 0 and 0.5,'
                             ' got %s.' % self.trim_proportion)
        
        lam = 1.0 / np.sqrt(np.max(X.shape))
        rng = check_random_state(self.random_state)
        X = check_array(X)
        self.obj = []
        n_samples, n_features = X.shape
        X = as_float_array(X, copy=self.copy)
        # Center data
        if self.centering == 'mean':
            self.center_ = np.mean(X, axis=0)
        elif self.centering == 'median':
            self.center_ = np.median(X, axis=0)
        else:
            raise ValueError("`centering` must be 'mean' or 'median', "
                             "got %s" % self.centering)
        X -= self.center_

        if self.n_components is None:
            n_components = X.shape[1]
        elif not 0 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d"
                             % (self.n_components, n_features))
        else:
            n_components = self.n_components

        self.components_ = np.empty((n_components, n_features))
        for k in range(n_components):
            # compute k'th principal component
            mu = rng.rand(n_features) - 0.5
            mu = mu / norm(mu)

            # initialize using a few EM iterations
            for i in range(3):
                dots = fast_dot(X, mu)
                mu = fast_dot(dots.T, X)
                mu = mu / norm(mu)

            # grassmann average
            for i in range(n_samples):
                prev_mu = mu
                dot_signs = np.sign(fast_dot(X, mu))
                mu = _trimmed_mean(X * dot_signs[:, np.newaxis],
                                   self.trim_proportion)
                mu = mu / norm(mu)

                
                if np.max(np.abs(mu - prev_mu)) < self.tol:
                    break

            # store the estimated vector and possibly re-orthonormalize
            if k > 0:
                mu = _reorth(self.components_[:k-1], mu)
                mu = mu / norm(mu)

            self.components_[k] = mu

            if k < n_components - 1:
                X = X - fast_dot(fast_dot(X, mu)[:, np.newaxis],
                                 mu[np.newaxis, :])
                L = X + self.center_
                S = X_orig - L
                
                o = norm_(L, 'nuc') + lam*np.sum(np.abs(S))
                #print('TGA Objective = ', o)
                self.obj.append(o)
                self.nnzs.append(np.sum(S > 0))
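The value stored in self.obj is the usual robust-PCA surrogate objective, with L the low-rank reconstruction (the deflated data plus the center) and S = X_orig - L the residual; here norm_ is presumably an alias for np.linalg.norm, so norm_(L, 'nuc') is the nuclear norm:

\lVert L\rVert_* \;+\; \lambda\,\lVert S\rVert_1, \qquad \lambda = \frac{1}{\sqrt{\max(n_{\text{samples}},\, n_{\text{features}})}}.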
Example #26
File: tga.py  Project: vyraun/tga
    def _fit(self, X):
        """Fit the model on X

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        """
        if self.trim_proportion < 0 or self.trim_proportion > 0.5:
            raise ValueError('`trim_proportion` must be between 0 and 0.5,'
                             ' got %s.' % self.trim_proportion)

        rng = check_random_state(self.random_state)
        X = check_array(X)
        n_samples, n_features = X.shape
        X = as_float_array(X, copy=self.copy)
        # Center data
        if self.centering == 'mean':
            self.center_ = np.mean(X, axis=0)
        elif self.centering == 'median':
            self.center_ = np.median(X, axis=0)
        else:
            raise ValueError("`centering` must be 'mean' or 'median', "
                             "got %s" % self.centering)
        X -= self.center_

        if self.n_components is None:
            n_components = X.shape[1]
        elif not 0 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d" %
                             (self.n_components, n_features))
        else:
            n_components = self.n_components

        self.components_ = np.empty((n_components, n_features))
        for k in range(n_components):
            # compute k'th principal component
            mu = rng.rand(n_features) - 0.5
            mu = mu / norm(mu)

            # initialize using a few EM iterations
            for i in range(3):
                dots = fast_dot(X, mu)
                mu = fast_dot(dots.T, X)
                mu = mu / norm(mu)

            # grassmann average
            for i in range(n_samples):
                prev_mu = mu
                dot_signs = np.sign(fast_dot(X, mu))
                mu = _trimmed_mean(X * dot_signs[:, np.newaxis],
                                   self.trim_proportion)
                mu = mu / norm(mu)

                if np.max(np.abs(mu - prev_mu)) < self.tol:
                    break

            # store the estimated vector and possibly re-orthonormalize
            if k > 0:
                mu = _reorth(self.components_[:k - 1], mu)
                mu = mu / norm(mu)

            self.components_[k] = mu

            if k < n_components - 1:
                X = X - fast_dot(
                    fast_dot(X, mu)[:, np.newaxis], mu[np.newaxis, :])
Example #27
def mean_shift(X,
               bandwidth,
               n_seeds,
               kernel_function='gaussian',
               max_iterations=100,
               proximity_thresh=5):
    '''
    ---Parameters---
    X : data in form (samples, dims)
    bandwidth : radius of nearest neighbors
    n_seeds : number of seed points, drawn at random from X
    kernel_function : can be "gaussian" or "flat" or your own kernel function
    proximity_thresh : minimum distance (in pixels) a new cluster must be away from previous ones

    ---Returns---
    cluster_centers : list of the cluster centers found
    cluster_counts : how many pixels are within the neighborhood of each cluster
    '''

    import numpy as np
    from sklearn.neighbors import BallTree, NearestNeighbors
    from sklearn.utils import extmath
    from sklearn.metrics.pairwise import euclidean_distances
    from collections import defaultdict

    if kernel_function == 'gaussian':
        kernel_update_function = gaussian_kernel
    elif kernel_function == 'flat':
        kernel_update_function = flat_kernel
    else:
        kernel_update_function = kernel_function

    n_points, n_features = X.shape
    stop_thresh = 1e-2 * bandwidth  # when mean has converged
    cluster_centers = []
    cluster_counts = []
    # ball_tree = BallTree(X)# to efficiently look up nearby points
    neighbors = NearestNeighbors(radius=bandwidth).fit(X)

    seeds = X[(np.random.uniform(0, X.shape[0], n_seeds)).astype(int)]

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            points_within = X[neighbors.radius_neighbors(
                [weighted_mean], bandwidth, return_distance=False)[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within,
                                                   bandwidth)
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                # Only add cluster if it's different enough from other centers
                if len(cluster_centers) > 0:
                    diff_from_prev = [
                        np.linalg.norm(weighted_mean - cluster_centers[i], 2)
                        for i in range(len(cluster_centers))
                    ]
                    if np.min(diff_from_prev) > proximity_thresh:
                        cluster_centers.append(weighted_mean)
                        cluster_counts.append(points_within.shape[0])
                else:
                    cluster_centers.append(weighted_mean)
                    cluster_counts.append(points_within.shape[0])
                break
            completed_iterations += 1

    return cluster_centers, cluster_counts
Example #28
def mean_shift(X, bandwidth=None, seeds=None, kernel="flat",
               max_cluster_radius=-1., max_iterations=300):
    """Perform MeanShift Clustering of data using the specified kernel

    Parameters
    ----------

    X : array [n_samples, n_features]
        Input points to be clustered

    bandwidth : float,
        Kernel bandwidth

    seeds: array [n_seeds, n_features], optional
        Points used as initial kernel locations
        If not set, then use every point as a seed (which may
        be very slow---consider using the `get_bin_seeds` function
        to create a reduced set of seeds.

    max_cluster_radius: float, default -1.
        Used only in post-processing.
        If negative, then each point is clustered into its nearest cluster.
        If positive, then those points that are not within `max_cluster_radius`
        of any cluster center are said to be 'orphans' that do not belong to
        any cluster. Orphans are given cluster label -1.

    Returns
    -------

    cluster_centers : array [n_clusters, n_features]
        Coordinates of cluster centers

    labels : array [n_samples]
        cluster labels for each point

    Notes
    -----
    See examples/plot_meanshift.py for an example.

    """

    if seeds is None:
        seeds = X
    elif len(seeds) == 0:
        raise ValueError("If a list of seeds is provided it cannot be empty.")

    if kernel not in KERNELS:
        valid_kernels = " ".join(KERNELS)
        raise ValueError("Kernel %s is not valid. Valid kernel choices are: %s "
                         % (kernel, valid_kernels))

    # Set maximum neighbor query distance based on kernel
    if kernel in ["flat"]:
        query_distance = bandwidth
        kernel_update_function = flat_kernel_update
        print("Using flat kernel update")
    elif kernel in ["gaussian"]:
        query_distance = bandwidth * 3  # A bit arbitrary
        kernel_update_function = gaussian_kernel_update
        print("Using gaussian kernel update")
    else:
        raise ValueError("Kernel %s not implemented correctly" % kernel)

    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_intensity_dict = {}
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            # Find mean of points within bandwidth
            points_within = X[ball_tree.query_radius([weighted_mean], query_distance)[0]]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
            # If converged or at max_iterations, add the cluster
            if extmath.norm(weighted_mean - old_mean) < stop_thresh or \
                   completed_iterations == max_iterations:
                center_intensity_dict[tuple(weighted_mean)] = len(points_within)
                break
            completed_iterations += 1

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    print "%d clusters before removing duplicates " % len(center_intensity_dict)
    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=np.bool)
    cc_tree = BallTree(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = cc_tree.query_radius([center], bandwidth)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print "%d clusters after removing duplicates " % len(cluster_centers)

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    centers_tree = BallTree(cluster_centers)
    labels = np.zeros(n_points, dtype=np.int)
    distances, idxs = centers_tree.query(X, 1)
    if max_cluster_radius < 0:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= max_cluster_radius
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels
Example #29
def discretize(vectors,
               copy=True,
               max_svd_restarts=30,
               n_iter_max=20,
               random_state=None):
    """Search for a partition matrix (clustering) which is closest to the
    eigenvector embedding.

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    copy : boolean, optional, default: True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, optional, default: 30
        Maximum number of attempts to restart SVD if convergence fails

    n_iter_max : int, optional, default: 20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the rotation matrix

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    -----

    The eigenvector embedding is used to iteratively search for the
    closest discrete partition.  First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated.  Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated.  These two
    calculations are performed until convergence.  The discrete partition
    matrix is returned as the clustering solution.  Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.

    """

    from scipy.sparse import csc_matrix
    from scipy.linalg import LinAlgError

    random_state = check_random_state(random_state)

    vectors = as_float_array(vectors, copy=copy)

    eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Normalize the eigenvectors to an equal length of a vector of ones.
    # Reorient the eigenvectors to point in the negative direction with respect
    # to the first element.  This may have to do with constraining the
    # eigenvectors to lie in a specific quadrant to make the discretization
    # search easier.
    norm_ones = np.sqrt(n_samples)
    for i in range(vectors.shape[1]):
        vectors[:, i] = (vectors[:, i] / norm(vectors[:, i])) \
            * norm_ones
        if vectors[0, i] != 0:
            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])

    # Normalize the rows of the eigenvectors.  Samples should lie on the unit
    # hypersphere centered at the origin.  This transforms the samples in the
    # embedding space to the space of partition matrices.
    vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]

    svd_restarts = 0
    has_converged = False

    # If there is an exception we try to randomize and rerun SVD again
    # do this max_svd_restarts times.
    while (svd_restarts < max_svd_restarts) and not has_converged:

        # Initialize first column of rotation matrix with a row of the
        # eigenvectors
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T

        # To initialize the rest of the rotation matrix, find the rows
        # of the eigenvectors that are as orthogonal to each other as
        # possible
        c = np.zeros(n_samples)
        for j in range(1, n_components):
            # Accumulate c to ensure row is as orthogonal as possible to
            # previous picks as well as current one
            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[c.argmin(), :].T

        last_objective_value = 0.0
        n_iter = 0

        while not has_converged:
            n_iter += 1

            t_discrete = np.dot(vectors, rotation)

            labels = t_discrete.argmax(axis=1)
            vectors_discrete = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components))

            t_svd = vectors_discrete.T * vectors

            try:
                U, S, Vh = np.linalg.svd(t_svd)
                svd_restarts += 1
            except LinAlgError:
                print("SVD did not converge, randomizing and trying again")
                break

            ncut_value = 2.0 * (n_samples - S.sum())
            if ((abs(ncut_value - last_objective_value) < eps)
                    or (n_iter > n_iter_max)):
                has_converged = True
            else:
                # otherwise calculate rotation and continue
                last_objective_value = ncut_value
                rotation = np.dot(Vh.T, U.T)

    if not has_converged:
        raise LinAlgError('SVD did not converge')
    return labels
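The inner loop alternates two closed-form steps: snap the rotated embedding to a 0/1 partition matrix via a per-row argmax, then solve an orthogonal Procrustes problem for the best rotation. With U \Sigma V^\top the SVD of X_{\text{discrete}}^\top X (t_svd above), the optimal rotation and the monitored objective are

R^\ast = \arg\min_{R^\top R = I}\ \bigl\lVert X_{\text{discrete}} - X R\bigr\rVert_F^2 = V U^\top,
\qquad
\bigl\lVert X_{\text{discrete}} - X R^\ast\bigr\rVert_F^2 = 2\Bigl(n_{\text{samples}} - \sum_i \sigma_i\Bigr),

which is exactly the ncut_value tracked for convergence.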
Example #30
def variable_bw_mean_shift(X, bandwidth_array, seeds=None, max_iterations=300):
    """Variable bandwidth mean shift with gaussian kernel

	Parameters
	----------
	X : array-like, shape=[n_samples, n_features]
		Input data.

	bandwidth : array[float], shape=[n_samples]
		Kernel bandwidth.

	seeds : array[float, float], shape=(n_seeds, n_features), optional
		Point used as initial kernel locations. Default is
		setting each point in input data as a seed.

	max_iter : int, default 300
		Maximum number of iterations, per seed point before the clustering
		operation terminates (for that seed point), if has not converged yet.

	Returns
	-------
	cluster_centers : array, shape=[n_clusters, n_features]
		Coordinates of cluster centers.

	labels : array, shape=[n_samples]
		Cluster labels for each point.

	Notes
	-----
	Code adapted from scikit-learn library.

	"""

    if seeds is None:
        seeds = X

    n_points, n_features = X.shape
    stop_thresh = 1e-3 * np.mean(bandwidth_array)  # when mean has converged
    center_intensity_dict = {}
    cluster_centers = []
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    def gaussian_kernel(x, points, bandwidth):
        distances = euclidean_distances(points, x)
        weights = np.exp(-1 * (distances ** 2 / bandwidth ** 2))
        return np.sum(points * weights, axis=0) / np.sum(weights)

    # For each seed, climb gradient until convergence or max_iterations
    for i, weighted_mean in enumerate(seeds):
        completed_iterations = 0
        while True:
            points_within = X[ball_tree.query_radius([weighted_mean], bandwidth_array[i])[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = gaussian_kernel(old_mean, points_within, bandwidth_array[i])
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh

            if converged or completed_iterations == max_iterations:
                if completed_iterations == max_iterations:
                    print("reached max iterations")
                cluster_centers.append(weighted_mean)
                center_intensity_dict[tuple(weighted_mean)] = len(points_within)
                break

            completed_iterations += 1

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_intensity_dict.items(), key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    ball_tree = BallTree(sorted_centers)

    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = ball_tree.query_radius([center], np.mean(bandwidth_array))[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    labels = idxs.flatten()

    return cluster_centers, labels
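The function expects one bandwidth per sample but does not prescribe how to build it; a common adaptive choice (an assumption, not part of the code above) is the distance to each point's k-th nearest neighbor:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_bandwidths(X, k=10):
    # Per-sample bandwidth = distance to the k-th nearest neighbor
    # (k + 1 because each point is its own nearest neighbor).
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(X)
    distances, _ = nbrs.kneighbors(X)
    return distances[:, -1]

# bandwidth_array = knn_bandwidths(X)
# centers, labels = variable_bw_mean_shift(X, bandwidth_array)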
Example #31
def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20,
               random_state=None):
    """Search for a partition matrix (clustering) which is closest to the
    eigenvector embedding.

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    copy : boolean, optional, default: True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, optional, default: 30
        Maximum number of attempts to restart SVD if convergence fails

    n_iter_max : int, optional, default: 20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the rotation matrix

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    -----

    The eigenvector embedding is used to iteratively search for the
    closest discrete partition.  First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated.  Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated.  These two
    calculations are performed until convergence.  The discrete partition
    matrix is returned as the clustering solution.  Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.

    """

    from scipy.sparse import csc_matrix
    from scipy.linalg import LinAlgError

    random_state = check_random_state(random_state)

    vectors = as_float_array(vectors, copy=copy)

    eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Normalize the eigenvectors to an equal length of a vector of ones.
    # Reorient the eigenvectors to point in the negative direction with respect
    # to the first element.  This may have to do with constraining the
    # eigenvectors to lie in a specific quadrant to make the discretization
    # search easier.
    norm_ones = np.sqrt(n_samples)
    for i in range(vectors.shape[1]):
        vectors[:, i] = (vectors[:, i] / norm(vectors[:, i])) \
            * norm_ones
        if vectors[0, i] != 0:
            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])

    # Normalize the rows of the eigenvectors.  Samples should lie on the unit
    # hypersphere centered at the origin.  This transforms the samples in the
    # embedding space to the space of partition matrices.
    vectors = vectors / np.sqrt((vectors ** 2).sum(axis=1))[:, np.newaxis]

    svd_restarts = 0
    has_converged = False

    # If there is an exception we try to randomize and rerun SVD again
    # do this max_svd_restarts times.
    while (svd_restarts < max_svd_restarts) and not has_converged:

        # Initialize first column of rotation matrix with a row of the
        # eigenvectors
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T

        # To initialize the rest of the rotation matrix, find the rows
        # of the eigenvectors that are as orthogonal to each other as
        # possible
        c = np.zeros(n_samples)
        for j in range(1, n_components):
            # Accumulate c to ensure row is as orthogonal as possible to
            # previous picks as well as current one
            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[c.argmin(), :].T

        last_objective_value = 0.0
        n_iter = 0

        while not has_converged:
            n_iter += 1

            t_discrete = np.dot(vectors, rotation)

            labels = t_discrete.argmax(axis=1)
            vectors_discrete = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components))

            t_svd = vectors_discrete.T * vectors

            try:
                U, S, Vh = np.linalg.svd(t_svd)
                svd_restarts += 1
            except LinAlgError:
                print("SVD did not converge, randomizing and trying again")
                break

            ncut_value = 2.0 * (n_samples - S.sum())
            if ((abs(ncut_value - last_objective_value) < eps) or
               (n_iter > n_iter_max)):
                has_converged = True
            else:
                # otherwise calculate rotation and continue
                last_objective_value = ncut_value
                rotation = np.dot(Vh.T, U.T)

    if not has_converged:
        raise LinAlgError('SVD did not converge')
    return labels
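The routine above alternates between snapping the rotated embedding onto a discrete partition matrix and re-fitting the rotation via an SVD. A minimal usage sketch, assuming the function above is available as `discretize` and that scikit-learn is installed (data and parameter values are illustrative only):

# Hypothetical usage: discretize the spectral embedding of three blobs.
import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.manifold import spectral_embedding

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 2) + offset
               for offset in ([0, 0], [6, 6], [0, 6])])
affinity = pairwise_kernels(X, metric="rbf", gamma=1.0)
# Embed into as many components as the desired number of clusters.
maps = spectral_embedding(affinity, n_components=3, drop_first=False)
labels = discretize(maps, random_state=0)
print(np.bincount(labels))  # roughly 20 points per cluster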
Example #32
0
def mean_shift(X, intensities=None, bandwidth=None, seeds=None,
               cluster_all=True, max_iterations=300, verbose=False, use_scipy=True):
    """mean_shift(X, intensities=None, bandwidth=None, seeds=None,
                  cluster_all=True, max_iterations=300, verbose=False, use_scipy=True)

    Mean shift algorithm

    Implementation taken from scikit-learn with two minor variants:

        - Use (by default) scipy KD-trees, which are faster in our case
        - Weighted version of mean-shift using `intensities` as
          weights (i.e., we compute centers of mass rather than means)

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    intensities : array-like, shape=[n_samples]
        Voxel intensities, used to weight the mean

    bandwidth : float
        Kernel bandwidth.

    seeds : array-like, shape=[n_seeds, n_features]
        Points used as initial kernel locations.

    use_scipy : bool
        If True, use cKDTree from scipy.spatial; otherwise
        use NearestNeighbors from sklearn.neighbors

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    volumes : array, shape=[n_clusters]
        Volume of each cluster (# of points in the cluster)

    masses : array, shape=[n_clusters]
        Mass of each cluster (sum of intensities of points in the cluster).

    trajectories : dict
        Mean-shift trajectory (list of visited means) for each seed,
        kept for debugging purposes.
    """
    if seeds is None:
        seeds = X
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_volume_dict = {}
    center_mass_dict = {}
    # tee.log('Fitting NearestNeighbors on', n_points, 'points')
    if use_scipy:
        kdtree = cKDTree(X)
    else:
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

    # For each seed, climb gradient until convergence or max_iterations
    trajectories = {}  # for each seed, a list of points
    tee.log('Moving kernels for', len(seeds), 'seeds')
    pbar = pb.ProgressBar(widgets=['Moving %d seeds: ' % len(seeds), pb.Percentage()],
                          maxval=len(seeds)).start()
    for seed_no, my_mean in enumerate(seeds):
        completed_iterations = 0
        seed = my_mean
        trajectories[seed_no] = []
        while True:
            # Find mean of points within bandwidth
            if use_scipy:
                i_nbrs = kdtree.query_ball_point(my_mean, r=bandwidth)
            else:
                i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                               return_distance=False)[0]
            points_within = X[i_nbrs]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            my_old_mean = my_mean  # save the old mean
            if intensities is None:
                my_mean = np.mean(points_within, axis=0)
            else:
                my_mean = np.average(points_within, axis=0, weights=intensities[i_nbrs])
            # If converged or at max_iterations, add the cluster
            if extmath.norm(my_mean - my_old_mean) < stop_thresh or completed_iterations == max_iterations:
                center_volume_dict[tuple(my_mean)] = len(points_within)
                # With no intensities, the cluster mass reduces to its volume.
                if intensities is None:
                    center_mass_dict[tuple(my_mean)] = len(points_within)
                else:
                    center_mass_dict[tuple(my_mean)] = sum(intensities[i_nbrs])
                break
            completed_iterations += 1
            trajectories[seed_no].append(my_mean)
        if verbose:
            print('seed', seed, '-->', my_mean,
                  center_volume_dict[tuple(my_mean)], center_mass_dict[tuple(my_mean)], completed_iterations)

        pbar.update(seed_no+1)
    pbar.finish()
    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_mass_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    print('started from', len(seeds), 'seeds, now |unique|=', len(unique))
    # print('|center_mass_dict|=', len(center_mass_dict))
    if len(center_mass_dict) == 0:
        tee.log('No valid seeds. Giving up')
        return None, None, None, None, None

    nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print('|cluster_centers|=', len(cluster_centers))
    volumes = [0]*len(cluster_centers)
    masses = [0]*len(cluster_centers)
    for i, c in enumerate(cluster_centers):
        volumes[i] = center_volume_dict[tuple(c)]
        masses[i] = center_mass_dict[tuple(c)]
    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels, volumes, masses, trajectories
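A short usage sketch for the weighted variant above, on synthetic data (it assumes the module-level imports the function relies on, such as `cKDTree`, `NearestNeighbors`, `extmath`, `tee` and `pb`, are available; all values are illustrative):

# Hypothetical usage of the weighted mean_shift above.
import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(100, 3), rng.randn(100, 3) + 6.0])  # two blobs
intensities = rng.rand(len(X)) + 0.5  # fake voxel intensities

centers, labels, volumes, masses, trajectories = mean_shift(
    X, intensities=intensities, bandwidth=2.0)
print(len(centers), 'clusters;', 'volumes:', volumes, 'masses:', masses)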
Example #33
0
def mean_shift(X,
               bandwidth=None,
               seeds=None,
               kernel="flat",
               max_cluster_radius=-1.,
               max_iterations=300):
    """Perform MeanShift Clustering of data using the specified kernel

    Parameters
    ----------

    X : array [n_samples, n_features]
        Input points to be clustered

    bandwidth : float,
        Kernel bandwidth

    seeds: array [n_seeds, n_features], optional
        Points used as initial kernel locations
        If not set, then use every point as a seed (which may
        be very slow---consider using the `get_bin_seeds` function
        to create a reduced set of seeds).

    max_cluster_radius: float, default -1.
        Used only in post-processing.
        If negative, then each point is clustered into its nearest cluster.
        If positive, then those points that are not within `max_cluster_radius`
        of any cluster center are said to be 'orphans' that do not belong to
        any cluster. Orphans are given cluster label -1.

    Returns
    -------

    cluster_centers : array [n_clusters, n_features]
        Coordinates of cluster centers

    labels : array [n_samples]
        cluster labels for each point

    Notes
    -----
    See examples/plot_meanshift.py for an example.

    """

    if seeds is None:
        seeds = X
    elif len(seeds) == 0:
        raise ValueError("If a list of seeds is provided it cannot be empty.")

    if kernel not in KERNELS:
        valid_kernels = " ".join(KERNELS)
        raise ValueError("Kernel %s is not valid. Valid kernel choices are: %s"
                         % (kernel, valid_kernels))

    # Set maximum neighbor query distance based on kernel
    if kernel in ["flat"]:
        query_distance = bandwidth
        kernel_update_function = flat_kernel_update
        print("Using flat kernel update")
    elif kernel in ["gaussian"]:
        query_distance = bandwidth * 3  # A bit arbitrary
        kernel_update_function = gaussian_kernel_update
        print("Using gaussian kernel update")
    else:
        raise ValueError("Kernel %s not implemented correctly" % kernel)

    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_intensity_dict = {}
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            # Find mean of points within bandwidth
            points_within = X[ball_tree.query_radius([weighted_mean],
                                                     query_distance)[0]]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within,
                                                   bandwidth)
            # If converged or at max_iterations, add the cluster
            if extmath.norm(weighted_mean - old_mean) < stop_thresh or \
                   completed_iterations == max_iterations:
                center_intensity_dict[tuple(weighted_mean)] = len(
                    points_within)
                break
            completed_iterations += 1

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    print "%d clusters before removing duplicates " % len(
        center_intensity_dict)
    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: tup[1],
                                 reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    cc_tree = BallTree(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = cc_tree.query_radius([center], bandwidth)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print "%d clusters after removing duplicates " % len(cluster_centers)

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    centers_tree = BallTree(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = centers_tree.query(X, 1)
    if max_cluster_radius < 0:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= max_cluster_radius
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels
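The example above references `KERNELS`, `flat_kernel_update` and `gaussian_kernel_update`, which are defined elsewhere in the original module. A hedged sketch of what such helpers could look like (a plain mean inside the window versus a Gaussian-weighted mean; the exact weighting used by the original code is not shown here):

# Hypothetical sketches of the kernel update helpers referenced above.
import numpy as np

KERNELS = ["flat", "gaussian"]

def flat_kernel_update(old_mean, points_within, bandwidth):
    # Flat (uniform) kernel: the new mean is the plain average of the
    # neighbors found inside the query radius.
    return np.mean(points_within, axis=0)

def gaussian_kernel_update(old_mean, points_within, bandwidth):
    # Gaussian kernel: neighbors are down-weighted by their squared
    # distance to the current mean before averaging.
    sq_dist = np.sum((points_within - old_mean) ** 2, axis=1)
    weights = np.exp(-sq_dist / (2.0 * bandwidth ** 2))
    return np.average(points_within, axis=0, weights=weights)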
Example #35
0
 def _paii(self, loss_t, x_t):
     # PA-II step size: tau = loss_t / (||x_t||^2 + 1 / (2 * C)).
     return loss_t / (extmath.norm(x_t) ** 2.0 + 1.0 / (2.0 * self.C))
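For context, the formula above matches the PA-II step size from the passive-aggressive family of online learners, tau_t = loss_t / (||x_t||^2 + 1/(2C)). A self-contained sketch of how such a step size drives a binary PA-II weight update (the helper below is illustrative and not part of the original class):

# Hypothetical sketch of a binary PA-II update built on the step size above.
import numpy as np

def pa2_update(w, x_t, y_t, C=1.0):
    # Hinge loss for the current example (y_t in {-1, +1}).
    loss_t = max(0.0, 1.0 - y_t * np.dot(w, x_t))
    # PA-II step size: tau = loss / (||x||^2 + 1/(2C)).
    tau = loss_t / (np.dot(x_t, x_t) + 1.0 / (2.0 * C))
    return w + tau * y_t * x_t

w = np.zeros(3)
w = pa2_update(w, np.array([1.0, 0.5, -0.2]), +1)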