Example #1
0
def wardCV(data, labels, cut_level, connect):
    '''Select the best number of ward clusters by leave-one-out CV.

    For each candidate cluster count in `cut_level`, reduce the features
    with ward agglomeration, fit a linear NuSVR per LOO fold, and score
    the pooled held-out predictions with a Spearman correlation against
    `labels`.

    Parameters
    ----------
    data : 2d array, samples x features
    labels : 1d array of regression targets
    cut_level : 1d array of candidate cluster counts
    connect : connectivity matrix passed to the agglomeration

    Returns
    -------
    The element of `cut_level` with the highest (median-filtered)
    cross-validated Spearman correlation.
    '''
    accuracies = np.zeros(len(cut_level))
    for idx, n_clusters in enumerate(cut_level):
        # reduce features to the candidate number of ward clusters
        agglo = sklcl.WardAgglomeration(connectivity=connect,
                                        n_clusters=n_clusters)
        # leave-one-out: one fold per sample (old sklearn KFold API)
        cross = sklcv.KFold(n=len(labels), n_folds=len(labels))
        # explicit float dtype: np.zeros_like(labels) would inherit an
        # integer dtype from integer labels and truncate SVR predictions
        pred_vec = np.zeros(len(labels), dtype=float)
        for train_i, test_i in cross:
            use_train = agglo.fit_transform(data[train_i])
            use_test = agglo.transform(data[test_i])

            # scale fit on the training fold only, then applied to test
            scaler = sklpre.StandardScaler()
            use_train = scaler.fit_transform(use_train)
            use_test = scaler.transform(use_test)

            model = sklsvm.NuSVR(kernel='linear', nu=1, C=100)
            model.fit(use_train, labels[train_i])
            pred_vec[test_i] = model.predict(use_test)
        # pooled accuracy for this cluster count (ignore the p-value)
        accuracies[idx], _ = ss.spearmanr(pred_vec, labels)
    # median-filter over neighbouring cluster counts to smooth noise
    # before picking the best level
    accuracies = ssig.medfilt(accuracies)
    best_model = cut_level[accuracies.argmax()]
    return best_model
Example #2
0
# Code source: Gael Varoqueux
# Modified for Documentation merge by Jaques Grobler
# License: BSD

import numpy as np
import pylab as pl

from sklearn import datasets, cluster
from sklearn.feature_extraction.image import grid_to_graph

# Load the 8x8 digit images and flatten each image into one feature row.
digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
# Pixel-adjacency graph of a single 8x8 image grid, so ward only merges
# spatially neighbouring pixels.
connectivity = grid_to_graph(*images[0].shape)

# Agglomerate the 64 pixels into 32 ward clusters (feature reduction).
agglo = cluster.WardAgglomeration(connectivity=connectivity, n_clusters=32)

agglo.fit(X)
X_reduced = agglo.transform(X)

# Map the reduced features back to pixel space for visual comparison
# with the original images.
X_restored = agglo.inverse_transform(X_reduced)
images_restored = np.reshape(X_restored, images.shape)
# Figure layout for the comparison plots drawn below.
pl.figure(1, figsize=(4, 3.5))
pl.clf()
pl.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91)
for i in range(4):
    pl.subplot(3, 4, i + 1)
    pl.imshow(images[i], cmap=pl.cm.gray, vmax=16, interpolation='nearest')
    pl.xticks(())
    pl.yticks(())
    if i == 1:
Example #3
0
def do_model(train_d, train_l, test_d, connect, use_modules):
    '''Fit the modular SVR pipeline on one train split and predict test_d.

    `use_modules` is a flag string enabling optional preprocessing steps:
      'a' ward feature agglomeration (cluster count chosen by wardCV)
      'b' keep only positive-direction features (direction_cutoff)
      'c' z-score scaling
      'd' univariate FPR feature selection (alpha chosen by univCV)

    Returns ``[pred, use_wardsize, use_cut, use_nu]``: the predictions
    for `test_d` plus the selected hyper-parameters. Disabled modules
    report 0 (numeric, so callers can take np.median over folds).
    '''

    #ward clustering (a)
    if use_modules.find('a') != -1:
        no_feat = len(train_d[0, :])
        # candidate cluster counts: 100, 80, 50, 10 and 1% of features
        ward_sizes = np.array([
            int(no_feat),
            int(no_feat * 0.8),
            int(no_feat * 0.5),
            int(no_feat * 0.1),
            int(no_feat * 0.01)
        ])
        use_wardsize = wardCV(train_d, train_l, ward_sizes, connect)
        agglo = sklcl.WardAgglomeration(connectivity=connect,
                                        n_clusters=use_wardsize)

        train_d = agglo.fit_transform(train_d)
        test_d = agglo.transform(test_d)
    else:
        # numeric placeholder: the string '0' would break the
        # np.median(cv_pred[:, 1]) done by run_pipe downstream
        use_wardsize = 0

    #include positive values only(b)
    if use_modules.find('b') != -1:
        bool_pos, bool_neg = direction_cutoff(train_d)

        train_d = train_d[:, bool_pos]
        test_d = test_d[:, bool_pos]

    #scale features to z scores(c)
    if use_modules.find('c') != -1:
        scaler = sklpre.StandardScaler()

        train_d = scaler.fit_transform(train_d)
        test_d = scaler.transform(test_d)

    #univariate selection(d)
    if use_modules.find('d') != -1:
        univ_levels = np.array([1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001])
        use_cut = univCV(train_d, train_l, univ_levels)
        univ_select = sklfs.SelectFpr(alpha=use_cut)

        train_d = univ_select.fit_transform(train_d, train_l)
        test_d = univ_select.transform(test_d)
    else:
        # numeric placeholder, see use_wardsize above
        use_cut = 0

    #train model: grid search over nu (currently a single candidate)
    nus = np.array([1])
    params = dict(nu=nus)
    model = GridSearchCV(estimator=sklsvm.NuSVR(kernel='linear', C=100,
                                                degree=1),
                         param_grid=params,
                         cv=10,
                         n_jobs=1,
                         scoring='r2')

    model.fit(train_d, train_l)
    pred = model.predict(test_d)

    use_nu = model.best_params_['nu']
    return [pred, use_wardsize, use_cut, use_nu]
Example #4
0
def run_pipe(input_files, input_labels, use_modules, no_proc):
    '''Run the leave-one-out SVR workflow on image data.

    Parameters
    ----------
    input_files : array of images, first axis indexing subjects
    input_labels : per-subject target values
    use_modules : module flag string forwarded to do_model ('a'-'d')
    no_proc : number of parallel jobs for the cross-validation loop

    Returns
    -------
    (cv_pred, corr, p, final_agglo, final_univ, final_scaler, bool_pos,
    final_model): per-fold predictions and hyper-parameters, the LOO
    accuracy (Pearson r and p-value), and the transformers/model refit
    on the full data set (0 placeholders for disabled modules).
    '''

    #--------------Organise inputs
    #calculate matrix
    #feature_matrix = prepare_modality(input_files, input_mask)
    #--------------Execute analysis
    #prepare feature agglomeration
    #mask_handle = nb.load(input_mask)
    # voxel-neighbourhood graph over one image volume, masking out any
    # voxel that is NaN in at least one subject
    connect = sklim.grid_to_graph(*input_files[0].shape,
                                  mask=np.invert(
                                      np.isnan(np.sum(input_files, 0))))
    inshape = input_files.shape

    # flatten each subject's image into a single feature row
    feature_matrix = input_files.reshape((inshape[0], -1))

    #remove nans (drop feature columns that are NaN for any subject)
    sum_features = np.sum(feature_matrix, 0)
    feature_matrix = feature_matrix[:, np.invert(np.isnan(sum_features))]

    #cross validation: leave-one-out, one fold per subject
    loo = sklcv.KFold(len(input_labels), n_folds=len(input_labels))
    print('Starting svr')

    # fit the full pipeline independently on every fold in parallel;
    # each result row is [pred, wardsize, cut, nu] from do_model
    cv_pred = jl.Parallel(n_jobs=no_proc, verbose=1, pre_dispatch=no_proc * 2)(
        jl.delayed(do_model)(feature_matrix[train], input_labels[train],
                             feature_matrix[test], connect, use_modules)
        for train, test in loo)
    cv_pred = np.array(cv_pred)
    # LOO accuracy: correlate held-out predictions with the true labels
    corr, p = ss.pearsonr(cv_pred[:, 0], input_labels)

    #creating final model on the full data, re-using the median of the
    #per-fold hyper-parameters for each enabled module
    print('creating final model')
    if use_modules.find('a') != -1:
        final_agglo = sklcl.WardAgglomeration(connectivity=connect,
                                              n_clusters=int(
                                                  np.median(cv_pred[:, 1])))
        feature_matrix = final_agglo.fit_transform(feature_matrix)
    else:
        final_agglo = 0

    if use_modules.find('b') != -1:
        bool_pos, bool_neg = direction_cutoff(feature_matrix)
        feature_matrix = feature_matrix[:, bool_pos]
    else:
        bool_pos = 0

    if use_modules.find('c') != -1:
        final_scaler = sklpre.StandardScaler()
        feature_matrix = final_scaler.fit_transform(feature_matrix)
    else:
        final_scaler = 0

    if use_modules.find('d') != -1:
        final_univ = sklfs.SelectFpr(alpha=np.median(cv_pred[:, 2]))
        feature_matrix = final_univ.fit_transform(feature_matrix, input_labels)
    else:
        final_univ = 0

    final_model = sklsvm.NuSVR(kernel='linear',
                               C=100,
                               degree=1,
                               nu=np.median(cv_pred[:, 3]))
    final_model.fit(feature_matrix, input_labels)

    return cv_pred, corr, p, final_agglo, final_univ, final_scaler, bool_pos, final_model
Example #5
0
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=50)
# make connectivity symmetric (kneighbors_graph is directed)
connectivity = 0.5 * (connectivity + connectivity.T)

# Compute distances
#distances = np.exp(-euclidean_distances(X))
distances = euclidean_distances(X)

# create clustering estimators; each is configured for two clusters
# where the algorithm takes a cluster count
kmeans = cluster.KMeans(n_clusters=2)
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
ward_five = cluster.Ward(n_clusters=2, connectivity=connectivity)
ward_agglo = cluster.WardAgglomeration(n_clusters=2)
spectral = cluster.SpectralClustering(n_clusters=2,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors",
                                      n_neighbors=250)
dbscan = cluster.DBSCAN(eps=1)
# damping/convergence_iter/max_iter chosen to keep affinity propagation
# cheap to run; preference is left at its default
affinity_propagation = cluster.AffinityPropagation(damping=.99,
                                                   convergence_iter=3,
                                                   max_iter=1,
                                                   verbose=True)
#,preference=-200)

for algorithm in [
        kmeans, two_means, ms, ward_five, dbscan, affinity_propagation,
        spectral
]: