def wardCV(data, labels, cut_level, connect):
    '''Select the number of ward clusters by leave-one-out cross-validation.

    For every candidate cluster count, reduce the features with ward
    agglomeration, run a leave-one-out NuSVR regression, and score the
    out-of-fold predictions by Spearman rank correlation with the labels.

    Parameters
    ----------
    data : 2d array (n_samples, n_features)
    labels : 1d array of continuous targets
    cut_level : 1d array of candidate cluster counts
    connect : connectivity matrix passed to WardAgglomeration

    Returns
    -------
    The entry of cut_level whose (median-filtered) LOO accuracy is highest.
    '''
    accuracies = np.zeros(len(cut_level))
    # Index by position rather than the original boolean mask
    # `accuracies[cut_level == i]`, which is fragile with duplicate
    # candidate values and rescans cut_level every iteration.
    for idx, n_clusters in enumerate(cut_level):
        # reduce to the candidate number of clusters
        agglo = sklcl.WardAgglomeration(connectivity=connect,
                                        n_clusters=n_clusters)
        # leave-one-out: as many folds as samples
        cross = sklcv.KFold(n=len(labels), n_folds=len(labels))
        # float accumulator; np.zeros_like(labels) would inherit the label
        # dtype and truncate float predictions when labels are integers
        pred_vec = np.zeros(len(labels))
        for train_i, test_i in cross:
            # fit the clustering on the training fold only, project both
            use_train = agglo.fit_transform(data[train_i])
            use_test = agglo.transform(data[test_i])
            # z-score with training-fold statistics only (no test leakage)
            scaler = sklpre.StandardScaler()
            use_train = scaler.fit_transform(use_train)
            use_test = scaler.transform(use_test)
            model = sklsvm.NuSVR(kernel='linear', nu=1, C=100)
            model.fit(use_train, labels[train_i])
            pred_vec[test_i] = model.predict(use_test)
        # rank correlation of out-of-fold predictions vs true labels
        accuracies[idx], _ = ss.spearmanr(pred_vec, labels)
    # median-filter the accuracy curve to smooth single-level flukes
    # before picking the peak (addresses the original TODO)
    accuracies = ssig.medfilt(accuracies)
    best_model = cut_level[accuracies.argmax()]
    return best_model
# Code source: Gael Varoqueux # Modified for Documentation merge by Jaques Grobler # License: BSD import numpy as np import pylab as pl from sklearn import datasets, cluster from sklearn.feature_extraction.image import grid_to_graph digits = datasets.load_digits() images = digits.images X = np.reshape(images, (len(images), -1)) connectivity = grid_to_graph(*images[0].shape) agglo = cluster.WardAgglomeration(connectivity=connectivity, n_clusters=32) agglo.fit(X) X_reduced = agglo.transform(X) X_restored = agglo.inverse_transform(X_reduced) images_restored = np.reshape(X_restored, images.shape) pl.figure(1, figsize=(4, 3.5)) pl.clf() pl.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91) for i in range(4): pl.subplot(3, 4, i + 1) pl.imshow(images[i], cmap=pl.cm.gray, vmax=16, interpolation='nearest') pl.xticks(()) pl.yticks(()) if i == 1:
def do_model(train_d, train_l, test_d, connect, use_modules):
    '''Fit one NuSVR on a training fold and predict the test fold.

    Preprocessing stages are enabled by single-letter flags contained in
    use_modules: 'a' ward clustering (size chosen by wardCV), 'b' keep
    positive-direction features, 'c' z-scoring, 'd' univariate FPR
    selection (alpha chosen by univCV).

    Parameters
    ----------
    train_d, test_d : 2d arrays (samples x features)
    train_l : 1d array of training targets
    connect : connectivity matrix for WardAgglomeration
    use_modules : string of flag letters, e.g. 'acd'

    Returns
    -------
    [pred, use_wardsize, use_cut, use_nu] -- predictions for test_d plus
    the per-fold parameter choices (0 when a module is disabled).
    '''
    # ward clustering (a)
    if 'a' in use_modules:
        no_feat = len(train_d[0, :])
        # candidate counts: 100, 80, 50, 10 and 1 percent of the features
        ward_sizes = np.array([int(no_feat),
                               int(no_feat * 0.8),
                               int(no_feat * 0.5),
                               int(no_feat * 0.1),
                               int(no_feat * 0.01)])
        use_wardsize = wardCV(train_d, train_l, ward_sizes, connect)
        agglo = sklcl.WardAgglomeration(connectivity=connect,
                                        n_clusters=use_wardsize)
        train_d = agglo.fit_transform(train_d)
        test_d = agglo.transform(test_d)
    else:
        # numeric sentinel; the original string '0' broke the numeric
        # aggregation (np.median over collected results) in run_pipe
        use_wardsize = 0
    # include positive values only (b)
    # NOTE(review): exact selection semantics live in direction_cutoff --
    # presumably a positive/negative feature split; confirm there
    if 'b' in use_modules:
        bool_pos, bool_neg = direction_cutoff(train_d)
        train_d = train_d[:, bool_pos]
        test_d = test_d[:, bool_pos]
    # scale features to z scores (c)
    if 'c' in use_modules:
        scaler = sklpre.StandardScaler()
        train_d = scaler.fit_transform(train_d)
        test_d = scaler.transform(test_d)
    # univariate selection (d)
    if 'd' in use_modules:
        univ_levels = np.array([1, 0.5, 0.1, 0.05, 0.01,
                                0.005, 0.001, 0.0001])
        use_cut = univCV(train_d, train_l, univ_levels)
        univ_select = sklfs.SelectFpr(alpha=use_cut)
        train_d = univ_select.fit_transform(train_d, train_l)
        test_d = univ_select.transform(test_d)
    else:
        # numeric sentinel (see use_wardsize above)
        use_cut = 0
    # train model; the grid is degenerate (single nu) but GridSearchCV is
    # kept for its internal CV refit and best_params_ bookkeeping
    nus = np.array([1])
    params = dict(nu=nus)
    model = GridSearchCV(estimator=sklsvm.NuSVR(kernel='linear', C=100,
                                                degree=1),
                         param_grid=params, cv=10, n_jobs=1,
                         scoring='r2')
    model.fit(train_d, train_l)
    pred = model.predict(test_d)
    use_nu = model.best_params_['nu']
    return [pred, use_wardsize, use_cut, use_nu]
def run_pipe(input_files, input_labels, use_modules, no_proc): '''run svr forkflow on data''' #--------------Organise inputs #calculate matrix #feature_matrix = prepare_modality(input_files, input_mask) #--------------Execute analysis #prepare feature agglomeration #mask_handle = nb.load(input_mask) connect = sklim.grid_to_graph(*input_files[0].shape, mask=np.invert( np.isnan(np.sum(input_files, 0)))) inshape = input_files.shape feature_matrix = input_files.reshape((inshape[0], -1)) #remove nans sum_features = np.sum(feature_matrix, 0) feature_matrix = feature_matrix[:, np.invert(np.isnan(sum_features))] #cross validation loo = sklcv.KFold(len(input_labels), n_folds=len(input_labels)) print('Starting svr') cv_pred = jl.Parallel(n_jobs=no_proc, verbose=1, pre_dispatch=no_proc * 2)( jl.delayed(do_model)(feature_matrix[train], input_labels[train], feature_matrix[test], connect, use_modules) for train, test in loo) cv_pred = np.array(cv_pred) corr, p = ss.pearsonr(cv_pred[:, 0], input_labels) #creating final model print('creating final model') if use_modules.find('a') != -1: final_agglo = sklcl.WardAgglomeration(connectivity=connect, n_clusters=int( np.median(cv_pred[:, 1]))) feature_matrix = final_agglo.fit_transform(feature_matrix) else: final_agglo = 0 if use_modules.find('b') != -1: bool_pos, bool_neg = direction_cutoff(feature_matrix) feature_matrix = feature_matrix[:, bool_pos] else: bool_pos = 0 if use_modules.find('c') != -1: final_scaler = sklpre.StandardScaler() feature_matrix = final_scaler.fit_transform(feature_matrix) else: final_scaler = 0 if use_modules.find('d') != -1: final_univ = sklfs.SelectFpr(alpha=np.median(cv_pred[:, 2])) feature_matrix = final_univ.fit_transform(feature_matrix, input_labels) else: final_univ = 0 final_model = sklsvm.NuSVR(kernel='linear', C=100, degree=1, nu=np.median(cv_pred[:, 3])) final_model.fit(feature_matrix, input_labels) return cv_pred, corr, p, final_agglo, final_univ, final_scaler, bool_pos, final_model
# connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=50) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # Compute distances #distances = np.exp(-euclidean_distances(X)) distances = euclidean_distances(X) # create clustering estimators kmeans = cluster.KMeans(n_clusters=2) ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=2) ward_five = cluster.Ward(n_clusters=2, connectivity=connectivity) ward_agglo = cluster.WardAgglomeration(n_clusters=2) spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack', affinity="nearest_neighbors", n_neighbors=250) dbscan = cluster.DBSCAN(eps=1) affinity_propagation = cluster.AffinityPropagation(damping=.99, convergence_iter=3, max_iter=1, verbose=True) #,preference=-200) for algorithm in [ kmeans, two_means, ms, ward_five, dbscan, affinity_propagation, spectral ]: