def test_pls_canonical_basics():
    """PLSCanonical sanity checks: orthogonality, reconstruction, transforms."""
    dataset = load_linnerud()
    X, Y = dataset.data, dataset.target

    model = PLSCanonical(n_components=X.shape[1])
    model.fit(X, Y)

    # Weights and training scores must be orthogonal matrices.
    for mat in (model.x_weights_, model.y_weights_,
                model._x_scores, model._y_scores):
        assert_matrix_orthogonal(mat)

    # The centered/scaled data must factor as X = T P' and Y = U Q'.
    Xc, Yc, x_mean, y_mean, x_std, y_std = _center_scale_xy(
        X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(model._x_scores, model.x_loadings_.T))
    assert_array_almost_equal(Yc, np.dot(model._y_scores, model.y_loadings_.T))

    # Applying the learned rotations to the training data yields the scores.
    assert_array_almost_equal(model.transform(X), model._x_scores)
    Xt, Yt = model.transform(X, Y)
    assert_array_almost_equal(Xt, model._x_scores)
    assert_array_almost_equal(Yt, model._y_scores)

    # Round-trip through inverse_transform recovers the original X.
    assert_array_almost_equal(model.inverse_transform(Xt), X)
def test_sanity_check_pls_canonical():
    # Sanity check for PLSCanonical
    # The results were checked against the R-package plspm
    d = load_linnerud()
    X = d.data
    Y = d.target
    pls = PLSCanonical(n_components=X.shape[1])
    pls.fit(X, Y)
    # Reference values from the R package plspm.  The sign of each component
    # is arbitrary, hence the abs() comparisons and sign-flip checks below.
    expected_x_weights = np.array(
        [
            [-0.61330704, 0.25616119, -0.74715187],
            [-0.74697144, 0.11930791, 0.65406368],
            [-0.25668686, -0.95924297, -0.11817271],
        ]
    )
    expected_x_rotations = np.array(
        [
            [-0.61330704, 0.41591889, -0.62297525],
            [-0.74697144, 0.31388326, 0.77368233],
            [-0.25668686, -0.89237972, -0.24121788],
        ]
    )
    expected_y_weights = np.array(
        [
            [+0.58989127, 0.7890047, 0.1717553],
            [+0.77134053, -0.61351791, 0.16920272],
            [-0.23887670, -0.03267062, 0.97050016],
        ]
    )
    expected_y_rotations = np.array(
        [
            [+0.58989127, 0.7168115, 0.30665872],
            [+0.77134053, -0.70791757, 0.19786539],
            [-0.23887670, -0.00343595, 0.94162826],
        ]
    )
    # Magnitudes must match the reference exactly (up to sign).
    assert_array_almost_equal(np.abs(pls.x_rotations_), np.abs(expected_x_rotations))
    assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights))
    assert_array_almost_equal(np.abs(pls.y_rotations_), np.abs(expected_y_rotations))
    assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights))
    # Sign flips must be consistent between rotations and weights.
    x_rotations_sign_flip = np.sign(pls.x_rotations_ / expected_x_rotations)
    x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights)
    y_rotations_sign_flip = np.sign(pls.y_rotations_ / expected_y_rotations)
    y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights)
    assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip)
    assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip)
    assert_matrix_orthogonal(pls.x_weights_)
    assert_matrix_orthogonal(pls.y_weights_)
    assert_matrix_orthogonal(pls._x_scores)
    assert_matrix_orthogonal(pls._y_scores)
def test_convergence_fail():
    """A too-small ``max_iter`` must raise a ConvergenceWarning on fit."""
    dataset = load_linnerud()
    X, Y = dataset.data, dataset.target
    estimator = PLSCanonical(n_components=X.shape[1], max_iter=2)
    with pytest.warns(ConvergenceWarning):
        estimator.fit(X, Y)
def feature_action_sensitivity(feature_type='TD4'):
    """For each feature, analyse its covariance with and without electrode shift.

    Fits a 2-component PLSCanonical mapping between the shifted-position
    features (X) and the centre-position features (Y) for every subject and
    every shifted electrode position, then plots the transformed scores.

    Parameters
    ----------
    feature_type : str
        'TD4' (MAV/ZC/SSC/WL) or 'TD5' (TD4 + RMS).

    Raises
    ------
    ValueError
        If ``feature_type`` is not 'TD4' or 'TD5'.
    """
    results = []
    subjects = ['subject_' + str(i + 1) for i in range(5)]
    # Electrode positions: S0 is the centre; U/D/L/R are up/down/left/right shifts.
    channel_pos_list = ['S0',
                        'U1', 'U2', 'D1', 'D2',
                        'L1', 'L2', 'R1', 'R2']
    pos_num = len(channel_pos_list)
    actions = [i + 1 for i in range(7)]
    action_num = len(actions)  # number of action classes
    if feature_type == 'TD4':
        feature_list = ['MAV', 'ZC', 'SSC', 'WL']
    elif feature_type == 'TD5':
        feature_list = ['MAV', 'ZC', 'SSC', 'WL', 'RMS']
    else:
        # BUG FIX: an unknown feature_type previously crashed later with an
        # UnboundLocalError on feature_list; fail fast with a clear message.
        raise ValueError("unknown feature_type: %r" % (feature_type,))
    feat_num = len(feature_list)  # feature dimensions per channel
    groups = [i + 1 for i in range(4)]
    group_num = len(groups)  # number of channels
    group_span = group_num * feat_num  # columns per electrode position
    action_span = feat_num * group_num  # 16
    train_dir = 'train4_250_100'
    results.append(['subject', 'action', 'feature', 'group',
                    'means_shift', 'std_shift'])
    plsca = PLSCanonical(n_components=2)
    k = 0
    for pos_idx, pos_name in enumerate(channel_pos_list[1:]):
        pos = pos_idx + 1
        for subject in subjects:
            trains, classes = data_load.load_feature_dataset(
                train_dir, subject, feature_type)
            # First half of the samples for training, the rest for test.
            # BUG FIX: use integer division so m stays an int under Python 3.
            m = trains.shape[0] // 2
            X_train = trains[:m, group_span * pos: group_span * (pos + 1)]
            Y_train = trains[:m, :group_span]  # fixed stray ':m:' slice typo
            X_test = trains[m:, group_span * pos: group_span * (pos + 1)]
            Y_test = trains[m:, :group_span]
            plsca.fit(X_train, Y_train)
            X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
            X_test_r, Y_test_r = plsca.transform(X_test, Y_test)
            filename = subject + '_' + pos_name
            plot_plsc_figure_two(X_train_r, Y_train_r, X_test_r, Y_test_r,
                                 filename)
def correlation_matching(I_tr, T_tr, I_te, T_te, n_comps):
    """Learn a correlation-matching (CM) projection on the training pair
    (I_tr, T_tr) and apply it to both the training and the test data.

    Parameters
    ----------
    I_tr : np.ndarray [shape=(n_tr, d_I)]
        image data matrix for training
    T_tr : np.ndarray [shape=(n_tr, d_T)]
        text data matrix for training
    I_te : np.ndarray [shape=(n_te, d_I)]
        image data matrix for testing
    T_te : np.ndarray [shape=(n_te, d_T)]
        text data matrix for testing
    n_comps : int > 0 [scalar]
        number of canonical components to use

    Returns
    -------
    I_tr_cca, T_tr_cca : np.ndarray [shape=(n_tr, n_comps)]
        image/text training data represented in correlation space
    I_te_cca, T_te_cca : np.ndarray [shape=(n_te, n_comps)]
        image/text test data represented in correlation space
    """
    # Standardize each modality using statistics of its training split only.
    image_scaler = StandardScaler()
    I_tr = image_scaler.fit_transform(I_tr)
    I_te = image_scaler.transform(I_te)

    text_scaler = StandardScaler()
    T_tr = text_scaler.fit_transform(T_tr)
    T_te = text_scaler.transform(T_te)

    # Data is already standardized, so disable the estimator's own scaling.
    cca = PLSCanonical(n_components=n_comps, scale=False)
    cca.fit(I_tr, T_tr)

    I_tr_cca, T_tr_cca = cca.transform(I_tr, T_tr)
    I_te_cca, T_te_cca = cca.transform(I_te, T_te)
    return I_tr_cca, T_tr_cca, I_te_cca, T_te_cca
def feature_action_sensitivity(feature_type='TD4'):
    """For each feature, analyse its covariance with and without electrode shift.

    Single-subject variant: fits a 2-component PLSCanonical mapping between
    shifted-position features (X) and centre-position features (Y) for each
    shifted electrode position, then plots the transformed scores.

    Parameters
    ----------
    feature_type : str
        'TD4' (MAV/ZC/SSC/WL) or 'TD5' (TD4 + RMS).

    Raises
    ------
    ValueError
        If ``feature_type`` is not 'TD4' or 'TD5'.
    """
    results = []
    subjects = ['subject_' + str(i + 1) for i in range(1)]
    # Electrode positions: S0 is the centre; U/D/L/R are up/down/left/right shifts.
    channel_pos_list = ['S0',
                        'U1', 'U2', 'D1', 'D2',
                        'L1', 'L2', 'R1', 'R2']
    pos_num = len(channel_pos_list)
    actions = [i + 1 for i in range(7)]
    action_num = len(actions)  # number of action classes
    if feature_type == 'TD4':
        feature_list = ['MAV', 'ZC', 'SSC', 'WL']
    elif feature_type == 'TD5':
        feature_list = ['MAV', 'ZC', 'SSC', 'WL', 'RMS']
    else:
        # BUG FIX: an unknown feature_type previously crashed later with an
        # UnboundLocalError on feature_list; fail fast with a clear message.
        raise ValueError("unknown feature_type: %r" % (feature_type,))
    feat_num = len(feature_list)  # feature dimensions per channel
    groups = [i + 1 for i in range(4)]
    group_num = len(groups)  # number of channels
    group_span = group_num * feat_num  # columns per electrode position
    action_span = feat_num * group_num  # 16
    train_dir = 'train4_250_100'
    results.append(['subject', 'action', 'feature', 'group',
                    'means_shift', 'std_shift'])
    plsca = PLSCanonical(n_components=2)
    k = 0
    for pos_idx, pos_name in enumerate(channel_pos_list[1:]):
        pos = pos_idx + 1
        for subject in subjects:
            trains, classes = data_load.load_feature_dataset(
                train_dir, subject, feature_type)
            # First half of the samples for training, the rest for test.
            # BUG FIX: use integer division so m stays an int under Python 3.
            m = trains.shape[0] // 2
            X_train = trains[:m, group_span * pos: group_span * (pos + 1)]
            Y_train = trains[:m, :group_span]  # fixed stray ':m:' slice typo
            X_test = trains[m:, group_span * pos: group_span * (pos + 1)]
            Y_test = trains[m:, :group_span]
            plsca.fit(X_train, Y_train)
            X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
            X_test_r, Y_test_r = plsca.transform(X_test, Y_test)
            filename = subject + '_' + pos_name
            plot_plsc_figure_two(X_train_r, Y_train_r, X_test_r, Y_test_r,
                                 filename)
def plscorr_eval(train_fmri_ts, train_feat_ts, val_fmri_ts, val_feat_ts,
                 out_dir, mask_file):
    """Compute PLS correlation between brain activity and CNN activation.

    Fits a PLSCanonical model from CNN features to fMRI time series, then
    saves into ``out_dir``: the fitted model, a per-voxel prediction-accuracy
    map, reconstructed features, canonical scores and PLS weights.
    NOTE(review): Python 2-era code (``sklearn.externals.joblib``).
    """
    # Flatten activations to (time, units) and fMRI to (time, voxels).
    # assumes the feat arrays are 4-D with time on axis 3 -- TODO confirm
    train_feat_ts = train_feat_ts.reshape(-1, train_feat_ts.shape[3]).T
    val_feat_ts = val_feat_ts.reshape(-1, val_feat_ts.shape[3]).T
    train_fmri_ts = train_fmri_ts.T
    val_fmri_ts = val_fmri_ts.T
    # Iteration loop for different component number
    #for n in range(5, 19):
    #    print '--- Components number %s ---' %(n)
    #    plsca = PLSCanonical(n_components=n)
    #    plsca.fit(train_feat_ts, train_fmri_ts)
    #    pred_feat_c, pred_fmri_c = plsca.transform(val_feat_ts, val_fmri_ts)
    #    pred_fmri_ts = plsca.predict(val_feat_ts)
    #    # calculate correlation coefficient between truth and prediction
    #    r = corr2_coef(val_fmri_ts.T, pred_fmri_ts.T, mode='pair')
    #    # get top 20% corrcoef for model evaluation
    #    vsample = int(np.rint(0.2*len(r)))
    #    print 'Sample size for evaluation : %s' % (vsample)
    #    r.sort()
    #    meanr = np.mean(r[-1*vsample:])
    #    print 'Mean prediction corrcoef : %s' %(meanr)
    # model generation based on optimized CC number
    cc_num = 10
    plsca = PLSCanonical(n_components=cc_num)
    plsca.fit(train_feat_ts, train_fmri_ts)
    from sklearn.externals import joblib
    joblib.dump(plsca, os.path.join(out_dir, 'plsca_model.pkl'))
    plsca = joblib.load(os.path.join(out_dir, 'plsca_model.pkl'))
    # calculate correlation coefficient between truth and prediction
    pred_fmri_ts = plsca.predict(val_feat_ts)
    fmri_pred_r = corr2_coef(val_fmri_ts.T, pred_fmri_ts.T, mode='pair')
    # Write the per-voxel prediction correlation back into brain space.
    mask = vutil.data_swap(mask_file)
    vxl_idx = np.nonzero(mask.flatten() == 1)[0]
    tmp = np.zeros_like(mask.flatten(), dtype=np.float64)
    tmp[vxl_idx] = fmri_pred_r
    tmp = tmp.reshape(mask.shape)
    vutil.save2nifti(tmp, os.path.join(out_dir, 'pred_fmri_r.nii.gz'))
    # Reconstruct CNN features from fMRI and save them.
    # NOTE(review): the (96, 14, 14, 540) shape is hard-coded -- presumably
    # matches the CNN layer used upstream; verify against the caller.
    pred_feat_ts = pls_y_pred_x(plsca, val_fmri_ts)
    pred_feat_ts = pred_feat_ts.T.reshape(96, 14, 14, 540)
    np.save(os.path.join(out_dir, 'pred_feat.npy'), pred_feat_ts)
    # get PLS-CCA weights
    feat_cc, fmri_cc = plsca.transform(train_feat_ts, train_fmri_ts)
    np.save(os.path.join(out_dir, 'feat_cc.npy'), feat_cc)
    np.save(os.path.join(out_dir, 'fmri_cc.npy'), fmri_cc)
    feat_weight = plsca.x_weights_.reshape(96, 14, 14, cc_num)
    #feat_weight = plsca.x_weights_.reshape(96, 11, 11, cc_num)
    fmri_weight = plsca.y_weights_
    np.save(os.path.join(out_dir, 'feat_weights.npy'), feat_weight)
    np.save(os.path.join(out_dir, 'fmri_weights.npy'), fmri_weight)
    fmri_orig_ccs = get_pls_components(plsca.y_scores_, plsca.y_loadings_)
    np.save(os.path.join(out_dir, 'fmri_orig_ccs.npy'), fmri_orig_ccs)
def generate_transform_equations(trains_S0, trains_shift, **kw):
    # Fit one 12-component PLSCanonical model per shifted electrode position,
    # mapping that position's features back to the centre (S0) features, and
    # persist each fitted model to disk.  NOTE: Python 2 (print statements).
    # kw must provide: 'chan_len' (columns per position), 'pos_list'
    # (position names, one per column block) and 'subject' (used in filename).
    print 'generate transform equations.........'
    new_fold(transform_fold)
    chan_len = kw['chan_len']
    for idx, channel_pos in enumerate(kw['pos_list']):
        # Column block of this shifted position.
        X_trains = trains_shift[:,idx*chan_len:idx*chan_len+chan_len]
        plsca = PLSCanonical(n_components=12)
        plsca.fit(X_trains, trains_S0)
        # One model file per subject/position pair.
        joblib.dump(plsca, transform_fold+'/cca_transform_'+kw['subject']+'_'+channel_pos+'.model')
    print 'generate transform equations finished.........'
def generate_transform_equations(trains_S0, trains_shift, **kw): print 'generate transform equations.........' new_fold(transform_fold) chan_len = kw['chan_len'] for idx, channel_pos in enumerate(kw['pos_list']): X_trains = trains_shift[:, idx * chan_len:idx * chan_len + chan_len] plsca = PLSCanonical(n_components=12) plsca.fit(X_trains, trains_S0) joblib.dump( plsca, transform_fold + '/cca_transform_' + kw['subject'] + '_' + channel_pos + '.model') print 'generate transform equations finished.........'
def drawFaces(emb1, emb2, wordRanking, n, reduction="cut"):
    """
    Plot Chernoff faces for n most/less interesting words
    From: https://gist.github.com/aflaxman/4043086
    :param n: if negative: less interesting
    :param reduction: "cut", "svd" or "cca" -- how each embedding is reduced
        to the 18 values a Chernoff face consumes
    :return:
    """
    # NOTE(review): Python 2 code (xrange).
    s1 = None
    s2 = None
    if reduction=="cut":
        # Use the first 18 columns of the similarity matrices directly.
        s1 = emb1.getSimMatrix()[0:,0:18]
        s2 = emb2.getSimMatrix()[0:,0:18]
    elif reduction=="svd":
        # NOTE(review): `k` is not defined in this function -- presumably a
        # module-level constant; verify before using this branch.
        s1 = TruncatedSVD(n_components=k).fit_transform(emb1.getSimMatrix())
        s2 = TruncatedSVD(n_components=k).fit_transform(emb2.getSimMatrix())
    elif reduction=="cca":
        #use orginal embeddings, not similarity matrix for reduction
        cca = PLSCanonical(n_components=18)
        cca.fit(emb1.m, emb2.m)
        s1, s2 = cca.transform(emb1.m, emb2.m)
    interesting = list()
    name = str(n)+"."+reduction
    if n<0:
        #plot uninteresting words
        n *= -1
        interesting = [wordRanking[::-1][i] for i in xrange(n)]
    else:
        interesting = [wordRanking[i] for i in xrange(n)]
    fig = plt.figure(figsize=(11,11))
    c = 0
    for i in range(n):
        word = interesting[i]
        j = emb1.d[word]
        # Left column: face from embedding 1; right column: embedding 2.
        ax = fig.add_subplot(n,2,c+1,aspect='equal')
        mpl_cfaces.cface(ax, *s1[j])  #nice for similarity matrix *s1[j][:18]
        ax.axis([-1.2,1.2,-1.2,1.2])
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(word)
        ax2 = fig.add_subplot(n,2,c+2,aspect='equal')
        mpl_cfaces.cface(ax2, *s2[j])
        ax2.axis([-1.2,1.2,-1.2,1.2])
        ax2.set_xticks([])
        ax2.set_yticks([])
        ax2.set_title(word)
        c += 2
    plotname = "plots/"+NAME+".cface_s1s2_"+name+".png"
    fig.savefig(plotname)
    print("\tSaved Chernoff faces plot in '%s'" % (plotname))
class _PLSCanonicalImpl:
    """Thin adapter that delegates fit/transform/predict to the wrapped operator."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters and construct the underlying estimator.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's transform."""
        return self._wrapped_model.transform(X)

    def predict(self, X):
        """Delegate to the wrapped model's predict."""
        return self._wrapped_model.predict(X)
def pls(x, y, num_cc):
    """Partial least squares (SVD algorithm) between two data matrices.

    Parameters
    ----------
    x, y : array-like, shape (n, d_x) and (n, d_y)
        The two views, rows are matched samples.
    num_cc : int
        Number of canonical components to extract.

    Returns
    -------
    u : ndarray, shape (d_x, num_cc)
        X weights.
    v : ndarray, shape (d_y, num_cc)
        Y weights.
    ds : list of float
        The first ``num_cc`` diagonal entries of u' x' y v.
    """
    random.seed(42)  # kept for reproducibility parity with existing callers
    num_cc = int(num_cc)
    plsca = PLSCanonical(n_components=num_cc, algorithm='svd')
    fit = plsca.fit(x, y)
    u = fit.x_weights_
    v = fit.y_weights_
    a1 = np.matmul(np.matrix(x), np.matrix(u)).transpose()
    d = np.matmul(np.matmul(a1, np.matrix(y)), np.matrix(v))
    # BUG FIX: the diagonal length was hard-coded to 30, which raises
    # IndexError whenever num_cc < 30 (d is num_cc x num_cc) and silently
    # truncates when num_cc > 30; use num_cc instead.
    ds = [d[i, i] for i in range(num_cc)]
    return u, v, ds
def PLS_Canonical(csv_data, point_index, sub_index, var_name, train=None,
                  components=None):
    """Fit a one-component PLSCanonical between a block of predictor columns
    and each of seven response columns, printing R^2 for every response.

    Parameters
    ----------
    csv_data : np.ndarray
        2-D data table; columns are selected by the 1-based indices below.
    point_index : int
        1-based index of the first of 9 consecutive predictor columns.
    sub_index : int
        1-based index of the first of 7 response columns.
    var_name : sequence of str
        Column names, used only for printing.
    train, components :
        Unused; kept for backward-compatible interface.
    """
    # Predictors: 9 consecutive columns starting at point_index (1-based).
    X_array = np.array([row[point_index - 1:point_index + 8]
                        for row in csv_data])
    # Idiom fix: compare to None with `is`, not `==`.
    if components is None:
        components = np.shape(X_array)[1]
    for i in range(7):
        Y_array = np.array(csv_data[:, sub_index - 1 + i])
        # NOTE(review): n_components is fixed at 1; `components` above is
        # computed but never used -- preserved for interface compatibility.
        plsca = PLSCanonical(n_components=1)
        plsca.fit(X_array, Y_array)
        print(var_name[sub_index + i])
        print("R^2 =", np.around(plsca.score(X_array, Y_array), decimals=2))
def getCCARanking(self, filter=None):
    """
    Compare how far apart words are in the CCA common-space projection.

    Returns a list of (word, similarity) pairs sorted by decreasing
    similarity; when ``filter`` is given, only words in it are kept
    (order preserved).  NOTE: Python 2 code (xrange/iteritems).
    """
    cca = PLSCanonical(n_components=self.n)
    cca.fit(self.emb1.m, self.emb2.m)
    m1transformed, m2transformed = cca.transform(self.emb1.m, self.emb2.m)
    #get distances between vectors
    assert self.emb1.vocab_size == self.emb2.vocab_size
    distDict = dict()
    for i in xrange(self.emb1.vocab_size):
        v1 = m1transformed[i]
        v2 = m2transformed[i]
        w = self.emb1.rd[i]
        distDict[w] = 1-Similarity.euclidean(v1,v2)
    ranked = sorted(distDict.iteritems(), key=itemgetter(1), reverse=True)
    if filter is not None:
        # BUG FIX: previously this rebuilt the result from the *unsorted*
        # dict (distDict.iteritems()), silently discarding the ranking;
        # filter the already-sorted list instead.
        ranked = [(w, s) for (w, s) in ranked if w in filter]
    return ranked
def plotClustersCCA(self, filter=None):
    """
    Plot clusters in 2-dim CCA space (comparable across embeddings).

    Words can be restricted via ``filter``; otherwise at most 100 randomly
    sampled words are plotted to keep the figure readable.
    NOTE: Python 2 code (xrange).
    """
    # One color per cluster; a 'jet' map needs at least two entries.
    if len(self.cluster1) <= 1:
        cmap1 = plt.get_cmap('jet', 2)
    else:
        cmap1 = plt.get_cmap('jet', len(self.cluster1))
    cmap1.set_under('gray')
    if len(self.cluster2) <= 1:
        cmap2 = plt.get_cmap('jet', 2)
    else:
        cmap2 = plt.get_cmap('jet', len(self.cluster2))
    cmap2.set_under('gray')
    cca = PLSCanonical(n_components=2)
    cca.fit(self.emb1.m, self.emb2.m)
    m1transformed, m2transformed = cca.transform(self.emb1.m, self.emb2.m)
    labels1 = [self.emb1.rd[i] for i in xrange(self.emb1.vocab_size)]
    colors1 = [self.word2cluster1[self.emb1.rd[i]] for i in xrange(self.emb1.vocab_size)]
    labels2 = [self.emb2.rd[i] for i in xrange(self.emb2.vocab_size)]
    colors2 = [self.word2cluster2[self.emb2.rd[i]] for i in xrange(self.emb2.vocab_size)]
    if filter is not None:
        print("\tFiltering samples to plot")
        filteredIds = [self.emb1.d[w] for w in filter]  # ids for words in filter
        m1transformed = m1transformed[filteredIds]
        m2transformed = m2transformed[filteredIds]
        labels1 = [l for l in labels1 if l in filter]
        labels2 = [l for l in labels2 if l in filter]
    elif m1transformed.shape[0] > 100:
        # BUG FIX: this branch referenced the non-existent ``m1.transformed``
        # (AttributeError), drew a single scalar from np.random.randint
        # instead of a sample of indices, and filtered the labels against
        # ``filter`` which is None in this branch.  Sample 100 distinct row
        # indices and subset labels by those indices instead.
        filteredIds = np.random.choice(m1transformed.shape[0], 100,
                                       replace=False)
        m1transformed = m1transformed[filteredIds]
        m2transformed = m2transformed[filteredIds]
        labels1 = [labels1[i] for i in filteredIds]
        labels2 = [labels2[i] for i in filteredIds]
    plotWithLabelsAndColors(m1transformed, labels1, colors=colors1, cmap=cmap1,
                            filename="plots/"+NAME+".cca1.png", dimRed="CCA")
    plotWithLabelsAndColors(m2transformed, labels2, colors=colors2, cmap=cmap2,
                            filename="plots/"+NAME+".cca2.png", dimRed="CCA")
# NOTE(review): this chunk begins mid-script -- the train/test split starts
# above this view (X_train is assigned there).  Second half of each array is
# held out for testing.
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]

# Show the within-view correlation structure of the raw data.
print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))

# #############################################################################
# Canonical (symmetric) PLS

# Transform data
# ~~~~~~~~~~~~~~
plsca = PLSCanonical(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On diagonal plot X vs Y scores on each components
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.plot(X_train_r[:, 0], Y_train_r[:, 0], "ob", label="train")
plt.plot(X_test_r[:, 0], Y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
plt.xticks(())
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSCanonical
from sklearn.neighbors import KNeighborsClassifier
import math
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Reduce the digits data to 2 PLS components and score a kNN classifier
# (k = sqrt(n_samples)) on the training data itself.
dataSet = datasets.load_digits()
data = dataSet["data"]
target = dataSet["target"]
plsca = PLSCanonical(n_components=2)
plsca.fit(data, target)
X_train_r, Y_train_r = plsca.transform(data, target)
knn = math.sqrt(len(X_train_r))
knn = KNeighborsClassifier(n_neighbors=int(knn))
# PLS returns a continuous projection of the targets; cast back to int labels.
Y_train_r = [int(Y_train_r[i]) for i in range(0, len(Y_train_r))]
k = knn.fit(X_train_r, Y_train_r)
print(k.score(X_train_r, Y_train_r))
# Sequential forward selection with a fixed k=4 kNN.
knn = KNeighborsClassifier(n_neighbors=4)
# NOTE(review): the call below is truncated in the visible source.
sfs = SFS(knn, k_features=3, forward=True, floating=False, verbose=2,
# NOTE(review): this chunk begins inside an earlier loop over LDA component
# counts -- `i`, `mnist`, `lda_train`, `train_targets`, `max_value`,
# `max_number` and `SFS` are defined outside this view.  Indentation of the
# first statements is reconstructed by analogy with the Zadanie 4 loop below.
    knn = KNeighborsClassifier(round(math.sqrt(mnist.data.shape[0])),
                               metric='euclidean', weights='uniform')
    knn.fit(lda_train, train_targets)
    print("Score for ", i, " components: ", knn.score(lda_test, test_targets))
    if max_value < knn.score(lda_test, test_targets):
        max_value = knn.score(lda_test, test_targets)
        max_number = i
print("Max for: ", max_number, " is: ", max_value)

# Zadanie 4:
# Search for the PLS component count giving the best kNN test accuracy.
max_value = 0
max_number = 0
for i in range(1, 6):
    plsca = PLSCanonical(n_components=i)
    plsca.fit(train, train_targets)
    pls_train = plsca.fit(train, train_targets).transform(train)
    # NOTE(review): the model is re-fitted on the *test* data before
    # transforming it; the test set should instead be transformed with the
    # model fitted on the training data -- re-fitting leaks test labels.
    pls_test = plsca.fit(test, test_targets).transform(test)
    knn = KNeighborsClassifier(round(math.sqrt(mnist.data.shape[0])),
                               metric='euclidean', weights='uniform')
    knn.fit(pls_train, train_targets)
    print("Score for ", i, " components: ", knn.score(pls_test, test_targets))
    if max_value < knn.score(pls_test, test_targets):
        max_value = knn.score(pls_test, test_targets)
        max_number = i
print("Max for: ", max_number, " is: ", max_value)

# Zadanie 5:
knn = KNeighborsClassifier(round(math.sqrt(mnist.data.shape[0])))
# NOTE(review): the call below is truncated in the visible source.
sfs = SFS(knn,
# Check kNN accuracy on a test set reduced to PLS components; k is taken as
# the square root of the number of objects in the dataset.  For which number
# of components are the best results obtained?
from sklearn import datasets
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
import math
from sklearn.cross_decomposition import PLSCanonical

mnist_dataset = datasets.load_digits()
X = mnist_dataset.data
Y = mnist_dataset.target
target_names = mnist_dataset.target_names

# 50/50 train/test split.
train, test, train_targets, test_targets = model_selection.train_test_split(
    X, Y, train_size=0.5, test_size=0.5)

best_score = 0          # renamed from `max`, which shadowed the builtin
best_n_components = 0
for i in range(1, 10):
    plsca = PLSCanonical(n_components=i)
    plsca.fit(train, train_targets)
    X_r = plsca.transform(train)
    # BUG FIX: the test data was previously transformed by a model re-fitted
    # on the test set itself (plsca.fit(test, test_targets).transform(test)),
    # leaking test labels; transform with the train-fitted model instead.
    Y_r = plsca.transform(test)
    clf = KNeighborsClassifier(round(math.sqrt(X.shape[0])),
                               weights="uniform", metric="euclidean")
    clf.fit(X_r, train_targets)
    score = clf.score(Y_r, test_targets)
    print(i, ":", score)
    if best_score < score:
        best_score = score
        best_n_components = i
print("Best result for:", best_n_components)
#correct not accurate
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical

# Predict the target columns from the remaining features and compare the
# R^2 of PLS regression against canonical PLS on a held-out split.
df = pd.read_csv('newdata.csv')
x = df.drop(['tag'], axis=1)
y = df.drop(['kx', 'ky', 'kz', 'wa', 'wb', 'wc', 'wd', 'we', 'wf'], axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=5)

plsr = PLSRegression()
plsr.fit(X_train, Y_train)
plsc = PLSCanonical()
plsc.fit(X_train, Y_train)

print(plsr.score(X_test, Y_test))
print(plsc.score(X_test, Y_test))
#correct not accurate
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection for train_test_split instead.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical

# Compare PLS regression and canonical PLS R^2 on a held-out split.
df = pd.read_csv('newdata.csv')
x = df.drop(['tag'], axis=1)
y = df.drop(['kx', 'ky', 'kz', 'wa', 'wb', 'wc', 'wd', 'we', 'wf'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=5)
plsr = PLSRegression()
plsr.fit(X_train, Y_train)
plsc = PLSCanonical()
plsc.fit(X_train, Y_train)
print(plsr.score(X_test, Y_test))
print(plsc.score(X_test, Y_test))
class Wrapper:
    """
    This is a wrapper class for linear, regularised and kernel CCA, Multiset
    CCA and Generalized CCA.

    We create an instance with a method and number of latent dimensions.
    If we have more than 2 views we need to use generalized methods, but we
    can override in the 2 view case also with the generalized parameter.

    The class has a number of methods:
    fit(): gives us train correlations and stores the variables needed for
    out of sample prediction as well as some method-specific variables
    cv_fit(): allows us to perform a hyperparameter search and then fit the
    model using the optimal hyperparameters
    predict_corr(): allows us to predict the out of sample correlation for
    supplied views
    predict_view(): allows us to predict a reconstruction of missing views
    from the supplied views
    transform_view(): allows us to transform given views to the latent
    variable space
    remaining methods are used to

    NOTE(review): indentation reconstructed from a whitespace-collapsed
    source; nesting of the post-outer_loop rotation block and of the
    deflation step was inferred from data-flow (loading_list only exists
    after outer_loop / fit_scikit_cca) -- confirm against upstream cca_zoo.
    """

    def __init__(self, latent_dims: int = 1, method: str = 'l2',
                 generalized: bool = False, max_iter: int = 500, tol=1e-6):
        # latent_dims: number of canonical components k to extract
        # method: 'kernel', 'pls', 'scikit', 'mcca', 'gcca', 'tree*' or an
        #         iterative method name passed to the ALS inner loop
        self.latent_dims = latent_dims
        self.method = method
        self.generalized = generalized
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, *args, params=None):
        """Fit the selected method on the given views (one array per view).

        Stores demeaned views, per-view means, score/weight/rotation lists
        and the in-sample correlations (``self.train_correlations``).
        """
        if params is None:
            params = {}
        self.params = params
        if len(args) > 2:
            self.generalized = True
            print('more than 2 views therefore switched to generalized')
        if 'c' not in self.params:
            # Default: no regularisation for any view.
            self.params = {'c': [0] * len(args)}
        if self.method == 'kernel':
            #Linear kernel by default
            if 'kernel' not in self.params:
                self.params['kernel'] = 'linear'
            #First order polynomial by default
            if 'degree' not in self.params:
                self.params['degree'] = 1
            # First order polynomial by default
            if 'sigma' not in self.params:
                self.params['sigma'] = 1.0
        # Fit returns in-sample score vectors and correlations as well as models with transform functionality
        self.dataset_list = []
        self.dataset_means = []
        for dataset in args:
            self.dataset_means.append(dataset.mean(axis=0))
            self.dataset_list.append(dataset - dataset.mean(axis=0))
        if self.method == 'kernel':
            self.fit_kcca = cca_zoo.KCCA.KCCA(self.dataset_list[0], self.dataset_list[1],
                                              params=self.params, latent_dims=self.latent_dims)
            self.score_list = [self.fit_kcca.U, self.fit_kcca.V]
        elif self.method == 'pls':
            self.fit_scikit_pls(self.dataset_list[0], self.dataset_list[1])
        elif self.method == 'scikit':
            self.fit_scikit_cca(self.dataset_list[0], self.dataset_list[1])
        elif self.method == 'mcca':
            self.fit_mcca(*self.dataset_list)
        elif self.method == 'gcca':
            self.fit_gcca(*self.dataset_list)
        else:
            # Iterative (ALS) methods, including the tree-based variants.
            self.outer_loop(*self.dataset_list)
            if self.method[:4] == 'tree':
                self.tree_list = [self.tree_list[i] for i in range(len(args))]
                self.weights_list = [np.expand_dims(tree.feature_importances_, axis=1)
                                     for tree in self.tree_list]
            else:
                # Rotations map a new (demeaned) view into the latent space.
                self.rotation_list = []
                for i in range(len(args)):
                    self.rotation_list.append(
                        self.weights_list[i] @ pinv2(self.loading_list[i].T @ self.weights_list[i],
                                                     check_finite=False))
        self.train_correlations = self.predict_corr(*args)
        return self

    def cv_fit(self, *args, param_candidates=None, folds: int = 5, verbose: bool = False):
        """Grid-search hyperparameters by cross-validation, then fit with the best."""
        best_params = cross_validate(*args, max_iter=self.max_iter, latent_dims=self.latent_dims,
                                     method=self.method, param_candidates=param_candidates,
                                     folds=folds, verbose=verbose, tol=self.tol)
        self.fit(*args, params=best_params)
        return self

    def bayes_cv_fit(self, *args, param_candidates=None, folds: int = 5, verbose: bool = False):
        """Hyperparameter search with hyperopt TPE, then fit with the best params.

        NOTE(review): ``fn=Wrapper()`` hands a Wrapper *instance* to hyperopt
        as the objective, which is not callable -- verify before use.
        """
        space = {
            "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400, 500, 600]),
            "max_depth": hp.quniform("max_depth", 1, 15, 1),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
        }
        trials = Trials()
        best_params = fmin(
            fn=Wrapper(),
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials
        )
        self.fit(*args, params=best_params)
        return self

    def predict_corr(self, *args):
        """Return pairwise latent-space correlations, shape (views, views, k)."""
        # Takes two datasets and predicts their out of sample correlation using trained model
        transformed_views = self.transform_view(*args)
        all_corrs = []
        for x, y in itertools.product(transformed_views, repeat=2):
            all_corrs.append(np.diag(np.corrcoef(x.T, y.T)[:self.latent_dims, self.latent_dims:]))
        all_corrs = np.array(all_corrs).reshape((len(args), len(args), self.latent_dims))
        return all_corrs

    def predict_view(self, *args):
        """Reconstruct views passed as None from the mean latent code of the others."""
        # Regress original given views onto target
        transformed_views = self.transform_view(*args)
        # Get the regression from the training data with available views
        predicted_target = np.mean([transformed_views[i] for i in range(len(args))
                                    if args[i] is not None], axis=0)
        predicted_views = []
        for i, view in enumerate(args):
            if view is None:
                # Map the latent code back to view space via the weights' pseudo-inverse.
                predicted_views.append(predicted_target @ pinv2(self.weights_list[i]))
            else:
                predicted_views.append(view)
        for i, predicted_view in enumerate(predicted_views):
            # Undo the demeaning applied during fit.
            predicted_views[i] += self.dataset_means[i]
        return predicted_views

    def transform_view(self, *args):
        """Project each supplied view into the latent space (None stays None)."""
        # Demeaning
        new_views = []
        for i, new_view in enumerate(args):
            if new_view is None:
                new_views.append(None)
            else:
                new_views.append(new_view - self.dataset_means[i])
        if self.method == 'kernel':
            transformed_views = list(self.fit_kcca.transform(new_views[0], new_views[1]))
        elif self.method == 'pls':
            transformed_views = list(self.PLS.transform(new_views[0], new_views[1]))
        elif self.method[:4] == 'tree':
            transformed_views = []
            for i, new_view in enumerate(new_views):
                if new_view is None:
                    transformed_views.append(None)
                else:
                    transformed_views.append(self.tree_list[i].predict(new_view))
        else:
            transformed_views = []
            for i, new_view in enumerate(new_views):
                if new_view is None:
                    transformed_views.append(None)
                else:
                    transformed_views.append(new_view @ self.rotation_list[i])
        # d x n x k
        return transformed_views

    def outer_loop(self, *args):
        """Deflation loop: extract latent_dims components one at a time via ALS."""
        # list of d: p x k
        self.weights_list = [np.zeros((args[i].shape[1], self.latent_dims)) for i in range(len(args))]
        # list of d: n x k
        self.score_list = [np.zeros((args[i].shape[0], self.latent_dims)) for i in range(len(args))]
        # list of d:
        self.loading_list = [np.zeros((args[i].shape[1], self.latent_dims)) for i in range(len(args))]
        if len(args) == 2:
            C_train = args[0].T @ args[1]
            C_train_res = C_train.copy()
        else:
            C_train_res = None
        residuals = list(args)
        # For each of the dimensions
        for k in range(self.latent_dims):
            self.inner_loop = cca_zoo.alternating_least_squares.ALS_inner_loop(
                *residuals, C=C_train_res, generalized=self.generalized,
                params=self.params, method=self.method, max_iter=self.max_iter)
            for i in range(len(args)):
                if self.method[:4] == 'tree':
                    self.tree_list = self.inner_loop.weights
                else:
                    self.weights_list[i][:, k] = self.inner_loop.weights[i]
                    self.score_list[i][:, k] = self.inner_loop.targets[i, :]
                    self.loading_list[i][:, k] = residuals[i].T @ self.score_list[i][:, k] / np.linalg.norm(
                        self.score_list[i][:, k])
                    # Deflate: remove this component from the residual view.
                    residuals[i] -= np.outer(self.score_list[i][:, k] / np.linalg.norm(self.score_list[i][:, k]),
                                             self.loading_list[i][:, k])
        return self

    def fit_scikit_cca(self, train_set_1, train_set_2):
        """Two-view CCA via scikit-learn's CCA estimator."""
        self.cca = CCA(n_components=self.latent_dims, scale=False)
        self.cca.fit(train_set_1, train_set_2)
        self.score_list = [self.cca.x_scores_, self.cca.y_scores_]
        self.weights_list = [self.cca.x_weights_, self.cca.y_weights_]
        self.loading_list = [self.cca.x_loadings_, self.cca.y_loadings_]
        self.rotation_list = [self.cca.x_rotations_, self.cca.y_rotations_]
        return self

    def fit_scikit_pls(self, train_set_1, train_set_2):
        """Two-view PLS via scikit-learn's PLSCanonical estimator."""
        self.PLS = PLSCanonical(n_components=self.latent_dims, scale=False)
        self.PLS.fit(train_set_1, train_set_2)
        self.score_list = [self.PLS.x_scores_, self.PLS.y_scores_]
        self.weights_list = [self.PLS.x_weights_, self.PLS.y_weights_]
        return self

    def fit_mcca(self, *args):
        """Multiset CCA via a generalized eigenproblem on the joint covariance."""
        all_views = np.concatenate(args, axis=1)
        C = all_views.T @ all_views
        # Can regularise by adding to diagonal
        D = block_diag(*[(1 - self.params['c'][i]) * m.T @ m + self.params['c'][i] * np.eye(m.shape[1])
                         for i, m in enumerate(args)])
        R = cholesky(D, lower=False)
        whitened = np.linalg.inv(R.T) @ C @ np.linalg.inv(R)
        [eigvals, eigvecs] = np.linalg.eig(whitened)
        # Sort eigenpairs by decreasing eigenvalue; keep real parts.
        idx = np.argsort(eigvals, axis=0)[::-1]
        eigvecs = eigvecs[:, idx].real
        eigvals = eigvals[idx].real
        eigvecs = np.linalg.inv(R) @ eigvecs
        # Split the stacked eigenvectors back into per-view weight blocks.
        splits = np.cumsum([0] + [view.shape[1] for view in args])
        self.weights_list = [eigvecs[splits[i]:splits[i + 1], :self.latent_dims] for i in range(len(args))]
        self.rotation_list = self.weights_list
        self.score_list = [self.dataset_list[i] @ self.weights_list[i] for i in range(len(args))]

    def fit_gcca(self, *args):
        """Generalized CCA via eigendecomposition of the sum of projections."""
        Q = []
        for i, view in enumerate(args):
            view_cov = view.T @ view
            view_cov = (1 - self.params['c'][i]) * view_cov + self.params['c'][i] * np.eye(view_cov.shape[0])
            Q.append(view @ np.linalg.inv(view_cov) @ view.T)
        Q = np.sum(Q, axis=0)
        [eigvals, eigvecs] = np.linalg.eig(Q)
        idx = np.argsort(eigvals, axis=0)[::-1]
        eigvecs = eigvecs[:, idx].real
        eigvals = eigvals[idx].real
        self.weights_list = [np.linalg.pinv(view) @ eigvecs[:, :self.latent_dims] for view in args]
        self.rotation_list = self.weights_list
        self.score_list = [self.dataset_list[i] @ self.weights_list[i] for i in range(len(args))]
# NOTE(review): chunk begins mid-script; `edge_data`, `include_negative_weights`,
# `n_subjects` and `svm` are defined outside this view.
if not include_negative_weights:
    # set negative connectivities to 0
    edge_data = np.apply_along_axis(
        lambda x: [0 if element < 0 else element for element in x], 1,
        edge_data)

# re-split data (3 ways) for CCA
X1_train = edge_data[:140, :]
X2_train = edge_data[140:280, :]
X2_remain = edge_data[280:, :]

#cca = CCA(n_components =2)
#cca.fit(X1_train, X2_train)
cca = PLSCanonical(n_components=100)
cca.fit(X1_train, X2_train)
block_1_transformed, block_2_transformed = cca.transform(X1_train, X2_train,
                                                         copy=False)
# Project the held-out block with the learned y-rotations.
block_3_transformed = np.dot(X2_remain, cca.y_rotations_)
edge_data_transformed = np.vstack(
    (block_1_transformed, block_2_transformed, block_3_transformed))

# initialise the classifier
clf = svm.SVC(kernel='precomputed')

# optional shuffle
perm = np.random.permutation(n_subjects)
#print perm
#print n_subjects
# NOTE(review): chunk begins mid-script; `nComponents`, `plsSvdScores`,
# `Xtrain`, `Ytrain`, `labelsTrain`, `nClasses`, `util`, `classColors` are
# defined outside this view.  The structure repeats twice below (the source
# contains two near-identical copies of the same cell).
plt.xlim(1, np.amax(nComponents))
plt.title('PLS SVD accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):  # disabled PLS-Canonical experiment
    #%% PLS Cannonical
    # Score five classifiers for each candidate number of PLS components.
    nComponents = np.arange(1, nClasses + 1)
    plsCanScores = np.zeros((5, np.alen(nComponents)))
    for i, n in enumerate(nComponents):
        plscan = PLSCanonical(n_components=n)
        plscan.fit(Xtrain, Ytrain)
        XtrainT = plscan.transform(Xtrain)
        XtestT = plscan.transform(Xtest)
        plsCanScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
    # Visualise the first two PLS components of the training data.
    plscan = PLSCanonical(n_components=2)
    plscan.fit(Xtrain, Ytrain)
    xt = plscan.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

#%% Plot accuracies for PLSSVD
plt.figure()
for i in range(5):
    plt.plot(nComponents, plsSvdScores[i, :], lw=3)
plt.xlim(1, np.amax(nComponents))
plt.title('PLS SVD accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):  # second disabled copy of the PLS-Canonical experiment
    #%% PLS Cannonical
    nComponents = np.arange(1, nClasses + 1)
    plsCanScores = np.zeros((5, np.alen(nComponents)))
    for i, n in enumerate(nComponents):
        plscan = PLSCanonical(n_components=n)
        plscan.fit(Xtrain, Ytrain)
        XtrainT = plscan.transform(Xtrain)
        XtestT = plscan.transform(Xtest)
        plsCanScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)
    plscan = PLSCanonical(n_components=2)
    plscan.fit(Xtrain, Ytrain)
    xt = plscan.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

#%% Plot accuracies for PLSSVD
plt.figure()
# NOTE(review): chunk begins mid-script; `dfc_g1`, `df`, `collated_filtered`,
# `scale` and `smf` are defined outside this view.  Python 2 code (bare
# `print` statement below).
plt.plot([0, 1], [0, 1])
plt.xlim([0, 1])
plt.gca().set_aspect('equal', adjustable='box')
plt.legend(['Cell volume', 'Age', 'Both'])
#NB: Strong colinearity between Age and Volume

# Transition rate prediction using PLS
X = dfc_g1[['vol_sm', 'Age', 'gr_sm']]  # Design matrix
y = dfc_g1['G1S_logistic']  # Response var
# Drop NaN rows
I = np.isnan(dfc_g1['gr_sm'])
X = X.loc[~I].copy()
y = y[~I]

pls_model = PLSCanonical()
pls_model.fit(scale(X), y)
X_c, y_c = pls_model.transform(scale(X), y)

# Multiple linearregression on birth size and growth rate
df['bvol'] = df['Birth volume']
df['exp_gr'] = df['Exponential growth rate']
df['g1_len'] = df['G1 length']
model = smf.ols('g1_len ~ exp_gr + bvol', data=df).fit()
model.summary()
print model.pvalues

# Delete S/G2 after first time point
# NOTE(review): this loop continues beyond the visible source.
g1s_marked = []
for c in collated_filtered:
    c = c[c['Phase'] != 'Daughter G1'].copy()
def training_lda_TD4_intra(my_clfs, trains, classes, **kw):
    """Score LDA on TD4 features under several intra-session training
    strategies and append the accuracies to a log file.

    trains  : 2-D array, samples x (len(pos_list) * chan_len) feature columns,
              one chan_len-wide slice per electrode position.
    classes : per-sample class labels.
    kw      : expects 'chan_len', 'action_num', 'feature_type', 'pos_list';
              optional 'log_fold'.
              NOTE(review): log_fold is used unconditionally at the end, so
              omitting kw['log_fold'] would raise NameError -- confirm callers
              always pass it.

    Python 2 code (print statements, dict.has_key, integer '/').
    """
    start_time = time.time()
    if(kw.has_key('log_fold')):
        log_fold = root_path + '/result/' + kw['log_fold']
        new_fold(log_fold)
    chan_len = kw['chan_len']
    action_num = kw['action_num']
    cv = 3
    results = []
    # Header row of the result table that is logged at the end.
    results.append( ['Feat', 'Algorithm','n_components', 'Channel_Pos', 'Accuracy', 'std'])
    log_file = 'feat_'+kw['feature_type']+'_intra'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False, tol=0.0001)
    # Python 2 int division: samples per action class.
    data_num = trains.shape[0]/action_num
    # Baseline: cross-validated accuracy using all positions' features at once.
    scores = sklearn.cross_validation.cross_val_score(clf, trains, classes, cv=cv)
    results.append(['feat_TD4_cv_'+str(cv), 'lda', 'ALL', 0, scores.mean(), scores.std()])

    # Intra-group strategy: CV within each of the 9 electrode positions.
    print '组内训练.............'
    for idx, channel_pos in enumerate(kw['pos_list']):
        # print '----training TD4 intra , channel_pos: ', channel_pos,'......'
        trains_intra = trains[:,idx*chan_len: idx*chan_len+chan_len]
        scores = sklearn.cross_validation.cross_val_score( clf, trains_intra, classes, cv=cv)
        results.append(['feat_TD4_cv_'+str(cv), 'lda', 0, channel_pos, scores.mean(), scores.std()])

    # Centre strategy: train on centre position S0, test on each shifted position.
    print '中心训练策略.............'
    trains_intra_S0 = trains[:,0:chan_len]
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0': continue
        tests_shift = trains[:,idx*chan_len: idx*chan_len+chan_len]
        # if channel_pos == 'L2':
        #     print idx*chan_len, idx*chan_len+chan_len, tests_shift.shape, trains.shape
        #     sys.exit(0)
        scores = clf.fit(trains_intra_S0, classes).score(tests_shift, classes)
        results.append(['feat_TD4_cv_'+str(cv), 'lda', 0,
                        'train S0' + ' test ' + channel_pos, scores.mean(), scores.std()])

    # Group strategy (differs from the intra-group strategy): k-fold where each
    # training fold combines S0 features with the shifted position's features.
    print '组训练策略.............'
    trains_intra_S0 = trains[:,0:chan_len]
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0': continue
        itera = cv
        scores = np.zeros( (itera,) )
        # stds = np.zeros( (itera,) )
        itera -= 1  # scores[] is filled from the last slot downwards
        trains_shift = trains[:,idx*chan_len: idx*chan_len+chan_len]
        for train_idx, test_idx in kf:
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                # NOTE(review): multiplying the fold indices by (action_idx+1)
                # looks intended to offset per-action blocks, but an offset
                # would normally be train_idx + action_idx*data_num -- confirm.
                train_idx_all = np.concatenate( (train_idx_all, train_idx*(action_idx+1)), axis=0)
                test_idx_all = np.concatenate( (test_idx_all, test_idx*(action_idx+1)), axis=0)
            # Training set = S0 rows plus the same rows from the shifted position.
            X_train = np.concatenate( (trains_intra_S0[train_idx_all], trains_shift[train_idx_all]), axis=0)
            y_train = np.concatenate( (classes[train_idx_all], classes[train_idx_all]), axis=0)
            X_test = trains_shift[test_idx_all]
            y_test = classes[test_idx_all]
            # X_test = trains_shift
            # y_test = classes
            score = clf.fit(X_train, y_train).score(X_test, y_test)
            scores[itera] = score.mean()
            itera -= 1
        # print scores
        results.append(['feat_TD4_cv_'+str(cv), 'lda', 0, 'S0 + '+channel_pos,
                        np.mean(scores), np.std(scores)])

    # CCA-based strategy: align shifted-position features with S0 via
    # PLSCanonical, then train/test in the shared latent space (k-fold CV).
    print 'CCA训练策略.............'
    trains_S0 = trains[:,0:chan_len]
    n_components_list = [6, 8, 10, 12, 14, 16]  # subspace dimensionalities to sweep
    # n_components_list = [12,14,16]
    kf = KFold(data_num, n_folds=cv)
    for n_components in n_components_list:
        for idx, channel_pos in enumerate(kw['pos_list']):
            if channel_pos == 'S0': continue
            itera = cv
            scores = np.zeros( (itera,) )
            stds = np.zeros( (itera,) )
            itera -= 1
            trains_shift = trains[:,idx*chan_len: idx*chan_len+chan_len]
            for train_idx, test_idx in kf:
                train_idx_all = np.array([], np.int)
                test_idx_all = np.array([], np.int)
                for action_idx in range(action_num):
                    # NOTE(review): same index-scaling concern as above.
                    train_idx_all = np.concatenate( (train_idx_all, train_idx*(action_idx+1)), axis=0)
                    test_idx_all = np.concatenate( (test_idx_all, test_idx*(action_idx+1)), axis=0)
                # print train_idx_all.shape, train_idx_all, test_idx_all.shape, test_idx_all
                # plsca.fit(trains_shift[train_idx_all], trains_S0[train_idx_all])
                # NOTE(review): the alignment is fit on ALL samples (including
                # the test fold), which leaks test data into the transform.
                plsca = PLSCanonical(n_components=n_components)
                plsca.fit(trains_shift, trains_S0)
                trains_shift_cca, trains_S0_cca = plsca.transform(trains_shift, trains_S0)
                X_trains = np.concatenate( (trains_S0_cca, trains_shift_cca[train_idx_all]), axis=0)
                y_trains = np.concatenate( (classes, classes[train_idx_all]), axis=0)
                score = clf.fit(X_trains, y_trains).score(trains_shift_cca[test_idx_all],
                                                          classes[test_idx_all])
                scores[itera] = score.mean()
                # stds[itera] = score.std()
                itera -= 1
            results.append(['feat_TD4_cv_'+str(cv), 'lda_cca', n_components,
                            'S0 + '+channel_pos, np.mean(scores), np.std(scores)])
    log_result(results, log_fold + '/' + log_file + '_action_1-'+str(action_num), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_action_1-'+str(action_num)
    print '----training TD4 time elapsed:', time.time() - start_time
# Scatter the first two Y-score components for train vs. test, annotate the
# title with the test-set correlation, then save and close the figure.
plt.plot(Y_train_r[:, 0], Y_train_r[:, 1], "*b", label="train")
plt.plot(Y_test_r[:, 0], Y_test_r[:, 1], "*r", label="test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)'
          % numpy.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())
plt.savefig(output_file)
plt.close()

# PLSCA: project train/test onto 2 canonical components and plot.
plsca = PLSCanonical(n_components=2)
plsca.fit(Xtrain, Ytrain)
# PLSCanonical(algorithm='nipals', copy=True, max_iter=500, n_components=2,
#              scale=True, tol=1e-06)
X_train_r, Y_train_r = plsca.transform(Xtrain, Ytrain)
X_test_r, Y_test_r = plsca.transform(Xtest, Ytest)
do_plot(X_train_r, Y_train_r, X_test_r, Y_test_r, '%s/PLSCA_2comp_norm.pdf' % output_folder)

# CCA
# probably not necessary, but just in case the data was modified in some way
Ytrain = norm.loc[train, :]
Ytest = norm.loc[holdout, :]
Xtrain = numpy.array(X.loc[train, :])
Xtest = X.loc[holdout, :]
cca = CCA(n_components=2)
cca.fit(Xtrain, Ytrain)
def test_sanity_check_pls_canonical_random():
    # Sanity check for PLSCanonical on random data
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latents vars:
    rng = check_random_state(11)
    l1 = rng.normal(size=n)
    l2 = rng.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    # X and Y share the latent structure plus independent Gaussian noise,
    # then pure-noise columns are appended to each.
    X = latents + rng.normal(size=4 * n).reshape((n, 4))
    Y = latents + rng.normal(size=4 * n).reshape((n, 4))
    X = np.concatenate((X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
    Y = np.concatenate((Y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)
    pls = PLSCanonical(n_components=3)
    pls.fit(X, Y)
    # Reference values from plspm; comparisons below are sign-insensitive.
    expected_x_weights = np.array([
        [0.65803719, 0.19197924, 0.21769083],
        [0.7009113, 0.13303969, -0.15376699],
        [0.13528197, -0.68636408, 0.13856546],
        [0.16854574, -0.66788088, -0.12485304],
        [-0.03232333, -0.04189855, 0.40690153],
        [0.1148816, -0.09643158, 0.1613305],
        [0.04792138, -0.02384992, 0.17175319],
        [-0.06781, -0.01666137, -0.18556747],
        [-0.00266945, -0.00160224, 0.11893098],
        [-0.00849528, -0.07706095, 0.1570547],
        [-0.00949471, -0.02964127, 0.34657036],
        [-0.03572177, 0.0945091, 0.3414855],
        [0.05584937, -0.02028961, -0.57682568],
        [0.05744254, -0.01482333, -0.17431274],
    ])
    expected_x_loadings = np.array([
        [0.65649254, 0.1847647, 0.15270699],
        [0.67554234, 0.15237508, -0.09182247],
        [0.19219925, -0.67750975, 0.08673128],
        [0.2133631, -0.67034809, -0.08835483],
        [-0.03178912, -0.06668336, 0.43395268],
        [0.15684588, -0.13350241, 0.20578984],
        [0.03337736, -0.03807306, 0.09871553],
        [-0.06199844, 0.01559854, -0.1881785],
        [0.00406146, -0.00587025, 0.16413253],
        [-0.00374239, -0.05848466, 0.19140336],
        [0.00139214, -0.01033161, 0.32239136],
        [-0.05292828, 0.0953533, 0.31916881],
        [0.04031924, -0.01961045, -0.65174036],
        [0.06172484, -0.06597366, -0.1244497],
    ])
    expected_y_weights = np.array([
        [0.66101097, 0.18672553, 0.22826092],
        [0.69347861, 0.18463471, -0.23995597],
        [0.14462724, -0.66504085, 0.17082434],
        [0.22247955, -0.6932605, -0.09832993],
        [0.07035859, 0.00714283, 0.67810124],
        [0.07765351, -0.0105204, -0.44108074],
        [-0.00917056, 0.04322147, 0.10062478],
        [-0.01909512, 0.06182718, 0.28830475],
        [0.01756709, 0.04797666, 0.32225745],
    ])
    expected_y_loadings = np.array([
        [0.68568625, 0.1674376, 0.0969508],
        [0.68782064, 0.20375837, -0.1164448],
        [0.11712173, -0.68046903, 0.12001505],
        [0.17860457, -0.6798319, -0.05089681],
        [0.06265739, -0.0277703, 0.74729584],
        [0.0914178, 0.00403751, -0.5135078],
        [-0.02196918, -0.01377169, 0.09564505],
        [-0.03288952, 0.09039729, 0.31858973],
        [0.04287624, 0.05254676, 0.27836841],
    ])
    # Magnitudes must match; signs may flip per component.
    assert_array_almost_equal(np.abs(pls.x_loadings_), np.abs(expected_x_loadings))
    assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights))
    assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings))
    assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights))
    # Any sign flip must be consistent between weights and loadings.
    x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings)
    x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights)
    y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights)
    y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings)
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip)
    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip)
    # Canonical mode yields orthogonal weights and scores.
    assert_matrix_orthogonal(pls.x_weights_)
    assert_matrix_orthogonal(pls.y_weights_)
    assert_matrix_orthogonal(pls._x_scores)
    assert_matrix_orthogonal(pls._y_scores)
def plot_compare_cross_decomposition():
    """Demo comparing cross-decomposition methods (PLSCanonical, PLSRegression,
    CCA) on synthetic data with a shared 2-D latent structure.

    Uses the global numpy RNG (not seeded here), so output varies run to run.
    """
    # Dataset based latent variables model
    n = 500
    # 2 latents vars:
    l1 = np.random.normal(size=n)
    l2 = np.random.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    # X and Y share the latents plus independent noise.
    X = latents + np.random.normal(size=4 * n).reshape((n, 4))
    Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
    X_train = X[:n // 2]
    Y_train = Y[:n // 2]
    X_test = X[n // 2:]
    Y_test = Y[n // 2:]
    print("Corr(X)")
    print(np.round(np.corrcoef(X.T), 2))
    print("Corr(Y)")
    print(np.round(np.corrcoef(Y.T), 2))
    # #############################################################################
    # Canonical (symmetric) PLS
    # Transform data
    # ~~~~~~~~~~~~~~
    plsca = PLSCanonical(n_components=2)
    plsca.fit(X_train, Y_train)
    X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
    X_test_r, Y_test_r = plsca.transform(X_test, Y_test)
    # Scatter plot of scores
    # ~~~~~~~~~~~~~~~~~~~~~~
    # 1) On diagonal plot X vs Y scores on each components
    plt.figure(figsize=(12, 8))
    plt.subplot(221)
    plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", marker="o", s=25)
    plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test", marker="o", s=25)
    plt.xlabel("x scores")
    plt.ylabel("y scores")
    plt.title('Comp. 1: X vs Y (test corr = %.2f)'
              % np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
    plt.xticks(())
    plt.yticks(())
    plt.legend(loc="best")
    plt.subplot(224)
    plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train", marker="o", s=25)
    plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test", marker="o", s=25)
    plt.xlabel("x scores")
    plt.ylabel("y scores")
    plt.title('Comp. 2: X vs Y (test corr = %.2f)'
              % np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
    plt.xticks(())
    plt.yticks(())
    plt.legend(loc="best")
    # 2) Off diagonal plot components 1 vs 2 for X and Y
    plt.subplot(222)
    plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train", marker="*", s=50)
    plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test", marker="*", s=50)
    plt.xlabel("X comp. 1")
    plt.ylabel("X comp. 2")
    plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
              % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
    plt.legend(loc="best")
    plt.xticks(())
    plt.yticks(())
    plt.subplot(223)
    plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train", marker="*", s=50)
    plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test", marker="*", s=50)
    plt.xlabel("Y comp. 1")
    plt.ylabel("Y comp. 2")
    plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)'
              % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
    plt.legend(loc="best")
    plt.xticks(())
    plt.yticks(())
    plt.show()
    # #############################################################################
    # PLS regression, with multivariate response, a.k.a. PLS2
    n = 1000
    q = 3
    p = 10
    X = np.random.normal(size=n * p).reshape((n, p))
    B = np.array([[1, 2] + [0] * (p - 2)] * q).T
    # each Yj = 1*X1 + 2*X2 + noize
    Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5
    pls2 = PLSRegression(n_components=3)
    pls2.fit(X, Y)
    print("True B (such that: Y = XB + Err)")
    print(B)
    # compare pls2.coef_ with B
    print("Estimated B")
    print(np.round(pls2.coef_, 1))
    pls2.predict(X)
    # PLS regression, with univariate response, a.k.a. PLS1
    n = 1000
    p = 10
    X = np.random.normal(size=n * p).reshape((n, p))
    y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
    pls1 = PLSRegression(n_components=3)
    pls1.fit(X, y)
    # note that the number of components exceeds 1 (the dimension of y)
    print("Estimated betas")
    print(np.round(pls1.coef_, 1))
    # #############################################################################
    # CCA (PLS mode B with symmetric deflation)
    cca = CCA(n_components=2)
    cca.fit(X_train, Y_train)
    X_train_r, Y_train_r = cca.transform(X_train, Y_train)
    X_test_r, Y_test_r = cca.transform(X_test, Y_test)
# Subplot: first two Y-score components, train vs. test, with the test-set
# correlation in the title; figure is then saved and closed.
plt.yticks(())
plt.subplot(223)
plt.plot(Y_train_r[:, 0], Y_train_r[:, 1], "*b", label="train")
plt.plot(Y_test_r[:, 0], Y_test_r[:, 1], "*r", label="test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)'
          % numpy.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())
plt.savefig(output_file)
plt.close()

# PLSCA: project train/test onto 2 canonical components and plot.
plsca = PLSCanonical(n_components=2)
plsca.fit(Xtrain, Ytrain)
# PLSCanonical(algorithm='nipals', copy=True, max_iter=500, n_components=2,
#              scale=True, tol=1e-06)
X_train_r, Y_train_r = plsca.transform(Xtrain, Ytrain)
X_test_r, Y_test_r = plsca.transform(Xtest, Ytest)
do_plot(X_train_r, Y_train_r, X_test_r, Y_test_r, '%s/PLSCA_2comp_norm.pdf' % output_folder)

# CCA
# probably not necessary, but just in case the data was modified in some way
Ytrain = norm.loc[train, :]
Ytest = norm.loc[holdout, :]
Xtrain = numpy.array(X.loc[train, :])
Xtest = X.loc[holdout, :]
cca = CCA(n_components=2)
cca.fit(Xtrain, Ytrain)
# CCA(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06)
# PLS-SVD: project onto 2 components and overlay the first weight vector
# (and its perpendicular) as quiver arrows.
plssvd = PLSSVD(n_components=2)
xt, yt = plssvd.fit_transform(dataTrain, Ytrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)
u = plssvd.x_weights_
plt.quiver(u[0, 0], u[1, 0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
plt.quiver(-u[1, 0], u[0, 0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)

#%% PLS mode-A
lda = LDA()
nComponents = np.arange(1, nClasses + 1)
# np.alen() was removed in NumPy >= 1.23; len() is the exact equivalent for
# this 1-D arange.
plsCanScores = np.zeros((2, len(nComponents)))
for i, n in enumerate(nComponents):
    # Sweep component counts; score classifiers on the projected data.
    plscan = PLSCanonical(n_components=n)
    plscan.fit(dataTrain, Ytrain)
    dataTrainT = plscan.transform(dataTrain)
    dataTestT = plscan.transform(dataTest)
    plsCanScores[:, i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)
fig = plt.figure()
util.plotAccuracy(fig, nComponents, plsCanScores)
plt.title('PLS Canonical accuracy', figure=fig)

# Visualise the first two canonical components with the weight arrows.
plscan = PLSCanonical(n_components=2)
xt, yt = plscan.fit_transform(dataTrain, Ytrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)
u = plscan.x_weights_
plt.quiver(u[0, 0], u[1, 0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
plt.quiver(-u[1, 0], u[0, 0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)
def training_lda_TD4_inter(my_clfs, trains_S0, trains_shift, classes, **kw):
    """Score LDA across electrode positions: train on the centre position S0
    (optionally augmented with shifted-position data and CCA alignment), test
    on each shifted position, and log accuracies.

    trains_S0    : samples x chan_len features recorded at centre position S0.
    trains_shift : samples x (len(pos_list) * chan_len) shifted-position features.
    kw           : expects 'log_fold', 'chan_len', 'action_num', 'feature_type',
                   'pos_list', 'num'.

    Python 2 code (print statements, integer '/').
    """
    print 'training_lda_TD4_inter.........'
    start_time = time.time()
    log_fold = root_path + '/result/' + kw['log_fold']
    new_fold(log_fold)
    chan_len = kw['chan_len']
    action_num = kw['action_num']
    print "----training " + kw['feature_type'] + " inter, training by position O, testing by electrode shift "
    cv = 5
    results = []
    results.append(['Feat', 'Algorithm', 'Channel_Pos', 'Accuracy', 'std'])
    log_file = 'feat_' + kw['feature_type'] + '_inter'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False, tol=0.0001)
    # Python 2 int division: samples per action class.
    data_num = trains_S0.shape[0] / action_num
    # print data_num
    # Baseline: cross-validated accuracy at the centre position itself.
    scores = sklearn.cross_validation.cross_val_score(clf, trains_S0, classes, cv=cv)
    results.append(['feat_TD4_cv_' + str(cv), 'lda', 'S0', scores.mean(), scores.std()])
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        X_test = trains_shift[:, idx * chan_len:idx * chan_len + chan_len]
        y_test = classes
        iteration = cv
        scores = np.zeros((iteration, ))
        cca_scores = np.zeros((iteration, ))
        iteration -= 1  # result arrays are filled from the last slot downwards
        for train_idx, test_idx in kf:
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                # NOTE(review): scaling fold indices by action_idx yields all
                # zeros for action_idx == 0; an additive offset of
                # action_idx*data_num seems intended -- confirm.
                train_idx_all = np.concatenate(
                    (train_idx_all, train_idx * action_idx), axis=0)
                test_idx_all = np.concatenate(
                    (test_idx_all, test_idx * action_idx), axis=0)
            # X_train, y_train = trains_S0[train_idx_all], classes[train_idx_all]
            X_train, y_train = trains_S0, classes
            X_train_shift, y_train_shift = X_test[train_idx_all], classes[train_idx_all]
            X_train_all = np.concatenate((X_train, X_train_shift), axis=0)
            y_train_all = np.concatenate((y_train, y_train_shift), axis=0)
            # NOTE(review): unconditional sys.exit(0) -- looks like leftover
            # debugging; everything after it in this loop (and the rest of the
            # function) is unreachable on the first fold. Confirm and remove.
            sys.exit(0)
            score_inter = clf.fit(X_train_all, y_train_all).score(X_test, y_test)
            scores[iteration] = score_inter.mean()
            # print X_train.shape, y_train.shape
            if channel_pos != 'S0':
                # plsca = joblib.load(transform_fold+'/cca_transform_'+kw['subject']+'_'+channel_pos+'.model')
                plsca = PLSCanonical(n_components=14)
                # print X_test.shape, X_train.shape
                # sys.exit(0)
                # NOTE(review): fit uses X_test[train_idx] (one fold) against
                # the FULL X_train -- row counts differ unless shapes happen to
                # match; verify intended arguments.
                plsca.fit(X_test[train_idx], X_train)
                X_test_cca, X_train_cca = plsca.transform(X_test, X_train)
                cca_score = clf.fit(X_train_cca, y_train).score(X_test_cca, y_test)
                cca_scores[iteration] = cca_score.mean()
            iteration -= 1
        # print scores
        # print cca_scores
        # sys.exit(0)
        results.append(['feat_TD4', 'lda', channel_pos, np.mean(scores), np.std(scores)])
        results.append(['feat_TD4', 'lda_cca', channel_pos,
                        np.mean(cca_scores), np.std(cca_scores)])
    log_result(results, log_fold + '/' + log_file + '_' + str(kw['num']), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_' + channel_pos + '_' + str(kw['num'])
    print '----training TD4 time elapsed:', time.time() - start_time
def pls_decomposition(videos, audios, n_components=256):
    """Project paired video/audio features onto a shared canonical subspace.

    Parameters
    ----------
    videos, audios : 2-D arrays with one row per paired sample.
    n_components   : dimensionality of the shared subspace (default 256).

    Returns
    -------
    (videos_c, audios_c) : the canonical-score projections of videos and
    audios, each of shape (n_samples, n_components).
    """
    plsca = PLSCanonical(n_components=n_components)
    plsca.fit(audios, videos)
    # BUGFIX: transform must receive (X, Y) in the same order as fit
    # (X=audios, Y=videos); the original called transform(videos, audios),
    # projecting each modality with the other modality's rotation matrix.
    audios_c, videos_c = plsca.transform(audios, videos)
    return videos_c, audios_c
def training_lda_TD4_intra(my_clfs, trains, classes, **kw):
    """Score LDA on TD4 features under several intra-session training
    strategies and append the accuracies to a log file.

    trains  : 2-D array, samples x (len(pos_list) * chan_len) feature columns,
              one chan_len-wide slice per electrode position.
    classes : per-sample class labels.
    kw      : expects 'chan_len', 'action_num', 'feature_type', 'pos_list';
              optional 'log_fold'.
              NOTE(review): log_fold is used unconditionally at the end, so
              omitting kw['log_fold'] would raise NameError -- confirm callers
              always pass it.

    Python 2 code (print statements, dict.has_key, integer '/').
    """
    start_time = time.time()
    if (kw.has_key('log_fold')):
        log_fold = root_path + '/result/' + kw['log_fold']
        new_fold(log_fold)
    chan_len = kw['chan_len']
    action_num = kw['action_num']
    cv = 3
    results = []
    # Header row of the result table that is logged at the end.
    results.append([
        'Feat', 'Algorithm', 'n_components', 'Channel_Pos', 'Accuracy', 'std'
    ])
    log_file = 'feat_' + kw['feature_type'] + '_intra'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False, tol=0.0001)
    # Python 2 int division: samples per action class.
    data_num = trains.shape[0] / action_num
    # Baseline: cross-validated accuracy using all positions' features at once.
    scores = sklearn.cross_validation.cross_val_score(clf, trains, classes, cv=cv)
    results.append([
        'feat_TD4_cv_' + str(cv), 'lda', 'ALL', 0, scores.mean(), scores.std()
    ])

    # Intra-group strategy: CV within each of the 9 electrode positions.
    print '组内训练.............'
    for idx, channel_pos in enumerate(kw['pos_list']):
        # print '----training TD4 intra , channel_pos: ', channel_pos,'......'
        trains_intra = trains[:, idx * chan_len:idx * chan_len + chan_len]
        scores = sklearn.cross_validation.cross_val_score(clf, trains_intra,
                                                          classes, cv=cv)
        results.append([
            'feat_TD4_cv_' + str(cv), 'lda', 0, channel_pos, scores.mean(),
            scores.std()
        ])

    # Centre strategy: train on centre position S0, test on each shifted position.
    print '中心训练策略.............'
    trains_intra_S0 = trains[:, 0:chan_len]
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0': continue
        tests_shift = trains[:, idx * chan_len:idx * chan_len + chan_len]
        # if channel_pos == 'L2':
        #     print idx*chan_len, idx*chan_len+chan_len, tests_shift.shape, trains.shape
        #     sys.exit(0)
        scores = clf.fit(trains_intra_S0, classes).score(tests_shift, classes)
        results.append([
            'feat_TD4_cv_' + str(cv), 'lda', 0,
            'train S0' + ' test ' + channel_pos, scores.mean(), scores.std()
        ])

    # Group strategy (differs from the intra-group strategy): k-fold where each
    # training fold combines S0 features with the shifted position's features.
    print '组训练策略.............'
    trains_intra_S0 = trains[:, 0:chan_len]
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0': continue
        itera = cv
        scores = np.zeros((itera, ))
        # stds = np.zeros( (itera,) )
        itera -= 1  # scores[] is filled from the last slot downwards
        trains_shift = trains[:, idx * chan_len:idx * chan_len + chan_len]
        for train_idx, test_idx in kf:
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                # NOTE(review): multiplying the fold indices by (action_idx+1)
                # looks intended to offset per-action blocks, but an offset
                # would normally be train_idx + action_idx*data_num -- confirm.
                train_idx_all = np.concatenate(
                    (train_idx_all, train_idx * (action_idx + 1)), axis=0)
                test_idx_all = np.concatenate(
                    (test_idx_all, test_idx * (action_idx + 1)), axis=0)
            # Training set = S0 rows plus the same rows from the shifted position.
            X_train = np.concatenate(
                (trains_intra_S0[train_idx_all], trains_shift[train_idx_all]),
                axis=0)
            y_train = np.concatenate(
                (classes[train_idx_all], classes[train_idx_all]), axis=0)
            X_test = trains_shift[test_idx_all]
            y_test = classes[test_idx_all]
            # X_test = trains_shift
            # y_test = classes
            score = clf.fit(X_train, y_train).score(X_test, y_test)
            scores[itera] = score.mean()
            itera -= 1
        # print scores
        results.append([
            'feat_TD4_cv_' + str(cv), 'lda', 0, 'S0 + ' + channel_pos,
            np.mean(scores), np.std(scores)
        ])

    # CCA-based strategy: align shifted-position features with S0 via
    # PLSCanonical, then train/test in the shared latent space (k-fold CV).
    print 'CCA训练策略.............'
    trains_S0 = trains[:, 0:chan_len]
    n_components_list = [6, 8, 10, 12, 14, 16]  # subspace dimensionalities to sweep
    # n_components_list = [12,14,16]
    kf = KFold(data_num, n_folds=cv)
    for n_components in n_components_list:
        for idx, channel_pos in enumerate(kw['pos_list']):
            if channel_pos == 'S0': continue
            itera = cv
            scores = np.zeros((itera, ))
            stds = np.zeros((itera, ))
            itera -= 1
            trains_shift = trains[:, idx * chan_len:idx * chan_len + chan_len]
            for train_idx, test_idx in kf:
                train_idx_all = np.array([], np.int)
                test_idx_all = np.array([], np.int)
                for action_idx in range(action_num):
                    # NOTE(review): same index-scaling concern as above.
                    train_idx_all = np.concatenate(
                        (train_idx_all, train_idx * (action_idx + 1)), axis=0)
                    test_idx_all = np.concatenate(
                        (test_idx_all, test_idx * (action_idx + 1)), axis=0)
                # print train_idx_all.shape, train_idx_all, test_idx_all.shape, test_idx_all
                # plsca.fit(trains_shift[train_idx_all], trains_S0[train_idx_all])
                # NOTE(review): the alignment is fit on ALL samples (including
                # the test fold), which leaks test data into the transform.
                plsca = PLSCanonical(n_components=n_components)
                plsca.fit(trains_shift, trains_S0)
                trains_shift_cca, trains_S0_cca = plsca.transform(
                    trains_shift, trains_S0)
                X_trains = np.concatenate(
                    (trains_S0_cca, trains_shift_cca[train_idx_all]), axis=0)
                y_trains = np.concatenate((classes, classes[train_idx_all]),
                                          axis=0)
                score = clf.fit(X_trains,
                                y_trains).score(trains_shift_cca[test_idx_all],
                                                classes[test_idx_all])
                scores[itera] = score.mean()
                # stds[itera] = score.std()
                itera -= 1
            results.append([
                'feat_TD4_cv_' + str(cv), 'lda_cca', n_components,
                'S0 + ' + channel_pos, np.mean(scores), np.std(scores)
            ])
    log_result(results,
               log_fold + '/' + log_file + '_action_1-' + str(action_num), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_action_1-' + str(
        action_num)
    print '----training TD4 time elapsed:', time.time() - start_time
def training_lda_TD4_inter(my_clfs, trains_S0, trains_shift, classes, **kw):
    """Score LDA across electrode positions: train on the centre position S0
    (optionally augmented with shifted-position data and CCA alignment), test
    on each shifted position, and log accuracies.

    trains_S0    : samples x chan_len features recorded at centre position S0.
    trains_shift : samples x (len(pos_list) * chan_len) shifted-position features.
    kw           : expects 'log_fold', 'chan_len', 'action_num', 'feature_type',
                   'pos_list', 'num'.

    Python 2 code (print statements, integer '/').
    """
    print 'training_lda_TD4_inter.........'
    start_time = time.time()
    log_fold = root_path + '/result/' + kw['log_fold']
    new_fold(log_fold)
    chan_len = kw['chan_len']
    action_num = kw['action_num']
    print "----training "+kw['feature_type']+" inter, training by position O, testing by electrode shift "
    cv = 5
    results = []
    results.append(['Feat', 'Algorithm','Channel_Pos', 'Accuracy', 'std'])
    log_file = 'feat_'+kw['feature_type']+'_inter'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False, tol=0.0001)
    # Python 2 int division: samples per action class.
    data_num = trains_S0.shape[0]/action_num
    # print data_num
    # Baseline: cross-validated accuracy at the centre position itself.
    scores = sklearn.cross_validation.cross_val_score( clf, trains_S0, classes, cv=cv)
    results.append(['feat_TD4_cv_'+str(cv), 'lda', 'S0', scores.mean(), scores.std()])
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        X_test = trains_shift[:,idx*chan_len:idx*chan_len+chan_len]
        y_test = classes
        iteration = cv
        scores = np.zeros((iteration,))
        cca_scores = np.zeros((iteration,))
        iteration -= 1  # result arrays are filled from the last slot downwards
        for train_idx, test_idx in kf:
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                # NOTE(review): scaling fold indices by action_idx yields all
                # zeros for action_idx == 0; an additive offset of
                # action_idx*data_num seems intended -- confirm.
                train_idx_all = np.concatenate( (train_idx_all, train_idx*action_idx), axis=0)
                test_idx_all = np.concatenate( (test_idx_all, test_idx*action_idx), axis=0)
            # X_train, y_train = trains_S0[train_idx_all], classes[train_idx_all]
            X_train, y_train = trains_S0, classes
            X_train_shift, y_train_shift = X_test[train_idx_all], classes[train_idx_all]
            X_train_all = np.concatenate( (X_train, X_train_shift), axis=0)
            y_train_all = np.concatenate( (y_train, y_train_shift), axis=0)
            # NOTE(review): unconditional sys.exit(0) -- looks like leftover
            # debugging; everything after it in this loop (and the rest of the
            # function) is unreachable on the first fold. Confirm and remove.
            sys.exit(0)
            score_inter = clf.fit(X_train_all, y_train_all).score(X_test, y_test)
            scores[iteration] = score_inter.mean()
            # print X_train.shape, y_train.shape
            if channel_pos != 'S0':
                # plsca = joblib.load(transform_fold+'/cca_transform_'+kw['subject']+'_'+channel_pos+'.model')
                plsca = PLSCanonical(n_components=14)
                # print X_test.shape, X_train.shape
                # sys.exit(0)
                # NOTE(review): fit uses X_test[train_idx] (one fold) against
                # the FULL X_train -- row counts differ unless shapes happen to
                # match; verify intended arguments.
                plsca.fit(X_test[train_idx], X_train)
                X_test_cca, X_train_cca = plsca.transform(X_test, X_train)
                cca_score = clf.fit(X_train_cca, y_train).score(X_test_cca, y_test)
                cca_scores[iteration] = cca_score.mean()
            iteration -= 1
        # print scores
        # print cca_scores
        # sys.exit(0)
        results.append(['feat_TD4', 'lda', channel_pos, np.mean(scores), np.std(scores)])
        results.append(['feat_TD4', 'lda_cca', channel_pos, np.mean(cca_scores), np.std(cca_scores)])
    log_result(results, log_fold + '/' + log_file + '_' + str(kw['num']), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_' + channel_pos + '_' + str(kw['num'])
    print '----training TD4 time elapsed:', time.time() - start_time
    # Commented-out aggregation of per-position means/stds, kept as-is:
    # mean_shift = 0
    # std_shift = 0
    # for i in range(2, 10):
    #     mean_shift += results[i][4]
    #     std_shift += results[i][5]
    # mean_shift /= 9
    # std_shift /= 9
    # results.append(['feat_TD4','lda(svd;tol=0.0001)', 'Shift_means', '1.0', mean_shift, std_shift])
    # mean_all = 0
    # std_all = 0
    # for i in range(1, 10):
    #     mean_all += results[i][4]
    #     std_all += results[i][5]
    # mean_all /= 9
    # std_all /= 9
# Split the synthetic dataset in half for train/test.
# BUGFIX: use floor division for slice indices -- on Python 3, n / 2 is a
# float and float slice indices raise TypeError; n // 2 is identical on
# Python 2 ints (and matches the n // 2 split used elsewhere in this demo).
x_train = x[:n // 2]
y_train = y[:n // 2]
x_test = x[n // 2:]
y_test = y[n // 2:]
print("corr(x)")
print(np.round(np.corrcoef(x.T), 2))
print("corr(y)")
print(np.round(np.corrcoef(y.T), 2))

#################################################################
# Canonical (symmetric) PLS
# transform the data
plsca = PLSCanonical(n_components=2)
plsca.fit(x_train, y_train)
x_train_r, y_train_r = plsca.transform(x_train, y_train)
x_test_r, y_test_r = plsca.transform(x_test, y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On diagonal plot x vs y scores on each components
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.plot(x_train_r[:, 0], y_train_r[:, 0], "ob", label="train")
plt.plot(x_test_r[:, 0], y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: x vs y (test corr = %.2f)'
          % np.corrcoef(x_test_r[:, 0], y_test_r[:, 0])[0, 1])
plt.xticks(())