def feature_action_sensitivity(feature_type='TD4'):
    """For each electrode-shift position, fit a 2-component PLS-canonical
    model between the shifted-position features and the centre (S0) features,
    then plot the paired train/test canonical scores.

    :param feature_type: 'TD4' (MAV/ZC/SSC/WL) or 'TD5' (adds RMS)
    :raises ValueError: if ``feature_type`` is not recognised
    """
    results = []
    subjects = ['subject_' + str(i + 1) for i in range(1)]
    # S0 is the centre position; U/D/L/R are shifts up/down/left/right.
    channel_pos_list = ['S0', 'U1', 'U2', 'D1', 'D2', 'L1', 'L2', 'R1', 'R2']
    if feature_type == 'TD4':
        feature_list = ['MAV', 'ZC', 'SSC', 'WL']
    elif feature_type == 'TD5':
        feature_list = ['MAV', 'ZC', 'SSC', 'WL', 'RMS']
    else:
        # BUG FIX: previously fell through and hit a NameError on feature_list.
        raise ValueError('unknown feature_type: %r' % (feature_type,))
    feat_num = len(feature_list)
    group_num = len([i + 1 for i in range(4)])  # channels per position
    group_span = group_num * feat_num  # feature columns per electrode position
    # NOTE: the dataset covers 7 motion classes; the per-action layout is
    # handled inside data_load.load_feature_dataset.
    train_dir = 'train4_250_100'
    results.append(['subject', 'action', 'feature', 'group',
                    'means_shift', 'std_shift'])
    plsca = PLSCanonical(n_components=2)
    for pos_idx, pos_name in enumerate(channel_pos_list[1:]):
        pos = pos_idx + 1  # column-block index of the shifted position
        for subject in subjects:
            trains, classes = data_load.load_feature_dataset(
                train_dir, subject, feature_type)
            # First half of the samples trains the projection, second half
            # tests it.  BUG FIX: use floor division so the index stays an
            # int under true division.
            m = trains.shape[0] // 2
            X_train = trains[:m, group_span * pos: group_span * (pos + 1)]
            Y_train = trains[:m, :group_span]
            X_test = trains[m:, group_span * pos: group_span * (pos + 1)]
            Y_test = trains[m:, :group_span]
            plsca.fit(X_train, Y_train)
            X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
            X_test_r, Y_test_r = plsca.transform(X_test, Y_test)
            filename = subject + '_' + pos_name
            plot_plsc_figure_two(X_train_r, Y_train_r, X_test_r, Y_test_r,
                                 filename)
def correlation_matching(I_tr, T_tr, I_te, T_te, n_comps):
    """
    Learn correlation matching (CM) on the training pair (I_tr, T_tr) and
    project all four matrices into the shared correlation space.

    Parameters
    ----------
    I_tr: np.ndarray [shape=(n_tr, d_I)]
        image data matrix for training
    T_tr: np.ndarray [shape=(n_tr, d_T)]
        text data matrix for training
    I_te: np.ndarray [shape=(n_te, d_I)]
        image data matrix for testing
    T_te: np.ndarray [shape=(n_te, d_T)]
        text data matrix for testing
    n_comps: int > 0 [scalar]
        number of canonical components to use

    Returns
    -------
    I_tr_cca, T_tr_cca, I_te_cca, T_te_cca : np.ndarray
        the four input matrices represented in correlation space,
        each with ``n_comps`` columns
    """
    # Standardize each modality with statistics estimated on training data only.
    image_scaler = StandardScaler().fit(I_tr)
    I_tr = image_scaler.transform(I_tr)
    I_te = image_scaler.transform(I_te)

    text_scaler = StandardScaler().fit(T_tr)
    T_tr = text_scaler.transform(T_tr)
    T_te = text_scaler.transform(T_te)

    # scale=False: the inputs were already standardized above.
    cca = PLSCanonical(n_components=n_comps, scale=False)
    cca.fit(I_tr, T_tr)

    I_tr_cca, T_tr_cca = cca.transform(I_tr, T_tr)
    I_te_cca, T_te_cca = cca.transform(I_te, T_te)
    return I_tr_cca, T_tr_cca, I_te_cca, T_te_cca
def test_pls_canonical_basics():
    """Sanity checks for PLSCanonical on the linnerud dataset."""
    dataset = load_linnerud()
    X, Y = dataset.data, dataset.target

    pls = PLSCanonical(n_components=X.shape[1])
    pls.fit(X, Y)

    # Weights and scores must all be orthogonal matrices.
    for mat in (pls.x_weights_, pls.y_weights_, pls._x_scores, pls._y_scores):
        assert_matrix_orthogonal(mat)

    # The model factorizes the centered/scaled data as X = T P' and Y = U Q'.
    T, P = pls._x_scores, pls.x_loadings_
    U, Q = pls._y_scores, pls.y_loadings_
    Xc, Yc, x_mean, y_mean, x_std, y_std = _center_scale_xy(
        X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T))
    assert_array_almost_equal(Yc, np.dot(U, Q.T))

    # Applying the learned rotations to the training data reproduces the scores.
    Xt = pls.transform(X)
    assert_array_almost_equal(Xt, pls._x_scores)
    Xt, Yt = pls.transform(X, Y)
    assert_array_almost_equal(Xt, pls._x_scores)
    assert_array_almost_equal(Yt, pls._y_scores)

    # Round-tripping through inverse_transform recovers the raw data.
    X_back = pls.inverse_transform(Xt)
    assert_array_almost_equal(X_back, X)
    _, Y_back = pls.inverse_transform(Xt, Yt)
    assert_array_almost_equal(Y_back, Y)
def drawFaces(emb1, emb2, wordRanking, n, reduction="cut", n_dims=18):
    """
    Plot Chernoff faces for the n most/least interesting words.
    From: https://gist.github.com/aflaxman/4043086

    :param n: number of words to plot; if negative: least interesting
    :param reduction: "cut" | "svd" | "cca" — how to reduce each embedding
        to ``n_dims`` face features
    :param n_dims: number of face features per word (mpl_cfaces uses up to 18)
    """
    s1 = None
    s2 = None
    if reduction == "cut":
        s1 = emb1.getSimMatrix()[0:, 0:n_dims]
        s2 = emb2.getSimMatrix()[0:, 0:n_dims]
    elif reduction == "svd":
        # BUG FIX: this branch previously referenced an undefined name `k`
        # (NameError); the reduction size is now the n_dims parameter.
        s1 = TruncatedSVD(n_components=n_dims).fit_transform(emb1.getSimMatrix())
        s2 = TruncatedSVD(n_components=n_dims).fit_transform(emb2.getSimMatrix())
    elif reduction == "cca":
        # Use original embeddings, not the similarity matrix, for reduction.
        cca = PLSCanonical(n_components=n_dims)
        cca.fit(emb1.m, emb2.m)
        s1, s2 = cca.transform(emb1.m, emb2.m)
    name = str(n) + "." + reduction
    if n < 0:
        # Negative n selects from the bottom of the ranking.
        n *= -1
        interesting = [wordRanking[::-1][i] for i in xrange(n)]
    else:
        interesting = [wordRanking[i] for i in xrange(n)]
    fig = plt.figure(figsize=(11, 11))
    c = 0
    for i in range(n):
        word = interesting[i]
        j = emb1.d[word]
        # Left column: face from embedding 1; right column: embedding 2.
        ax = fig.add_subplot(n, 2, c + 1, aspect='equal')
        mpl_cfaces.cface(ax, *s1[j])
        ax.axis([-1.2, 1.2, -1.2, 1.2])
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(word)
        ax2 = fig.add_subplot(n, 2, c + 2, aspect='equal')
        mpl_cfaces.cface(ax2, *s2[j])
        ax2.axis([-1.2, 1.2, -1.2, 1.2])
        ax2.set_xticks([])
        ax2.set_yticks([])
        ax2.set_title(word)
        c += 2
    plotname = "plots/" + NAME + ".cface_s1s2_" + name + ".png"
    fig.savefig(plotname)
    print("\tSaved Chernoff faces plot in '%s'" % (plotname))
class _PLSCanonicalImpl:
    """Thin delegating wrapper around the underlying operator ``Op``."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters, then instantiate the wrapped operator.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, forwarding ``y`` only when it was supplied."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate transformation to the wrapped model."""
        return self._wrapped_model.transform(X)

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)
def getCCARanking(self, filter=None):
    """
    Compare how far apart words are after projection into the common
    CCA space, and rank them by (1 - euclidean distance), descending.

    :param filter: optional collection of words; if given, the returned
        ranking is restricted to these words (order preserved)
    :return: list of (word, score) pairs, best-matching words first
    """
    cca = PLSCanonical(n_components=self.n)
    cca.fit(self.emb1.m, self.emb2.m)
    m1transformed, m2transformed = cca.transform(self.emb1.m, self.emb2.m)
    # Get distances between the paired projected vectors.
    assert self.emb1.vocab_size == self.emb2.vocab_size
    distDict = dict()
    for i in xrange(self.emb1.vocab_size):
        v1 = m1transformed[i]
        v2 = m2transformed[i]
        w = self.emb1.rd[i]
        distDict[w] = 1 - Similarity.euclidean(v1, v2)
    ranked = sorted(distDict.iteritems(), key=itemgetter(1), reverse=True)
    if filter is not None:
        # BUG FIX: filter the already-sorted list instead of rebuilding it
        # from the dict, which silently discarded the ranking order.
        ranked = [(w, s) for (w, s) in ranked if w in filter]
    return ranked
def plotClustersCCA(self, filter=None):
    """
    Plot clusters in 2-dim CCA space: comparable across embeddings.

    :param filter: optional collection of words to restrict the plot to;
        if None and there are more than 100 points, a random sample of
        100 words is plotted instead
    :return:
    """
    # Color maps need at least 2 colors; 'under' marks unassigned clusters.
    if len(self.cluster1) <= 1:
        cmap1 = plt.get_cmap('jet', 2)
    else:
        cmap1 = plt.get_cmap('jet', len(self.cluster1))
    cmap1.set_under('gray')
    if len(self.cluster2) <= 1:
        cmap2 = plt.get_cmap('jet', 2)
    else:
        cmap2 = plt.get_cmap('jet', len(self.cluster2))
    cmap2.set_under('gray')
    cca = PLSCanonical(n_components=2)
    cca.fit(self.emb1.m, self.emb2.m)
    m1transformed, m2transformed = cca.transform(self.emb1.m, self.emb2.m)
    labels1 = [self.emb1.rd[i] for i in xrange(self.emb1.vocab_size)]
    colors1 = [self.word2cluster1[self.emb1.rd[i]]
               for i in xrange(self.emb1.vocab_size)]
    labels2 = [self.emb2.rd[i] for i in xrange(self.emb2.vocab_size)]
    colors2 = [self.word2cluster2[self.emb2.rd[i]]
               for i in xrange(self.emb2.vocab_size)]
    if filter is not None:
        print("\tFiltering samples to plot")
        filteredIds = [self.emb1.d[w] for w in filter]  # ids for filter words
        m1transformed = m1transformed[filteredIds]
        m2transformed = m2transformed[filteredIds]
        labels1 = [l for l in labels1 if l in filter]
        labels2 = [l for l in labels2 if l in filter]
        # NOTE(review): colors1/colors2 are intentionally left unfiltered to
        # match the original behavior — confirm plotWithLabelsAndColors
        # tolerates the length mismatch.
    elif m1transformed.shape[0] > 100:
        # Sample indices to display, otherwise the plot is too messy.
        # BUG FIX: the original read `m1.transformed` (NameError), drew a
        # single scalar instead of a sample (missing size=), and filtered
        # labels with `filter`, which is None in this branch (TypeError).
        filteredIds = np.random.randint(low=0, high=m1transformed.shape[0],
                                        size=100)
        m1transformed = m1transformed[filteredIds]
        m2transformed = m2transformed[filteredIds]
        labels1 = [labels1[i] for i in filteredIds]
        labels2 = [labels2[i] for i in filteredIds]
    plotWithLabelsAndColors(m1transformed, labels1, colors=colors1,
                            cmap=cmap1,
                            filename="plots/" + NAME + ".cca1.png",
                            dimRed="CCA")
    plotWithLabelsAndColors(m2transformed, labels2, colors=colors2,
                            cmap=cmap2,
                            filename="plots/" + NAME + ".cca2.png",
                            dimRed="CCA")
plt.xlim(1,np.amax(nComponents)) plt.title('PLS SVD accuracy') plt.xlabel('Number of components') plt.ylabel('accuracy') plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right') plt.grid(True) if (0): #%% PLS Cannonical nComponents = np.arange(1,nClasses+1) plsCanScores = np.zeros((5,np.alen(nComponents))) for i,n in enumerate(nComponents): plscan = PLSCanonical(n_components=n) plscan.fit(Xtrain,Ytrain) XtrainT = plscan.transform(Xtrain) XtestT = plscan.transform(Xtest) plsCanScores[:,i] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest) plscan = PLSCanonical(n_components=2) plscan.fit(Xtrain,Ytrain) xt = plscan.transform(Xtrain) fig = plt.figure() util.plotData(fig,xt,labelsTrain,classColors) plt.title('First 2 components of projected data') #%% Plot accuracies for PLSSVD plt.figure() for i in range (5):
vec_c.append(i) if j < len_train: l_p.append(vec_p) l_c.append(vec_c) else: l_p_t.append(vec_p) l_c_t.append(vec_c) j += 1 sorted_p = np.asarray(l_p) sorted_c = np.asarray(l_c) #Convert the input to an array plc = PLSCanonical() plc.fit_transform(sorted_c, sorted_p) sorted_c, sorted_p = plc.transform(sorted_c, sorted_p) sorted_c_test = np.asarray(l_c_t) sorted_p_test = np.asarray(l_p_t) sorted_c_test, sorted_p_test = plc.transform(sorted_c_test, sorted_p_test) plr = PLSRegression() plr.fit(sorted_c, sorted_p) params = plr.get_params() plr.set_params(**params) y_score = plr.predict(sorted_c_test) sim_count = 0 print("Test Similarity: ") for i in range(len(y_score)): result_sim = 1 - spatial.distance.cosine(y_score[i], sorted_p_test[i])
# set negative connectivities to 0 edge_data = np.apply_along_axis( lambda x: [0 if element < 0 else element for element in x], 1, edge_data) # re-split data (3 ways) for CCA X1_train = edge_data[:140, :] X2_train = edge_data[140:280, :] X2_remain = edge_data[280:, :] #cca = CCA(n_components =2) #cca.fit(X1_train, X2_train) cca = PLSCanonical(n_components=100) cca.fit(X1_train, X2_train) block_1_transformed, block_2_transformed = cca.transform(X1_train, X2_train, copy=False) block_3_transformed = np.dot(X2_remain, cca.y_rotations_) edge_data_transformed = np.vstack( (block_1_transformed, block_2_transformed, block_3_transformed)) # initialise the classifier clf = svm.SVC(kernel='precomputed') # optional shuffle perm = np.random.permutation(n_subjects) #print perm #print n_subjects labels = labels[perm] edge_data_transformed = edge_data_transformed[perm, :]
plt.xlim(1, np.amax(nComponents)) plt.title('PLS SVD accuracy') plt.xlabel('Number of components') plt.ylabel('accuracy') plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right') plt.grid(True) if (0): #%% PLS Cannonical nComponents = np.arange(1, nClasses + 1) plsCanScores = np.zeros((5, np.alen(nComponents))) for i, n in enumerate(nComponents): plscan = PLSCanonical(n_components=n) plscan.fit(Xtrain, Ytrain) XtrainT = plscan.transform(Xtrain) XtestT = plscan.transform(Xtest) plsCanScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest) plscan = PLSCanonical(n_components=2) plscan.fit(Xtrain, Ytrain) xt = plscan.transform(Xtrain) fig = plt.figure() util.plotData(fig, xt, labelsTrain, classColors) plt.title('First 2 components of projected data') #%% Plot accuracies for PLSSVD plt.figure() for i in range(5): plt.plot(nComponents, plsCanScores[i, :], lw=3)
def pls_decomposition(videos, audios, n_components=256):
    """Project paired video/audio features into a shared PLS-canonical space.

    :param videos: (n_samples, d_video) video feature matrix
    :param audios: (n_samples, d_audio) audio feature matrix
    :param n_components: dimensionality of the shared space
    :return: (videos_c, audios_c) — the projected video and audio features
    """
    plsca = PLSCanonical(n_components=n_components)
    plsca.fit(audios, videos)
    # BUG FIX: the model was fitted with X=audios, Y=videos, but transform
    # was called with the arguments swapped, so each modality was projected
    # with the other modality's rotation matrix.
    audios_c, videos_c = plsca.transform(audios, videos)
    return videos_c, audios_c
for e in temp1[:]: temp4.append(e) for e in temp2[:]: temp4.append(e) if len(temp4) == 600 and len(temp3) == 300: x_n.append(temp4) y_n.append(temp3) npx = np.asarray(x, dtype=np.float64) npy = np.asarray(y, dtype=np.float64) npxn = np.asarray(x_n, dtype=np.float64) npyn = np.asarray(y_n, dtype=np.float64) cca = PLSCanonical(n_components=2) cca.fit_transform(npx, npy) npx, npy = cca.transform(npx, npy) npxn, npyn = cca.transform(npxn, npyn) pls.fit(npx, npy) params = pls.get_params(deep=True) print(params) pls.set_params(**params) y_score = pls.predict(npxn) sim_count = 0 tol = 0.1 for index in range(len(y_score)): sub_result = np.subtract(y_score, npyn) result = 1 - spatial.distance.cosine(y_score[index], npyn[index])
def training_lda_TD4_inter(my_clfs, trains_S0, trains_shift, classes, **kw):
    # Inter-position training: an LDA is trained on the centre position (S0)
    # plus part of the shifted-position data, and tested on each electrode
    # shift; a PLS-canonical (CCA) projection variant is also scored.
    # Expected kw keys: log_fold, chan_len, action_num, feature_type,
    # pos_list, num.  (Python 2 code: print statements, old sklearn API.)
    print 'training_lda_TD4_inter.........'
    start_time = time.time()
    log_fold = root_path + '/result/' + kw['log_fold']
    new_fold(log_fold)
    chan_len = kw['chan_len']        # feature columns per electrode position
    action_num = kw['action_num']    # number of motion classes
    print "----training " + kw[
        'feature_type'] + " inter, training by position O, testing by electrode shift "
    cv = 5
    results = []
    results.append(['Feat', 'Algorithm', 'Channel_Pos', 'Accuracy', 'std'])
    log_file = 'feat_' + kw['feature_type'] + '_inter'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False,
                          tol=0.0001)
    # Samples per action (Py2 integer division).
    data_num = trains_S0.shape[0] / action_num
    # Baseline: cross-validated LDA accuracy on the centre position alone.
    scores = sklearn.cross_validation.cross_val_score(clf, trains_S0, classes,
                                                      cv=cv)
    results.append(
        ['feat_TD4_cv_' + str(cv), 'lda', 'S0', scores.mean(), scores.std()])
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        # Test set: this shifted position's column block, all samples.
        X_test = trains_shift[:, idx * chan_len:idx * chan_len + chan_len]
        y_test = classes
        iteration = cv
        scores = np.zeros((iteration, ))
        cca_scores = np.zeros((iteration, ))
        iteration -= 1  # scores are filled from index cv-1 downward
        for train_idx, test_idx in kf:
            # Expand per-action fold indices to indices over all actions.
            # NOTE(review): `train_idx * action_idx` multiplies the index
            # array by the action number (0,1,2,...) rather than offsetting
            # it — for action_idx == 0 this contributes only zeros.  The
            # sibling intra function uses `(action_idx + 1)`; looks like a
            # bug left behind with the sys.exit(0) below — confirm.
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                train_idx_all = np.concatenate(
                    (train_idx_all, train_idx * action_idx), axis=0)
                test_idx_all = np.concatenate(
                    (test_idx_all, test_idx * action_idx), axis=0)
            # X_train, y_train = trains_S0[train_idx_all], classes[train_idx_all]
            X_train, y_train = trains_S0, classes
            X_train_shift, y_train_shift = X_test[train_idx_all], classes[
                train_idx_all]
            X_train_all = np.concatenate((X_train, X_train_shift), axis=0)
            y_train_all = np.concatenate((y_train, y_train_shift), axis=0)
            # NOTE(review): debugging residue — this aborts the process, so
            # everything below never executes; confirm before relying on the
            # logged results.
            sys.exit(0)
            score_inter = clf.fit(X_train_all,
                                  y_train_all).score(X_test, y_test)
            scores[iteration] = score_inter.mean()
            if channel_pos != 'S0':
                # Alternative: load a pre-trained projection instead of
                # fitting one per fold:
                # plsca = joblib.load(transform_fold+'/cca_transform_'+kw['subject']+'_'+channel_pos+'.model')
                plsca = PLSCanonical(n_components=14)
                # Align the shifted position onto S0 in canonical space.
                plsca.fit(X_test[train_idx], X_train)
                X_test_cca, X_train_cca = plsca.transform(X_test, X_train)
                cca_score = clf.fit(X_train_cca,
                                    y_train).score(X_test_cca, y_test)
                cca_scores[iteration] = cca_score.mean()
            iteration -= 1
        results.append(
            ['feat_TD4', 'lda', channel_pos, np.mean(scores), np.std(scores)])
        results.append([
            'feat_TD4', 'lda_cca', channel_pos, np.mean(cca_scores),
            np.std(cca_scores)
        ])
    log_result(results, log_fold + '/' + log_file + '_' + str(kw['num']), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_' + channel_pos + '_' + str(
        kw['num'])
    print '----training TD4 time elapsed:', time.time() - start_time
X_test = X[n // 2:] Y_test = Y[n // 2:] print("Corr(X)") print(np.round(np.corrcoef(X.T), 2)) print("Corr(Y)") print(np.round(np.corrcoef(Y.T), 2)) # ############################################################################# # Canonical (symmetric) PLS # Transform data # ~~~~~~~~~~~~~~ plsca = PLSCanonical(n_components=2) plsca.fit(X_train, Y_train) X_train_r, Y_train_r = plsca.transform(X_train, Y_train) X_test_r, Y_test_r = plsca.transform(X_test, Y_test) # Scatter plot of scores # ~~~~~~~~~~~~~~~~~~~~~~ # 1) On diagonal plot X vs Y scores on each components plt.figure(figsize=(12, 8)) plt.subplot(221) plt.plot(X_train_r[:, 0], Y_train_r[:, 0], "ob", label="train") plt.plot(X_test_r[:, 0], Y_test_r[:, 0], "or", label="test") plt.xlabel("x scores") plt.ylabel("y scores") plt.title('Comp. 1: X vs Y (test corr = %.2f)' % np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1]) plt.xticks(()) plt.yticks(())
from sklearn import datasets import numpy as np from sklearn.model_selection import train_test_split from sklearn.cross_decomposition import PLSCanonical from sklearn.neighbors import KNeighborsClassifier import math from mlxtend.feature_selection import SequentialFeatureSelector as SFS dataSet = datasets.load_digits() data = dataSet["data"] target = dataSet["target"] plsca = PLSCanonical(n_components=2) plsca.fit(data, target) X_train_r, Y_train_r = plsca.transform(data, target) knn = math.sqrt(len(X_train_r)) knn = KNeighborsClassifier(n_neighbors=int(knn)) Y_train_r = [int(Y_train_r[i]) for i in range(0, len(Y_train_r))] k = knn.fit(X_train_r, Y_train_r) print(k.score(X_train_r, Y_train_r)) knn = KNeighborsClassifier(n_neighbors=4) sfs = SFS(knn, k_features=3, forward=True, floating=False, verbose=2,
def training_lda_TD4_intra(my_clfs, trains, classes, **kw):
    # Intra-position training strategies for the electrode-shift experiment:
    #   1) per-position cross-validated LDA ("intra-group"),
    #   2) train on centre S0, test on each shifted position,
    #   3) train on S0 + part of the shift, k-fold test on the shift,
    #   4) same as 3 but in a PLS-canonical (CCA) aligned subspace, for
    #      several subspace dimensionalities.
    # Expected kw keys: log_fold, chan_len, action_num, feature_type,
    # pos_list.  (Python 2 code: print statements, old sklearn API.)
    start_time = time.time()
    if (kw.has_key('log_fold')):
        log_fold = root_path + '/result/' + kw['log_fold']
        new_fold(log_fold)
    chan_len = kw['chan_len']        # feature columns per electrode position
    action_num = kw['action_num']    # number of motion classes
    cv = 3
    results = []
    results.append([
        'Feat', 'Algorithm', 'n_components', 'Channel_Pos', 'Accuracy', 'std'
    ])
    log_file = 'feat_' + kw['feature_type'] + '_intra'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False,
                          tol=0.0001)
    # Samples per action (Py2 integer division).
    data_num = trains.shape[0] / action_num
    scores = sklearn.cross_validation.cross_val_score(clf, trains, classes,
                                                      cv=cv)
    results.append([
        'feat_TD4_cv_' + str(cv), 'lda', 'ALL', 0, scores.mean(), scores.std()
    ])
    # Strategy 1: per-position ("intra-group") CV over all 9 position blocks.
    print '组内训练.............'
    for idx, channel_pos in enumerate(kw['pos_list']):
        trains_intra = trains[:, idx * chan_len:idx * chan_len + chan_len]
        scores = sklearn.cross_validation.cross_val_score(clf, trains_intra,
                                                          classes, cv=cv)
        results.append([
            'feat_TD4_cv_' + str(cv), 'lda', 0, channel_pos, scores.mean(),
            scores.std()
        ])
    # Strategy 2: train on the centre position S0, test on each shift.
    print '中心训练策略.............'
    trains_intra_S0 = trains[:, 0:chan_len]
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0':
            continue
        tests_shift = trains[:, idx * chan_len:idx * chan_len + chan_len]
        scores = clf.fit(trains_intra_S0, classes).score(tests_shift, classes)
        results.append([
            'feat_TD4_cv_' + str(cv), 'lda', 0,
            'train S0' + ' test ' + channel_pos, scores.mean(), scores.std()
        ])
    # Strategy 3: group training (differs from strategy 1) — k-fold over
    # S0 plus the shifted position's training folds.
    print '组训练策略.............'
    trains_intra_S0 = trains[:, 0:chan_len]
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0':
            continue
        itera = cv
        scores = np.zeros((itera, ))
        itera -= 1  # scores are filled from index cv-1 downward
        trains_shift = trains[:, idx * chan_len:idx * chan_len + chan_len]
        for train_idx, test_idx in kf:
            # Expand per-action fold indices to indices over all actions.
            # NOTE(review): `train_idx * (action_idx + 1)` scales the index
            # array instead of offsetting it by action block — confirm this
            # produces the intended per-action sample selection.
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                train_idx_all = np.concatenate(
                    (train_idx_all, train_idx * (action_idx + 1)), axis=0)
                test_idx_all = np.concatenate(
                    (test_idx_all, test_idx * (action_idx + 1)), axis=0)
            X_train = np.concatenate(
                (trains_intra_S0[train_idx_all], trains_shift[train_idx_all]),
                axis=0)
            y_train = np.concatenate(
                (classes[train_idx_all], classes[train_idx_all]), axis=0)
            X_test = trains_shift[test_idx_all]
            y_test = classes[test_idx_all]
            score = clf.fit(X_train, y_train).score(X_test, y_test)
            scores[itera] = score.mean()
            itera -= 1
        results.append([
            'feat_TD4_cv_' + str(cv), 'lda', 0, 'S0 + ' + channel_pos,
            np.mean(scores), np.std(scores)
        ])
    # Strategy 4: CCA-based training with k-fold cross-validation, sweeping
    # the canonical-subspace dimensionality.
    print 'CCA训练策略.............'
    trains_S0 = trains[:, 0:chan_len]
    n_components_list = [6, 8, 10, 12, 14, 16]  # candidate subspace dims
    kf = KFold(data_num, n_folds=cv)
    for n_components in n_components_list:
        for idx, channel_pos in enumerate(kw['pos_list']):
            if channel_pos == 'S0':
                continue
            itera = cv
            scores = np.zeros((itera, ))
            stds = np.zeros((itera, ))
            itera -= 1
            trains_shift = trains[:, idx * chan_len:idx * chan_len + chan_len]
            for train_idx, test_idx in kf:
                train_idx_all = np.array([], np.int)
                test_idx_all = np.array([], np.int)
                for action_idx in range(action_num):
                    train_idx_all = np.concatenate(
                        (train_idx_all, train_idx * (action_idx + 1)), axis=0)
                    test_idx_all = np.concatenate(
                        (test_idx_all, test_idx * (action_idx + 1)), axis=0)
                # Fit the projection on the full position pair.  (A fold-only
                # fit was tried and commented out:)
                # plsca.fit(trains_shift[train_idx_all], trains_S0[train_idx_all])
                plsca = PLSCanonical(n_components=n_components)
                plsca.fit(trains_shift, trains_S0)
                trains_shift_cca, trains_S0_cca = plsca.transform(
                    trains_shift, trains_S0)
                X_trains = np.concatenate(
                    (trains_S0_cca, trains_shift_cca[train_idx_all]), axis=0)
                y_trains = np.concatenate((classes, classes[train_idx_all]),
                                          axis=0)
                score = clf.fit(X_trains,
                                y_trains).score(trains_shift_cca[test_idx_all],
                                                classes[test_idx_all])
                scores[itera] = score.mean()
                itera -= 1
            results.append([
                'feat_TD4_cv_' + str(cv), 'lda_cca', n_components,
                'S0 + ' + channel_pos, np.mean(scores), np.std(scores)
            ])
    log_result(results,
               log_fold + '/' + log_file + '_action_1-' + str(action_num), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_action_1-' + str(
        action_num)
    print '----training TD4 time elapsed:', time.time() - start_time
plt.gca().set_aspect('equal', adjustable='box') plt.legend(['Cell volume', 'Age', 'Both']) #NB: Strong colinearity between Age and Volume # Transition rate prediction using PLS X = dfc_g1[['vol_sm', 'Age', 'gr_sm']] # Design matrix y = dfc_g1['G1S_logistic'] # Response var # Drop NaN rows I = np.isnan(dfc_g1['gr_sm']) X = X.loc[~I].copy() y = y[~I] pls_model = PLSCanonical() pls_model.fit(scale(X), y) X_c, y_c = pls_model.transform(scale(X), y) # Multiple linearregression on birth size and growth rate df['bvol'] = df['Birth volume'] df['exp_gr'] = df['Exponential growth rate'] df['g1_len'] = df['G1 length'] model = smf.ols('g1_len ~ exp_gr + bvol', data=df).fit() model.summary() print model.pvalues # Delete S/G2 after first time point g1s_marked = [] for c in collated_filtered: c = c[c['Phase'] != 'Daughter G1'].copy() g1 = c[c['Phase'] == 'G1'] g1['G1S_mark'] = 0
def plot_compare_cross_decomposition(): # Dataset based latent variables model n = 500 # 2 latents vars: l1 = np.random.normal(size=n) l2 = np.random.normal(size=n) latents = np.array([l1, l1, l2, l2]).T X = latents + np.random.normal(size=4 * n).reshape((n, 4)) Y = latents + np.random.normal(size=4 * n).reshape((n, 4)) X_train = X[:n // 2] Y_train = Y[:n // 2] X_test = X[n // 2:] Y_test = Y[n // 2:] print("Corr(X)") print(np.round(np.corrcoef(X.T), 2)) print("Corr(Y)") print(np.round(np.corrcoef(Y.T), 2)) # ############################################################################# # Canonical (symmetric) PLS # Transform data # ~~~~~~~~~~~~~~ plsca = PLSCanonical(n_components=2) plsca.fit(X_train, Y_train) X_train_r, Y_train_r = plsca.transform(X_train, Y_train) X_test_r, Y_test_r = plsca.transform(X_test, Y_test) # Scatter plot of scores # ~~~~~~~~~~~~~~~~~~~~~~ # 1) On diagonal plot X vs Y scores on each components plt.figure(figsize=(12, 8)) plt.subplot(221) plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", marker="o", s=25) plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test", marker="o", s=25) plt.xlabel("x scores") plt.ylabel("y scores") plt.title('Comp. 1: X vs Y (test corr = %.2f)' % np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1]) plt.xticks(()) plt.yticks(()) plt.legend(loc="best") plt.subplot(224) plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train", marker="o", s=25) plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test", marker="o", s=25) plt.xlabel("x scores") plt.ylabel("y scores") plt.title('Comp. 2: X vs Y (test corr = %.2f)' % np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1]) plt.xticks(()) plt.yticks(()) plt.legend(loc="best") # 2) Off diagonal plot components 1 vs 2 for X and Y plt.subplot(222) plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train", marker="*", s=50) plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test", marker="*", s=50) plt.xlabel("X comp. 1") plt.ylabel("X comp. 2") plt.title('X comp. 
1 vs X comp. 2 (test corr = %.2f)' % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1]) plt.legend(loc="best") plt.xticks(()) plt.yticks(()) plt.subplot(223) plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train", marker="*", s=50) plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test", marker="*", s=50) plt.xlabel("Y comp. 1") plt.ylabel("Y comp. 2") plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)' % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1]) plt.legend(loc="best") plt.xticks(()) plt.yticks(()) plt.show() # ############################################################################# # PLS regression, with multivariate response, a.k.a. PLS2 n = 1000 q = 3 p = 10 X = np.random.normal(size=n * p).reshape((n, p)) B = np.array([[1, 2] + [0] * (p - 2)] * q).T # each Yj = 1*X1 + 2*X2 + noize Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5 pls2 = PLSRegression(n_components=3) pls2.fit(X, Y) print("True B (such that: Y = XB + Err)") print(B) # compare pls2.coef_ with B print("Estimated B") print(np.round(pls2.coef_, 1)) pls2.predict(X) # PLS regression, with univariate response, a.k.a. PLS1 n = 1000 p = 10 X = np.random.normal(size=n * p).reshape((n, p)) y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5 pls1 = PLSRegression(n_components=3) pls1.fit(X, y) # note that the number of components exceeds 1 (the dimension of y) print("Estimated betas") print(np.round(pls1.coef_, 1)) # ############################################################################# # CCA (PLS mode B with symmetric deflation) cca = CCA(n_components=2) cca.fit(X_train, Y_train) X_train_r, Y_train_r = cca.transform(X_train, Y_train) X_test_r, Y_test_r = cca.transform(X_test, Y_test)
def training_lda_TD4_inter(my_clfs, trains_S0, trains_shift, classes, **kw):
    # Inter-position training (duplicate of the formatted version elsewhere
    # in this file): train LDA on centre position S0 plus part of the
    # shifted data, test on each electrode shift; also score a PLS-canonical
    # (CCA) aligned variant.  Expected kw keys: log_fold, chan_len,
    # action_num, feature_type, pos_list, num.  (Python 2 code.)
    print 'training_lda_TD4_inter.........'
    start_time = time.time()
    log_fold = root_path + '/result/' + kw['log_fold']
    new_fold(log_fold)
    chan_len = kw['chan_len']        # feature columns per electrode position
    action_num = kw['action_num']    # number of motion classes
    print "----training "+kw['feature_type']+" inter, training by position O, testing by electrode shift "
    cv = 5
    results = []
    results.append(['Feat', 'Algorithm','Channel_Pos', 'Accuracy', 'std'])
    log_file = 'feat_'+kw['feature_type']+'_inter'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None,
                          n_components=None, store_covariance=False, tol=0.0001)
    # Samples per action (Py2 integer division).
    data_num = trains_S0.shape[0]/action_num
    # Baseline: cross-validated LDA accuracy on the centre position alone.
    scores = sklearn.cross_validation.cross_val_score(
        clf, trains_S0, classes, cv=cv)
    results.append(['feat_TD4_cv_'+str(cv), 'lda', 'S0', scores.mean(), scores.std()])
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        # Test set: this shifted position's column block, all samples.
        X_test = trains_shift[:,idx*chan_len:idx*chan_len+chan_len]
        y_test = classes
        iteration = cv
        scores = np.zeros((iteration,))
        cca_scores = np.zeros((iteration,))
        iteration -= 1  # scores are filled from index cv-1 downward
        for train_idx, test_idx in kf:
            # NOTE(review): `train_idx*action_idx` multiplies the index array
            # by the action number (0,1,2,...) rather than offsetting it; the
            # intra variant uses (action_idx+1).  Looks unfinished — see the
            # sys.exit(0) below.  Confirm before use.
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                train_idx_all = np.concatenate(
                    (train_idx_all, train_idx*action_idx), axis=0)
                test_idx_all = np.concatenate(
                    (test_idx_all, test_idx*action_idx), axis=0)
            # X_train, y_train = trains_S0[train_idx_all], classes[train_idx_all]
            X_train, y_train = trains_S0, classes
            X_train_shift, y_train_shift = X_test[train_idx_all], classes[train_idx_all]
            X_train_all = np.concatenate( (X_train, X_train_shift), axis=0)
            y_train_all = np.concatenate( (y_train, y_train_shift), axis=0)
            # NOTE(review): debugging residue — aborts the process here, so
            # the scoring and logging below never execute.
            sys.exit(0)
            score_inter = clf.fit(X_train_all, y_train_all).score(X_test, y_test)
            scores[iteration] = score_inter.mean()
            if channel_pos != 'S0':
                # Alternative: load a pre-trained projection instead of
                # fitting one per fold:
                # plsca = joblib.load(transform_fold+'/cca_transform_'+kw['subject']+'_'+channel_pos+'.model')
                plsca = PLSCanonical(n_components=14)
                # Align the shifted position onto S0 in canonical space.
                plsca.fit(X_test[train_idx], X_train)
                X_test_cca, X_train_cca = plsca.transform(X_test, X_train)
                cca_score = clf.fit(X_train_cca, y_train).score(X_test_cca, y_test)
                cca_scores[iteration] = cca_score.mean()
            iteration -= 1
        results.append(['feat_TD4', 'lda', channel_pos, np.mean(scores), np.std(scores)])
        results.append(['feat_TD4', 'lda_cca', channel_pos, np.mean(cca_scores), np.std(cca_scores)])
    log_result(results, log_fold + '/' + log_file + '_' + str(kw['num']), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_' + channel_pos + '_' + str(kw['num'])
    print '----training TD4 time elapsed:', time.time() - start_time
    # Commented-out aggregate reporting kept from the original:
    # mean_shift = 0
    # std_shift = 0
    # for i in range(2, 10):
    #     mean_shift += results[i][4]
    #     std_shift += results[i][5]
    # mean_shift /= 9
    # std_shift /= 9
    # results.append(['feat_TD4','lda(svd;tol=0.0001)', 'Shift_means', '1.0', mean_shift, std_shift])
    # mean_all = 0
    # std_all = 0
    # for i in range(1, 10):
    #     mean_all += results[i][4]
    #     std_all += results[i][5]
    # mean_all /= 9
    # std_all /= 9
def training_lda_TD4_intra(my_clfs, trains, classes, **kw):
    """Intra-subject LDA experiments over electrode positions, comparing four
    training strategies and logging all accuracies:

    1. cross-validation on each single position ("within-position"),
    2. train on centre position S0, test on each shifted position,
    3. train on S0 + one fold of the shifted position ("group training"),
    4. same as 3 but after projecting both positions into a shared
       PLSCanonical (CCA) subspace, swept over several subspace dimensions.

    Parameters
    ----------
    my_clfs : not referenced in this body; presumably kept for interface
        parity with sibling trainers -- confirm against callers
    trains : feature matrix with kw['chan_len'] columns per entry of
        kw['pos_list'], positions laid out side by side (S0 first)
    classes : class labels, one per row of trains
    **kw : expects 'chan_len', 'action_num', 'feature_type', 'pos_list';
        'log_fold' optional (but log_result below uses log_fold
        unconditionally -- NOTE(review): unbound if 'log_fold' is absent)
    """
    start_time = time.time()
    if(kw.has_key('log_fold')):  # Python-2 idiom; 'log_fold' in kw elsewhere
        log_fold = root_path + '/result/' + kw['log_fold']
        new_fold(log_fold)
    chan_len = kw['chan_len']      # feature columns per electrode position
    action_num = kw['action_num']  # number of gesture/action classes
    cv = 3
    results = []
    results.append( ['Feat', 'Algorithm','n_components', 'Channel_Pos', 'Accuracy', 'std'])
    log_file = 'feat_'+kw['feature_type']+'_intra'
    clf = sklearn.lda.LDA(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001)
    # Python-2 integer division: rows per action (assumes rows grouped by action).
    data_num = trains.shape[0]/action_num
    # Baseline: cross-validation over all positions at once.
    scores = sklearn.cross_validation.cross_val_score(clf, trains, classes, cv=cv)
    results.append(['feat_TD4_cv_'+str(cv), 'lda', 'ALL', 0, scores.mean(), scores.std()])
    # Strategy 1: within-position cross-validation, one model per position
    # (9 position groups).
    print '组内训练.............'
    for idx, channel_pos in enumerate(kw['pos_list']):
        # print '----training TD4 intra , channel_pos: ', channel_pos,'......'
        trains_intra = trains[:,idx*chan_len: idx*chan_len+chan_len]
        scores = sklearn.cross_validation.cross_val_score( clf, trains_intra, classes, cv=cv)
        results.append(['feat_TD4_cv_'+str(cv), 'lda', 0, channel_pos, scores.mean(), scores.std()])
    # Strategy 2: centre training -- fit on S0, test on each shifted position.
    print '中心训练策略.............'
    trains_intra_S0 = trains[:,0:chan_len]
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0':
            continue
        tests_shift = trains[:,idx*chan_len: idx*chan_len+chan_len]
        # if channel_pos == 'L2':
        #     print idx*chan_len, idx*chan_len+chan_len, tests_shift.shape, trains.shape
        #     sys.exit(0)
        scores = clf.fit(trains_intra_S0, classes).score(tests_shift, classes)
        results.append(['feat_TD4_cv_'+str(cv), 'lda', 0, 'train S0' + ' test ' + channel_pos, scores.mean(), scores.std()])
    # Strategy 3: group training (distinct from strategy 1) -- train on all
    # of S0 plus a fold of the shifted position, k-fold over the shifted data.
    print '组训练策略.............'
    trains_intra_S0 = trains[:,0:chan_len]
    kf = KFold(data_num, n_folds=cv)
    for idx, channel_pos in enumerate(kw['pos_list']):
        if channel_pos == 'S0':
            continue
        itera = cv
        scores = np.zeros( (itera,) )
        # stds = np.zeros( (itera,) )
        itera -= 1  # score array is filled from the back
        trains_shift = trains[:,idx*chan_len: idx*chan_len+chan_len]
        for train_idx, test_idx in kf:
            train_idx_all = np.array([], np.int)
            test_idx_all = np.array([], np.int)
            for action_idx in range(action_num):
                # NOTE(review): multiplying fold indices by (action_idx+1)
                # scales them rather than offsetting into each action's row
                # band; train_idx + action_idx*data_num was probably
                # intended -- the current form duplicates/overlaps rows.
                train_idx_all = np.concatenate( (train_idx_all, train_idx*(action_idx+1)), axis=0)
                test_idx_all = np.concatenate( (test_idx_all, test_idx*(action_idx+1)), axis=0)
            X_train = np.concatenate( (trains_intra_S0[train_idx_all], trains_shift[train_idx_all]), axis=0)
            y_train = np.concatenate( (classes[train_idx_all], classes[train_idx_all]), axis=0)
            X_test = trains_shift[test_idx_all]
            y_test = classes[test_idx_all]
            # X_test = trains_shift
            # y_test = classes
            score = clf.fit(X_train, y_train).score(X_test, y_test)
            scores[itera] = score.mean()
            itera -= 1
        # print scores
        results.append(['feat_TD4_cv_'+str(cv), 'lda', 0, 'S0 + '+channel_pos, np.mean(scores), np.std(scores)])
    # Strategy 4: CCA-based training, k-fold cross-validated, sweeping the
    # shared-subspace dimensionality.
    print 'CCA训练策略.............'
    trains_S0 = trains[:,0:chan_len]
    n_components_list = [6, 8, 10, 12, 14, 16]  # candidate subspace dimensions
    # n_components_list = [12,14,16]
    kf = KFold(data_num, n_folds=cv)
    for n_components in n_components_list:
        for idx, channel_pos in enumerate(kw['pos_list']):
            if channel_pos == 'S0':
                continue
            itera = cv
            scores = np.zeros( (itera,) )
            stds = np.zeros( (itera,) )
            itera -= 1
            trains_shift = trains[:,idx*chan_len: idx*chan_len+chan_len]
            for train_idx, test_idx in kf:
                train_idx_all = np.array([], np.int)
                test_idx_all = np.array([], np.int)
                for action_idx in range(action_num):
                    # NOTE(review): same suspicious index scaling as in
                    # strategy 3 above -- verify before trusting results.
                    train_idx_all = np.concatenate( (train_idx_all, train_idx*(action_idx+1)), axis=0)
                    test_idx_all = np.concatenate( (test_idx_all, test_idx*(action_idx+1)), axis=0)
                # print train_idx_all.shape, train_idx_all, test_idx_all.shape, test_idx_all
                # plsca.fit(trains_shift[train_idx_all], trains_S0[train_idx_all])
                # NOTE(review): the CCA mapping is fitted on ALL rows,
                # including the test fold -- information leak into the
                # transform; the commented line above fit on the train fold.
                plsca = PLSCanonical(n_components=n_components)
                plsca.fit(trains_shift, trains_S0)
                trains_shift_cca, trains_S0_cca = plsca.transform(trains_shift, trains_S0)
                X_trains = np.concatenate( (trains_S0_cca, trains_shift_cca[train_idx_all]), axis=0)
                y_trains = np.concatenate( (classes, classes[train_idx_all]), axis=0)
                score = clf.fit(X_trains, y_trains).score(trains_shift_cca[test_idx_all], classes[test_idx_all])
                scores[itera] = score.mean()
                # stds[itera] = score.std()
                itera -= 1
            results.append(['feat_TD4_cv_'+str(cv), 'lda_cca', n_components, 'S0 + '+channel_pos, np.mean(scores), np.std(scores)])
    log_result(results, log_fold + '/' + log_file + '_action_1-'+str(action_num), 2)
    print '----Log Fold:', log_fold, ', log_file: ', log_file + '_action_1-'+str(action_num)
    print '----training TD4 time elapsed:', time.time() - start_time
# --- PLS-SVD and PLS mode-A evaluation (script fragment) ---
# NOTE(review): continues an earlier script -- plssvd, dataTrain, Ytrain,
# dataTest, labelsTrain, labelsTest, nClasses, classColors, util, LDA and
# plt are defined above this fragment; not visible here.
xt, yt = plssvd.fit_transform(dataTrain, Ytrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)
u = plssvd.x_weights_
# First x-weight vector and its 90-degree rotation, drawn over the scores.
plt.quiver(u[0,0], u[1,0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
plt.quiver(-u[1,0], u[0,0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)

#%% PLS mode-A
# Sweep the number of canonical components and record classification accuracy.
lda = LDA()
nComponents = np.arange(1, nClasses+1)
# FIX: np.alen() was deprecated in NumPy 1.18 and removed in 1.23;
# len() is the drop-in equivalent for this 1-D array.
plsCanScores = np.zeros((2, len(nComponents)))
for i, n in enumerate(nComponents):
    plscan = PLSCanonical(n_components=n)
    plscan.fit(dataTrain, Ytrain)
    dataTrainT = plscan.transform(dataTrain)
    dataTestT = plscan.transform(dataTest)
    plsCanScores[:,i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)
fig = plt.figure()
util.plotAccuracy(fig, nComponents, plsCanScores)
plt.title('PLS Canonical accuracy', figure=fig)

# Two-component canonical PLS for a 2-D scatter of the latent scores.
plscan = PLSCanonical(n_components=2)
xt, yt = plscan.fit_transform(dataTrain, Ytrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)
u = plscan.x_weights_
plt.quiver(u[0,0], u[1,0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
plt.quiver(-u[1,0], u[0,0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)
# Hold out the second half of each paired dataset for testing.
# NOTE(review): fragment assumes X, Y, X_train, Y_train and n (total sample
# count) were defined earlier in the script -- not visible here.
X_test = X[n // 2:]
Y_test = Y[n // 2:]

# Print the within-view empirical correlation matrices, rounded for display.
print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))

# #############################################################################
# Canonical (symmetric) PLS

# Transform data
# ~~~~~~~~~~~~~~
# Fit a two-component canonical PLS on the training halves, then project
# both splits into the shared latent space.
plsca = PLSCanonical(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On diagonal plot X vs Y scores on each components
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", marker="o", c="b", s=25)
# NOTE(review): the call below continues past the end of this fragment.
plt.scatter(X_test_r[:, 0], Y_test_r[:, 0],
# Split: second half of each view is held out for testing.
# NOTE(review): x, y, x_train and n (total sample count) are defined earlier
# in the script -- not visible in this fragment.
# FIX: use floor division. "n / 2" is a float under Python 3 and raises
# TypeError when used as a slice index; "n // 2" behaves identically under
# Python 2 and matches the sibling example above.
y_train = y[:n // 2]
x_test = x[n // 2:]
y_test = y[n // 2:]

# Print the within-view empirical correlation matrices, rounded for display.
print("corr(x)")
print(np.round(np.corrcoef(x.T), 2))
print("corr(y)")
print(np.round(np.corrcoef(y.T), 2))

#################################################################
# Canonical (symmetric) PLS

# transform the data: fit a two-component canonical PLS on the training
# halves, then project both splits into the shared latent space.
plsca = PLSCanonical(n_components=2)
plsca.fit(x_train, y_train)
x_train_r, y_train_r = plsca.transform(x_train, y_train)
x_test_r, y_test_r = plsca.transform(x_test, y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On diagonal plot x vs y scores on each components
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.plot(x_train_r[:, 0], y_train_r[:, 0], "ob", label="train")
plt.plot(x_test_r[:, 0], y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: x vs y (test corr = %.2f)' %
          np.corrcoef(x_test_r[:, 0], y_test_r[:, 0])[0, 1])
plt.xticks(())
plt.yticks(())