def train_eval(self, train_index, test_index, ignore_eval=False):
    normalized_train, normalized_test = normalize_by_train(
        self.source[train_index], self.source[test_index])
    if self.comp is not None:
        if self.use_scikit is not None:
            if self.use_scikit == 'cca':
                dim_reduction = CCA(n_components=self.comp)
            else:
                dim_reduction = PCA(n_components=self.comp)
            # fit cca according to train data only
            dim_reduction.fit(normalized_train, self.target[train_index])
            # convert source into lower dimensional representation
            normalized_train = dim_reduction.transform(normalized_train)
            normalized_test = dim_reduction.transform(normalized_test)
        else:
            _, wa, _ = tutorial_on_cca(normalized_train, self.target[train_index])
            normalized_train = normalized_train @ wa[:, :self.comp]
            normalized_test = normalized_test @ wa[:, :self.comp]
    model = self.build_model()
    model.fit(normalized_train, self.target[train_index])
    prediction = model.predict(normalized_test)
    # res_df.to_csv(f"{self.out_name}/res1.csv")
    if not ignore_eval:
        return self.evaluate_regression(prediction, test_index)
    return prediction
class CCA_method():
    def __init__(self, n_latents):
        self._n_latents = n_latents
        self._cca = CCA(n_components=n_latents, scale=False,
                        max_iter=10000, tol=1e-8)
        self._Q = np.eye(self._n_latents)

    def fit(self, X, Y):
        # projections U'X, V'Y such that U'X and V'Y are maximally correlated
        self._cca.fit(X, Y)
        # get time-course of projected data
        UX, VY = self._cca.transform(X, Y)
        # learn linear regression VY = UX * Q (Q will be optimal in the least-squares sense)
        self._Q = np.linalg.pinv(UX).dot(VY)

    def predict(self, X):
        # transform source data into latent space
        UX = self._cca.transform(X)
        # predict latent activity in target space
        QUX = UX.dot(self._Q)
        # predict observed activity in target space
        Ypred = QUX.dot(self._cca.y_loadings_.T)
        return Ypred
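# A minimal usage sketch for the CCA_method class above; the data here are
# random and purely illustrative (assumes numpy as np and
# sklearn.cross_decomposition.CCA are already imported).
rng = np.random.default_rng(0)
X = rng.standard_normal((200, 10))    # source view: 200 samples, 10 features
Y = X[:, :4] @ rng.standard_normal((4, 6)) + 0.1 * rng.standard_normal((200, 6))
model = CCA_method(n_latents=3)
model.fit(X, Y)
Y_hat = model.predict(X)              # predicted target-view activity
print(Y_hat.shape)                    # (200, 6)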
def compCorrCoefs(self, learningSet, EEGSignals):
    n_components = 1
    cca = CCA(n_components)

    cca.fit(learningSet[0:3].T, EEGSignals.T)
    U, V = cca.transform(learningSet[0:3].T, EEGSignals.T)
    correlation14 = abs(np.corrcoef(U.T, V.T)[0, 1])

    cca.fit(learningSet[3:6].T, EEGSignals.T)
    U, V = cca.transform(learningSet[3:6].T, EEGSignals.T)
    correlation28 = abs(np.corrcoef(U.T, V.T)[0, 1])

    cca.fit(learningSet[6:9].T, EEGSignals.T)
    U, V = cca.transform(learningSet[6:9].T, EEGSignals.T)
    correlation8 = abs(np.corrcoef(U.T, V.T)[0, 1])

    return correlation14, correlation28, correlation8
def CCA_transform(train_feature, train_label, test_feature, n_components):
    """CCA: Canonical Correlation Analysis."""
    from sklearn.cross_decomposition import CCA
    cca = CCA(n_components).fit(train_feature, train_label)
    train_feature_transformed = cca.transform(train_feature)
    test_feature_transformed = cca.transform(test_feature)
    return train_feature_transformed, test_feature_transformed
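# Illustrative call to CCA_transform above with synthetic data; the label
# matrix shape is an assumption on my part -- CCA needs a 2-D target, so a
# single label column is passed as shape (n_samples, 1).
import numpy as np
rng = np.random.default_rng(1)
train_X = rng.standard_normal((100, 20))
train_y = rng.integers(0, 2, size=(100, 1)).astype(float)   # 2-D target for CCA
test_X = rng.standard_normal((30, 20))
train_t, test_t = CCA_transform(train_X, train_y, test_X, n_components=1)
print(train_t.shape, test_t.shape)   # (100, 1) (30, 1)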
def cca(vocab1, vocab2, cca_model=None, dim=300, max_iter=1000, thre=0.5):
    if not cca_model:
        cca_model = CCA(n_components=dim, max_iter=max_iter)
        try:
            cca_model.fit(vocab1, vocab2)
            cca_vec1, cca_vec2 = cca_model.transform(vocab1, vocab2)
        except Exception:
            print('svd cannot converge, try a smaller dim')
            raise
    else:
        cca_vec1, cca_vec2 = cca_model.transform(vocab1, vocab2)
    comb_cca = thre * cca_vec1 + (1 - thre) * cca_vec2
    return comb_cca, cca_vec1, cca_vec2, cca_model
def CalculaCCA(data):
    samplingRate = 500
    data_filtered = butter_bandpass_filter(data, 4.0, 35.0, samplingRate)
    data_filtered = data  # NOTE: this overwrites the band-pass result and uses the raw data
    data_notch = butter_bandstop_filter(data_filtered, 58.0, 62.0, samplingRate, 4)
    numpyBuffer = np.array(data_notch)
    size = np.shape(data_notch)

    freq1 = getReferenceSignals(size[1], 5, samplingRate)
    freq2 = getReferenceSignals(size[1], 7, samplingRate)
    freq3 = getReferenceSignals(size[1], 9, samplingRate)
    freq4 = getReferenceSignals(size[1], 11, samplingRate)

    cca = CCA(n_components=1)

    cca.fit(numpyBuffer.T, freq1.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq1.T)
    result1 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    cca.fit(numpyBuffer.T, freq2.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq2.T)
    result2 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    cca.fit(numpyBuffer.T, freq3.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq3.T)
    result3 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    cca.fit(numpyBuffer.T, freq4.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq4.T)
    result4 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    result = [abs(result1), abs(result2), abs(result3), abs(result4)]
    ab = max(result, key=float)
    if abs(result1) == ab:
        value = 5
    if abs(result2) == ab:
        value = 7
    if abs(result3) == ab:
        value = 9
    if abs(result4) == ab:
        value = 11
    return value
def compute_mcc(args, config):
    rep1 = pickle.load(
        open(os.path.join(args.checkpoints, 'seed{}'.format(args.seed),
                          'test_representations.p'), 'rb'))['rep']
    rep2 = pickle.load(
        open(os.path.join(args.checkpoints, 'seed{}'.format(args.second_seed),
                          'test_representations.p'), 'rb'))['rep']

    # cutoff = 50 if args.dataset == 'CIFAR100' else 5
    # ii = np.where(res_cond[0]['lab'] < cutoff)[0]      # in sample points to learn from
    # iinot = np.where(res_cond[0]['lab'] >= cutoff)[0]  # out of sample points
    cutoff = 5000  # half the test dataset
    ii = np.arange(cutoff)
    iinot = np.arange(cutoff, 2 * cutoff)

    mcc_strong_out = mean_corr_coef_out_of_sample(x=rep1[ii], y=rep2[ii],
                                                  x_test=rep1[iinot], y_test=rep2[iinot])
    mcc_strong_in = mean_corr_coef(x=rep1[ii], y=rep2[ii])
    pickle.dump({'in': mcc_strong_in, 'out': mcc_strong_out},
                open(os.path.join(args.output,
                                  'mcc_strong_{}_{}.p'.format(args.seed, args.second_seed)), 'wb'))

    cca_dim = 20
    cca = CCA(n_components=cca_dim)
    cca.fit(rep1[ii], rep2[ii])
    res_out = cca.transform(rep1[iinot], rep2[iinot])
    mcc_weak_out = mean_corr_coef(res_out[0], res_out[1])
    res_in = cca.transform(rep1[ii], rep2[ii])
    mcc_weak_in = mean_corr_coef(res_in[0], res_in[1])
    pickle.dump({'in': mcc_weak_in, 'out': mcc_weak_out},
                open(os.path.join(args.output,
                                  'mcc_weak_{}_{}.p'.format(args.seed, args.second_seed)), 'wb'))
def compute_mcc(rep1, rep2, weak=False, cca_dim=None, return_cca_outputs=False):
    assert rep1.shape == rep2.shape
    cutoff = rep1.shape[0] // 2
    ii = np.arange(cutoff)
    iinot = np.arange(cutoff, 2 * cutoff)

    # in-sample and out-of-sample MCC
    mcc_strong_out = mean_corr_coef_out_of_sample(x=rep1[ii], y=rep2[ii],
                                                  x_test=rep1[iinot], y_test=rep2[iinot])
    mcc_strong_in = mean_corr_coef(x=rep1[ii], y=rep2[ii])
    print("MCC strong: in {:.4f}, out {:.4f}.".format(mcc_strong_in, mcc_strong_out))

    # In- and out-of-sample MCC after applying CCA.
    # NB: if the number of samples is too small relative to the embedding size,
    # this does not work correctly!
    if weak:
        if cca_dim is None:
            cca_dim = rep1.shape[1]
        cca = CCA(n_components=cca_dim)
        # raises an error if shape[1] of the tensors is smaller than cca_dim
        cca.fit(rep1[ii], rep2[ii])
        res_out = cca.transform(rep1[iinot], rep2[iinot])
        # mean_corr_coef still searches for the best permutation, which is redundant
        # after CCA; in practice the diagonal (best) elements are selected anyway.
        mcc_weak_out = mean_corr_coef(res_out[0], res_out[1])
        res_in = cca.transform(rep1[ii], rep2[ii])
        mcc_weak_in = mean_corr_coef(res_in[0], res_in[1])
        print("MCC weak: in {:.4f}, out {:.4f}.".format(mcc_weak_in, mcc_weak_out))
        if return_cca_outputs:
            return (mcc_strong_in, mcc_strong_out, mcc_weak_in, mcc_weak_out,
                    res_out[0], res_out[1])
        return mcc_strong_in, mcc_strong_out, mcc_weak_in, mcc_weak_out

    return mcc_strong_in, mcc_strong_out
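# A quick smoke test for compute_mcc above with random representations; purely
# illustrative (assumes numpy as np, mean_corr_coef, mean_corr_coef_out_of_sample
# and the sklearn CCA import are available in scope).
rng = np.random.default_rng(0)
rep_a = rng.standard_normal((400, 8))
rep_b = rep_a @ rng.standard_normal((8, 8))   # a linear mix of rep_a
scores = compute_mcc(rep_a, rep_b, weak=True, cca_dim=4)
print(scores)   # (mcc_strong_in, mcc_strong_out, mcc_weak_in, mcc_weak_out)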
def cca_classify(X_eeg_signals, Yi_frequency_signals):
    cca = CCA(1)
    corr_results = []
    for fr in range(0, Yi_frequency_signals.shape[0]):
        X = X_eeg_signals
        Yi = Yi_frequency_signals[fr, :, :]
        # compute the correlation between X and Yi
        cca.fit(X.T, np.squeeze(Yi).T)
        X_train_r, Yi_train_r = cca.transform(X.T, np.squeeze(Yi).T)
        corr = np.corrcoef(X_train_r[:, 0], Yi_train_r[:, 0])[0, 1]
        # collect the correlation of X with each Yi
        corr_results.append(corr)
    if corr_results[np.argmax(corr_results)] > 0.50:  # decision threshold
        global index
        global all_data
        classify_result = np.argmax(corr_results) + 1
        print(corr_results)
        index += 1
        # save the data
        TT = pd.DataFrame(X_eeg_signals)
        all_data = all_data.append(np.transpose(TT[1:9]))
        if index == 50:
            # write the accumulated data to disk
            all_data = pd.DataFrame(all_data)
            all_data.to_csv('./j_8_all_data.csv', index=False)
        return classify_result
    else:
        return -1
def main(args):
    (training_file, label_file, test_file, u_file, e, c, output_file, components) = args
    X_training = load_feat(training_file)
    n = len(X_training)
    U = load_feat(u_file)
    y_training = [float(line.strip()) for line in open(label_file)]
    U = np.asarray(U)
    X_training = np.asarray(X_training)
    # X = preprocessing.normalize(X, norm='l2')
    y_training = np.asarray(y_training)

    X_test = load_feat(test_file)
    y_test = [float(line.strip()) for line in open(test_label)]  # NOTE: test_label must be defined in the enclosing scope
    X_test = np.asarray(X_test)
    X_test[np.isnan(X_test)] = 0.0
    # test_X = preprocessing.normalize(test_X, norm='l2')
    y_test = np.asarray(y_test)

    s = min(len(X_training), len(U))
    cca = CCA(n_components=components, max_iter=50)
    X_cca, U_cca = cca.fit_transform(X_training[:s], U[:s])
    X_test_cca = cca.transform(X_test)

    svr = SVR(C=c, epsilon=e, kernel='rbf')
    svr.fit(X_cca, y_training[:s])
    pred = svr.predict(X_test_cca)
    with open(output_file, 'w') as output:
        for p in pred:
            output.write('{}\n'.format(p))
    return
def map_spaces(self, algo, src_mapped_embed=None, trg_mapped_embed=None):
    # (There may be duplicates in self.shared_vocab_src and/or self.shared_vocab_trg;
    # swap_vocab can be used to only inspect one-to-one translations.)
    src_embed = self.model_src[self.shared_vocab_src]
    trg_embed = self.model_trg[self.shared_vocab_trg]
    os.makedirs(algo, exist_ok=True)

    if algo == "procrustes":
        logging.info(
            "Calculating Rotation Matrix (Procrustes Problem) and applying it to first embedding")
        # ortho, _ = orthogonal_procrustes(src_embed, trg_embed)  # does the same as:
        u, _, vt = np.linalg.svd(trg_embed.T.dot(src_embed))
        w = vt.T.dot(u.T)
        self.model_src.vectors.dot(w, out=self.model_src.vectors)
    elif algo == "noise":
        logging.info(
            "Calculating Rotation Matrix with noise aware algorithm and applying it to first embedding")
        transform_matrix, alpha, clean_indices, noisy_indices = noise_aware(src_embed, trg_embed)
        # write cleaned vocab to file
        with open("vocab.clean.txt", 'w') as v:
            for src, trg in np.asarray(self.shared_vocab)[clean_indices]:
                v.write("{}\t{}\n".format(src, trg))
        self.model_src.vectors.dot(transform_matrix, out=self.model_src.vectors)
        logging.info("Percentage of clean indices: {}".format(alpha))
    elif algo == "cca":
        logging.info(
            "Calculating Mapping based on CCA and applying it to both embeddings")
        cca = CCA(n_components=100, max_iter=5000)
        cca.fit(src_embed, trg_embed)
        self.model_src.vectors, self.model_trg.vectors = cca.transform(
            self.model_src.vectors, self.model_trg.vectors)
    elif algo == "gcca":
        logging.info(
            "Calculating Mapping based on GCCA and applying it to both embeddings")
        gcca = GCCA()
        gcca.fit([src_embed, trg_embed])
        transform_l = gcca.transform_as_list((self.model_src.vectors, self.model_trg.vectors))
        # GCCA computes positive and negative correlations (eigenvalues), sorted in
        # ascending order; we are only interested in the positive portion.
        self.model_src.vectors = transform_l[0][:, 100:]
        self.model_trg.vectors = transform_l[1][:, 100:]

    # save transformed model(s)
    if src_mapped_embed:
        self.model_src.save(os.path.join(algo, src_mapped_embed))
    if trg_mapped_embed:
        self.model_trg.save(os.path.join(algo, trg_mapped_embed))
def find_correlation_cca_method1(signal, reference_signals, n_components=2):
    r"""Perform canonical correlation analysis (CCA).

    Reference:
        https://github.com/aaravindravi/Brain-computer-interfaces/blob/master/notebook_12_class_cca.ipynb

    Args:
        signal : ndarray, shape (channel, time)
            Input signal in the time domain.
        reference_signals : ndarray, shape (len(flick_freq), 2*num_harmonics, time)
            Sinusoidal reference templates corresponding to the flicker
            frequencies used for SSVEP classification.
        n_components : int, default: 2
            Number of components to keep (for sklearn.cross_decomposition.CCA).

    Returns:
        result : array, size: len(flick_freq)
            Correlation score for each reference signal.

    Dependencies:
        CCA : sklearn.cross_decomposition.CCA
        np : numpy package
    """
    cca = CCA(n_components)
    corr = np.zeros(n_components)
    result = np.zeros(reference_signals.shape[0])
    for freq_idx in range(0, reference_signals.shape[0]):
        cca_x = signal.T
        cca_y = np.squeeze(reference_signals[freq_idx, :, :]).T
        cca.fit(cca_x, cca_y)
        a, b = cca.transform(cca_x, cca_y)
        for ind_val in range(0, n_components):
            corr[ind_val] = np.corrcoef(a[:, ind_val], b[:, ind_val])[0, 1]
        result[freq_idx] = np.max(corr)
    return result
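# A sketch of how the reference_signals argument above can be built: sine and
# cosine templates at each flicker frequency and its harmonics. Shapes follow
# the docstring; the helper name and the example frequencies are illustrative,
# not part of the original code.
import numpy as np

def make_reference_signals(freqs, num_harmonics, num_samples, sampling_rate):
    t = np.arange(num_samples) / sampling_rate
    refs = np.zeros((len(freqs), 2 * num_harmonics, num_samples))
    for f_idx, f in enumerate(freqs):
        for h in range(1, num_harmonics + 1):
            refs[f_idx, 2 * (h - 1)] = np.sin(2 * np.pi * h * f * t)
            refs[f_idx, 2 * (h - 1) + 1] = np.cos(2 * np.pi * h * f * t)
    return refs

# e.g. templates for 10/12/15 Hz targets, 2 harmonics, 2 s at 250 Hz
templates = make_reference_signals([10, 12, 15], num_harmonics=2,
                                   num_samples=500, sampling_rate=250)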
def CCA_project_vectors(args, src_dico, tgt_dico, src_full, tgt_full,
                        src_train, tgt_train, NUM_dim=100):
    print('Exporting embeddings...')
    OutputDir = "output/{}-{}/".format(args.src_lang, args.tgt_lang)
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)

    cca = CCA(n_components=NUM_dim)
    print("Fitting...")
    cca.fit(src_train, tgt_train)
    print(cca.get_params())
    X_c, Y_c = cca.transform(src_full, tgt_full)
    src_out, tgt_out = utils.norm_embeddings(X_c), utils.norm_embeddings(Y_c)

    print("Exporting embeddings...")
    utils.export_embeddings(src_dico[0], src_out,
                            OutputDir + 'projected.{}'.format(args.src_lang))
    utils.export_embeddings(tgt_dico[0], tgt_out,
                            OutputDir + 'projected.{}'.format(args.tgt_lang))
    print("work over!")
def visualize_with_cca(X, y, title):
    cca = CCA(n_components=2)
    cca.fit(X, y)
    X_cca = cca.transform(X)
    Xax = X_cca[:, 0]
    Yax = X_cca[:, 1]
    labels = (y > 0).astype(int)

    cdict = {0: 'red', 1: 'green'}
    labl = {0: 'home_loss', 1: 'home_win'}
    marker = {0: '*', 1: 'o'}
    alpha = {0: .3, 1: .5}

    fig, ax = plt.subplots(figsize=(7, 5))
    fig.patch.set_facecolor('white')
    for l in np.unique(labels):
        ix = np.where(labels == l)
        ax.scatter(Xax[ix], Yax[ix], c=cdict[l], s=40, label=labl[l],
                   marker=marker[l], alpha=alpha[l])
    plt.xlabel("First Canonical Component", fontsize=14)
    plt.ylabel("Second Canonical Component", fontsize=14)
    plt.legend()
    plt.title(title)
    plt.show()
class CCAFusion(TransformerMixin, BaseEstimator):
    def __init__(self, c1, c2):
        self.pipes = [c1, c2]
        self.max_iter = 500
        self.cca = None

    def fit(self, X, y=None, **fit_params):
        C = []
        n_components = None
        for pipe in self.pipes:
            c = pipe.fit_transform(X, y)
            if hasattr(c, 'toarray'):
                c = c.toarray()
            if n_components is None:
                n_components = c.shape[1]
            else:
                n_components = min(c.shape[1], n_components)
            C += [c]
        self.cca = CCA(n_components=n_components, max_iter=self.max_iter)
        self.cca.fit(*C)
        return self

    def transform(self, X, y=None):
        C = []
        for pipe in self.pipes:
            c = pipe.transform(X)  # sklearn transformers take only X at transform time
            if hasattr(c, 'toarray'):
                c = c.toarray()
            C += [c]
        return self.cca.transform(*C)[0]

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y, **fit_params).transform(X)
def cca(m1, m2, preprocessing=None):
    """
    Use CCA to decompose two views and plot the result.

    Params:
        m1, m2: Every column is an example, every row a feature.
        preprocessing: If None, no pre-processing is done; if 'orth',
            the data are centered to zero mean before CCA.
    """
    # Adjust means to be 0.
    if preprocessing == "orth":
        m1 -= np.mean(m1, axis=1, keepdims=True)
        m2 -= np.mean(m2, axis=1, keepdims=True)

    cca = CCA(n_components=3, max_iter=100)
    cca.fit(m1.T, m2.T)
    X_c = cca.transform(m1.T)

    fig, ax = plt.subplots()
    ax.set_title('Fig.2.(c)')
    ax.set_prop_cycle('color', ['blue', 'red', 'green'])
    ax.plot(X_c)
    # ax.plot(Y_c)
    plt.show()
def fit_cca(self, outfile=''):
    # Fits a linear CCA constraint and replaces the pretrained name embeddings
    # with the CCA-transformed embeddings.
    self.load_embeddings()
    self.extract_pretrained_prototype_embeddings()

    items, vectors = zip(*[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
                           if k in self.exemplar_to_concept])
    concept_embs = Reach(vectors, items)

    train_vectors = []
    for x in items:
        train_vectors.append(self.train_embeddings[x])
    train_vectors = Reach.normalize(train_vectors)

    cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
    cca.fit(train_vectors, concept_embs.norm_vectors)

    # transform all name embeddings using the CCA mapping
    all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
    items = [x for _, x in sorted(all_name_embeddings.indices.items())]
    projected_name_embeddings = cca.transform(all_name_embeddings.norm_vectors)
    new_name_embeddings = Reach(projected_name_embeddings, items)

    self.pretrained_name_embeddings = new_name_embeddings
    self.load_embeddings()

    if outfile:
        with open('{}_cca.p'.format(outfile), 'wb') as f:
            pickle.dump(cca, f)
def project_vectors(origForeignVecFile, origEnVecFile, subsetEnVecFile,
                    subsetForeignVecFile, outputEnFile, outputForeignFile, NUMCC=40):
    '''
    Feed the dictionary vectors into CCA to learn the projection, then produce
    the projected bilingual vectors.

    :param origForeignVecFile: foreign-language vector matrix
    :param origEnVecFile: English vector matrix
    :param subsetEnVecFile: English vectors that appear in the dictionary
    :param subsetForeignVecFile: foreign vectors that appear in the dictionary
    :param outputEnFile: projected English word vectors
    :param outputForeignFile: projected foreign word vectors
    :param NUMCC: number of canonical components to keep
    '''
    # Load the data; strip the leading word so only the vector values remain.
    tmp = np.loadtxt(origEnVecFile, dtype=str, delimiter=' ')
    origEnVecs = tmp[:, 1:].astype(float)
    tmp2 = np.loadtxt(origForeignVecFile, dtype=str, delimiter=' ')
    origForeignVecs = tmp2[:, 1:].astype(float)
    tmp3 = np.loadtxt(subsetEnVecFile, dtype=str, delimiter=' ')
    subsetEnVecs = tmp3[:, 1:].astype(float)
    tmp4 = np.loadtxt(subsetForeignVecFile, dtype=str, delimiter=' ')
    subsetForeignVecs = tmp4[:, 1:].astype(float)

    # Preprocessing: normalize each row.
    # origEnVecs = preprocessing.normalize(origEnVecs)
    # origForeignVecs = preprocessing.normalize(origForeignVecs)
    subsetEnVecs = preprocessing.normalize(subsetEnVecs)
    subsetForeignVecs = preprocessing.normalize(subsetForeignVecs)

    # Train the CCA mapping.
    cca = CCA(n_components=NUMCC)
    cca.fit(subsetEnVecs, subsetForeignVecs)
    print(cca.get_params())
    X_c, Y_c = cca.transform(origEnVecs, origForeignVecs)

    # Produce the projected vectors and prepend the original words again.
    origEnVecsProjected = preprocessing.normalize(X_c)
    origEnVecsProjected = np.column_stack((tmp[:, :1], origEnVecsProjected.astype(str)))
    origForeignVecsProjected = preprocessing.normalize(Y_c)
    origForeignVecsProjected = np.column_stack((tmp2[:, :1], origForeignVecsProjected.astype(str)))

    np.savetxt(outputEnFile, origEnVecsProjected, fmt="%s", delimiter=' ')
    np.savetxt(outputForeignFile, origForeignVecsProjected, fmt="%s", delimiter=' ')
    print("work over!")
def cca_analysis(X, Y, X_dev, Y_dev):
    cca = CCA(n_components=1, max_iter=2000)
    cca.fit(X, Y)
    X_dev_c, Y_dev_c = cca.transform(X_dev, Y_dev)
    corrcoef = np.corrcoef(X_dev_c.T, Y_dev_c.T)[0, 1]
    return corrcoef
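# Illustrative call to cca_analysis above on a random train/dev split; the data
# and the split are assumptions for demonstration only (assumes numpy as np and
# the sklearn CCA import are in scope).
rng = np.random.default_rng(2)
X_all = rng.standard_normal((300, 5))
Y_all = X_all @ rng.standard_normal((5, 4)) + 0.5 * rng.standard_normal((300, 4))
dev_corr = cca_analysis(X_all[:200], Y_all[:200], X_all[200:], Y_all[200:])
print("dev canonical correlation:", dev_corr)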
def cca_d_h(d_var, h_var, components_num):
    cca = CCA(n_components=components_num, scale=True, max_iter=2000)
    cca.fit(d_var, h_var)
    d_c, h_c = cca.transform(d_var, h_var)
    # least-squares maps from the original variables to their canonical scores
    ah = np.linalg.inv((h_var.T).dot(h_var)).dot(h_var.T).dot(h_c)
    ad = np.linalg.inv((d_var.T).dot(d_var)).dot(d_var.T).dot(d_c)
    return d_c, h_c, ad, ah
def _CCA(data, graph, n):
    cca = CCA(n_components=n)
    adjacencyMatrix = createAffinityMatrix(graph)
    cca.fit(data, adjacencyMatrix)
    X_c, Y_c = cca.transform(data, adjacencyMatrix)
    writeCSV(X_c, 'CCA_X')
    writeCSV(Y_c, 'CCA_Y')
def cca_score(X, Y):
    # Calculate the CCA score (correlation) of the first component pair
    ca = CCA(n_components=1)
    ca.fit(X, Y)
    Xc, Yc = ca.transform(X, Y)
    score = np.corrcoef(Xc[:, 0], Yc[:, 0])
    return score[0][1]
def mean_canonical_correlations(scaled_features, df):
    cca = CCA(1)
    cca.fit(scaled_features, df.iloc[:, -1])
    X_c, Y_c = cca.transform(scaled_features, df.iloc[:, -1])
    # correlation between the paired canonical variates
    # (np.mean(X_c) would just be the mean canonical score, not a correlation)
    mean_canonical_correlation = np.corrcoef(X_c[:, 0], Y_c.ravel())[0, 1]
    return mean_canonical_correlation
def CCA_analysis(self, recordings_index, trial_index, brain_area):
    path = self.all_data_path + '/' + self.selected_recordings[recordings_index]

    # Prepare rates
    rates = self.convert_one_population_to_rates(recordings_index, trial_index, brain_area).T

    # Prepare behavior
    trials = np.load(path + '/' + 'trials.intervals.npy')
    # Behavioral data
    mot_timestamps = np.load(path + '/' + 'face.timestamps.npy')
    mot_energy = np.load(path + '/' + 'face.motionEnergy.npy')

    beh_range = np.bitwise_and(mot_timestamps[:, 1] >= trials[trial_index][0],
                               mot_timestamps[:, 1] <= trials[trial_index][1])
    beh_subset = mot_energy[beh_range]
    beh_subset_aligned = self.align_rate_and_behavior(beh_subset, rates[:, 0]).reshape(-1, 1)

    from sklearn.cross_decomposition import CCA
    cca = CCA(n_components=2)
    cca.fit(rates, beh_subset_aligned)
    X_train_r, Y_train_r = cca.transform(rates, beh_subset_aligned)
    print(X_train_r.shape)
    print(Y_train_r.shape)

    plt.scatter(X_train_r[:, 0], Y_train_r[:], label="train", marker="*", c="b", s=50)
    plt.show()
    plt.scatter(X_train_r[:, 1], Y_train_r[:], label="train", marker="*", c="b", s=50)
    plt.show()

    # rates_test = self.convert_one_population_to_rates(recordings_index, 2, brain_area).T
    # X_test_r, Y_test_r = cca.transform(rates_test, beh_subset_aligned)
    # plt.scatter(X_test_r[:, 0], Y_test_r[:], label="test", marker="^", c="b", s=50)
    # plt.show()

    print(beh_subset_aligned.shape)
    print(rates.shape)
def solve(self):
    v1, v2 = self.list_view
    clf = CCA(n_components=self.m_rank)
    clf = clf.fit(v1.T, v2.T)
    self.model = clf
    X_c, Y_c = clf.transform(v1.T, v2.T)
    self.list_projection = [X_c, Y_c]
    self.list_U = [clf.x_rotations_, clf.y_rotations_]
def cca(src_dict, tgt_dict, bi_dict, dim=250):
    # with open('../data/seed_embedding.dat', 'wb') as f:
    #     pickle.dump(x, f)
    #     pickle.dump(y, f)
    cca_model = CCA(n_components=dim)
    src_mat, tgt_mat = make_training_matrices(src_dict, tgt_dict, bi_dict)
    cca_model.fit(src_mat, tgt_mat)
    return cca_model.transform(src_dict.embed, tgt_dict.embed)
def fit_cal(self, F):
    # PLS on the master calibration data
    PD_m = PlsDemo(self.x_m_cal, self.y_cal, self.max_folds, self.max_components)
    W_m, T_m, P_m, coefs_B_m, RMSECV_m, min_RMSECV_m, comp_best = PD_m.pls_fit(F)
    # print("comp_best =", comp_best)

    cca_m = CCA(comp_best)
    cca_m.fit(self.x_m_cal, self.y_cal)
    X_score, Y_score = cca_m.transform(self.x_m_cal, self.y_cal)
    W_m = cca_m.x_rotations_
    x_m_cal_mean = np.mean(self.x_m_cal, axis=0)
    x_m_cal_center = np.subtract(self.x_m_cal, x_m_cal_mean)
    L_m = np.dot(x_m_cal_center, W_m)

    cca_s = CCA(comp_best)
    cca_s.fit(self.x_s_std, self.y_std)
    X_score, Y_score = cca_s.transform(self.x_s_std, self.y_std)
    W_s = cca_s.x_rotations_
    x_s_std_mean = np.mean(self.x_s_std, axis=0)
    x_s_std_center = np.subtract(self.x_s_std, x_s_std_mean)
    L_s = np.dot(x_s_std_center, W_s)

    F_1 = np.linalg.lstsq(L_s, L_m)[0]
    F_2 = np.linalg.lstsq(L_m, self.x_m_std)[0]
    coefficient = np.dot(np.dot(np.dot(W_s, F_1), F_2), coefs_B_m)

    # RMSEC on the slave calibration data
    xs_cal_center = np.subtract(self.x_s_cal, self.x_s_cal.mean(axis=0))
    y_predict = np.dot(xs_cal_center, coefficient) + self.y_cal.mean(axis=0)
    RMSEC = np.sqrt(np.sum(np.square(np.subtract(y_predict, self.y_cal)), axis=0) / self.y_cal.shape[0])

    return coefficient, comp_best, RMSEC
def cca_feature(self, data, parameter_list):
    cca = CCA(1)
    result = []
    for i in range(parameter_list[-1]):
        reference_signals = self.reference_signals(parameter_list[1][i],
                                                   parameter_list[2],
                                                   parameter_list[3])
        cca.fit(data.T, reference_signals.T)
        x, y = cca.transform(data.T, np.squeeze(reference_signals).T)
        corr = np.corrcoef(x[:, 0], y[:, 0])[0, 1]
        result.append(corr)
    return result
def predict(self):
    if self.k_CCA is None:
        if self.verbose:
            print('Going to compute best components first')
        self.determine_CCA_components()
    # self.cca_predictions, _ = self.ccaCV.predict(self.features, self.ccaCV.ws)
    cca = CCA(n_components=self.k_CCA)
    cca.fit(self.features[:6000], self.graph[:6000])
    self.cca_predictions = cca.transform(self.features)
    if self.verbose:
        print('Produced predictions')
        print('Size of predictions {}'.format(self.cca_predictions.shape))
def load_mutation_data():
    if os.path.isfile(mutation_pickle_path):
        pickle_load = pickle.load(open(mutation_pickle_path, 'rb'))
        return pickle_load[0], pickle_load[1]

    gene_effect_df = pd.read_csv(
        r"C:\Users\Nitay\Documents\courses\roded-seminar\Achilles_gene_dependency.csv")
    mutations_df = pd.read_csv(
        r"C:\Users\Nitay\Documents\courses\roded-seminar\CCLE_mutations.csv")
    mutations_df = mutations_df[mutations_df["isDeleterious"].fillna(False)]

    gene_effect_df = gene_effect_df.set_index("Unnamed: 0").T
    gene_effect_df.columns.names = ["cell_line"]
    gene_effect_df.index.names = ["gene"]

    def clean_gene_name(name):
        return name.split("(")[0].strip()

    clean_gene_effect_df = gene_effect_df.rename(index=clean_gene_name)
    common_genes = set(clean_gene_effect_df.index).intersection(set(mutations_df['Hugo_Symbol']))
    mutations_cell_line = set(mutations_df['DepMap_ID'])

    new_mutations_df = pd.DataFrame(np.zeros((len(common_genes), len(mutations_cell_line))),
                                    columns=mutations_cell_line, index=common_genes)
    for i, row in mutations_df.iterrows():
        cell_line = row["DepMap_ID"]
        gene = row['Hugo_Symbol']
        if gene in common_genes and cell_line in mutations_cell_line:
            new_mutations_df.loc[gene, cell_line] = 1

    filtered_gene_effect_df = clean_gene_effect_df.filter(items=common_genes, axis=0)
    filtered_mutations_df = new_mutations_df.loc[new_mutations_df.sum(1) > 0,
                                                 new_mutations_df.sum(0) > 0]

    from sklearn.cross_decomposition import CCA
    Y = filtered_gene_effect_df.values
    X = filtered_mutations_df.values
    cca = CCA(n_components=10)
    cca.fit(X, Y)
    X_c = cca.transform(X)
    filtered_mutations_df = pd.DataFrame(X_c)

    pickle.dump([filtered_gene_effect_df, filtered_mutations_df],
                open(mutation_pickle_path, "wb"))
    return filtered_gene_effect_df, filtered_mutations_df
def canonical_approach():
    from sklearn.cross_decomposition import CCA
    (X, Y), cities = pull_xy_data()
    cca = CCA(n_components=2)
    cca.fit(X, Y)
    ccaX, ccaY = cca.transform(X, Y)
    plot(ccaX, cities, ["CC01", "CC02", "CC03"], 1)
    return "OK What Now?"
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):
    # TODO: strict input checks and exceptions to avoid crashes and processing errors

    # Pre-allocate the SSVEP reference-signal matrix to be compared with the EEG recordings using CCA
    number_time_points = input_data.shape[1]
    number_harmonics = 2
    cca_base_signal_matrix = [[] for _ in compared_frequencies]

    # Pre-allocate output: one correlation coefficient (rho) per target SSVEP frequency
    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    # For each target frequency, fill the Y matrix with sine and cosine signals for every harmonic
    for loop_frequencies in range(len(compared_frequencies)):

        # For the current SSVEP frequency, pre-allocate the harmonics matrix
        cca_base_signal_matrix[loop_frequencies] = numpy.zeros([number_harmonics * 2, number_time_points])
        time_points_count = numpy.arange(number_time_points, dtype='float')
        time_points_count = time_points_count / sampling_rate

        # Generate sine and cosine reference signals for every harmonic
        for loop_harmonics in range(number_harmonics):
            # Compute the reference signals for the current harmonic
            base_constant = 2 * numpy.pi * (loop_harmonics + 1) * compared_frequencies[loop_frequencies]
            base_sine_signal = numpy.sin(base_constant * time_points_count)
            base_cosine_signal = numpy.cos(base_constant * time_points_count)

            # Copy the signals back into the reference matrix
            base_position = loop_harmonics + 1
            sine_position = 2 * (base_position - 1) + 1
            cosine_position = 2 * base_position
            cca_base_signal_matrix[loop_frequencies][sine_position - 1, :] = base_sine_signal
            cca_base_signal_matrix[loop_frequencies][cosine_position - 1, :] = base_cosine_signal

        # Extract the Y matrix for the current SSVEP frequency
        y_matrix = cca_base_signal_matrix[loop_frequencies]

        # Create a CCA object and compute the correlation score
        # (fit, transform and score all need samples along the first axis, hence the transposes)
        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(numpy.transpose(input_data), numpy.transpose(y_matrix))
        values_x, values_y = cca_object.transform(numpy.transpose(input_data), numpy.transpose(y_matrix))
        # Note: CCA.score returns the R^2 of the prediction, not the canonical correlation (rho)
        cca_rho_values[0, loop_frequencies] = cca_object.score(numpy.transpose(input_data),
                                                               numpy.transpose(y_matrix))

    return cca_rho_values
def mainExec(name_file1, name_file2, features1, features2):
    '''
    Given two files with names and two files with features, perform the Stacked
    Auxiliary Embedding method on two matrices. The first matrix is the
    concatenation of both feature lists; the second contains tf-idf weighted
    representations of the training sentences of Flickr30kEntities. The
    intermediate CCA model is written to disk, as well as the final model.

    :param name_file1:
    :param name_file2:
    :param features1:
    :param features2:
    '''
    print("Creating vocabulary")
    voc = readVocabulary()
    print("Generating document vectors")
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print("Weighing vectors")
    weightedVectors = weight_tfidf(occurrenceVectors, idf)
    print("Creating feature dictionary")
    featuresDict = createFeatDict(weightedVectors.keys(), name_file1, name_file2,
                                  features1, features2)

    imagematrix, sentenceMatrix = createSnippetMatrices(featuresDict, weightedVectors)

    print("Modelling CCA")
    cca = CCA(n_components=128)
    cca = fitCCA(cca, imagematrix, sentenceMatrix, "ccasnippetmodel.p")

    trainingimages, trainingsentences = createTrainMatrices(voc)
    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)

    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)
    print("NN Image: " + str(nn_img))
    print("NN Sentence: " + str(nn_sent))

    augmented_imgs, augmented_sentences = augmentMatrices(nn_img, nn_sent,
                                                          trainingimages, trainingsentences,
                                                          trans_img, trans_sent)

    print("Fitting augmented CCA model")
    augmentedcca = CCA(n_components=96)
    augmentedcca = fitCCA(augmentedcca, augmented_imgs, augmented_sentences, "augmentedcca.p")

    print("Writing the model to disk")
    resultingModel = StackedCCAModel(nn_img, nn_sent, cca, augmentedcca)
    pickle.dump(resultingModel, open("completestackedCCAModel.p", 'w+'))
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression with a univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

# #############################################################################
# CCA (PLS mode B with symmetric deflation)
cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
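# X_train / Y_train / X_test / Y_test are not defined in this excerpt; a minimal
# stand-in setup (an assumption, mirroring the two-view latent-variable data used
# in scikit-learn's cross-decomposition example) could look like this:
import numpy as np
n = 500
l1, l2 = np.random.normal(size=n), np.random.normal(size=n)
latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=(n, 4))
Y = latents + np.random.normal(size=(n, 4))
X_train, Y_train = X[: n // 2], Y[: n // 2]
X_test, Y_test = X[n // 2:], Y[n // 2:]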
def mainExec(name_file, features):
    '''
    Based on a list of image names and image features, learn a CCA model based on
    Stacked Auxiliary Embedding and save this model to disk.

    :param name_file:
    :param features:
    :return:
    '''
    print("Creating vocabulary")
    voc = readVocabulary()
    print("Generating document vectors")
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print("Weighing vectors")
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    sentenceMatrix = []
    imagematrix = []
    print("Creating matrices")
    currentSentence = 0
    for i in weightedVectors.keys():
        if isLargeEnough(i):
            currentSentence += 1
            print("current sentence: " + str(currentSentence))
            for j in range(len(weightedVectors[i])):
                weightedVectors[i][j] = float(weightedVectors[i][j])
            if currentSentence == 1:
                sentenceMatrix = weightedVectors[i]
                imagematrix = getImage(i, name_file, features)
            elif currentSentence == 2:
                sentenceMatrix = np.concatenate(([sentenceMatrix], [weightedVectors[i]]), axis=0)
                imagematrix = np.concatenate(([imagematrix], [getImage(i, name_file, features)]), axis=0)
            else:
                sentenceMatrix = np.concatenate((sentenceMatrix, [weightedVectors[i]]), axis=0)
                imagematrix = np.concatenate((imagematrix, [getImage(i, name_file, features)]), axis=0)

    print("Modelling CCA")
    cca = CCA(n_components=128)
    cca.fit(sentenceMatrix, imagematrix)
    pickle.dump(cca, open("ccasnippetmodel.p", 'w+'))

    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []
    dp = getDataProvider('flickr30k')
    currentPair = 0
    for pair in dp.sampleImageSentencePair():
        currentPair += 1
        if currentPair % 100 == 0:
            print("Current pair: " + str(currentPair))
        img = pair['image']['feat']
        trainingimages.append(img)
        sentence = getFullSentence(pair)
        for i in range(len(sentence)):
            if sentence[i] > 0:
                idf[i] += 1
        trainingsentences.append(sentence)
    for i in range(len(trainingsentences)):
        trainingsentences[i] = trainingsentences[i] * idf

    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)

    augmented_imgs = []
    augmented_sentences = []
    for i in range(len(trans_img)):
        # list.extend() returns None, so build the augmented vector by concatenation instead
        augm_img = list(trainingimages[i]) + phi(3000, nn_img, trans_img[i])
        augmented_imgs.append(augm_img)
    for i in range(len(trans_sent)):
        augm_sent = list(trainingsentences[i]) + phi(3000, nn_sent, trans_sent[i])
        augmented_sentences.append(augm_sent)

    augmentedcca = CCA(n_components=96)
    augmentedcca.fit(augmented_sentences, augmented_imgs)
    pickle.dump(augmentedcca, open("augmentedcca.p", 'w+'))
os.makedirs(dir_name)

OutputLog().set_path(dir_name)
OutputLog().set_verbosity(configuration.output_parameters['verbosity'])

data_config = ConfigParser.ConfigParser()
data_config.read(data_set_config)
data_parameters = ConfigSectionMap("dataset_parameters", data_config)

# construct data set
data_set = Container().create(data_parameters['name'], data_parameters)

cca_model = CCA(n_components=top, scale=True, copy=False)
train_transformed_x, train_transformed_y = cca_model.fit_transform(data_set.trainset[0],
                                                                    data_set.trainset[1])
test_transformed_x, test_transformed_y = cca_model.transform(data_set.testset[0],
                                                             data_set.testset[1])

OutputLog().write('test results:')
correlations, trace_correlation, var, x_test, y_test, test_best_layer = TraceCorrelationTester(
    data_set.testset[0], data_set.testset[1], top).test(IdentityTransformer(),
                                                        configuration.hyper_parameters)

OutputLog().write('train results:')
correlations, train_trace_correlation, var, x_train, y_train, train_best_layer = TraceCorrelationTester(
    data_set.trainset[0], data_set.trainset[1], top).test(IdentityTransformer(),
                                                          configuration.hyper_parameters)

OutputLog().write('\nTest results : \n')
configuration.hyper_parameters.print_parameters(OutputLog())
__author__ = 'cancobanoglu'

'''
CCA is Canonical Correlation Analysis
'''
print(__doc__)

from sklearn.cross_decomposition import CCA
from sklearn import datasets

X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [3., 5., 4.]]
Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]

cca = CCA(n_components=1)
cca.fit(X, Y)
# fitted estimator: CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06)
X_c, Y_c = cca.transform(X, Y)
# session.execute("DROP TABLE IF EXISTS Tweet") rows = session.execute("SELECT text, hashtags FROM Tweet limit 1000") X, Y = [], [] for row in rows: X.append(row.text) Y.append([x.lower() for x in row.hashtags]) vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore') # print(vectorizer) X = vectorizer.fit_transform(X).toarray() # print '40', X # print type(X) Y_indicator = LabelBinarizer().fit(Y).transform(Y) cca = CCA(n_components = 100, max_iter=10) cca.fit(X, Y_indicator) X = cca.transform(X) # print '45', X # print type(X) classif = OneVsRestClassifier(SVC(kernel='linear')) classif.fit(X, Y) for row in rows: # row = rows[0] # print vectorizer.transform([row.text]).toarray() # print cca.predict(vectorizer.transform([row.text]).toarray()) transformed = vectorizer.transform([row.text]).toarray() # print '55', transformed ccad = cca.transform(transformed) # print '57', ccad predicts = classif.predict(ccad) if len(predicts) > 0:
# check type of array
# print(data_selection.dtype)
# force dtype = float32
data_selection = data_selection.astype(np.float32, copy=False)

# complete cases
data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]

# target variables / covariates
y = data_selection[:, 0:3]
x = data_selection[:, 4:]

# split test-train
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.2, random_state=0)

cca = CCA(n_components=1, scale=True)
cca.fit(x_train, y_train)

X_train_r, Y_train_r = cca.transform(x_train, y_train)
X_test_r, Y_test_r = cca.transform(x_test, y_test)

print(type(X_train_r))
print(np.shape(X_train_r))
print(np.shape(Y_train_r))
print(np.shape(x))
print(np.corrcoef(X_train_r[:, 0], Y_train_r[:, 0]))
print(np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0]))
for i in range(5):
    plt.plot(nComponents, plsRegScores[i, :], lw=3)
plt.xlim(1, np.amax(nComponents))
plt.title('PLS Regression accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):
    #%% Canonical Correlation Analysis
    nComponents = np.arange(1, nClasses + 1)
    cca = CCA(n_components=nClasses)
    cca.fit(Xtrain, Ytrain)
    XtrainT = cca.transform(Xtrain)
    XtestT = cca.transform(Xtest)
    ccaScores = np.zeros((5, np.alen(nComponents)))
    for i, n in enumerate(nComponents):
        ccaScores[:, i] = util.classify(XtrainT[:, 0:n], XtestT[:, 0:n], labelsTrain, labelsTest)

    cca = CCA(n_components=3)
    cca.fit(Xtrain, Ytrain)
    xt = cca.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 3 components of projected data')

    #%% Plot accuracies for CCA
    plt.figure()
class CCA_Model:
    def __init__(self, n_components):
        self.n_components = n_components
        self.cca = CCA(n_components=n_components)
        self.ntop = 10

    def learn_model(self, X_chanel, Y_chanel, Y_Distinct=None):
        """
        :param X_chanel: array-like for the X channel
        :param Y_chanel: array-like for the Y channel
        :return:
        """
        print("Start learning...")
        self.x_dim = len(X_chanel[0])
        self.y_dim = len(Y_chanel[0])
        self.cca.fit(X_chanel, Y_chanel)
        if Y_Distinct is None:
            self.X_transform, self.Y_transform = self.cca.transform(X_chanel, Y_chanel)
        else:
            self.X_transform, self.Y_transform = self.cca.transform(X_chanel, Y_Distinct)
        print("Learning completed")

    def get_best_match_index_transform_x2y(self, x_transform):
        shape = self.Y_transform.shape
        scores = np.ndarray(shape[0], dtype=float)
        for i in range(shape[0]):
            scores[i] = np.dot(self.Y_transform[i], x_transform)
            # scores[i] = entropy(x_transform, self.Y_transform[i])
        indices = (-scores).argsort()[:self.ntop]
        return [indices, scores[indices]]

    def get_best_match_index_transform_y2x(self, y_transform):
        shape = self.X_transform.shape
        scores = np.ndarray(shape[0], dtype=float)
        for i in range(shape[0]):
            scores[i] = np.dot(self.X_transform[i], y_transform)
            # scores[i] = entropy(y_transform, self.X_transform[i])
        indices = (-scores).argsort()[:self.ntop]
        return [indices, scores[indices]]

    def get_best_match_cross_indices_x2y(self, x_inputs):
        x_transforms = self.cca.transform(x_inputs)
        results = []
        for x_transform in x_transforms:
            results.append(self.get_best_match_index_transform_x2y(x_transform))
        return results

    def get_best_match_cross_indices_y2x(self, y_inputs):
        _, y_transforms = self.cca.transform([[0 for _ in range(self.x_dim)]], y_inputs)
        results = []
        for y_transform in y_transforms:
            results.append(self.get_best_match_index_transform_y2x(y_transform))
        return results
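# A minimal usage sketch for CCA_Model above with random paired data; the shapes
# and the retrieval call are illustrative only (assumes numpy as np and the
# sklearn CCA import are in scope).
rng = np.random.default_rng(3)
X_bank = rng.standard_normal((50, 12))                       # X-channel items
Y_bank = X_bank @ rng.standard_normal((12, 9))               # paired Y-channel items
model = CCA_Model(n_components=4)
model.learn_model(X_bank, Y_bank)
hits = model.get_best_match_cross_indices_x2y(X_bank[:2])    # top Y matches for two X queries
print(hits[0][0])                                            # indices of the ntop best Y matches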